@agentunion/kite 1.0.7 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +208 -0
- package/README.md +48 -0
- package/cli.js +1 -1
- package/extensions/agents/__init__.py +1 -0
- package/extensions/agents/assistant/__init__.py +1 -0
- package/extensions/agents/assistant/entry.py +329 -0
- package/extensions/agents/assistant/module.md +22 -0
- package/extensions/agents/assistant/server.py +197 -0
- package/extensions/channels/__init__.py +1 -0
- package/extensions/channels/acp_channel/__init__.py +1 -0
- package/extensions/channels/acp_channel/entry.py +329 -0
- package/extensions/channels/acp_channel/module.md +22 -0
- package/extensions/channels/acp_channel/server.py +197 -0
- package/extensions/event_hub_bench/entry.py +624 -379
- package/extensions/event_hub_bench/module.md +2 -1
- package/extensions/services/backup/__init__.py +1 -0
- package/extensions/services/backup/entry.py +508 -0
- package/extensions/services/backup/module.md +22 -0
- package/extensions/services/model_service/__init__.py +1 -0
- package/extensions/services/model_service/entry.py +508 -0
- package/extensions/services/model_service/module.md +22 -0
- package/extensions/services/watchdog/entry.py +468 -102
- package/extensions/services/watchdog/module.md +3 -0
- package/extensions/services/watchdog/monitor.py +170 -69
- package/extensions/services/web/__init__.py +1 -0
- package/extensions/services/web/config.yaml +149 -0
- package/extensions/services/web/entry.py +390 -0
- package/extensions/services/web/module.md +24 -0
- package/extensions/services/web/routes/__init__.py +1 -0
- package/extensions/services/web/routes/routes_call.py +189 -0
- package/extensions/services/web/routes/routes_config.py +512 -0
- package/extensions/services/web/routes/routes_contacts.py +98 -0
- package/extensions/services/web/routes/routes_devlog.py +99 -0
- package/extensions/services/web/routes/routes_phone.py +81 -0
- package/extensions/services/web/routes/routes_sms.py +48 -0
- package/extensions/services/web/routes/routes_stats.py +17 -0
- package/extensions/services/web/routes/routes_voicechat.py +554 -0
- package/extensions/services/web/routes/schemas.py +216 -0
- package/extensions/services/web/server.py +375 -0
- package/extensions/services/web/static/css/style.css +1064 -0
- package/extensions/services/web/static/index.html +1445 -0
- package/extensions/services/web/static/js/app.js +4671 -0
- package/extensions/services/web/vendor/__init__.py +1 -0
- package/extensions/services/web/vendor/bluetooth/audio.py +348 -0
- package/extensions/services/web/vendor/bluetooth/contacts.py +251 -0
- package/extensions/services/web/vendor/bluetooth/manager.py +395 -0
- package/extensions/services/web/vendor/bluetooth/sms.py +290 -0
- package/extensions/services/web/vendor/bluetooth/telephony.py +274 -0
- package/extensions/services/web/vendor/config.py +139 -0
- package/extensions/services/web/vendor/conversation/asr.py +936 -0
- package/extensions/services/web/vendor/conversation/engine.py +548 -0
- package/extensions/services/web/vendor/conversation/llm.py +534 -0
- package/extensions/services/web/vendor/conversation/mcp_tools.py +190 -0
- package/extensions/services/web/vendor/conversation/tts.py +322 -0
- package/extensions/services/web/vendor/conversation/vad.py +138 -0
- package/extensions/services/web/vendor/storage/__init__.py +1 -0
- package/extensions/services/web/vendor/storage/identity.py +312 -0
- package/extensions/services/web/vendor/storage/store.py +507 -0
- package/extensions/services/web/vendor/task/manager.py +864 -0
- package/extensions/services/web/vendor/task/models.py +45 -0
- package/extensions/services/web/vendor/task/webhook.py +263 -0
- package/extensions/services/web/vendor/tools/registry.py +321 -0
- package/kernel/__init__.py +0 -0
- package/kernel/entry.py +407 -0
- package/{core/event_hub/hub.py → kernel/event_hub.py} +62 -74
- package/kernel/module.md +33 -0
- package/{core/registry/store.py → kernel/registry_store.py} +23 -8
- package/kernel/rpc_router.py +388 -0
- package/kernel/server.py +267 -0
- package/launcher/__init__.py +10 -0
- package/launcher/__main__.py +6 -0
- package/launcher/count_lines.py +258 -0
- package/launcher/entry.py +1778 -0
- package/launcher/logging_setup.py +289 -0
- package/{core/launcher → launcher}/module_scanner.py +11 -6
- package/launcher/process_manager.py +880 -0
- package/main.py +11 -210
- package/package.json +6 -9
- package/__init__.py +0 -1
- package/__main__.py +0 -15
- package/core/event_hub/BENCHMARK.md +0 -94
- package/core/event_hub/bench.py +0 -459
- package/core/event_hub/bench_extreme.py +0 -308
- package/core/event_hub/bench_perf.py +0 -350
- package/core/event_hub/entry.py +0 -157
- package/core/event_hub/module.md +0 -20
- package/core/event_hub/server.py +0 -206
- package/core/launcher/entry.py +0 -1158
- package/core/launcher/process_manager.py +0 -470
- package/core/registry/entry.py +0 -110
- package/core/registry/module.md +0 -30
- package/core/registry/server.py +0 -289
- package/extensions/services/watchdog/server.py +0 -167
- /package/{core → extensions/services/web/vendor/bluetooth}/__init__.py +0 -0
- /package/{core/event_hub → extensions/services/web/vendor/conversation}/__init__.py +0 -0
- /package/{core/launcher → extensions/services/web/vendor/task}/__init__.py +0 -0
- /package/{core/registry → extensions/services/web/vendor/tools}/__init__.py +0 -0
- /package/{core/event_hub → kernel}/dedup.py +0 -0
- /package/{core/event_hub → kernel}/router.py +0 -0
- /package/{core/launcher → launcher}/module.md +0 -0
package/core/launcher/entry.py
DELETED
|
@@ -1,1158 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Launcher — the core of Kite. Manages module lifecycle, exposes API, monitors processes.
|
|
3
|
-
|
|
4
|
-
Thread model:
|
|
5
|
-
- Main thread: asyncio event loop (process management + monitor loop)
|
|
6
|
-
- API thread: independent thread running uvicorn + FastAPI
|
|
7
|
-
- stdout threads: one daemon thread per child process (ProcessManager)
|
|
8
|
-
- (Windows) keyboard listener thread: polls for 'q' key
|
|
9
|
-
|
|
10
|
-
4-Phase startup:
|
|
11
|
-
Phase 1: Registry → stdout port → KITE_REGISTRY_PORT → API → register self + tokens
|
|
12
|
-
Phase 2: Event Hub → stdin launcher_ws_token → stdout ws_endpoint → WS connect → module.ready
|
|
13
|
-
Phase 3: Event Hub registers to Registry → Registry connects to Event Hub WS → module.ready
|
|
14
|
-
Phase 4: start remaining enabled modules in topo order
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
import asyncio
|
|
18
|
-
import json
|
|
19
|
-
import os
|
|
20
|
-
import secrets
|
|
21
|
-
import signal
|
|
22
|
-
import sys
|
|
23
|
-
import threading
|
|
24
|
-
import time
|
|
25
|
-
import uuid
|
|
26
|
-
|
|
27
|
-
import httpx
|
|
28
|
-
import uvicorn
|
|
29
|
-
import websockets
|
|
30
|
-
from fastapi import FastAPI, HTTPException
|
|
31
|
-
|
|
32
|
-
from .module_scanner import ModuleScanner, ModuleInfo, LaunchConfig, _parse_frontmatter
|
|
33
|
-
from .process_manager import ProcessManager
|
|
34
|
-
|
|
35
|
-
IS_WINDOWS = sys.platform == "win32"
|
|
36
|
-
|
|
37
|
-
# Core module names that are started in Phase 1-2 (not Phase 4)
|
|
38
|
-
CORE_MODULE_NAMES = {"registry", "event_hub"}
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
class Launcher:
|
|
42
|
-
"""Kite system entry point. Starts Registry, manages modules, exposes API."""
|
|
43
|
-
|
|
44
|
-
def __init__(self, kite_token: str):
    """Initialise launcher state; nothing is started here.

    Exports KITE_INSTANCE and KITE_MODULE_DATA into the process environment,
    resolves the instance workspace, and builds the ProcessManager,
    ModuleScanner and API app objects.

    Args:
        kite_token: Token stored on the instance and passed to ProcessManager.
    """
    self.kite_token = kite_token
    # The launcher's own PID is used as the instance id and exported via env.
    self.instance_id = str(os.getpid())
    os.environ["KITE_INSTANCE"] = self.instance_id

    # Resolve instance workspace (must happen before ProcessManager init)
    self._resolve_instance_dir()
    os.environ["KITE_MODULE_DATA"] = os.path.join(
        os.environ["KITE_INSTANCE_DIR"], "launcher",
    )

    self.process_manager = ProcessManager(
        kite_token, self.instance_id,
        on_kite_message=self._on_kite_message,
    )
    self.module_scanner = ModuleScanner(
        discovery=self._load_discovery(),
    )

    # Ports stay 0 until they are discovered during startup.
    self.registry_port: int = 0
    self.api_port: int = 0
    self.modules: dict[str, ModuleInfo] = {}
    # Two shutdown flags: one for asyncio code, one for plain threads.
    self._shutdown_event = asyncio.Event()
    self._thread_shutdown = threading.Event()
    self._api_server: uvicorn.Server | None = None
    self._api_ready = threading.Event()
    self._fail_counts: dict[str, int] = {}  # module_name -> consecutive failure count
    self._module_tokens: dict[str, str] = {}  # module_name -> per-module token

    # Three-layer state model: desired_state per module
    self._desired_states: dict[str, str] = {}  # module_name -> "running" | "stopped"

    # Event Hub WebSocket client
    self._event_hub_ws_url: str = ""
    self._launcher_ws_token: str = ""
    self._ws: object | None = None
    self._ws_task: asyncio.Task | None = None
    self._loop: asyncio.AbstractEventLoop | None = None

    # Event waiters: {event_key: (asyncio.Event, data_dict)}
    self._event_waiters: dict[str, tuple[asyncio.Event, dict]] = {}

    # Kite stdout message waiters: {waiter_key: (threading.Event, data_dict)}
    # Used by ProcessManager stdout callback (cross-thread)
    self._msg_waiters: dict[str, tuple[threading.Event, dict]] = {}

    # Path of the JSONL lifecycle log inside the instance workspace.
    self._lifecycle_log = os.path.join(
        os.environ["KITE_INSTANCE_DIR"], "launcher", "lifecycle.jsonl",
    )
    self._app = self._create_api_app()
|
|
94
|
-
|
|
95
|
-
# ── Instance workspace resolution ──
|
|
96
|
-
|
|
97
|
-
@staticmethod
|
|
98
|
-
def _resolve_instance_dir():
|
|
99
|
-
"""Resolve KITE_INSTANCE_DIR from KITE_WORKSPACE + KITE_CWD.
|
|
100
|
-
Algorithm: take CWD basename, find matching dir in workspace via .cwd file,
|
|
101
|
-
or create new one. Sets KITE_INSTANCE_DIR env var.
|
|
102
|
-
"""
|
|
103
|
-
if os.environ.get("KITE_INSTANCE_DIR"):
|
|
104
|
-
return # already set (e.g. by tests or parent)
|
|
105
|
-
|
|
106
|
-
cwd = os.environ.get("KITE_CWD", os.getcwd())
|
|
107
|
-
workspace = os.environ.get("KITE_WORKSPACE", "")
|
|
108
|
-
if not workspace:
|
|
109
|
-
home = os.environ.get("HOME") or os.environ.get("USERPROFILE") or os.path.expanduser("~")
|
|
110
|
-
workspace = os.path.join(home, ".kite", "workspace")
|
|
111
|
-
os.environ["KITE_WORKSPACE"] = workspace
|
|
112
|
-
|
|
113
|
-
basename = os.path.basename(cwd.rstrip(os.sep)) or "default"
|
|
114
|
-
suffix = 0
|
|
115
|
-
|
|
116
|
-
while True:
|
|
117
|
-
name = basename if suffix == 0 else f"{basename}~{suffix}"
|
|
118
|
-
candidate = os.path.join(workspace, name)
|
|
119
|
-
cwd_file = os.path.join(candidate, ".cwd")
|
|
120
|
-
|
|
121
|
-
if not os.path.exists(candidate):
|
|
122
|
-
# Empty slot — create new workspace
|
|
123
|
-
os.makedirs(candidate, exist_ok=True)
|
|
124
|
-
with open(cwd_file, "w", encoding="utf-8") as f:
|
|
125
|
-
f.write(cwd)
|
|
126
|
-
os.environ["KITE_INSTANCE_DIR"] = candidate
|
|
127
|
-
print(f"[launcher] 实例工作区已创建: {candidate}")
|
|
128
|
-
return
|
|
129
|
-
|
|
130
|
-
if os.path.isfile(cwd_file):
|
|
131
|
-
try:
|
|
132
|
-
with open(cwd_file, "r", encoding="utf-8") as f:
|
|
133
|
-
if f.read().strip() == cwd:
|
|
134
|
-
os.environ["KITE_INSTANCE_DIR"] = candidate
|
|
135
|
-
print(f"[launcher] 实例工作区已找到: {candidate}")
|
|
136
|
-
return
|
|
137
|
-
except Exception:
|
|
138
|
-
pass
|
|
139
|
-
|
|
140
|
-
suffix += 1
|
|
141
|
-
|
|
142
|
-
# ── Kite stdout message callback ──
|
|
143
|
-
|
|
144
|
-
def _on_kite_message(self, module_name: str, msg: dict):
    """Deliver a kite stdout message to whoever is waiting for it.

    Invoked by the ProcessManager stdout reader thread. The waiter is looked
    up under ``"<module_name>:<msg['kite']>"``; when one exists, the message
    is merged into its data dict and its threading.Event is set. Messages
    with no registered waiter are dropped.
    """
    entry = self._msg_waiters.get(f"{module_name}:{msg.get('kite', '')}")
    if not entry:
        return
    ready_flag, payload = entry
    payload.update(msg)
    ready_flag.set()
|
|
154
|
-
|
|
155
|
-
async def _wait_kite_message(self, module_name: str, kite_type: str,
|
|
156
|
-
timeout: float) -> dict | None:
|
|
157
|
-
"""Wait for a kite stdout message from a module. Returns msg dict or None on timeout.
|
|
158
|
-
Checks shutdown flag every 0.5s so Ctrl+C is responsive even during Phase 1-2 waits.
|
|
159
|
-
"""
|
|
160
|
-
key = f"{module_name}:{kite_type}"
|
|
161
|
-
evt = threading.Event()
|
|
162
|
-
data = {}
|
|
163
|
-
self._msg_waiters[key] = (evt, data)
|
|
164
|
-
shutdown = self._thread_shutdown
|
|
165
|
-
try:
|
|
166
|
-
def _wait():
|
|
167
|
-
deadline = time.monotonic() + timeout
|
|
168
|
-
while time.monotonic() < deadline:
|
|
169
|
-
if evt.wait(timeout=0.5):
|
|
170
|
-
return True
|
|
171
|
-
if shutdown.is_set():
|
|
172
|
-
return False
|
|
173
|
-
return False
|
|
174
|
-
got = await asyncio.get_running_loop().run_in_executor(None, _wait)
|
|
175
|
-
return data if got else None
|
|
176
|
-
finally:
|
|
177
|
-
self._msg_waiters.pop(key, None)
|
|
178
|
-
|
|
179
|
-
# ── Public entry ──
|
|
180
|
-
|
|
181
|
-
def run(self):
    """Blocking entry point for the launcher.

    Prints the KITE_* environment and basic process info, installs the
    platform-specific exit handling, then drives ``_async_main`` with
    ``asyncio.run``. ``_final_cleanup`` always runs afterwards, and a
    KeyboardInterrupt is treated as a normal exit.
    """
    print("[launcher] Kite 启动中...")
    print("[launcher] ── 环境变量 ──")
    kite_vars = sorted(name for name in os.environ if name.startswith("KITE_"))
    for name in kite_vars:
        print(f"[launcher] {name} = {os.environ[name]}")
    print(f"[launcher] PID = {os.getpid()}")
    print(f"[launcher] PYTHON = {sys.executable}")
    print(f"[launcher] PLATFORM = {sys.platform}")

    install_exit_hooks = self._setup_windows_exit if IS_WINDOWS else self._setup_unix_signals
    install_exit_hooks()

    try:
        asyncio.run(self._async_main())
    except KeyboardInterrupt:
        pass
    finally:
        self._final_cleanup()
|
|
202
|
-
|
|
203
|
-
def _request_shutdown(self, reason: str = ""):
    """Ask the launcher to shut down; callable from any thread.

    Only the first call has an effect. It sets the thread-level shutdown
    flag, schedules the asyncio shutdown event on the main loop (if one is
    recorded and still open), and arms a daemon timer that hard-exits the
    process with status 1 after 15 seconds.

    Args:
        reason: Text printed to the console; a default is used when empty.
    """
    if self._thread_shutdown.is_set():
        return  # a shutdown is already in progress
    print(f"\n[launcher] {reason or '收到关闭请求'}")
    self._thread_shutdown.set()

    # Wake the event loop right away so loops awaiting the event can exit.
    main_loop = self._loop
    if main_loop and not main_loop.is_closed():
        try:
            main_loop.call_soon_threadsafe(self._shutdown_event.set)
        except RuntimeError:
            pass

    # Last resort: kill the process if the graceful path hangs.
    kill_timer = threading.Timer(15, os._exit, args=(1,))
    kill_timer.daemon = True
    kill_timer.start()
|
|
221
|
-
|
|
222
|
-
def _setup_unix_signals(self):
    """Route SIGTERM and SIGINT to ``_request_shutdown`` (Linux/macOS path)."""
    def _on_signal(signum, frame):
        self._request_shutdown(f"收到信号 {signum},正在关闭...")

    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, _on_signal)
|
|
228
|
-
|
|
229
|
-
def _setup_windows_exit(self):
    """SetConsoleCtrlHandler for Ctrl+C + daemon thread for 'q' key.

    Why not signal.signal(SIGINT)?
    Python's signal delivery requires the main thread to be executing bytecode.
    When the main thread is blocked in C code (asyncio ProactorEventLoop →
    GetQueuedCompletionStatus), SIGINT is never delivered.
    SetConsoleCtrlHandler runs its callback in a separate OS thread, so it
    always works regardless of what the main thread is doing.
    """
    # Imported here because this method is only called on Windows (see run()).
    import ctypes

    @ctypes.WINFUNCTYPE(ctypes.c_int, ctypes.c_uint)
    def _ctrl_handler(ctrl_type):
        if ctrl_type in (0, 1):  # CTRL_C_EVENT, CTRL_BREAK_EVENT
            self._request_shutdown("收到 Ctrl+C,正在关闭...")
            return 1  # handled — prevent default (which kills the process)
        # Any other control event is reported as not handled.
        return 0

    # prevent GC of the C callback
    self._ctrl_handler_ref = _ctrl_handler
    ctypes.windll.kernel32.SetConsoleCtrlHandler(_ctrl_handler, 1)

    # 'q' key: handle via msvcrt polling
    def _listen():
        import msvcrt
        # Poll every 0.1s until a shutdown is requested from anywhere.
        while not self._thread_shutdown.is_set():
            if msvcrt.kbhit():
                ch = msvcrt.getch()
                if ch in (b'q', b'Q'):
                    self._request_shutdown("收到退出请求,正在关闭...")
                    return
            time.sleep(0.1)
    threading.Thread(target=_listen, daemon=True).start()
|
|
263
|
-
|
|
264
|
-
# ── Async main (4-Phase startup) ──
|
|
265
|
-
|
|
266
|
-
async def _async_main(self):
    """Full 4-phase startup sequence, then monitor loop.

    After each phase the asyncio shutdown event is checked and the coroutine
    returns early if it is set.
    NOTE(review): those early returns skip ``_graceful_shutdown_all``;
    presumably ``_final_cleanup`` (called from ``run``) handles modules that
    were already started — confirm.
    """
    # Recorded so other threads can schedule work on this loop.
    self._loop = asyncio.get_running_loop()

    # Validate core modules exist (mechanism 12)
    self._validate_core_modules()

    # Cleanup leftovers from previous instances
    self.process_manager.cleanup_leftovers()

    # Phase 1: Registry bootstrap
    await self._phase1_registry()
    if self._shutdown_event.is_set(): return

    # Scan modules (can happen before Phase 2)
    self.modules = self.module_scanner.scan()
    for name, info in self.modules.items():
        self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
    print(f"[launcher] 发现 {len(self.modules)} 个模块: {', '.join(self.modules.keys()) or '(无)'}")

    # Generate per-module tokens (including event_hub and registry)
    await self._register_module_tokens()
    if self._shutdown_event.is_set(): return

    # Phase 2: Event Hub bootstrap
    await self._phase2_event_hub()
    if self._shutdown_event.is_set(): return

    # Phase 3: Wait for Registry delayed ready
    await self._phase3_registry_ready()
    if self._shutdown_event.is_set(): return

    # Phase 4: Start remaining enabled modules
    # Initialize desired_state from config_state
    for name, info in self.modules.items():
        if info.state == "enabled":
            self._desired_states[name] = "running"
        else:  # manual, disabled
            self._desired_states[name] = "stopped"
    # Core modules are already running
    for cn in CORE_MODULE_NAMES:
        self._desired_states[cn] = "running"

    await self._phase4_start_modules()
    if self._shutdown_event.is_set(): return

    # Post-startup
    self.process_manager.persist_records()
    # NOTE(review): _heartbeat_task is first assigned here, not in __init__.
    self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())

    print("[launcher] 进入监控循环 (按 Ctrl+C 或 'q' 退出)")
    await self._monitor_loop()

    # Reached only after the monitor loop returns.
    await self._graceful_shutdown_all()
|
|
320
|
-
|
|
321
|
-
# ── Phase 1: Registry ──
|
|
322
|
-
|
|
323
|
-
async def _phase1_registry(self):
    """Phase 1: bring up the Registry and everything that depends on its port.

    Steps: start the Registry process from ``$KITE_PROJECT/core/registry``,
    wait up to 6s for its ``port`` stdout message, export the port as
    KITE_REGISTRY_PORT, start the Launcher API thread, then register the
    Launcher with the Registry.

    Raises:
        RuntimeError: If the process cannot be started or no port is
            reported within 6 seconds.
    """
    registry_info = ModuleInfo(
        name="registry",
        display_name="Registry",
        type="infrastructure",
        state="enabled",
        runtime="python",
        entry="entry.py",
        module_dir=os.path.join(os.environ["KITE_PROJECT"], "core", "registry"),
    )

    self._log_lifecycle("starting", "registry")
    started = self.process_manager.start_module(
        registry_info, boot_info={"token": self.kite_token},
    )
    if not started:
        self._log_lifecycle("start_failed", "registry")
        raise RuntimeError("启动 Registry 失败")

    # The Registry announces its listening port on stdout (mechanism 2).
    print("[launcher] 等待 Registry 端口...")
    reply = await self._wait_kite_message("registry", "port", timeout=6)
    reported_port = reply.get("port") if reply else None
    if not reported_port:
        raise RuntimeError("致命错误: Registry 在 6s 内未报告端口")
    self.registry_port = int(reported_port)
    print(f"[launcher] Registry 端口: {self.registry_port}")

    # Child processes started from now on inherit the port via env.
    os.environ["KITE_REGISTRY_PORT"] = str(self.registry_port)

    # The Launcher API runs in its own thread.
    self._start_api_thread()

    # Finally make the Launcher itself known to the Registry.
    await self._register_self()
|
|
359
|
-
|
|
360
|
-
async def _register_self(self):
    """POST the Launcher's own module record to the Registry.

    Best-effort: a non-200 response or any exception only prints a warning.
    """
    registration = {
        "action": "register",
        "module_id": "launcher",
        "module_type": "infrastructure",
        "name": "Launcher",
        "api_endpoint": f"http://127.0.0.1:{self.api_port}",
        "health_endpoint": "/launcher/modules",
        "events_publish": {
            "module.started": {},
            "module.stopped": {},
            "module.state_changed": {},
        },
        "events_subscribe": [">"],
    }
    endpoint = f"http://127.0.0.1:{self.registry_port}/modules"
    auth = {"Authorization": f"Bearer {self.kite_token}"}
    try:
        async with httpx.AsyncClient() as client:
            resp = await client.post(endpoint, json=registration, headers=auth, timeout=5)
            if resp.status_code != 200:
                print(f"[launcher] 警告: Registry 注册返回 {resp.status_code}")
            else:
                print("[launcher] 已注册到 Registry")
    except Exception as e:
        print(f"[launcher] 警告: 注册到 Registry 失败: {e}")
|
|
387
|
-
|
|
388
|
-
# ── Phase 2: Event Hub ──
|
|
389
|
-
|
|
390
|
-
async def _phase2_event_hub(self):
    """Start Event Hub → stdin launcher_ws_token → stdout ws_endpoint → WS connect → module.ready.

    Raises:
        RuntimeError: If the Event Hub process cannot be started or does not
            report ``ws_endpoint`` within 6 seconds. A missing
            ``module.ready`` only prints a warning.
    """
    # Find event_hub in scanned modules or build manually
    eh_info = self.modules.get("event_hub")
    if not eh_info:
        eh_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "event_hub")
        eh_info = ModuleInfo(
            name="event_hub",
            display_name="Event Hub",
            type="infrastructure",
            state="enabled",
            runtime="python",
            entry="entry.py",
            module_dir=eh_dir,
        )

    # Reuse the per-module token if one exists; otherwise mint one and
    # register it with the Registry.
    token = self._module_tokens.get("event_hub", "")
    if not token:
        token = secrets.token_hex(32)
        self._module_tokens["event_hub"] = token
        await self._register_tokens_to_registry({"event_hub": token})

    boot_info = {"token": token}
    self._log_lifecycle("starting", "event_hub")
    ok = self.process_manager.start_module(eh_info, boot_info=boot_info)
    if not ok:
        self._log_lifecycle("start_failed", "event_hub")
        raise RuntimeError("启动 Event Hub 失败")

    # Send launcher_ws_token via stdin (mechanism 6)
    self._launcher_ws_token = secrets.token_hex(32)
    self.process_manager.write_stdin("event_hub", {
        "kite": "launcher_ws_token",
        "launcher_ws_token": self._launcher_ws_token,
    })

    # Wait for ws_endpoint from stdout (mechanism 5)
    print("[launcher] 等待 Event Hub ws_endpoint...")
    msg = await self._wait_kite_message("event_hub", "ws_endpoint", timeout=6)
    if not msg or not msg.get("ws_endpoint"):
        raise RuntimeError("致命错误: Event Hub 在 6s 内未报告 ws_endpoint")
    self._event_hub_ws_url = msg["ws_endpoint"]
    print(f"[launcher] Event Hub 已发现: {self._event_hub_ws_url}")

    # Connect to Event Hub WebSocket with launcher_ws_token
    self._ws_task = asyncio.create_task(self._ws_loop())

    # Wait for Event Hub module.ready (sent when Launcher connects)
    ready = await self._wait_event("module.ready", "event_hub", timeout=15)
    if ready:
        print("[launcher] Event Hub 已就绪")
    else:
        print("[launcher] 警告: Event Hub 在 15s 内未发送 module.ready")

    # Startup is recorded as complete even when module.ready timed out.
    self._log_lifecycle("started", "event_hub")
    await self._publish_event("module.started", {"module_id": "event_hub"})
    self.process_manager.close_stdio("event_hub")
|
|
447
|
-
|
|
448
|
-
# ── Phase 3: Registry delayed ready ──
|
|
449
|
-
|
|
450
|
-
async def _phase3_registry_ready(self):
    """Phase 3: wait up to 12s for the Registry's ``module.ready`` event.

    A timeout is not fatal: a degraded-mode warning is printed and startup
    continues. Either way the Registry is logged and published as started
    and its stdio is closed.
    """
    print("[launcher] 等待 Registry 延迟就绪...")
    ready = await self._wait_event("module.ready", "registry", timeout=12)
    outcome = (
        "[launcher] Registry 已就绪"
        if ready
        else "[launcher] 警告: Registry 在 12s 内未发送 module.ready (降级运行)"
    )
    print(outcome)

    self._log_lifecycle("started", "registry")
    await self._publish_event("module.started", {"module_id": "registry"})
    self.process_manager.close_stdio("registry")
|
|
463
|
-
|
|
464
|
-
# ── Phase 4: Start remaining modules ──
|
|
465
|
-
|
|
466
|
-
async def _phase4_start_modules(self):
    """Start enabled modules (excluding core) in dependency order.

    Modules whose desired state is "running" are collected, then any
    non-disabled module they depend on — directly or transitively — is
    added and its desired state flipped to "running". The result is
    ordered with ``_topo_sort`` and started one by one.

    A dependency on a disabled module only prints an error; a topo-sort
    failure (RuntimeError) prints the error and starts nothing.
    """
    to_start = [m for m in self.modules.values()
                if self._desired_states.get(m.name) == "running"
                and m.name not in CORE_MODULE_NAMES]
    if not to_start:
        print("[launcher] 没有额外模块需要启动")
        return

    # Auto-start manual modules if depended upon. to_start doubles as a
    # worklist: dependencies appended here are visited later in the same
    # loop, so their own dependencies are pulled in as well.
    needed = {m.name for m in to_start}
    idx = 0
    while idx < len(to_start):
        m = to_start[idx]
        idx += 1
        for dep in m.depends_on:
            if dep in needed or dep in CORE_MODULE_NAMES:
                continue
            dep_info = self.modules.get(dep)
            if not dep_info:
                continue  # unknown dependency: left for _topo_sort to handle
            if dep_info.state == "disabled":
                print(f"[launcher] 错误: '{m.name}' 依赖已禁用的模块 '{dep}'")
                continue
            needed.add(dep)
            to_start.append(dep_info)
            self._desired_states[dep] = "running"
            print(f"[launcher] 自动启动 '{dep}' (被依赖)")

    try:
        sorted_modules = self._topo_sort(to_start)
    except RuntimeError as e:
        print(f"[launcher] 错误: {e}")
        return

    print(f"[launcher] 正在启动 {len(sorted_modules)} 个模块...")
    for info in sorted_modules:
        await self._start_one_module(info)
|
|
498
|
-
|
|
499
|
-
# ── Event Hub WebSocket connection ──
|
|
500
|
-
|
|
501
|
-
async def _ws_loop(self):
    """Connect to Event Hub, reconnect on failure.

    Runs until the thread-level shutdown flag is set or the task is
    cancelled. Connection errors are printed, not raised.
    """
    while not self._thread_shutdown.is_set():
        try:
            await self._ws_connect()
        except asyncio.CancelledError:
            # Cancellation ends the loop quietly.
            return
        except Exception as e:
            print(f"[launcher] Event Hub 连接错误: {e}")
        # NOTE(review): the flattened source does not show whether the next two
        # statements sit inside the except branch or at loop level; loop level
        # is assumed here (reset the handle and back off 5s after every
        # session) — confirm against the original file.
        self._ws = None
        await asyncio.sleep(5)
|
|
512
|
-
|
|
513
|
-
async def _ws_connect(self):
    """Single WebSocket session with launcher_ws_token auth.

    Connects to the Event Hub, subscribes to all events (``">"``), then
    processes incoming frames until the connection ends:

    - ``event`` frames wake any waiter registered under
      ``"<event>:<data.module_id>"`` and are echoed to the console, with
      delivery latency when the timestamp parses.
    - ``error`` frames are printed.
    - Frames that are not JSON objects are ignored.

    NOTE(review): the token travels in the URL query string, so it may show
    up in any access log on the Event Hub side.
    """
    from datetime import datetime, timezone

    ws_url = f"{self._event_hub_ws_url}?token={self._launcher_ws_token}"
    async with websockets.connect(ws_url, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
        self._ws = ws
        print("[launcher] 已连接到 Event Hub")

        # Subscribe to all events
        await ws.send(json.dumps({
            "type": "subscribe",
            "events": [">"],
        }))

        # Receive loop
        async for raw in ws:
            try:
                msg = json.loads(raw)
            except (json.JSONDecodeError, TypeError):
                continue
            if not isinstance(msg, dict):
                # Valid JSON but not an object (e.g. a list): one such frame
                # must not tear down the whole session.
                continue
            msg_type = msg.get("type", "")
            if msg_type == "event":
                source = msg.get("source", "unknown")
                event = msg.get("event", "")
                data = msg.get("data", {})
                data_is_dict = isinstance(data, dict)
                # Trigger event waiters
                module_id = data.get("module_id", "") if data_is_dict else ""
                waiter = self._event_waiters.get(f"{event}:{module_id}")
                if waiter:
                    if data_is_dict:
                        waiter[1].update(data)
                    waiter[0].set()
                ts = msg.get("timestamp", "")
                latency_str = ""
                if ts:
                    try:
                        sent = datetime.fromisoformat(ts)
                        delay_ms = (datetime.now(timezone.utc) - sent).total_seconds() * 1000
                        latency_str = f" ({delay_ms:.1f}ms)"
                        local_ts = sent.astimezone().strftime("%H:%M:%S")
                    except Exception:
                        # Unparseable, naive, or non-string timestamp: show
                        # the HH:MM:SS slice when possible, else the raw value.
                        if isinstance(ts, str) and len(ts) >= 19:
                            local_ts = ts[11:19]
                        else:
                            local_ts = ts
                    print(f"[{source}] {local_ts} {event}{latency_str}: {json.dumps(data, ensure_ascii=False)}")
                else:
                    print(f"[{source}] {event}: {json.dumps(data, ensure_ascii=False)}")
            elif msg_type == "error":
                print(f"[launcher] Event Hub 错误: {msg.get('message')}")
|
|
560
|
-
|
|
561
|
-
async def _publish_event(self, event_type: str, data: dict):
    """Send one event envelope to the Event Hub over the open WebSocket.

    Does nothing when there is no connection. The envelope carries a fresh
    UUID, the source ``"launcher"`` and a UTC ISO-8601 timestamp. Send
    failures are printed, never raised.
    """
    if not self._ws:
        return
    from datetime import datetime, timezone
    envelope = dict(
        type="event",
        event_id=str(uuid.uuid4()),
        event=event_type,
        source="launcher",
        timestamp=datetime.now(timezone.utc).isoformat(),
        data=data,
    )
    try:
        await self._ws.send(json.dumps(envelope))
    except Exception as e:
        print(f"[launcher] 发布事件失败: {e}")
|
|
578
|
-
|
|
579
|
-
def _publish_event_threadsafe(self, event_type: str, data: dict):
    """Schedule ``_publish_event`` on the main loop from a non-async thread.

    Fire-and-forget: the returned future is discarded. Does nothing when
    either the WebSocket or the loop is not available yet.
    """
    if self._ws and self._loop:
        asyncio.run_coroutine_threadsafe(
            self._publish_event(event_type, data), self._loop,
        )
|
|
586
|
-
|
|
587
|
-
async def _wait_event(self, event_type: str, module_id: str, timeout: float) -> dict | None:
|
|
588
|
-
"""Wait for a specific event from a module. Returns data dict or None on timeout."""
|
|
589
|
-
key = f"{event_type}:{module_id}"
|
|
590
|
-
evt = asyncio.Event()
|
|
591
|
-
data = {}
|
|
592
|
-
self._event_waiters[key] = (evt, data)
|
|
593
|
-
try:
|
|
594
|
-
await asyncio.wait_for(evt.wait(), timeout=timeout)
|
|
595
|
-
return data
|
|
596
|
-
except asyncio.TimeoutError:
|
|
597
|
-
return None
|
|
598
|
-
finally:
|
|
599
|
-
self._event_waiters.pop(key, None)
|
|
600
|
-
|
|
601
|
-
async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
|
|
602
|
-
"""Graceful shutdown: send event → wait ack → wait ready → kill."""
|
|
603
|
-
self._log_lifecycle("stopping", name, reason=reason)
|
|
604
|
-
await self._publish_event("module.shutdown", {
|
|
605
|
-
"module_id": name, "reason": reason, "timeout": timeout,
|
|
606
|
-
})
|
|
607
|
-
|
|
608
|
-
ack = await self._wait_event("module.shutdown.ack", name, timeout=3)
|
|
609
|
-
if not ack:
|
|
610
|
-
self.process_manager.stop_module(name, timeout=5)
|
|
611
|
-
await self._publish_event("module.stopped", {"module_id": name})
|
|
612
|
-
return
|
|
613
|
-
|
|
614
|
-
estimated = min(ack.get("estimated_cleanup", timeout), timeout)
|
|
615
|
-
ready = await self._wait_event("module.shutdown.ready", name, timeout=estimated)
|
|
616
|
-
if ready:
|
|
617
|
-
self.process_manager.stop_module(name, timeout=1)
|
|
618
|
-
else:
|
|
619
|
-
self.process_manager.stop_module(name, timeout=3)
|
|
620
|
-
|
|
621
|
-
self._log_lifecycle("stopped", name, reason=reason)
|
|
622
|
-
await self._publish_event("module.stopped", {"module_id": name})
|
|
623
|
-
|
|
624
|
-
async def _graceful_shutdown_all(self):
    """Ask every running module to shut down, then stop whatever is left.

    ``module.shutdown`` is published once per running module, the
    processes are polled for up to 10 seconds, and finally
    ``process_manager.stop_all`` is invoked. Returns immediately when
    nothing is running.
    """
    pm = self.process_manager
    targets = [mod for mod in self.modules if pm.is_running(mod)]
    # Also pick up core modules that are running but were not found above.
    for core_name in CORE_MODULE_NAMES:
        if pm.is_running(core_name) and core_name not in targets:
            targets.append(core_name)
    if not targets:
        return

    print(f"[launcher] 优雅关闭: {', '.join(targets)}")
    for mod in targets:
        self._log_lifecycle("stopping", mod, reason="system_shutdown")
        await self._publish_event("module.shutdown", {
            "module_id": mod, "reason": "system_shutdown", "timeout": 10,
        })

    cutoff = time.time() + 10
    while time.time() < cutoff:
        alive = [mod for mod in targets if pm.is_running(mod)]
        if not alive:
            break
        await asyncio.sleep(0.5)

    pm.stop_all(timeout=3)
    for mod in targets:
        self._log_lifecycle("stopped", mod, reason="system_shutdown")
|
|
648
|
-
|
|
649
|
-
# ── Heartbeat to Registry ──
|
|
650
|
-
|
|
651
|
-
async def _heartbeat_loop(self):
|
|
652
|
-
"""Send heartbeat to Registry every 30 seconds."""
|
|
653
|
-
while not self._thread_shutdown.is_set():
|
|
654
|
-
await asyncio.sleep(30)
|
|
655
|
-
try:
|
|
656
|
-
async with httpx.AsyncClient() as client:
|
|
657
|
-
await client.post(
|
|
658
|
-
f"http://127.0.0.1:{self.registry_port}/modules",
|
|
659
|
-
json={"action": "heartbeat", "module_id": "launcher"},
|
|
660
|
-
headers={"Authorization": f"Bearer {self.kite_token}"},
|
|
661
|
-
timeout=5,
|
|
662
|
-
)
|
|
663
|
-
except Exception:
|
|
664
|
-
pass
|
|
665
|
-
|
|
666
|
-
# ── Module startup ──
|
|
667
|
-
|
|
668
|
-
def _topo_sort(self, modules: list[ModuleInfo]) -> list[ModuleInfo]:
|
|
669
|
-
"""Topological sort by depends_on. Raises RuntimeError on cycle."""
|
|
670
|
-
name_map = {m.name: m for m in modules}
|
|
671
|
-
visited = set()
|
|
672
|
-
in_stack = set()
|
|
673
|
-
order = []
|
|
674
|
-
|
|
675
|
-
def visit(name):
|
|
676
|
-
if name in in_stack:
|
|
677
|
-
raise RuntimeError(f"Circular dependency detected involving '{name}'")
|
|
678
|
-
if name in visited:
|
|
679
|
-
return
|
|
680
|
-
in_stack.add(name)
|
|
681
|
-
info = name_map.get(name)
|
|
682
|
-
if info:
|
|
683
|
-
for dep in info.depends_on:
|
|
684
|
-
visit(dep)
|
|
685
|
-
in_stack.remove(name)
|
|
686
|
-
visited.add(name)
|
|
687
|
-
if info:
|
|
688
|
-
order.append(info)
|
|
689
|
-
|
|
690
|
-
for m in modules:
|
|
691
|
-
visit(m.name)
|
|
692
|
-
return order
|
|
693
|
-
|
|
694
|
-
async def _start_one_module(self, info: ModuleInfo):
|
|
695
|
-
"""Start a single module: publish starting → start process → wait ready → started → close stdio."""
|
|
696
|
-
self._log_lifecycle("starting", info.name)
|
|
697
|
-
await self._publish_event("module.starting", {"module_id": info.name})
|
|
698
|
-
|
|
699
|
-
token = self._module_tokens.get(info.name, "")
|
|
700
|
-
boot_info = {"token": token}
|
|
701
|
-
ok = self.process_manager.start_module(info, boot_info=boot_info)
|
|
702
|
-
if not ok:
|
|
703
|
-
self._log_lifecycle("start_failed", info.name)
|
|
704
|
-
return
|
|
705
|
-
|
|
706
|
-
# Wait for module.ready (configurable timeout, degrade on timeout)
|
|
707
|
-
timeout = info.launch.timeout
|
|
708
|
-
ready = await self._wait_event("module.ready", info.name, timeout=timeout)
|
|
709
|
-
if ready:
|
|
710
|
-
print(f"[launcher] 模块 '{info.name}' 已就绪")
|
|
711
|
-
else:
|
|
712
|
-
print(f"[launcher] 警告: '{info.name}' 在 {timeout}s 内未发送 module.ready")
|
|
713
|
-
|
|
714
|
-
rec = self.process_manager.get_record(info.name)
|
|
715
|
-
self._log_lifecycle("started", info.name, pid=rec.pid if rec else None)
|
|
716
|
-
await self._publish_event("module.started", {"module_id": info.name})
|
|
717
|
-
self.process_manager.close_stdio(info.name)
|
|
718
|
-
|
|
719
|
-
async def _register_module_tokens(self):
|
|
720
|
-
"""Generate per-module tokens and register the mapping to Registry."""
|
|
721
|
-
# Include all scanned modules + core modules
|
|
722
|
-
for name in self.modules:
|
|
723
|
-
if name not in self._module_tokens:
|
|
724
|
-
self._module_tokens[name] = secrets.token_hex(32)
|
|
725
|
-
# Ensure registry has a token
|
|
726
|
-
if "registry" not in self._module_tokens:
|
|
727
|
-
self._module_tokens["registry"] = secrets.token_hex(32)
|
|
728
|
-
|
|
729
|
-
if not self._module_tokens:
|
|
730
|
-
return
|
|
731
|
-
|
|
732
|
-
await self._register_tokens_to_registry(self._module_tokens)
|
|
733
|
-
|
|
734
|
-
async def _register_tokens_to_registry(self, tokens: dict):
    """POST a module-to-token mapping to the Registry's ``/tokens`` endpoint.

    Authenticates with ``self.kite_token``. The outcome is only printed:
    a success line for HTTP 200, a warning for any other status or for an
    exception. Nothing is raised to the caller.

    Args:
        tokens: Mapping sent as the JSON body.
    """
    endpoint = f"http://127.0.0.1:{self.registry_port}/tokens"
    auth = {"Authorization": f"Bearer {self.kite_token}"}
    try:
        async with httpx.AsyncClient() as session:
            reply = await session.post(endpoint, json=tokens, headers=auth, timeout=5)
            if reply.status_code != 200:
                print(f"[launcher] 警告: 令牌注册返回 {reply.status_code}")
            else:
                print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
    except Exception as err:
        print(f"[launcher] 警告: 注册模块令牌失败: {err}")
|
|
747
|
-
|
|
748
|
-
# ── Validation ──
|
|
749
|
-
|
|
750
|
-
def _validate_core_modules(self):
|
|
751
|
-
"""Validate core modules exist (mechanism 12)."""
|
|
752
|
-
project_root = os.environ["KITE_PROJECT"]
|
|
753
|
-
for name in ("registry", "event_hub"):
|
|
754
|
-
mod_dir = os.path.join(project_root, "core", name)
|
|
755
|
-
md_path = os.path.join(mod_dir, "module.md")
|
|
756
|
-
if not os.path.isdir(mod_dir):
|
|
757
|
-
print(f"[launcher] 致命: 核心模块 '{name}' 目录未找到: {mod_dir}")
|
|
758
|
-
sys.exit(1)
|
|
759
|
-
if not os.path.isfile(md_path):
|
|
760
|
-
print(f"[launcher] 致命: 核心模块 '{name}' 缺少 module.md: {md_path}")
|
|
761
|
-
sys.exit(1)
|
|
762
|
-
# Try to parse frontmatter
|
|
763
|
-
try:
|
|
764
|
-
with open(md_path, "r", encoding="utf-8") as f:
|
|
765
|
-
fm = _parse_frontmatter(f.read())
|
|
766
|
-
if not fm:
|
|
767
|
-
print(f"[launcher] 致命: 核心模块 '{name}' module.md 没有有效的 frontmatter")
|
|
768
|
-
sys.exit(1)
|
|
769
|
-
except Exception as e:
|
|
770
|
-
print(f"[launcher] 致命: 核心模块 '{name}' module.md 解析错误: {e}")
|
|
771
|
-
sys.exit(1)
|
|
772
|
-
|
|
773
|
-
# ── API thread ──
|
|
774
|
-
|
|
775
|
-
def _start_api_thread(self):
    """Run the Launcher API (uvicorn) in a daemon thread on a free port.

    The port comes from ``_get_free_port`` and is stored in
    ``self.api_port``. After starting the thread this method polls the
    server's ``started`` flag for up to 5 seconds and prints a warning if
    it never turns true; the "started" line is printed in either case.
    """
    self.api_port = self._get_free_port()
    self._api_server = uvicorn.Server(
        uvicorn.Config(
            self._app,
            host="127.0.0.1",
            port=self.api_port,
            log_level="warning",
        )
    )

    worker = threading.Thread(target=lambda: self._api_server.run(), daemon=True)
    worker.start()

    give_up_at = time.time() + 5
    came_up = False
    while time.time() < give_up_at:
        if self._api_server.started:
            came_up = True
            break
        time.sleep(0.05)
    if not came_up:
        print("[launcher] 警告: API 服务器可能尚未完全就绪")

    print(f"[launcher] API 服务器已启动,端口 {self.api_port}")
|
|
801
|
-
|
|
802
|
-
# ── Monitor loop ──
|
|
803
|
-
|
|
804
|
-
async def _monitor_loop(self):
|
|
805
|
-
"""Check child processes every second. Handle crashes.
|
|
806
|
-
Uses _shutdown_event (asyncio.Event) so Ctrl+C wakes us immediately.
|
|
807
|
-
"""
|
|
808
|
-
MAX_FAIL = 3
|
|
809
|
-
MAX_FAILED_MODULES = 3
|
|
810
|
-
|
|
811
|
-
while not self._shutdown_event.is_set():
|
|
812
|
-
exited = self.process_manager.check_exited()
|
|
813
|
-
|
|
814
|
-
for name, rc in exited:
|
|
815
|
-
print(f"[launcher] 模块 '{name}' 退出,返回码 {rc}")
|
|
816
|
-
self._log_lifecycle("exited", name, exit_code=rc)
|
|
817
|
-
await self._publish_event("module.stopped", {
|
|
818
|
-
"module_id": name, "exit_code": rc,
|
|
819
|
-
})
|
|
820
|
-
info = self.modules.get(name)
|
|
821
|
-
|
|
822
|
-
# Core module crash → full restart
|
|
823
|
-
if name in CORE_MODULE_NAMES or (info and info.is_core()):
|
|
824
|
-
print(f"[launcher] 严重: 核心模块 '{name}' 崩溃,正在全部重启...")
|
|
825
|
-
self._log_lifecycle("core_crash", name, exit_code=rc)
|
|
826
|
-
await self._full_restart()
|
|
827
|
-
return
|
|
828
|
-
|
|
829
|
-
# Non-core: attempt restart if desired_state is "running"
|
|
830
|
-
self._fail_counts[name] = self._fail_counts.get(name, 0) + 1
|
|
831
|
-
count = self._fail_counts[name]
|
|
832
|
-
|
|
833
|
-
if count < MAX_FAIL and self._desired_states.get(name) == "running" and info:
|
|
834
|
-
print(f"[launcher] 正在重启 '{name}' (第 {count}/{MAX_FAIL} 次)...")
|
|
835
|
-
await self._start_one_module(info)
|
|
836
|
-
elif count >= MAX_FAIL:
|
|
837
|
-
self._desired_states[name] = "stopped"
|
|
838
|
-
self._log_lifecycle("failed", name, reason=f"exceeded {MAX_FAIL} retries")
|
|
839
|
-
print(f"[launcher] 模块 '{name}' 失败 {MAX_FAIL} 次,已放弃")
|
|
840
|
-
|
|
841
|
-
failed_count = sum(1 for c in self._fail_counts.values() if c >= MAX_FAIL)
|
|
842
|
-
if failed_count >= MAX_FAILED_MODULES:
|
|
843
|
-
print(f"[launcher] {failed_count} 个模块永久失败,启动器退出")
|
|
844
|
-
return
|
|
845
|
-
|
|
846
|
-
if exited:
|
|
847
|
-
self.process_manager.persist_records()
|
|
848
|
-
|
|
849
|
-
# Wait 1s but wake immediately on shutdown signal
|
|
850
|
-
try:
|
|
851
|
-
await asyncio.wait_for(self._shutdown_event.wait(), timeout=1)
|
|
852
|
-
return # shutdown requested
|
|
853
|
-
except asyncio.TimeoutError:
|
|
854
|
-
pass
|
|
855
|
-
|
|
856
|
-
async def _full_restart(self):
|
|
857
|
-
"""Stop all modules, regenerate tokens, re-run Phase 1-4 (mechanism 10)."""
|
|
858
|
-
print("[launcher] 全量重启: 正在停止所有模块...")
|
|
859
|
-
|
|
860
|
-
# Disconnect Event Hub WS
|
|
861
|
-
if self._ws_task:
|
|
862
|
-
self._ws_task.cancel()
|
|
863
|
-
self._ws_task = None
|
|
864
|
-
if hasattr(self, '_heartbeat_task') and self._heartbeat_task:
|
|
865
|
-
self._heartbeat_task.cancel()
|
|
866
|
-
self._heartbeat_task = None
|
|
867
|
-
self._ws = None
|
|
868
|
-
self._event_hub_ws_url = ""
|
|
869
|
-
self._launcher_ws_token = ""
|
|
870
|
-
|
|
871
|
-
await self._graceful_shutdown_all()
|
|
872
|
-
self._fail_counts.clear()
|
|
873
|
-
self._module_tokens.clear()
|
|
874
|
-
|
|
875
|
-
# Regenerate kite_token
|
|
876
|
-
self.kite_token = secrets.token_hex(32)
|
|
877
|
-
self.process_manager.kite_token = self.kite_token
|
|
878
|
-
|
|
879
|
-
print("[launcher] 全量重启: 重新执行 Phase 1-4...")
|
|
880
|
-
try:
|
|
881
|
-
await self._phase1_registry()
|
|
882
|
-
self.modules = self.module_scanner.scan()
|
|
883
|
-
for n, info in self.modules.items():
|
|
884
|
-
self._log_lifecycle("scanned", n, state=info.state, module_dir=info.module_dir)
|
|
885
|
-
await self._register_module_tokens()
|
|
886
|
-
await self._phase2_event_hub()
|
|
887
|
-
await self._phase3_registry_ready()
|
|
888
|
-
await self._phase4_start_modules()
|
|
889
|
-
self.process_manager.persist_records()
|
|
890
|
-
self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
|
|
891
|
-
print("[launcher] 全量重启完成,恢复监控循环")
|
|
892
|
-
await self._monitor_loop()
|
|
893
|
-
except Exception as e:
|
|
894
|
-
print(f"[launcher] 全量重启失败: {e}")
|
|
895
|
-
|
|
896
|
-
# ── Shutdown ──
|
|
897
|
-
|
|
898
|
-
def _final_cleanup(self):
    """Exit handler: stop background tasks, child processes and the API.

    Also empties and then deletes the process-records file. When
    ``IS_WINDOWS`` is true the interpreter is terminated with
    ``os._exit(0)`` at the end.
    """
    print("[launcher] 正在关闭...")

    ws_task = self._ws_task
    if ws_task:
        ws_task.cancel()
    beat_task = getattr(self, '_heartbeat_task', None)
    if beat_task:
        beat_task.cancel()

    pm = self.process_manager
    pm.stop_all(timeout=10)

    if self._api_server:
        self._api_server.should_exit = True

    # Remove this instance's runtime records (blank first, then delete).
    pm._write_records_file([])
    try:
        os.remove(pm.records_path)
    except OSError:
        pass
    print("[launcher] 再见。")

    if IS_WINDOWS:
        os._exit(0)
|
|
922
|
-
|
|
923
|
-
# ── Utilities ──
|
|
924
|
-
|
|
925
|
-
def _load_discovery(self) -> dict | None:
|
|
926
|
-
"""Read discovery config from launcher's own module.md."""
|
|
927
|
-
md_path = os.path.join(os.environ["KITE_PROJECT"], "core", "launcher", "module.md")
|
|
928
|
-
try:
|
|
929
|
-
with open(md_path, "r", encoding="utf-8") as f:
|
|
930
|
-
fm = _parse_frontmatter(f.read())
|
|
931
|
-
discovery = fm.get("discovery")
|
|
932
|
-
if isinstance(discovery, dict) and discovery:
|
|
933
|
-
print(f"[launcher] 发现来源: {', '.join(discovery.keys())}")
|
|
934
|
-
return discovery
|
|
935
|
-
except Exception as e:
|
|
936
|
-
print(f"[launcher] 警告: 读取发现配置失败: {e}")
|
|
937
|
-
return None
|
|
938
|
-
|
|
939
|
-
def _log_lifecycle(self, event: str, module: str, **extra):
|
|
940
|
-
"""Append one JSONL line to lifecycle.jsonl."""
|
|
941
|
-
from datetime import datetime, timezone
|
|
942
|
-
record = {"ts": datetime.now(timezone.utc).isoformat(), "event": event, "module": module}
|
|
943
|
-
record.update(extra)
|
|
944
|
-
try:
|
|
945
|
-
os.makedirs(os.path.dirname(self._lifecycle_log), exist_ok=True)
|
|
946
|
-
with open(self._lifecycle_log, "a", encoding="utf-8") as f:
|
|
947
|
-
f.write(json.dumps(record, ensure_ascii=False) + "\n")
|
|
948
|
-
except Exception:
|
|
949
|
-
pass
|
|
950
|
-
|
|
951
|
-
@staticmethod
|
|
952
|
-
def _get_free_port() -> int:
|
|
953
|
-
"""Get a free port assigned by the OS (bind to port 0)."""
|
|
954
|
-
import socket
|
|
955
|
-
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
956
|
-
s.bind(("127.0.0.1", 0))
|
|
957
|
-
return s.getsockname()[1]
|
|
958
|
-
|
|
959
|
-
# ── API app ──
|
|
960
|
-
|
|
961
|
-
def _create_api_app(self) -> FastAPI:
    """Create the FastAPI app with the Launcher management routes.

    Routes:
        GET  /launcher/modules                 list modules and their status
        POST /launcher/modules/{name}/start    start a module
        POST /launcher/modules/{name}/stop     gracefully stop a module
        POST /launcher/modules/{name}/restart  stop, rotate token, start
        POST /launcher/rescan                  rescan module directories
        PUT  /launcher/modules/{name}/state    set enabled/manual/disabled

    Token registration goes through ``_register_tokens_to_registry`` so
    every route reports registration problems the same way instead of
    silently ignoring them.
    """
    app = FastAPI(title="Kite Launcher", docs_url=None, redoc_url=None)
    launcher = self

    # NOTE(review): _start_api_thread runs this app in its own thread, yet
    # the stop/restart routes await launcher._graceful_stop directly (which
    # uses the launcher's WebSocket and asyncio.Event waiters) — confirm
    # that cross-loop use is intended.

    def _lookup(name: str, *, must_be_startable: bool = False):
        """Return the scanned module info or raise the matching HTTP error."""
        info = launcher.modules.get(name)
        if not info:
            raise HTTPException(404, f"Module '{name}' not found")
        if must_be_startable and info.state == "disabled":
            raise HTTPException(403, f"Module '{name}' is disabled")
        return info

    def _spawn(name: str, info, via: str) -> bool:
        """Start the process with its current token and record the outcome."""
        boot_info = {"token": launcher._module_tokens[name]}
        if not launcher.process_manager.start_module(info, boot_info=boot_info):
            launcher._log_lifecycle("start_failed", name, via=via)
            return False
        launcher._desired_states[name] = "running"
        launcher._fail_counts.pop(name, None)
        launcher.process_manager.persist_records()
        rec = launcher.process_manager.get_record(name)
        launcher._log_lifecycle("started", name, pid=rec.pid if rec else None, via=via)
        launcher._publish_event_threadsafe("module.started", {"module_id": name})
        return True

    @app.get("/launcher/modules")
    async def list_modules():
        """List all modules and their current status."""
        result = []
        for name, info in launcher.modules.items():
            running = launcher.process_manager.is_running(name)
            rec = launcher.process_manager.get_record(name)
            alive = bool(running and rec)
            result.append({
                "name": name,
                "display_name": info.display_name,
                "type": info.type,
                "config_state": info.state,
                "desired_state": launcher._desired_states.get(name, "stopped"),
                "actual_state": f"running({rec.pid})" if alive else "stopped",
                "pid": rec.pid if alive else None,
                "monitor": info.monitor,
            })
        return result

    @app.post("/launcher/modules/{name}/start")
    async def start_module(name: str):
        """Start a module by name."""
        info = _lookup(name, must_be_startable=True)

        # Only a newly minted token needs to be announced to the Registry.
        if name not in launcher._module_tokens:
            launcher._module_tokens[name] = secrets.token_hex(32)
            await launcher._register_tokens_to_registry(
                {name: launcher._module_tokens[name]}
            )

        if _spawn(name, info, via="api"):
            return {"status": "started", "name": name}
        raise HTTPException(500, f"Failed to start '{name}'")

    @app.post("/launcher/modules/{name}/stop")
    async def stop_module(name: str, body: dict = None):
        """Stop a module with graceful shutdown."""
        _lookup(name)
        reason = (body or {}).get("reason", "stop_requested")
        # Mark as stopped first so the monitor loop does not restart it.
        launcher._desired_states[name] = "stopped"
        await launcher._graceful_stop(name, reason)
        launcher.process_manager.persist_records()
        return {"status": "stopped", "name": name}

    @app.post("/launcher/modules/{name}/restart")
    async def restart_module(name: str, body: dict = None):
        """Restart a module (stop + start) with a freshly rotated token."""
        info = _lookup(name, must_be_startable=True)
        reason = (body or {}).get("reason", "restart")
        await launcher._graceful_stop(name, reason)

        launcher._module_tokens[name] = secrets.token_hex(32)
        await launcher._register_tokens_to_registry(
            {name: launcher._module_tokens[name]}
        )

        if _spawn(name, info, via="restart_api"):
            return {"status": "restarted", "name": name}
        raise HTTPException(500, f"Failed to restart '{name}'")

    @app.post("/launcher/rescan")
    async def rescan_modules():
        """Rescan module directories for new/removed modules."""
        old_names = set(launcher.modules.keys())
        launcher.modules = launcher.module_scanner.scan()
        new_names = set(launcher.modules.keys())
        added = list(new_names - old_names)
        removed = list(old_names - new_names)

        new_tokens = {}
        for name in added:
            info = launcher.modules[name]
            launcher._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
            launcher._desired_states[name] = "running" if info.state == "enabled" else "stopped"
            launcher._module_tokens[name] = secrets.token_hex(32)
            new_tokens[name] = launcher._module_tokens[name]
        if new_tokens:
            await launcher._register_tokens_to_registry(new_tokens)

        return {"added": added, "removed": removed, "total": len(launcher.modules)}

    @app.put("/launcher/modules/{name}/state")
    async def update_state(name: str, body: dict):
        """Update module state (enabled/manual/disabled). Writes to module.md."""
        info = _lookup(name)

        new_state = body.get("state", "")
        if new_state not in ("enabled", "manual", "disabled"):
            raise HTTPException(400, "state must be enabled, manual, or disabled")

        if info.is_core() and new_state == "disabled":
            raise HTTPException(403, "Core modules cannot be disabled")

        old_state = info.state
        info.state = new_state
        # Only "enabled" modules are expected to be kept running.
        launcher._desired_states[name] = "running" if new_state == "enabled" else "stopped"

        _update_module_md_state(info.module_dir, new_state)
        launcher._publish_event_threadsafe("module.state_changed", {
            "module_id": name,
            "old_state": old_state,
            "new_state": new_state,
        })
        return {
            "name": name,
            "old_state": old_state,
            "new_state": new_state,
        }

    return app
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
def _update_module_md_state(module_dir: str, new_state: str):
|
|
1137
|
-
"""Update the state field in a module's module.md frontmatter."""
|
|
1138
|
-
import re
|
|
1139
|
-
md_path = os.path.join(module_dir, "module.md")
|
|
1140
|
-
if not os.path.isfile(md_path):
|
|
1141
|
-
return
|
|
1142
|
-
|
|
1143
|
-
try:
|
|
1144
|
-
with open(md_path, "r", encoding="utf-8") as f:
|
|
1145
|
-
content = f.read()
|
|
1146
|
-
|
|
1147
|
-
updated = re.sub(
|
|
1148
|
-
r'^(state:\s*)(\S+)',
|
|
1149
|
-
rf'\g<1>{new_state}',
|
|
1150
|
-
content,
|
|
1151
|
-
count=1,
|
|
1152
|
-
flags=re.MULTILINE,
|
|
1153
|
-
)
|
|
1154
|
-
|
|
1155
|
-
with open(md_path, "w", encoding="utf-8") as f:
|
|
1156
|
-
f.write(updated)
|
|
1157
|
-
except Exception as e:
|
|
1158
|
-
print(f"[launcher] 警告: 更新 module.md 状态失败: {e}")
|