@agentunion/kite 1.0.6 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +127 -25
- package/core/event_hub/entry.py +384 -61
- package/core/event_hub/hub.py +8 -0
- package/core/event_hub/module.md +0 -1
- package/core/event_hub/server.py +169 -38
- package/core/kite_log.py +241 -0
- package/core/launcher/entry.py +1306 -425
- package/core/launcher/module_scanner.py +10 -9
- package/core/launcher/process_manager.py +555 -121
- package/core/registry/entry.py +335 -30
- package/core/registry/server.py +339 -256
- package/core/registry/store.py +13 -2
- package/extensions/agents/__init__.py +1 -0
- package/extensions/agents/assistant/__init__.py +1 -0
- package/extensions/agents/assistant/entry.py +380 -0
- package/extensions/agents/assistant/module.md +22 -0
- package/extensions/agents/assistant/server.py +236 -0
- package/extensions/channels/__init__.py +1 -0
- package/extensions/channels/acp_channel/__init__.py +1 -0
- package/extensions/channels/acp_channel/entry.py +380 -0
- package/extensions/channels/acp_channel/module.md +22 -0
- package/extensions/channels/acp_channel/server.py +236 -0
- package/{core → extensions}/event_hub_bench/entry.py +664 -371
- package/{core → extensions}/event_hub_bench/module.md +4 -2
- package/extensions/services/backup/__init__.py +1 -0
- package/extensions/services/backup/entry.py +380 -0
- package/extensions/services/backup/module.md +22 -0
- package/extensions/services/backup/server.py +244 -0
- package/extensions/services/model_service/__init__.py +1 -0
- package/extensions/services/model_service/entry.py +380 -0
- package/extensions/services/model_service/module.md +22 -0
- package/extensions/services/model_service/server.py +236 -0
- package/extensions/services/watchdog/entry.py +460 -143
- package/extensions/services/watchdog/module.md +3 -0
- package/extensions/services/watchdog/monitor.py +128 -13
- package/extensions/services/watchdog/server.py +75 -13
- package/extensions/services/web/__init__.py +1 -0
- package/extensions/services/web/config.yaml +149 -0
- package/extensions/services/web/entry.py +487 -0
- package/extensions/services/web/module.md +24 -0
- package/extensions/services/web/routes/__init__.py +1 -0
- package/extensions/services/web/routes/routes_call.py +189 -0
- package/extensions/services/web/routes/routes_config.py +512 -0
- package/extensions/services/web/routes/routes_contacts.py +98 -0
- package/extensions/services/web/routes/routes_devlog.py +99 -0
- package/extensions/services/web/routes/routes_phone.py +81 -0
- package/extensions/services/web/routes/routes_sms.py +48 -0
- package/extensions/services/web/routes/routes_stats.py +17 -0
- package/extensions/services/web/routes/routes_voicechat.py +554 -0
- package/extensions/services/web/routes/schemas.py +216 -0
- package/extensions/services/web/server.py +332 -0
- package/extensions/services/web/static/css/style.css +1064 -0
- package/extensions/services/web/static/index.html +1445 -0
- package/extensions/services/web/static/js/app.js +4671 -0
- package/extensions/services/web/vendor/__init__.py +1 -0
- package/extensions/services/web/vendor/bluetooth/audio.py +348 -0
- package/extensions/services/web/vendor/bluetooth/contacts.py +251 -0
- package/extensions/services/web/vendor/bluetooth/manager.py +395 -0
- package/extensions/services/web/vendor/bluetooth/sms.py +290 -0
- package/extensions/services/web/vendor/bluetooth/telephony.py +274 -0
- package/extensions/services/web/vendor/config.py +139 -0
- package/extensions/services/web/vendor/conversation/__init__.py +0 -0
- package/extensions/services/web/vendor/conversation/asr.py +936 -0
- package/extensions/services/web/vendor/conversation/engine.py +548 -0
- package/extensions/services/web/vendor/conversation/llm.py +534 -0
- package/extensions/services/web/vendor/conversation/mcp_tools.py +190 -0
- package/extensions/services/web/vendor/conversation/tts.py +322 -0
- package/extensions/services/web/vendor/conversation/vad.py +138 -0
- package/extensions/services/web/vendor/storage/__init__.py +1 -0
- package/extensions/services/web/vendor/storage/identity.py +312 -0
- package/extensions/services/web/vendor/storage/store.py +507 -0
- package/extensions/services/web/vendor/task/__init__.py +0 -0
- package/extensions/services/web/vendor/task/manager.py +864 -0
- package/extensions/services/web/vendor/task/models.py +45 -0
- package/extensions/services/web/vendor/task/webhook.py +263 -0
- package/extensions/services/web/vendor/tools/__init__.py +0 -0
- package/extensions/services/web/vendor/tools/registry.py +321 -0
- package/main.py +344 -4
- package/package.json +11 -2
- package/core/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/__pycache__/data_dir.cpython-313.pyc +0 -0
- package/core/data_dir.py +0 -62
- package/core/event_hub/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/bench.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/bench_perf.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/dedup.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/hub.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/router.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/server.cpython-313.pyc +0 -0
- package/core/event_hub/bench_results/2026-02-28_13-26-48.json +0 -51
- package/core/event_hub/bench_results/2026-02-28_13-44-45.json +0 -51
- package/core/event_hub/bench_results/2026-02-28_13-45-39.json +0 -51
- package/core/launcher/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/module_scanner.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/process_manager.cpython-313.pyc +0 -0
- package/core/launcher/data/log/lifecycle.jsonl +0 -1158
- package/core/launcher/data/token.txt +0 -1
- package/core/registry/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/server.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/store.cpython-313.pyc +0 -0
- package/core/registry/data/port.txt +0 -1
- package/core/registry/data/port_484.txt +0 -1
- package/extensions/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/entry.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/monitor.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/server.cpython-313.pyc +0 -0
- /package/{core/event_hub/bench_results/.gitkeep → extensions/services/web/vendor/bluetooth/__init__.py} +0 -0
package/core/launcher/entry.py
CHANGED
|
@@ -4,8 +4,16 @@ Launcher — the core of Kite. Manages module lifecycle, exposes API, monitors p
|
|
|
4
4
|
Thread model:
|
|
5
5
|
- Main thread: asyncio event loop (process management + monitor loop)
|
|
6
6
|
- API thread: independent thread running uvicorn + FastAPI
|
|
7
|
-
- stdout threads: one daemon thread per child process
|
|
7
|
+
- stdout threads: one daemon thread per child process (ProcessManager)
|
|
8
8
|
- (Windows) keyboard listener thread: polls for 'q' key
|
|
9
|
+
|
|
10
|
+
4-Phase startup:
|
|
11
|
+
Phase 1: Registry + Event Hub (parallel start) → Registry stdout port → stdin broadcast port to Event Hub
|
|
12
|
+
→ API → register self + tokens → stdin launcher_ws_token to Event Hub
|
|
13
|
+
→ stdout ws_endpoint → WS connect → module.ready
|
|
14
|
+
Phase 2: (reserved — Event Hub ready handled in Phase 1)
|
|
15
|
+
Phase 3: Registry delayed ready (Event Hub → Registry → Event Hub WS → module.ready)
|
|
16
|
+
Phase 4: start remaining enabled modules in topo order
|
|
9
17
|
"""
|
|
10
18
|
|
|
11
19
|
import asyncio
|
|
@@ -22,25 +30,43 @@ import httpx
|
|
|
22
30
|
import uvicorn
|
|
23
31
|
import websockets
|
|
24
32
|
from fastapi import FastAPI, HTTPException
|
|
25
|
-
from fastapi.responses import JSONResponse
|
|
26
33
|
|
|
27
|
-
from .module_scanner import ModuleScanner, ModuleInfo, _parse_frontmatter
|
|
34
|
+
from .module_scanner import ModuleScanner, ModuleInfo, LaunchConfig, _parse_frontmatter
|
|
28
35
|
from .process_manager import ProcessManager
|
|
29
|
-
from core.data_dir import get_launcher_data_dir
|
|
30
36
|
|
|
31
37
|
IS_WINDOWS = sys.platform == "win32"
|
|
32
38
|
|
|
39
|
+
# Shutdown timeout constants (seconds)
SHUTDOWN_TIMEOUT_NON_GRACEFUL = 5  # Non-graceful modules or no ack response
SHUTDOWN_TIMEOUT_PARTIAL = 3  # Graceful module ack'd but no ready
SHUTDOWN_TIMEOUT_READY = 1  # Graceful module sent ready (cleanup done)
SHUTDOWN_TIMEOUT_BULK = 3  # Bulk stop_all() safety net

# Core module names that are started in Phase 1-2 (not Phase 4)
CORE_MODULE_NAMES = {"registry", "event_hub"}

# Key under which the watchdog module is looked up in the scanned module table
# and in the desired-state table during Phase 3.5.
WATCHDOG_MODULE_NAME = "watchdog"
|
|
49
|
+
|
|
33
50
|
|
|
34
51
|
class Launcher:
|
|
35
52
|
"""Kite system entry point. Starts Registry, manages modules, exposes API."""
|
|
36
53
|
|
|
37
54
|
def __init__(self, kite_token: str):
|
|
38
55
|
self.kite_token = kite_token
|
|
39
|
-
self.project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
40
56
|
self.instance_id = str(os.getpid())
|
|
41
|
-
|
|
57
|
+
os.environ["KITE_INSTANCE"] = self.instance_id
|
|
58
|
+
|
|
59
|
+
# Resolve instance workspace (must happen before ProcessManager init)
|
|
60
|
+
self._resolve_instance_dir()
|
|
61
|
+
os.environ["KITE_MODULE_DATA"] = os.path.join(
|
|
62
|
+
os.environ["KITE_INSTANCE_DIR"], "launcher",
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
self.process_manager = ProcessManager(
|
|
66
|
+
kite_token, self.instance_id,
|
|
67
|
+
on_kite_message=self._on_kite_message,
|
|
68
|
+
)
|
|
42
69
|
self.module_scanner = ModuleScanner(
|
|
43
|
-
self.project_root,
|
|
44
70
|
discovery=self._load_discovery(),
|
|
45
71
|
)
|
|
46
72
|
|
|
@@ -49,36 +75,158 @@ class Launcher:
|
|
|
49
75
|
self.modules: dict[str, ModuleInfo] = {}
|
|
50
76
|
self._shutdown_event = asyncio.Event()
|
|
51
77
|
self._thread_shutdown = threading.Event()
|
|
78
|
+
self._shutdown_complete = threading.Event() # Set when normal shutdown finishes
|
|
52
79
|
self._api_server: uvicorn.Server | None = None
|
|
53
80
|
self._api_ready = threading.Event()
|
|
54
|
-
self._fail_counts: dict[str, int] = {} # module_name -> consecutive failure count
|
|
55
81
|
self._module_tokens: dict[str, str] = {} # module_name -> per-module token
|
|
56
82
|
|
|
57
83
|
# Three-layer state model: desired_state per module
|
|
58
|
-
# Initialized from config_state: enabled→running, manual→stopped, disabled→stopped
|
|
59
84
|
self._desired_states: dict[str, str] = {} # module_name -> "running" | "stopped"
|
|
60
85
|
|
|
61
86
|
# Event Hub WebSocket client
|
|
62
87
|
self._event_hub_ws_url: str = ""
|
|
88
|
+
self._launcher_ws_token: str = ""
|
|
63
89
|
self._ws: object | None = None
|
|
64
90
|
self._ws_task: asyncio.Task | None = None
|
|
65
91
|
self._loop: asyncio.AbstractEventLoop | None = None
|
|
66
92
|
|
|
67
93
|
# Event waiters: {event_key: (asyncio.Event, data_dict)}
|
|
68
|
-
# event_key format: "event_type:module_id"
|
|
69
94
|
self._event_waiters: dict[str, tuple[asyncio.Event, dict]] = {}
|
|
70
95
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
96
|
+
# Module ready times: module_name -> seconds from start to ready
|
|
97
|
+
self._ready_times: dict[str, float] = {}
|
|
98
|
+
|
|
99
|
+
# Shared HTTP client for Registry communication (lazy-init, reuses TCP connections)
|
|
100
|
+
self._http: httpx.AsyncClient | None = None
|
|
101
|
+
|
|
102
|
+
# Module exit reasons: module_name -> reason string (for modules that sent module.exiting)
|
|
103
|
+
self._exit_reasons: dict[str, str] = {}
|
|
104
|
+
|
|
105
|
+
# Graceful shutdown capability: module_name -> True if module declared support
|
|
106
|
+
# Registry and Event Hub default to True (they start before Watchdog can observe)
|
|
107
|
+
self._graceful_modules: dict[str, bool] = {"registry": True, "event_hub": True}
|
|
108
|
+
|
|
109
|
+
# System-wide shutdown flag: prevents Watchdog restart during shutdown
|
|
110
|
+
self._system_shutting_down = False
|
|
111
|
+
|
|
112
|
+
# Kite stdout message waiters: {waiter_key: (threading.Event, data_dict)}
|
|
113
|
+
# Used by ProcessManager stdout callback (cross-thread)
|
|
114
|
+
self._msg_waiters: dict[str, tuple[threading.Event, dict]] = {}
|
|
115
|
+
|
|
116
|
+
suffix = self.process_manager.instance_suffix
|
|
117
|
+
state_dir = os.path.join(os.environ["KITE_INSTANCE_DIR"], "launcher", "state")
|
|
118
|
+
os.makedirs(state_dir, exist_ok=True)
|
|
119
|
+
self._lifecycle_log = os.path.join(state_dir, f"lifecycle{suffix}.jsonl")
|
|
120
|
+
# Clear lifecycle log on startup (like latest.log)
|
|
121
|
+
try:
|
|
122
|
+
with open(self._lifecycle_log, "w", encoding="utf-8") as f:
|
|
123
|
+
pass
|
|
124
|
+
except Exception:
|
|
125
|
+
pass
|
|
126
|
+
os.environ["KITE_INSTANCE_SUFFIX"] = suffix
|
|
74
127
|
self._app = self._create_api_app()
|
|
75
128
|
|
|
129
|
+
@staticmethod
|
|
130
|
+
def _fmt_elapsed(seconds: float) -> str:
|
|
131
|
+
"""Format elapsed seconds: <1s → 'NNNms', >=1s → 'N.Ns', >=10s → 'NNs'."""
|
|
132
|
+
if seconds < 1:
|
|
133
|
+
return f"{seconds * 1000:.0f}ms"
|
|
134
|
+
if seconds < 10:
|
|
135
|
+
return f"{seconds:.1f}s"
|
|
136
|
+
return f"{seconds:.0f}s"
|
|
137
|
+
|
|
138
|
+
# ── Instance workspace resolution ──
|
|
139
|
+
|
|
140
|
+
@staticmethod
|
|
141
|
+
def _resolve_instance_dir():
|
|
142
|
+
"""Resolve KITE_INSTANCE_DIR from KITE_WORKSPACE + KITE_CWD.
|
|
143
|
+
Algorithm: take CWD basename, find matching dir in workspace via .cwd file,
|
|
144
|
+
or create new one. Sets KITE_INSTANCE_DIR env var.
|
|
145
|
+
"""
|
|
146
|
+
if os.environ.get("KITE_INSTANCE_DIR"):
|
|
147
|
+
return # already set (e.g. by tests or parent)
|
|
148
|
+
|
|
149
|
+
cwd = os.environ.get("KITE_CWD", os.getcwd())
|
|
150
|
+
workspace = os.environ.get("KITE_WORKSPACE", "")
|
|
151
|
+
if not workspace:
|
|
152
|
+
home = os.environ.get("HOME") or os.environ.get("USERPROFILE") or os.path.expanduser("~")
|
|
153
|
+
workspace = os.path.join(home, ".kite", "workspace")
|
|
154
|
+
os.environ["KITE_WORKSPACE"] = workspace
|
|
155
|
+
|
|
156
|
+
basename = os.path.basename(cwd.rstrip(os.sep)) or "default"
|
|
157
|
+
suffix = 0
|
|
158
|
+
|
|
159
|
+
while True:
|
|
160
|
+
name = basename if suffix == 0 else f"{basename}~{suffix}"
|
|
161
|
+
candidate = os.path.join(workspace, name)
|
|
162
|
+
cwd_file = os.path.join(candidate, ".cwd")
|
|
163
|
+
|
|
164
|
+
if not os.path.exists(candidate):
|
|
165
|
+
# Empty slot — create new workspace
|
|
166
|
+
os.makedirs(candidate, exist_ok=True)
|
|
167
|
+
with open(cwd_file, "w", encoding="utf-8") as f:
|
|
168
|
+
f.write(cwd)
|
|
169
|
+
os.environ["KITE_INSTANCE_DIR"] = candidate
|
|
170
|
+
return
|
|
171
|
+
|
|
172
|
+
if os.path.isfile(cwd_file):
|
|
173
|
+
try:
|
|
174
|
+
with open(cwd_file, "r", encoding="utf-8") as f:
|
|
175
|
+
if f.read().strip() == cwd:
|
|
176
|
+
os.environ["KITE_INSTANCE_DIR"] = candidate
|
|
177
|
+
return
|
|
178
|
+
except Exception:
|
|
179
|
+
pass
|
|
180
|
+
|
|
181
|
+
suffix += 1
|
|
182
|
+
|
|
183
|
+
# ── Kite stdout message callback ──
|
|
184
|
+
|
|
185
|
+
def _on_kite_message(self, module_name: str, msg: dict):
|
|
186
|
+
"""Called by ProcessManager stdout reader thread when a kite message is detected.
|
|
187
|
+
Thread-safe: only touches _msg_waiters (dict + threading.Event).
|
|
188
|
+
"""
|
|
189
|
+
kite_type = msg.get("kite", "")
|
|
190
|
+
key = f"{module_name}:{kite_type}"
|
|
191
|
+
waiter = self._msg_waiters.get(key)
|
|
192
|
+
if waiter:
|
|
193
|
+
waiter[1].update(msg)
|
|
194
|
+
waiter[0].set()
|
|
195
|
+
|
|
196
|
+
async def _wait_kite_message(self, module_name: str, kite_type: str,
|
|
197
|
+
timeout: float) -> dict | None:
|
|
198
|
+
"""Wait for a kite stdout message from a module. Returns msg dict or None on timeout.
|
|
199
|
+
Checks shutdown flag every 0.5s so Ctrl+C is responsive even during Phase 1-2 waits.
|
|
200
|
+
"""
|
|
201
|
+
key = f"{module_name}:{kite_type}"
|
|
202
|
+
evt = threading.Event()
|
|
203
|
+
data = {}
|
|
204
|
+
self._msg_waiters[key] = (evt, data)
|
|
205
|
+
shutdown = self._thread_shutdown
|
|
206
|
+
try:
|
|
207
|
+
def _wait():
|
|
208
|
+
deadline = time.monotonic() + timeout
|
|
209
|
+
while time.monotonic() < deadline:
|
|
210
|
+
if evt.wait(timeout=0.5):
|
|
211
|
+
return True
|
|
212
|
+
if shutdown.is_set():
|
|
213
|
+
return False
|
|
214
|
+
return False
|
|
215
|
+
got = await asyncio.get_running_loop().run_in_executor(None, _wait)
|
|
216
|
+
return data if got else None
|
|
217
|
+
finally:
|
|
218
|
+
self._msg_waiters.pop(key, None)
|
|
219
|
+
|
|
76
220
|
# ── Public entry ──
|
|
77
221
|
|
|
78
222
|
def run(self):
|
|
79
223
|
"""Synchronous entry point. Sets up signals, runs the async main loop."""
|
|
80
|
-
print("[launcher]
|
|
81
|
-
|
|
224
|
+
print("[launcher] ── 环境 ──")
|
|
225
|
+
for key in sorted(k for k in os.environ if k.startswith("KITE_")):
|
|
226
|
+
print(f"[launcher] {key} = {os.environ[key]}")
|
|
227
|
+
print(f"[launcher] PID = {os.getpid()}")
|
|
228
|
+
print(f"[launcher] PYTHON = {sys.executable}")
|
|
229
|
+
print(f"[launcher] PLATFORM = {sys.platform}")
|
|
82
230
|
|
|
83
231
|
if IS_WINDOWS:
|
|
84
232
|
self._setup_windows_exit()
|
|
@@ -89,116 +237,505 @@ class Launcher:
|
|
|
89
237
|
asyncio.run(self._async_main())
|
|
90
238
|
except KeyboardInterrupt:
|
|
91
239
|
pass
|
|
240
|
+
except RuntimeError as e:
|
|
241
|
+
print(f"[launcher] 启动失败: {e}")
|
|
92
242
|
finally:
|
|
93
243
|
self._final_cleanup()
|
|
94
244
|
|
|
245
|
+
def _request_shutdown(self, reason: str = ""):
|
|
246
|
+
"""Request graceful shutdown. Thread-safe — can be called from signal handler or any thread."""
|
|
247
|
+
if self._thread_shutdown.is_set():
|
|
248
|
+
return # already shutting down
|
|
249
|
+
print(f"[launcher] {reason or '收到关闭请求'}")
|
|
250
|
+
self._thread_shutdown.set()
|
|
251
|
+
# Wake up asyncio event loop immediately (so _monitor_loop / wait_for exits)
|
|
252
|
+
loop = self._loop
|
|
253
|
+
if loop and not loop.is_closed():
|
|
254
|
+
try:
|
|
255
|
+
loop.call_soon_threadsafe(self._shutdown_event.set)
|
|
256
|
+
except RuntimeError:
|
|
257
|
+
pass
|
|
258
|
+
# Safety net: force exit after 10s only if normal shutdown hasn't completed
|
|
259
|
+
def _force():
|
|
260
|
+
if self._shutdown_complete.wait(timeout=10):
|
|
261
|
+
return # Normal shutdown completed — no need to force
|
|
262
|
+
try:
|
|
263
|
+
pm = self.process_manager
|
|
264
|
+
still = [n for n in pm._processes if pm.is_running(n)]
|
|
265
|
+
except Exception:
|
|
266
|
+
still = []
|
|
267
|
+
if still:
|
|
268
|
+
print(f"[launcher] 关闭超时,以下模块仍在运行: {', '.join(still)},强制退出")
|
|
269
|
+
else:
|
|
270
|
+
print("[launcher] 关闭超时,强制退出")
|
|
271
|
+
os._exit(1)
|
|
272
|
+
threading.Thread(target=_force, daemon=True).start()
|
|
273
|
+
|
|
95
274
|
def _setup_unix_signals(self):
|
|
96
275
|
"""Register SIGTERM/SIGINT handlers on Linux/macOS."""
|
|
97
276
|
def _handler(signum, frame):
|
|
98
|
-
|
|
99
|
-
self._thread_shutdown.set()
|
|
277
|
+
self._request_shutdown(f"收到信号 {signum},正在关闭...")
|
|
100
278
|
signal.signal(signal.SIGTERM, _handler)
|
|
101
279
|
signal.signal(signal.SIGINT, _handler)
|
|
102
280
|
|
|
103
281
|
def _setup_windows_exit(self):
    """Wire up Windows exit triggers: console Ctrl+C/Ctrl+Break and the 'q' key.

    Why not signal.signal(SIGINT)?
    Python's signal delivery requires the main thread to be executing bytecode.
    When the main thread is blocked in C code (asyncio ProactorEventLoop →
    GetQueuedCompletionStatus), SIGINT is never delivered.
    SetConsoleCtrlHandler runs its callback in a separate OS thread, so it
    always works regardless of what the main thread is doing.

    A daemon thread additionally polls the keyboard via msvcrt every 0.1s
    and requests shutdown when 'q' or 'Q' is pressed.
    """
    import ctypes

    def _on_console_ctrl(ctrl_type):
        # 0 = CTRL_C_EVENT, 1 = CTRL_BREAK_EVENT
        if ctrl_type not in (0, 1):
            return 0
        self._request_shutdown("收到 Ctrl+C,正在关闭...")
        return 1  # handled — prevent default (which kills the process)

    _ctrl_handler = ctypes.WINFUNCTYPE(ctypes.c_int, ctypes.c_uint)(_on_console_ctrl)

    # Keep a reference on the instance to prevent GC of the C callback.
    self._ctrl_handler_ref = _ctrl_handler
    ctypes.windll.kernel32.SetConsoleCtrlHandler(_ctrl_handler, 1)

    # 'q' key: handle via msvcrt polling
    def _listen():
        import msvcrt
        while not self._thread_shutdown.is_set():
            if not msvcrt.kbhit():
                time.sleep(0.1)
                continue
            key = msvcrt.getch()
            if key in (b'q', b'Q'):
                self._request_shutdown("收到退出请求,正在关闭...")
                return
            time.sleep(0.1)

    key_thread = threading.Thread(target=_listen, daemon=True)
    key_thread.start()
|
|
117
315
|
|
|
118
|
-
# ── Async main ──
|
|
316
|
+
# ── Async main (4-Phase startup) ──
|
|
119
317
|
|
|
120
318
|
async def _async_main(self):
    """Run the full 4-phase startup sequence, then the monitor loop.

    Order of operations:
      1. Validate core modules and clean up leftovers of previous instances
         (local cleanup synchronously, cross-directory cleanup in an executor).
      2. Phase 1+2: bootstrap Registry + Event Hub.
      3. Phase 3: wait for Registry's delayed ready.
      4. Derive each module's desired state from its configured state.
      5. Phase 3.5: wait for (or start) the watchdog module.
      6. Phase 4: start the remaining enabled modules.
      7. Persist process records, start the heartbeat task, collect cleanup
         statistics, print the startup report, publish ``system.ready`` and
         enter the monitor loop.

    After every phase the shutdown event is checked and the method returns
    early if it is set. Whatever happens inside the ``try`` block — normal
    return, early return or exception — ``_graceful_shutdown_all()`` is
    attempted in the ``finally`` clause; errors from it are printed, not raised.
    """
    self._loop = asyncio.get_running_loop()
    t_start = time.monotonic()  # monotonic reference for elapsed-time reporting
    self._start_unix = time.time()  # wall-clock start time
    phase_times = {}  # phase label -> elapsed seconds, passed to the startup report
    G = "\033[32m"  # ANSI green
    R = "\033[0m"  # ANSI reset

    # Validate core modules exist (mechanism 12)
    self._validate_core_modules()

    # Cleanup leftovers from previous instances (current instance dir)
    local_cleaned = self.process_manager.cleanup_leftovers()

    # Cross-directory leftover cleanup (background, non-blocking)
    # run_in_executor returns a Future (not coroutine), so use ensure_future
    self._global_cleanup_task = asyncio.ensure_future(
        asyncio.get_running_loop().run_in_executor(
            None, self.process_manager.cleanup_global_leftovers
        )
    )

    try:
        # Phase 1+2: Registry + Event Hub parallel bootstrap
        t0 = time.monotonic()
        await self._phase1_parallel_bootstrap()
        elapsed_p1 = time.monotonic() - t0
        phase_times["Phase 1+2: Registry + Event Hub (并行)"] = elapsed_p1
        print(f"{G}[launcher] ✓ Phase 1+2 完成: Registry + Event Hub 已就绪 ({elapsed_p1:.2f}s){R}")
        if self._shutdown_event.is_set(): return

        # Phase 3: Wait for Registry delayed ready
        t0 = time.monotonic()
        await self._phase3_registry_ready()
        elapsed = time.monotonic() - t0
        phase_times["Phase 3: Registry 事件总线"] = elapsed
        print(f"{G}[launcher] ✓ Phase 3 完成: Registry 已连接事件总线 ({elapsed:.2f}s){R}")
        if self._shutdown_event.is_set(): return

        # Initialize desired_state from config_state (needed before Phase 3.5)
        for name, info in self.modules.items():
            if info.state == "enabled":
                self._desired_states[name] = "running"
            else:  # manual, disabled
                self._desired_states[name] = "stopped"
        # Core modules are already running
        for cn in CORE_MODULE_NAMES:
            self._desired_states[cn] = "running"

        # Phase 3.5: Watchdog ready
        # If started in parallel (Phase 1), just wait for module.ready
        # Otherwise start it now (fallback)
        watchdog_info = self.modules.get(WATCHDOG_MODULE_NAME)
        if watchdog_info and self._desired_states.get(WATCHDOG_MODULE_NAME) == "running":
            t0 = time.monotonic()
            # getattr with a default: the flag is only assigned by the Phase 1 bootstrap.
            if getattr(self, '_watchdog_parallel', False):
                print(f"[launcher] Phase 3.5: Watchdog 已并行启动,等待就绪...")
                ready = await self._wait_event("module.ready", "watchdog", timeout=15)
                elapsed = time.monotonic() - t0
                # A result carrying "_exited" is treated the same as no result.
                if ready and not ready.get("_exited"):
                    self._graceful_modules["watchdog"] = bool(ready.get("graceful_shutdown"))
                    self._ready_times["watchdog"] = elapsed
                    print(f"[launcher] Watchdog 已就绪")
                    self._log_lifecycle("started", "watchdog")
                    await self._publish_event("module.started", {"module_id": "watchdog"})
                    self.process_manager.close_stdio("watchdog")
                else:
                    print(f"[launcher] 警告: Watchdog 在 15s 内未就绪")
            else:
                print(f"[launcher] Phase 3.5: 启动 Watchdog...")
                await self._start_one_module(watchdog_info)
            elapsed = time.monotonic() - t0
            print(f"{G}[launcher] ✓ Phase 3.5 完成: Watchdog ({elapsed:.2f}s){R}")
            if self._shutdown_event.is_set(): return

        # Phase 4: Start remaining enabled modules
        t0 = time.monotonic()
        await self._phase4_start_modules()
        elapsed = time.monotonic() - t0
        phase_times["Phase 4: Extensions"] = elapsed
        print(f"{G}[launcher] ✓ Phase 4 完成: 扩展模块已启动 ({elapsed:.2f}s){R}")
        if self._shutdown_event.is_set(): return

        # Post-startup
        self.process_manager.persist_records()
        self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())

        # Wait for global leftover cleanup to finish (non-blocking with timeout)
        global_cleaned = {}
        if hasattr(self, '_global_cleanup_task'):
            try:
                # "or {}" covers a cleanup routine that returns None.
                global_cleaned = await asyncio.wait_for(self._global_cleanup_task, timeout=5) or {}
            except asyncio.TimeoutError:
                print("[launcher] 警告: 全局遗留清理超时 (5s),跳过")
            except Exception as e:
                print(f"[launcher] 警告: 全局遗留清理出错: {e}")
        # Merge local + global cleanup stats
        cleaned_stats: dict[str, int] = {}
        for d in (local_cleaned, global_cleaned):
            for k, v in d.items():
                cleaned_stats[k] = cleaned_stats.get(k, 0) + v

        # Global instance scan (via executor to avoid blocking)
        global_instances = await asyncio.get_running_loop().run_in_executor(
            None, self.process_manager.get_global_instances
        )

        # ── Startup report ──
        total_time = time.monotonic() - t_start
        await self._print_startup_report(total_time, phase_times,
                                         global_instances=global_instances,
                                         cleaned_stats=cleaned_stats)
        # Notify all modules that system startup is complete
        await self._publish_event("system.ready", {
            "startup_time": round(total_time, 2),
        })

        print("[launcher] 进入监控循环 (按 Ctrl+C 或 'q' 退出)")
        await self._monitor_loop()
    finally:
        # Runs on normal exit, early return and exceptions alike.
        try:
            await self._graceful_shutdown_all()
        except Exception as e:
            print(f"[launcher] 优雅关闭出错: {e}")
|
|
132
443
|
|
|
133
|
-
|
|
134
|
-
await self._register_self()
|
|
444
|
+
# ── Phase 1+2: Parallel bootstrap (Registry + Event Hub) ──
|
|
135
445
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
for name, info in self.modules.items():
|
|
139
|
-
self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
|
|
140
|
-
print(f"[launcher] Found {len(self.modules)} module(s): {', '.join(self.modules.keys()) or '(none)'}")
|
|
446
|
+
async def _phase1_parallel_bootstrap(self):
|
|
447
|
+
"""Start Registry + Event Hub processes in parallel to overlap cold-start time.
|
|
141
448
|
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
449
|
+
Flow:
|
|
450
|
+
1. Start Registry + Event Hub processes simultaneously
|
|
451
|
+
2. Wait for Registry to report port via stdout
|
|
452
|
+
3. Set KITE_REGISTRY_PORT env (for Phase 3.5/4 modules) + start API
|
|
453
|
+
4. Scan modules + register self & tokens (parallel)
|
|
454
|
+
5. Send launcher_ws_token + registry_port to Event Hub via stdin
|
|
455
|
+
6. Wait for Event Hub ws_endpoint → WS connect → module.ready
|
|
456
|
+
"""
|
|
457
|
+
t_registry = time.monotonic()
|
|
148
458
|
|
|
149
|
-
# Step
|
|
150
|
-
|
|
459
|
+
# ── Step 1: Start both processes ──
|
|
460
|
+
registry_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "registry")
|
|
461
|
+
registry_info = ModuleInfo(
|
|
462
|
+
name="registry",
|
|
463
|
+
display_name="Registry",
|
|
464
|
+
type="infrastructure",
|
|
465
|
+
state="enabled",
|
|
466
|
+
runtime="python",
|
|
467
|
+
entry="entry.py",
|
|
468
|
+
module_dir=registry_dir,
|
|
469
|
+
)
|
|
470
|
+
boot_info_registry = {"token": self.kite_token}
|
|
471
|
+
self._log_lifecycle("starting", "registry")
|
|
472
|
+
ok = self.process_manager.start_module(registry_info, boot_info=boot_info_registry)
|
|
473
|
+
if not ok:
|
|
474
|
+
self._log_lifecycle("start_failed", "registry")
|
|
475
|
+
raise RuntimeError("启动 Registry 失败")
|
|
476
|
+
|
|
477
|
+
# Start Event Hub in parallel (before Registry port is known)
|
|
478
|
+
eh_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "event_hub")
|
|
479
|
+
eh_info = ModuleInfo(
|
|
480
|
+
name="event_hub",
|
|
481
|
+
display_name="Event Hub",
|
|
482
|
+
type="infrastructure",
|
|
483
|
+
state="enabled",
|
|
484
|
+
runtime="python",
|
|
485
|
+
entry="entry.py",
|
|
486
|
+
module_dir=eh_dir,
|
|
487
|
+
)
|
|
488
|
+
# Generate Event Hub token early (will register to Registry once it's up)
|
|
489
|
+
eh_token = secrets.token_hex(32)
|
|
490
|
+
self._module_tokens["event_hub"] = eh_token
|
|
491
|
+
boot_info_eh = {"token": eh_token}
|
|
492
|
+
self._log_lifecycle("starting", "event_hub")
|
|
493
|
+
ok = self.process_manager.start_module(eh_info, boot_info=boot_info_eh)
|
|
494
|
+
if not ok:
|
|
495
|
+
self._log_lifecycle("start_failed", "event_hub")
|
|
496
|
+
raise RuntimeError("启动 Event Hub 失败")
|
|
497
|
+
|
|
498
|
+
# Start Watchdog in parallel (before Registry port is known)
|
|
499
|
+
# Watchdog will block on stdin waiting for registry_port
|
|
500
|
+
watchdog_dir = os.path.join(os.environ["KITE_PROJECT"], "extensions", "services", "watchdog")
|
|
501
|
+
watchdog_md = os.path.join(watchdog_dir, "module.md")
|
|
502
|
+
self._watchdog_parallel = False # track whether watchdog was started in parallel
|
|
503
|
+
if os.path.isfile(watchdog_md):
|
|
504
|
+
wd_token = secrets.token_hex(32)
|
|
505
|
+
self._module_tokens["watchdog"] = wd_token
|
|
506
|
+
# Parse watchdog module.md for ModuleInfo
|
|
507
|
+
try:
|
|
508
|
+
with open(watchdog_md, "r", encoding="utf-8") as f:
|
|
509
|
+
wd_fm = _parse_frontmatter(f.read())
|
|
510
|
+
wd_info = ModuleInfo(
|
|
511
|
+
name="watchdog",
|
|
512
|
+
display_name=wd_fm.get("display_name", "Watchdog"),
|
|
513
|
+
type=wd_fm.get("type", "service"),
|
|
514
|
+
state="enabled",
|
|
515
|
+
runtime=wd_fm.get("runtime", "python"),
|
|
516
|
+
entry=wd_fm.get("entry", "entry.py"),
|
|
517
|
+
module_dir=watchdog_dir,
|
|
518
|
+
)
|
|
519
|
+
boot_info_wd = {"token": wd_token}
|
|
520
|
+
self._log_lifecycle("starting", "watchdog")
|
|
521
|
+
ok = self.process_manager.start_module(wd_info, boot_info=boot_info_wd)
|
|
522
|
+
if ok:
|
|
523
|
+
self._watchdog_parallel = True
|
|
524
|
+
else:
|
|
525
|
+
self._log_lifecycle("start_failed", "watchdog")
|
|
526
|
+
print("[launcher] 警告: Watchdog 并行启动失败,将在 Phase 3.5 重试")
|
|
527
|
+
except Exception as e:
|
|
528
|
+
print(f"[launcher] 警告: Watchdog module.md 解析失败: {e}")
|
|
151
529
|
|
|
152
|
-
|
|
153
|
-
|
|
530
|
+
parallel_modules = "Registry + Event Hub" + (" + Watchdog" if self._watchdog_parallel else "")
|
|
531
|
+
print(f"[launcher] {parallel_modules} 进程已同时启动,等待 Registry 端口...")
|
|
154
532
|
|
|
155
|
-
#
|
|
533
|
+
# Persist immediately after starting core processes
|
|
156
534
|
self.process_manager.persist_records()
|
|
157
535
|
|
|
158
|
-
# Step
|
|
159
|
-
await self.
|
|
536
|
+
# ── Step 2: Wait for Registry port ──
|
|
537
|
+
msg = await self._wait_kite_message("registry", "port", timeout=6)
|
|
538
|
+
if not msg or not msg.get("port"):
|
|
539
|
+
raise RuntimeError("致命错误: Registry 在 6s 内未报告端口")
|
|
540
|
+
self.registry_port = int(msg["port"])
|
|
541
|
+
self._ready_times["registry"] = time.monotonic() - t_registry
|
|
542
|
+
_wait_s = time.monotonic() - t_registry
|
|
543
|
+
print(f"[launcher] Registry 端口: {self.registry_port} (等待 {self._fmt_elapsed(_wait_s)})")
|
|
544
|
+
|
|
545
|
+
# ── Step 3: Set env + start API + immediately unblock Event Hub ──
|
|
546
|
+
os.environ["KITE_REGISTRY_PORT"] = str(self.registry_port)
|
|
547
|
+
self._start_api_thread()
|
|
160
548
|
|
|
161
|
-
#
|
|
162
|
-
self.
|
|
549
|
+
# Send launcher_ws_token + registry_port to Event Hub ASAP (unblock it)
|
|
550
|
+
self._launcher_ws_token = secrets.token_hex(32)
|
|
551
|
+
self.process_manager.write_stdin("event_hub", {
|
|
552
|
+
"kite": "launcher_ws_token",
|
|
553
|
+
"launcher_ws_token": self._launcher_ws_token,
|
|
554
|
+
})
|
|
555
|
+
self.process_manager.write_stdin("event_hub", {
|
|
556
|
+
"kite": "registry_port",
|
|
557
|
+
"registry_port": self.registry_port,
|
|
558
|
+
})
|
|
163
559
|
|
|
164
|
-
#
|
|
165
|
-
|
|
166
|
-
|
|
560
|
+
# Send registry_port to Watchdog via stdin (if started in parallel)
|
|
561
|
+
# Watchdog will retry querying launcher.api_endpoint until it's available
|
|
562
|
+
if self.process_manager.is_running("watchdog"):
|
|
563
|
+
self.process_manager.write_stdin("watchdog", {
|
|
564
|
+
"kite": "registry_port",
|
|
565
|
+
"registry_port": self.registry_port,
|
|
566
|
+
})
|
|
167
567
|
|
|
168
|
-
#
|
|
169
|
-
|
|
568
|
+
# ── Step 4: Scan + register tokens ‖ wait for Event Hub ws_endpoint (parallel) ──
|
|
569
|
+
# Pre-register ws_endpoint waiter BEFORE gather to avoid race condition:
|
|
570
|
+
# module_scanner.scan() is synchronous and blocks the event loop,
|
|
571
|
+
# so the _wait_event_hub_endpoint coroutine wouldn't register its waiter in time.
|
|
572
|
+
ws_waiter_key = "event_hub:ws_endpoint"
|
|
573
|
+
ws_evt = threading.Event()
|
|
574
|
+
ws_data: dict = {}
|
|
575
|
+
self._msg_waiters[ws_waiter_key] = (ws_evt, ws_data)
|
|
576
|
+
|
|
577
|
+
async def _scan_and_register_tokens():
|
|
578
|
+
t_scan = time.monotonic()
|
|
579
|
+
self.modules = self.module_scanner.scan()
|
|
580
|
+
for name, info in self.modules.items():
|
|
581
|
+
self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
|
|
582
|
+
_scan_s = time.monotonic() - t_scan
|
|
583
|
+
print(f"[launcher] 发现 {len(self.modules)} 个模块: {', '.join(self.modules.keys()) or '(无)'} (扫描 {self._fmt_elapsed(_scan_s)})")
|
|
584
|
+
t_reg = time.monotonic()
|
|
585
|
+
await self._register_module_tokens()
|
|
586
|
+
_reg_s = time.monotonic() - t_reg
|
|
587
|
+
print(f"[launcher] 令牌注册完成 ({self._fmt_elapsed(_reg_s)})")
|
|
588
|
+
|
|
589
|
+
async def _wait_event_hub_endpoint():
|
|
590
|
+
t_wait_eh = time.monotonic()
|
|
591
|
+
print("[launcher] 等待 Event Hub ws_endpoint...")
|
|
592
|
+
shutdown = self._thread_shutdown
|
|
593
|
+
def _wait():
|
|
594
|
+
deadline = time.monotonic() + 10
|
|
595
|
+
while time.monotonic() < deadline:
|
|
596
|
+
if ws_evt.wait(timeout=0.5):
|
|
597
|
+
return True
|
|
598
|
+
if shutdown.is_set():
|
|
599
|
+
return False
|
|
600
|
+
return False
|
|
601
|
+
got = await asyncio.get_running_loop().run_in_executor(None, _wait)
|
|
602
|
+
self._msg_waiters.pop(ws_waiter_key, None)
|
|
603
|
+
if not got or not ws_data.get("ws_endpoint"):
|
|
604
|
+
raise RuntimeError("致命错误: Event Hub 在 10s 内未报告 ws_endpoint")
|
|
605
|
+
self._event_hub_ws_url = ws_data["ws_endpoint"]
|
|
606
|
+
_eh_s = time.monotonic() - t_wait_eh
|
|
607
|
+
print(f"[launcher] Event Hub 已发现: {self._event_hub_ws_url} (等待 {self._fmt_elapsed(_eh_s)})")
|
|
608
|
+
|
|
609
|
+
# Run all three in parallel: register_self + scan_tokens + wait_event_hub
|
|
610
|
+
await asyncio.gather(
|
|
611
|
+
self._register_self(),
|
|
612
|
+
_scan_and_register_tokens(),
|
|
613
|
+
_wait_event_hub_endpoint(),
|
|
614
|
+
)
|
|
615
|
+
if self._shutdown_event.is_set():
|
|
616
|
+
return
|
|
617
|
+
|
|
618
|
+
# ── Step 5: WS connect → module.ready ──
|
|
619
|
+
t_eh = time.monotonic()
|
|
620
|
+
self._ws_task = asyncio.create_task(self._ws_loop())
|
|
621
|
+
|
|
622
|
+
# Wait for Event Hub module.ready (sent when Launcher connects)
|
|
623
|
+
ready = await self._wait_event("module.ready", "event_hub", timeout=15)
|
|
624
|
+
if ready:
|
|
625
|
+
self._graceful_modules["event_hub"] = bool(ready.get("graceful_shutdown"))
|
|
626
|
+
print("[launcher] Event Hub 已就绪")
|
|
627
|
+
else:
|
|
628
|
+
print("[launcher] 警告: Event Hub 在 15s 内未发送 module.ready")
|
|
170
629
|
|
|
171
|
-
|
|
630
|
+
self._ready_times["event_hub"] = time.monotonic() - t_eh
|
|
631
|
+
self._log_lifecycle("started", "event_hub")
|
|
632
|
+
await self._publish_event("module.started", {"module_id": "event_hub"})
|
|
633
|
+
self.process_manager.close_stdio("event_hub")
|
|
172
634
|
|
|
173
|
-
|
|
174
|
-
""
|
|
175
|
-
|
|
635
|
+
# Store eh_info in modules dict if not already present (from scan)
|
|
636
|
+
if "event_hub" not in self.modules:
|
|
637
|
+
self.modules["event_hub"] = eh_info
|
|
638
|
+
|
|
639
|
+
def _get_http(self) -> httpx.AsyncClient:
|
|
640
|
+
"""Get shared HTTP client (lazy-init, reuses TCP connections to Registry)."""
|
|
641
|
+
if self._http is None or self._http.is_closed:
|
|
642
|
+
self._http = httpx.AsyncClient(timeout=5)
|
|
643
|
+
return self._http
|
|
644
|
+
|
|
645
|
+
async def _close_http(self):
|
|
646
|
+
"""Close shared HTTP client."""
|
|
647
|
+
if self._http and not self._http.is_closed:
|
|
648
|
+
await self._http.aclose()
|
|
649
|
+
self._http = None
|
|
650
|
+
|
|
651
|
+
async def _register_self(self):
|
|
652
|
+
"""Register Launcher itself to Registry."""
|
|
653
|
+
url = f"http://127.0.0.1:{self.registry_port}/modules"
|
|
176
654
|
headers = {"Authorization": f"Bearer {self.kite_token}"}
|
|
655
|
+
payload = {
|
|
656
|
+
"action": "register",
|
|
657
|
+
"module_id": "launcher",
|
|
658
|
+
"module_type": "infrastructure",
|
|
659
|
+
"name": "Launcher",
|
|
660
|
+
"api_endpoint": f"http://127.0.0.1:{self.api_port}",
|
|
661
|
+
"health_endpoint": "/launcher/modules",
|
|
662
|
+
"events_publish": {
|
|
663
|
+
"module.started": {},
|
|
664
|
+
"module.stopped": {},
|
|
665
|
+
"module.state_changed": {},
|
|
666
|
+
},
|
|
667
|
+
"events_subscribe": [">"],
|
|
668
|
+
}
|
|
669
|
+
try:
|
|
670
|
+
client = self._get_http()
|
|
671
|
+
resp = await client.post(url, json=payload, headers=headers)
|
|
672
|
+
if resp.status_code == 200:
|
|
673
|
+
print("[launcher] 已注册到 Registry")
|
|
674
|
+
else:
|
|
675
|
+
print(f"[launcher] 警告: Registry 注册返回 {resp.status_code}")
|
|
676
|
+
except Exception as e:
|
|
677
|
+
print(f"[launcher] 警告: 注册到 Registry 失败: {e}")
|
|
177
678
|
|
|
178
|
-
|
|
179
|
-
print("[launcher] Waiting for Event Hub to register...")
|
|
180
|
-
deadline = time.time() + 15
|
|
181
|
-
while time.time() < deadline:
|
|
182
|
-
try:
|
|
183
|
-
async with httpx.AsyncClient() as client:
|
|
184
|
-
resp = await client.get(
|
|
185
|
-
f"{url}/get/event_hub.metadata.ws_endpoint",
|
|
186
|
-
headers=headers, timeout=3,
|
|
187
|
-
)
|
|
188
|
-
if resp.status_code == 200:
|
|
189
|
-
self._event_hub_ws_url = resp.json()
|
|
190
|
-
if self._event_hub_ws_url:
|
|
191
|
-
break
|
|
192
|
-
except Exception:
|
|
193
|
-
pass
|
|
194
|
-
await asyncio.sleep(1)
|
|
679
|
+
# ── (Phase 2 merged into _phase1_parallel_bootstrap) ──
|
|
195
680
|
|
|
196
|
-
|
|
197
|
-
|
|
681
|
+
# ── Phase 3: Registry delayed ready ──
|
|
682
|
+
|
|
683
|
+
async def _phase3_registry_ready(self):
|
|
684
|
+
"""Wait for Registry module.ready (triggered after Event Hub registers to Registry
|
|
685
|
+
and Registry connects to Event Hub WS)."""
|
|
686
|
+
print("[launcher] 等待 Registry 连接 Event Hub...")
|
|
687
|
+
ready = await self._wait_event("module.ready", "registry", timeout=12)
|
|
688
|
+
if ready:
|
|
689
|
+
self._graceful_modules["registry"] = bool(ready.get("graceful_shutdown"))
|
|
690
|
+
print("[launcher] Registry 事件总线连接完成")
|
|
691
|
+
else:
|
|
692
|
+
print("[launcher] 警告: Registry 在 12s 内未连接事件总线 (降级运行)")
|
|
693
|
+
|
|
694
|
+
self._log_lifecycle("started", "registry")
|
|
695
|
+
await self._publish_event("module.started", {"module_id": "registry"})
|
|
696
|
+
self.process_manager.close_stdio("registry")
|
|
697
|
+
|
|
698
|
+
# ── Phase 4: Start remaining modules ──
|
|
699
|
+
|
|
700
|
+
async def _phase4_start_modules(self):
    """Start enabled modules (excluding core and watchdog) in dependency order.

    Steps:
    1. Select modules whose desired state is ``"running"``.
    2. Pull in any non-disabled module that a selected module depends on,
       following dependency chains transitively, and mark it as desired
       ``"running"``. A dependency on a disabled module is reported as an
       error (the dependent module is still started, as before).
    3. Group the result into dependency layers via ``_topo_layers`` and start
       each layer; modules within one layer are started concurrently.

    A circular dependency aborts the phase with an error message.
    """
    to_start = [
        m for m in self.modules.values()
        if self._desired_states.get(m.name) == "running"
        and m.name not in CORE_MODULE_NAMES
        and m.name != WATCHDOG_MODULE_NAME
    ]
    if not to_start:
        print("[launcher] 没有额外模块需要启动")
        return

    # Auto-start manual modules if depended upon.
    # ``to_start`` grows while it is walked by index, so the dependencies of an
    # auto-added module are examined as well. Iterating over a snapshot would
    # silently skip them and start the module without its own prerequisites.
    needed = {m.name for m in to_start}
    idx = 0
    while idx < len(to_start):
        m = to_start[idx]
        idx += 1
        for dep in m.depends_on:
            if dep in needed or dep in CORE_MODULE_NAMES:
                continue
            dep_info = self.modules.get(dep)
            if not dep_info:
                # Unknown dependency: nothing to start (unchanged behavior).
                continue
            if dep_info.state == "disabled":
                print(f"[launcher] 错误: '{m.name}' 依赖已禁用的模块 '{dep}'")
                continue
            needed.add(dep)
            to_start.append(dep_info)
            self._desired_states[dep] = "running"
            print(f"[launcher] 自动启动 '{dep}' (被依赖)")

    try:
        layers = self._topo_layers(to_start)
    except RuntimeError as e:
        print(f"[launcher] 错误: {e}")
        return

    total = sum(len(layer) for layer in layers)
    print(f"[launcher] 正在启动 {total} 个模块...")
    for layer in layers:
        if len(layer) == 1:
            await self._start_one_module(layer[0])
        else:
            await asyncio.gather(*(self._start_one_module(info) for info in layer))
|
|
737
|
+
|
|
738
|
+
# ── Event Hub WebSocket connection ──
|
|
202
739
|
|
|
203
740
|
async def _ws_loop(self):
|
|
204
741
|
"""Connect to Event Hub, reconnect on failure."""
|
|
@@ -208,16 +745,19 @@ class Launcher:
|
|
|
208
745
|
except asyncio.CancelledError:
|
|
209
746
|
return
|
|
210
747
|
except Exception as e:
|
|
211
|
-
|
|
748
|
+
if not self._system_shutting_down:
|
|
749
|
+
print(f"[launcher] Event Hub 连接错误: {e}")
|
|
212
750
|
self._ws = None
|
|
213
751
|
await asyncio.sleep(5)
|
|
214
752
|
|
|
215
753
|
async def _ws_connect(self):
|
|
216
|
-
"""Single WebSocket session
|
|
217
|
-
ws_url = f"{self._event_hub_ws_url}?token={self.
|
|
218
|
-
|
|
754
|
+
"""Single WebSocket session with launcher_ws_token auth."""
|
|
755
|
+
ws_url = f"{self._event_hub_ws_url}?token={self._launcher_ws_token}&id=launcher"
|
|
756
|
+
t_ws_connect = time.monotonic()
|
|
757
|
+
async with websockets.connect(ws_url, open_timeout=3, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
|
|
219
758
|
self._ws = ws
|
|
220
|
-
|
|
759
|
+
_ws_s = time.monotonic() - t_ws_connect
|
|
760
|
+
print(f"[launcher] 已连接到 Event Hub ({self._fmt_elapsed(_ws_s)})")
|
|
221
761
|
|
|
222
762
|
# Subscribe to all events
|
|
223
763
|
await ws.send(json.dumps({
|
|
@@ -231,52 +771,89 @@ class Launcher:
|
|
|
231
771
|
msg = json.loads(raw)
|
|
232
772
|
except (json.JSONDecodeError, TypeError):
|
|
233
773
|
continue
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
waiter
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
774
|
+
try:
|
|
775
|
+
msg_type = msg.get("type", "")
|
|
776
|
+
if msg_type == "event":
|
|
777
|
+
source = msg.get("source", "unknown")
|
|
778
|
+
event = msg.get("event", "")
|
|
779
|
+
data = msg.get("data") if isinstance(msg.get("data"), dict) else {}
|
|
780
|
+
# Trigger event waiters
|
|
781
|
+
module_id = data.get("module_id", "")
|
|
782
|
+
waiter_key = f"{event}:{module_id}"
|
|
783
|
+
waiter = self._event_waiters.get(waiter_key)
|
|
784
|
+
if waiter:
|
|
785
|
+
waiter[1].update(data)
|
|
786
|
+
waiter[0].set()
|
|
787
|
+
# module.exiting also wakes module.ready waiter
|
|
788
|
+
# (module won't send ready — no point waiting)
|
|
789
|
+
if event == "module.exiting" and module_id:
|
|
790
|
+
ready_key = f"module.ready:{module_id}"
|
|
791
|
+
ready_waiter = self._event_waiters.get(ready_key)
|
|
792
|
+
if ready_waiter:
|
|
793
|
+
ready_waiter[1].update(data)
|
|
794
|
+
ready_waiter[1]["_exited"] = True
|
|
795
|
+
ready_waiter[0].set()
|
|
796
|
+
# module.crash → print red crash summary (real-time notification)
|
|
797
|
+
if event == "module.crash" and module_id:
|
|
798
|
+
RED = "\033[91m"
|
|
799
|
+
RESET = "\033[0m"
|
|
800
|
+
exc_type = data.get("exception_type", "Unknown")
|
|
801
|
+
preview = data.get("traceback_preview", "")
|
|
802
|
+
severity = data.get("severity", "error")
|
|
803
|
+
print(f"[launcher] {RED}模块 '{module_id}' 崩溃: "
|
|
804
|
+
f"{exc_type} — {preview}{RESET}")
|
|
805
|
+
_suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
|
|
806
|
+
crash_log = os.path.join(
|
|
807
|
+
os.environ.get("KITE_INSTANCE_DIR", ""),
|
|
808
|
+
module_id, "log", f"crashes{_suffix}.jsonl"
|
|
809
|
+
)
|
|
810
|
+
print(f"[launcher] 崩溃日志: {crash_log}")
|
|
811
|
+
ts = msg.get("timestamp", "")
|
|
812
|
+
# Only log system events (module.*, watchdog.*) to avoid flooding
|
|
813
|
+
# from benchmark/test traffic
|
|
814
|
+
if not (event.startswith("module.") or event.startswith("watchdog.")):
|
|
815
|
+
continue
|
|
816
|
+
latency_str = ""
|
|
817
|
+
if ts:
|
|
818
|
+
try:
|
|
819
|
+
from datetime import datetime, timezone
|
|
820
|
+
sent = datetime.fromisoformat(ts)
|
|
821
|
+
delay_ms = (datetime.now(timezone.utc) - sent).total_seconds() * 1000
|
|
822
|
+
latency_str = f" ({delay_ms:.1f}ms)"
|
|
823
|
+
local_ts = sent.astimezone().strftime("%H:%M:%S")
|
|
824
|
+
except Exception:
|
|
825
|
+
local_ts = ts[11:19] if len(ts) >= 19 else ts
|
|
826
|
+
print(f"[{source}] {local_ts} {event}{latency_str}: {json.dumps(data, ensure_ascii=False)}")
|
|
827
|
+
else:
|
|
828
|
+
print(f"[{source}] {event}: {json.dumps(data, ensure_ascii=False)}")
|
|
829
|
+
elif msg_type == "error":
|
|
830
|
+
print(f"[launcher] Event Hub 错误: {msg.get('message')}")
|
|
831
|
+
except Exception as e:
|
|
832
|
+
print(f"[launcher] 事件处理异常(已忽略): {e}")
|
|
262
833
|
|
|
263
834
|
async def _publish_event(self, event_type: str, data: dict):
|
|
264
|
-
"""Publish an event to Event Hub via WebSocket.
|
|
835
|
+
"""Publish an event to Event Hub via WebSocket. Uses create_task to avoid
|
|
836
|
+
deadlock with _ws_connect recv loop (websockets 15.x send can block when
|
|
837
|
+
incoming frames are pending and recv is held by async-for)."""
|
|
265
838
|
if not self._ws:
|
|
266
839
|
return
|
|
267
840
|
from datetime import datetime, timezone
|
|
268
|
-
msg = {
|
|
841
|
+
msg = json.dumps({
|
|
269
842
|
"type": "event",
|
|
270
843
|
"event_id": str(uuid.uuid4()),
|
|
271
844
|
"event": event_type,
|
|
272
845
|
"source": "launcher",
|
|
273
846
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
274
847
|
"data": data,
|
|
275
|
-
}
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
848
|
+
})
|
|
849
|
+
|
|
850
|
+
async def _send():
|
|
851
|
+
try:
|
|
852
|
+
await self._ws.send(msg)
|
|
853
|
+
except Exception as e:
|
|
854
|
+
print(f"[launcher] 发布事件失败: {e}")
|
|
855
|
+
|
|
856
|
+
asyncio.create_task(_send())
|
|
280
857
|
|
|
281
858
|
def _publish_event_threadsafe(self, event_type: str, data: dict):
|
|
282
859
|
"""Publish event from non-async context (API thread). Fire-and-forget."""
|
|
@@ -301,57 +878,127 @@ class Launcher:
|
|
|
301
878
|
self._event_waiters.pop(key, None)
|
|
302
879
|
|
|
303
880
|
async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
    """Graceful shutdown: check capability → send event → wait ack → wait ready → kill.

    Modules that did not declare graceful_shutdown in module.ready are
    terminated directly.

    Args:
        name: Module to stop.
        reason: Reason recorded in the lifecycle log and sent in ``module.shutdown``.
        timeout: Upper bound (seconds) for the module's own cleanup.

    Every path logs the "stopped" lifecycle entry and publishes
    ``module.stopped``, including the case where the module never acknowledges
    the shutdown request.
    """
    self._log_lifecycle("stopping", name, reason=reason)
    graceful = self._graceful_modules.get(name, False)

    if not graceful:
        self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
    else:
        await self._publish_event("module.shutdown", {
            "module_id": name, "reason": reason, "timeout": timeout,
        })
        ack = await self._wait_event("module.shutdown.ack", name, timeout=3)
        if not ack:
            # No acknowledgement: treat the module as unresponsive.
            self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
        else:
            # ``estimated_cleanup`` comes from the module itself; tolerate a
            # missing, non-numeric or negative value instead of raising.
            try:
                estimated = float(ack.get("estimated_cleanup", timeout))
            except (TypeError, ValueError):
                estimated = timeout
            estimated = max(0.0, min(estimated, timeout))
            ready = await self._wait_event("module.shutdown.ready", name, timeout=estimated)
            if ready:
                self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_READY)
            else:
                self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)

    self._log_lifecycle("stopped", name, reason=reason)
    await self._publish_event("module.stopped", {
        "module_id": name,
        "graceful_shutdown": graceful if graceful else False,
    })
|
|
331
920
|
|
|
332
921
|
async def _graceful_shutdown_all(self):
    """Shut down all modules. Order:

    1. Send shutdown to graceful modules (excl. Event Hub) — let them start cleanup
    2. Terminate non-graceful modules (fast, runs during graceful cleanup)
    3. Wait for graceful modules to exit (process monitoring)
    4. Shut down Event Hub last (keeps event routing alive throughout)

    Event Hub is always handled in step 4, whether or not it declared
    graceful shutdown, so the notifications from step 1 can still be routed.
    Finally ``stop_all`` runs as a safety net and the shared HTTP client is
    closed.
    """
    self._system_shutting_down = True
    running = [n for n in self.modules if self.process_manager.is_running(n)]
    # Also check core modules
    for cn in CORE_MODULE_NAMES:
        if self.process_manager.is_running(cn) and cn not in running:
            running.append(cn)
    if not running:
        print("[launcher] 没有运行中的模块需要关闭")
        return

    # Defer Event Hub — it must stay alive to route shutdown events
    hub_running = "event_hub" in running
    hub_graceful = bool(self._graceful_modules.get("event_hub"))
    others = [n for n in running if n != "event_hub"]
    graceful_batch = [n for n in others if self._graceful_modules.get(n)]
    non_graceful = [n for n in others if not self._graceful_modules.get(n)]

    print(f"[launcher] 正在关闭 {len(running)} 个模块: {', '.join(running)}")

    # Phase 1: Notify graceful modules first (they start cleanup immediately)
    for name in graceful_batch:
        self._log_lifecycle("stopping", name, reason="system_shutdown")
        await self._publish_event("module.shutdown", {
            "module_id": name, "reason": "system_shutdown", "timeout": 5,
        })
    if graceful_batch:
        # _publish_event only schedules the send as a task. Yield once so the
        # queued frames get a chance to go out before the blocking
        # stop_module() calls below hold the event loop.
        await asyncio.sleep(0)

    # Phase 2: While graceful modules are cleaning up, terminate non-graceful ones
    if non_graceful:
        print(f"[launcher] 直接终止 {len(non_graceful)} 个不支持优雅退出的模块: {', '.join(non_graceful)}")
        for name in non_graceful:
            self._log_lifecycle("stopping", name, reason="system_shutdown")
            self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
            self._log_lifecycle("stopped", name, reason="system_shutdown")

    # Phase 3: Wait for graceful modules to exit (process monitoring)
    if graceful_batch:
        # Monotonic clock: the wait must not be affected by wall-clock jumps.
        deadline = time.monotonic() + 5
        while time.monotonic() < deadline:
            still_running = [n for n in graceful_batch if self.process_manager.is_running(n)]
            if not still_running:
                print("[launcher] 所有优雅退出模块已自行退出")
                break
            remaining = max(0, deadline - time.monotonic())
            print(f"[launcher] 等待 {len(still_running)} 个模块退出 ({remaining:.0f}s): {', '.join(still_running)}")
            await asyncio.sleep(1)
        # Force kill survivors
        for name in graceful_batch:
            if self.process_manager.is_running(name):
                self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
            self._log_lifecycle("stopped", name, reason="system_shutdown")

    # Phase 4: All other modules exited — now shut down Event Hub
    if hub_running and self.process_manager.is_running("event_hub"):
        self._log_lifecycle("stopping", "event_hub", reason="system_shutdown")
        if hub_graceful:
            await self._publish_event("module.shutdown", {
                "module_id": "event_hub", "reason": "system_shutdown", "timeout": 5,
            })
            deadline = time.monotonic() + 5
            while time.monotonic() < deadline:
                if not self.process_manager.is_running("event_hub"):
                    print("[launcher] Event Hub 已退出")
                    break
                await asyncio.sleep(0.5)
        if self.process_manager.is_running("event_hub"):
            self.process_manager.stop_module("event_hub", timeout=SHUTDOWN_TIMEOUT_PARTIAL)
        self._log_lifecycle("stopped", "event_hub", reason="system_shutdown")

    # Final safety net
    try:
        self.process_manager.stop_all(timeout=SHUTDOWN_TIMEOUT_BULK)
    except Exception as e:
        print(f"[launcher] stop_all 出错: {e}")
    await self._close_http()
|
|
1001
|
+
|
|
355
1002
|
# ── Heartbeat to Registry ──
|
|
356
1003
|
|
|
357
1004
|
async def _heartbeat_loop(self):
|
|
@@ -359,110 +1006,15 @@ class Launcher:
|
|
|
359
1006
|
while not self._thread_shutdown.is_set():
|
|
360
1007
|
await asyncio.sleep(30)
|
|
361
1008
|
try:
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
)
|
|
1009
|
+
client = self._get_http()
|
|
1010
|
+
await client.post(
|
|
1011
|
+
f"http://127.0.0.1:{self.registry_port}/modules",
|
|
1012
|
+
json={"action": "heartbeat", "module_id": "launcher"},
|
|
1013
|
+
headers={"Authorization": f"Bearer {self.kite_token}"},
|
|
1014
|
+
)
|
|
369
1015
|
except Exception:
|
|
370
1016
|
pass
|
|
371
1017
|
|
|
372
|
-
# ── Registry startup ──
|
|
373
|
-
|
|
374
|
-
async def _start_registry(self):
|
|
375
|
-
"""Start Registry as a subprocess, wait for it to write port.txt and /health to respond."""
|
|
376
|
-
registry_dir = os.path.join(self.project_root, "core", "registry")
|
|
377
|
-
if not os.path.isdir(registry_dir):
|
|
378
|
-
raise RuntimeError(f"Registry module not found at {registry_dir}")
|
|
379
|
-
|
|
380
|
-
# Use centralized data directory
|
|
381
|
-
from core.data_dir import get_registry_data_dir
|
|
382
|
-
registry_data_dir = get_registry_data_dir()
|
|
383
|
-
|
|
384
|
-
# Clean our instance's port file before starting
|
|
385
|
-
port_file = os.path.join(registry_data_dir, f"port_{self.instance_id}.txt")
|
|
386
|
-
if os.path.isfile(port_file):
|
|
387
|
-
os.remove(port_file)
|
|
388
|
-
|
|
389
|
-
registry_info = ModuleInfo(
|
|
390
|
-
name="registry",
|
|
391
|
-
display_name="Registry",
|
|
392
|
-
type="infrastructure",
|
|
393
|
-
state="enabled",
|
|
394
|
-
runtime="python",
|
|
395
|
-
entry="entry.py",
|
|
396
|
-
module_dir=registry_dir,
|
|
397
|
-
)
|
|
398
|
-
|
|
399
|
-
# Pass launcher_token + bind config via stdin
|
|
400
|
-
boot_info = {"token": self.kite_token, "registry_port": 0, "bind": "127.0.0.1", "instance_id": self.instance_id}
|
|
401
|
-
ok = self.process_manager.start_module(registry_info, boot_info=boot_info)
|
|
402
|
-
if not ok:
|
|
403
|
-
raise RuntimeError("Failed to start Registry")
|
|
404
|
-
|
|
405
|
-
# Wait for Registry to write port.txt
|
|
406
|
-
print("[launcher] Waiting for Registry to report its port...")
|
|
407
|
-
deadline = time.time() + 10
|
|
408
|
-
while time.time() < deadline:
|
|
409
|
-
if os.path.isfile(port_file):
|
|
410
|
-
try:
|
|
411
|
-
with open(port_file, "r") as f:
|
|
412
|
-
self.registry_port = int(f.read().strip())
|
|
413
|
-
break
|
|
414
|
-
except (ValueError, OSError):
|
|
415
|
-
pass
|
|
416
|
-
await asyncio.sleep(0.2)
|
|
417
|
-
else:
|
|
418
|
-
raise RuntimeError("Registry failed to write port.txt within 10s")
|
|
419
|
-
|
|
420
|
-
# Poll /health until ready
|
|
421
|
-
url = f"http://127.0.0.1:{self.registry_port}/health"
|
|
422
|
-
print(f"[launcher] Registry on port {self.registry_port}, waiting for health check...")
|
|
423
|
-
|
|
424
|
-
deadline = time.time() + 10
|
|
425
|
-
async with httpx.AsyncClient() as client:
|
|
426
|
-
while time.time() < deadline:
|
|
427
|
-
try:
|
|
428
|
-
resp = await client.get(url, timeout=1)
|
|
429
|
-
if resp.status_code == 200:
|
|
430
|
-
print("[launcher] Registry is ready")
|
|
431
|
-
return
|
|
432
|
-
except Exception:
|
|
433
|
-
pass
|
|
434
|
-
await asyncio.sleep(0.2)
|
|
435
|
-
|
|
436
|
-
raise RuntimeError("Registry failed to become ready within 10s")
|
|
437
|
-
|
|
438
|
-
async def _register_self(self):
|
|
439
|
-
"""Register Launcher itself to Registry using new API."""
|
|
440
|
-
url = f"http://127.0.0.1:{self.registry_port}/modules"
|
|
441
|
-
headers = {"Authorization": f"Bearer {self.kite_token}"}
|
|
442
|
-
payload = {
|
|
443
|
-
"action": "register",
|
|
444
|
-
"module_id": "launcher",
|
|
445
|
-
"module_type": "infrastructure",
|
|
446
|
-
"name": "Launcher",
|
|
447
|
-
"api_endpoint": f"http://127.0.0.1:{self.api_port}",
|
|
448
|
-
"health_endpoint": "/launcher/modules",
|
|
449
|
-
"events_publish": {
|
|
450
|
-
"module.started": {},
|
|
451
|
-
"module.stopped": {},
|
|
452
|
-
"module.state_changed": {},
|
|
453
|
-
},
|
|
454
|
-
"events_subscribe": [">"],
|
|
455
|
-
}
|
|
456
|
-
try:
|
|
457
|
-
async with httpx.AsyncClient() as client:
|
|
458
|
-
resp = await client.post(url, json=payload, headers=headers, timeout=5)
|
|
459
|
-
if resp.status_code == 200:
|
|
460
|
-
print("[launcher] Registered self to Registry")
|
|
461
|
-
else:
|
|
462
|
-
print(f"[launcher] WARNING: Registry registration returned {resp.status_code}")
|
|
463
|
-
except Exception as e:
|
|
464
|
-
print(f"[launcher] WARNING: failed to register to Registry: {e}")
|
|
465
|
-
|
|
466
1018
|
# ── Module startup ──
|
|
467
1019
|
|
|
468
1020
|
def _topo_sort(self, modules: list[ModuleInfo]) -> list[ModuleInfo]:
|
|
@@ -491,86 +1043,133 @@ class Launcher:
|
|
|
491
1043
|
visit(m.name)
|
|
492
1044
|
return order
|
|
493
1045
|
|
|
1046
|
+
def _topo_layers(self, modules: list[ModuleInfo]) -> list[list[ModuleInfo]]:
|
|
1047
|
+
"""Topological sort into layers. Modules in the same layer have no
|
|
1048
|
+
inter-dependencies and can be started in parallel."""
|
|
1049
|
+
name_map = {m.name: m for m in modules}
|
|
1050
|
+
all_names = set(name_map.keys())
|
|
1051
|
+
|
|
1052
|
+
# Compute depth (longest path from root) for each module
|
|
1053
|
+
depth: dict[str, int] = {}
|
|
1054
|
+
in_stack: set[str] = set()
|
|
1055
|
+
|
|
1056
|
+
def get_depth(name: str) -> int:
|
|
1057
|
+
if name in depth:
|
|
1058
|
+
return depth[name]
|
|
1059
|
+
if name in in_stack:
|
|
1060
|
+
raise RuntimeError(f"Circular dependency detected involving '{name}'")
|
|
1061
|
+
in_stack.add(name)
|
|
1062
|
+
info = name_map.get(name)
|
|
1063
|
+
d = 0
|
|
1064
|
+
if info:
|
|
1065
|
+
for dep in info.depends_on:
|
|
1066
|
+
if dep in all_names:
|
|
1067
|
+
d = max(d, get_depth(dep) + 1)
|
|
1068
|
+
in_stack.remove(name)
|
|
1069
|
+
depth[name] = d
|
|
1070
|
+
return d
|
|
1071
|
+
|
|
1072
|
+
for name in all_names:
|
|
1073
|
+
get_depth(name)
|
|
1074
|
+
|
|
1075
|
+
# Group by depth
|
|
1076
|
+
max_depth = max(depth.values()) if depth else 0
|
|
1077
|
+
layers: list[list[ModuleInfo]] = [[] for _ in range(max_depth + 1)]
|
|
1078
|
+
for name, d in depth.items():
|
|
1079
|
+
layers[d].append(name_map[name])
|
|
1080
|
+
return layers
|
|
1081
|
+
|
|
494
1082
|
async def _start_one_module(self, info: ModuleInfo):
|
|
495
|
-
"""Start a single module: publish starting
|
|
1083
|
+
"""Start a single module: publish starting → start process → wait ready → started → close stdio."""
|
|
496
1084
|
self._log_lifecycle("starting", info.name)
|
|
497
1085
|
await self._publish_event("module.starting", {"module_id": info.name})
|
|
498
1086
|
|
|
499
1087
|
token = self._module_tokens.get(info.name, "")
|
|
500
|
-
boot_info = {
|
|
501
|
-
|
|
502
|
-
"registry_port": self.registry_port,
|
|
503
|
-
"preferred_port": info.preferred_port,
|
|
504
|
-
"advertise_ip": "127.0.0.1",
|
|
505
|
-
}
|
|
1088
|
+
boot_info = {"token": token}
|
|
1089
|
+
t0 = time.monotonic()
|
|
506
1090
|
ok = self.process_manager.start_module(info, boot_info=boot_info)
|
|
507
1091
|
if not ok:
|
|
508
1092
|
self._log_lifecycle("start_failed", info.name)
|
|
509
1093
|
return
|
|
510
1094
|
|
|
511
|
-
#
|
|
1095
|
+
# Persist immediately after starting to ensure PID is recorded
|
|
1096
|
+
# (in case launcher crashes before Phase 4 completes)
|
|
1097
|
+
self.process_manager.persist_records()
|
|
1098
|
+
|
|
1099
|
+
# Wait for module.ready or module.exiting (whichever comes first)
|
|
512
1100
|
timeout = info.launch.timeout
|
|
513
1101
|
ready = await self._wait_event("module.ready", info.name, timeout=timeout)
|
|
514
|
-
|
|
515
|
-
|
|
1102
|
+
elapsed = time.monotonic() - t0
|
|
1103
|
+
if ready and ready.get("_exited"):
|
|
1104
|
+
# Module sent module.exiting before ready — it chose to quit
|
|
1105
|
+
reason = ready.get("reason", "unknown")
|
|
1106
|
+
self._exit_reasons[info.name] = reason
|
|
1107
|
+
print(f"[launcher] 模块 '{info.name}' 主动退出: {reason} ({elapsed:.2f}s)")
|
|
1108
|
+
elif ready:
|
|
1109
|
+
self._graceful_modules[info.name] = bool(ready.get("graceful_shutdown"))
|
|
1110
|
+
self._ready_times[info.name] = elapsed
|
|
1111
|
+
print(f"[launcher] 模块 '{info.name}' 已就绪 ({elapsed:.2f}s)")
|
|
516
1112
|
else:
|
|
517
|
-
print(f"[launcher]
|
|
1113
|
+
print(f"[launcher] 警告: '{info.name}' 在 {timeout}s 内未发送 module.ready")
|
|
518
1114
|
|
|
519
1115
|
rec = self.process_manager.get_record(info.name)
|
|
520
1116
|
self._log_lifecycle("started", info.name, pid=rec.pid if rec else None)
|
|
521
1117
|
await self._publish_event("module.started", {"module_id": info.name})
|
|
522
|
-
|
|
523
|
-
async def _start_enabled_modules(self):
|
|
524
|
-
"""Start modules in dependency order, auto-starting manual deps if needed."""
|
|
525
|
-
to_start = [m for m in self.modules.values()
|
|
526
|
-
if self._desired_states.get(m.name) == "running"]
|
|
527
|
-
if not to_start:
|
|
528
|
-
print("[launcher] No modules to start")
|
|
529
|
-
return
|
|
530
|
-
|
|
531
|
-
# Auto-start manual modules if depended upon
|
|
532
|
-
needed = set(m.name for m in to_start)
|
|
533
|
-
for m in to_start:
|
|
534
|
-
for dep in m.depends_on:
|
|
535
|
-
if dep not in needed:
|
|
536
|
-
dep_info = self.modules.get(dep)
|
|
537
|
-
if dep_info and dep_info.state != "disabled":
|
|
538
|
-
needed.add(dep)
|
|
539
|
-
to_start.append(dep_info)
|
|
540
|
-
self._desired_states[dep] = "running"
|
|
541
|
-
print(f"[launcher] Auto-starting '{dep}' (dependency)")
|
|
542
|
-
elif dep_info and dep_info.state == "disabled":
|
|
543
|
-
print(f"[launcher] ERROR: '{m.name}' depends on disabled module '{dep}'")
|
|
544
|
-
|
|
545
|
-
try:
|
|
546
|
-
sorted_modules = self._topo_sort(to_start)
|
|
547
|
-
except RuntimeError as e:
|
|
548
|
-
print(f"[launcher] ERROR: {e}")
|
|
549
|
-
return
|
|
550
|
-
|
|
551
|
-
print(f"[launcher] Starting {len(sorted_modules)} module(s)...")
|
|
552
|
-
for info in sorted_modules:
|
|
553
|
-
await self._start_one_module(info)
|
|
1118
|
+
self.process_manager.close_stdio(info.name)
|
|
554
1119
|
|
|
555
1120
|
async def _register_module_tokens(self):
|
|
556
1121
|
"""Generate per-module tokens and register the mapping to Registry."""
|
|
1122
|
+
# Include all scanned modules + core modules
|
|
557
1123
|
for name in self.modules:
|
|
558
|
-
|
|
1124
|
+
if name not in self._module_tokens:
|
|
1125
|
+
self._module_tokens[name] = secrets.token_hex(32)
|
|
1126
|
+
# Ensure registry has a token
|
|
1127
|
+
if "registry" not in self._module_tokens:
|
|
1128
|
+
self._module_tokens["registry"] = secrets.token_hex(32)
|
|
559
1129
|
|
|
560
1130
|
if not self._module_tokens:
|
|
561
1131
|
return
|
|
562
1132
|
|
|
1133
|
+
await self._register_tokens_to_registry(self._module_tokens)
|
|
1134
|
+
|
|
1135
|
+
async def _register_tokens_to_registry(self, tokens: dict):
    """POST the token mapping to the Registry's ``/tokens`` endpoint.

    The request goes to 127.0.0.1 on ``self.registry_port`` and is
    authorised with ``self.kite_token``. This is best-effort: a non-200
    reply or any exception only prints a warning.
    """
    endpoint = f"http://127.0.0.1:{self.registry_port}/tokens"
    auth_header = {"Authorization": f"Bearer {self.kite_token}"}
    try:
        resp = await self._get_http().post(endpoint, json=tokens, headers=auth_header)
        if resp.status_code != 200:
            print(f"[launcher] 警告: 令牌注册返回 {resp.status_code}")
        else:
            print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
    except Exception as e:
        print(f"[launcher] 警告: 注册模块令牌失败: {e}")
|
|
1148
|
+
|
|
1149
|
+
# ── Validation ──
|
|
1150
|
+
|
|
1151
|
+
def _validate_core_modules(self):
    """Abort startup unless both core modules are present (mechanism 12).

    For "registry" and "event_hub" under ``$KITE_PROJECT/core`` this checks
    that the module directory exists, that it contains ``module.md``, and
    that ``_parse_frontmatter`` returns a non-empty result for that file.
    The first failed check prints a fatal message and exits with status 1.
    """
    def _fatal(message):
        # Report and terminate; SystemExit is not caught by `except Exception`.
        print(message)
        sys.exit(1)

    core_root = os.path.join(os.environ["KITE_PROJECT"], "core")
    for name in ("registry", "event_hub"):
        mod_dir = os.path.join(core_root, name)
        md_path = os.path.join(mod_dir, "module.md")
        if not os.path.isdir(mod_dir):
            _fatal(f"[launcher] 致命: 核心模块 '{name}' 目录未找到: {mod_dir}")
        if not os.path.isfile(md_path):
            _fatal(f"[launcher] 致命: 核心模块 '{name}' 缺少 module.md: {md_path}")
        # The frontmatter must parse and be non-empty.
        try:
            with open(md_path, "r", encoding="utf-8") as handle:
                frontmatter = _parse_frontmatter(handle.read())
            if not frontmatter:
                _fatal(f"[launcher] 致命: 核心模块 '{name}' module.md 没有有效的 frontmatter")
        except Exception as e:
            _fatal(f"[launcher] 致命: 核心模块 '{name}' module.md 解析错误: {e}")
|
|
574
1173
|
|
|
575
1174
|
# ── API thread ──
|
|
576
1175
|
|
|
@@ -591,70 +1190,119 @@ class Launcher:
|
|
|
591
1190
|
t = threading.Thread(target=_run, daemon=True)
|
|
592
1191
|
t.start()
|
|
593
1192
|
|
|
594
|
-
# Wait for API server to actually be ready before proceeding
|
|
595
1193
|
deadline = time.time() + 5
|
|
596
1194
|
while time.time() < deadline:
|
|
597
1195
|
if self._api_server.started:
|
|
598
1196
|
break
|
|
599
1197
|
time.sleep(0.05)
|
|
600
1198
|
else:
|
|
601
|
-
print("[launcher]
|
|
1199
|
+
print("[launcher] 警告: API 服务器可能尚未完全就绪")
|
|
602
1200
|
|
|
603
|
-
print(f"[launcher] API
|
|
1201
|
+
print(f"[launcher] API 服务器已启动,端口 {self.api_port}")
|
|
1202
|
+
|
|
1203
|
+
# ── Module crash summary ──
|
|
1204
|
+
|
|
1205
|
+
def _print_module_crash_summary(self, name: str):
|
|
1206
|
+
"""Read module's crashes.jsonl last record and print red summary to console.
|
|
1207
|
+
Complement to module.crash event — reliable even if event was never sent."""
|
|
1208
|
+
RED = "\033[91m"
|
|
1209
|
+
RESET = "\033[0m"
|
|
1210
|
+
_suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
|
|
1211
|
+
crash_log = os.path.join(
|
|
1212
|
+
os.environ.get("KITE_INSTANCE_DIR", ""), name, "log", f"crashes{_suffix}.jsonl"
|
|
1213
|
+
)
|
|
1214
|
+
if not os.path.isfile(crash_log):
|
|
1215
|
+
return
|
|
1216
|
+
try:
|
|
1217
|
+
with open(crash_log, "rb") as f:
|
|
1218
|
+
f.seek(0, 2)
|
|
1219
|
+
size = f.tell()
|
|
1220
|
+
if size == 0:
|
|
1221
|
+
return
|
|
1222
|
+
f.seek(max(0, size - 4096))
|
|
1223
|
+
lines = f.read().decode("utf-8").strip().split("\n")
|
|
1224
|
+
last = json.loads(lines[-1])
|
|
1225
|
+
exc_type = last.get("exception_type", "Unknown")
|
|
1226
|
+
ctx = last.get("context", {})
|
|
1227
|
+
file_name = ctx.get("file", "unknown")
|
|
1228
|
+
line_no = ctx.get("line", "?")
|
|
1229
|
+
print(f"[launcher] {RED}崩溃: "
|
|
1230
|
+
f"{exc_type} in {file_name}:{line_no}{RESET}")
|
|
1231
|
+
print(f"[launcher] 崩溃日志: {crash_log}")
|
|
1232
|
+
except Exception:
|
|
1233
|
+
pass
|
|
604
1234
|
|
|
605
1235
|
# ── Monitor loop ──
|
|
606
1236
|
|
|
607
1237
|
async def _monitor_loop(self):
    """Poll child processes about once per second and react to exits.

    Between polls the loop awaits ``self._shutdown_event`` with a 1 s
    timeout, so a shutdown request ends it without waiting out the interval.

    How an exit is handled:
    - Core module → log it, run ``_full_restart()`` and leave this loop.
    - Watchdog → restarted here, at most ``WATCHDOG_MAX_FAIL`` times.
    - Any other module → only the ``module.stopped`` event is published;
      restarting is left to the Watchdog (per the original design note).
    """
    WATCHDOG_MAX_FAIL = 3
    watchdog_failures = 0

    while not self._shutdown_event.is_set():
        finished = self.process_manager.check_exited()

        for name, rc in finished:
            print(f"[launcher] 模块 '{name}' 退出,返回码 {rc}")
            if rc != 0:
                self._print_module_crash_summary(name)
            self._log_lifecycle("exited", name, exit_code=rc)
            stopped_payload = {
                "module_id": name, "exit_code": rc,
                "graceful_shutdown": self._graceful_modules.get(name, False),
            }
            await self._publish_event("module.stopped", stopped_payload)
            info = self.modules.get(name)

            # 1) A core module went down: restart everything and stop monitoring here.
            is_core = name in CORE_MODULE_NAMES or (info and info.is_core())
            if is_core:
                print(f"[launcher] 严重: 核心模块 '{name}' 崩溃,正在全部重启...")
                self._log_lifecycle("core_crash", name, exit_code=rc)
                await self._full_restart()
                return

            # 3) Ordinary modules need nothing more; the event above is enough.
            if name != WATCHDOG_MODULE_NAME:
                continue

            # 2) The Watchdog itself exited: the Launcher restarts it directly.
            if self._system_shutting_down:
                print(f"[launcher] Watchdog 退出(系统关闭中),跳过重启")
                continue
            watchdog_failures += 1
            if watchdog_failures > WATCHDOG_MAX_FAIL or not info:
                self._desired_states[name] = "stopped"
                self._log_lifecycle("failed", name, reason=f"exceeded {WATCHDOG_MAX_FAIL} retries")
                print(f"[launcher] Watchdog 失败 {WATCHDOG_MAX_FAIL} 次,已放弃")
            else:
                print(f"[launcher] Watchdog 崩溃,正在重启 (第 {watchdog_failures}/{WATCHDOG_MAX_FAIL} 次)...")
                await self._start_one_module(info)

        if finished:
            self.process_manager.persist_records()

        # Sleep up to 1 s, waking immediately when shutdown is signalled.
        try:
            await asyncio.wait_for(self._shutdown_event.wait(), timeout=1)
        except asyncio.TimeoutError:
            continue
        return  # shutdown requested
|
|
652
1297
|
|
|
653
1298
|
async def _full_restart(self):
|
|
654
|
-
"""Stop all modules,
|
|
655
|
-
print("[launcher]
|
|
1299
|
+
"""Stop all modules, regenerate tokens, re-run Phase 1-4 (mechanism 10)."""
|
|
1300
|
+
print("[launcher] 全量重启: 正在停止所有模块...")
|
|
1301
|
+
|
|
1302
|
+
# Persist records before shutdown so cleanup_leftovers can find survivors
|
|
1303
|
+
self.process_manager.persist_records()
|
|
656
1304
|
|
|
657
|
-
# Disconnect Event Hub
|
|
1305
|
+
# Disconnect Event Hub WS
|
|
658
1306
|
if self._ws_task:
|
|
659
1307
|
self._ws_task.cancel()
|
|
660
1308
|
self._ws_task = None
|
|
@@ -662,79 +1310,303 @@ class Launcher:
|
|
|
662
1310
|
self._heartbeat_task.cancel()
|
|
663
1311
|
self._heartbeat_task = None
|
|
664
1312
|
self._ws = None
|
|
1313
|
+
self._event_hub_ws_url = ""
|
|
1314
|
+
self._launcher_ws_token = ""
|
|
665
1315
|
|
|
666
1316
|
await self._graceful_shutdown_all()
|
|
667
|
-
|
|
1317
|
+
|
|
1318
|
+
# Cleanup any leftover processes that survived graceful shutdown.
|
|
1319
|
+
# Note: _graceful_shutdown_all() clears _processes/_records dicts, but
|
|
1320
|
+
# cleanup_leftovers() reads from processes.json (persisted above), so it can
|
|
1321
|
+
# still find and kill survivors.
|
|
1322
|
+
self.process_manager.cleanup_leftovers()
|
|
668
1323
|
|
|
669
1324
|
self._module_tokens.clear()
|
|
670
1325
|
|
|
671
|
-
|
|
1326
|
+
# Regenerate kite_token
|
|
1327
|
+
self.kite_token = secrets.token_hex(32)
|
|
1328
|
+
self.process_manager.kite_token = self.kite_token
|
|
1329
|
+
|
|
1330
|
+
print("[launcher] 全量重启: 重新执行 Phase 1-4...")
|
|
672
1331
|
try:
|
|
673
|
-
await self.
|
|
674
|
-
await self.
|
|
675
|
-
|
|
676
|
-
for n, info in self.modules.items():
|
|
677
|
-
self._log_lifecycle("scanned", n, state=info.state, module_dir=info.module_dir)
|
|
678
|
-
await self._register_module_tokens()
|
|
679
|
-
await self._start_enabled_modules()
|
|
1332
|
+
await self._phase1_parallel_bootstrap()
|
|
1333
|
+
await self._phase3_registry_ready()
|
|
1334
|
+
await self._phase4_start_modules()
|
|
680
1335
|
self.process_manager.persist_records()
|
|
681
|
-
|
|
682
|
-
print("[launcher]
|
|
1336
|
+
self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
|
|
1337
|
+
print("[launcher] 全量重启完成,恢复监控循环")
|
|
683
1338
|
await self._monitor_loop()
|
|
684
1339
|
except Exception as e:
|
|
685
|
-
print(f"[launcher]
|
|
1340
|
+
print(f"[launcher] 全量重启失败: {e}")
|
|
686
1341
|
|
|
687
1342
|
# ── Shutdown ──
|
|
688
1343
|
|
|
689
1344
|
def _final_cleanup(self):
    """Last-chance teardown run on exit.

    Cancels the WebSocket and heartbeat tasks, stops any child process that
    is still running, asks the API server to exit and deletes the process
    records file. Errors are printed, not raised, and
    ``_shutdown_complete`` is set whatever happens. On Windows the process
    is then ended with ``os._exit(0)``.
    """
    try:
        print("[launcher] 正在执行最终清理...")

        ws_task = self._ws_task
        if ws_task:
            ws_task.cancel()
        heartbeat = getattr(self, '_heartbeat_task', None)
        if heartbeat:
            heartbeat.cancel()

        # Safety check only: per the original note, the graceful shutdown
        # path has normally stopped everything already.
        manager = self.process_manager
        leftovers = [mod for mod in manager._processes if manager.is_running(mod)]
        if not leftovers:
            print("[launcher] 无残留进程")
        else:
            print(f"[launcher] 警告: 仍有残留进程 (不应出现): {', '.join(leftovers)}")
            manager.stop_all(timeout=SHUTDOWN_TIMEOUT_BULK)

        if self._api_server:
            self._api_server.should_exit = True

        # Remove this instance's runtime records file; a missing file is fine.
        try:
            os.remove(self.process_manager.records_path)
        except OSError:
            pass
    except Exception as e:
        print(f"[launcher] 最终清理出错: {e}")
    finally:
        # Tell the safety-net thread that normal shutdown has completed.
        self._shutdown_complete.set()
        print("[launcher] 再见。")

    if IS_WINDOWS:
        os._exit(0)
|
|
1381
|
+
|
|
1382
|
+
# ── Startup report ──
|
|
1383
|
+
|
|
1384
|
+
async def _print_startup_report(self, total_time: float, phase_times: dict[str, float], *,
|
|
1385
|
+
global_instances=None, cleaned_stats: dict[str, int] | None = None):
|
|
1386
|
+
"""Print a green startup summary with module list and timing."""
|
|
1387
|
+
G = "\033[32m" # green
|
|
1388
|
+
Y = "\033[33m" # yellow
|
|
1389
|
+
R = "\033[0m" # reset
|
|
1390
|
+
B = "\033[1;32m" # bold green
|
|
1391
|
+
|
|
1392
|
+
running = []
|
|
1393
|
+
exited = []
|
|
1394
|
+
stopped = []
|
|
1395
|
+
for name, info in self.modules.items():
|
|
1396
|
+
rec = self.process_manager.get_record(name)
|
|
1397
|
+
is_running = self.process_manager.is_running(name)
|
|
1398
|
+
if is_running and rec:
|
|
1399
|
+
running.append((name, info, rec))
|
|
1400
|
+
elif self._desired_states.get(name) == "running" and not is_running:
|
|
1401
|
+
# Was started but already exited (e.g. module.exiting)
|
|
1402
|
+
exited.append((name, info))
|
|
1403
|
+
else:
|
|
1404
|
+
stopped.append((name, info))
|
|
1405
|
+
|
|
1406
|
+
# Calculate kernel startup time (Phase 1+2+3)
|
|
1407
|
+
kernel_time = 0
|
|
1408
|
+
for phase_name in ["Phase 1+2: Registry + Event Hub (并行)", "Phase 3: Registry 事件总线"]:
|
|
1409
|
+
if phase_name in phase_times:
|
|
1410
|
+
kernel_time += phase_times[phase_name]
|
|
1411
|
+
|
|
1412
|
+
lines = [
|
|
1413
|
+
"",
|
|
1414
|
+
f"{B}{'=' * 60}",
|
|
1415
|
+
f" Kite 内核启动完成 耗时 {kernel_time:.2f}s",
|
|
1416
|
+
f" Kite 全部模块启动完成 总耗时 {total_time:.2f}s",
|
|
1417
|
+
f"{'=' * 60}{R}",
|
|
1418
|
+
]
|
|
1419
|
+
|
|
1420
|
+
# Phase breakdown
|
|
1421
|
+
lines.append(f"{G} 阶段耗时:{R}")
|
|
1422
|
+
|
|
1423
|
+
# Kernel modules section
|
|
1424
|
+
lines.append(f"{G} 内核模块:{R}")
|
|
1425
|
+
for phase_name in ["Phase 1+2: Registry + Event Hub (并行)", "Phase 3: Registry 事件总线"]:
|
|
1426
|
+
if phase_name in phase_times:
|
|
1427
|
+
elapsed = phase_times[phase_name]
|
|
1428
|
+
lines.append(f"{G} {phase_name:<26s} {elapsed:>6.2f}s{R}")
|
|
1429
|
+
|
|
1430
|
+
# Extension modules section
|
|
1431
|
+
lines.append(f"{G} 扩展模块:{R}")
|
|
1432
|
+
if "Phase 4: Extensions" in phase_times:
|
|
1433
|
+
elapsed = phase_times["Phase 4: Extensions"]
|
|
1434
|
+
lines.append(f"{G} {'Phase 4: Extensions':<26s} {elapsed:>6.2f}s{R}")
|
|
1435
|
+
|
|
1436
|
+
# Sort running modules by ready time
|
|
1437
|
+
running_sorted = sorted(running, key=lambda x: self._ready_times.get(x[0], float('inf')))
|
|
1438
|
+
|
|
1439
|
+
# Running modules with ready time and elapsed from Kite start
|
|
1440
|
+
DIM = "\033[90m"
|
|
1441
|
+
lines.append(f"{G} 运行中 ({len(running)}):{R}")
|
|
1442
|
+
|
|
1443
|
+
# CJK-aware display width helpers
|
|
1444
|
+
def _dw(s):
|
|
1445
|
+
"""Display width: CJK chars count as 2, others as 1."""
|
|
1446
|
+
w = 0
|
|
1447
|
+
for c in str(s):
|
|
1448
|
+
w += 2 if '\u4e00' <= c <= '\u9fff' or '\u3000' <= c <= '\u303f' or '\uff00' <= c <= '\uffef' else 1
|
|
1449
|
+
return w
|
|
1450
|
+
|
|
1451
|
+
def _rpad(s, width):
|
|
1452
|
+
"""Left-align s in a field of given display width."""
|
|
1453
|
+
return str(s) + ' ' * max(0, width - _dw(s))
|
|
1454
|
+
|
|
1455
|
+
def _lpad(s, width):
|
|
1456
|
+
"""Right-align s in a field of given display width."""
|
|
1457
|
+
return ' ' * max(0, width - _dw(s)) + str(s)
|
|
1458
|
+
|
|
1459
|
+
# Column definitions: (header, align, min_width)
|
|
1460
|
+
headers = ['模块', 'PID', '启动耗时', '进程启动时长', '类型']
|
|
1461
|
+
aligns = ['left', 'right', 'right', 'right', 'left'] # alignment per column
|
|
1462
|
+
|
|
1463
|
+
# Build data rows first to calculate column widths
|
|
1464
|
+
rows = []
|
|
1465
|
+
for name, info, rec in running_sorted:
|
|
1466
|
+
label = info.display_name or name
|
|
1467
|
+
ready_t = self._ready_times.get(name)
|
|
1468
|
+
time_str = f"{ready_t:.2f}s" if ready_t is not None else "—"
|
|
1469
|
+
if ready_t is not None and hasattr(self, '_start_unix'):
|
|
1470
|
+
elapsed_from_start = (rec.started_at + ready_t) - self._start_unix
|
|
1471
|
+
es_str = f"{elapsed_from_start:.2f}s"
|
|
1472
|
+
else:
|
|
1473
|
+
es_str = "—"
|
|
1474
|
+
rows.append([label, str(rec.pid), time_str, es_str, f"[{info.type}]"])
|
|
1475
|
+
|
|
1476
|
+
# Calculate column widths: max of header and all data display widths
|
|
1477
|
+
col_widths = [_dw(h) for h in headers]
|
|
1478
|
+
for row in rows:
|
|
1479
|
+
for i, cell in enumerate(row):
|
|
1480
|
+
col_widths[i] = max(col_widths[i], _dw(cell))
|
|
1481
|
+
|
|
1482
|
+
# Render header
|
|
1483
|
+
hdr_parts = []
|
|
1484
|
+
for i, h in enumerate(headers):
|
|
1485
|
+
if aligns[i] == 'left':
|
|
1486
|
+
hdr_parts.append(_rpad(h, col_widths[i]))
|
|
1487
|
+
else:
|
|
1488
|
+
hdr_parts.append(_lpad(h, col_widths[i]))
|
|
1489
|
+
lines.append(f"{DIM} {' '.join(hdr_parts)}{R}")
|
|
1490
|
+
|
|
1491
|
+
# Render data rows
|
|
1492
|
+
for row in rows:
|
|
1493
|
+
parts = []
|
|
1494
|
+
for i, cell in enumerate(row):
|
|
1495
|
+
if aligns[i] == 'left':
|
|
1496
|
+
parts.append(_rpad(cell, col_widths[i]))
|
|
1497
|
+
else:
|
|
1498
|
+
parts.append(_lpad(cell, col_widths[i]))
|
|
1499
|
+
lines.append(f"{G} ✓ {' '.join(parts)}{R}")
|
|
1500
|
+
|
|
1501
|
+
# Exited modules (started but already quit)
|
|
1502
|
+
if exited:
|
|
1503
|
+
lines.append(f"{Y} 已退出 ({len(exited)}):{R}")
|
|
1504
|
+
for name, info in exited:
|
|
1505
|
+
label = info.display_name or name
|
|
1506
|
+
reason = self._exit_reasons.get(name, "")
|
|
1507
|
+
reason_str = f": {reason}" if reason else ""
|
|
1508
|
+
lines.append(f"{Y} ↗ {label:<20s} (主动退出{reason_str}){R}")
|
|
1509
|
+
|
|
1510
|
+
# Stopped modules
|
|
1511
|
+
if stopped:
|
|
1512
|
+
lines.append(f"{G} 未启动 ({len(stopped)}):{R}")
|
|
1513
|
+
for name, info in stopped:
|
|
1514
|
+
label = info.display_name or name
|
|
1515
|
+
lines.append(f"{G} - {label:<20s} ({info.state}){R}")
|
|
1516
|
+
|
|
1517
|
+
lines.append(f"{G} Launcher API: http://127.0.0.1:{self.api_port} 实例: {self.instance_id}{R}")
|
|
1518
|
+
|
|
1519
|
+
# Query Registry for web module's access URL
|
|
1520
|
+
web_url = await self._get_web_url()
|
|
1521
|
+
if web_url:
|
|
1522
|
+
lines.append(f"{B} Web 管理后台: {web_url}{R}")
|
|
1523
|
+
|
|
1524
|
+
# Instance info
|
|
1525
|
+
instances = self.process_manager.get_alive_instances()
|
|
1526
|
+
inst_num = self.process_manager.instance_num
|
|
1527
|
+
suffix_display = self.process_manager.instance_suffix or "(无)"
|
|
1528
|
+
inst_dir = os.environ.get("KITE_INSTANCE_DIR", "")
|
|
1529
|
+
cwd = os.environ.get("KITE_CWD", "")
|
|
1530
|
+
debug_flag = " [DEBUG]" if os.environ.get("KITE_DEBUG") == "1" else ""
|
|
1531
|
+
lines.append(f"{G} 当前实例: #{inst_num} 后缀: {suffix_display} PID: {os.getpid()}{debug_flag}{R}")
|
|
1532
|
+
lines.append(f"{G} 实例目录: {inst_dir}{R}")
|
|
1533
|
+
lines.append(f"{G} 工作目录: {cwd}{R}")
|
|
1534
|
+
if len(instances) > 1:
|
|
1535
|
+
lines.append(f"{G} 所有实例:{R}")
|
|
1536
|
+
for i in instances:
|
|
1537
|
+
s = "" if i["num"] == 1 else f"~{i['num']}"
|
|
1538
|
+
debug_tag = " [DEBUG]" if i.get("debug", False) else ""
|
|
1539
|
+
current_tag = " (当前)" if i["is_self"] else ""
|
|
1540
|
+
lines.append(f"{G} #{i['num']} PID {i['launcher_pid']} "
|
|
1541
|
+
f"模块数 {i['module_count']} (processes{s}.json){debug_tag}{current_tag}{R}")
|
|
1542
|
+
|
|
1543
|
+
# Cross-directory instances from other projects
|
|
1544
|
+
if global_instances:
|
|
1545
|
+
my_inst_basename = os.path.basename(os.environ.get("KITE_INSTANCE_DIR", ""))
|
|
1546
|
+
other_instances = [i for i in global_instances
|
|
1547
|
+
if not i["is_self"] and i["instance_dir"] != my_inst_basename]
|
|
1548
|
+
if other_instances:
|
|
1549
|
+
lines.append(f"{G} 其他项目实例:{R}")
|
|
1550
|
+
for i in other_instances:
|
|
1551
|
+
debug_tag = " [DEBUG]" if i.get("debug", False) else ""
|
|
1552
|
+
cwd_display = f" {i['cwd']}" if i["cwd"] else ""
|
|
1553
|
+
lines.append(
|
|
1554
|
+
f"{G} {i['instance_dir']:<20s} "
|
|
1555
|
+
f"#{i['num']} PID {i['launcher_pid']} "
|
|
1556
|
+
f"模块数 {i['module_count']}"
|
|
1557
|
+
f"{cwd_display}{debug_tag}{R}"
|
|
1558
|
+
)
|
|
699
1559
|
|
|
700
|
-
if
|
|
701
|
-
|
|
1560
|
+
if cleaned_stats:
|
|
1561
|
+
total = sum(cleaned_stats.values())
|
|
1562
|
+
if len(cleaned_stats) == 1:
|
|
1563
|
+
inst, count = next(iter(cleaned_stats.items()))
|
|
1564
|
+
lines.append(f"{Y} 已清理残留进程: {inst} ({count} 个){R}")
|
|
1565
|
+
else:
|
|
1566
|
+
lines.append(f"{Y} 已清理残留进程 (共 {total} 个):{R}")
|
|
1567
|
+
for inst, count in cleaned_stats.items():
|
|
1568
|
+
lines.append(f"{Y} {inst}: {count} 个{R}")
|
|
702
1569
|
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
port_file = os.path.join(get_registry_data_dir(), f"port_{self.instance_id}.txt")
|
|
1570
|
+
lines.append(f"{B}{'=' * 60}{R}")
|
|
1571
|
+
lines.append("")
|
|
1572
|
+
|
|
1573
|
+
print("\n".join(lines))
|
|
1574
|
+
|
|
1575
|
+
async def _get_web_url(self) -> str:
    """Look up the web module's ``api_endpoint`` in the Registry.

    Returns the value with ``127.0.0.1`` shown as ``localhost``, or an empty
    string when the lookup fails, the reply is not 200, or the value is not
    a non-empty string.
    """
    lookup_url = f"http://127.0.0.1:{self.registry_port}/get/web.api_endpoint"
    try:
        resp = await self._get_http().get(
            lookup_url,
            headers={"Authorization": f"Bearer {self.kite_token}"},
            timeout=3,
        )
        if resp.status_code != 200:
            return ""
        endpoint = resp.json()
        if endpoint and isinstance(endpoint, str):
            # localhost reads friendlier than 127.0.0.1
            return endpoint.replace("://127.0.0.1:", "://localhost:")
    except Exception:
        pass
    return ""
|
|
719
1592
|
|
|
720
1593
|
# ── Utilities ──
|
|
721
1594
|
|
|
722
1595
|
def _load_discovery(self) -> dict | None:
    """Return the ``discovery`` mapping from the launcher's own module.md.

    The file is ``$KITE_PROJECT/core/launcher/module.md``. The result is
    ``None`` when the frontmatter has no non-empty dict under "discovery",
    or when reading or parsing fails (a warning is printed in that case).
    """
    md_path = os.path.join(os.environ["KITE_PROJECT"], "core", "launcher", "module.md")
    try:
        with open(md_path, "r", encoding="utf-8") as handle:
            frontmatter = _parse_frontmatter(handle.read())
        section = frontmatter.get("discovery")
        if section and isinstance(section, dict):
            return section
    except Exception as e:
        print(f"[launcher] 警告: 读取发现配置失败: {e}")
    return None
|
|
735
1607
|
|
|
736
1608
|
def _log_lifecycle(self, event: str, module: str, **extra):
|
|
737
|
-
"""Append one JSONL line to
|
|
1609
|
+
"""Append one JSONL line to lifecycle.jsonl."""
|
|
738
1610
|
from datetime import datetime, timezone
|
|
739
1611
|
record = {"ts": datetime.now(timezone.utc).isoformat(), "event": event, "module": module}
|
|
740
1612
|
record.update(extra)
|
|
@@ -757,12 +1629,29 @@ class Launcher:
|
|
|
757
1629
|
|
|
758
1630
|
def _create_api_app(self) -> FastAPI:
|
|
759
1631
|
"""Create the FastAPI app with Launcher management routes."""
|
|
1632
|
+
from fastapi import Request, HTTPException
|
|
760
1633
|
app = FastAPI(title="Kite Launcher", docs_url=None, redoc_url=None)
|
|
761
|
-
launcher = self
|
|
1634
|
+
launcher = self
|
|
1635
|
+
|
|
1636
|
+
def _require_auth(request: Request):
    """Verify the caller's address and Bearer token.

    Raises:
        HTTPException: 403 when the client host is not a loopback address;
            401 when the Authorization header is missing, is not a Bearer
            header, or carries a token other than the launcher's
            ``kite_token``.
    """
    # Address whitelist: loopback only.
    client_host = request.client.host if request.client else None
    if client_host not in ("127.0.0.1", "::1", "localhost"):
        raise HTTPException(status_code=403, detail="Access denied: only localhost allowed")

    # Bearer token verification.
    auth = request.headers.get("Authorization", "")
    if not auth.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Missing or invalid Authorization header")
    token = auth[7:].strip()
    # Constant-time comparison so response timing does not reveal how much
    # of the token matched. Both sides are encoded to bytes because
    # compare_digest rejects str arguments containing non-ASCII characters.
    expected = launcher.kite_token.encode("utf-8")
    if not secrets.compare_digest(token.encode("utf-8"), expected):
        raise HTTPException(status_code=401, detail="Invalid token")
|
|
762
1650
|
|
|
763
1651
|
@app.get("/launcher/modules")
|
|
764
|
-
async def list_modules():
|
|
765
|
-
"""List all modules and their current status
|
|
1652
|
+
async def list_modules(request: Request):
|
|
1653
|
+
"""List all modules and their current status."""
|
|
1654
|
+
_require_auth(request)
|
|
766
1655
|
result = []
|
|
767
1656
|
for name, info in launcher.modules.items():
|
|
768
1657
|
running = launcher.process_manager.is_running(name)
|
|
@@ -780,39 +1669,32 @@ class Launcher:
|
|
|
780
1669
|
return result
|
|
781
1670
|
|
|
782
1671
|
@app.post("/launcher/modules/{name}/start")
|
|
783
|
-
async def start_module(name: str):
|
|
784
|
-
"""Start a module by name.
|
|
1672
|
+
async def start_module(name: str, request: Request):
|
|
1673
|
+
"""Start a module by name."""
|
|
1674
|
+
_require_auth(request)
|
|
785
1675
|
info = launcher.modules.get(name)
|
|
786
1676
|
if not info:
|
|
787
1677
|
raise HTTPException(404, f"Module '{name}' not found")
|
|
788
1678
|
if info.state == "disabled":
|
|
789
1679
|
raise HTTPException(403, f"Module '{name}' is disabled")
|
|
790
1680
|
|
|
791
|
-
# Generate token if not already present
|
|
792
1681
|
if name not in launcher._module_tokens:
|
|
793
1682
|
launcher._module_tokens[name] = secrets.token_hex(32)
|
|
794
|
-
# Register the new token to Registry
|
|
795
1683
|
try:
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
)
|
|
1684
|
+
client = launcher._get_http()
|
|
1685
|
+
await client.post(
|
|
1686
|
+
f"http://127.0.0.1:{launcher.registry_port}/tokens",
|
|
1687
|
+
json={name: launcher._module_tokens[name]},
|
|
1688
|
+
headers={"Authorization": f"Bearer {launcher.kite_token}"},
|
|
1689
|
+
)
|
|
803
1690
|
except Exception as e:
|
|
804
|
-
print(f"[launcher]
|
|
1691
|
+
print(f"[launcher] 警告: 注册 {name} 的令牌失败: {e}")
|
|
805
1692
|
|
|
806
1693
|
token = launcher._module_tokens[name]
|
|
807
|
-
boot_info = {
|
|
808
|
-
"token": token,
|
|
809
|
-
"registry_port": launcher.registry_port,
|
|
810
|
-
"preferred_port": info.preferred_port,
|
|
811
|
-
}
|
|
1694
|
+
boot_info = {"token": token}
|
|
812
1695
|
ok = launcher.process_manager.start_module(info, boot_info=boot_info)
|
|
813
1696
|
if ok:
|
|
814
1697
|
launcher._desired_states[name] = "running"
|
|
815
|
-
launcher._fail_counts.pop(name, None)
|
|
816
1698
|
launcher.process_manager.persist_records()
|
|
817
1699
|
rec = launcher.process_manager.get_record(name)
|
|
818
1700
|
launcher._log_lifecycle("started", name, pid=rec.pid if rec else None, via="api")
|
|
@@ -822,8 +1704,9 @@ class Launcher:
|
|
|
822
1704
|
raise HTTPException(500, f"Failed to start '{name}'")
|
|
823
1705
|
|
|
824
1706
|
@app.post("/launcher/modules/{name}/stop")
|
|
825
|
-
async def stop_module(name: str, body: dict = None):
|
|
826
|
-
"""Stop a module with graceful shutdown.
|
|
1707
|
+
async def stop_module(name: str, request: Request, body: dict = None):
|
|
1708
|
+
"""Stop a module with graceful shutdown."""
|
|
1709
|
+
_require_auth(request)
|
|
827
1710
|
info = launcher.modules.get(name)
|
|
828
1711
|
if not info:
|
|
829
1712
|
raise HTTPException(404, f"Module '{name}' not found")
|
|
@@ -834,8 +1717,9 @@ class Launcher:
|
|
|
834
1717
|
return {"status": "stopped", "name": name}
|
|
835
1718
|
|
|
836
1719
|
@app.post("/launcher/modules/{name}/restart")
|
|
837
|
-
async def restart_module(name: str, body: dict = None):
|
|
1720
|
+
async def restart_module(name: str, request: Request, body: dict = None):
|
|
838
1721
|
"""Restart a module (stop + start)."""
|
|
1722
|
+
_require_auth(request)
|
|
839
1723
|
info = launcher.modules.get(name)
|
|
840
1724
|
if not info:
|
|
841
1725
|
raise HTTPException(404, f"Module '{name}' not found")
|
|
@@ -843,28 +1727,21 @@ class Launcher:
|
|
|
843
1727
|
raise HTTPException(403, f"Module '{name}' is disabled")
|
|
844
1728
|
reason = (body or {}).get("reason", "restart")
|
|
845
1729
|
await launcher._graceful_stop(name, reason)
|
|
846
|
-
# Re-generate token
|
|
847
1730
|
launcher._module_tokens[name] = secrets.token_hex(32)
|
|
848
1731
|
try:
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
)
|
|
1732
|
+
client = launcher._get_http()
|
|
1733
|
+
await client.post(
|
|
1734
|
+
f"http://127.0.0.1:{launcher.registry_port}/tokens",
|
|
1735
|
+
json={name: launcher._module_tokens[name]},
|
|
1736
|
+
headers={"Authorization": f"Bearer {launcher.kite_token}"},
|
|
1737
|
+
)
|
|
856
1738
|
except Exception:
|
|
857
1739
|
pass
|
|
858
1740
|
token = launcher._module_tokens[name]
|
|
859
|
-
boot_info = {
|
|
860
|
-
"token": token,
|
|
861
|
-
"registry_port": launcher.registry_port,
|
|
862
|
-
"preferred_port": info.preferred_port,
|
|
863
|
-
}
|
|
1741
|
+
boot_info = {"token": token}
|
|
864
1742
|
ok = launcher.process_manager.start_module(info, boot_info=boot_info)
|
|
865
1743
|
if ok:
|
|
866
1744
|
launcher._desired_states[name] = "running"
|
|
867
|
-
launcher._fail_counts.pop(name, None)
|
|
868
1745
|
launcher.process_manager.persist_records()
|
|
869
1746
|
rec = launcher.process_manager.get_record(name)
|
|
870
1747
|
launcher._log_lifecycle("started", name, pid=rec.pid if rec else None, via="restart_api")
|
|
@@ -874,8 +1751,9 @@ class Launcher:
|
|
|
874
1751
|
raise HTTPException(500, f"Failed to restart '{name}'")
|
|
875
1752
|
|
|
876
1753
|
@app.post("/launcher/rescan")
|
|
877
|
-
async def rescan_modules():
|
|
1754
|
+
async def rescan_modules(request: Request):
|
|
878
1755
|
"""Rescan module directories for new/removed modules."""
|
|
1756
|
+
_require_auth(request)
|
|
879
1757
|
old_names = set(launcher.modules.keys())
|
|
880
1758
|
launcher.modules = launcher.module_scanner.scan()
|
|
881
1759
|
new_names = set(launcher.modules.keys())
|
|
@@ -884,31 +1762,37 @@ class Launcher:
|
|
|
884
1762
|
for name in added:
|
|
885
1763
|
info = launcher.modules[name]
|
|
886
1764
|
launcher._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
|
|
887
|
-
# Initialize desired_state for new modules
|
|
888
1765
|
for name in added:
|
|
889
1766
|
info = launcher.modules[name]
|
|
890
1767
|
launcher._desired_states[name] = "running" if info.state == "enabled" else "stopped"
|
|
891
|
-
# Register tokens for new modules
|
|
892
1768
|
if added:
|
|
893
1769
|
new_tokens = {}
|
|
894
1770
|
for name in added:
|
|
895
1771
|
launcher._module_tokens[name] = secrets.token_hex(32)
|
|
896
1772
|
new_tokens[name] = launcher._module_tokens[name]
|
|
897
1773
|
try:
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
)
|
|
1774
|
+
client = launcher._get_http()
|
|
1775
|
+
await client.post(
|
|
1776
|
+
f"http://127.0.0.1:{launcher.registry_port}/tokens",
|
|
1777
|
+
json=new_tokens,
|
|
1778
|
+
headers={"Authorization": f"Bearer {launcher.kite_token}"},
|
|
1779
|
+
)
|
|
905
1780
|
except Exception:
|
|
906
1781
|
pass
|
|
907
1782
|
return {"added": added, "removed": removed, "total": len(launcher.modules)}
|
|
908
1783
|
|
|
1784
|
+
@app.post("/launcher/shutdown")
|
|
1785
|
+
async def shutdown_launcher(request: Request, body: dict = None):
|
|
1786
|
+
"""Shutdown the entire Kite system (equivalent to Ctrl+C)."""
|
|
1787
|
+
_require_auth(request)
|
|
1788
|
+
reason = (body or {}).get("reason", "api_request")
|
|
1789
|
+
launcher._request_shutdown(f"API shutdown request: {reason}")
|
|
1790
|
+
return {"status": "shutting_down", "reason": reason}
|
|
1791
|
+
|
|
909
1792
|
@app.put("/launcher/modules/{name}/state")
|
|
910
|
-
async def update_state(name: str, body: dict):
|
|
1793
|
+
async def update_state(name: str, request: Request, body: dict):
|
|
911
1794
|
"""Update module state (enabled/manual/disabled). Writes to module.md."""
|
|
1795
|
+
_require_auth(request)
|
|
912
1796
|
info = launcher.modules.get(name)
|
|
913
1797
|
if not info:
|
|
914
1798
|
raise HTTPException(404, f"Module '{name}' not found")
|
|
@@ -917,14 +1801,12 @@ class Launcher:
|
|
|
917
1801
|
if new_state not in ("enabled", "manual", "disabled"):
|
|
918
1802
|
raise HTTPException(400, "state must be enabled, manual, or disabled")
|
|
919
1803
|
|
|
920
|
-
|
|
921
|
-
if info.is_core(launcher.project_root) and new_state == "disabled":
|
|
1804
|
+
if info.is_core() and new_state == "disabled":
|
|
922
1805
|
raise HTTPException(403, "Core modules cannot be disabled")
|
|
923
1806
|
|
|
924
1807
|
old_state = info.state
|
|
925
1808
|
info.state = new_state
|
|
926
1809
|
|
|
927
|
-
# Update desired_state to match new config_state
|
|
928
1810
|
if new_state == "enabled":
|
|
929
1811
|
launcher._desired_states[name] = "running"
|
|
930
1812
|
else:
|
|
@@ -956,7 +1838,6 @@ def _update_module_md_state(module_dir: str, new_state: str):
|
|
|
956
1838
|
with open(md_path, "r", encoding="utf-8") as f:
|
|
957
1839
|
content = f.read()
|
|
958
1840
|
|
|
959
|
-
# Replace state: xxx in frontmatter
|
|
960
1841
|
updated = re.sub(
|
|
961
1842
|
r'^(state:\s*)(\S+)',
|
|
962
1843
|
rf'\g<1>{new_state}',
|
|
@@ -968,4 +1849,4 @@ def _update_module_md_state(module_dir: str, new_state: str):
|
|
|
968
1849
|
with open(md_path, "w", encoding="utf-8") as f:
|
|
969
1850
|
f.write(updated)
|
|
970
1851
|
except Exception as e:
|
|
971
|
-
print(f"[launcher]
|
|
1852
|
+
print(f"[launcher] 警告: 更新 module.md 状态失败: {e}")
|