@agentunion/kite 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/__init__.py +1 -0
- package/__main__.py +15 -0
- package/cli.js +70 -0
- package/core/__init__.py +0 -0
- package/core/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/event_hub/BENCHMARK.md +94 -0
- package/core/event_hub/__init__.py +0 -0
- package/core/event_hub/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/bench.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/bench_perf.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/dedup.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/hub.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/router.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/server.cpython-313.pyc +0 -0
- package/core/event_hub/bench.py +459 -0
- package/core/event_hub/bench_extreme.py +308 -0
- package/core/event_hub/bench_perf.py +350 -0
- package/core/event_hub/bench_results/.gitkeep +0 -0
- package/core/event_hub/bench_results/2026-02-28_13-26-48.json +51 -0
- package/core/event_hub/bench_results/2026-02-28_13-44-45.json +51 -0
- package/core/event_hub/bench_results/2026-02-28_13-45-39.json +51 -0
- package/core/event_hub/dedup.py +31 -0
- package/core/event_hub/entry.py +113 -0
- package/core/event_hub/hub.py +263 -0
- package/core/event_hub/module.md +21 -0
- package/core/event_hub/router.py +21 -0
- package/core/event_hub/server.py +138 -0
- package/core/event_hub_bench/entry.py +371 -0
- package/core/event_hub_bench/module.md +25 -0
- package/core/launcher/__init__.py +0 -0
- package/core/launcher/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/module_scanner.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/process_manager.cpython-313.pyc +0 -0
- package/core/launcher/data/log/lifecycle.jsonl +1045 -0
- package/core/launcher/data/processes_14752.json +32 -0
- package/core/launcher/data/token.txt +1 -0
- package/core/launcher/entry.py +965 -0
- package/core/launcher/module.md +37 -0
- package/core/launcher/module_scanner.py +253 -0
- package/core/launcher/process_manager.py +435 -0
- package/core/registry/__init__.py +0 -0
- package/core/registry/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/server.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/store.cpython-313.pyc +0 -0
- package/core/registry/data/port.txt +1 -0
- package/core/registry/data/port_14752.txt +1 -0
- package/core/registry/data/port_484.txt +1 -0
- package/core/registry/entry.py +73 -0
- package/core/registry/module.md +30 -0
- package/core/registry/server.py +256 -0
- package/core/registry/store.py +232 -0
- package/extensions/__init__.py +0 -0
- package/extensions/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/__init__.py +0 -0
- package/extensions/services/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__init__.py +0 -0
- package/extensions/services/watchdog/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/entry.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/monitor.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/server.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/entry.py +143 -0
- package/extensions/services/watchdog/module.md +25 -0
- package/extensions/services/watchdog/monitor.py +420 -0
- package/extensions/services/watchdog/server.py +167 -0
- package/main.py +17 -0
- package/package.json +27 -0
|
@@ -0,0 +1,965 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Launcher — the core of Kite. Manages module lifecycle, exposes API, monitors processes.
|
|
3
|
+
|
|
4
|
+
Thread model:
|
|
5
|
+
- Main thread: asyncio event loop (process management + monitor loop)
|
|
6
|
+
- API thread: independent thread running uvicorn + FastAPI
|
|
7
|
+
- stdout threads: one daemon thread per child process
|
|
8
|
+
- (Windows) keyboard listener thread: polls for 'q' key
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import asyncio
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
import secrets
|
|
15
|
+
import signal
|
|
16
|
+
import sys
|
|
17
|
+
import threading
|
|
18
|
+
import time
|
|
19
|
+
import uuid
|
|
20
|
+
|
|
21
|
+
import httpx
|
|
22
|
+
import uvicorn
|
|
23
|
+
import websockets
|
|
24
|
+
from fastapi import FastAPI, HTTPException
|
|
25
|
+
from fastapi.responses import JSONResponse
|
|
26
|
+
|
|
27
|
+
from .module_scanner import ModuleScanner, ModuleInfo, _parse_frontmatter
|
|
28
|
+
from .process_manager import ProcessManager
|
|
29
|
+
|
|
30
|
+
IS_WINDOWS = sys.platform == "win32"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class Launcher:
|
|
34
|
+
"""Kite system entry point. Starts Registry, manages modules, exposes API."""
|
|
35
|
+
|
|
36
|
+
def __init__(self, kite_token: str):
    """Set up launcher state; nothing is started here.

    Args:
        kite_token: Token kept on the instance, handed to the
            ProcessManager, and later sent as the Bearer credential
            on Registry requests.
    """
    self.kite_token = kite_token
    # Three dirname() hops up from this file's absolute path.
    self.project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    # The launcher's own PID doubles as the instance id.
    self.instance_id = str(os.getpid())
    self.process_manager = ProcessManager(self.project_root, kite_token, self.instance_id)
    self.module_scanner = ModuleScanner(
        self.project_root,
        discovery=self._load_discovery(),
    )

    self.registry_port: int = 0
    self.api_port: int = 0
    self.modules: dict[str, ModuleInfo] = {}
    self._shutdown_event = asyncio.Event()
    # Checked by the async loops and set by the signal/keyboard handlers.
    self._thread_shutdown = threading.Event()
    self._api_server: uvicorn.Server | None = None
    self._api_ready = threading.Event()
    self._fail_counts: dict[str, int] = {}  # module_name -> consecutive failure count
    self._module_tokens: dict[str, str] = {}  # module_name -> per-module token

    # Three-layer state model: desired_state per module
    # Initialized from config_state: enabled→running, manual→stopped, disabled→stopped
    self._desired_states: dict[str, str] = {}  # module_name -> "running" | "stopped"

    # Event Hub WebSocket client
    self._event_hub_ws_url: str = ""
    self._ws: object | None = None
    self._ws_task: asyncio.Task | None = None
    # Created once the async main is running; declared here so that
    # cleanup code can test it without hasattr() probing.
    self._heartbeat_task: asyncio.Task | None = None
    self._loop: asyncio.AbstractEventLoop | None = None

    # Event waiters: {event_key: (asyncio.Event, data_dict)}
    # event_key format: "event_type:module_id"
    self._event_waiters: dict[str, tuple[asyncio.Event, dict]] = {}

    self._lifecycle_log = os.path.join(
        self.project_root, "core", "launcher", "data", "log", "lifecycle.jsonl",
    )
    self._app = self._create_api_app()
|
|
74
|
+
|
|
75
|
+
# ── Public entry ──
|
|
76
|
+
|
|
77
|
+
def run(self):
    """Blocking entry point: install exit handling, drive the async main, always clean up."""
    print("[launcher] Kite starting...")
    print(f"[launcher] Project root: {self.project_root}")

    # Windows gets a keyboard listener thread; other platforms use signal handlers.
    install_exit_hook = self._setup_windows_exit if IS_WINDOWS else self._setup_unix_signals
    install_exit_hook()

    try:
        asyncio.run(self._async_main())
    except KeyboardInterrupt:
        # Ctrl+C is a normal way out; the cleanup below still runs.
        pass
    finally:
        self._final_cleanup()
|
|
93
|
+
|
|
94
|
+
def _setup_unix_signals(self):
|
|
95
|
+
"""Register SIGTERM/SIGINT handlers on Linux/macOS."""
|
|
96
|
+
def _handler(signum, frame):
|
|
97
|
+
print(f"\n[launcher] Received signal {signum}, shutting down...")
|
|
98
|
+
self._thread_shutdown.set()
|
|
99
|
+
signal.signal(signal.SIGTERM, _handler)
|
|
100
|
+
signal.signal(signal.SIGINT, _handler)
|
|
101
|
+
|
|
102
|
+
def _setup_windows_exit(self):
    """Spawn a daemon thread that polls the console for 'q'/'Q'/Ctrl+C and flags shutdown."""
    exit_keys = (b'q', b'Q', b'\x03')  # q or Ctrl+C

    def _poll_keyboard():
        import msvcrt
        while not self._thread_shutdown.is_set():
            # getch() is only called when a key press is actually pending.
            if msvcrt.kbhit() and msvcrt.getch() in exit_keys:
                print("\n[launcher] Exit requested, shutting down...")
                self._thread_shutdown.set()
                return
            time.sleep(0.1)

    listener = threading.Thread(target=_poll_keyboard, daemon=True)
    listener.start()
|
|
116
|
+
|
|
117
|
+
# ── Async main ──
|
|
118
|
+
|
|
119
|
+
async def _async_main(self):
    """Run the numbered startup steps in order, monitor until exit, then stop all modules."""
    self._loop = asyncio.get_running_loop()

    # 1. Clean up leftovers (delegated to the process manager).
    self.process_manager.cleanup_leftovers()

    # 2. Bring up the Registry first; later steps send requests to it.
    await self._start_registry()

    # 3. Launcher API runs in its own thread.
    self._start_api_thread()

    # 4. Register the Launcher itself with the Registry.
    await self._register_self()

    # 5. Scan modules and record each one in the lifecycle log.
    self.modules = self.module_scanner.scan()
    for mod_name, mod in self.modules.items():
        self._log_lifecycle("scanned", mod_name, state=mod.state, module_dir=mod.module_dir)
    found = ', '.join(self.modules.keys()) or '(none)'
    print(f"[launcher] Found {len(self.modules)} module(s): {found}")

    # 5.5. Seed desired_state: only "enabled" runs; manual/disabled stay stopped.
    self._desired_states.update({
        mod_name: "running" if mod.state == "enabled" else "stopped"
        for mod_name, mod in self.modules.items()
    })

    # 6. Per-module tokens, registered with the Registry.
    await self._register_module_tokens()

    # 7. Start everything whose desired state is "running".
    await self._start_enabled_modules()

    # 8. Persist process records.
    self.process_manager.persist_records()

    # 9. Event Hub connection (best-effort).
    await self._connect_event_hub()

    # 10. Background heartbeat to the Registry.
    self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())

    # 11. Monitor until shutdown is requested or the loop gives up.
    print("[launcher] Entering monitor loop (press Ctrl+C or 'q' to exit)")
    await self._monitor_loop()

    # Stop modules gracefully while the event loop is still alive.
    await self._graceful_shutdown_all()
|
|
169
|
+
|
|
170
|
+
# ── Event Hub connection ──
|
|
171
|
+
|
|
172
|
+
async def _connect_event_hub(self):
    """Discover the Event Hub WS endpoint via the Registry and start the client task.

    Polls the Registry roughly once a second for up to 15 seconds.
    Best-effort: when no endpoint is obtained a warning is printed and
    no WebSocket task is created. On success the endpoint is stored in
    ``self._event_hub_ws_url`` and ``self._ws_task`` is started.
    """
    url = f"http://127.0.0.1:{self.registry_port}"
    headers = {"Authorization": f"Bearer {self.kite_token}"}

    # Forget any endpoint from an earlier run so a failed discovery
    # cannot silently fall through to a stale URL.
    self._event_hub_ws_url = ""

    # Event Hub needs time to start and register itself to Registry
    print("[launcher] Waiting for Event Hub to register...")
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + 15
    # One client for all retries instead of a new one per attempt.
    async with httpx.AsyncClient() as client:
        while time.monotonic() < deadline:
            try:
                resp = await client.get(
                    f"{url}/get/event_hub.metadata.ws_endpoint",
                    headers=headers, timeout=3,
                )
                if resp.status_code == 200:
                    endpoint = resp.json()
                    # Only a non-empty string can be used as a URL.
                    if isinstance(endpoint, str) and endpoint:
                        self._event_hub_ws_url = endpoint
                        break
            except Exception:
                # Registry or Event Hub may simply not be up yet; keep
                # retrying until the deadline (deliberately best-effort).
                pass
            await asyncio.sleep(1)

    if not self._event_hub_ws_url:
        print("[launcher] WARNING: Could not discover Event Hub WS, events disabled")
        return

    print(f"[launcher] Event Hub discovered: {self._event_hub_ws_url}")
    self._ws_task = asyncio.create_task(self._ws_loop())
|
|
201
|
+
|
|
202
|
+
async def _ws_loop(self):
    """Keep an Event Hub session going, retrying 5 s after each session ends."""
    while True:
        if self._thread_shutdown.is_set():
            break
        try:
            await self._ws_connect()
        except asyncio.CancelledError:
            # Task was cancelled: stop without reconnecting.
            return
        except Exception as exc:
            print(f"[launcher] Event Hub connection error: {exc}")
        # The session is over either way; drop the socket reference before retrying.
        self._ws = None
        await asyncio.sleep(5)
|
|
213
|
+
|
|
214
|
+
async def _ws_connect(self):
    """Run one Event Hub WebSocket session.

    Connects, subscribes to every event (">"), then for each incoming
    "event" message wakes a matching waiter (keyed "event:module_id")
    and prints the event; "error" messages are printed. Malformed
    messages are skipped rather than ending the session. Returns when
    the message stream ends; connection errors propagate to the caller.
    """
    from datetime import datetime, timezone

    ws_url = f"{self._event_hub_ws_url}?token={self.kite_token}"
    async with websockets.connect(ws_url) as ws:
        self._ws = ws
        print("[launcher] Connected to Event Hub")

        # Subscribe to all events
        await ws.send(json.dumps({
            "type": "subscribe",
            "events": [">"],
        }))

        # Receive loop
        async for raw in ws:
            try:
                msg = json.loads(raw)
            except (ValueError, TypeError):
                # ValueError covers JSONDecodeError and undecodable bytes.
                continue
            # Valid JSON that is not an object has no fields to read;
            # skip it instead of crashing the whole session.
            if not isinstance(msg, dict):
                continue

            msg_type = msg.get("type", "")
            if msg_type == "event":
                source = msg.get("source", "unknown")
                event = msg.get("event", "")
                data = msg.get("data", {})
                # Waiters need a mapping; anything else is still printed
                # below but cannot carry a module_id.
                payload = data if isinstance(data, dict) else {}

                # Trigger event waiters
                module_id = payload.get("module_id", "")
                waiter = self._event_waiters.get(f"{event}:{module_id}")
                if waiter:
                    waiter[1].update(payload)
                    waiter[0].set()

                body = json.dumps(data, ensure_ascii=False)
                ts = msg.get("timestamp", "")
                if ts:
                    latency_str = ""
                    try:
                        sent = datetime.fromisoformat(ts)
                        delay_ms = (datetime.now(timezone.utc) - sent).total_seconds() * 1000
                        latency_str = f" ({delay_ms:.1f}ms)"
                        local_ts = sent.astimezone().strftime("%H:%M:%S")
                    except Exception:
                        # Unparseable, naive or non-string timestamp: show
                        # the raw HH:MM:SS slice (or the raw value) instead.
                        ts_text = str(ts)
                        local_ts = ts_text[11:19] if len(ts_text) >= 19 else ts_text
                    print(f"[{source}] {local_ts} {event}{latency_str}: {body}")
                else:
                    print(f"[{source}] {event}: {body}")
            elif msg_type == "error":
                print(f"[launcher] Event Hub error: {msg.get('message')}")
|
|
261
|
+
|
|
262
|
+
async def _publish_event(self, event_type: str, data: dict):
|
|
263
|
+
"""Publish an event to Event Hub via WebSocket."""
|
|
264
|
+
if not self._ws:
|
|
265
|
+
return
|
|
266
|
+
from datetime import datetime, timezone
|
|
267
|
+
msg = {
|
|
268
|
+
"type": "event",
|
|
269
|
+
"event_id": str(uuid.uuid4()),
|
|
270
|
+
"event": event_type,
|
|
271
|
+
"source": "launcher",
|
|
272
|
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
273
|
+
"data": data,
|
|
274
|
+
}
|
|
275
|
+
try:
|
|
276
|
+
await self._ws.send(json.dumps(msg))
|
|
277
|
+
except Exception as e:
|
|
278
|
+
print(f"[launcher] Failed to publish event: {e}")
|
|
279
|
+
|
|
280
|
+
def _publish_event_threadsafe(self, event_type: str, data: dict):
|
|
281
|
+
"""Publish event from non-async context (API thread). Fire-and-forget."""
|
|
282
|
+
if not self._ws or not self._loop:
|
|
283
|
+
return
|
|
284
|
+
asyncio.run_coroutine_threadsafe(
|
|
285
|
+
self._publish_event(event_type, data), self._loop,
|
|
286
|
+
)
|
|
287
|
+
|
|
288
|
+
async def _wait_event(self, event_type: str, module_id: str, timeout: float) -> dict | None:
|
|
289
|
+
"""Wait for a specific event from a module. Returns data dict or None on timeout."""
|
|
290
|
+
key = f"{event_type}:{module_id}"
|
|
291
|
+
evt = asyncio.Event()
|
|
292
|
+
data = {}
|
|
293
|
+
self._event_waiters[key] = (evt, data)
|
|
294
|
+
try:
|
|
295
|
+
await asyncio.wait_for(evt.wait(), timeout=timeout)
|
|
296
|
+
return data
|
|
297
|
+
except asyncio.TimeoutError:
|
|
298
|
+
return None
|
|
299
|
+
finally:
|
|
300
|
+
self._event_waiters.pop(key, None)
|
|
301
|
+
|
|
302
|
+
async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
    """Stop one module cooperatively, falling back to a direct stop.

    Sequence: publish ``module.shutdown`` → wait up to 3 s for
    ``module.shutdown.ack`` → wait for ``module.shutdown.ready`` (at most
    ``timeout`` seconds) → stop the process via the process manager.
    Every path ends with a "stopped" lifecycle record and a
    ``module.stopped`` event.

    Args:
        name: Module id to stop.
        reason: Free-form reason recorded in the log and sent to the module.
        timeout: Upper bound, in seconds, for the module's own cleanup.
    """
    self._log_lifecycle("stopping", name, reason=reason)
    # Step 1: send module.shutdown event
    await self._publish_event("module.shutdown", {
        "module_id": name, "reason": reason, "timeout": timeout,
    })

    # Step 2: wait for ack (3s)
    ack = await self._wait_event("module.shutdown.ack", name, timeout=3)
    if ack:
        # Step 3: wait for ready. The module may announce its cleanup time;
        # never wait longer than the caller allowed, and ignore values that
        # are not plain numbers instead of crashing in min().
        estimated = ack.get("estimated_cleanup", timeout)
        if isinstance(estimated, bool) or not isinstance(estimated, (int, float)):
            estimated = timeout
        estimated = max(0, min(estimated, timeout))
        ready = await self._wait_event("module.shutdown.ready", name, timeout=estimated)
        # Ready → it can be stopped right away; otherwise allow a little longer.
        self.process_manager.stop_module(name, timeout=1 if ready else 3)
    else:
        # No ack — fallback to direct terminate
        self.process_manager.stop_module(name, timeout=5)

    # Logged on both paths so every "stopping" record gets its "stopped".
    self._log_lifecycle("stopped", name, reason=reason)
    await self._publish_event("module.stopped", {"module_id": name})
|
|
330
|
+
|
|
331
|
+
async def _graceful_shutdown_all(self):
    """Ask every running module to shut down, wait up to 10 s, then stop whatever remains."""
    def _alive(names):
        return [n for n in names if self.process_manager.is_running(n)]

    running = _alive(self.modules)
    if not running:
        return
    print(f"[launcher] Graceful shutdown: {', '.join(running)}")

    # Announce the shutdown to each module individually.
    for name in running:
        self._log_lifecycle("stopping", name, reason="system_shutdown")
        await self._publish_event("module.shutdown", {
            "module_id": name, "reason": "system_shutdown", "timeout": 10,
        })

    # Poll twice a second until all are gone or 10 s have passed.
    deadline = time.time() + 10
    while time.time() < deadline and _alive(running):
        await asyncio.sleep(0.5)

    # Whatever is left gets stopped by the process manager.
    self.process_manager.stop_all(timeout=3)
    for name in running:
        self._log_lifecycle("stopped", name, reason="system_shutdown")
|
|
353
|
+
|
|
354
|
+
# ── Heartbeat to Registry ──
|
|
355
|
+
|
|
356
|
+
async def _heartbeat_loop(self):
    """Every 30 s, POST a "launcher" heartbeat to the Registry until shutdown is flagged."""
    beat = {"action": "heartbeat", "module_id": "launcher"}
    while not self._thread_shutdown.is_set():
        await asyncio.sleep(30)
        # The port is read on every pass, not cached.
        target = f"http://127.0.0.1:{self.registry_port}/modules"
        auth = {"Authorization": f"Bearer {self.kite_token}"}
        try:
            async with httpx.AsyncClient() as client:
                await client.post(target, json=beat, headers=auth, timeout=5)
        except Exception:
            # A missed heartbeat is not fatal; try again next round.
            pass
|
|
370
|
+
|
|
371
|
+
# ── Registry startup ──
|
|
372
|
+
|
|
373
|
+
async def _start_registry(self):
    """Launch the Registry subprocess and wait until it answers /health.

    The Registry reports its port through a per-instance port file; the
    value read from it is stored in ``self.registry_port``.

    Raises:
        RuntimeError: the registry directory is missing, the process
            fails to start, no port is read within 10 s, or /health
            does not return 200 within a further 10 s.
    """
    registry_dir = os.path.join(self.project_root, "core", "registry")
    if not os.path.isdir(registry_dir):
        raise RuntimeError(f"Registry module not found at {registry_dir}")

    # Drop a stale port file for this instance id so only a fresh one is read.
    port_file = os.path.join(registry_dir, "data", f"port_{self.instance_id}.txt")
    if os.path.isfile(port_file):
        os.remove(port_file)

    registry_info = ModuleInfo(
        name="registry",
        display_name="Registry",
        type="infrastructure",
        state="enabled",
        runtime="python",
        entry="entry.py",
        module_dir=registry_dir,
    )

    # Pass launcher_token + bind config via stdin
    boot_info = {"token": self.kite_token, "registry_port": 0, "bind": "127.0.0.1", "instance_id": self.instance_id}
    if not self.process_manager.start_module(registry_info, boot_info=boot_info):
        raise RuntimeError("Failed to start Registry")

    # Phase 1: wait for the port file to contain a parseable integer.
    print("[launcher] Waiting for Registry to report its port...")
    port_deadline = time.time() + 10
    port_known = False
    while not port_known and time.time() < port_deadline:
        if os.path.isfile(port_file):
            try:
                with open(port_file, "r") as fh:
                    self.registry_port = int(fh.read().strip())
            except (ValueError, OSError):
                # File not fully written or unreadable yet; retry.
                pass
            else:
                port_known = True
                continue
        await asyncio.sleep(0.2)
    if not port_known:
        raise RuntimeError("Registry failed to write port.txt within 10s")

    # Phase 2: poll /health until it returns 200.
    health_url = f"http://127.0.0.1:{self.registry_port}/health"
    print(f"[launcher] Registry on port {self.registry_port}, waiting for health check...")

    health_deadline = time.time() + 10
    async with httpx.AsyncClient() as client:
        while time.time() < health_deadline:
            try:
                resp = await client.get(health_url, timeout=1)
            except Exception:
                resp = None
            if resp is not None and resp.status_code == 200:
                print("[launcher] Registry is ready")
                return
            await asyncio.sleep(0.2)

    raise RuntimeError("Registry failed to become ready within 10s")
|
|
432
|
+
|
|
433
|
+
async def _register_self(self):
    """POST the Launcher's own registration to the Registry; failures only produce a warning."""
    published = ("module.started", "module.stopped", "module.state_changed")
    payload = {
        "action": "register",
        "module_id": "launcher",
        "module_type": "infrastructure",
        "name": "Launcher",
        "api_endpoint": f"http://127.0.0.1:{self.api_port}",
        "health_endpoint": "/launcher/modules",
        "events_publish": {event_name: {} for event_name in published},
        "events_subscribe": [">"],
    }
    endpoint = f"http://127.0.0.1:{self.registry_port}/modules"
    auth = {"Authorization": f"Bearer {self.kite_token}"}
    try:
        async with httpx.AsyncClient() as client:
            resp = await client.post(endpoint, json=payload, headers=auth, timeout=5)
        if resp.status_code != 200:
            print(f"[launcher] WARNING: Registry registration returned {resp.status_code}")
        else:
            print("[launcher] Registered self to Registry")
    except Exception as exc:
        print(f"[launcher] WARNING: failed to register to Registry: {exc}")
|
|
460
|
+
|
|
461
|
+
# ── Module startup ──
|
|
462
|
+
|
|
463
|
+
def _topo_sort(self, modules: list[ModuleInfo]) -> list[ModuleInfo]:
|
|
464
|
+
"""Topological sort by depends_on. Raises RuntimeError on cycle."""
|
|
465
|
+
name_map = {m.name: m for m in modules}
|
|
466
|
+
visited = set()
|
|
467
|
+
in_stack = set()
|
|
468
|
+
order = []
|
|
469
|
+
|
|
470
|
+
def visit(name):
|
|
471
|
+
if name in in_stack:
|
|
472
|
+
raise RuntimeError(f"Circular dependency detected involving '{name}'")
|
|
473
|
+
if name in visited:
|
|
474
|
+
return
|
|
475
|
+
in_stack.add(name)
|
|
476
|
+
info = name_map.get(name)
|
|
477
|
+
if info:
|
|
478
|
+
for dep in info.depends_on:
|
|
479
|
+
visit(dep)
|
|
480
|
+
in_stack.remove(name)
|
|
481
|
+
visited.add(name)
|
|
482
|
+
if info:
|
|
483
|
+
order.append(info)
|
|
484
|
+
|
|
485
|
+
for m in modules:
|
|
486
|
+
visit(m.name)
|
|
487
|
+
return order
|
|
488
|
+
|
|
489
|
+
async def _start_one_module(self, info: ModuleInfo):
    """Start one module and wait for its ``module.ready`` event.

    Publishes ``module.starting``, starts the process with its boot info,
    then waits up to ``info.launch.timeout`` seconds for readiness. A
    missing ready event only produces a warning; the module is still
    logged and announced as started. If the process cannot be started,
    "start_failed" is logged and nothing else happens.
    """
    name = info.name
    self._log_lifecycle("starting", name)
    await self._publish_event("module.starting", {"module_id": name})

    boot_info = dict(
        token=self._module_tokens.get(name, ""),
        registry_port=self.registry_port,
        preferred_port=info.preferred_port,
        advertise_ip="127.0.0.1",
    )
    if not self.process_manager.start_module(info, boot_info=boot_info):
        self._log_lifecycle("start_failed", name)
        return

    # Wait for module.ready (configurable timeout, degrade on timeout)
    timeout = info.launch.timeout
    if await self._wait_event("module.ready", name, timeout=timeout):
        print(f"[launcher] Module '{name}' is ready")
    else:
        print(f"[launcher] WARNING: '{name}' did not send module.ready within {timeout}s")

    record = self.process_manager.get_record(name)
    pid = record.pid if record else None
    self._log_lifecycle("started", name, pid=pid)
    await self._publish_event("module.started", {"module_id": name})
|
|
517
|
+
|
|
518
|
+
async def _start_enabled_modules(self):
    """Start every module whose desired state is "running", dependencies first.

    Non-disabled dependencies are pulled in automatically (including the
    dependencies of those pulled-in modules) and marked desired
    "running". A disabled dependency is reported as an error but does
    not stop the dependent module from being started; dependency names
    not present in ``self.modules`` are ignored.
    """
    to_start = [
        module for module in self.modules.values()
        if self._desired_states.get(module.name) == "running"
    ]
    if not to_start:
        print("[launcher] No modules to start")
        return

    # Auto-start manual modules if depended upon. The list grows while it
    # is walked, so an explicit cursor is used to reach the additions too.
    needed = {module.name for module in to_start}
    cursor = 0
    while cursor < len(to_start):
        module = to_start[cursor]
        cursor += 1
        for dep in module.depends_on:
            if dep in needed:
                continue
            dep_info = self.modules.get(dep)
            if not dep_info:
                continue
            if dep_info.state == "disabled":
                print(f"[launcher] ERROR: '{module.name}' depends on disabled module '{dep}'")
                continue
            needed.add(dep)
            to_start.append(dep_info)
            self._desired_states[dep] = "running"
            print(f"[launcher] Auto-starting '{dep}' (dependency)")

    try:
        sorted_modules = self._topo_sort(to_start)
    except RuntimeError as exc:
        # A dependency cycle aborts the whole start-up batch.
        print(f"[launcher] ERROR: {exc}")
        return

    print(f"[launcher] Starting {len(sorted_modules)} module(s)...")
    for module in sorted_modules:
        await self._start_one_module(module)
|
|
549
|
+
|
|
550
|
+
async def _register_module_tokens(self):
    """Create a fresh random token per scanned module and POST the mapping to the Registry."""
    self._module_tokens.update(
        (name, secrets.token_hex(32)) for name in self.modules
    )
    if not self._module_tokens:
        return

    endpoint = f"http://127.0.0.1:{self.registry_port}/tokens"
    auth = {"Authorization": f"Bearer {self.kite_token}"}
    try:
        async with httpx.AsyncClient() as client:
            resp = await client.post(endpoint, json=self._module_tokens, headers=auth, timeout=5)
        if resp.status_code != 200:
            print(f"[launcher] WARNING: token registration returned {resp.status_code}")
        else:
            print(f"[launcher] Registered {len(self._module_tokens)} module token(s)")
    except Exception as exc:
        # Registration failure is reported but does not stop start-up.
        print(f"[launcher] WARNING: failed to register module tokens: {exc}")
|
|
569
|
+
|
|
570
|
+
# ── API thread ──
|
|
571
|
+
|
|
572
|
+
def _start_api_thread(self):
    """Run the Launcher API (uvicorn) in a daemon thread and wait up to 5 s for it to start."""
    self.api_port = self._get_free_port()
    self._api_server = uvicorn.Server(
        uvicorn.Config(
            self._app,
            host="127.0.0.1",
            port=self.api_port,
            log_level="warning",
        )
    )

    def _serve():
        self._api_server.run()

    threading.Thread(target=_serve, daemon=True).start()

    # Poll the server's `started` flag so callers see a usable API.
    deadline = time.time() + 5
    started = False
    while time.time() < deadline:
        if self._api_server.started:
            started = True
            break
        time.sleep(0.05)
    if not started:
        print("[launcher] WARNING: API server may not be fully ready")

    print(f"[launcher] API server started on port {self.api_port}")
|
|
599
|
+
|
|
600
|
+
# ── Monitor loop ──
|
|
601
|
+
|
|
602
|
+
async def _monitor_loop(self):
    """Poll child processes once a second and react to exits until shutdown.

    For every exited module: log it and publish ``module.stopped``. A
    core module exit triggers a full restart and ends this loop. Other
    modules are restarted while their desired state is "running", up to
    MAX_FAIL recorded failures; after that the module is marked
    "stopped". The loop also returns once MAX_FAILED_MODULES modules
    have reached MAX_FAIL failures.
    """
    MAX_FAIL = 3
    MAX_FAILED_MODULES = 3

    while not self._thread_shutdown.is_set():
        exited = self.process_manager.check_exited()

        for name, rc in exited:
            print(f"[launcher] Module '{name}' exited with code {rc}")
            self._log_lifecycle("exited", name, exit_code=rc)
            await self._publish_event("module.stopped", {
                "module_id": name, "exit_code": rc,
            })
            info = self.modules.get(name)

            # Core module crash → full restart
            if info and info.is_core(self.project_root):
                print(f"[launcher] CRITICAL: core module '{name}' crashed, restarting all...")
                self._log_lifecycle("core_crash", name, exit_code=rc)
                await self._full_restart()
                return

            # An exit only counts as a failure when the module was supposed
            # to be running. Modules that are meant to be stopped must not
            # burn retries or push the Launcher towards the
            # "too many failed modules" exit below.
            if self._desired_states.get(name) != "running":
                continue

            self._fail_counts[name] = self._fail_counts.get(name, 0) + 1
            count = self._fail_counts[name]

            if count >= MAX_FAIL:
                self._desired_states[name] = "stopped"
                self._log_lifecycle("failed", name, reason=f"exceeded {MAX_FAIL} retries")
                print(f"[launcher] Module '{name}' failed {MAX_FAIL} times, giving up")
            elif info:
                print(f"[launcher] Restarting '{name}' (attempt {count}/{MAX_FAIL})...")
                await self._start_one_module(info)

        # Too many failed modules → exit
        failed_count = sum(1 for c in self._fail_counts.values() if c >= MAX_FAIL)
        if failed_count >= MAX_FAILED_MODULES:
            print(f"[launcher] {failed_count} modules permanently failed, Launcher exiting")
            return

        if exited:
            self.process_manager.persist_records()

        await asyncio.sleep(1)
|
|
647
|
+
|
|
648
|
+
async def _full_restart(self):
    """Tear every module down, then run the startup sequence again.

    Cancels the Event Hub tasks, gracefully stops all modules, forgets the
    failure counters and module tokens, repeats the startup steps and finally
    re-enters the monitor loop. Any exception raised from the startup steps
    (or from the monitor loop entered here) is printed and swallowed.
    """
    print("[launcher] Full restart: stopping all modules...")

    # Disconnect Event Hub: cancel the background tasks and drop the socket.
    ws_task = self._ws_task
    if ws_task:
        ws_task.cancel()
        self._ws_task = None
    heartbeat = getattr(self, "_heartbeat_task", None)
    if heartbeat:
        heartbeat.cancel()
        self._heartbeat_task = None
    self._ws = None

    await self._graceful_shutdown_all()

    # Start from a clean slate: no remembered failures, no stale tokens.
    self._fail_counts.clear()
    self._module_tokens.clear()

    print("[launcher] Full restart: re-running startup sequence...")
    try:
        await self._start_registry()
        await self._register_self()

        self.modules = self.module_scanner.scan()
        for mod_name, mod_info in self.modules.items():
            self._log_lifecycle(
                "scanned",
                mod_name,
                state=mod_info.state,
                module_dir=mod_info.module_dir,
            )

        await self._register_module_tokens()
        await self._start_enabled_modules()
        self.process_manager.persist_records()
        await self._connect_event_hub()

        print("[launcher] Full restart complete, resuming monitor loop")
        # NOTE(review): this awaits a fresh monitor loop from inside the
        # restart; if the caller is itself the monitor loop, every core crash
        # nests one level deeper — confirm this is intended.
        await self._monitor_loop()
    except Exception as e:
        print(f"[launcher] Full restart failed: {e}")
|
|
681
|
+
|
|
682
|
+
# ── Shutdown ──
|
|
683
|
+
|
|
684
|
+
def _final_cleanup(self):
    """Called on exit — stop all processes, stop API, clear records.

    Cancels the Event Hub tasks, stops every managed process (10 s timeout),
    asks the API server to exit, empties and deletes the records file and the
    per-instance registry port file, and on Windows terminates the process
    via ``os._exit(0)``.
    """
    print("[launcher] Shutting down...")

    def _discard(path):
        # Best-effort delete: an OSError here must not interrupt shutdown.
        try:
            os.remove(path)
        except OSError:
            pass

    ws_task = self._ws_task
    if ws_task:
        ws_task.cancel()
    heartbeat = getattr(self, "_heartbeat_task", None)
    if heartbeat:
        heartbeat.cancel()

    self.process_manager.stop_all(timeout=10)

    api_server = self._api_server
    if api_server:
        api_server.should_exit = True

    # Clear instance runtime files
    self.process_manager._write_records_file([])
    _discard(self.process_manager.records_path)
    _discard(
        os.path.join(
            self.project_root,
            "core",
            "registry",
            "data",
            f"port_{self.instance_id}.txt",
        )
    )
    print("[launcher] Goodbye.")

    if IS_WINDOWS:
        os._exit(0)
|
|
713
|
+
|
|
714
|
+
# ── Utilities ──
|
|
715
|
+
|
|
716
|
+
def _load_discovery(self) -> dict | None:
|
|
717
|
+
"""Read discovery config from launcher's own module.md."""
|
|
718
|
+
md_path = os.path.join(self.project_root, "core", "launcher", "module.md")
|
|
719
|
+
try:
|
|
720
|
+
with open(md_path, "r", encoding="utf-8") as f:
|
|
721
|
+
fm = _parse_frontmatter(f.read())
|
|
722
|
+
discovery = fm.get("discovery")
|
|
723
|
+
if isinstance(discovery, dict) and discovery:
|
|
724
|
+
print(f"[launcher] Discovery sources: {', '.join(discovery.keys())}")
|
|
725
|
+
return discovery
|
|
726
|
+
except Exception as e:
|
|
727
|
+
print(f"[launcher] WARNING: failed to read discovery config: {e}")
|
|
728
|
+
return None
|
|
729
|
+
|
|
730
|
+
def _log_lifecycle(self, event: str, module: str, **extra):
    """Append one JSONL record to the lifecycle log (``self._lifecycle_log``).

    Args:
        event: Lifecycle event name stored under ``"event"``.
        module: Module name stored under ``"module"``.
        **extra: Additional fields merged into the record; a key named
            ``ts``, ``event`` or ``module`` overrides the built-in value.

    The record always carries a UTC ISO-8601 timestamp under ``"ts"``.
    Logging is deliberately best-effort: any failure (unwritable path,
    non-serializable extra, ...) is swallowed so that lifecycle bookkeeping
    can never take the launcher down.
    """
    from datetime import datetime, timezone

    record = {"ts": datetime.now(timezone.utc).isoformat(), "event": event, "module": module}
    record.update(extra)
    try:
        line = json.dumps(record, ensure_ascii=False) + "\n"
        log_dir = os.path.dirname(self._lifecycle_log)
        # A bare filename has no directory part; os.makedirs("") raises, which
        # previously caused every record to be dropped silently.
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        with open(self._lifecycle_log, "a", encoding="utf-8") as f:
            f.write(line)
    except Exception:
        # Intentional: lifecycle logging must never raise into the caller.
        pass
|
|
741
|
+
|
|
742
|
+
@staticmethod
|
|
743
|
+
def _get_free_port() -> int:
|
|
744
|
+
"""Get a free port assigned by the OS (bind to port 0)."""
|
|
745
|
+
import socket
|
|
746
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
747
|
+
s.bind(("127.0.0.1", 0))
|
|
748
|
+
return s.getsockname()[1]
|
|
749
|
+
|
|
750
|
+
# ── API app ──
|
|
751
|
+
|
|
752
|
+
def _create_api_app(self) -> FastAPI:
    """Create the FastAPI app with Launcher management routes.

    Routes (all closures over this launcher instance):
        GET  /launcher/modules                 list modules and their states
        POST /launcher/modules/{name}/start    start one module
        POST /launcher/modules/{name}/stop     gracefully stop one module
        POST /launcher/modules/{name}/restart  stop, re-issue token, start
        POST /launcher/rescan                  re-run the module scanner
        PUT  /launcher/modules/{name}/state    change the configured state

    Returns:
        The configured FastAPI application (docs and redoc URLs disabled).
    """
    app = FastAPI(title="Kite Launcher", docs_url=None, redoc_url=None)
    launcher = self  # closure reference

    def _require_module(name: str, *, reject_disabled: bool = False):
        """Return the module info or raise 404 (unknown) / 403 (disabled)."""
        info = launcher.modules.get(name)
        if not info:
            raise HTTPException(404, f"Module '{name}' not found")
        if reject_disabled and info.state == "disabled":
            raise HTTPException(403, f"Module '{name}' is disabled")
        return info

    async def _register_tokens(tokens: dict) -> None:
        """POST ``{module_name: token}`` pairs to the Registry (best effort).

        A failure is reported on stdout but never raised: the caller goes on
        to start the module exactly as before.
        """
        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    f"http://127.0.0.1:{launcher.registry_port}/tokens",
                    json=tokens,
                    headers={"Authorization": f"Bearer {launcher.kite_token}"},
                    timeout=5,
                )
                # A non-2xx answer means the tokens were not accepted; surface
                # it through the same warning path as a connection failure.
                response.raise_for_status()
        except Exception as e:
            print(f"[launcher] WARNING: failed to register token for {', '.join(tokens)}: {e}")

    def _launch(name: str, info, *, via: str, ok_status: str, failure: str) -> dict:
        """Start the process and do the shared bookkeeping for start/restart.

        Raises:
            HTTPException: 500 with ``failure`` as detail when the process
                manager reports that the module could not be started.
        """
        boot_info = {
            "token": launcher._module_tokens[name],
            "registry_port": launcher.registry_port,
            "preferred_port": info.preferred_port,
        }
        if not launcher.process_manager.start_module(info, boot_info=boot_info):
            launcher._log_lifecycle("start_failed", name, via=via)
            raise HTTPException(500, failure)

        launcher._desired_states[name] = "running"
        # An explicit (re)start clears the crash counter for this module.
        launcher._fail_counts.pop(name, None)
        launcher.process_manager.persist_records()
        rec = launcher.process_manager.get_record(name)
        launcher._log_lifecycle("started", name, pid=rec.pid if rec else None, via=via)
        launcher._publish_event_threadsafe("module.started", {"module_id": name})
        return {"status": ok_status, "name": name}

    @app.get("/launcher/modules")
    async def list_modules():
        """List all modules and their current status (three-layer state model)."""
        result = []
        for name, info in launcher.modules.items():
            running = launcher.process_manager.is_running(name)
            rec = launcher.process_manager.get_record(name)
            alive = bool(running and rec)
            result.append({
                "name": name,
                "display_name": info.display_name,
                "type": info.type,
                "config_state": info.state,
                "desired_state": launcher._desired_states.get(name, "stopped"),
                "actual_state": f"running({rec.pid})" if alive else "stopped",
                "pid": rec.pid if alive else None,
                "monitor": info.monitor,
            })
        return result

    @app.post("/launcher/modules/{name}/start")
    async def start_module(name: str):
        """Start a module by name, issuing and registering a token if it has none."""
        info = _require_module(name, reject_disabled=True)

        # Generate token if not already present
        if name not in launcher._module_tokens:
            launcher._module_tokens[name] = secrets.token_hex(32)
            await _register_tokens({name: launcher._module_tokens[name]})

        return _launch(
            name, info,
            via="api", ok_status="started", failure=f"Failed to start '{name}'",
        )

    @app.post("/launcher/modules/{name}/stop")
    async def stop_module(name: str, body: dict = None):
        """Stop a module with graceful shutdown. Accepts optional reason."""
        _require_module(name)
        reason = (body or {}).get("reason", "stop_requested")
        launcher._desired_states[name] = "stopped"
        await launcher._graceful_stop(name, reason)
        launcher.process_manager.persist_records()
        return {"status": "stopped", "name": name}

    @app.post("/launcher/modules/{name}/restart")
    async def restart_module(name: str, body: dict = None):
        """Restart a module (stop + start) with a freshly generated token."""
        info = _require_module(name, reject_disabled=True)
        reason = (body or {}).get("reason", "restart")
        await launcher._graceful_stop(name, reason)

        # Re-generate token
        launcher._module_tokens[name] = secrets.token_hex(32)
        await _register_tokens({name: launcher._module_tokens[name]})

        return _launch(
            name, info,
            via="restart_api", ok_status="restarted", failure=f"Failed to restart '{name}'",
        )

    @app.post("/launcher/rescan")
    async def rescan_modules():
        """Rescan module directories for new/removed modules."""
        old_names = set(launcher.modules.keys())
        launcher.modules = launcher.module_scanner.scan()
        new_names = set(launcher.modules.keys())
        added = list(new_names - old_names)
        removed = list(old_names - new_names)

        new_tokens = {}
        for name in added:
            info = launcher.modules[name]
            launcher._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
            # Initialize desired_state for new modules
            launcher._desired_states[name] = "running" if info.state == "enabled" else "stopped"
            launcher._module_tokens[name] = secrets.token_hex(32)
            new_tokens[name] = launcher._module_tokens[name]

        # Register tokens for new modules
        if new_tokens:
            await _register_tokens(new_tokens)
        return {"added": added, "removed": removed, "total": len(launcher.modules)}

    @app.put("/launcher/modules/{name}/state")
    async def update_state(name: str, body: dict):
        """Update module state (enabled/manual/disabled). Writes to module.md."""
        info = _require_module(name)

        new_state = body.get("state", "")
        if new_state not in ("enabled", "manual", "disabled"):
            raise HTTPException(400, "state must be enabled, manual, or disabled")

        # Core modules cannot be disabled
        if info.is_core(launcher.project_root) and new_state == "disabled":
            raise HTTPException(403, "Core modules cannot be disabled")

        old_state = info.state
        info.state = new_state

        # Update desired_state to match new config_state
        launcher._desired_states[name] = "running" if new_state == "enabled" else "stopped"

        _update_module_md_state(info.module_dir, new_state)
        launcher._publish_event_threadsafe("module.state_changed", {
            "module_id": name,
            "old_state": old_state,
            "new_state": new_state,
        })
        return {
            "name": name,
            "old_state": old_state,
            "new_state": new_state,
        }

    return app
|
|
940
|
+
|
|
941
|
+
|
|
942
|
+
def _update_module_md_state(module_dir: str, new_state: str):
    """Update the state field in a module's module.md frontmatter.

    Args:
        module_dir: Directory expected to contain ``module.md``.
        new_state: Value to store in the first ``state:`` line.

    Only the first line that starts with ``state:`` is rewritten, and only its
    first whitespace-delimited token, so a trailing comment on that line is
    kept. When the file opens with a ``---`` delimited frontmatter block the
    search is confined to that block, so a ``state:`` line in the markdown
    body is never touched; files without such a block are searched as a whole.
    An empty ``state:`` entry is filled in instead of spilling onto the next
    line. Nothing is written when the file is missing or already up to date.
    Failures are reported on stdout and never raised.
    """
    import re

    md_path = os.path.join(module_dir, "module.md")
    if not os.path.isfile(md_path):
        return

    def _swap(match):
        key, gap, old_value = match.group(1, 2, 3)
        if not gap and not old_value:
            # Bare "state:" line: add the separating space a mapping needs.
            gap = " "
        # Built by hand (not as a replacement template) so backslashes or
        # group references inside new_state are written literally.
        return f"{key}{gap}{new_state}"

    try:
        with open(md_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Locate the frontmatter body between the opening and closing '---'.
        start, end = 0, len(content)
        opening = re.match(r'\ufeff?---[ \t]*\n', content)
        if opening:
            closing = re.compile(r'^---[ \t]*$', re.MULTILINE).search(content, opening.end())
            if closing:
                start, end = opening.end(), closing.start()

        # [ \t]* instead of \s*: the match must stay on the "state:" line.
        front = re.sub(
            r'^(state:)([ \t]*)(\S*)',
            _swap,
            content[start:end],
            count=1,
            flags=re.MULTILINE,
        )
        updated = content[:start] + front + content[end:]
        if updated == content:
            return

        with open(md_path, "w", encoding="utf-8") as f:
            f.write(updated)
    except Exception as e:
        print(f"[launcher] WARNING: failed to update module.md state: {e}")
|