@agentunion/kite 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/__init__.py +1 -0
  2. package/__main__.py +15 -0
  3. package/cli.js +70 -0
  4. package/core/__init__.py +0 -0
  5. package/core/__pycache__/__init__.cpython-313.pyc +0 -0
  6. package/core/event_hub/BENCHMARK.md +94 -0
  7. package/core/event_hub/__init__.py +0 -0
  8. package/core/event_hub/__pycache__/__init__.cpython-313.pyc +0 -0
  9. package/core/event_hub/__pycache__/bench.cpython-313.pyc +0 -0
  10. package/core/event_hub/__pycache__/bench_perf.cpython-313.pyc +0 -0
  11. package/core/event_hub/__pycache__/dedup.cpython-313.pyc +0 -0
  12. package/core/event_hub/__pycache__/entry.cpython-313.pyc +0 -0
  13. package/core/event_hub/__pycache__/hub.cpython-313.pyc +0 -0
  14. package/core/event_hub/__pycache__/router.cpython-313.pyc +0 -0
  15. package/core/event_hub/__pycache__/server.cpython-313.pyc +0 -0
  16. package/core/event_hub/bench.py +459 -0
  17. package/core/event_hub/bench_extreme.py +308 -0
  18. package/core/event_hub/bench_perf.py +350 -0
  19. package/core/event_hub/bench_results/.gitkeep +0 -0
  20. package/core/event_hub/bench_results/2026-02-28_13-26-48.json +51 -0
  21. package/core/event_hub/bench_results/2026-02-28_13-44-45.json +51 -0
  22. package/core/event_hub/bench_results/2026-02-28_13-45-39.json +51 -0
  23. package/core/event_hub/dedup.py +31 -0
  24. package/core/event_hub/entry.py +113 -0
  25. package/core/event_hub/hub.py +263 -0
  26. package/core/event_hub/module.md +21 -0
  27. package/core/event_hub/router.py +21 -0
  28. package/core/event_hub/server.py +138 -0
  29. package/core/event_hub_bench/entry.py +371 -0
  30. package/core/event_hub_bench/module.md +25 -0
  31. package/core/launcher/__init__.py +0 -0
  32. package/core/launcher/__pycache__/__init__.cpython-313.pyc +0 -0
  33. package/core/launcher/__pycache__/entry.cpython-313.pyc +0 -0
  34. package/core/launcher/__pycache__/module_scanner.cpython-313.pyc +0 -0
  35. package/core/launcher/__pycache__/process_manager.cpython-313.pyc +0 -0
  36. package/core/launcher/data/log/lifecycle.jsonl +1045 -0
  37. package/core/launcher/data/processes_14752.json +32 -0
  38. package/core/launcher/data/token.txt +1 -0
  39. package/core/launcher/entry.py +965 -0
  40. package/core/launcher/module.md +37 -0
  41. package/core/launcher/module_scanner.py +253 -0
  42. package/core/launcher/process_manager.py +435 -0
  43. package/core/registry/__init__.py +0 -0
  44. package/core/registry/__pycache__/__init__.cpython-313.pyc +0 -0
  45. package/core/registry/__pycache__/entry.cpython-313.pyc +0 -0
  46. package/core/registry/__pycache__/server.cpython-313.pyc +0 -0
  47. package/core/registry/__pycache__/store.cpython-313.pyc +0 -0
  48. package/core/registry/data/port.txt +1 -0
  49. package/core/registry/data/port_14752.txt +1 -0
  50. package/core/registry/data/port_484.txt +1 -0
  51. package/core/registry/entry.py +73 -0
  52. package/core/registry/module.md +30 -0
  53. package/core/registry/server.py +256 -0
  54. package/core/registry/store.py +232 -0
  55. package/extensions/__init__.py +0 -0
  56. package/extensions/__pycache__/__init__.cpython-313.pyc +0 -0
  57. package/extensions/services/__init__.py +0 -0
  58. package/extensions/services/__pycache__/__init__.cpython-313.pyc +0 -0
  59. package/extensions/services/watchdog/__init__.py +0 -0
  60. package/extensions/services/watchdog/__pycache__/__init__.cpython-313.pyc +0 -0
  61. package/extensions/services/watchdog/__pycache__/entry.cpython-313.pyc +0 -0
  62. package/extensions/services/watchdog/__pycache__/monitor.cpython-313.pyc +0 -0
  63. package/extensions/services/watchdog/__pycache__/server.cpython-313.pyc +0 -0
  64. package/extensions/services/watchdog/entry.py +143 -0
  65. package/extensions/services/watchdog/module.md +25 -0
  66. package/extensions/services/watchdog/monitor.py +420 -0
  67. package/extensions/services/watchdog/server.py +167 -0
  68. package/main.py +17 -0
  69. package/package.json +27 -0
@@ -0,0 +1,965 @@
1
+ """
2
+ Launcher — the core of Kite. Manages module lifecycle, exposes API, monitors processes.
3
+
4
+ Thread model:
5
+ - Main thread: asyncio event loop (process management + monitor loop)
6
+ - API thread: independent thread running uvicorn + FastAPI
7
+ - stdout threads: one daemon thread per child process
8
+ - (Windows) keyboard listener thread: polls for 'q' key
9
+ """
10
+
11
+ import asyncio
12
+ import json
13
+ import os
14
+ import secrets
15
+ import signal
16
+ import sys
17
+ import threading
18
+ import time
19
+ import uuid
20
+
21
+ import httpx
22
+ import uvicorn
23
+ import websockets
24
+ from fastapi import FastAPI, HTTPException
25
+ from fastapi.responses import JSONResponse
26
+
27
+ from .module_scanner import ModuleScanner, ModuleInfo, _parse_frontmatter
28
+ from .process_manager import ProcessManager
29
+
30
+ IS_WINDOWS = sys.platform == "win32"
31
+
32
+
33
+ class Launcher:
34
+ """Kite system entry point. Starts Registry, manages modules, exposes API."""
35
+
36
def __init__(self, kite_token: str):
    """Initialise launcher state; nothing is started until ``run()``.

    Args:
        kite_token: Secret sent as the Bearer token on Registry requests
            and handed to the process manager for child processes.
    """
    self.kite_token = kite_token
    # This file sits three directory levels below the project root.
    self.project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    # The launcher PID doubles as the instance id used in runtime file names.
    self.instance_id = str(os.getpid())
    self.process_manager = ProcessManager(self.project_root, kite_token, self.instance_id)
    # _load_discovery() reads self.project_root, so it must run after it is set.
    self.module_scanner = ModuleScanner(
        self.project_root,
        discovery=self._load_discovery(),
    )

    self.registry_port: int = 0
    self.api_port: int = 0
    self.modules: dict[str, ModuleInfo] = {}
    self._shutdown_event = asyncio.Event()
    # Set from signal handlers / the keyboard thread; polled by the async loops.
    self._thread_shutdown = threading.Event()
    self._api_server: uvicorn.Server | None = None
    self._api_ready = threading.Event()
    self._fail_counts: dict[str, int] = {}  # module_name -> consecutive failure count
    self._module_tokens: dict[str, str] = {}  # module_name -> per-module token

    # Three-layer state model: desired_state per module
    # Initialized from config_state: enabled→running, manual→stopped, disabled→stopped
    self._desired_states: dict[str, str] = {}  # module_name -> "running" | "stopped"

    # Event Hub WebSocket client
    self._event_hub_ws_url: str = ""
    self._ws: object | None = None
    self._ws_task: asyncio.Task | None = None
    # Declared here so shutdown/restart code can test it without hasattr().
    self._heartbeat_task: asyncio.Task | None = None
    self._loop: asyncio.AbstractEventLoop | None = None

    # Event waiters: {event_key: (asyncio.Event, data_dict)}
    # event_key format: "event_type:module_id"
    self._event_waiters: dict[str, tuple[asyncio.Event, dict]] = {}

    self._lifecycle_log = os.path.join(
        self.project_root, "core", "launcher", "data", "log", "lifecycle.jsonl",
    )
    self._app = self._create_api_app()
74
+
75
+ # ── Public entry ──
76
+
77
def run(self):
    """Blocking entry point: install exit hooks, run the async main, clean up.

    ``_final_cleanup()`` always runs, whether the async main returns,
    raises, or is interrupted from the keyboard.
    """
    print("[launcher] Kite starting...")
    print(f"[launcher] Project root: {self.project_root}")

    # Windows uses a keyboard-polling thread; other platforms use signals.
    install_exit_hook = self._setup_windows_exit if IS_WINDOWS else self._setup_unix_signals
    install_exit_hook()

    try:
        asyncio.run(self._async_main())
    except KeyboardInterrupt:
        # Ctrl+C is a normal way to stop; cleanup happens in finally.
        pass
    finally:
        self._final_cleanup()
93
+
94
+ def _setup_unix_signals(self):
95
+ """Register SIGTERM/SIGINT handlers on Linux/macOS."""
96
+ def _handler(signum, frame):
97
+ print(f"\n[launcher] Received signal {signum}, shutting down...")
98
+ self._thread_shutdown.set()
99
+ signal.signal(signal.SIGTERM, _handler)
100
+ signal.signal(signal.SIGINT, _handler)
101
+
102
def _setup_windows_exit(self):
    """Spawn a daemon thread that requests shutdown on 'q', 'Q' or Ctrl+C."""
    exit_keys = (b'q', b'Q', b'\x03')  # \x03 is what getch() returns for Ctrl+C

    def _poll_keyboard():
        import msvcrt
        while not self._thread_shutdown.is_set():
            # getch() is only called when a key is actually waiting.
            if msvcrt.kbhit() and msvcrt.getch() in exit_keys:
                print("\n[launcher] Exit requested, shutting down...")
                self._thread_shutdown.set()
                return
            time.sleep(0.1)

    threading.Thread(target=_poll_keyboard, daemon=True).start()
116
+
117
+ # ── Async main ──
118
+
119
async def _async_main(self):
    """Run the full startup sequence, then the monitor loop, then shut down.

    The steps are order-dependent: the Registry must be up before anything
    registers with it, and module tokens are registered before modules start.
    """
    # Remember the loop so other threads can schedule coroutines onto it.
    self._loop = asyncio.get_running_loop()

    # Step 1: cleanup leftovers
    self.process_manager.cleanup_leftovers()

    # Step 2: start Registry
    await self._start_registry()

    # Step 3: start Launcher API in a separate thread
    self._start_api_thread()

    # Step 4: register Launcher to Registry
    await self._register_self()

    # Step 5: scan modules
    self.modules = self.module_scanner.scan()
    for name, info in self.modules.items():
        self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
    print(f"[launcher] Found {len(self.modules)} module(s): {', '.join(self.modules.keys()) or '(none)'}")

    # Step 5.5: initialize desired_state from config_state
    for name, info in self.modules.items():
        if info.state == "enabled":
            self._desired_states[name] = "running"
        else:  # manual, disabled
            self._desired_states[name] = "stopped"

    # Step 6: generate per-module tokens and register to Registry
    await self._register_module_tokens()

    # Step 7: start enabled modules
    await self._start_enabled_modules()

    # Step 8: persist records
    self.process_manager.persist_records()

    # Step 9: connect to Event Hub (best-effort: failure only disables events).
    # Note: discovery itself is awaited and polls for up to 15 seconds.
    await self._connect_event_hub()

    # Step 10: start heartbeat to Registry
    self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())

    # Step 11: monitor loop (returns when shutdown is requested or it gives up)
    print("[launcher] Entering monitor loop (press Ctrl+C or 'q' to exit)")
    await self._monitor_loop()

    # Graceful shutdown all modules before event loop closes
    await self._graceful_shutdown_all()
169
+
170
+ # ── Event Hub connection ──
171
+
172
async def _connect_event_hub(self):
    """Discover the Event Hub WebSocket endpoint and start the client task.

    Polls the Registry for ``event_hub.metadata.ws_endpoint`` about once a
    second for up to 15 seconds. On success a background task running
    ``_ws_loop()`` is created; on failure a warning is printed and events
    stay disabled.
    """
    url = f"http://127.0.0.1:{self.registry_port}"
    headers = {"Authorization": f"Bearer {self.kite_token}"}

    # Drop any endpoint left over from a previous run (e.g. before a full
    # restart) so a stale URL is never reused when discovery fails.
    self._event_hub_ws_url = ""

    # Event Hub needs time to start and register itself to Registry
    print("[launcher] Waiting for Event Hub to register...")
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + 15
    async with httpx.AsyncClient() as client:
        while time.monotonic() < deadline:
            try:
                resp = await client.get(
                    f"{url}/get/event_hub.metadata.ws_endpoint",
                    headers=headers, timeout=3,
                )
                if resp.status_code == 200:
                    endpoint = resp.json()
                    # Only a non-empty string is usable as a WebSocket URL.
                    if isinstance(endpoint, str) and endpoint:
                        self._event_hub_ws_url = endpoint
                        break
            except Exception:
                # Registry unreachable, hub not registered yet, or a bad
                # body: all are expected while starting up, so just retry.
                pass
            await asyncio.sleep(1)

    if not self._event_hub_ws_url:
        print("[launcher] WARNING: Could not discover Event Hub WS, events disabled")
        return

    print(f"[launcher] Event Hub discovered: {self._event_hub_ws_url}")
    self._ws_task = asyncio.create_task(self._ws_loop())
201
+
202
+ async def _ws_loop(self):
203
+ """Connect to Event Hub, reconnect on failure."""
204
+ while not self._thread_shutdown.is_set():
205
+ try:
206
+ await self._ws_connect()
207
+ except asyncio.CancelledError:
208
+ return
209
+ except Exception as e:
210
+ print(f"[launcher] Event Hub connection error: {e}")
211
+ self._ws = None
212
+ await asyncio.sleep(5)
213
+
214
async def _ws_connect(self):
    """Run one Event Hub session: connect, subscribe to everything, print events.

    For each ``event`` frame, any waiter registered under
    ``"<event>:<module_id>"`` is given the event data and released, then the
    event is printed (with latency when the timestamp parses). Malformed
    frames are skipped instead of tearing down the session. ``self._ws`` is
    cleared when the session ends for any reason.
    """
    from datetime import datetime, timezone

    ws_url = f"{self._event_hub_ws_url}?token={self.kite_token}"
    async with websockets.connect(ws_url) as ws:
        self._ws = ws
        try:
            print("[launcher] Connected to Event Hub")

            # Subscribe to all events
            await ws.send(json.dumps({
                "type": "subscribe",
                "events": [">"],
            }))

            # Receive loop
            async for raw in ws:
                try:
                    msg = json.loads(raw)
                except (json.JSONDecodeError, TypeError):
                    continue
                if not isinstance(msg, dict):
                    # Valid JSON but not an envelope (e.g. a list): ignore.
                    continue

                msg_type = msg.get("type", "")
                if msg_type == "error":
                    print(f"[launcher] Event Hub error: {msg.get('message')}")
                    continue
                if msg_type != "event":
                    continue

                source = msg.get("source", "unknown")
                event = msg.get("event", "")
                data = msg.get("data", {})
                # Waiter matching needs a mapping; the raw value is still printed.
                payload = data if isinstance(data, dict) else {}

                # Trigger event waiters
                module_id = payload.get("module_id", "")
                waiter = self._event_waiters.get(f"{event}:{module_id}")
                if waiter:
                    waiter[1].update(payload)
                    waiter[0].set()

                body = json.dumps(data, ensure_ascii=False)
                ts = msg.get("timestamp", "")
                if not ts:
                    print(f"[{source}] {event}: {body}")
                    continue

                latency_str = ""
                try:
                    sent = datetime.fromisoformat(ts)
                    delay_ms = (datetime.now(timezone.utc) - sent).total_seconds() * 1000
                    latency_str = f" ({delay_ms:.1f}ms)"
                    local_ts = sent.astimezone().strftime("%H:%M:%S")
                except Exception:
                    # Unparseable, naive or non-string timestamp: show the
                    # HH:MM:SS slice of its text form when long enough.
                    ts_text = str(ts)
                    local_ts = ts_text[11:19] if len(ts_text) >= 19 else ts_text
                print(f"[{source}] {local_ts} {event}{latency_str}: {body}")
        finally:
            # Only clear the slot if it still refers to this session's socket.
            if self._ws is ws:
                self._ws = None
261
+
262
+ async def _publish_event(self, event_type: str, data: dict):
263
+ """Publish an event to Event Hub via WebSocket."""
264
+ if not self._ws:
265
+ return
266
+ from datetime import datetime, timezone
267
+ msg = {
268
+ "type": "event",
269
+ "event_id": str(uuid.uuid4()),
270
+ "event": event_type,
271
+ "source": "launcher",
272
+ "timestamp": datetime.now(timezone.utc).isoformat(),
273
+ "data": data,
274
+ }
275
+ try:
276
+ await self._ws.send(json.dumps(msg))
277
+ except Exception as e:
278
+ print(f"[launcher] Failed to publish event: {e}")
279
+
280
+ def _publish_event_threadsafe(self, event_type: str, data: dict):
281
+ """Publish event from non-async context (API thread). Fire-and-forget."""
282
+ if not self._ws or not self._loop:
283
+ return
284
+ asyncio.run_coroutine_threadsafe(
285
+ self._publish_event(event_type, data), self._loop,
286
+ )
287
+
288
+ async def _wait_event(self, event_type: str, module_id: str, timeout: float) -> dict | None:
289
+ """Wait for a specific event from a module. Returns data dict or None on timeout."""
290
+ key = f"{event_type}:{module_id}"
291
+ evt = asyncio.Event()
292
+ data = {}
293
+ self._event_waiters[key] = (evt, data)
294
+ try:
295
+ await asyncio.wait_for(evt.wait(), timeout=timeout)
296
+ return data
297
+ except asyncio.TimeoutError:
298
+ return None
299
+ finally:
300
+ self._event_waiters.pop(key, None)
301
+
302
+ async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
303
+ """Graceful shutdown: send event → wait ack → wait ready → kill."""
304
+ self._log_lifecycle("stopping", name, reason=reason)
305
+ # Step 1: send module.shutdown event
306
+ await self._publish_event("module.shutdown", {
307
+ "module_id": name, "reason": reason, "timeout": timeout,
308
+ })
309
+
310
+ # Step 2: wait for ack (3s)
311
+ ack = await self._wait_event("module.shutdown.ack", name, timeout=3)
312
+ if not ack:
313
+ # No ack — fallback to direct terminate
314
+ self.process_manager.stop_module(name, timeout=5)
315
+ await self._publish_event("module.stopped", {"module_id": name})
316
+ return
317
+
318
+ # Step 3: wait for ready
319
+ estimated = min(ack.get("estimated_cleanup", timeout), timeout)
320
+ ready = await self._wait_event("module.shutdown.ready", name, timeout=estimated)
321
+ if ready:
322
+ # Module is ready to die — kill immediately
323
+ self.process_manager.stop_module(name, timeout=1)
324
+ else:
325
+ # Timeout — force stop
326
+ self.process_manager.stop_module(name, timeout=3)
327
+
328
+ self._log_lifecycle("stopped", name, reason=reason)
329
+ await self._publish_event("module.stopped", {"module_id": name})
330
+
331
+ async def _graceful_shutdown_all(self):
332
+ """Broadcast module.shutdown to all running modules, then force-kill survivors."""
333
+ running = [n for n in self.modules if self.process_manager.is_running(n)]
334
+ if not running:
335
+ return
336
+ print(f"[launcher] Graceful shutdown: {', '.join(running)}")
337
+ # Broadcast shutdown event
338
+ for name in running:
339
+ self._log_lifecycle("stopping", name, reason="system_shutdown")
340
+ await self._publish_event("module.shutdown", {
341
+ "module_id": name, "reason": "system_shutdown", "timeout": 10,
342
+ })
343
+ # Wait up to 10s total, then force-kill
344
+ deadline = time.time() + 10
345
+ while time.time() < deadline:
346
+ still_running = [n for n in running if self.process_manager.is_running(n)]
347
+ if not still_running:
348
+ break
349
+ await asyncio.sleep(0.5)
350
+ self.process_manager.stop_all(timeout=3)
351
+ for name in running:
352
+ self._log_lifecycle("stopped", name, reason="system_shutdown")
353
+
354
+ # ── Heartbeat to Registry ──
355
+
356
async def _heartbeat_loop(self):
    """POST a launcher heartbeat to the Registry every 30 seconds.

    Best-effort: any failure is ignored and retried on the next tick. The
    first heartbeat is sent 30 seconds after the loop starts.
    """
    beat = {"action": "heartbeat", "module_id": "launcher"}
    while not self._thread_shutdown.is_set():
        await asyncio.sleep(30)
        try:
            async with httpx.AsyncClient() as client:
                await client.post(
                    f"http://127.0.0.1:{self.registry_port}/modules",
                    json=beat,
                    headers={"Authorization": f"Bearer {self.kite_token}"},
                    timeout=5,
                )
        except Exception:
            # A missed heartbeat is not fatal; try again next interval.
            pass
370
+
371
+ # ── Registry startup ──
372
+
373
+ async def _start_registry(self):
374
+ """Start Registry as a subprocess, wait for it to write port.txt and /health to respond."""
375
+ registry_dir = os.path.join(self.project_root, "core", "registry")
376
+ if not os.path.isdir(registry_dir):
377
+ raise RuntimeError(f"Registry module not found at {registry_dir}")
378
+
379
+ # Clean our instance's port file before starting
380
+ port_file = os.path.join(registry_dir, "data", f"port_{self.instance_id}.txt")
381
+ if os.path.isfile(port_file):
382
+ os.remove(port_file)
383
+
384
+ registry_info = ModuleInfo(
385
+ name="registry",
386
+ display_name="Registry",
387
+ type="infrastructure",
388
+ state="enabled",
389
+ runtime="python",
390
+ entry="entry.py",
391
+ module_dir=registry_dir,
392
+ )
393
+
394
+ # Pass launcher_token + bind config via stdin
395
+ boot_info = {"token": self.kite_token, "registry_port": 0, "bind": "127.0.0.1", "instance_id": self.instance_id}
396
+ ok = self.process_manager.start_module(registry_info, boot_info=boot_info)
397
+ if not ok:
398
+ raise RuntimeError("Failed to start Registry")
399
+
400
+ # Wait for Registry to write port.txt
401
+ print("[launcher] Waiting for Registry to report its port...")
402
+ deadline = time.time() + 10
403
+ while time.time() < deadline:
404
+ if os.path.isfile(port_file):
405
+ try:
406
+ with open(port_file, "r") as f:
407
+ self.registry_port = int(f.read().strip())
408
+ break
409
+ except (ValueError, OSError):
410
+ pass
411
+ await asyncio.sleep(0.2)
412
+ else:
413
+ raise RuntimeError("Registry failed to write port.txt within 10s")
414
+
415
+ # Poll /health until ready
416
+ url = f"http://127.0.0.1:{self.registry_port}/health"
417
+ print(f"[launcher] Registry on port {self.registry_port}, waiting for health check...")
418
+
419
+ deadline = time.time() + 10
420
+ async with httpx.AsyncClient() as client:
421
+ while time.time() < deadline:
422
+ try:
423
+ resp = await client.get(url, timeout=1)
424
+ if resp.status_code == 200:
425
+ print("[launcher] Registry is ready")
426
+ return
427
+ except Exception:
428
+ pass
429
+ await asyncio.sleep(0.2)
430
+
431
+ raise RuntimeError("Registry failed to become ready within 10s")
432
+
433
async def _register_self(self):
    """Register the launcher itself with the Registry (best-effort).

    A non-200 response or any exception only produces a warning; startup
    continues either way.
    """
    registration = {
        "action": "register",
        "module_id": "launcher",
        "module_type": "infrastructure",
        "name": "Launcher",
        "api_endpoint": f"http://127.0.0.1:{self.api_port}",
        "health_endpoint": "/launcher/modules",
        "events_publish": {
            "module.started": {},
            "module.stopped": {},
            "module.state_changed": {},
        },
        "events_subscribe": [">"],
    }
    try:
        async with httpx.AsyncClient() as client:
            resp = await client.post(
                f"http://127.0.0.1:{self.registry_port}/modules",
                json=registration,
                headers={"Authorization": f"Bearer {self.kite_token}"},
                timeout=5,
            )
        if resp.status_code != 200:
            print(f"[launcher] WARNING: Registry registration returned {resp.status_code}")
        else:
            print("[launcher] Registered self to Registry")
    except Exception as e:
        print(f"[launcher] WARNING: failed to register to Registry: {e}")
460
+
461
+ # ── Module startup ──
462
+
463
+ def _topo_sort(self, modules: list[ModuleInfo]) -> list[ModuleInfo]:
464
+ """Topological sort by depends_on. Raises RuntimeError on cycle."""
465
+ name_map = {m.name: m for m in modules}
466
+ visited = set()
467
+ in_stack = set()
468
+ order = []
469
+
470
+ def visit(name):
471
+ if name in in_stack:
472
+ raise RuntimeError(f"Circular dependency detected involving '{name}'")
473
+ if name in visited:
474
+ return
475
+ in_stack.add(name)
476
+ info = name_map.get(name)
477
+ if info:
478
+ for dep in info.depends_on:
479
+ visit(dep)
480
+ in_stack.remove(name)
481
+ visited.add(name)
482
+ if info:
483
+ order.append(info)
484
+
485
+ for m in modules:
486
+ visit(m.name)
487
+ return order
488
+
489
+ async def _start_one_module(self, info: ModuleInfo):
490
+ """Start a single module: publish starting event, start process, wait for ready."""
491
+ self._log_lifecycle("starting", info.name)
492
+ await self._publish_event("module.starting", {"module_id": info.name})
493
+
494
+ token = self._module_tokens.get(info.name, "")
495
+ boot_info = {
496
+ "token": token,
497
+ "registry_port": self.registry_port,
498
+ "preferred_port": info.preferred_port,
499
+ "advertise_ip": "127.0.0.1",
500
+ }
501
+ ok = self.process_manager.start_module(info, boot_info=boot_info)
502
+ if not ok:
503
+ self._log_lifecycle("start_failed", info.name)
504
+ return
505
+
506
+ # Wait for module.ready (configurable timeout, degrade on timeout)
507
+ timeout = info.launch.timeout
508
+ ready = await self._wait_event("module.ready", info.name, timeout=timeout)
509
+ if ready:
510
+ print(f"[launcher] Module '{info.name}' is ready")
511
+ else:
512
+ print(f"[launcher] WARNING: '{info.name}' did not send module.ready within {timeout}s")
513
+
514
+ rec = self.process_manager.get_record(info.name)
515
+ self._log_lifecycle("started", info.name, pid=rec.pid if rec else None)
516
+ await self._publish_event("module.started", {"module_id": info.name})
517
+
518
+ async def _start_enabled_modules(self):
519
+ """Start modules in dependency order, auto-starting manual deps if needed."""
520
+ to_start = [m for m in self.modules.values()
521
+ if self._desired_states.get(m.name) == "running"]
522
+ if not to_start:
523
+ print("[launcher] No modules to start")
524
+ return
525
+
526
+ # Auto-start manual modules if depended upon
527
+ needed = set(m.name for m in to_start)
528
+ for m in to_start:
529
+ for dep in m.depends_on:
530
+ if dep not in needed:
531
+ dep_info = self.modules.get(dep)
532
+ if dep_info and dep_info.state != "disabled":
533
+ needed.add(dep)
534
+ to_start.append(dep_info)
535
+ self._desired_states[dep] = "running"
536
+ print(f"[launcher] Auto-starting '{dep}' (dependency)")
537
+ elif dep_info and dep_info.state == "disabled":
538
+ print(f"[launcher] ERROR: '{m.name}' depends on disabled module '{dep}'")
539
+
540
+ try:
541
+ sorted_modules = self._topo_sort(to_start)
542
+ except RuntimeError as e:
543
+ print(f"[launcher] ERROR: {e}")
544
+ return
545
+
546
+ print(f"[launcher] Starting {len(sorted_modules)} module(s)...")
547
+ for info in sorted_modules:
548
+ await self._start_one_module(info)
549
+
550
+ async def _register_module_tokens(self):
551
+ """Generate per-module tokens and register the mapping to Registry."""
552
+ for name in self.modules:
553
+ self._module_tokens[name] = secrets.token_hex(32)
554
+
555
+ if not self._module_tokens:
556
+ return
557
+
558
+ url = f"http://127.0.0.1:{self.registry_port}/tokens"
559
+ headers = {"Authorization": f"Bearer {self.kite_token}"}
560
+ try:
561
+ async with httpx.AsyncClient() as client:
562
+ resp = await client.post(url, json=self._module_tokens, headers=headers, timeout=5)
563
+ if resp.status_code == 200:
564
+ print(f"[launcher] Registered {len(self._module_tokens)} module token(s)")
565
+ else:
566
+ print(f"[launcher] WARNING: token registration returned {resp.status_code}")
567
+ except Exception as e:
568
+ print(f"[launcher] WARNING: failed to register module tokens: {e}")
569
+
570
+ # ── API thread ──
571
+
572
def _start_api_thread(self):
    """Run the launcher's FastAPI app under uvicorn in a daemon thread.

    A free loopback port is picked first and stored in ``self.api_port``.
    The call then waits up to 5 seconds for uvicorn to report that it has
    started; if it has not, a warning is printed and startup continues.
    """
    self.api_port = self._get_free_port()
    server = uvicorn.Server(uvicorn.Config(
        self._app,
        host="127.0.0.1",
        port=self.api_port,
        log_level="warning",
    ))
    self._api_server = server

    def _serve():
        self._api_server.run()

    threading.Thread(target=_serve, daemon=True).start()

    # Wait for API server to actually be ready before proceeding
    give_up_at = time.time() + 5
    while time.time() < give_up_at:
        if server.started:
            break
        time.sleep(0.05)
    else:
        print("[launcher] WARNING: API server may not be fully ready")

    print(f"[launcher] API server started on port {self.api_port}")
599
+
600
+ # ── Monitor loop ──
601
+
602
async def _monitor_loop(self):
    """Poll child processes once a second and react to exits.

    For every exited module: log it, publish ``module.stopped``, then
    either trigger a full restart (core module) or retry it up to
    ``MAX_FAIL`` times while its desired state is "running". The loop
    returns when shutdown is requested, after a full restart has been
    handed off, or when ``MAX_FAILED_MODULES`` modules have given up.
    """
    MAX_FAIL = 3  # failures per module before giving up on it
    MAX_FAILED_MODULES = 3  # given-up modules before the launcher exits

    while not self._thread_shutdown.is_set():
        exited = self.process_manager.check_exited()

        for name, rc in exited:
            print(f"[launcher] Module '{name}' exited with code {rc}")
            self._log_lifecycle("exited", name, exit_code=rc)
            await self._publish_event("module.stopped", {
                "module_id": name, "exit_code": rc,
            })
            info = self.modules.get(name)

            # Core module crash → full restart. _full_restart() runs its own
            # monitor loop, so this invocation ends once it returns.
            if info and info.is_core(self.project_root):
                print(f"[launcher] CRITICAL: core module '{name}' crashed, restarting all...")
                self._log_lifecycle("core_crash", name, exit_code=rc)
                await self._full_restart()
                return

            # Non-core: attempt restart if desired_state is "running".
            # NOTE(review): the counter is incremented for every reported
            # exit, whatever the desired state — confirm check_exited()
            # does not report modules that were stopped on purpose.
            self._fail_counts[name] = self._fail_counts.get(name, 0) + 1
            count = self._fail_counts[name]

            if count < MAX_FAIL and self._desired_states.get(name) == "running" and info:
                print(f"[launcher] Restarting '{name}' (attempt {count}/{MAX_FAIL})...")
                await self._start_one_module(info)
            elif count >= MAX_FAIL:
                # Give up: stop wanting this module to run.
                self._desired_states[name] = "stopped"
                self._log_lifecycle("failed", name, reason=f"exceeded {MAX_FAIL} retries")
                print(f"[launcher] Module '{name}' failed {MAX_FAIL} times, giving up")

        # Too many failed modules → exit
        failed_count = sum(1 for c in self._fail_counts.values() if c >= MAX_FAIL)
        if failed_count >= MAX_FAILED_MODULES:
            print(f"[launcher] {failed_count} modules permanently failed, Launcher exiting")
            return

        # Only rewrite the records file when something actually changed.
        if exited:
            self.process_manager.persist_records()

        await asyncio.sleep(1)
647
+
648
+ async def _full_restart(self):
649
+ """Stop all modules, then re-run the startup sequence."""
650
+ print("[launcher] Full restart: stopping all modules...")
651
+
652
+ # Disconnect Event Hub
653
+ if self._ws_task:
654
+ self._ws_task.cancel()
655
+ self._ws_task = None
656
+ if hasattr(self, '_heartbeat_task') and self._heartbeat_task:
657
+ self._heartbeat_task.cancel()
658
+ self._heartbeat_task = None
659
+ self._ws = None
660
+
661
+ await self._graceful_shutdown_all()
662
+ self._fail_counts.clear()
663
+
664
+ self._module_tokens.clear()
665
+
666
+ print("[launcher] Full restart: re-running startup sequence...")
667
+ try:
668
+ await self._start_registry()
669
+ await self._register_self()
670
+ self.modules = self.module_scanner.scan()
671
+ for n, info in self.modules.items():
672
+ self._log_lifecycle("scanned", n, state=info.state, module_dir=info.module_dir)
673
+ await self._register_module_tokens()
674
+ await self._start_enabled_modules()
675
+ self.process_manager.persist_records()
676
+ await self._connect_event_hub()
677
+ print("[launcher] Full restart complete, resuming monitor loop")
678
+ await self._monitor_loop()
679
+ except Exception as e:
680
+ print(f"[launcher] Full restart failed: {e}")
681
+
682
+ # ── Shutdown ──
683
+
684
def _final_cleanup(self):
    """Last-chance teardown: cancel tasks, stop processes, remove runtime files.

    Removes this instance's process-records file and Registry port file
    (missing files are ignored). On Windows the interpreter is then
    terminated with ``os._exit(0)``.
    """
    print("[launcher] Shutting down...")

    for task in (self._ws_task, getattr(self, '_heartbeat_task', None)):
        if task:
            task.cancel()

    self.process_manager.stop_all(timeout=10)

    if self._api_server:
        # uvicorn polls this flag and stops serving.
        self._api_server.should_exit = True

    # Clear instance runtime files
    self.process_manager._write_records_file([])
    stale_files = (
        self.process_manager.records_path,
        os.path.join(self.project_root, "core", "registry", "data", f"port_{self.instance_id}.txt"),
    )
    for stale in stale_files:
        try:
            os.remove(stale)
        except OSError:
            pass
    print("[launcher] Goodbye.")

    if IS_WINDOWS:
        os._exit(0)
713
+
714
+ # ── Utilities ──
715
+
716
+ def _load_discovery(self) -> dict | None:
717
+ """Read discovery config from launcher's own module.md."""
718
+ md_path = os.path.join(self.project_root, "core", "launcher", "module.md")
719
+ try:
720
+ with open(md_path, "r", encoding="utf-8") as f:
721
+ fm = _parse_frontmatter(f.read())
722
+ discovery = fm.get("discovery")
723
+ if isinstance(discovery, dict) and discovery:
724
+ print(f"[launcher] Discovery sources: {', '.join(discovery.keys())}")
725
+ return discovery
726
+ except Exception as e:
727
+ print(f"[launcher] WARNING: failed to read discovery config: {e}")
728
+ return None
729
+
730
+ def _log_lifecycle(self, event: str, module: str, **extra):
731
+ """Append one JSONL line to core/launcher/data/lifecycle.jsonl."""
732
+ from datetime import datetime, timezone
733
+ record = {"ts": datetime.now(timezone.utc).isoformat(), "event": event, "module": module}
734
+ record.update(extra)
735
+ try:
736
+ os.makedirs(os.path.dirname(self._lifecycle_log), exist_ok=True)
737
+ with open(self._lifecycle_log, "a", encoding="utf-8") as f:
738
+ f.write(json.dumps(record, ensure_ascii=False) + "\n")
739
+ except Exception:
740
+ pass
741
+
742
+ @staticmethod
743
+ def _get_free_port() -> int:
744
+ """Get a free port assigned by the OS (bind to port 0)."""
745
+ import socket
746
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
747
+ s.bind(("127.0.0.1", 0))
748
+ return s.getsockname()[1]
749
+
750
+ # ── API app ──
751
+
752
def _create_api_app(self) -> FastAPI:
    """Create the FastAPI app with Launcher management routes.

    Routes registered:
        GET  /launcher/modules                 list modules and their states
        POST /launcher/modules/{name}/start    start a module
        POST /launcher/modules/{name}/stop     gracefully stop a module
        POST /launcher/modules/{name}/restart  stop, re-issue token, start
        POST /launcher/rescan                  rescan module directories
        PUT  /launcher/modules/{name}/state    change enabled/manual/disabled

    The handlers close over this Launcher instance. Token registration with
    the Registry is best-effort everywhere: a failure is reported as a
    warning on stdout and the request continues.
    """
    app = FastAPI(title="Kite Launcher", docs_url=None, redoc_url=None)
    launcher = self  # closure reference

    # ── Shared helpers (not routes) ──

    def _require_module(name: str, *, reject_disabled: bool = False):
        """Return the module info for *name*, or raise 404 (403 if disabled)."""
        info = launcher.modules.get(name)
        if not info:
            raise HTTPException(404, f"Module '{name}' not found")
        if reject_disabled and info.state == "disabled":
            raise HTTPException(403, f"Module '{name}' is disabled")
        return info

    async def _register_tokens(tokens: dict, label: str) -> None:
        """POST *tokens* ({module_name: token}) to the Registry, warning on failure."""
        try:
            async with httpx.AsyncClient() as client:
                resp = await client.post(
                    f"http://127.0.0.1:{launcher.registry_port}/tokens",
                    json=tokens,
                    headers={"Authorization": f"Bearer {launcher.kite_token}"},
                    timeout=5,
                )
            # client.post() does not raise on 4xx/5xx, so check explicitly.
            if resp.status_code >= 400:
                print(f"[launcher] WARNING: failed to register token for {label}: "
                      f"registry returned HTTP {resp.status_code}")
        except Exception as e:
            print(f"[launcher] WARNING: failed to register token for {label}: {e}")

    def _launch(name: str, info, *, via: str, status: str, action: str) -> dict:
        """Start the module process and record the outcome.

        On success updates desired state, clears the failure counter,
        persists process records, logs and publishes "module.started", and
        returns the response payload. On failure logs "start_failed" and
        raises HTTP 500.
        """
        boot_info = {
            "token": launcher._module_tokens[name],
            "registry_port": launcher.registry_port,
            "preferred_port": info.preferred_port,
        }
        if not launcher.process_manager.start_module(info, boot_info=boot_info):
            launcher._log_lifecycle("start_failed", name, via=via)
            raise HTTPException(500, f"Failed to {action} '{name}'")
        launcher._desired_states[name] = "running"
        launcher._fail_counts.pop(name, None)
        launcher.process_manager.persist_records()
        rec = launcher.process_manager.get_record(name)
        launcher._log_lifecycle("started", name, pid=rec.pid if rec else None, via=via)
        launcher._publish_event_threadsafe("module.started", {"module_id": name})
        return {"status": status, "name": name}

    # ── Routes ──

    @app.get("/launcher/modules")
    async def list_modules():
        """List all modules and their current status (three-layer state model)."""
        result = []
        for name, info in launcher.modules.items():
            running = launcher.process_manager.is_running(name)
            rec = launcher.process_manager.get_record(name)
            alive = bool(running and rec)
            result.append({
                "name": name,
                "display_name": info.display_name,
                "type": info.type,
                "config_state": info.state,
                "desired_state": launcher._desired_states.get(name, "stopped"),
                "actual_state": f"running({rec.pid})" if alive else "stopped",
                "pid": rec.pid if alive else None,
                "monitor": info.monitor,
            })
        return result

    @app.post("/launcher/modules/{name}/start")
    async def start_module(name: str):
        """Start a module by name. Generates token and passes boot_info via stdin."""
        info = _require_module(name, reject_disabled=True)

        # Generate (and register) a token only if the module has none yet.
        if name not in launcher._module_tokens:
            launcher._module_tokens[name] = secrets.token_hex(32)
            await _register_tokens({name: launcher._module_tokens[name]}, name)

        return _launch(name, info, via="api", status="started", action="start")

    @app.post("/launcher/modules/{name}/stop")
    async def stop_module(name: str, body: dict = None):
        """Stop a module with graceful shutdown. Accepts optional reason."""
        _require_module(name)
        reason = (body or {}).get("reason", "stop_requested")
        launcher._desired_states[name] = "stopped"
        await launcher._graceful_stop(name, reason)
        launcher.process_manager.persist_records()
        return {"status": "stopped", "name": name}

    @app.post("/launcher/modules/{name}/restart")
    async def restart_module(name: str, body: dict = None):
        """Restart a module (stop + start)."""
        info = _require_module(name, reject_disabled=True)
        reason = (body or {}).get("reason", "restart")
        await launcher._graceful_stop(name, reason)

        # A restart always issues a fresh token.
        launcher._module_tokens[name] = secrets.token_hex(32)
        await _register_tokens({name: launcher._module_tokens[name]}, name)

        return _launch(name, info, via="restart_api", status="restarted", action="restart")

    @app.post("/launcher/rescan")
    async def rescan_modules():
        """Rescan module directories for new/removed modules."""
        old_names = set(launcher.modules.keys())
        launcher.modules = launcher.module_scanner.scan()
        new_names = set(launcher.modules.keys())
        added = list(new_names - old_names)
        removed = list(old_names - new_names)

        # Log, initialize desired_state and issue a token for each new module.
        new_tokens = {}
        for name in added:
            info = launcher.modules[name]
            launcher._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
            launcher._desired_states[name] = "running" if info.state == "enabled" else "stopped"
            launcher._module_tokens[name] = secrets.token_hex(32)
            new_tokens[name] = launcher._module_tokens[name]

        if new_tokens:
            await _register_tokens(new_tokens, ", ".join(added))
        return {"added": added, "removed": removed, "total": len(launcher.modules)}

    @app.put("/launcher/modules/{name}/state")
    async def update_state(name: str, body: dict):
        """Update module state (enabled/manual/disabled). Writes to module.md."""
        info = _require_module(name)

        new_state = body.get("state", "")
        if new_state not in ("enabled", "manual", "disabled"):
            raise HTTPException(400, "state must be enabled, manual, or disabled")

        # Core modules cannot be disabled
        if info.is_core(launcher.project_root) and new_state == "disabled":
            raise HTTPException(403, "Core modules cannot be disabled")

        old_state = info.state
        info.state = new_state

        # Only "enabled" implies the module should be running.
        launcher._desired_states[name] = "running" if new_state == "enabled" else "stopped"

        _update_module_md_state(info.module_dir, new_state)
        launcher._publish_event_threadsafe("module.state_changed", {
            "module_id": name,
            "old_state": old_state,
            "new_state": new_state,
        })
        return {
            "name": name,
            "old_state": old_state,
            "new_state": new_state,
        }

    return app
940
+
941
+
942
+ def _update_module_md_state(module_dir: str, new_state: str):
943
+ """Update the state field in a module's module.md frontmatter."""
944
+ import re
945
+ md_path = os.path.join(module_dir, "module.md")
946
+ if not os.path.isfile(md_path):
947
+ return
948
+
949
+ try:
950
+ with open(md_path, "r", encoding="utf-8") as f:
951
+ content = f.read()
952
+
953
+ # Replace state: xxx in frontmatter
954
+ updated = re.sub(
955
+ r'^(state:\s*)(\S+)',
956
+ rf'\g<1>{new_state}',
957
+ content,
958
+ count=1,
959
+ flags=re.MULTILINE,
960
+ )
961
+
962
+ with open(md_path, "w", encoding="utf-8") as f:
963
+ f.write(updated)
964
+ except Exception as e:
965
+ print(f"[launcher] WARNING: failed to update module.md state: {e}")