@agentunion/kite 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/__init__.py +1 -0
  2. package/__main__.py +15 -0
  3. package/cli.js +70 -0
  4. package/core/__init__.py +0 -0
  5. package/core/__pycache__/__init__.cpython-313.pyc +0 -0
  6. package/core/event_hub/BENCHMARK.md +94 -0
  7. package/core/event_hub/__init__.py +0 -0
  8. package/core/event_hub/__pycache__/__init__.cpython-313.pyc +0 -0
  9. package/core/event_hub/__pycache__/bench.cpython-313.pyc +0 -0
  10. package/core/event_hub/__pycache__/bench_perf.cpython-313.pyc +0 -0
  11. package/core/event_hub/__pycache__/dedup.cpython-313.pyc +0 -0
  12. package/core/event_hub/__pycache__/entry.cpython-313.pyc +0 -0
  13. package/core/event_hub/__pycache__/hub.cpython-313.pyc +0 -0
  14. package/core/event_hub/__pycache__/router.cpython-313.pyc +0 -0
  15. package/core/event_hub/__pycache__/server.cpython-313.pyc +0 -0
  16. package/core/event_hub/bench.py +459 -0
  17. package/core/event_hub/bench_extreme.py +308 -0
  18. package/core/event_hub/bench_perf.py +350 -0
  19. package/core/event_hub/bench_results/.gitkeep +0 -0
  20. package/core/event_hub/bench_results/2026-02-28_13-26-48.json +51 -0
  21. package/core/event_hub/bench_results/2026-02-28_13-44-45.json +51 -0
  22. package/core/event_hub/bench_results/2026-02-28_13-45-39.json +51 -0
  23. package/core/event_hub/dedup.py +31 -0
  24. package/core/event_hub/entry.py +113 -0
  25. package/core/event_hub/hub.py +263 -0
  26. package/core/event_hub/module.md +21 -0
  27. package/core/event_hub/router.py +21 -0
  28. package/core/event_hub/server.py +138 -0
  29. package/core/event_hub_bench/entry.py +371 -0
  30. package/core/event_hub_bench/module.md +25 -0
  31. package/core/launcher/__init__.py +0 -0
  32. package/core/launcher/__pycache__/__init__.cpython-313.pyc +0 -0
  33. package/core/launcher/__pycache__/entry.cpython-313.pyc +0 -0
  34. package/core/launcher/__pycache__/module_scanner.cpython-313.pyc +0 -0
  35. package/core/launcher/__pycache__/process_manager.cpython-313.pyc +0 -0
  36. package/core/launcher/data/log/lifecycle.jsonl +1045 -0
  37. package/core/launcher/data/processes_14752.json +32 -0
  38. package/core/launcher/data/token.txt +1 -0
  39. package/core/launcher/entry.py +965 -0
  40. package/core/launcher/module.md +37 -0
  41. package/core/launcher/module_scanner.py +253 -0
  42. package/core/launcher/process_manager.py +435 -0
  43. package/core/registry/__init__.py +0 -0
  44. package/core/registry/__pycache__/__init__.cpython-313.pyc +0 -0
  45. package/core/registry/__pycache__/entry.cpython-313.pyc +0 -0
  46. package/core/registry/__pycache__/server.cpython-313.pyc +0 -0
  47. package/core/registry/__pycache__/store.cpython-313.pyc +0 -0
  48. package/core/registry/data/port.txt +1 -0
  49. package/core/registry/data/port_14752.txt +1 -0
  50. package/core/registry/data/port_484.txt +1 -0
  51. package/core/registry/entry.py +73 -0
  52. package/core/registry/module.md +30 -0
  53. package/core/registry/server.py +256 -0
  54. package/core/registry/store.py +232 -0
  55. package/extensions/__init__.py +0 -0
  56. package/extensions/__pycache__/__init__.cpython-313.pyc +0 -0
  57. package/extensions/services/__init__.py +0 -0
  58. package/extensions/services/__pycache__/__init__.cpython-313.pyc +0 -0
  59. package/extensions/services/watchdog/__init__.py +0 -0
  60. package/extensions/services/watchdog/__pycache__/__init__.cpython-313.pyc +0 -0
  61. package/extensions/services/watchdog/__pycache__/entry.cpython-313.pyc +0 -0
  62. package/extensions/services/watchdog/__pycache__/monitor.cpython-313.pyc +0 -0
  63. package/extensions/services/watchdog/__pycache__/server.cpython-313.pyc +0 -0
  64. package/extensions/services/watchdog/entry.py +143 -0
  65. package/extensions/services/watchdog/module.md +25 -0
  66. package/extensions/services/watchdog/monitor.py +420 -0
  67. package/extensions/services/watchdog/server.py +167 -0
  68. package/main.py +17 -0
  69. package/package.json +27 -0
@@ -0,0 +1,420 @@
1
+ """
2
+ Watchdog monitor — periodic health checks + resource monitoring.
3
+ Launcher handles process-level crashes; Watchdog handles app-level failures
4
+ (process alive but /health unhealthy) and resource anomalies (CPU/memory).
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ import time
10
+ from datetime import datetime, timezone
11
+
12
+ import httpx
13
+
14
+
15
+ # Module health states
16
+ HEALTHY = "healthy"
17
+ UNHEALTHY = "unhealthy"
18
+ UNKNOWN = "unknown"
19
+
20
+ # Resource states
21
+ NORMAL = "normal"
22
+ WARNING = "warning"
23
+ CRITICAL = "critical"
24
+
25
+
26
+ class ModuleStatus:
27
+ """Track health and resource state for a single module."""
28
+
29
+ def __init__(self, module_id: str, api_endpoint: str, health_endpoint: str, pid: int = None):
30
+ self.module_id = module_id
31
+ self.api_endpoint = api_endpoint
32
+ self.health_endpoint = health_endpoint
33
+ self.pid = pid
34
+ # Health check state
35
+ self.state: str = UNKNOWN
36
+ self.fail_count: int = 0
37
+ self.last_check: float = 0
38
+ self.last_healthy: float = 0
39
+ self.last_error: str = ""
40
+ self.restarted_count: int = 0
41
+ # Resource monitoring state
42
+ self.resource_state: str = NORMAL
43
+ self.resource_warning_count: int = 0
44
+ self.resource_critical_count: int = 0
45
+ self.critical_since: float = 0 # when critical started
46
+ self.memory_samples: list[float] = [] # last 5 memory_rss samples
47
+ self.recovery_since: float = 0 # when recovery observation started
48
+ self.last_metrics: dict = {}
49
+
50
+
51
class HealthMonitor:
    """Periodically check module health and resources, track failures, trigger restarts.

    Health checks GET each module's health endpoint; resource checks sample
    CPU/memory via psutil (lazily imported). Consecutive failures or a
    sustained resource-critical state trigger a restart through the Launcher
    HTTP API. State transitions are published as events via `publish_event`.
    """

    # Health-check thresholds
    HEALTH_TIMEOUT = 5        # HTTP timeout per health check (seconds)
    FAIL_THRESHOLD = 3        # consecutive failures before restart
    MAX_RESTARTS = 3          # max restarts before giving up on a module
    ALERT_AFTER_RESTARTS = 2  # publish alert after this many restarts

    # Resource thresholds
    MEMORY_WARNING = 70       # memory_percent > 70% → warning
    MEMORY_CRITICAL = 85      # memory_percent > 85% → critical
    CPU_WARNING = 80          # cpu_percent > 80% → warning
    CPU_CRITICAL = 95         # cpu_percent > 95% → critical
    CRITICAL_TIMEOUT = 30     # seconds in critical before restart
    RECOVERY_OBSERVE = 60     # seconds to observe after recovery

    # Check intervals (seconds) per resource state — tighter polling when degraded
    INTERVALS = {NORMAL: 15, WARNING: 5, CRITICAL: 2}

    def __init__(self, own_token: str, registry_url: str, launcher_url: str,
                 publish_event=None):
        self.own_token = own_token            # bearer token for Registry requests
        self.registry_url = registry_url      # Registry base URL
        self.launcher_url = launcher_url      # Launcher base URL (module list + restart)
        self.publish_event = publish_event    # async callable(event_dict)
        self.modules: dict[str, ModuleStatus] = {}
        self._running = False                 # main-loop flag, cleared by stop()
        self._psutil = None                   # lazy import; stays None if unavailable

    # ── Module discovery ──

    async def discover_modules(self):
        """Fetch monitored modules from Launcher API + Registry health endpoints."""
        # Step 1: Get module list with monitor/pid from Launcher API
        monitored = {}  # name -> pid
        try:
            async with httpx.AsyncClient() as client:
                resp = await client.get(
                    f"{self.launcher_url}/launcher/modules", timeout=5,
                )
                if resp.status_code == 200:
                    for m in resp.json():
                        name = m.get("name", "")
                        # Skip ourselves and modules that opted out of monitoring
                        if name == "watchdog" or not m.get("monitor", True):
                            continue
                        # Only track modules the Launcher reports as running
                        if m.get("actual_state", "").startswith("running"):
                            monitored[name] = m.get("pid")
        except Exception as e:
            # Without the Launcher list we cannot sync safely — keep current set
            print(f"[watchdog] Launcher API failed: {e}")
            return

        # Step 2: Get health endpoints from Registry (best-effort; defaults below)
        health_map = {}  # name -> {api_endpoint, health_endpoint}
        headers = {"Authorization": f"Bearer {self.own_token}"}
        try:
            async with httpx.AsyncClient() as client:
                resp = await client.get(
                    f"{self.registry_url}/lookup",
                    params={"field": "health_endpoint"},
                    headers=headers, timeout=5,
                )
                if resp.status_code == 200:
                    for entry in resp.json():
                        mid = entry.get("module", "")
                        if mid in monitored:
                            health_map[mid] = {
                                "api_endpoint": entry.get("api_endpoint", ""),
                                "health_endpoint": entry.get("value", "/health"),
                            }
        except Exception:
            pass  # Registry lookup is optional; modules fall back to "/health"

        # Step 3: Sync module list — add new, refresh pids, drop vanished
        seen = set()
        for mid, pid in monitored.items():
            seen.add(mid)
            h = health_map.get(mid, {})
            if mid not in self.modules:
                self.modules[mid] = ModuleStatus(
                    module_id=mid,
                    api_endpoint=h.get("api_endpoint", ""),
                    health_endpoint=h.get("health_endpoint", "/health"),
                    pid=pid,
                )
            else:
                # Pid may change after a restart — keep it fresh
                self.modules[mid].pid = pid

        for mid in list(self.modules):
            if mid not in seen:
                del self.modules[mid]

    # ── Health check ──

    async def _check_one(self, status: ModuleStatus):
        """Check a single module's /health endpoint.

        Healthy only when HTTP 200 AND body `status` == "healthy"; every other
        outcome (bad status code, non-healthy body, network error) counts as
        an unhealthy check.
        """
        url = f"{status.api_endpoint}{status.health_endpoint}"
        status.last_check = time.time()

        try:
            async with httpx.AsyncClient() as client:
                resp = await client.get(url, timeout=self.HEALTH_TIMEOUT)
                if resp.status_code == 200:
                    body = resp.json()
                    if body.get("status") == "healthy":
                        await self._mark_healthy(status)
                        return
                    status.last_error = f"unhealthy response: {body.get('status')}"
                else:
                    status.last_error = f"HTTP {resp.status_code}"
        except Exception as e:
            status.last_error = str(e)

        await self._mark_unhealthy(status)

    async def _mark_healthy(self, status: ModuleStatus):
        """Module responded healthy — reset failure tracking."""
        was_unhealthy = status.state == UNHEALTHY
        status.state = HEALTHY
        status.fail_count = 0
        status.last_healthy = time.time()
        status.last_error = ""

        # Only announce a recovery on an actual UNHEALTHY → HEALTHY transition
        if was_unhealthy:
            print(f"[watchdog] {status.module_id} recovered")
            await self._publish("watchdog.module.recovered", {
                "module_id": status.module_id,
            })

    async def _mark_unhealthy(self, status: ModuleStatus):
        """Module failed health check — increment counter, maybe restart."""
        status.state = UNHEALTHY
        status.fail_count += 1
        print(f"[watchdog] {status.module_id} unhealthy ({status.fail_count}/{self.FAIL_THRESHOLD}): {status.last_error}")

        await self._publish("watchdog.module.unhealthy", {
            "module_id": status.module_id,
            "fail_count": status.fail_count,
            "error": status.last_error,
        })

        # Restart if threshold reached and restart budget not exhausted
        if (status.fail_count >= self.FAIL_THRESHOLD
                and status.restarted_count < self.MAX_RESTARTS):
            await self._restart_module(status)

    # ── Restart via Launcher API ──

    async def _restart_module(self, status: ModuleStatus):
        """Restart a module via Launcher API.

        On success: bumps restarted_count, resets fail_count, and publishes a
        `watchdog.alert` once ALERT_AFTER_RESTARTS is reached. Failures are
        logged but not retried here (the next check cycle retriggers).
        """
        mid = status.module_id
        print(f"[watchdog] Restarting {mid} (attempt {status.restarted_count + 1}/{self.MAX_RESTARTS})")
        try:
            async with httpx.AsyncClient() as client:
                resp = await client.post(
                    f"{self.launcher_url}/launcher/modules/{mid}/restart",
                    json={"reason": "resource_critical" if status.resource_state == CRITICAL else "restart"},
                    timeout=15,
                )
                if resp.status_code == 200:
                    status.restarted_count += 1
                    status.fail_count = 0
                    print(f"[watchdog] {mid} restart requested")
                    if status.restarted_count >= self.ALERT_AFTER_RESTARTS:
                        await self._publish("watchdog.alert", {
                            "module_id": mid,
                            "restarted_count": status.restarted_count,
                            "message": f"{mid} has been restarted {status.restarted_count} times",
                        })
                else:
                    print(f"[watchdog] {mid} restart failed: HTTP {resp.status_code}")
        except Exception as e:
            print(f"[watchdog] {mid} restart error: {e}")

    # ── Resource monitoring ──

    def _collect_metrics(self, status: ModuleStatus) -> dict | None:
        """Collect CPU/memory metrics for a module via psutil.

        Returns None when the pid is unknown, psutil is not installed, or the
        process cannot be inspected (gone, permission denied).
        """
        if not status.pid:
            return None
        if not self._psutil:
            try:
                import psutil
                self._psutil = psutil
            except ImportError:
                return None  # resource monitoring silently disabled
        try:
            proc = self._psutil.Process(status.pid)
            # NOTE(review): cpu_percent(interval=0.1) blocks the event loop
            # ~100ms per module per cycle — consider interval=None or running
            # collection in an executor; confirm before changing.
            return {
                "memory_rss": round(proc.memory_info().rss / 1024 / 1024, 1),
                "memory_percent": round(proc.memory_percent(), 1),
                "cpu_percent": round(proc.cpu_percent(interval=0.1), 1),
                "thread_count": proc.num_threads(),
            }
        except Exception:
            return None

    async def _check_resources(self, status: ModuleStatus):
        """Collect metrics and evaluate resource state."""
        metrics = self._collect_metrics(status)
        if not metrics:
            return
        status.last_metrics = metrics

        # Track memory samples for leak detection (rolling window of 5)
        status.memory_samples.append(metrics["memory_rss"])
        if len(status.memory_samples) > 5:
            status.memory_samples.pop(0)

        # Determine raw level from thresholds (worst of CPU/memory wins)
        mem_pct = metrics["memory_percent"]
        cpu_pct = metrics["cpu_percent"]
        if mem_pct > self.MEMORY_CRITICAL or cpu_pct > self.CPU_CRITICAL:
            raw_level = CRITICAL
        elif mem_pct > self.MEMORY_WARNING or cpu_pct > self.CPU_WARNING:
            raw_level = WARNING
        else:
            raw_level = NORMAL

        # Memory leak detection: 3+ consecutive rises, total >20%
        if len(status.memory_samples) >= 3:
            recent = status.memory_samples[-3:]
            if all(recent[i] < recent[i+1] for i in range(len(recent)-1)):
                if recent[-1] > recent[0] * 1.2:
                    raw_level = CRITICAL

        await self._update_resource_state(status, raw_level, metrics)

    async def _update_resource_state(self, status: ModuleStatus, level: str, metrics: dict):
        """Update resource state with consecutive-count logic.

        Debouncing: CRITICAL needs 3 consecutive samples, WARNING needs 2,
        and recovery to NORMAL requires RECOVERY_OBSERVE seconds of sustained
        normal samples. A module already CRITICAL for CRITICAL_TIMEOUT
        seconds gets restarted.
        """
        now = time.time()
        old_state = status.resource_state

        if level == CRITICAL:
            status.resource_critical_count += 1
            status.resource_warning_count = 0
            status.recovery_since = 0  # any critical sample aborts recovery observation
            if status.resource_critical_count >= 3 and old_state != CRITICAL:
                status.resource_state = CRITICAL
                status.critical_since = now
                await self._publish("watchdog.module.resource_critical", {
                    "module_id": status.module_id, "metrics": metrics,
                })
            elif old_state == CRITICAL:
                # Already critical — check timeout
                if now - status.critical_since >= self.CRITICAL_TIMEOUT:
                    await self._resource_restart(status)
        elif level == WARNING:
            status.resource_warning_count += 1
            status.resource_critical_count = 0
            status.recovery_since = 0
            if status.resource_warning_count >= 2 and old_state == NORMAL:
                status.resource_state = WARNING
                await self._publish("watchdog.module.resource_warning", {
                    "module_id": status.module_id, "metrics": metrics,
                })
        else:  # NORMAL
            status.resource_warning_count = 0
            status.resource_critical_count = 0
            if old_state != NORMAL:
                if not status.recovery_since:
                    status.recovery_since = now  # start observation window
                elif now - status.recovery_since >= self.RECOVERY_OBSERVE:
                    status.resource_state = NORMAL
                    status.recovery_since = 0
                    await self._publish("watchdog.module.resource_recovered", {
                        "module_id": status.module_id,
                    })

    async def _resource_restart(self, status: ModuleStatus):
        """Restart module due to resource critical timeout."""
        if status.restarted_count >= self.MAX_RESTARTS:
            # Budget exhausted — alert instead of restarting again
            await self._publish("watchdog.alert", {
                "module_id": status.module_id,
                "message": f"{status.module_id} exceeded max restarts (resource_critical)",
            })
            return
        print(f"[watchdog] Resource critical timeout for {status.module_id}, restarting...")
        # Reset resource state before the restart so the fresh process starts clean
        status.resource_state = NORMAL
        status.resource_critical_count = 0
        status.critical_since = 0
        await self._restart_module(status)

    # ── Event publishing ──

    async def _publish(self, event_type: str, data: dict):
        """Publish event via callback if available; publish failures are swallowed."""
        if not self.publish_event:
            return
        try:
            await self.publish_event({
                "event": event_type,
                "data": data,
            })
        except Exception:
            pass  # monitoring must not die because the event bus is down

    # ── Incoming event handler ──

    async def handle_event(self, msg: dict):
        """Handle events from Event Hub (module.started / module.stopped)."""
        event_type = msg.get("event", "")
        data = msg.get("data", {})
        module_id = data.get("module_id", "")

        # Ignore malformed events and our own lifecycle
        if not module_id or module_id == "watchdog":
            return

        if event_type == "module.started":
            print(f"[watchdog] Received module.started: {module_id}")
            # Trigger immediate discovery to pick up the new module
            await self.discover_modules()

        elif event_type == "module.stopped":
            print(f"[watchdog] Received module.stopped: {module_id}")
            # Remove from tracking — it's gone
            self.modules.pop(module_id, None)

    # ── Main loop ──

    async def run(self):
        """Main monitoring loop with dynamic intervals per resource state."""
        self._running = True
        print("[watchdog] Monitor started")
        last_discover = 0  # forces an immediate discovery on first iteration

        while self._running:
            now = time.time()
            # Rediscover every 30s
            if now - last_discover >= 30:
                await self.discover_modules()
                last_discover = now

            if self.modules:
                # Health + resource checks for all modules run concurrently;
                # return_exceptions keeps one failure from cancelling the rest
                tasks = []
                for s in self.modules.values():
                    tasks.append(self._check_one(s))
                    tasks.append(self._check_resources(s))
                await asyncio.gather(*tasks, return_exceptions=True)

            # Sleep = shortest interval needed by any module
            interval = self._min_interval()
            await asyncio.sleep(interval)

    def _min_interval(self) -> float:
        """Return the shortest check interval needed across all modules."""
        if not self.modules:
            return self.INTERVALS[NORMAL]
        return min(self.INTERVALS.get(s.resource_state, 15) for s in self.modules.values())

    def stop(self):
        # Loop exits after the current sleep; does not cancel in-flight checks
        self._running = False

    # ── Stats ──

    def get_status(self) -> dict:
        """Return current health and resource status of all monitored modules."""
        return {
            mid: {
                "state": s.state,
                "fail_count": s.fail_count,
                "restarted_count": s.restarted_count,
                "last_check": s.last_check,
                "last_healthy": s.last_healthy,
                "last_error": s.last_error,
                "resource_state": s.resource_state,
                "metrics": s.last_metrics,
            }
            for mid, s in self.modules.items()
        }
@@ -0,0 +1,167 @@
1
+ """
2
+ Watchdog HTTP server.
3
+ Exposes /health and /status endpoints. Runs the monitor loop on startup.
4
+ Connects to Event Hub via WebSocket for event publishing and subscription.
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ import time
10
+ import uuid
11
+
12
+ import httpx
13
+ import websockets
14
+ from fastapi import FastAPI
15
+
16
+ from .monitor import HealthMonitor
17
+
18
+
19
class WatchdogServer:
    """Watchdog HTTP server.

    Exposes /health and /status, runs the HealthMonitor loop as a background
    task, and maintains a WebSocket connection to the Event Hub for event
    publishing and lifecycle-event subscription.
    """

    def __init__(self, monitor: HealthMonitor, token: str = "",
                 event_hub_ws: str = ""):
        self.monitor = monitor
        self.token = token                  # auth token passed to Event Hub
        self.event_hub_ws = event_hub_ws    # ws:// URL; empty disables the client
        self._monitor_task: asyncio.Task | None = None
        self._ws_task: asyncio.Task | None = None
        self._heartbeat_task: asyncio.Task | None = None
        # Fix: always defined (was created only in startup and probed with
        # hasattr at shutdown) — keeps task attributes uniform.
        self._test_task: asyncio.Task | None = None
        self._ws: object | None = None      # live websocket, or None when disconnected
        self._start_time = time.time()
        self.app = self._create_app()

        # Wire up publish callback on monitor
        self.monitor.publish_event = self._publish_event

    def _create_app(self) -> FastAPI:
        """Build the FastAPI app with lifecycle hooks and routes."""
        app = FastAPI(title="Kite Watchdog", docs_url=None, redoc_url=None)
        server = self  # capture for the closures below

        @app.on_event("startup")
        async def _startup():
            # Background tasks: monitor loop, registry heartbeat, and (when an
            # Event Hub URL is configured) the WS client + test publisher.
            server._monitor_task = asyncio.create_task(server.monitor.run())
            server._heartbeat_task = asyncio.create_task(server._heartbeat_loop())
            if server.event_hub_ws:
                server._ws_task = asyncio.create_task(server._ws_loop())
                server._test_task = asyncio.create_task(server._test_event_loop())

        @app.on_event("shutdown")
        async def _shutdown():
            server.monitor.stop()
            if server._monitor_task:
                server._monitor_task.cancel()
            if server._heartbeat_task:
                server._heartbeat_task.cancel()
            if server._ws_task:
                server._ws_task.cancel()
            if server._test_task:
                server._test_task.cancel()
            if server._ws:
                await server._ws.close()

        @app.get("/health")
        async def health():
            return {
                "status": "healthy",
                "details": {
                    "monitored_modules": len(server.monitor.modules),
                    "event_hub_connected": server._ws is not None,
                    "uptime_seconds": round(time.time() - server._start_time),
                },
            }

        @app.get("/status")
        async def status():
            return server.monitor.get_status()

        return app

    # ── Event Hub WebSocket client ──

    async def _ws_loop(self):
        """Connect to Event Hub, subscribe, and listen. Reconnect on failure."""
        while True:
            try:
                await self._ws_connect()
            except asyncio.CancelledError:
                return
            except Exception as e:
                print(f"[watchdog] Event Hub connection error: {e}")
                self._ws = None
                await asyncio.sleep(5)  # reconnect delay

    async def _ws_connect(self):
        """Single WebSocket session: connect, subscribe, receive loop."""
        url = f"{self.event_hub_ws}?token={self.token}"
        async with websockets.connect(url) as ws:
            self._ws = ws
            print("[watchdog] Connected to Event Hub")

            # Subscribe to module lifecycle events
            await ws.send(json.dumps({
                "type": "subscribe",
                "events": ["module.started", "module.stopped"],
            }))

            # Receive loop — runs until the connection closes
            async for raw in ws:
                try:
                    msg = json.loads(raw)
                except (json.JSONDecodeError, TypeError):
                    continue  # ignore non-JSON frames

                msg_type = msg.get("type", "")
                if msg_type == "event":
                    await self.monitor.handle_event(msg)
                elif msg_type == "ack":
                    pass  # publish confirmed
                elif msg_type == "error":
                    print(f"[watchdog] Event Hub error: {msg.get('message')}")

    async def _publish_event(self, event: dict):
        """Publish an event to Event Hub via WebSocket.

        No-op while disconnected; the monitor's best-effort publish contract
        tolerates dropped events.
        """
        if not self._ws:
            return
        from datetime import datetime, timezone
        msg = {
            "type": "event",
            "event_id": str(uuid.uuid4()),
            "event": event.get("event", ""),
            "source": "watchdog",
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "data": event.get("data", {}),
        }
        try:
            await self._ws.send(json.dumps(msg))
        except Exception as e:
            print(f"[watchdog] Failed to publish event: {e}")

    # ── Heartbeat to Registry ──

    async def _heartbeat_loop(self):
        """Send heartbeat to Registry every 30 seconds; failures are ignored."""
        while True:
            await asyncio.sleep(30)
            try:
                async with httpx.AsyncClient() as client:
                    await client.post(
                        f"{self.monitor.registry_url}/modules",
                        json={"action": "heartbeat", "module_id": "watchdog"},
                        headers={"Authorization": f"Bearer {self.monitor.own_token}"},
                        timeout=5,
                    )
            except Exception:
                pass

    async def _test_event_loop(self):
        """Publish a test event every 5 seconds."""
        from datetime import datetime, timezone
        while True:
            await asyncio.sleep(5)
            await self._publish_event({
                "event": "watchdog.test",
                "data": {
                    "message": "heartbeat from watchdog",
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                },
            })
package/main.py ADDED
@@ -0,0 +1,17 @@
1
+ import secrets
2
+ import sys
3
+ import os
4
+
5
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
6
+
7
+ from core.launcher.entry import Launcher
8
+
9
+
10
+ def main():
11
+ token = secrets.token_hex(32)
12
+ print(f"[main] KITE_TOKEN generated ({len(token)} chars)")
13
+ Launcher(kite_token=token).run()
14
+
15
+
16
+ if __name__ == "__main__":
17
+ main()
package/package.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "name": "@agentunion/kite",
3
+ "version": "1.0.0",
4
+ "description": "Kite framework launcher — start Kite from anywhere",
5
+ "bin": {
6
+ "kite": "./cli.js"
7
+ },
8
+ "files": [
9
+ "cli.js",
10
+ "main.py",
11
+ "__main__.py",
12
+ "__init__.py",
13
+ "core/**",
14
+ "extensions/**"
15
+ ],
16
+ "engines": {
17
+ "node": ">=16"
18
+ },
19
+ "os": ["win32", "linux", "darwin"],
20
+ "keywords": ["kite", "framework", "launcher"],
21
+ "license": "MIT",
22
+ "author": "agentcp",
23
+ "repository": {
24
+ "type": "git",
25
+ "url": "https://github.com/agentcp/kite.git"
26
+ }
27
+ }