@agentunion/kite 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/__init__.py +1 -0
- package/__main__.py +15 -0
- package/cli.js +70 -0
- package/core/__init__.py +0 -0
- package/core/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/event_hub/BENCHMARK.md +94 -0
- package/core/event_hub/__init__.py +0 -0
- package/core/event_hub/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/bench.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/bench_perf.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/dedup.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/hub.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/router.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/server.cpython-313.pyc +0 -0
- package/core/event_hub/bench.py +459 -0
- package/core/event_hub/bench_extreme.py +308 -0
- package/core/event_hub/bench_perf.py +350 -0
- package/core/event_hub/bench_results/.gitkeep +0 -0
- package/core/event_hub/bench_results/2026-02-28_13-26-48.json +51 -0
- package/core/event_hub/bench_results/2026-02-28_13-44-45.json +51 -0
- package/core/event_hub/bench_results/2026-02-28_13-45-39.json +51 -0
- package/core/event_hub/dedup.py +31 -0
- package/core/event_hub/entry.py +113 -0
- package/core/event_hub/hub.py +263 -0
- package/core/event_hub/module.md +21 -0
- package/core/event_hub/router.py +21 -0
- package/core/event_hub/server.py +138 -0
- package/core/event_hub_bench/entry.py +371 -0
- package/core/event_hub_bench/module.md +25 -0
- package/core/launcher/__init__.py +0 -0
- package/core/launcher/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/module_scanner.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/process_manager.cpython-313.pyc +0 -0
- package/core/launcher/data/log/lifecycle.jsonl +1045 -0
- package/core/launcher/data/processes_14752.json +32 -0
- package/core/launcher/data/token.txt +1 -0
- package/core/launcher/entry.py +965 -0
- package/core/launcher/module.md +37 -0
- package/core/launcher/module_scanner.py +253 -0
- package/core/launcher/process_manager.py +435 -0
- package/core/registry/__init__.py +0 -0
- package/core/registry/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/server.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/store.cpython-313.pyc +0 -0
- package/core/registry/data/port.txt +1 -0
- package/core/registry/data/port_14752.txt +1 -0
- package/core/registry/data/port_484.txt +1 -0
- package/core/registry/entry.py +73 -0
- package/core/registry/module.md +30 -0
- package/core/registry/server.py +256 -0
- package/core/registry/store.py +232 -0
- package/extensions/__init__.py +0 -0
- package/extensions/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/__init__.py +0 -0
- package/extensions/services/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__init__.py +0 -0
- package/extensions/services/watchdog/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/entry.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/monitor.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/server.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/entry.py +143 -0
- package/extensions/services/watchdog/module.md +25 -0
- package/extensions/services/watchdog/monitor.py +420 -0
- package/extensions/services/watchdog/server.py +167 -0
- package/main.py +17 -0
- package/package.json +27 -0
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Watchdog monitor — periodic health checks + resource monitoring.
|
|
3
|
+
Launcher handles process-level crashes; Watchdog handles app-level failures
|
|
4
|
+
(process alive but /health unhealthy) and resource anomalies (CPU/memory).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import json
|
|
9
|
+
import time
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
|
+
|
|
12
|
+
import httpx
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Module health states (values stored in ModuleStatus.state)
HEALTHY = "healthy"      # /health returned HTTP 200 with body status == "healthy"
UNHEALTHY = "unhealthy"  # health check failed: error, timeout, non-200, or bad status
UNKNOWN = "unknown"      # initial state, not checked yet

# Resource states (values stored in ModuleStatus.resource_state)
NORMAL = "normal"      # CPU/memory below warning thresholds
WARNING = "warning"    # sustained warning-level CPU/memory usage
CRITICAL = "critical"  # sustained critical-level usage; restart candidate
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ModuleStatus:
    """Track health-check and resource-monitoring state for a single module.

    Instances are created and refreshed by ``HealthMonitor.discover_modules()``
    and mutated by the check/restart routines. This is a plain mutable record;
    it performs no I/O itself.

    Args:
        module_id: Unique module name as reported by the Launcher.
        api_endpoint: Base URL of the module's HTTP API.
        health_endpoint: Path of the health route (e.g. ``"/health"``),
            appended to ``api_endpoint`` when checking.
        pid: OS process id used for psutil resource sampling; ``None`` when
            unknown (resource checks are then skipped).
    """

    def __init__(self, module_id: str, api_endpoint: str, health_endpoint: str,
                 pid: int | None = None):
        self.module_id = module_id
        self.api_endpoint = api_endpoint
        self.health_endpoint = health_endpoint
        self.pid = pid
        # Health check state
        self.state: str = UNKNOWN          # HEALTHY / UNHEALTHY / UNKNOWN
        self.fail_count: int = 0           # consecutive failed health checks
        self.last_check: float = 0         # epoch seconds of last check attempt
        self.last_healthy: float = 0       # epoch seconds of last healthy response
        self.last_error: str = ""          # description of the most recent failure
        self.restarted_count: int = 0      # restarts issued for this module
        # Resource monitoring state
        self.resource_state: str = NORMAL     # NORMAL / WARNING / CRITICAL
        self.resource_warning_count: int = 0  # consecutive warning-level samples
        self.resource_critical_count: int = 0  # consecutive critical-level samples
        self.critical_since: float = 0     # when critical started
        self.memory_samples: list[float] = []  # last 5 memory_rss samples (MB)
        self.recovery_since: float = 0     # when recovery observation started
        self.last_metrics: dict = {}       # most recent psutil metrics sample
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class HealthMonitor:
    """Periodically check module health and resources, track failures, trigger restarts.

    Division of labor: the Launcher handles process-level crashes; this monitor
    handles app-level failures (process alive but /health unhealthy) and
    resource anomalies (CPU/memory), issuing restarts through the Launcher
    HTTP API and publishing events through an injected async callback.
    """

    # Thresholds
    HEALTH_TIMEOUT = 5  # HTTP timeout per health check (seconds)
    FAIL_THRESHOLD = 3  # consecutive failures before restart
    MAX_RESTARTS = 3  # max restarts before giving up on a module
    ALERT_AFTER_RESTARTS = 2  # publish alert after this many restarts

    # Resource thresholds
    MEMORY_WARNING = 70  # memory_percent > 70% → warning
    MEMORY_CRITICAL = 85  # memory_percent > 85% → critical
    CPU_WARNING = 80  # cpu_percent > 80% → warning
    CPU_CRITICAL = 95  # cpu_percent > 95% → critical
    CRITICAL_TIMEOUT = 30  # seconds in critical before restart
    RECOVERY_OBSERVE = 60  # seconds to observe after recovery

    # Check intervals (seconds) per resource state — hotter states poll faster
    INTERVALS = {NORMAL: 15, WARNING: 5, CRITICAL: 2}

    def __init__(self, own_token: str, registry_url: str, launcher_url: str,
                 publish_event=None):
        self.own_token = own_token  # bearer token for Registry requests
        self.registry_url = registry_url
        self.launcher_url = launcher_url
        self.publish_event = publish_event  # async callable(event_dict); may be None
        self.modules: dict[str, ModuleStatus] = {}  # module_id -> status record
        self._running = False
        self._psutil = None  # lazy import — resource checks degrade gracefully without psutil

    # ── Module discovery ──

    async def discover_modules(self):
        """Fetch monitored modules from Launcher API + Registry health endpoints."""
        # Step 1: Get module list with monitor/pid from Launcher API
        monitored = {}  # name -> pid (only modules in a running state)
        try:
            async with httpx.AsyncClient() as client:
                resp = await client.get(
                    f"{self.launcher_url}/launcher/modules", timeout=5,
                )
                if resp.status_code == 200:
                    for m in resp.json():
                        name = m.get("name", "")
                        # Skip self and modules that opted out of monitoring
                        if name == "watchdog" or not m.get("monitor", True):
                            continue
                        if m.get("actual_state", "").startswith("running"):
                            monitored[name] = m.get("pid")
        except Exception as e:
            # Without the Launcher we cannot know what to monitor — keep the
            # previous module set and retry on the next discovery cycle.
            print(f"[watchdog] Launcher API failed: {e}")
            return

        # Step 2: Get health endpoints from Registry
        health_map = {}  # name -> {api_endpoint, health_endpoint}
        headers = {"Authorization": f"Bearer {self.own_token}"}
        try:
            async with httpx.AsyncClient() as client:
                resp = await client.get(
                    f"{self.registry_url}/lookup",
                    params={"field": "health_endpoint"},
                    headers=headers, timeout=5,
                )
                if resp.status_code == 200:
                    for entry in resp.json():
                        mid = entry.get("module", "")
                        if mid in monitored:
                            health_map[mid] = {
                                "api_endpoint": entry.get("api_endpoint", ""),
                                "health_endpoint": entry.get("value", "/health"),
                            }
        except Exception:
            # Registry is optional here: modules without endpoint info simply
            # fail their health checks until the Registry responds.
            pass

        # Step 3: Sync module list — add new, update pids, drop vanished
        seen = set()
        for mid, pid in monitored.items():
            seen.add(mid)
            h = health_map.get(mid, {})
            if mid not in self.modules:
                self.modules[mid] = ModuleStatus(
                    module_id=mid,
                    api_endpoint=h.get("api_endpoint", ""),
                    health_endpoint=h.get("health_endpoint", "/health"),
                    pid=pid,
                )
            else:
                # pid can change after a Launcher-side restart
                self.modules[mid].pid = pid

        for mid in list(self.modules):
            if mid not in seen:
                del self.modules[mid]

    # ── Health check ──

    async def _check_one(self, status: ModuleStatus):
        """Check a single module's /health endpoint."""
        url = f"{status.api_endpoint}{status.health_endpoint}"
        status.last_check = time.time()

        try:
            async with httpx.AsyncClient() as client:
                resp = await client.get(url, timeout=self.HEALTH_TIMEOUT)
                if resp.status_code == 200:
                    body = resp.json()
                    if body.get("status") == "healthy":
                        await self._mark_healthy(status)
                        return
                    status.last_error = f"unhealthy response: {body.get('status')}"
                else:
                    status.last_error = f"HTTP {resp.status_code}"
        except Exception as e:
            # Covers timeouts, connection refused, and JSON decode errors
            status.last_error = str(e)

        # Any path that did not return above counts as a failed check
        await self._mark_unhealthy(status)

    async def _mark_healthy(self, status: ModuleStatus):
        """Module responded healthy — reset failure tracking."""
        was_unhealthy = status.state == UNHEALTHY
        status.state = HEALTHY
        status.fail_count = 0
        status.last_healthy = time.time()
        status.last_error = ""

        # Emit a recovery event only on the unhealthy→healthy transition
        if was_unhealthy:
            print(f"[watchdog] {status.module_id} recovered")
            await self._publish("watchdog.module.recovered", {
                "module_id": status.module_id,
            })

    async def _mark_unhealthy(self, status: ModuleStatus):
        """Module failed health check — increment counter, maybe restart."""
        status.state = UNHEALTHY
        status.fail_count += 1
        print(f"[watchdog] {status.module_id} unhealthy ({status.fail_count}/{self.FAIL_THRESHOLD}): {status.last_error}")

        await self._publish("watchdog.module.unhealthy", {
            "module_id": status.module_id,
            "fail_count": status.fail_count,
            "error": status.last_error,
        })

        # Restart if threshold reached and the restart budget is not exhausted
        if (status.fail_count >= self.FAIL_THRESHOLD
                and status.restarted_count < self.MAX_RESTARTS):
            await self._restart_module(status)

    # ── Restart via Launcher API ──

    async def _restart_module(self, status: ModuleStatus):
        """Restart a module via Launcher API."""
        mid = status.module_id
        print(f"[watchdog] Restarting {mid} (attempt {status.restarted_count + 1}/{self.MAX_RESTARTS})")
        try:
            async with httpx.AsyncClient() as client:
                resp = await client.post(
                    f"{self.launcher_url}/launcher/modules/{mid}/restart",
                    json={"reason": "resource_critical" if status.resource_state == CRITICAL else "restart"},
                    timeout=15,
                )
                if resp.status_code == 200:
                    status.restarted_count += 1
                    # Reset so the new process gets a fresh failure budget
                    status.fail_count = 0
                    print(f"[watchdog] {mid} restart requested")
                    if status.restarted_count >= self.ALERT_AFTER_RESTARTS:
                        await self._publish("watchdog.alert", {
                            "module_id": mid,
                            "restarted_count": status.restarted_count,
                            "message": f"{mid} has been restarted {status.restarted_count} times",
                        })
                else:
                    print(f"[watchdog] {mid} restart failed: HTTP {resp.status_code}")
        except Exception as e:
            print(f"[watchdog] {mid} restart error: {e}")

    # ── Resource monitoring ──

    def _collect_metrics(self, status: ModuleStatus) -> dict | None:
        """Collect CPU/memory metrics for a module via psutil.

        Returns None when the pid is unknown, psutil is not installed, or the
        process cannot be inspected (e.g. it exited between checks).
        """
        if not status.pid:
            return None
        if not self._psutil:
            try:
                import psutil
                self._psutil = psutil
            except ImportError:
                return None
        try:
            proc = self._psutil.Process(status.pid)
            return {
                "memory_rss": round(proc.memory_info().rss / 1024 / 1024, 1),  # MB
                "memory_percent": round(proc.memory_percent(), 1),
                "cpu_percent": round(proc.cpu_percent(interval=0.1), 1),  # blocks ~0.1s
                "thread_count": proc.num_threads(),
            }
        except Exception:
            return None

    async def _check_resources(self, status: ModuleStatus):
        """Collect metrics and evaluate resource state."""
        metrics = self._collect_metrics(status)
        if not metrics:
            return
        status.last_metrics = metrics

        # Track memory samples for leak detection (bounded window of 5)
        status.memory_samples.append(metrics["memory_rss"])
        if len(status.memory_samples) > 5:
            status.memory_samples.pop(0)

        # Determine raw level from thresholds
        mem_pct = metrics["memory_percent"]
        cpu_pct = metrics["cpu_percent"]
        if mem_pct > self.MEMORY_CRITICAL or cpu_pct > self.CPU_CRITICAL:
            raw_level = CRITICAL
        elif mem_pct > self.MEMORY_WARNING or cpu_pct > self.CPU_WARNING:
            raw_level = WARNING
        else:
            raw_level = NORMAL

        # Memory leak detection: 3+ consecutive rises, total >20%
        if len(status.memory_samples) >= 3:
            recent = status.memory_samples[-3:]
            if all(recent[i] < recent[i+1] for i in range(len(recent)-1)):
                if recent[-1] > recent[0] * 1.2:
                    raw_level = CRITICAL

        await self._update_resource_state(status, raw_level, metrics)

    async def _update_resource_state(self, status: ModuleStatus, level: str, metrics: dict):
        """Update resource state with consecutive-count logic.

        State only escalates after several consecutive samples at a level
        (debouncing), and only de-escalates after RECOVERY_OBSERVE seconds of
        sustained NORMAL readings.
        """
        now = time.time()
        old_state = status.resource_state

        if level == CRITICAL:
            status.resource_critical_count += 1
            status.resource_warning_count = 0
            status.recovery_since = 0
            # Enter CRITICAL after 3 consecutive critical samples
            if status.resource_critical_count >= 3 and old_state != CRITICAL:
                status.resource_state = CRITICAL
                status.critical_since = now
                await self._publish("watchdog.module.resource_critical", {
                    "module_id": status.module_id, "metrics": metrics,
                })
            elif old_state == CRITICAL:
                # Already critical — check timeout
                if now - status.critical_since >= self.CRITICAL_TIMEOUT:
                    await self._resource_restart(status)
        elif level == WARNING:
            status.resource_warning_count += 1
            status.resource_critical_count = 0
            status.recovery_since = 0
            # NOTE(review): this only transitions NORMAL→WARNING; a module in
            # CRITICAL that drops to warning-level stays CRITICAL — confirm intended.
            if status.resource_warning_count >= 2 and old_state == NORMAL:
                status.resource_state = WARNING
                await self._publish("watchdog.module.resource_warning", {
                    "module_id": status.module_id, "metrics": metrics,
                })
        else:  # NORMAL
            status.resource_warning_count = 0
            status.resource_critical_count = 0
            if old_state != NORMAL:
                if not status.recovery_since:
                    # Start the recovery observation window
                    status.recovery_since = now
                elif now - status.recovery_since >= self.RECOVERY_OBSERVE:
                    status.resource_state = NORMAL
                    status.recovery_since = 0
                    await self._publish("watchdog.module.resource_recovered", {
                        "module_id": status.module_id,
                    })

    async def _resource_restart(self, status: ModuleStatus):
        """Restart module due to resource critical timeout."""
        if status.restarted_count >= self.MAX_RESTARTS:
            # Budget exhausted — alert instead of restarting again
            await self._publish("watchdog.alert", {
                "module_id": status.module_id,
                "message": f"{status.module_id} exceeded max restarts (resource_critical)",
            })
            return
        print(f"[watchdog] Resource critical timeout for {status.module_id}, restarting...")
        # Reset resource tracking before the restart so the new process starts clean
        status.resource_state = NORMAL
        status.resource_critical_count = 0
        status.critical_since = 0
        await self._restart_module(status)

    # ── Event publishing ──

    async def _publish(self, event_type: str, data: dict):
        """Publish event via callback if available. Best-effort: errors are swallowed."""
        if not self.publish_event:
            return
        try:
            await self.publish_event({
                "event": event_type,
                "data": data,
            })
        except Exception:
            pass

    # ── Incoming event handler ──

    async def handle_event(self, msg: dict):
        """Handle events from Event Hub (module.started / module.stopped)."""
        event_type = msg.get("event", "")
        data = msg.get("data", {})
        module_id = data.get("module_id", "")

        # Ignore malformed events and our own lifecycle
        if not module_id or module_id == "watchdog":
            return

        if event_type == "module.started":
            print(f"[watchdog] Received module.started: {module_id}")
            # Trigger immediate discovery to pick up the new module
            await self.discover_modules()

        elif event_type == "module.stopped":
            print(f"[watchdog] Received module.stopped: {module_id}")
            # Remove from tracking — it's gone
            self.modules.pop(module_id, None)

    # ── Main loop ──

    async def run(self):
        """Main monitoring loop with dynamic intervals per resource state."""
        self._running = True
        print("[watchdog] Monitor started")
        last_discover = 0

        while self._running:
            now = time.time()
            # Rediscover every 30s
            if now - last_discover >= 30:
                await self.discover_modules()
                last_discover = now

            if self.modules:
                # Health and resource checks for all modules run concurrently;
                # return_exceptions keeps one failing check from killing the rest
                tasks = []
                for s in self.modules.values():
                    tasks.append(self._check_one(s))
                    tasks.append(self._check_resources(s))
                await asyncio.gather(*tasks, return_exceptions=True)

            # Sleep = shortest interval needed by any module
            interval = self._min_interval()
            await asyncio.sleep(interval)

    def _min_interval(self) -> float:
        """Return the shortest check interval needed across all modules."""
        if not self.modules:
            return self.INTERVALS[NORMAL]
        return min(self.INTERVALS.get(s.resource_state, 15) for s in self.modules.values())

    def stop(self):
        """Signal the run() loop to exit after its current iteration."""
        self._running = False

    # ── Stats ──

    def get_status(self) -> dict:
        """Return current health and resource status of all monitored modules."""
        return {
            mid: {
                "state": s.state,
                "fail_count": s.fail_count,
                "restarted_count": s.restarted_count,
                "last_check": s.last_check,
                "last_healthy": s.last_healthy,
                "last_error": s.last_error,
                "resource_state": s.resource_state,
                "metrics": s.last_metrics,
            }
            for mid, s in self.modules.items()
        }
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Watchdog HTTP server.
|
|
3
|
+
Exposes /health and /status endpoints. Runs the monitor loop on startup.
|
|
4
|
+
Connects to Event Hub via WebSocket for event publishing and subscription.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import json
|
|
9
|
+
import time
|
|
10
|
+
import uuid
|
|
11
|
+
|
|
12
|
+
import httpx
|
|
13
|
+
import websockets
|
|
14
|
+
from fastapi import FastAPI
|
|
15
|
+
|
|
16
|
+
from .monitor import HealthMonitor
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class WatchdogServer:
    """HTTP facade for the watchdog: /health and /status endpoints.

    On FastAPI startup it launches the monitor loop, a Registry heartbeat
    loop, and (when configured) an Event Hub WebSocket client used both to
    publish watchdog events and to receive module lifecycle events.
    """

    def __init__(self, monitor: HealthMonitor, token: str = "",
                 event_hub_ws: str = ""):
        self.monitor = monitor
        self.token = token  # auth token appended to the Event Hub WS URL
        self.event_hub_ws = event_hub_ws  # WS URL; empty disables the Event Hub client
        self._monitor_task: asyncio.Task | None = None
        self._ws_task: asyncio.Task | None = None
        self._heartbeat_task: asyncio.Task | None = None
        self._ws: object | None = None  # live websocket connection, None when disconnected
        self._start_time = time.time()
        self.app = self._create_app()

        # Wire up publish callback on monitor
        self.monitor.publish_event = self._publish_event

    def _create_app(self) -> FastAPI:
        """Build the FastAPI app with lifecycle hooks and the two routes."""
        app = FastAPI(title="Kite Watchdog", docs_url=None, redoc_url=None)
        server = self  # capture for the nested handlers

        @app.on_event("startup")
        async def _startup():
            # Background tasks live for the lifetime of the app
            server._monitor_task = asyncio.create_task(server.monitor.run())
            server._heartbeat_task = asyncio.create_task(server._heartbeat_loop())
            if server.event_hub_ws:
                server._ws_task = asyncio.create_task(server._ws_loop())
                server._test_task = asyncio.create_task(server._test_event_loop())

        @app.on_event("shutdown")
        async def _shutdown():
            server.monitor.stop()
            if server._monitor_task:
                server._monitor_task.cancel()
            if server._heartbeat_task:
                server._heartbeat_task.cancel()
            if server._ws_task:
                server._ws_task.cancel()
            # _test_task is only set when event_hub_ws was configured
            if hasattr(server, '_test_task') and server._test_task:
                server._test_task.cancel()
            if server._ws:
                await server._ws.close()

        @app.get("/health")
        async def health():
            # Always reports healthy; details give observability into the watchdog itself
            return {
                "status": "healthy",
                "details": {
                    "monitored_modules": len(server.monitor.modules),
                    "event_hub_connected": server._ws is not None,
                    "uptime_seconds": round(time.time() - server._start_time),
                },
            }

        @app.get("/status")
        async def status():
            return server.monitor.get_status()

        return app

    # ── Event Hub WebSocket client ──

    async def _ws_loop(self):
        """Connect to Event Hub, subscribe, and listen. Reconnect on failure."""
        while True:
            try:
                await self._ws_connect()
            except asyncio.CancelledError:
                return
            except Exception as e:
                print(f"[watchdog] Event Hub connection error: {e}")
            # Clear the handle so _publish_event stops sending while disconnected
            self._ws = None
            await asyncio.sleep(5)  # reconnect delay

    async def _ws_connect(self):
        """Single WebSocket session: connect, subscribe, receive loop."""
        url = f"{self.event_hub_ws}?token={self.token}"
        async with websockets.connect(url) as ws:
            self._ws = ws
            print("[watchdog] Connected to Event Hub")

            # Subscribe to module lifecycle events
            await ws.send(json.dumps({
                "type": "subscribe",
                "events": ["module.started", "module.stopped"],
            }))

            # Receive loop — exits when the connection closes
            async for raw in ws:
                try:
                    msg = json.loads(raw)
                except (json.JSONDecodeError, TypeError):
                    continue  # ignore non-JSON frames

                msg_type = msg.get("type", "")
                if msg_type == "event":
                    await self.monitor.handle_event(msg)
                elif msg_type == "ack":
                    pass  # publish confirmed
                elif msg_type == "error":
                    print(f"[watchdog] Event Hub error: {msg.get('message')}")

    async def _publish_event(self, event: dict):
        """Publish an event to Event Hub via WebSocket.

        Silently drops the event when not connected; injected into the
        monitor as its publish callback.
        """
        if not self._ws:
            return
        from datetime import datetime, timezone
        msg = {
            "type": "event",
            "event_id": str(uuid.uuid4()),
            "event": event.get("event", ""),
            "source": "watchdog",
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "data": event.get("data", {}),
        }
        try:
            await self._ws.send(json.dumps(msg))
        except Exception as e:
            print(f"[watchdog] Failed to publish event: {e}")

    # ── Heartbeat to Registry ──

    async def _heartbeat_loop(self):
        """Send heartbeat to Registry every 30 seconds. Failures are ignored."""
        while True:
            await asyncio.sleep(30)
            try:
                async with httpx.AsyncClient() as client:
                    await client.post(
                        f"{self.monitor.registry_url}/modules",
                        json={"action": "heartbeat", "module_id": "watchdog"},
                        headers={"Authorization": f"Bearer {self.monitor.own_token}"},
                        timeout=5,
                    )
            except Exception:
                pass

    async def _test_event_loop(self):
        """Publish a test event every 5 seconds.

        NOTE(review): this looks like debug/diagnostic scaffolding that is
        enabled unconditionally whenever the Event Hub is configured —
        confirm whether it should ship in production builds.
        """
        from datetime import datetime, timezone
        while True:
            await asyncio.sleep(5)
            await self._publish_event({
                "event": "watchdog.test",
                "data": {
                    "message": "heartbeat from watchdog",
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                },
            })
|
package/main.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import secrets
|
|
2
|
+
import sys
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
6
|
+
|
|
7
|
+
from core.launcher.entry import Launcher
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main():
    """Entry point: generate a fresh session token and run the Launcher."""
    session_token = secrets.token_hex(32)
    print(f"[main] KITE_TOKEN generated ({len(session_token)} chars)")
    launcher = Launcher(kite_token=session_token)
    launcher.run()
|
package/package.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@agentunion/kite",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Kite framework launcher — start Kite from anywhere",
|
|
5
|
+
"bin": {
|
|
6
|
+
"kite": "./cli.js"
|
|
7
|
+
},
|
|
8
|
+
"files": [
|
|
9
|
+
"cli.js",
|
|
10
|
+
"main.py",
|
|
11
|
+
"__main__.py",
|
|
12
|
+
"__init__.py",
|
|
13
|
+
"core/**",
|
|
14
|
+
"extensions/**"
|
|
15
|
+
],
|
|
16
|
+
"engines": {
|
|
17
|
+
"node": ">=16"
|
|
18
|
+
},
|
|
19
|
+
"os": ["win32", "linux", "darwin"],
|
|
20
|
+
"keywords": ["kite", "framework", "launcher"],
|
|
21
|
+
"license": "MIT",
|
|
22
|
+
"author": "agentcp",
|
|
23
|
+
"repository": {
|
|
24
|
+
"type": "git",
|
|
25
|
+
"url": "https://github.com/agentcp/kite.git"
|
|
26
|
+
}
|
|
27
|
+
}
|