halyn 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- halyn/__init__.py +7 -0
- halyn/__main__.py +4 -0
- halyn/audit.py +278 -0
- halyn/auth.py +88 -0
- halyn/autonomy.py +262 -0
- halyn/cli.py +208 -0
- halyn/config.py +135 -0
- halyn/consent.py +243 -0
- halyn/control_plane.py +354 -0
- halyn/discovery.py +323 -0
- halyn/drivers/__init__.py +0 -0
- halyn/drivers/browser.py +60 -0
- halyn/drivers/dds.py +156 -0
- halyn/drivers/docker.py +62 -0
- halyn/drivers/http_auto.py +259 -0
- halyn/drivers/mqtt.py +93 -0
- halyn/drivers/opcua.py +77 -0
- halyn/drivers/ros2.py +124 -0
- halyn/drivers/serial.py +226 -0
- halyn/drivers/socket_raw.py +153 -0
- halyn/drivers/ssh.py +131 -0
- halyn/drivers/unitree.py +103 -0
- halyn/drivers/websocket.py +175 -0
- halyn/engine.py +222 -0
- halyn/intent.py +240 -0
- halyn/llm.py +178 -0
- halyn/mcp.py +239 -0
- halyn/memory/__init__.py +0 -0
- halyn/memory/store.py +200 -0
- halyn/nrp_bridge.py +213 -0
- halyn/py.typed +0 -0
- halyn/sanitizer.py +120 -0
- halyn/server.py +292 -0
- halyn/types.py +116 -0
- halyn/watchdog.py +252 -0
- halyn-0.2.0.dist-info/METADATA +246 -0
- halyn-0.2.0.dist-info/RECORD +41 -0
- halyn-0.2.0.dist-info/WHEEL +5 -0
- halyn-0.2.0.dist-info/entry_points.txt +2 -0
- halyn-0.2.0.dist-info/licenses/LICENSE +15 -0
- halyn-0.2.0.dist-info/top_level.txt +1 -0
halyn/server.py
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
# Copyright (c) 2026 Elmadani SALKA. All rights reserved.
|
|
2
|
+
# Licensed under the MIT License. See LICENSE file.
|
|
3
|
+
"""
|
|
4
|
+
HTTP Server — REST API + MCP + SSE events.
|
|
5
|
+
|
|
6
|
+
Endpoints:
|
|
7
|
+
GET /health System status
|
|
8
|
+
GET /nodes Connected nodes and manifests
|
|
9
|
+
POST /execute Execute action through pipeline
|
|
10
|
+
POST /emergency-stop Stop all nodes immediately
|
|
11
|
+
POST /resume Resume after emergency stop
|
|
12
|
+
GET /events SSE stream of all events
|
|
13
|
+
GET /events/query Query recent events
|
|
14
|
+
GET /audit Query audit trail
|
|
15
|
+
GET /audit/verify Verify hash chain integrity
|
|
16
|
+
POST /consent/approve Approve a pending node
|
|
17
|
+
POST /consent/deny Deny a pending node
|
|
18
|
+
GET /consent/pending List pending consent requests
|
|
19
|
+
POST /confirm/approve Approve a pending action
|
|
20
|
+
POST /confirm/deny Deny a pending action
|
|
21
|
+
GET /confirm/pending List pending confirmations
|
|
22
|
+
GET /intents Query intent chains
|
|
23
|
+
GET /scan Trigger network discovery
|
|
24
|
+
GET /mcp MCP endpoint (Claude.ai native; not registered by create_app in this module)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import json
|
|
30
|
+
import logging
|
|
31
|
+
import time
|
|
32
|
+
from typing import Any
|
|
33
|
+
|
|
34
|
+
log = logging.getLogger("halyn.server")
|
|
35
|
+
|
|
36
|
+
try:
|
|
37
|
+
from aiohttp import web
|
|
38
|
+
HAS_AIOHTTP = True
|
|
39
|
+
except ImportError:
|
|
40
|
+
HAS_AIOHTTP = False
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _json(data: Any, status: int = 200) -> "web.Response":
    """Wrap *data* in a JSON ``web.Response`` carrying the given HTTP status.

    Objects the JSON encoder cannot serialize natively are converted with
    ``str``; non-ASCII characters are written as-is instead of being escaped.
    """
    payload = json.dumps(data, default=str, ensure_ascii=False)
    return web.Response(text=payload, content_type="application/json", status=status)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def create_app(control_plane: Any, api_key: str = "") -> "web.Application":
    """Build the aiohttp application that exposes the control plane over HTTP.

    Args:
        control_plane: Object the handlers delegate to. The handlers call its
            ``status()``, ``execute()``, ``emergency_stop()``, ``resume()`` and
            ``scan()`` methods and read its ``event_bus``, ``audit``,
            ``consent``, ``autonomy``, ``intents`` and ``_manifests``
            attributes.
        api_key: Shared secret. When non-empty, every path except ``/health``
            must present it either in the ``Authorization`` header (with or
            without a ``Bearer `` prefix) or as the ``key`` query parameter.
            When empty, no authentication is performed.

    Returns:
        The configured ``web.Application`` with every route registered.

    Raises:
        ImportError: If aiohttp is not installed.
    """
    if not HAS_AIOHTTP:
        raise ImportError("aiohttp required: pip install aiohttp")

    # Function-scope imports: neither module is imported at the top of the file.
    import asyncio
    import hmac

    app = web.Application()
    cp = control_plane
    # Encoded once so every request compares bytes against bytes.
    expected_key = api_key.encode("utf-8", "surrogateescape")

    # ─── Request helpers ────────────────────────

    def bad_request(message: str) -> web.Response:
        """Return a 400 response carrying *message* in the ``error`` field."""
        return _json({"error": message}, 400)

    async def read_json_object(req: web.Request) -> dict | None:
        """Return the request body as a dict, or None if it is not a JSON object."""
        try:
            body = await req.json()
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return None
        return body if isinstance(body, dict) else None

    def query_int(req: web.Request, name: str, default: int) -> int | None:
        """Return query parameter *name* as an int, *default* if absent, None if invalid."""
        raw = req.query.get(name)
        if raw is None:
            return default
        try:
            return int(raw)
        except ValueError:
            return None

    # Auth middleware
    @web.middleware
    async def auth_middleware(request: web.Request, handler):
        if api_key and request.path not in ("/health",):
            header = request.headers.get("Authorization", "")
            # Strip the scheme only when it is a prefix; a bare key is accepted too.
            key = header[len("Bearer "):] if header.startswith("Bearer ") else header
            if not key:
                key = request.query.get("key", "")
            # Constant-time comparison so response timing does not leak the key.
            supplied = key.encode("utf-8", "surrogateescape")
            if not hmac.compare_digest(supplied, expected_key):
                return _json({"error": "unauthorized"}, 401)
        return await handler(request)

    app.middlewares.append(auth_middleware)

    # ─── Health ─────────────────────────────────

    async def handle_health(req: web.Request) -> web.Response:
        return _json(cp.status())

    # ─── Nodes ──────────────────────────────────

    async def handle_nodes(req: web.Request) -> web.Response:
        nodes = {nrp_id: manifest.to_dict() for nrp_id, manifest in cp._manifests.items()}
        return _json({"nodes": nodes, "count": len(nodes)})

    # ─── Execute ────────────────────────────────

    async def handle_execute(req: web.Request) -> web.Response:
        body = await read_json_object(req)
        if body is None:
            return bad_request("request body must be a JSON object")
        tool = body.get("tool", "")
        args = body.get("args", {})
        user_id = body.get("user_id", req.headers.get("X-User-Id", ""))
        llm_model = body.get("llm_model", "")
        intent = body.get("intent", "")

        if not tool:
            return _json({"error": "missing 'tool' field"}, 400)

        result = await cp.execute(tool, args, user_id, llm_model, intent)
        return _json({
            "ok": result.ok,
            "data": result.data,
            "error": result.error,
            "status": result.status.value,
        })

    # ─── Emergency ──────────────────────────────

    async def handle_emergency_stop(req: web.Request) -> web.Response:
        await cp.emergency_stop()
        return _json({"status": "emergency_stop_activated"})

    async def handle_resume(req: web.Request) -> web.Response:
        await cp.resume()
        return _json({"status": "resumed"})

    # ─── Events ─────────────────────────────────

    async def handle_events_sse(req: web.Request) -> web.StreamResponse:
        resp = web.StreamResponse()
        resp.content_type = "text/event-stream"
        resp.headers["Cache-Control"] = "no-cache"
        resp.headers["X-Accel-Buffering"] = "no"
        await resp.prepare(req)

        queue: asyncio.Queue[str] = asyncio.Queue(maxsize=1000)

        async def forward(event):
            # Drop events when the client is not draining the queue fast enough.
            try:
                queue.put_nowait(event.to_json())
            except asyncio.QueueFull:
                pass

        cp.event_bus.subscribe("*", forward)
        try:
            while True:
                try:
                    data = await asyncio.wait_for(queue.get(), timeout=30.0)
                    await resp.write(f"data: {data}\n\n".encode())
                except asyncio.TimeoutError:
                    # Nothing to send for 30 s: emit an SSE comment as keepalive.
                    await resp.write(b": keepalive\n\n")
        except (ConnectionResetError, asyncio.CancelledError):
            pass
        finally:
            cp.event_bus.unsubscribe("*", forward)
        return resp

    async def handle_events_query(req: web.Request) -> web.Response:
        n = query_int(req, "n", 50)
        if n is None:
            return bad_request("'n' must be an integer")
        source = req.query.get("source", "")
        name = req.query.get("name", "")
        events = cp.event_bus.recent(n, source=source, name=name)
        return _json({
            "events": [e.to_dict() for e in events],
            "total": cp.event_bus.total,
            "pending": cp.event_bus.pending,
        })

    # ─── Audit ──────────────────────────────────

    async def handle_audit(req: web.Request) -> web.Response:
        limit = query_int(req, "limit", 50)
        if limit is None:
            return bad_request("'limit' must be an integer")
        tool = req.query.get("tool", "")
        node = req.query.get("node", "")
        entries = cp.audit.query(tool=tool, node=node, limit=limit)
        return _json({
            "entries": [e.to_dict() for e in entries],
            "count": cp.audit.count,
            "chain_tip": cp.audit.chain_tip[:16],
        })

    async def handle_audit_verify(req: web.Request) -> web.Response:
        valid, count, msg = cp.audit.verify_chain()
        return _json({"valid": valid, "entries_checked": count, "message": msg})

    # ─── Consent ────────────────────────────────

    async def handle_consent_pending(req: web.Request) -> web.Response:
        from .consent import ConsentLevel
        pending = cp.consent.list_all(level=ConsentLevel.PENDING)
        return _json({"pending": [r.to_dict() for r in pending], "count": len(pending)})

    async def handle_consent_approve(req: web.Request) -> web.Response:
        body = await read_json_object(req)
        if body is None:
            return bad_request("request body must be a JSON object")
        nrp_id = body.get("nrp_id", "")
        level = body.get("level", "full")
        if not isinstance(level, str):
            return bad_request("'level' must be a string")
        try:
            duration = float(body.get("duration_hours", 0))
        except (TypeError, ValueError):
            return bad_request("'duration_hours' must be a number")
        from .consent import ConsentLevel
        lvl_map = {"full": ConsentLevel.FULL, "read_only": ConsentLevel.READ_ONLY,
                   "temporary": ConsentLevel.TEMPORARY}
        # NOTE(review): an unrecognized level falls back to FULL (kept for
        # backward compatibility) — consider rejecting unknown values instead.
        record = cp.consent.grant(
            nrp_id, lvl_map.get(level, ConsentLevel.FULL),
            granted_by=body.get("user_id", "api"),
            duration_hours=duration,
        )
        return _json(record.to_dict())

    async def handle_consent_deny(req: web.Request) -> web.Response:
        body = await read_json_object(req)
        if body is None:
            return bad_request("request body must be a JSON object")
        nrp_id = body.get("nrp_id", "")
        cp.consent.revoke(nrp_id, reason=body.get("reason", "denied via API"))
        return _json({"denied": nrp_id})

    # ─── Confirmations ──────────────────────────

    async def handle_confirm_pending(req: web.Request) -> web.Response:
        pending = cp.autonomy.get_pending()
        return _json({
            "pending": [{
                "request_id": r.request_id,
                "tool": r.action.tool,
                "args": r.action.args,
                "reason": r.reason,
                "domain": r.domain,
                "created_at": r.created_at,
                "expires_at": r.expires_at,
            } for r in pending],
            "count": len(pending),
        })

    async def handle_confirm_approve(req: web.Request) -> web.Response:
        body = await read_json_object(req)
        if body is None:
            return bad_request("request body must be a JSON object")
        req_id = body.get("request_id", "")
        ok = cp.autonomy.approve(req_id)
        if ok:
            req_obj = cp.autonomy.get_request(req_id)
            if req_obj:
                # Approval immediately runs the action that was waiting on it.
                result = await cp.execute(
                    req_obj.action.tool, req_obj.action.args,
                    intent_text=f"approved: {req_obj.reason}",
                )
                return _json({"approved": True, "result": {
                    "ok": result.ok, "data": result.data, "error": result.error,
                }})
        return _json({"approved": ok})

    async def handle_confirm_deny(req: web.Request) -> web.Response:
        body = await read_json_object(req)
        if body is None:
            return bad_request("request body must be a JSON object")
        req_id = body.get("request_id", "")
        ok = cp.autonomy.deny(req_id)
        return _json({"denied": ok})

    # ─── Intents ────────────────────────────────

    async def handle_intents(req: web.Request) -> web.Response:
        limit = query_int(req, "limit", 20)
        if limit is None:
            return bad_request("'limit' must be an integer")
        node = req.query.get("node", "")
        chains = cp.intents.query(node=node, limit=limit)
        return _json({"chains": [c.to_dict() for c in chains], "count": len(chains)})

    # ─── Discovery ──────────────────────────────

    async def handle_scan(req: web.Request) -> web.Response:
        config = {}
        if req.query.get("subnet"):
            config["subnets"] = [req.query["subnet"]]
        if req.query.get("ssh"):
            config["ssh_hosts"] = req.query["ssh"].split(",")
        if req.query.get("http"):
            config["http_urls"] = req.query["http"].split(",")
        # An empty config is passed as None.
        nodes = await cp.scan(config or None)
        return _json({
            "discovered": [{
                "address": n.address, "port": n.port,
                "protocol": n.protocol, "name": n.name,
                "suggested_nrp_id": n.suggested_nrp_id,
                "metadata": n.metadata,
            } for n in nodes],
            "count": len(nodes),
        })

    # ─── Register routes ────────────────────────

    app.router.add_get("/health", handle_health)
    app.router.add_get("/nodes", handle_nodes)
    app.router.add_post("/execute", handle_execute)
    app.router.add_post("/emergency-stop", handle_emergency_stop)
    app.router.add_post("/resume", handle_resume)
    app.router.add_get("/events", handle_events_sse)
    app.router.add_get("/events/query", handle_events_query)
    app.router.add_get("/audit", handle_audit)
    app.router.add_get("/audit/verify", handle_audit_verify)
    app.router.add_get("/consent/pending", handle_consent_pending)
    app.router.add_post("/consent/approve", handle_consent_approve)
    app.router.add_post("/consent/deny", handle_consent_deny)
    app.router.add_get("/confirm/pending", handle_confirm_pending)
    app.router.add_post("/confirm/approve", handle_confirm_approve)
    app.router.add_post("/confirm/deny", handle_confirm_deny)
    app.router.add_get("/intents", handle_intents)
    app.router.add_get("/scan", handle_scan)

    log.info("server.routes registered=%d", len(app.router.routes()))
    return app
|
halyn/types.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# Copyright (c) 2026 Elmadani SALKA. All rights reserved.
|
|
2
|
+
# Licensed under the MIT License. See LICENSE file.
|
|
3
|
+
"""
|
|
4
|
+
Core types. Every object in the system is defined here.
|
|
5
|
+
No business logic. Just shapes.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import time
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from enum import Enum
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ToolCategory(str, Enum):
    """Broad classification of a tool by what it is for."""

    # Does things (shell, write, deploy)
    EXECUTOR = "executor"
    # Sees things (metrics, logs, status)
    OBSERVER = "observer"
    # Remembers things
    MEMORY = "memory"
    # Communicates things
    VOICE = "voice"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class NodeKind(str, Enum):
    """Transport used to reach a machine."""

    LOCAL = "local"
    SSH = "ssh"
    ADB = "adb"
    DOCKER = "docker"
    # The member name is spelled out, but the stored value is the short form.
    KUBERNETES = "k8s"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ActionStatus(str, Enum):
    """How an action ended."""

    OK = "ok"
    # Shield blocked it
    DENIED = "denied"
    # Execution error
    FAILED = "failed"
    TIMEOUT = "timeout"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass(frozen=True, slots=True)
|
|
42
|
+
class ToolSpec:
|
|
43
|
+
"""Definition of a single tool."""
|
|
44
|
+
name: str
|
|
45
|
+
category: ToolCategory
|
|
46
|
+
description: str
|
|
47
|
+
dangerous: bool = False
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass(slots=True)
|
|
51
|
+
class Node:
|
|
52
|
+
"""A connected machine."""
|
|
53
|
+
name: str
|
|
54
|
+
kind: NodeKind
|
|
55
|
+
host: str = "localhost"
|
|
56
|
+
user: str = ""
|
|
57
|
+
port: int = 22
|
|
58
|
+
key_path: str = ""
|
|
59
|
+
alive: bool = False
|
|
60
|
+
last_seen: float = 0.0
|
|
61
|
+
labels: dict[str, str] = field(default_factory=dict)
|
|
62
|
+
|
|
63
|
+
@property
|
|
64
|
+
def ssh_target(self) -> str:
|
|
65
|
+
if self.user:
|
|
66
|
+
return f"{self.user}@{self.host}"
|
|
67
|
+
return self.host
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass(frozen=True, slots=True)
|
|
71
|
+
class Action:
|
|
72
|
+
"""A request to do something."""
|
|
73
|
+
tool: str
|
|
74
|
+
args: dict[str, Any] = field(default_factory=dict)
|
|
75
|
+
node: str = "local"
|
|
76
|
+
request_id: str = ""
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass(slots=True)
class Result:
    """Outcome of an action: status plus optional data, error text and timing."""

    status: ActionStatus
    data: Any = None
    error: str = ""
    elapsed_ms: float = 0.0
    node: str = "local"
    tool: str = ""

    @property
    def ok(self) -> bool:
        """Whether ``status`` equals ``ActionStatus.OK``."""
        succeeded = self.status == ActionStatus.OK
        return succeeded
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass(slots=True)
class AuditEntry:
    """One line in the audit trail.

    Intended to be treated as immutable once created. NOTE(review): the
    dataclass is not declared ``frozen``, so nothing here enforces that.
    """
    # Defaults to the wall-clock time at construction.
    timestamp: float = field(default_factory=time.time)
    tool: str = ""
    node: str = "local"
    status: str = "ok"
    elapsed_ms: float = 0.0
    user: str = "default"
    error: str = ""
    # presumably the entry_hash of the preceding entry — confirm against the audit module
    prev_hash: str = ""
    entry_hash: str = ""
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@dataclass(slots=True)
|
|
109
|
+
class PolicyRule:
|
|
110
|
+
"""One RBAC rule."""
|
|
111
|
+
role: str
|
|
112
|
+
allow: list[str] = field(default_factory=list) # tool patterns
|
|
113
|
+
deny: list[str] = field(default_factory=list) # tool patterns
|
|
114
|
+
confirm: list[str] = field(default_factory=list) # need human OK
|
|
115
|
+
nodes: list[str] = field(default_factory=lambda: ["*"]) # node patterns
|
|
116
|
+
|
halyn/watchdog.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
# Copyright (c) 2026 Elmadani SALKA. All rights reserved.
|
|
2
|
+
# Licensed under the MIT License. See LICENSE file.
|
|
3
|
+
"""
|
|
4
|
+
Watchdog — Component health monitor with failsafe.
|
|
5
|
+
|
|
6
|
+
Periodic health checks on all registered components.
|
|
7
|
+
Escalates failures to alert handlers. Triggers failsafe
|
|
8
|
+
when critical components are unresponsive.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import json
|
|
15
|
+
import logging
|
|
16
|
+
import os
|
|
17
|
+
import time
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
from enum import Enum
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any, Callable, Awaitable
|
|
22
|
+
|
|
23
|
+
log = logging.getLogger("halyn.watchdog")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Health(str, Enum):
    """Health level of a monitored component."""

    # Everything nominal
    GREEN = "green"
    # Degraded but functional
    YELLOW = "yellow"
    # Critical — action needed
    RED = "red"
    # Unresponsive
    DEAD = "dead"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(slots=True)
class ComponentStatus:
    """Latest known health of one monitored component."""

    name: str
    health: Health = Health.GREEN
    # Wall-clock timestamp; 0.0 is treated as "never" by the properties below.
    last_check: float = 0.0
    last_ok: float = 0.0
    message: str = ""
    checks_failed: int = 0
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def age(self) -> float:
        """Seconds since ``last_check``, or infinity when it is unset."""
        if not self.last_check:
            return float("inf")
        return time.time() - self.last_check

    @property
    def downtime(self) -> float:
        """Seconds since ``last_ok``; 0.0 while GREEN, infinity if ``last_ok`` is unset."""
        if self.health == Health.GREEN:
            return 0.0
        if not self.last_ok:
            return float("inf")
        return time.time() - self.last_ok
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# Alert callback taking (str, str, dict) positionally; may return None or an
# awaitable. Watchdog._escalate passes (severity, message, alert_data).
AlertHandler = Callable[[str, str, dict[str, Any]], Awaitable[None] | None]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class Watchdog:
    """
    Monitors all Halyn components and NRP nodes.

    Runs periodic health checks. Escalates problems.
    Sends alerts to humans — not to the AI.
    The human is always the last line of defense.

    A component is escalated to the alert handlers once it has failed three
    consecutive checks. The failsafe handlers run once it is RED or DEAD and
    has failed five.
    """

    __slots__ = (
        "_components", "_checks", "_alert_handlers",
        "_interval", "_running", "_failsafe_handlers",
        "_heartbeat_file",
    )

    def __init__(self, interval: float = 10.0, heartbeat_file: str = "") -> None:
        """Create an idle watchdog.

        Args:
            interval: Seconds to sleep between check rounds in ``run()``.
            heartbeat_file: Path the heartbeat JSON is written to; an empty
                string selects ``/tmp/halyn.heartbeat``.
        """
        self._components: dict[str, ComponentStatus] = {}
        self._checks: dict[str, Callable[[], Awaitable[Health]]] = {}
        self._alert_handlers: list[AlertHandler] = []
        self._failsafe_handlers: list[Callable[[], Awaitable[None] | None]] = []
        self._interval = interval
        self._running = False
        self._heartbeat_file = heartbeat_file or "/tmp/halyn.heartbeat"

    def register(self, name: str, check: Callable[[], Awaitable[Health]]) -> None:
        """Register (or replace) the health check for component *name*.

        The component's status is reset to a fresh ``ComponentStatus``. *check*
        may be a coroutine function or a plain callable returning a ``Health``.
        """
        self._components[name] = ComponentStatus(name=name)
        self._checks[name] = check
        log.debug("watchdog.register component=%s", name)

    def on_alert(self, handler: AlertHandler) -> None:
        """Add a handler called with (severity, message, data) on escalation."""
        self._alert_handlers.append(handler)

    def on_failsafe(self, handler: Callable[[], Awaitable[None] | None]) -> None:
        """Add a handler called with no arguments when the failsafe triggers."""
        self._failsafe_handlers.append(handler)

    @staticmethod
    async def _resolve(value: Any) -> Any:
        """Await *value* if it is a coroutine or future; otherwise return it as-is."""
        if asyncio.iscoroutine(value) or asyncio.isfuture(value):
            return await value
        return value

    async def check_all(self) -> dict[str, ComponentStatus]:
        """Run every registered check once and update the component statuses.

        A check that raises, or returns something that is not a ``Health``
        value, marks its component RED. Components reaching three consecutive
        failures are escalated.

        Returns:
            A shallow copy of the name -> status mapping.
        """
        # Iterate a snapshot: register() may be called while a check is awaited.
        for name, check_fn in list(self._checks.items()):
            status = self._components[name]
            status.last_check = time.time()
            try:
                # Accept both sync and async checks; Health(...) rejects any
                # result that is not a valid health value.
                health = Health(await self._resolve(check_fn()))
                message = "OK" if health == Health.GREEN else f"check reported {health.value}"
            except Exception as exc:
                health = Health.RED
                message = str(exc)[:200]
                log.error("watchdog.check_failed component=%s error=%s", name, exc)

            status.health = health
            status.message = message
            if health == Health.GREEN:
                status.last_ok = time.time()
                status.checks_failed = 0
                continue

            status.checks_failed += 1
            if status.checks_failed >= 3:
                await self._escalate(name, status)
        return dict(self._components)

    async def run(self) -> None:
        """Run check rounds every ``interval`` seconds until stopped or cancelled.

        After each successful round the heartbeat file is rewritten. An
        unexpected error in a round is logged and the loop continues.
        """
        self._running = True
        log.info("watchdog.started interval=%.1fs components=%d",
                 self._interval, len(self._checks))
        try:
            while self._running:
                try:
                    await self.check_all()
                    self._write_heartbeat()
                    await asyncio.sleep(self._interval)
                except asyncio.CancelledError:
                    break
                except Exception as exc:
                    log.exception("watchdog.loop_error: %s", exc)
                    await asyncio.sleep(self._interval)
        finally:
            # Keep the flag truthful however the loop ended.
            self._running = False
        log.info("watchdog.stopped")

    def stop(self) -> None:
        """Ask ``run()`` to exit at the next evaluation of its loop condition."""
        self._running = False

    @property
    def overall_health(self) -> Health:
        """Worst health among all components (GREEN when there are none)."""
        healths = {c.health for c in self._components.values()}
        for level in (Health.DEAD, Health.RED, Health.YELLOW):
            if level in healths:
                return level
        return Health.GREEN

    def status_report(self) -> dict[str, Any]:
        """Return a dict of overall and per-component health plus a timestamp."""
        return {
            "overall": self.overall_health.value,
            "components": {
                name: {
                    "health": c.health.value,
                    "age_seconds": round(c.age, 1),
                    "downtime_seconds": round(c.downtime, 1),
                    "checks_failed": c.checks_failed,
                    "message": c.message,
                }
                for name, c in self._components.items()
            },
            "timestamp": time.time(),
        }

    async def _escalate(self, name: str, status: ComponentStatus) -> None:
        """Notify every alert handler about *name* and trigger the failsafe if due.

        A failing handler is logged and does not stop the remaining handlers.
        """
        # DEAD is at least as severe as RED; both also gate the failsafe below.
        severity = "critical" if status.health in (Health.RED, Health.DEAD) else "warning"
        alert_data = {
            "component": name,
            "health": status.health.value,
            "checks_failed": status.checks_failed,
            "downtime_seconds": round(status.downtime, 1),
            "message": status.message,
        }
        log.warning("watchdog.alert component=%s severity=%s failed=%d",
                    name, severity, status.checks_failed)

        for handler in self._alert_handlers:
            try:
                await self._resolve(
                    handler(severity, f"{name} is {status.health.value}", alert_data)
                )
            except Exception as exc:
                log.error("watchdog.alert_handler_error: %s", exc)

        if status.health in (Health.RED, Health.DEAD) and status.checks_failed >= 5:
            await self._trigger_failsafe(name)

    async def _trigger_failsafe(self, trigger: str) -> None:
        """Run every failsafe handler; a failing handler is logged and skipped."""
        log.critical("watchdog.FAILSAFE trigger=%s — activating safe mode", trigger)
        for handler in self._failsafe_handlers:
            try:
                await self._resolve(handler())
            except Exception as exc:
                log.error("watchdog.failsafe_error: %s", exc)

    def _write_heartbeat(self) -> None:
        """Write liveness, overall health, timestamp and PID to the heartbeat file.

        Best effort: an ``OSError`` is logged at debug level and otherwise ignored.
        """
        payload = json.dumps({
            "alive": True,
            "health": self.overall_health.value,
            "timestamp": time.time(),
            "pid": os.getpid(),
        })
        try:
            Path(self._heartbeat_file).write_text(payload)
        except OSError as exc:
            log.debug("watchdog.heartbeat_write_failed path=%s error=%s",
                      self._heartbeat_file, exc)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
# ─── Built-in health checks ────────────────────────
|
|
214
|
+
|
|
215
|
+
async def check_event_bus(bus: Any) -> Health:
    """Grade the event bus by its ``pending`` count.

    Returns RED above 10000 pending, YELLOW above 1000, GREEN otherwise.
    """
    if bus.pending > 10000:
        return Health.RED
    return Health.YELLOW if bus.pending > 1000 else Health.GREEN
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
async def check_memory_store(store: Any) -> Health:
    """Probe the memory store with a one-result search.

    Returns GREEN when ``store.search`` completes and RED when it raises.
    The search result itself is ignored.
    """
    try:
        store.search("__healthcheck__", limit=1)
    except Exception:
        return Health.RED
    else:
        return Health.GREEN
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
async def check_disk_space(path: str = "/", threshold: float = 0.90) -> Health:
    """Grade disk usage of the filesystem containing *path*.

    Args:
        path: Any path on the filesystem to inspect.
        threshold: Used fraction above which the result is YELLOW.

    Returns:
        RED when more than 95% is used, YELLOW when more than *threshold* is
        used or usage cannot be determined, GREEN otherwise.
    """
    # Function-scope import: the module does not import shutil at top level.
    import shutil

    try:
        # shutil.disk_usage is documented for Unix and Windows; os.statvfs is
        # Unix-only, so elsewhere this check could only ever report YELLOW.
        usage = shutil.disk_usage(path)
        used = 1.0 - (usage.free / usage.total)
    except (OSError, ValueError, TypeError, ZeroDivisionError):
        # Unknown usage is reported as degraded rather than critical.
        return Health.YELLOW

    if used > 0.95:
        return Health.RED
    if used > threshold:
        return Health.YELLOW
    return Health.GREEN
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
async def check_driver_heartbeat(driver: Any) -> Health:
    """Grade a driver by awaiting ``driver.heartbeat()`` with a 10 second limit.

    Returns GREEN when the reply's ``"alive"`` entry is truthy. A falsy
    entry, a timeout, or any other error yields RED.
    """
    try:
        reply = await asyncio.wait_for(driver.heartbeat(), timeout=10.0)
        alive = reply.get("alive")
    except Exception:
        # asyncio.TimeoutError is an Exception subclass, so a timeout lands here too.
        return Health.RED
    if alive:
        return Health.GREEN
    return Health.RED
|
|
252
|
+
|