@agentunion/kite 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +208 -0
- package/README.md +48 -0
- package/cli.js +1 -1
- package/extensions/agents/assistant/entry.py +30 -81
- package/extensions/agents/assistant/module.md +1 -1
- package/extensions/agents/assistant/server.py +83 -122
- package/extensions/channels/acp_channel/entry.py +30 -81
- package/extensions/channels/acp_channel/module.md +1 -1
- package/extensions/channels/acp_channel/server.py +83 -122
- package/extensions/event_hub_bench/entry.py +81 -121
- package/extensions/services/backup/entry.py +213 -85
- package/extensions/services/model_service/entry.py +213 -85
- package/extensions/services/watchdog/entry.py +513 -460
- package/extensions/services/watchdog/monitor.py +55 -69
- package/extensions/services/web/entry.py +11 -108
- package/extensions/services/web/server.py +120 -77
- package/{core/registry → kernel}/entry.py +65 -37
- package/{core/event_hub/hub.py → kernel/event_hub.py} +61 -81
- package/kernel/module.md +33 -0
- package/{core/registry/store.py → kernel/registry_store.py} +13 -4
- package/kernel/rpc_router.py +388 -0
- package/kernel/server.py +267 -0
- package/launcher/__init__.py +10 -0
- package/launcher/__main__.py +6 -0
- package/launcher/count_lines.py +258 -0
- package/{core/launcher → launcher}/entry.py +693 -767
- package/launcher/logging_setup.py +289 -0
- package/{core/launcher → launcher}/module_scanner.py +11 -6
- package/main.py +11 -350
- package/package.json +6 -9
- package/__init__.py +0 -1
- package/__main__.py +0 -15
- package/core/event_hub/BENCHMARK.md +0 -94
- package/core/event_hub/__init__.py +0 -0
- package/core/event_hub/bench.py +0 -459
- package/core/event_hub/bench_extreme.py +0 -308
- package/core/event_hub/bench_perf.py +0 -350
- package/core/event_hub/entry.py +0 -436
- package/core/event_hub/module.md +0 -20
- package/core/event_hub/server.py +0 -269
- package/core/kite_log.py +0 -241
- package/core/launcher/__init__.py +0 -0
- package/core/registry/__init__.py +0 -0
- package/core/registry/module.md +0 -30
- package/core/registry/server.py +0 -339
- package/extensions/services/backup/server.py +0 -244
- package/extensions/services/model_service/server.py +0 -236
- package/extensions/services/watchdog/server.py +0 -229
- /package/{core → kernel}/__init__.py +0 -0
- /package/{core/event_hub → kernel}/dedup.py +0 -0
- /package/{core/event_hub → kernel}/router.py +0 -0
- /package/{core/launcher → launcher}/module.md +0 -0
- /package/{core/launcher → launcher}/process_manager.py +0 -0
package/core/registry/server.py
DELETED
|
@@ -1,339 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Registry HTTP server.
|
|
3
|
-
7 endpoints: /modules, /lookup, /get/{path}, /tokens, /verify, /query, /health.
|
|
4
|
-
All endpoints except /health require Bearer token auth.
|
|
5
|
-
|
|
6
|
-
Delayed ready mechanism (mechanism 7):
|
|
7
|
-
Registry does NOT send module.ready immediately after HTTP starts.
|
|
8
|
-
When Event Hub registers (POST /modules with metadata.ws_endpoint),
|
|
9
|
-
Registry connects to Event Hub WS and then sends module.ready.
|
|
10
|
-
"""
|
|
11
|
-
|
|
12
|
-
import asyncio
|
|
13
|
-
import json
|
|
14
|
-
import uuid
|
|
15
|
-
from typing import Any
|
|
16
|
-
|
|
17
|
-
import websockets
|
|
18
|
-
from fastapi import FastAPI, Request, HTTPException
|
|
19
|
-
from fastapi.responses import JSONResponse
|
|
20
|
-
|
|
21
|
-
from .store import RegistryStore
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class RegistryServer:
    """FastAPI-based Registry HTTP server.

    Serves the registry endpoints (``/modules``, ``/lookup``, ``/get/{path}``,
    ``/tokens``, ``/verify``, ``/query``, ``/health``). Once Event Hub has
    registered itself with a ``metadata.ws_endpoint``, the server opens a
    WebSocket session to it, announces ``module.ready`` and listens for
    ``module.shutdown``.
    """

    # Reconnect backoff bounds (seconds) for the Event Hub WebSocket.
    _RETRY_INITIAL = 0.5
    _RETRY_MAX = 30
    # Seconds between two heartbeat-TTL sweeps.
    _TTL_INTERVAL = 10
    _BEARER_PREFIX = "Bearer "

    def __init__(self, store: RegistryStore, launcher_token: str = "", advertise_ip: str = "127.0.0.1"):
        """Create the server and build its FastAPI app.

        Args:
            store: Backing registry store (modules, tokens, lookups).
            launcher_token: Fallback token used for the Event Hub connection
                when the store holds no dedicated ``registry`` token.
            advertise_ip: IP placed in the ``api_endpoint`` this server
                advertises for itself.
        """
        self.store = store
        self.launcher_token = launcher_token
        self.advertise_ip = advertise_ip
        self.port: int = 0  # set by entry.py before uvicorn.run
        self.app = self._create_app()
        self._ttl_task: asyncio.Task | None = None
        # Event Hub WebSocket
        self._event_hub_ws_url: str = ""
        self._ws: object | None = None
        self._ws_task: asyncio.Task | None = None
        self._event_hub_connected = False
        self._ready_sent = False
        self._uvicorn_server = None  # set by entry.py for graceful shutdown
        self._shutting_down = False

    # ── Auth helpers ──

    def _extract_token(self, request: Request) -> str:
        """Return the Bearer token from the Authorization header, or ""."""
        auth = request.headers.get("Authorization", "")
        if auth.startswith(self._BEARER_PREFIX):
            return auth[len(self._BEARER_PREFIX):].strip()
        return ""

    def _require_auth(self, request: Request) -> str:
        """Verify the caller's token and return its module_id.

        Raises:
            HTTPException: 401 when the store does not recognise the token.
        """
        token = self._extract_token(request)
        module_id = self.store.verify_token(token)
        if module_id is None:
            raise HTTPException(status_code=401, detail="Invalid or missing token")
        return module_id

    def _require_launcher(self, request: Request):
        """Verify the caller is Launcher.

        Raises:
            HTTPException: 403 when the token is not the Launcher's.
        """
        token = self._extract_token(request)
        if not self.store.is_launcher(token):
            raise HTTPException(status_code=403, detail="Only Launcher may call this endpoint")

    @staticmethod
    async def _read_json(request: Request, *, require_object: bool = True) -> Any:
        """Parse the request body as JSON.

        A malformed body is a client error, so it is reported as 400 rather
        than surfacing as an unhandled exception (HTTP 500).

        Args:
            request: Incoming request.
            require_object: When true, reject bodies that are not JSON objects.

        Raises:
            HTTPException: 400 on invalid JSON or (optionally) a non-object body.
        """
        try:
            body = await request.json()
        except ValueError:  # JSONDecodeError / UnicodeDecodeError
            raise HTTPException(400, "Invalid JSON body") from None
        if require_object and not isinstance(body, dict):
            raise HTTPException(400, "JSON object required")
        return body

    # ── Event Hub connection + delayed ready ──

    async def _try_connect_event_hub(self):
        """Start the Event Hub WebSocket task once Event Hub is registered.

        No-op when already connected, when Event Hub is not in the store, or
        when its metadata carries no ``ws_endpoint``.
        """
        if self._event_hub_connected:
            return
        eh = self.store.modules.get("event_hub")
        if not eh:
            return
        ws_url = (eh.get("metadata") or {}).get("ws_endpoint", "")
        if not ws_url:
            return
        self._event_hub_ws_url = ws_url
        if not self._ws_task:
            self._ws_task = asyncio.create_task(self._ws_loop())

    async def _ws_loop(self):
        """Keep a session to Event Hub alive, reconnecting with backoff.

        The backoff restarts from its initial value whenever a session was
        actually established, whether it later ended cleanly or with an
        error. Only consecutive failed connection attempts grow the delay.
        """
        retry_delay = self._RETRY_INITIAL
        while not self._shutting_down:
            try:
                await self._ws_connect()
            except asyncio.CancelledError:
                return
            except Exception as e:
                # _event_hub_connected is only True if this attempt got as far
                # as an open session, so the failure is a dropped connection
                # rather than another failed attempt.
                if self._event_hub_connected:
                    retry_delay = self._RETRY_INITIAL
                print(f"[registry] Event Hub connection error: {e}, retrying in {retry_delay:.1f}s")
            else:
                retry_delay = self._RETRY_INITIAL
            self._ws = None
            self._event_hub_connected = False
            if self._shutting_down:
                return
            await asyncio.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, self._RETRY_MAX)  # exponential backoff

    def _build_ws_url(self) -> str:
        """Return the Event Hub WebSocket URL including auth query parameters.

        Uses the registry's own per-module token when the store has one, to
        avoid conflicting with Launcher's connection; otherwise falls back to
        the launcher token. Parameters are URL-encoded so tokens containing
        reserved characters survive the round trip.
        """
        from urllib.parse import urlencode

        token = self.store.token_map.get("registry", "") or self.launcher_token
        return f"{self._event_hub_ws_url}?{urlencode({'token': token, 'id': 'registry'})}"

    async def _ws_connect(self):
        """Run a single WebSocket session with Event Hub.

        Subscribes to ``module.shutdown``, sends ``module.ready`` until one
        send has succeeded, then processes incoming messages until the socket
        closes or a shutdown request addressed to the registry arrives.
        """
        async with websockets.connect(
            self._build_ws_url(),
            open_timeout=3,
            ping_interval=None,
            ping_timeout=None,
            close_timeout=10,
        ) as ws:
            self._ws = ws
            self._event_hub_connected = True
            print("[registry] Connected to Event Hub")

            # Subscribe to shutdown events
            await ws.send(json.dumps({"type": "subscribe", "events": ["module.shutdown"]}))

            # Delayed ready mechanism: announce readiness on the first session
            # in which the announcement actually goes out.
            if not self._ready_sent:
                # Self-register so watchdog (and others) can discover registry
                # via /lookup. Without this, health checks fail (empty api_endpoint).
                self.store.register_module({
                    "module_id": "registry",
                    "module_type": "infrastructure",
                    "api_endpoint": f"http://{self.advertise_ip}:{self.port}",
                    "health_endpoint": "/health",
                })
                # Only latch the flag on success so a failed send is retried
                # on the next connection instead of being lost forever.
                self._ready_sent = await self._send_module_ready()

            async for raw in ws:
                try:
                    msg = json.loads(raw)
                except (ValueError, TypeError):
                    # Not JSON (or an undecodable binary frame): ignore it.
                    continue
                try:
                    msg_type = msg.get("type", "")
                    if msg_type == "event":
                        event = msg.get("event", "")
                        data = msg.get("data") if isinstance(msg.get("data"), dict) else {}
                        if event == "module.shutdown" and data.get("module_id") == "registry":
                            await self._handle_shutdown(data)
                            return
                    elif msg_type == "error":
                        print(f"[registry] Event Hub error: {msg.get('message')}")
                except Exception as e:
                    # A bad message must not tear down the session.
                    print(f"[registry] 事件处理异常(已忽略): {e}")

    async def _send_module_ready(self) -> bool:
        """Send the module.ready event to Event Hub.

        Returns:
            True when the event was handed to the socket, False on failure.
        """
        from datetime import datetime, timezone

        msg = {
            "type": "event",
            "event_id": str(uuid.uuid4()),
            "event": "module.ready",
            "source": "registry",
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "data": {
                "module_id": "registry",
                "api_endpoint": f"http://{self.advertise_ip}:{self.port}",
                "graceful_shutdown": True,
            },
        }
        try:
            await self._ws.send(json.dumps(msg))
        except Exception as e:
            print(f"[registry] Failed to send module.ready: {e}")
            return False
        print("[registry] Sent module.ready")
        return True

    async def _publish_event(self, event_type: str, data: dict):
        """Publish an event to Event Hub. Best-effort, no-op if not connected.

        Send failures are logged and never raised to the caller.
        """
        if not self._ws:
            return
        from datetime import datetime, timezone

        msg = {
            "type": "event",
            "event_id": str(uuid.uuid4()),
            "event": event_type,
            "source": "registry",
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "data": data,
        }
        try:
            await self._ws.send(json.dumps(msg))
        except Exception as e:
            print(f"[registry] Failed to publish {event_type}: {e}")

    async def _handle_shutdown(self, data: dict):
        """Handle module.shutdown: ack, cleanup, ready, exit."""
        print("[registry] Received shutdown request")
        self._shutting_down = True
        # Step 1: Send ack
        await self._publish_event("module.shutdown.ack", {
            "module_id": "registry",
            "estimated_cleanup": 2,
        })
        # Step 2: Cleanup
        if self._ttl_task:
            self._ttl_task.cancel()
        # Step 3: Send ready (before closing WS!)
        await self._publish_event("module.shutdown.ready", {
            "module_id": "registry",
        })
        print("[registry] Shutdown ready, exiting")
        # Step 4: Trigger uvicorn exit (WS will close when uvicorn shuts down)
        if self._uvicorn_server:
            self._uvicorn_server.should_exit = True

    # ── App factory ──

    def _create_app(self) -> FastAPI:
        """Build the FastAPI application and register all endpoints."""
        app = FastAPI(title="Kite Registry", docs_url=None, redoc_url=None)
        server = self

        @app.on_event("startup")
        async def _startup():
            server._ttl_task = asyncio.create_task(server._ttl_loop())

        @app.on_event("shutdown")
        async def _shutdown():
            if server._ttl_task:
                server._ttl_task.cancel()
            if server._ws_task:
                server._ws_task.cancel()
            if server._ws:
                await server._ws.close()

        # ── 1. POST /modules ──

        @app.post("/modules")
        async def modules(request: Request):
            caller = server._require_auth(request)
            body = await server._read_json(request)
            action = body.get("action", "")

            if action == "register":
                if "module_id" not in body:
                    raise HTTPException(400, "module_id required")
                mid = body["module_id"]
                # Only Launcher or the module itself may register
                if caller not in ("launcher", mid):
                    raise HTTPException(403, f"Module '{caller}' cannot register as '{mid}'")
                result = server.store.register_module(body)
                if result.get("ok"):
                    await server._publish_event("module.registered", {"module_id": mid})
                    # If Event Hub just registered, connect and send module.ready
                    ws_endpoint = (body.get("metadata") or {}).get("ws_endpoint")
                    if ws_endpoint and mid == "event_hub":
                        await server._try_connect_event_hub()
                return result

            if action == "deregister":
                mid = body.get("module_id")
                if not mid:
                    raise HTTPException(400, "module_id required")
                if caller not in ("launcher", mid):
                    raise HTTPException(403, f"Module '{caller}' cannot deregister '{mid}'")
                result = server.store.deregister_module(mid)
                if result.get("ok"):
                    await server._publish_event("module.unregistered", {"module_id": mid})
                return result

            if action == "heartbeat":
                mid = body.get("module_id")
                if not mid:
                    raise HTTPException(400, "module_id required")
                if caller not in ("launcher", mid):
                    raise HTTPException(403, f"Module '{caller}' cannot heartbeat for '{mid}'")
                result = server.store.heartbeat(mid)
                if result.get("ok"):
                    await server._publish_event("module.heartbeat", {"module_id": mid})
                return result

            raise HTTPException(400, f"Unknown action: {action}")

        # ── 2. GET /lookup ──

        @app.get("/lookup")
        async def lookup(
            request: Request,
            field: str | None = None,
            module: str | None = None,
            value: str | None = None,
        ):
            server._require_auth(request)
            return server.store.lookup(field=field, module=module, value=value)

        # ── 3. GET /get/{path} ──

        @app.get("/get/{path:path}")
        async def get_by_path(request: Request, path: str):
            server._require_auth(request)
            val, found = server.store.get_by_path(path)
            if not found:
                raise HTTPException(404, f"Path not found: {path}")
            return val

        # ── 4. POST /tokens ──

        @app.post("/tokens")
        async def register_tokens(request: Request):
            server._require_launcher(request)
            # Payload shape is owned by the store; only JSON validity is checked here.
            body = await server._read_json(request, require_object=False)
            server.store.register_tokens(body)
            return {"ok": True}

        # ── 5. POST /verify ──

        @app.post("/verify")
        async def verify_token(request: Request):
            server._require_auth(request)
            body = await server._read_json(request)
            target_token = body.get("token", "")
            module_id = server.store.verify_token(target_token)
            if module_id:
                return {"ok": True, "module_id": module_id}
            return {"ok": False}

        # ── 6. POST /query (stub) ──

        @app.post("/query")
        async def query(request: Request):
            server._require_auth(request)
            body = await server._read_json(request)
            question = body.get("question", "")
            return {"ok": False, "error": "LLM query not implemented yet", "question": question}

        # ── 7. GET /health ──

        @app.get("/health")
        async def health():
            registered = server.store.modules
            return {
                "status": "healthy",
                "module_count": len(registered),
                "online_count": sum(
                    1 for m in registered.values()
                    if m.get("status") == "online"
                ),
            }

        return app

    async def _ttl_loop(self):
        """Periodically check heartbeat TTL and publish offline events.

        A failing sweep is logged and the loop keeps running; otherwise one
        store error would silently disable offline detection for the rest of
        the process lifetime.
        """
        while True:
            await asyncio.sleep(self._TTL_INTERVAL)
            try:
                expired = self.store.check_ttl()
                for mid in expired:
                    await self._publish_event("module.offline", {"module_id": mid})
            except Exception as e:
                print(f"[registry] TTL check failed: {e}")
|
|
@@ -1,244 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Backup HTTP server.
|
|
3
|
-
Exposes /health and /status endpoints.
|
|
4
|
-
Connects to Event Hub via WebSocket for event publishing and subscription.
|
|
5
|
-
Sends periodic heartbeat to Registry and test events to Event Hub.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
import asyncio
|
|
9
|
-
import json
|
|
10
|
-
import time
|
|
11
|
-
import uuid
|
|
12
|
-
from datetime import datetime, timezone
|
|
13
|
-
|
|
14
|
-
import httpx
|
|
15
|
-
import websockets
|
|
16
|
-
from fastapi import FastAPI
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class BackupServer:
    """Backup module HTTP server.

    Exposes ``/health`` and ``/status``, keeps a WebSocket session to Event
    Hub for publishing/subscribing, sends periodic heartbeats to Registry and
    periodic test events to Event Hub.
    """

    # Reconnect backoff bounds (seconds) for the Event Hub WebSocket.
    _RETRY_INITIAL = 0.5
    _RETRY_MAX = 30
    # Seconds between Registry heartbeats / Event Hub test events.
    _HEARTBEAT_INTERVAL = 30
    _TEST_EVENT_INTERVAL = 10

    def __init__(self, token: str = "", registry_url: str = "",
                 event_hub_ws: str = "", boot_t0: float = 0):
        """Create the server and build its FastAPI app.

        Args:
            token: Bearer token used for Registry and Event Hub.
            registry_url: Base URL of the Registry HTTP API.
            event_hub_ws: Event Hub WebSocket URL; empty disables the
                Event Hub connection.
            boot_t0: ``time.monotonic()`` reference used to print elapsed
                boot time in log lines; 0 disables those timings.
        """
        self.token = token
        self.registry_url = registry_url
        self.event_hub_ws = event_hub_ws
        self.boot_t0 = boot_t0
        self._ws_task: asyncio.Task | None = None
        self._heartbeat_task: asyncio.Task | None = None
        self._test_task: asyncio.Task | None = None
        self._ws: object | None = None
        self._ready_sent = False
        self._shutting_down = False
        self._uvicorn_server = None  # set by entry.py for graceful shutdown
        self._start_time = time.time()
        self.app = self._create_app()

    # ── Logging helpers ──

    def _elapsed(self) -> float:
        """Return seconds since ``boot_t0``, or 0 when no reference was given."""
        return time.monotonic() - self.boot_t0 if self.boot_t0 else 0

    def _elapsed_suffix(self) -> str:
        """Return ``" (N.Ns)"`` for log lines, or "" when no timing is available."""
        elapsed = self._elapsed()
        return f" ({elapsed:.1f}s)" if elapsed else ""

    def _uptime_seconds(self) -> int:
        """Return whole seconds since this server object was created."""
        return round(time.time() - self._start_time)

    # ── App factory ──

    def _create_app(self) -> FastAPI:
        """Build the FastAPI application with lifecycle hooks and endpoints."""
        app = FastAPI(title="Kite Backup", docs_url=None, redoc_url=None)
        server = self

        @app.on_event("startup")
        async def _startup():
            print(f"[backup] FastAPI startup event triggered ({server._elapsed():.1f}s)")
            server._heartbeat_task = asyncio.create_task(server._heartbeat_loop())
            if server.event_hub_ws:
                print("[backup] Creating Event Hub connection task...")
                server._ws_task = asyncio.create_task(server._ws_loop())
                # Test events can only be delivered over the Event Hub session.
                server._test_task = asyncio.create_task(server._test_event_loop())
            print("[backup] All background tasks created")

        @app.on_event("shutdown")
        async def _shutdown():
            for task in (server._heartbeat_task, server._ws_task, server._test_task):
                if task:
                    task.cancel()
            if server._ws:
                await server._ws.close()
            print("[backup] Shutdown complete")

        @app.get("/health")
        async def health():
            return {
                "status": "healthy",
                "details": {
                    "event_hub_connected": server._ws is not None,
                    "uptime_seconds": server._uptime_seconds(),
                },
            }

        @app.get("/status")
        async def status():
            return {
                "module": "backup",
                "status": "running",
                "event_hub_connected": server._ws is not None,
                "uptime_seconds": server._uptime_seconds(),
            }

        return app

    # ── Event Hub WebSocket client ──

    async def _ws_loop(self):
        """Keep a session to Event Hub alive, reconnecting with backoff.

        The backoff restarts from its initial value whenever a session was
        actually established, whether it later ended cleanly or with an
        error. Only consecutive failed connection attempts grow the delay.
        """
        print(f"[backup] _ws_loop started ({self._elapsed():.1f}s)")
        retry_delay = self._RETRY_INITIAL
        while not self._shutting_down:
            try:
                print(f"[backup] Attempting Event Hub connection... ({self._elapsed():.1f}s)")
                await self._ws_connect()
            except asyncio.CancelledError:
                return
            except Exception as e:
                # self._ws is only set once the socket opened, so a non-None
                # value means an established session dropped.
                if self._ws is not None:
                    retry_delay = self._RETRY_INITIAL
                print(f"[backup] Event Hub connection error: {e}, retrying in {retry_delay:.1f}s")
            else:
                retry_delay = self._RETRY_INITIAL
            self._ws = None
            if self._shutting_down:
                return
            await asyncio.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, self._RETRY_MAX)  # exponential backoff

    def _build_ws_url(self) -> str:
        """Return the Event Hub WebSocket URL with URL-encoded auth parameters."""
        from urllib.parse import urlencode

        return f"{self.event_hub_ws}?{urlencode({'token': self.token, 'id': 'backup'})}"

    async def _ws_connect(self):
        """Single WebSocket session: connect, subscribe, receive loop."""
        print(f"[backup] WS connecting to {self.event_hub_ws}")
        async with websockets.connect(
            self._build_ws_url(),
            open_timeout=3,
            ping_interval=None,
            ping_timeout=None,
            close_timeout=10,
        ) as ws:
            self._ws = ws
            print(f"[backup] Connected to Event Hub{self._elapsed_suffix()}")

            # Subscribe to module lifecycle events + shutdown
            await ws.send(json.dumps({
                "type": "subscribe",
                "events": ["module.started", "module.stopped", "module.shutdown"],
            }))

            # Send module.ready (once) so Launcher knows we're up
            if not self._ready_sent:
                ready_msg = {
                    "type": "event",
                    "event_id": str(uuid.uuid4()),
                    "event": "module.ready",
                    "source": "backup",
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                    "data": {
                        "module_id": "backup",
                        "graceful_shutdown": True,
                    },
                }
                await ws.send(json.dumps(ready_msg))
                self._ready_sent = True
                print(f"[backup] module.ready sent{self._elapsed_suffix()}")

            # Receive loop
            async for raw in ws:
                try:
                    msg = json.loads(raw)
                except (ValueError, TypeError):
                    # Not JSON (or an undecodable binary frame): ignore it.
                    continue

                try:
                    msg_type = msg.get("type", "")
                    if msg_type == "event":
                        if msg.get("event", "") == "module.shutdown":
                            data = msg.get("data") if isinstance(msg.get("data"), dict) else {}
                            if data.get("module_id", "") == "backup":
                                await self._handle_shutdown(ws)
                                return
                    elif msg_type == "ack":
                        pass  # publish confirmed
                    elif msg_type == "error":
                        print(f"[backup] Event Hub error: {msg.get('message')}")
                except Exception as e:
                    # A bad message must not tear down the session.
                    print(f"[backup] 事件处理异常(已忽略): {e}")

    async def _handle_shutdown(self, ws):
        """Handle module.shutdown: ack → cleanup → ready → exit.

        Args:
            ws: The session the request arrived on. Kept for interface
                compatibility; events are published through ``self._ws``.
        """
        print("[backup] Received module.shutdown")
        self._shutting_down = True

        # Step 1: Send ack
        await self._publish_event({
            "event": "module.shutdown.ack",
            "data": {"module_id": "backup", "estimated_cleanup": 2},
        })
        print("[backup] shutdown ack sent")

        # Step 2: Cleanup (cancel background tasks)
        if self._heartbeat_task:
            self._heartbeat_task.cancel()
        if self._test_task:
            self._test_task.cancel()

        # Step 3: Send ready (before closing WS!)
        await self._publish_event({
            "event": "module.shutdown.ready",
            "data": {"module_id": "backup"},
        })
        print("[backup] Shutdown complete")

        # Step 4: Trigger uvicorn exit (WS will close when uvicorn shuts down)
        if self._uvicorn_server:
            self._uvicorn_server.should_exit = True

    async def _publish_event(self, event: dict) -> bool:
        """Publish an event to Event Hub via WebSocket.

        Args:
            event: Mapping with an ``event`` name and optional ``data`` dict.

        Returns:
            True when the message was handed to the socket; False when there
            is no connection or the send failed (failures are logged).
        """
        if not self._ws:
            return False
        msg = {
            "type": "event",
            "event_id": str(uuid.uuid4()),
            "event": event.get("event", ""),
            "source": "backup",
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "data": event.get("data", {}),
        }
        try:
            await self._ws.send(json.dumps(msg))
        except Exception as e:
            print(f"[backup] Failed to publish event: {e}")
            return False
        return True

    # ── Heartbeat to Registry ──

    async def _heartbeat_loop(self):
        """Send a heartbeat to Registry every 30 seconds.

        One HTTP client is reused for the lifetime of the loop. Transport
        errors and error responses are logged and the loop carries on.
        """
        payload = {"action": "heartbeat", "module_id": "backup"}
        async with httpx.AsyncClient() as client:
            while True:
                await asyncio.sleep(self._HEARTBEAT_INTERVAL)
                try:
                    resp = await client.post(
                        f"{self.registry_url}/modules",
                        json=payload,
                        headers={"Authorization": f"Bearer {self.token}"},
                        timeout=5,
                    )
                except Exception as e:
                    print(f"[backup] heartbeat failed: {e}")
                    continue
                if resp.status_code >= 400:
                    print(f"[backup] heartbeat rejected: HTTP {resp.status_code}")
                else:
                    print("[backup] heartbeat sent")

    # ── Test event loop ──

    async def _test_event_loop(self):
        """Publish a test event every 10 seconds."""
        while True:
            await asyncio.sleep(self._TEST_EVENT_INTERVAL)
            published = await self._publish_event({
                "event": "backup.test",
                "data": {
                    "message": "test event from backup",
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                },
            })
            # Only claim success when something actually went out.
            if published:
                print("[backup] test event published")
|