@agentunion/kite 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/__init__.py +1 -0
  2. package/__main__.py +15 -0
  3. package/cli.js +70 -0
  4. package/core/__init__.py +0 -0
  5. package/core/__pycache__/__init__.cpython-313.pyc +0 -0
  6. package/core/event_hub/BENCHMARK.md +94 -0
  7. package/core/event_hub/__init__.py +0 -0
  8. package/core/event_hub/__pycache__/__init__.cpython-313.pyc +0 -0
  9. package/core/event_hub/__pycache__/bench.cpython-313.pyc +0 -0
  10. package/core/event_hub/__pycache__/bench_perf.cpython-313.pyc +0 -0
  11. package/core/event_hub/__pycache__/dedup.cpython-313.pyc +0 -0
  12. package/core/event_hub/__pycache__/entry.cpython-313.pyc +0 -0
  13. package/core/event_hub/__pycache__/hub.cpython-313.pyc +0 -0
  14. package/core/event_hub/__pycache__/router.cpython-313.pyc +0 -0
  15. package/core/event_hub/__pycache__/server.cpython-313.pyc +0 -0
  16. package/core/event_hub/bench.py +459 -0
  17. package/core/event_hub/bench_extreme.py +308 -0
  18. package/core/event_hub/bench_perf.py +350 -0
  19. package/core/event_hub/bench_results/.gitkeep +0 -0
  20. package/core/event_hub/bench_results/2026-02-28_13-26-48.json +51 -0
  21. package/core/event_hub/bench_results/2026-02-28_13-44-45.json +51 -0
  22. package/core/event_hub/bench_results/2026-02-28_13-45-39.json +51 -0
  23. package/core/event_hub/dedup.py +31 -0
  24. package/core/event_hub/entry.py +113 -0
  25. package/core/event_hub/hub.py +263 -0
  26. package/core/event_hub/module.md +21 -0
  27. package/core/event_hub/router.py +21 -0
  28. package/core/event_hub/server.py +138 -0
  29. package/core/event_hub_bench/entry.py +371 -0
  30. package/core/event_hub_bench/module.md +25 -0
  31. package/core/launcher/__init__.py +0 -0
  32. package/core/launcher/__pycache__/__init__.cpython-313.pyc +0 -0
  33. package/core/launcher/__pycache__/entry.cpython-313.pyc +0 -0
  34. package/core/launcher/__pycache__/module_scanner.cpython-313.pyc +0 -0
  35. package/core/launcher/__pycache__/process_manager.cpython-313.pyc +0 -0
  36. package/core/launcher/data/log/lifecycle.jsonl +1045 -0
  37. package/core/launcher/data/processes_14752.json +32 -0
  38. package/core/launcher/data/token.txt +1 -0
  39. package/core/launcher/entry.py +965 -0
  40. package/core/launcher/module.md +37 -0
  41. package/core/launcher/module_scanner.py +253 -0
  42. package/core/launcher/process_manager.py +435 -0
  43. package/core/registry/__init__.py +0 -0
  44. package/core/registry/__pycache__/__init__.cpython-313.pyc +0 -0
  45. package/core/registry/__pycache__/entry.cpython-313.pyc +0 -0
  46. package/core/registry/__pycache__/server.cpython-313.pyc +0 -0
  47. package/core/registry/__pycache__/store.cpython-313.pyc +0 -0
  48. package/core/registry/data/port.txt +1 -0
  49. package/core/registry/data/port_14752.txt +1 -0
  50. package/core/registry/data/port_484.txt +1 -0
  51. package/core/registry/entry.py +73 -0
  52. package/core/registry/module.md +30 -0
  53. package/core/registry/server.py +256 -0
  54. package/core/registry/store.py +232 -0
  55. package/extensions/__init__.py +0 -0
  56. package/extensions/__pycache__/__init__.cpython-313.pyc +0 -0
  57. package/extensions/services/__init__.py +0 -0
  58. package/extensions/services/__pycache__/__init__.cpython-313.pyc +0 -0
  59. package/extensions/services/watchdog/__init__.py +0 -0
  60. package/extensions/services/watchdog/__pycache__/__init__.cpython-313.pyc +0 -0
  61. package/extensions/services/watchdog/__pycache__/entry.cpython-313.pyc +0 -0
  62. package/extensions/services/watchdog/__pycache__/monitor.cpython-313.pyc +0 -0
  63. package/extensions/services/watchdog/__pycache__/server.cpython-313.pyc +0 -0
  64. package/extensions/services/watchdog/entry.py +143 -0
  65. package/extensions/services/watchdog/module.md +25 -0
  66. package/extensions/services/watchdog/monitor.py +420 -0
  67. package/extensions/services/watchdog/server.py +167 -0
  68. package/main.py +17 -0
  69. package/package.json +27 -0
@@ -0,0 +1,256 @@
1
+ """
2
+ Registry HTTP server.
3
+ 7 endpoints: /modules, /lookup, /get/{path}, /tokens, /verify, /query, /health.
4
+ All endpoints except /health require Bearer token auth.
5
+ Connects to Event Hub to publish module lifecycle events.
6
+ """
7
+
8
+ import asyncio
9
+ import json
10
+ import uuid
11
+ from typing import Any
12
+
13
+ import websockets
14
+ from fastapi import FastAPI, Request, HTTPException
15
+ from fastapi.responses import JSONResponse
16
+
17
+ from .store import RegistryStore
18
+
19
+
20
class RegistryServer:
    """FastAPI-based Registry HTTP server.

    Wraps a ``RegistryStore`` behind seven HTTP endpoints (``/modules``,
    ``/lookup``, ``/get/{path}``, ``/tokens``, ``/verify``, ``/query``,
    ``/health``). Every endpoint except ``/health`` requires a Bearer token.
    Once a module with id ``event_hub`` registers, the server opens a
    WebSocket to it and publishes module lifecycle events on a best-effort
    basis.
    """

    def __init__(self, store: RegistryStore, launcher_token: str = ""):
        self.store = store
        self.launcher_token = launcher_token
        self.app = self._create_app()
        # Background task that periodically expires stale heartbeats.
        self._ttl_task: asyncio.Task | None = None
        # Event Hub WebSocket state.
        self._event_hub_ws_url: str = ""
        self._ws: object | None = None
        self._ws_task: asyncio.Task | None = None

    def _extract_token(self, request: Request) -> str:
        """Return the Bearer token from the Authorization header, or ""."""
        auth = request.headers.get("Authorization", "")
        if auth.startswith("Bearer "):
            return auth[7:].strip()
        return ""

    def _require_auth(self, request: Request) -> str:
        """Verify the request token and return the caller's module_id.

        Raises:
            HTTPException: 401 when the token is missing or unknown.
        """
        token = self._extract_token(request)
        # An empty token must never authenticate, even if a stored token
        # (e.g. a defaulted launcher token) happens to be the empty string.
        module_id = self.store.verify_token(token) if token else None
        if module_id is None:
            raise HTTPException(status_code=401, detail="Invalid or missing token")
        return module_id

    def _require_launcher(self, request: Request):
        """Verify the caller is Launcher.

        Raises:
            HTTPException: 403 when the token is missing or is not the
                launcher token.
        """
        token = self._extract_token(request)
        if not token or not self.store.is_launcher(token):
            raise HTTPException(status_code=403, detail="Only Launcher may call this endpoint")

    @staticmethod
    async def _read_json_object(request: Request) -> dict:
        """Parse the request body and require it to be a JSON object.

        Raises:
            HTTPException: 400 when the body is not valid JSON or is not an
                object, so handlers can safely call ``.get`` on the result.
        """
        try:
            body = await request.json()
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            raise HTTPException(status_code=400, detail="Request body must be valid JSON") from None
        if not isinstance(body, dict):
            raise HTTPException(status_code=400, detail="Request body must be a JSON object")
        return body

    # ── Event Hub connection ──

    async def _try_connect_event_hub(self):
        """Start the Event Hub connection task if Event Hub is registered.

        No-op when a connection is already open, when no ``event_hub`` module
        is in the store, or when its metadata carries no ``ws_endpoint``.
        """
        if self._ws:
            return
        # Look up Event Hub ws_endpoint from our own store
        eh = self.store.modules.get("event_hub")
        if not eh:
            return
        metadata = eh.get("metadata") or {}
        if not isinstance(metadata, dict):
            return
        ws_url = metadata.get("ws_endpoint", "")
        if not ws_url:
            return
        self._event_hub_ws_url = ws_url
        if not self._ws_task:
            self._ws_task = asyncio.create_task(self._ws_loop())

    async def _ws_loop(self):
        """Keep a session to Event Hub open, reconnecting every 5 seconds."""
        while True:
            try:
                await self._ws_connect()
            except asyncio.CancelledError:
                return
            except Exception as e:
                print(f"[registry] Event Hub connection error: {e}")
            # Reached after both a failed and a cleanly closed session, so a
            # normal close does not turn into a tight reconnect loop.
            self._ws = None
            await asyncio.sleep(5)

    async def _ws_connect(self):
        """Run a single WebSocket session until the peer closes it."""
        # Use registry's own per-module token (registered by Launcher via /tokens)
        # to avoid conflicting with Launcher's connection (same launcher_token → same module_id)
        token = self.store.token_map.get("registry", "") or self.launcher_token
        ws_url = f"{self._event_hub_ws_url}?token={token}"
        async with websockets.connect(ws_url) as ws:
            self._ws = ws
            print("[registry] Connected to Event Hub")
            try:
                async for raw in ws:
                    try:
                        msg = json.loads(raw)
                    except (json.JSONDecodeError, TypeError):
                        continue
                    if not isinstance(msg, dict):
                        # Ignore well-formed JSON that is not an object.
                        continue
                    msg_type = msg.get("type", "")
                    if msg_type == "error":
                        print(f"[registry] Event Hub error: {msg.get('message')}")
            finally:
                # Never leave a closed socket visible to _publish_event.
                self._ws = None

    async def _publish_event(self, event_type: str, data: dict):
        """Publish an event to Event Hub. Best-effort, no-op if not connected.

        Send failures are logged and otherwise ignored so that a broken Event
        Hub link never fails the HTTP request that triggered the event.
        """
        ws = self._ws
        if not ws:
            return
        from datetime import datetime, timezone
        msg = {
            "type": "event",
            "event_id": str(uuid.uuid4()),
            "event": event_type,
            "source": "registry",
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "data": data,
        }
        try:
            await ws.send(json.dumps(msg))
        except Exception as e:
            print(f"[registry] Failed to publish '{event_type}': {e}")

    # ── App factory ──

    def _create_app(self) -> FastAPI:
        """Build the FastAPI application and bind all route handlers."""
        app = FastAPI(title="Kite Registry", docs_url=None, redoc_url=None)
        server = self

        @app.on_event("startup")
        async def _startup():
            server._ttl_task = asyncio.create_task(server._ttl_loop())

        @app.on_event("shutdown")
        async def _shutdown():
            if server._ttl_task:
                server._ttl_task.cancel()
            if server._ws_task:
                server._ws_task.cancel()
            if server._ws:
                await server._ws.close()

        # ── 1. POST /modules ──

        @app.post("/modules")
        async def modules(request: Request):
            """Dispatch register / deregister / heartbeat by the body's action."""
            caller = server._require_auth(request)
            body = await server._read_json_object(request)
            action = body.get("action", "")

            if action == "register":
                if "module_id" not in body:
                    raise HTTPException(400, "module_id required")
                # Only Launcher or the module itself may register
                if caller != "launcher" and caller != body["module_id"]:
                    raise HTTPException(403, f"Module '{caller}' cannot register as '{body['module_id']}'")
                result = server.store.register_module(body)
                if result.get("ok"):
                    mid = body["module_id"]
                    await server._publish_event("module.registered", {"module_id": mid})
                    # If Event Hub just registered, try connecting
                    if mid == "event_hub":
                        await server._try_connect_event_hub()
                return result

            elif action == "deregister":
                mid = body.get("module_id")
                if not mid:
                    raise HTTPException(400, "module_id required")
                if caller != "launcher" and caller != mid:
                    raise HTTPException(403, f"Module '{caller}' cannot deregister '{mid}'")
                result = server.store.deregister_module(mid)
                if result.get("ok"):
                    await server._publish_event("module.unregistered", {"module_id": mid})
                return result

            elif action == "heartbeat":
                mid = body.get("module_id")
                if not mid:
                    raise HTTPException(400, "module_id required")
                if caller != "launcher" and caller != mid:
                    raise HTTPException(403, f"Module '{caller}' cannot heartbeat for '{mid}'")
                result = server.store.heartbeat(mid)
                if result.get("ok"):
                    await server._publish_event("module.heartbeat", {"module_id": mid})
                return result

            else:
                raise HTTPException(400, f"Unknown action: {action}")

        # ── 2. GET /lookup ──

        @app.get("/lookup")
        async def lookup(request: Request, field: str = None, module: str = None, value: str = None):
            server._require_auth(request)
            return server.store.lookup(field=field, module=module, value=value)

        # ── 3. GET /get/{path} ──

        @app.get("/get/{path:path}")
        async def get_by_path(request: Request, path: str):
            server._require_auth(request)
            val, found = server.store.get_by_path(path)
            if not found:
                raise HTTPException(404, f"Path not found: {path}")
            return val

        # ── 4. POST /tokens ──

        @app.post("/tokens")
        async def register_tokens(request: Request):
            server._require_launcher(request)
            body = await server._read_json_object(request)
            server.store.register_tokens(body)
            return {"ok": True}

        # ── 5. POST /verify ──

        @app.post("/verify")
        async def verify_token(request: Request):
            server._require_auth(request)
            body = await server._read_json_object(request)
            target_token = body.get("token", "")
            # Same rule as _require_auth: an empty token is never valid.
            module_id = server.store.verify_token(target_token) if target_token else None
            if module_id:
                return {"ok": True, "module_id": module_id}
            return {"ok": False}

        # ── 6. POST /query (stub) ──
        # TODO: implement LLM semantic query per design §5.1
        # accept {"question": "..."}, search registry with LLM, return matched modules/tools

        @app.post("/query")
        async def query(request: Request):
            server._require_auth(request)
            body = await server._read_json_object(request)
            question = body.get("question", "")
            return {"ok": False, "error": "LLM query not implemented yet", "question": question}

        # ── 7. GET /health ──

        @app.get("/health")
        async def health():
            return {
                "status": "healthy",
                "module_count": len(server.store.modules),
                "online_count": sum(
                    1 for m in server.store.modules.values()
                    if m.get("status") == "online"
                ),
            }

        return app

    async def _ttl_loop(self):
        """Every 10 seconds, expire stale heartbeats and publish offline events."""
        while True:
            await asyncio.sleep(10)
            expired = self.store.check_ttl()
            for mid in expired:
                await self._publish_event("module.offline", {"module_id": mid})
@@ -0,0 +1,232 @@
1
+ """
2
+ Registry in-memory store.
3
+ Manages module records, token verification, heartbeat TTL, lookup and get-by-path.
4
+ No persistence — Registry crash triggers Kite full restart, all modules re-register.
5
+ """
6
+
7
+ import fnmatch
8
+ import time
9
+ from typing import Any
10
+
11
+
12
+ class RegistryStore:
13
+
14
+ def __init__(self, launcher_token: str):
15
+ self.launcher_token = launcher_token
16
+ self.token_map: dict[str, str] = {} # module_id -> token
17
+ self.modules: dict[str, dict] = {} # module_id -> registration payload
18
+ self.heartbeats: dict[str, float] = {} # module_id -> last heartbeat timestamp
19
+ self.ttl = 60 # seconds before marking offline
20
+ self.heartbeat_interval = 30
21
+
22
+ # ── Token verification ──
23
+
24
+ def verify_token(self, token: str) -> str | None:
25
+ """Return module_id if token is valid, None otherwise."""
26
+ if token == self.launcher_token:
27
+ return "launcher"
28
+ for mid, tok in self.token_map.items():
29
+ if token == tok:
30
+ return mid
31
+ return None
32
+
33
+ def is_launcher(self, token: str) -> bool:
34
+ return token == self.launcher_token
35
+
36
+ def register_tokens(self, mapping: dict[str, str]):
37
+ """Register module_id -> token mapping. Only Launcher may call this."""
38
+ self.token_map.update(mapping)
39
+
40
+ # ── Module lifecycle ──
41
+
42
+ _REQUIRED_FIELDS = ("module_id", "module_type", "api_endpoint")
43
+
44
+ def register_module(self, data: dict) -> dict:
45
+ """Register or update a module. Idempotent — same module_id overwrites."""
46
+ # Validate required fields
47
+ missing = [f for f in self._REQUIRED_FIELDS if not data.get(f)]
48
+ if missing:
49
+ return {"ok": False, "error": f"Missing required fields: {', '.join(missing)}"}
50
+
51
+ mid = data["module_id"]
52
+
53
+ # Warn on tool name conflicts
54
+ new_tools = data.get("tools", {})
55
+ if isinstance(new_tools, dict):
56
+ for tool_name in new_tools:
57
+ for other_mid, other_data in self.modules.items():
58
+ if other_mid == mid:
59
+ continue
60
+ if tool_name in other_data.get("tools", {}):
61
+ print(f"[registry] WARNING: tool '{tool_name}' registered by both '{other_mid}' and '{mid}'")
62
+
63
+ # Strip action field — it's a request verb, not part of the registration payload
64
+ record = {k: v for k, v in data.items() if k != "action"}
65
+ record["status"] = "online"
66
+ record["registered_at"] = time.time()
67
+ self.modules[mid] = record
68
+ self.heartbeats[mid] = time.time()
69
+ return {"ok": True, "ttl": self.ttl, "heartbeat_interval": self.heartbeat_interval}
70
+
71
+ def deregister_module(self, module_id: str) -> dict:
72
+ """Remove a module record immediately."""
73
+ self.modules.pop(module_id, None)
74
+ self.heartbeats.pop(module_id, None)
75
+ return {"ok": True}
76
+
77
+ def heartbeat(self, module_id: str) -> dict:
78
+ """Renew heartbeat for a module."""
79
+ if module_id not in self.modules:
80
+ return {"ok": False, "error": "module not registered"}
81
+ self.heartbeats[module_id] = time.time()
82
+ self.modules[module_id]["status"] = "online"
83
+ return {"ok": True}
84
+
85
+ def check_ttl(self) -> list[str]:
86
+ """Mark modules as offline if heartbeat expired. Returns list of newly-offline module_ids."""
87
+ now = time.time()
88
+ expired = []
89
+ for mid, last in list(self.heartbeats.items()):
90
+ if mid in self.modules and now - last > self.ttl:
91
+ if self.modules[mid].get("status") != "offline":
92
+ self.modules[mid]["status"] = "offline"
93
+ expired.append(mid)
94
+ return expired
95
+
96
+ # ── Get by dot-path ──
97
+
98
+ def get_by_path(self, path: str) -> Any | None:
99
+ """
100
+ Resolve a dot-path like 'kernel.tools.read_file.endpoint'.
101
+ First segment is module_id, rest navigates into the registration dict.
102
+ Handles dotted keys (e.g. 'session.ended') by greedy matching.
103
+ Returns (value, True) on hit, (None, False) on miss.
104
+ """
105
+ parts = path.split(".")
106
+ if not parts:
107
+ return None, False
108
+
109
+ # First segment: module_id
110
+ module_id = parts[0]
111
+ data = self.modules.get(module_id)
112
+ if data is None:
113
+ return None, False
114
+ if len(parts) == 1:
115
+ return data, True
116
+
117
+ return self._resolve_path(data, parts[1:])
118
+
119
+ @staticmethod
120
+ def _resolve_path(obj: Any, parts: list[str]) -> tuple[Any, bool]:
121
+ """
122
+ Walk into obj using parts. Greedy: try joining multiple parts
123
+ as a single dotted key before falling back to single-segment traversal.
124
+ """
125
+ if not parts:
126
+ return obj, True
127
+ if not isinstance(obj, dict):
128
+ return None, False
129
+
130
+ # Try longest key first (greedy match for dotted keys like 'session.ended')
131
+ for i in range(len(parts), 0, -1):
132
+ candidate = ".".join(parts[:i])
133
+ if candidate in obj:
134
+ return RegistryStore._resolve_path(obj[candidate], parts[i:])
135
+
136
+ return None, False
137
+
138
+ # ── Lookup (glob search) ──
139
+
140
+ def lookup(self, field: str = None, module: str = None, value: str = None) -> list[dict]:
141
+ """
142
+ Search across all online modules. All three params support glob patterns.
143
+ Returns list of {field, module, api_endpoint, value}.
144
+ """
145
+ results = []
146
+ for mid, data in self.modules.items():
147
+ if data.get("status") != "online":
148
+ continue
149
+ if module and not fnmatch.fnmatch(mid, module):
150
+ continue
151
+
152
+ api_ep = data.get("api_endpoint", "")
153
+
154
+ if field:
155
+ matches = self._match_fields(data, field)
156
+ for fpath, fval in matches:
157
+ if value and not self._value_matches(fval, value):
158
+ continue
159
+ results.append({
160
+ "field": fpath,
161
+ "module": mid,
162
+ "api_endpoint": api_ep,
163
+ "value": fval,
164
+ })
165
+ elif value:
166
+ for k, v in data.items():
167
+ if self._value_matches(v, value):
168
+ results.append({
169
+ "field": k,
170
+ "module": mid,
171
+ "api_endpoint": api_ep,
172
+ "value": v,
173
+ })
174
+ else:
175
+ results.append({
176
+ "field": "module_id",
177
+ "module": mid,
178
+ "api_endpoint": api_ep,
179
+ "value": mid,
180
+ })
181
+
182
+ return results
183
+
184
+ # ── Lookup helpers ──
185
+
186
+ @staticmethod
187
+ def _match_fields(data: dict, pattern: str) -> list[tuple[str, Any]]:
188
+ """
189
+ Walk a module's registration dict and find all field paths matching a glob pattern.
190
+ Pattern like 'tools.*' matches top-level keys under 'tools'.
191
+ Pattern like 'tools.*file*' matches tool names containing 'file'.
192
+ Pattern like 'events_publish.session.ended' matches dotted keys via greedy join.
193
+ Returns list of (field_path, value).
194
+ """
195
+ parts = pattern.split(".")
196
+ return RegistryStore._walk_fields(data, parts, "")
197
+
198
+ @staticmethod
199
+ def _walk_fields(obj: Any, parts: list[str], prefix: str) -> list[tuple[str, Any]]:
200
+ """Recursively walk dict, matching glob pattern segments against keys."""
201
+ if not parts:
202
+ return [(prefix, obj)] if prefix else []
203
+ if not isinstance(obj, dict):
204
+ return []
205
+
206
+ results = []
207
+ # Try greedy: join multiple pattern segments to match dotted keys
208
+ for i in range(len(parts), 0, -1):
209
+ candidate_pattern = ".".join(parts[:i])
210
+ remaining = parts[i:]
211
+
212
+ for key in obj:
213
+ if fnmatch.fnmatch(key, candidate_pattern):
214
+ new_prefix = f"{prefix}.{key}" if prefix else key
215
+ if remaining:
216
+ results.extend(
217
+ RegistryStore._walk_fields(obj[key], remaining, new_prefix)
218
+ )
219
+ else:
220
+ results.append((new_prefix, obj[key]))
221
+
222
+ if results:
223
+ break # greedy: longest match wins
224
+
225
+ return results
226
+
227
+ @staticmethod
228
+ def _value_matches(val: Any, pattern: str) -> bool:
229
+ """Check if a value matches a glob pattern. Dicts/lists never match."""
230
+ if isinstance(val, (dict, list)):
231
+ return False
232
+ return fnmatch.fnmatch(str(val), pattern)
File without changes
File without changes
File without changes
@@ -0,0 +1,143 @@
1
+ """
2
+ Watchdog entry point.
3
+ Reads boot_info from stdin, registers to Registry, starts health monitor.
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import socket
9
+ import sys
10
+
11
+ import httpx
12
+ import uvicorn
13
+
14
# Ensure project root is on sys.path.
# The root is three directory levels above this file's directory, and must be
# importable before the `extensions.*` imports that follow.
_this_dir = os.path.dirname(os.path.abspath(__file__))
_project_root = os.path.dirname(os.path.dirname(os.path.dirname(_this_dir)))
if _project_root not in sys.path:
    # Prepend so the project's own packages are found first.
    sys.path.insert(0, _project_root)
19
+
20
+ from extensions.services.watchdog.monitor import HealthMonitor
21
+ from extensions.services.watchdog.server import WatchdogServer
22
+
23
+
24
+ def _get_free_port() -> int:
25
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
26
+ s.bind(("127.0.0.1", 0))
27
+ return s.getsockname()[1]
28
+
29
+
30
def _register_to_registry(token: str, registry_url: str, port: int):
    """Register the watchdog module with the Registry.

    Sends a ``register`` action to ``{registry_url}/modules`` and prints the
    outcome. A response counts as success only when the HTTP status is 200
    and the JSON body does not report ``"ok": false``, because the Registry
    answers 200 even when it rejects the payload.

    Args:
        token: Bearer token sent in the Authorization header.
        registry_url: Base URL of the Registry HTTP server.
        port: Local port the watchdog API will listen on.

    Errors raised by ``httpx.post`` (e.g. connection failures) are not caught
    here and propagate to the caller.
    """
    payload = {
        "action": "register",
        "module_id": "watchdog",
        "module_type": "service",
        "name": "Watchdog",
        "api_endpoint": f"http://127.0.0.1:{port}",
        "health_endpoint": "/health",
        "events_publish": {
            "watchdog.module.unhealthy": {"description": "Module failed health check"},
            "watchdog.module.recovered": {"description": "Module recovered from unhealthy"},
            "watchdog.alert": {"description": "Module restarted too many times"},
        },
        "events_subscribe": [
            "module.started",
            "module.stopped",
        ],
    }
    headers = {"Authorization": f"Bearer {token}"}
    resp = httpx.post(
        f"{registry_url}/modules",
        json=payload, headers=headers, timeout=5,
    )
    if resp.status_code != 200:
        print(f"[watchdog] WARNING: Registry returned {resp.status_code}")
        return

    # The body is inspected only to detect an explicit rejection; an
    # unparsable or non-object body keeps the previous "registered" outcome.
    try:
        result = resp.json()
    except ValueError:
        result = None
    if isinstance(result, dict) and not result.get("ok", True):
        print(f"[watchdog] WARNING: Registry rejected registration: {result.get('error', 'unknown error')}")
        return

    print("[watchdog] Registered to Registry")
57
+
58
+
59
+ def _get_launcher_url(token: str, registry_url: str) -> str:
60
+ """Discover Launcher API endpoint from Registry."""
61
+ headers = {"Authorization": f"Bearer {token}"}
62
+ try:
63
+ resp = httpx.get(
64
+ f"{registry_url}/get/launcher.api_endpoint",
65
+ headers=headers, timeout=5,
66
+ )
67
+ if resp.status_code == 200:
68
+ return resp.json()
69
+ except Exception:
70
+ pass
71
+ return ""
72
+
73
+
74
+ def _get_event_hub_ws(token: str, registry_url: str) -> str:
75
+ """Discover Event Hub WebSocket endpoint from Registry, with retry."""
76
+ import time
77
+ headers = {"Authorization": f"Bearer {token}"}
78
+ deadline = time.time() + 10
79
+ while time.time() < deadline:
80
+ try:
81
+ resp = httpx.get(
82
+ f"{registry_url}/get/event_hub.metadata.ws_endpoint",
83
+ headers=headers, timeout=3,
84
+ )
85
+ if resp.status_code == 200:
86
+ val = resp.json()
87
+ if val:
88
+ return val
89
+ except Exception:
90
+ pass
91
+ time.sleep(1)
92
+ return ""
93
+
94
+
95
def main():
    """Start the watchdog service.

    Reads one JSON line of boot_info from stdin (keys ``token`` and
    ``registry_port``), registers with the Registry, discovers the Launcher
    URL and the Event Hub WebSocket endpoint, then serves the watchdog app
    with uvicorn on a free loopback port. Exits with status 1 when the token
    or registry port is missing.
    """
    # Read boot_info from stdin
    token = ""
    registry_port = 0
    try:
        line = sys.stdin.readline().strip()
        if line:
            boot_info = json.loads(line)
            token = boot_info.get("token", "")
            registry_port = boot_info.get("registry_port", 0)
    except Exception as exc:
        # Still handled by the check below, but report the cause so a
        # malformed boot_info is distinguishable from an absent one.
        print(f"[watchdog] WARNING: Could not parse boot_info: {exc}")

    if not token or not registry_port:
        print("[watchdog] ERROR: Missing token or registry_port in boot_info")
        sys.exit(1)

    # Only the token length is logged, never the token itself.
    print(f"[watchdog] Token received ({len(token)} chars), registry port: {registry_port}")

    registry_url = f"http://127.0.0.1:{registry_port}"
    port = _get_free_port()

    # Register to Registry
    _register_to_registry(token, registry_url, port)

    # Discover Launcher URL
    launcher_url = _get_launcher_url(token, registry_url)
    if not launcher_url:
        print("[watchdog] WARNING: Could not discover Launcher URL, restart disabled")

    # Discover Event Hub WebSocket URL
    event_hub_ws = _get_event_hub_ws(token, registry_url)
    if not event_hub_ws:
        print("[watchdog] WARNING: Could not discover Event Hub WS, events disabled")

    # Create monitor and server
    monitor = HealthMonitor(
        own_token=token,
        registry_url=registry_url,
        launcher_url=launcher_url,
    )
    server = WatchdogServer(monitor, token=token, event_hub_ws=event_hub_ws)

    print(f"[watchdog] Starting on port {port}")
    uvicorn.run(server.app, host="127.0.0.1", port=port, log_level="warning")
140
+
141
+
142
# Run the watchdog only when this file is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,25 @@
1
+ ---
2
+ name: watchdog
3
+ display_name: Watchdog
4
+ version: "1.0"
5
+ type: service
6
+ state: enabled
7
+ runtime: python
8
+ entry: entry.py
9
+ events:
10
+ - watchdog.module.unhealthy
11
+ - watchdog.module.recovered
12
+ - watchdog.alert
13
+ subscriptions:
14
+ - module.started
15
+ - module.stopped
16
+ ---
17
+
18
+ # Watchdog(保活模块)
19
+
20
+ 应用层健康监控扩展模块。Launcher 处理进程级故障,Watchdog 处理应用级故障(进程活着但不健康)。
21
+
22
+ - 心跳检测 — 定期检查各模块的 `/health` 端点
23
+ - 资源监控 — 检测模块健康状态中的异常指标
24
+ - 非 core 模块重启 — 发现异常时通过 Launcher API 重启模块
25
+ - 告警通知 — 模块持续异常时,通过 Event Hub 发布告警事件