@agentunion/kite 1.0.7 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +208 -0
- package/README.md +48 -0
- package/cli.js +1 -1
- package/extensions/agents/__init__.py +1 -0
- package/extensions/agents/assistant/__init__.py +1 -0
- package/extensions/agents/assistant/entry.py +329 -0
- package/extensions/agents/assistant/module.md +22 -0
- package/extensions/agents/assistant/server.py +197 -0
- package/extensions/channels/__init__.py +1 -0
- package/extensions/channels/acp_channel/__init__.py +1 -0
- package/extensions/channels/acp_channel/entry.py +329 -0
- package/extensions/channels/acp_channel/module.md +22 -0
- package/extensions/channels/acp_channel/server.py +197 -0
- package/extensions/event_hub_bench/entry.py +624 -379
- package/extensions/event_hub_bench/module.md +2 -1
- package/extensions/services/backup/__init__.py +1 -0
- package/extensions/services/backup/entry.py +508 -0
- package/extensions/services/backup/module.md +22 -0
- package/extensions/services/model_service/__init__.py +1 -0
- package/extensions/services/model_service/entry.py +508 -0
- package/extensions/services/model_service/module.md +22 -0
- package/extensions/services/watchdog/entry.py +468 -102
- package/extensions/services/watchdog/module.md +3 -0
- package/extensions/services/watchdog/monitor.py +170 -69
- package/extensions/services/web/__init__.py +1 -0
- package/extensions/services/web/config.yaml +149 -0
- package/extensions/services/web/entry.py +390 -0
- package/extensions/services/web/module.md +24 -0
- package/extensions/services/web/routes/__init__.py +1 -0
- package/extensions/services/web/routes/routes_call.py +189 -0
- package/extensions/services/web/routes/routes_config.py +512 -0
- package/extensions/services/web/routes/routes_contacts.py +98 -0
- package/extensions/services/web/routes/routes_devlog.py +99 -0
- package/extensions/services/web/routes/routes_phone.py +81 -0
- package/extensions/services/web/routes/routes_sms.py +48 -0
- package/extensions/services/web/routes/routes_stats.py +17 -0
- package/extensions/services/web/routes/routes_voicechat.py +554 -0
- package/extensions/services/web/routes/schemas.py +216 -0
- package/extensions/services/web/server.py +375 -0
- package/extensions/services/web/static/css/style.css +1064 -0
- package/extensions/services/web/static/index.html +1445 -0
- package/extensions/services/web/static/js/app.js +4671 -0
- package/extensions/services/web/vendor/__init__.py +1 -0
- package/extensions/services/web/vendor/bluetooth/audio.py +348 -0
- package/extensions/services/web/vendor/bluetooth/contacts.py +251 -0
- package/extensions/services/web/vendor/bluetooth/manager.py +395 -0
- package/extensions/services/web/vendor/bluetooth/sms.py +290 -0
- package/extensions/services/web/vendor/bluetooth/telephony.py +274 -0
- package/extensions/services/web/vendor/config.py +139 -0
- package/extensions/services/web/vendor/conversation/asr.py +936 -0
- package/extensions/services/web/vendor/conversation/engine.py +548 -0
- package/extensions/services/web/vendor/conversation/llm.py +534 -0
- package/extensions/services/web/vendor/conversation/mcp_tools.py +190 -0
- package/extensions/services/web/vendor/conversation/tts.py +322 -0
- package/extensions/services/web/vendor/conversation/vad.py +138 -0
- package/extensions/services/web/vendor/storage/__init__.py +1 -0
- package/extensions/services/web/vendor/storage/identity.py +312 -0
- package/extensions/services/web/vendor/storage/store.py +507 -0
- package/extensions/services/web/vendor/task/manager.py +864 -0
- package/extensions/services/web/vendor/task/models.py +45 -0
- package/extensions/services/web/vendor/task/webhook.py +263 -0
- package/extensions/services/web/vendor/tools/registry.py +321 -0
- package/kernel/__init__.py +0 -0
- package/kernel/entry.py +407 -0
- package/{core/event_hub/hub.py → kernel/event_hub.py} +62 -74
- package/kernel/module.md +33 -0
- package/{core/registry/store.py → kernel/registry_store.py} +23 -8
- package/kernel/rpc_router.py +388 -0
- package/kernel/server.py +267 -0
- package/launcher/__init__.py +10 -0
- package/launcher/__main__.py +6 -0
- package/launcher/count_lines.py +258 -0
- package/launcher/entry.py +1778 -0
- package/launcher/logging_setup.py +289 -0
- package/{core/launcher → launcher}/module_scanner.py +11 -6
- package/launcher/process_manager.py +880 -0
- package/main.py +11 -210
- package/package.json +6 -9
- package/__init__.py +0 -1
- package/__main__.py +0 -15
- package/core/event_hub/BENCHMARK.md +0 -94
- package/core/event_hub/bench.py +0 -459
- package/core/event_hub/bench_extreme.py +0 -308
- package/core/event_hub/bench_perf.py +0 -350
- package/core/event_hub/entry.py +0 -157
- package/core/event_hub/module.md +0 -20
- package/core/event_hub/server.py +0 -206
- package/core/launcher/entry.py +0 -1158
- package/core/launcher/process_manager.py +0 -470
- package/core/registry/entry.py +0 -110
- package/core/registry/module.md +0 -30
- package/core/registry/server.py +0 -289
- package/extensions/services/watchdog/server.py +0 -167
- /package/{core → extensions/services/web/vendor/bluetooth}/__init__.py +0 -0
- /package/{core/event_hub → extensions/services/web/vendor/conversation}/__init__.py +0 -0
- /package/{core/launcher → extensions/services/web/vendor/task}/__init__.py +0 -0
- /package/{core/registry → extensions/services/web/vendor/tools}/__init__.py +0 -0
- /package/{core/event_hub → kernel}/dedup.py +0 -0
- /package/{core/event_hub → kernel}/router.py +0 -0
- /package/{core/launcher → launcher}/module.md +0 -0
|
@@ -1,100 +1,302 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Watchdog entry point.
|
|
3
|
-
|
|
3
|
+
Connects to Kernel via WebSocket JSON-RPC 2.0, registers, subscribes to events,
|
|
4
|
+
runs health monitor loop, handles incoming RPC requests.
|
|
4
5
|
"""
|
|
5
6
|
|
|
7
|
+
import asyncio
|
|
8
|
+
import builtins
|
|
6
9
|
import json
|
|
7
10
|
import os
|
|
8
|
-
import
|
|
11
|
+
import re
|
|
9
12
|
import sys
|
|
13
|
+
import threading
|
|
14
|
+
import time
|
|
15
|
+
import traceback
|
|
16
|
+
import uuid
|
|
17
|
+
from datetime import datetime, timezone
|
|
10
18
|
|
|
11
|
-
import
|
|
12
|
-
import uvicorn
|
|
19
|
+
import websockets
|
|
13
20
|
|
|
14
|
-
|
|
21
|
+
|
|
22
|
+
# ── Module configuration ──
|
|
23
|
+
MODULE_NAME = "watchdog"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _fmt_elapsed(t0: float) -> str:
    """Render the time elapsed since the ``time.monotonic()`` reading ``t0``.

    Below one second the result is whole milliseconds ('NNNms'); from one
    up to ten seconds it is seconds with one decimal ('N.Ns'); from ten
    seconds on it is whole seconds ('NNs').
    """
    seconds = time.monotonic() - t0
    if seconds < 1:
        text = f"{seconds * 1000:.0f}ms"
    elif seconds < 10:
        text = f"{seconds:.1f}s"
    else:
        text = f"{seconds:.0f}s"
    return text
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ── Safe stdout/stderr: ignore BrokenPipeError after Launcher closes stdio ──
|
|
37
|
+
|
|
38
|
+
class _SafeWriter:
    """Stream proxy whose ``write``/``flush`` never raise ``OSError``.

    Used to wrap stdout/stderr so that output attempted after the pipe has
    been closed (``BrokenPipeError`` is a subclass of ``OSError``) is
    silently dropped. Every other attribute is delegated to the wrapped
    stream.
    """

    def __init__(self, stream):
        self._stream = stream

    def write(self, s):
        """Write ``s`` to the wrapped stream and return the count written.

        Returns whatever the wrapped stream reports. When the write fails
        with ``OSError`` (or the stream reports nothing), ``len(s)`` is
        returned so callers see the data as consumed instead of retrying.
        """
        try:
            written = self._stream.write(s)
        except OSError:
            return len(s)
        return len(s) if written is None else written

    def flush(self):
        """Flush the wrapped stream, ignoring ``OSError``."""
        try:
            self._stream.flush()
        except OSError:
            pass

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails. If ``_stream``
        # itself is missing (instance created without __init__, e.g. by
        # copy/pickle), delegating would recurse forever.
        if name == "_stream":
            raise AttributeError(name)
        return getattr(self._stream, name)
|
|
57
|
+
|
|
58
|
+
sys.stdout = _SafeWriter(sys.stdout)
|
|
59
|
+
sys.stderr = _SafeWriter(sys.stderr)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# ── Timestamped print + log file writer ──
|
|
63
|
+
|
|
64
|
+
_builtin_print = builtins.print
|
|
65
|
+
_start_ts = time.monotonic()
|
|
66
|
+
_last_ts = time.monotonic()
|
|
67
|
+
_ANSI_RE = re.compile(r"\033\[[0-9;]*m")
|
|
68
|
+
_log_lock = threading.Lock()
|
|
69
|
+
_log_latest_path = None
|
|
70
|
+
_log_daily_path = None
|
|
71
|
+
_log_daily_date = ""
|
|
72
|
+
_log_dir = None
|
|
73
|
+
_crash_log_path = None
|
|
74
|
+
|
|
75
|
+
def _strip_ansi(s: str) -> str:
    """Return ``s`` with every match of the module-level ``_ANSI_RE`` removed."""
    cleaned = re.sub(_ANSI_RE, "", s)
    return cleaned
|
|
77
|
+
|
|
78
|
+
def _resolve_daily_log_path():
    """Point ``_log_daily_path`` at today's log file under ``_log_dir``.

    The file lives at ``<_log_dir>/<YYYY-MM>/<YYYY-MM-DD>.log``; the month
    directory is created on demand. Nothing happens when ``_log_dir`` is
    unset or the path for today's date has already been resolved.

    Creating the directory is best-effort: if it fails, the daily log is
    disabled (``_log_daily_path = None``) instead of raising, because this
    function is called on every logged print. ``_log_daily_date`` is left
    unchanged in that case so the next call tries again.
    """
    global _log_daily_path, _log_daily_date
    if not _log_dir:
        return
    today = datetime.now().strftime("%Y-%m-%d")
    if today == _log_daily_date and _log_daily_path:
        return
    month_dir = os.path.join(_log_dir, today[:7])  # "YYYY-MM"
    try:
        os.makedirs(month_dir, exist_ok=True)
    except OSError:
        _log_daily_path = None
        return
    _log_daily_path = os.path.join(month_dir, f"{today}.log")
    _log_daily_date = today
|
|
90
|
+
|
|
91
|
+
def _write_log(plain_line: str):
    """Append ``plain_line`` to the latest log, then to the daily log.

    Both writes happen while holding ``_log_lock``. The daily path is
    re-resolved between the two writes. A failure to open or write either
    file is ignored.
    """
    def _append(path):
        # Unset path: that log target is disabled.
        if not path:
            return
        try:
            with open(path, "a", encoding="utf-8") as handle:
                handle.write(plain_line)
        except Exception:
            pass

    with _log_lock:
        _append(_log_latest_path)
        _resolve_daily_log_path()
        _append(_log_daily_path)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _write_crash(exc_type, exc_value, exc_tb, thread_name=None, severity="critical", handled=False):
    """Serialize one exception as a JSON line and append it to the crash logs.

    The line goes to ``_crash_log_path`` (when set) and to a per-day archive
    ``<_log_dir>/crashes/<YYYY-MM>/<YYYY-MM-DD>.jsonl`` (when ``_log_dir`` is
    set). Failures while writing either file are ignored.

    Args:
        exc_type: Exception class; ``"Unknown"`` is recorded when falsy.
        exc_value: Exception instance; recorded via ``str()``.
        exc_tb: Traceback; its last frame supplies the ``context`` entry.
        thread_name: Recorded thread name; defaults to the current thread's.
        severity: Stored verbatim in the record.
        handled: Stored verbatim in the record.
    """
    record = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "module": MODULE_NAME,
        "thread": thread_name or threading.current_thread().name,
        "exception_type": exc_type.__name__ if exc_type else "Unknown",
        "exception_message": str(exc_value),
        "traceback": "".join(traceback.format_exception(exc_type, exc_value, exc_tb)),
        "severity": severity,
        "handled": handled,
        "process_id": os.getpid(),
        "platform": sys.platform,
        "runtime_version": f"Python {sys.version.split()[0]}",
    }

    # The innermost frame is where the exception was raised.
    frames = traceback.extract_tb(exc_tb) if exc_tb else []
    if frames:
        innermost = frames[-1]
        record["context"] = {
            "function": innermost.name,
            "file": os.path.basename(innermost.filename),
            "line": innermost.lineno,
        }

    payload = json.dumps(record, ensure_ascii=False) + "\n"

    def _append(path):
        with open(path, "a", encoding="utf-8") as sink:
            sink.write(payload)

    if _crash_log_path:
        try:
            _append(_crash_log_path)
        except Exception:
            pass

    if _log_dir:
        try:
            day = datetime.now().strftime("%Y-%m-%d")
            month_dir = os.path.join(_log_dir, "crashes", day[:7])
            os.makedirs(month_dir, exist_ok=True)
            _append(os.path.join(month_dir, f"{day}.jsonl"))
        except Exception:
            pass
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _print_crash_summary(exc_type, exc_tb, thread_name=None):
    """Print a short, red-highlighted crash notice to the console.

    Output goes through ``_builtin_print``. The notice names the exception
    type and the ``file:line`` of the traceback's last frame ("unknown" when
    no traceback frames are available). When ``_crash_log_path`` is set, a
    second line points at the crash log file.

    Args:
        exc_type: Exception class; "Unknown" is shown when it is falsy,
            matching the fallback used by ``_write_crash``.
        exc_tb: Traceback object, or None.
        thread_name: When truthy, the notice names the crashed thread.
    """
    RED = "\033[91m"
    RESET = "\033[0m"

    location = "unknown"
    frames = traceback.extract_tb(exc_tb) if exc_tb else None
    if frames:
        last = frames[-1]
        location = f"{os.path.basename(last.filename)}:{last.lineno}"

    # Guard against a missing type so the reporter itself cannot raise.
    exc_name = exc_type.__name__ if exc_type else "Unknown"

    prefix = f"[{MODULE_NAME}]"
    if thread_name:
        _builtin_print(f"{prefix} {RED}线程 {thread_name} 崩溃: "
                       f"{exc_name} in {location}{RESET}")
    else:
        _builtin_print(f"{prefix} {RED}崩溃: {exc_name} in {location}{RESET}")
    if _crash_log_path:
        _builtin_print(f"{prefix} 崩溃日志: {_crash_log_path}")
|
|
179
|
+
|
|
180
|
+
def _setup_exception_hooks():
    """Install process-wide hooks that record uncaught exceptions.

    ``sys.excepthook`` is replaced by a wrapper that writes a crash record,
    prints a crash summary, and then calls the previously installed hook.
    When available, ``threading.excepthook`` is replaced by a hook that
    writes a crash record (severity "error") and prints a summary naming
    the thread; it does not call the previous thread hook.
    """
    _orig_excepthook = sys.excepthook

    def _excepthook(exc_type, exc_value, exc_tb):
        _write_crash(exc_type, exc_value, exc_tb, severity="critical", handled=False)
        _print_crash_summary(exc_type, exc_tb)
        _orig_excepthook(exc_type, exc_value, exc_tb)

    sys.excepthook = _excepthook

    if hasattr(threading, "excepthook"):
        def _thread_excepthook(args):
            # A thread ending via SystemExit is a normal exit, not a crash;
            # the default threading.excepthook silently ignores it too.
            if args.exc_type is SystemExit:
                return
            _write_crash(args.exc_type, args.exc_value, args.exc_traceback,
                         thread_name=args.thread.name if args.thread else "unknown",
                         severity="error", handled=False)
            _print_crash_summary(args.exc_type, args.exc_traceback,
                                 thread_name=args.thread.name if args.thread else None)

        threading.excepthook = _thread_excepthook
|
|
200
|
+
|
|
201
|
+
def _tprint(*args, **kwargs):
    """``print`` replacement that also mirrors output to the log files.

    The arguments are forwarded unchanged to the original builtin print.
    If a latest or daily log path is configured, the same text (ANSI
    sequences stripped) is appended to the logs, prefixed with
    ``[elapsed-since-start] HH:MM:SS.mmm +delta-since-previous-print``.

    ``sep=None`` and ``end=None`` are treated as the builtin's defaults
    (" " and "\\n"), as ``print`` itself does, so such calls no longer fail
    in the log branch.
    """
    global _last_ts
    now = time.monotonic()
    elapsed = now - _start_ts
    delta = now - _last_ts
    _last_ts = now

    def _fmt(seconds):
        # <1s -> 'NNNms', <100s -> 'N.Ns', otherwise whole seconds.
        if seconds < 1:
            return f"{seconds * 1000:.0f}ms"
        if seconds < 100:
            return f"{seconds:.1f}s"
        return f"{seconds:.0f}s"

    elapsed_str = _fmt(elapsed)
    # Sub-millisecond gaps are shown as an empty delta column.
    delta_str = "" if delta < 0.001 else "+" + _fmt(delta)

    ts = datetime.now().strftime("%H:%M:%S.%f")[:-3]

    _builtin_print(*args, **kwargs)

    if _log_latest_path or _log_daily_path:
        sep = kwargs.get("sep")
        if sep is None:
            sep = " "
        end = kwargs.get("end")
        if end is None:
            end = "\n"
        text = sep.join(str(a) for a in args)
        prefix = f"[{elapsed_str:>6}] {ts} {delta_str:>8} "
        _write_log(prefix + _strip_ansi(text) + end)
|
|
235
|
+
|
|
236
|
+
builtins.print = _tprint
|
|
237
|
+
|
|
238
|
+
# Ensure project root is on sys.path
|
|
15
239
|
_project_root = os.environ.get("KITE_PROJECT") or os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
|
16
240
|
if _project_root not in sys.path:
|
|
17
241
|
sys.path.insert(0, _project_root)
|
|
18
242
|
|
|
19
243
|
from extensions.services.watchdog.monitor import HealthMonitor
|
|
20
|
-
from extensions.services.watchdog.server import WatchdogServer
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
def _get_free_port() -> int:
|
|
24
|
-
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
25
|
-
s.bind(("127.0.0.1", 0))
|
|
26
|
-
return s.getsockname()[1]
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def _register_to_registry(token: str, registry_url: str, port: int):
|
|
30
|
-
payload = {
|
|
31
|
-
"action": "register",
|
|
32
|
-
"module_id": "watchdog",
|
|
33
|
-
"module_type": "service",
|
|
34
|
-
"name": "Watchdog",
|
|
35
|
-
"api_endpoint": f"http://127.0.0.1:{port}",
|
|
36
|
-
"health_endpoint": "/health",
|
|
37
|
-
"events_publish": {
|
|
38
|
-
"watchdog.module.unhealthy": {"description": "Module failed health check"},
|
|
39
|
-
"watchdog.module.recovered": {"description": "Module recovered from unhealthy"},
|
|
40
|
-
"watchdog.alert": {"description": "Module restarted too many times"},
|
|
41
|
-
},
|
|
42
|
-
"events_subscribe": [
|
|
43
|
-
"module.started",
|
|
44
|
-
"module.stopped",
|
|
45
|
-
],
|
|
46
|
-
}
|
|
47
|
-
headers = {"Authorization": f"Bearer {token}"}
|
|
48
|
-
resp = httpx.post(
|
|
49
|
-
f"{registry_url}/modules",
|
|
50
|
-
json=payload, headers=headers, timeout=5,
|
|
51
|
-
)
|
|
52
|
-
if resp.status_code == 200:
|
|
53
|
-
print("[watchdog] Registered to Registry")
|
|
54
|
-
else:
|
|
55
|
-
print(f"[watchdog] WARNING: Registry returned {resp.status_code}")
|
|
56
|
-
|
|
57
244
|
|
|
58
|
-
def _get_launcher_url(token: str, registry_url: str) -> str:
|
|
59
|
-
"""Discover Launcher API endpoint from Registry."""
|
|
60
|
-
headers = {"Authorization": f"Bearer {token}"}
|
|
61
|
-
try:
|
|
62
|
-
resp = httpx.get(
|
|
63
|
-
f"{registry_url}/get/launcher.api_endpoint",
|
|
64
|
-
headers=headers, timeout=5,
|
|
65
|
-
)
|
|
66
|
-
if resp.status_code == 200:
|
|
67
|
-
return resp.json()
|
|
68
|
-
except Exception:
|
|
69
|
-
pass
|
|
70
|
-
return ""
|
|
71
245
|
|
|
246
|
+
def _read_stdin_kite_message(expected_type: str, timeout: float = 10) -> dict | None:
|
|
247
|
+
"""Read a single kite message of expected type from stdin with timeout."""
|
|
248
|
+
result = [None]
|
|
72
249
|
|
|
73
|
-
def
|
|
74
|
-
"""Discover Event Hub WebSocket endpoint from Registry, with retry."""
|
|
75
|
-
import time
|
|
76
|
-
headers = {"Authorization": f"Bearer {token}"}
|
|
77
|
-
deadline = time.time() + 10
|
|
78
|
-
while time.time() < deadline:
|
|
250
|
+
def _read():
|
|
79
251
|
try:
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
val = resp.json()
|
|
86
|
-
if val:
|
|
87
|
-
return val
|
|
252
|
+
line = sys.stdin.readline().strip()
|
|
253
|
+
if line:
|
|
254
|
+
msg = json.loads(line)
|
|
255
|
+
if isinstance(msg, dict) and msg.get("kite") == expected_type:
|
|
256
|
+
result[0] = msg
|
|
88
257
|
except Exception:
|
|
89
258
|
pass
|
|
90
|
-
|
|
91
|
-
|
|
259
|
+
|
|
260
|
+
t = threading.Thread(target=_read, daemon=True)
|
|
261
|
+
t.start()
|
|
262
|
+
t.join(timeout=timeout)
|
|
263
|
+
return result[0]
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
# Global WS reference for publish_event callback
|
|
267
|
+
_ws_global = None
|
|
92
268
|
|
|
93
269
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
270
|
+
|
|
271
|
+
async def main():
|
|
272
|
+
global _ws_global
|
|
273
|
+
# Initialize log file paths
|
|
274
|
+
global _log_dir, _log_latest_path, _crash_log_path
|
|
275
|
+
module_data = os.environ.get("KITE_MODULE_DATA")
|
|
276
|
+
if module_data:
|
|
277
|
+
_log_dir = os.path.join(module_data, "log")
|
|
278
|
+
os.makedirs(_log_dir, exist_ok=True)
|
|
279
|
+
suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
|
|
280
|
+
|
|
281
|
+
_log_latest_path = os.path.join(_log_dir, f"latest{suffix}.log")
|
|
282
|
+
try:
|
|
283
|
+
with open(_log_latest_path, "w", encoding="utf-8") as f:
|
|
284
|
+
pass
|
|
285
|
+
except Exception:
|
|
286
|
+
_log_latest_path = None
|
|
287
|
+
|
|
288
|
+
_crash_log_path = os.path.join(_log_dir, f"crashes{suffix}.jsonl")
|
|
289
|
+
try:
|
|
290
|
+
with open(_crash_log_path, "w", encoding="utf-8") as f:
|
|
291
|
+
pass
|
|
292
|
+
except Exception:
|
|
293
|
+
_crash_log_path = None
|
|
294
|
+
|
|
295
|
+
_resolve_daily_log_path()
|
|
296
|
+
|
|
297
|
+
_setup_exception_hooks()
|
|
298
|
+
|
|
299
|
+
_t0 = time.monotonic()
|
|
98
300
|
|
|
99
301
|
# Read boot_info from stdin (only token)
|
|
100
302
|
token = ""
|
|
@@ -106,42 +308,206 @@ def main():
|
|
|
106
308
|
except Exception:
|
|
107
309
|
pass
|
|
108
310
|
|
|
109
|
-
# Read
|
|
110
|
-
|
|
311
|
+
# Read kernel_port: env first (fast path), stdin fallback (parallel start)
|
|
312
|
+
kernel_port = int(os.environ.get("KITE_KERNEL_PORT", "0"))
|
|
313
|
+
if not kernel_port:
|
|
314
|
+
msg = _read_stdin_kite_message("kernel_port", timeout=10)
|
|
315
|
+
if msg:
|
|
316
|
+
kernel_port = int(msg.get("kernel_port", 0))
|
|
111
317
|
|
|
112
|
-
if not token or not
|
|
113
|
-
print("[watchdog] ERROR: Missing token or
|
|
318
|
+
if not token or not kernel_port:
|
|
319
|
+
print("[watchdog] ERROR: Missing token or kernel_port")
|
|
114
320
|
sys.exit(1)
|
|
115
321
|
|
|
116
|
-
print(f"[watchdog] Token received ({len(token)} chars),
|
|
322
|
+
print(f"[watchdog] Token received ({len(token)} chars), kernel port: {kernel_port} ({_fmt_elapsed(_t0)})")
|
|
117
323
|
|
|
118
|
-
|
|
119
|
-
|
|
324
|
+
# Connect to Kernel WebSocket
|
|
325
|
+
ws_url = f"ws://127.0.0.1:{kernel_port}/ws?token={token}&id=watchdog"
|
|
326
|
+
print(f"[watchdog] Connecting to Kernel: {ws_url}")
|
|
120
327
|
|
|
121
|
-
|
|
122
|
-
|
|
328
|
+
try:
|
|
329
|
+
async with websockets.connect(ws_url, open_timeout=5, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
|
|
330
|
+
_ws_global = ws
|
|
331
|
+
print(f"[watchdog] Connected to Kernel ({_fmt_elapsed(_t0)})")
|
|
332
|
+
|
|
333
|
+
# Subscribe to events
|
|
334
|
+
await _rpc_call(ws, "event.subscribe", {
|
|
335
|
+
"events": [
|
|
336
|
+
"system.ready",
|
|
337
|
+
"module.started",
|
|
338
|
+
"module.stopped",
|
|
339
|
+
"module.exiting",
|
|
340
|
+
"module.ready",
|
|
341
|
+
"module.shutdown",
|
|
342
|
+
],
|
|
343
|
+
})
|
|
344
|
+
print(f"[watchdog] Subscribed to events ({_fmt_elapsed(_t0)})")
|
|
345
|
+
|
|
346
|
+
# Register to Kernel Registry via RPC
|
|
347
|
+
await _rpc_call(ws, "registry.register", {
|
|
348
|
+
"module_id": "watchdog",
|
|
349
|
+
"module_type": "service",
|
|
350
|
+
"events_publish": {
|
|
351
|
+
"watchdog.module.unhealthy": {},
|
|
352
|
+
"watchdog.module.recovered": {},
|
|
353
|
+
"watchdog.alert": {},
|
|
354
|
+
},
|
|
355
|
+
"events_subscribe": [
|
|
356
|
+
"system.ready",
|
|
357
|
+
"module.started",
|
|
358
|
+
"module.stopped",
|
|
359
|
+
"module.exiting",
|
|
360
|
+
"module.ready",
|
|
361
|
+
"module.shutdown",
|
|
362
|
+
],
|
|
363
|
+
})
|
|
364
|
+
print(f"[watchdog] Registered to Kernel ({_fmt_elapsed(_t0)})")
|
|
123
365
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
366
|
+
# Create monitor with RPC callback
|
|
367
|
+
monitor = HealthMonitor(
|
|
368
|
+
own_token=token,
|
|
369
|
+
kernel_port=kernel_port,
|
|
370
|
+
)
|
|
371
|
+
monitor.publish_event = lambda event: asyncio.create_task(_publish_event(ws, event))
|
|
372
|
+
monitor.rpc_call = lambda method, params: _rpc_call(ws, method, params)
|
|
373
|
+
|
|
374
|
+
# Publish module.ready
|
|
375
|
+
await _rpc_call(ws, "event.publish", {
|
|
376
|
+
"event_id": str(uuid.uuid4()),
|
|
377
|
+
"event": "module.ready",
|
|
378
|
+
"data": {
|
|
379
|
+
"module_id": "watchdog",
|
|
380
|
+
"graceful_shutdown": True,
|
|
381
|
+
},
|
|
382
|
+
})
|
|
383
|
+
print(f"[watchdog] module.ready published ({_fmt_elapsed(_t0)})")
|
|
128
384
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
if not event_hub_ws:
|
|
132
|
-
print("[watchdog] WARNING: Could not discover Event Hub WS, events disabled")
|
|
385
|
+
# Start monitor loop in background
|
|
386
|
+
monitor_task = asyncio.create_task(monitor.run())
|
|
133
387
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
server = WatchdogServer(monitor, token=token, event_hub_ws=event_hub_ws)
|
|
388
|
+
# Message loop: handle incoming RPC + events
|
|
389
|
+
async for raw in ws:
|
|
390
|
+
try:
|
|
391
|
+
msg = json.loads(raw)
|
|
392
|
+
except (json.JSONDecodeError, TypeError):
|
|
393
|
+
continue
|
|
141
394
|
|
|
142
|
-
|
|
143
|
-
|
|
395
|
+
try:
|
|
396
|
+
has_method = "method" in msg
|
|
397
|
+
has_id = "id" in msg
|
|
398
|
+
|
|
399
|
+
if has_method and not has_id:
|
|
400
|
+
# Event Notification
|
|
401
|
+
await _handle_event_notification(msg, monitor)
|
|
402
|
+
elif has_method and has_id:
|
|
403
|
+
# Incoming RPC request
|
|
404
|
+
await _handle_rpc_request(ws, msg, monitor)
|
|
405
|
+
# Ignore RPC responses (we don't await them in this simple impl)
|
|
406
|
+
except Exception as e:
|
|
407
|
+
print(f"[watchdog] 消息处理异常(已忽略): {e}")
|
|
408
|
+
|
|
409
|
+
except Exception as e:
|
|
410
|
+
_write_crash(type(e), e, e.__traceback__, severity="critical", handled=True)
|
|
411
|
+
_print_crash_summary(type(e), e.__traceback__)
|
|
412
|
+
sys.exit(1)
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
async def _rpc_call(ws, method: str, params: dict = None):
    """Send a JSON-RPC 2.0 request over ``ws`` without waiting for a reply.

    A fresh UUID string is used as the request id. ``params`` is included
    only when truthy. Nothing is returned.
    """
    request = {"jsonrpc": "2.0", "id": str(uuid.uuid4()), "method": method}
    extra = {"params": params} if params else {}
    payload = json.dumps({**request, **extra})
    await ws.send(payload)
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
async def _publish_event(ws, event: dict):
    """Send ``event`` through the ``event.publish`` RPC method.

    The request carries a fresh UUID as ``event_id`` plus the event's
    ``"event"`` (default "") and ``"data"`` (default {}) entries.
    """
    payload = {
        "event_id": str(uuid.uuid4()),
        "event": event.get("event", ""),
        "data": event.get("data", {}),
    }
    await _rpc_call(ws, "event.publish", payload)
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
async def _handle_event_notification(msg: dict, monitor: HealthMonitor):
    """Dispatch one event notification received from the Kernel.

    A ``module.shutdown`` event whose data names module ``"watchdog"``
    starts the shutdown sequence. Every other notification is passed,
    as the complete message, to ``monitor.handle_event``.
    """
    payload = msg.get("params", {})
    kind = payload.get("event", "")
    body = payload.get("data", {})

    shutdown_for_us = kind == "module.shutdown" and body.get("module_id") == "watchdog"
    if not shutdown_for_us:
        await monitor.handle_event(msg)
        return

    await _handle_shutdown(monitor)
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
async def _handle_rpc_request(ws, msg: dict, monitor: HealthMonitor):
    """Answer an incoming JSON-RPC request with a result or an error.

    Known methods are ``"health"`` and ``"status"``. An unknown method gets
    error -32601; an exception while handling or serializing gets error
    -32603 carrying ``str(exception)``. The reply reuses the request's id
    ("" when absent).
    """
    rpc_id = msg.get("id", "")
    method = msg.get("method", "")
    params = msg.get("params", {})

    async def _reply(**body):
        await ws.send(json.dumps({"jsonrpc": "2.0", "id": rpc_id, **body}))

    dispatch = {
        "health": lambda p: _rpc_health(monitor),
        "status": lambda p: _rpc_status(monitor),
    }
    handler = dispatch.get(method)
    if handler is None:
        await _reply(error={"code": -32601, "message": f"Method not found: {method}"})
        return

    try:
        result = await handler(params)
        await _reply(result=result)
    except Exception as e:
        await _reply(error={"code": -32603, "message": str(e)})
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
async def _rpc_health(monitor: HealthMonitor) -> dict:
    """RPC handler for the health query.

    Returns a dict with a fixed ``"status": "healthy"`` and a ``"details"``
    entry holding the number of entries in ``monitor.modules`` and the
    process uptime in whole seconds.
    """
    return {
        "status": "healthy",
        "details": {
            "monitored_modules": len(monitor.modules),
            # _start_ts is a time.monotonic() reading, so uptime must be
            # measured on the same clock, not against wall-clock time.time().
            "uptime_seconds": round(time.monotonic() - _start_ts),
        },
    }
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
async def _rpc_status(monitor: HealthMonitor) -> dict:
    """RPC handler for the status query: returns ``monitor.get_status()`` unchanged."""
    snapshot = monitor.get_status()
    return snapshot
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
async def _handle_shutdown(monitor: HealthMonitor):
    """Run the shutdown sequence: ack, stop the monitor, ready, exit.

    Publishes ``module.shutdown.ack``, calls ``monitor.stop()``, publishes
    ``module.shutdown.ready`` and then terminates the process with
    ``sys.exit(0)``. Each step is best-effort: a failure is reported on the
    console and the sequence continues, so a broken connection or a
    failing cleanup cannot prevent the requested exit.

    Raises:
        SystemExit: always, with exit code 0.
    """
    print("[watchdog] Received shutdown request")

    async def _notify(event_name: str, data: dict):
        # Events go out on the module-level Kernel connection.
        try:
            await _publish_event(_ws_global, {"event": event_name, "data": data})
        except Exception as e:
            print(f"[watchdog] Failed to publish {event_name}: {e}")

    # Step 1: Send ack
    await _notify("module.shutdown.ack", {"module_id": "watchdog", "estimated_cleanup": 2})
    # Step 2: Cleanup
    try:
        monitor.stop()
    except Exception as e:
        print(f"[watchdog] Monitor cleanup failed: {e}")
    # Step 3: Send ready
    await _notify("module.shutdown.ready", {"module_id": "watchdog"})
    print("[watchdog] Shutdown ready, exiting")
    # Step 4: Exit
    sys.exit(0)
|
|
144
509
|
|
|
145
510
|
|
|
146
511
|
if __name__ == "__main__":
|
|
147
|
-
main()
|
|
512
|
+
asyncio.run(main())
|
|
513
|
+
|