@agentunion/kite 1.2.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/CHANGELOG.md +208 -0
  2. package/README.md +48 -0
  3. package/cli.js +1 -1
  4. package/extensions/agents/assistant/entry.py +30 -81
  5. package/extensions/agents/assistant/module.md +1 -1
  6. package/extensions/agents/assistant/server.py +83 -122
  7. package/extensions/channels/acp_channel/entry.py +30 -81
  8. package/extensions/channels/acp_channel/module.md +1 -1
  9. package/extensions/channels/acp_channel/server.py +83 -122
  10. package/extensions/event_hub_bench/entry.py +81 -121
  11. package/extensions/services/backup/entry.py +213 -85
  12. package/extensions/services/model_service/entry.py +213 -85
  13. package/extensions/services/watchdog/entry.py +513 -460
  14. package/extensions/services/watchdog/monitor.py +55 -69
  15. package/extensions/services/web/entry.py +11 -108
  16. package/extensions/services/web/server.py +120 -77
  17. package/{core/registry → kernel}/entry.py +65 -37
  18. package/{core/event_hub/hub.py → kernel/event_hub.py} +61 -81
  19. package/kernel/module.md +33 -0
  20. package/{core/registry/store.py → kernel/registry_store.py} +13 -4
  21. package/kernel/rpc_router.py +388 -0
  22. package/kernel/server.py +267 -0
  23. package/launcher/__init__.py +10 -0
  24. package/launcher/__main__.py +6 -0
  25. package/launcher/count_lines.py +258 -0
  26. package/{core/launcher → launcher}/entry.py +693 -767
  27. package/launcher/logging_setup.py +289 -0
  28. package/{core/launcher → launcher}/module_scanner.py +11 -6
  29. package/main.py +11 -350
  30. package/package.json +6 -9
  31. package/__init__.py +0 -1
  32. package/__main__.py +0 -15
  33. package/core/event_hub/BENCHMARK.md +0 -94
  34. package/core/event_hub/__init__.py +0 -0
  35. package/core/event_hub/bench.py +0 -459
  36. package/core/event_hub/bench_extreme.py +0 -308
  37. package/core/event_hub/bench_perf.py +0 -350
  38. package/core/event_hub/entry.py +0 -436
  39. package/core/event_hub/module.md +0 -20
  40. package/core/event_hub/server.py +0 -269
  41. package/core/kite_log.py +0 -241
  42. package/core/launcher/__init__.py +0 -0
  43. package/core/registry/__init__.py +0 -0
  44. package/core/registry/module.md +0 -30
  45. package/core/registry/server.py +0 -339
  46. package/extensions/services/backup/server.py +0 -244
  47. package/extensions/services/model_service/server.py +0 -236
  48. package/extensions/services/watchdog/server.py +0 -229
  49. /package/{core → kernel}/__init__.py +0 -0
  50. /package/{core/event_hub → kernel}/dedup.py +0 -0
  51. /package/{core/event_hub → kernel}/router.py +0 -0
  52. /package/{core/launcher → launcher}/module.md +0 -0
  53. /package/{core/launcher → launcher}/process_manager.py +0 -0
@@ -1,460 +1,513 @@
1
- """
2
- Watchdog entry point.
3
- Reads boot_info from stdin, registers to Registry, starts health monitor.
4
- Registry port: env KITE_REGISTRY_PORT (fast path) or stdin kite message (parallel start).
5
- """
6
-
7
- import builtins
8
- import json
9
- import os
10
- import re
11
- import socket
12
- import sys
13
- import threading
14
- import time
15
- import traceback
16
- from datetime import datetime, timezone
17
-
18
- import httpx
19
- import uvicorn
20
-
21
-
22
-
23
- # ── Module configuration ──
24
- MODULE_NAME = "watchdog"
25
-
26
-
27
- def _fmt_elapsed(t0: float) -> str:
28
- """Format elapsed time since t0: <1s → 'NNNms', >=1s → 'N.Ns', >=10s → 'NNs'."""
29
- d = time.monotonic() - t0
30
- if d < 1:
31
- return f"{d * 1000:.0f}ms"
32
- if d < 10:
33
- return f"{d:.1f}s"
34
- return f"{d:.0f}s"
35
-
36
-
37
- # ── Safe stdout/stderr: ignore BrokenPipeError after Launcher closes stdio ──
38
-
39
- class _SafeWriter:
40
- """Wraps a stream to silently swallow BrokenPipeError on write/flush."""
41
- def __init__(self, stream):
42
- self._stream = stream
43
-
44
- def write(self, s):
45
- try:
46
- self._stream.write(s)
47
- except (BrokenPipeError, OSError):
48
- pass
49
-
50
- def flush(self):
51
- try:
52
- self._stream.flush()
53
- except (BrokenPipeError, OSError):
54
- pass
55
-
56
- def __getattr__(self, name):
57
- return getattr(self._stream, name)
58
-
59
- sys.stdout = _SafeWriter(sys.stdout)
60
- sys.stderr = _SafeWriter(sys.stderr)
61
-
62
-
63
- # ── Timestamped print + log file writer ──
64
- # Independent implementation per module (no shared code dependency)
65
-
66
- _builtin_print = builtins.print
67
- _start_ts = time.monotonic()
68
- _last_ts = time.monotonic()
69
- _ANSI_RE = re.compile(r"\033\[[0-9;]*m")
70
- _log_lock = threading.Lock()
71
- _log_latest_path = None
72
- _log_daily_path = None
73
- _log_daily_date = ""
74
- _log_dir = None
75
- _crash_log_path = None
76
-
77
- def _strip_ansi(s: str) -> str:
78
- return _ANSI_RE.sub("", s)
79
-
80
- def _resolve_daily_log_path():
81
- """Resolve daily log path based on current date."""
82
- global _log_daily_path, _log_daily_date
83
- if not _log_dir:
84
- return
85
- today = datetime.now().strftime("%Y-%m-%d")
86
- if today == _log_daily_date and _log_daily_path:
87
- return
88
- month_dir = os.path.join(_log_dir, today[:7])
89
- os.makedirs(month_dir, exist_ok=True)
90
- _log_daily_path = os.path.join(month_dir, f"{today}.log")
91
- _log_daily_date = today
92
-
93
- def _write_log(plain_line: str):
94
- """Write a plain-text line to both latest.log and daily log."""
95
- with _log_lock:
96
- if _log_latest_path:
97
- try:
98
- with open(_log_latest_path, "a", encoding="utf-8") as f:
99
- f.write(plain_line)
100
- except Exception:
101
- pass
102
- _resolve_daily_log_path()
103
- if _log_daily_path:
104
- try:
105
- with open(_log_daily_path, "a", encoding="utf-8") as f:
106
- f.write(plain_line)
107
- except Exception:
108
- pass
109
-
110
- def _write_crash(exc_type, exc_value, exc_tb, thread_name=None, severity="critical", handled=False):
111
- """Write crash record to crashes.jsonl + daily crash archive."""
112
- record = {
113
- "timestamp": datetime.now(timezone.utc).isoformat(),
114
- "module": MODULE_NAME,
115
- "thread": thread_name or threading.current_thread().name,
116
- "exception_type": exc_type.__name__ if exc_type else "Unknown",
117
- "exception_message": str(exc_value),
118
- "traceback": "".join(traceback.format_exception(exc_type, exc_value, exc_tb)),
119
- "severity": severity,
120
- "handled": handled,
121
- "process_id": os.getpid(),
122
- "platform": sys.platform,
123
- "runtime_version": f"Python {sys.version.split()[0]}",
124
- }
125
-
126
- if exc_tb:
127
- tb_entries = traceback.extract_tb(exc_tb)
128
- if tb_entries:
129
- last = tb_entries[-1]
130
- record["context"] = {
131
- "function": last.name,
132
- "file": os.path.basename(last.filename),
133
- "line": last.lineno,
134
- }
135
-
136
- line = json.dumps(record, ensure_ascii=False) + "\n"
137
-
138
- if _crash_log_path:
139
- try:
140
- with open(_crash_log_path, "a", encoding="utf-8") as f:
141
- f.write(line)
142
- except Exception:
143
- pass
144
-
145
- if _log_dir:
146
- try:
147
- today = datetime.now().strftime("%Y-%m-%d")
148
- archive_dir = os.path.join(_log_dir, "crashes", today[:7])
149
- os.makedirs(archive_dir, exist_ok=True)
150
- archive_path = os.path.join(archive_dir, f"{today}.jsonl")
151
- with open(archive_path, "a", encoding="utf-8") as f:
152
- f.write(line)
153
- except Exception:
154
- pass
155
-
156
- def _print_crash_summary(exc_type, exc_tb, thread_name=None):
157
- """Print crash summary to console (red highlight)."""
158
- RED = "\033[91m"
159
- RESET = "\033[0m"
160
-
161
- if exc_tb:
162
- tb_entries = traceback.extract_tb(exc_tb)
163
- if tb_entries:
164
- last = tb_entries[-1]
165
- location = f"{os.path.basename(last.filename)}:{last.lineno}"
166
- else:
167
- location = "unknown"
168
- else:
169
- location = "unknown"
170
-
171
- prefix = f"[{MODULE_NAME}]"
172
- if thread_name:
173
- _builtin_print(f"{prefix} {RED}线程 {thread_name} 崩溃: "
174
- f"{exc_type.__name__} in {location}{RESET}")
175
- else:
176
- _builtin_print(f"{prefix} {RED}崩溃: {exc_type.__name__} in {location}{RESET}")
177
- if _crash_log_path:
178
- _builtin_print(f"{prefix} 崩溃日志: {_crash_log_path}")
179
-
180
- def _setup_exception_hooks():
181
- """Set up global exception hooks."""
182
- _orig_excepthook = sys.excepthook
183
-
184
- def _excepthook(exc_type, exc_value, exc_tb):
185
- _write_crash(exc_type, exc_value, exc_tb, severity="critical", handled=False)
186
- _print_crash_summary(exc_type, exc_tb)
187
- _orig_excepthook(exc_type, exc_value, exc_tb)
188
-
189
- sys.excepthook = _excepthook
190
-
191
- if hasattr(threading, "excepthook"):
192
- def _thread_excepthook(args):
193
- _write_crash(args.exc_type, args.exc_value, args.exc_traceback,
194
- thread_name=args.thread.name if args.thread else "unknown",
195
- severity="error", handled=False)
196
- _print_crash_summary(args.exc_type, args.exc_traceback,
197
- thread_name=args.thread.name if args.thread else None)
198
-
199
- threading.excepthook = _thread_excepthook
200
-
201
- def _tprint(*args, **kwargs):
202
- """Timestamped print that adds [timestamp] HH:MM:SS.mmm +delta prefix."""
203
- global _last_ts
204
- now = time.monotonic()
205
- elapsed = now - _start_ts
206
- delta = now - _last_ts
207
- _last_ts = now
208
-
209
- if elapsed < 1:
210
- elapsed_str = f"{elapsed * 1000:.0f}ms"
211
- elif elapsed < 100:
212
- elapsed_str = f"{elapsed:.1f}s"
213
- else:
214
- elapsed_str = f"{elapsed:.0f}s"
215
-
216
- if delta < 0.001:
217
- delta_str = ""
218
- elif delta < 1:
219
- delta_str = f"+{delta * 1000:.0f}ms"
220
- elif delta < 100:
221
- delta_str = f"+{delta:.1f}s"
222
- else:
223
- delta_str = f"+{delta:.0f}s"
224
-
225
- ts = datetime.now().strftime("%H:%M:%S.%f")[:-3]
226
-
227
- _builtin_print(*args, **kwargs)
228
-
229
- if _log_latest_path or _log_daily_path:
230
- sep = kwargs.get("sep", " ")
231
- end = kwargs.get("end", "\n")
232
- text = sep.join(str(a) for a in args)
233
- prefix = f"[{elapsed_str:>6}] {ts} {delta_str:>8} "
234
- _write_log(prefix + _strip_ansi(text) + end)
235
-
236
- builtins.print = _tprint
237
-
238
- # Ensure project root is on sys.path (set by main.py or cli.js)
239
- _project_root = os.environ.get("KITE_PROJECT") or os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
240
- if _project_root not in sys.path:
241
- sys.path.insert(0, _project_root)
242
-
243
- from extensions.services.watchdog.monitor import HealthMonitor
244
- from extensions.services.watchdog.server import WatchdogServer
245
-
246
-
247
- def _get_free_port() -> int:
248
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
249
- s.bind(("127.0.0.1", 0))
250
- return s.getsockname()[1]
251
-
252
-
253
- def _register_to_registry(client: httpx.Client, token: str, registry_url: str, port: int, _t0: float):
254
- payload = {
255
- "action": "register",
256
- "module_id": "watchdog",
257
- "module_type": "service",
258
- "name": "Watchdog",
259
- "api_endpoint": f"http://127.0.0.1:{port}",
260
- "health_endpoint": "/health",
261
- "events_publish": {
262
- "watchdog.module.unhealthy": {"description": "Module failed health check"},
263
- "watchdog.module.recovered": {"description": "Module recovered from unhealthy"},
264
- "watchdog.alert": {"description": "Module restarted too many times"},
265
- },
266
- "events_subscribe": [
267
- "module.started",
268
- "module.stopped",
269
- "module.exiting",
270
- "module.ready",
271
- "module.shutdown",
272
- ],
273
- }
274
- headers = {"Authorization": f"Bearer {token}"}
275
- # Watchdog starts in parallel with token registration — retry on 401
276
- deadline = time.monotonic() + 10
277
- while True:
278
- try:
279
- resp = client.post(
280
- f"{registry_url}/modules",
281
- json=payload, headers=headers,
282
- )
283
- if resp.status_code == 200:
284
- print(f"[watchdog] Registered to Registry ({_fmt_elapsed(_t0)})")
285
- return
286
- if resp.status_code == 401 and time.monotonic() < deadline:
287
- time.sleep(0.3)
288
- continue
289
- print(f"[watchdog] WARNING: Registry returned {resp.status_code}")
290
- return
291
- except Exception as e:
292
- if time.monotonic() < deadline:
293
- time.sleep(0.3)
294
- continue
295
- print(f"[watchdog] WARNING: Registry registration failed: {e}")
296
- return
297
-
298
-
299
- def _get_launcher_url(client: httpx.Client, token: str, registry_url: str) -> str:
300
- """Discover Launcher API endpoint from Registry, with retry."""
301
- import time
302
- headers = {"Authorization": f"Bearer {token}"}
303
- deadline = time.time() + 5 # 5s timeout (Launcher registers quickly)
304
- while time.time() < deadline:
305
- try:
306
- resp = client.get(
307
- f"{registry_url}/get/launcher.api_endpoint",
308
- headers=headers,
309
- )
310
- if resp.status_code == 200:
311
- val = resp.json()
312
- if val:
313
- return val
314
- except Exception:
315
- pass
316
- time.sleep(0.1) # Retry every 100ms
317
- return ""
318
-
319
-
320
- def _get_event_hub_ws(client: httpx.Client, token: str, registry_url: str) -> str:
321
- """Discover Event Hub WebSocket endpoint from Registry, with retry."""
322
- import time
323
- headers = {"Authorization": f"Bearer {token}"}
324
- deadline = time.time() + 10
325
- while time.time() < deadline:
326
- try:
327
- resp = client.get(
328
- f"{registry_url}/get/event_hub.metadata.ws_endpoint",
329
- headers=headers,
330
- )
331
- if resp.status_code == 200:
332
- val = resp.json()
333
- if val:
334
- return val
335
- except Exception:
336
- pass
337
- time.sleep(0.2)
338
- return ""
339
-
340
-
341
- def _read_stdin_kite_message(expected_type: str, timeout: float = 10) -> dict | None:
342
- """Read a single kite message of expected type from stdin with timeout."""
343
- result = [None]
344
-
345
- def _read():
346
- try:
347
- line = sys.stdin.readline().strip()
348
- if line:
349
- msg = json.loads(line)
350
- if isinstance(msg, dict) and msg.get("kite") == expected_type:
351
- result[0] = msg
352
- except Exception:
353
- pass
354
-
355
- t = threading.Thread(target=_read, daemon=True)
356
- t.start()
357
- t.join(timeout=timeout)
358
- return result[0]
359
-
360
-
361
- def main():
362
- # Initialize log file paths
363
- global _log_dir, _log_latest_path, _crash_log_path
364
- module_data = os.environ.get("KITE_MODULE_DATA")
365
- if module_data:
366
- _log_dir = os.path.join(module_data, "log")
367
- os.makedirs(_log_dir, exist_ok=True)
368
- suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
369
-
370
- _log_latest_path = os.path.join(_log_dir, f"latest{suffix}.log")
371
- try:
372
- with open(_log_latest_path, "w", encoding="utf-8") as f:
373
- pass
374
- except Exception:
375
- _log_latest_path = None
376
-
377
- _crash_log_path = os.path.join(_log_dir, f"crashes{suffix}.jsonl")
378
- try:
379
- with open(_crash_log_path, "w", encoding="utf-8") as f:
380
- pass
381
- except Exception:
382
- _crash_log_path = None
383
-
384
- _resolve_daily_log_path()
385
-
386
- _setup_exception_hooks()
387
-
388
- _t0 = time.monotonic()
389
-
390
- # Kite environment
391
- kite_instance = os.environ.get("KITE_INSTANCE", "")
392
- is_debug = os.environ.get("KITE_DEBUG") == "1"
393
-
394
- # Read boot_info from stdin (only token)
395
- token = ""
396
- try:
397
- line = sys.stdin.readline().strip()
398
- if line:
399
- boot_info = json.loads(line)
400
- token = boot_info.get("token", "")
401
- except Exception:
402
- pass
403
-
404
- # Read registry_port: env first (fast path), stdin fallback (parallel start)
405
- registry_port = int(os.environ.get("KITE_REGISTRY_PORT", "0"))
406
- if not registry_port:
407
- msg = _read_stdin_kite_message("registry_port", timeout=10)
408
- if msg:
409
- registry_port = int(msg.get("registry_port", 0))
410
-
411
- if not token or not registry_port:
412
- print("[watchdog] ERROR: Missing token or registry_port")
413
- sys.exit(1)
414
-
415
- print(f"[watchdog] Token received ({len(token)} chars), registry port: {registry_port} ({_fmt_elapsed(_t0)})")
416
-
417
- registry_url = f"http://127.0.0.1:{registry_port}"
418
- port = _get_free_port()
419
-
420
- client = httpx.Client(timeout=5)
421
-
422
- # Register to Registry
423
- _register_to_registry(client, token, registry_url, port, _t0)
424
-
425
- # Discover Launcher URL
426
- launcher_url = _get_launcher_url(client, token, registry_url)
427
- if not launcher_url:
428
- print("[watchdog] WARNING: Could not discover Launcher URL, restart disabled")
429
-
430
- # Discover Event Hub WebSocket URL
431
- event_hub_ws = _get_event_hub_ws(client, token, registry_url)
432
- if not event_hub_ws:
433
- print("[watchdog] WARNING: Could not discover Event Hub WS, events disabled")
434
- else:
435
- print(f"[watchdog] Discovered Event Hub: {event_hub_ws}")
436
-
437
- client.close()
438
-
439
- # Create monitor and server
440
- monitor = HealthMonitor(
441
- own_token=token,
442
- registry_url=registry_url,
443
- launcher_url=launcher_url,
444
- )
445
- server = WatchdogServer(monitor, token=token, event_hub_ws=event_hub_ws)
446
-
447
- print(f"[watchdog] Starting on port {port} ({_fmt_elapsed(_t0)})")
448
- try:
449
- config = uvicorn.Config(server.app, host="127.0.0.1", port=port, log_level="warning")
450
- uvi_server = uvicorn.Server(config)
451
- server._uvicorn_server = uvi_server
452
- uvi_server.run()
453
- except Exception as e:
454
- _write_crash(type(e), e, e.__traceback__, severity="critical", handled=True)
455
- _print_crash_summary(type(e), e.__traceback__)
456
- sys.exit(1)
457
-
458
-
459
- if __name__ == "__main__":
460
- main()
1
+ """
2
+ Watchdog entry point.
3
+ Connects to Kernel via WebSocket JSON-RPC 2.0, registers, subscribes to events,
4
+ runs health monitor loop, handles incoming RPC requests.
5
+ """
6
+
7
+ import asyncio
8
+ import builtins
9
+ import json
10
+ import os
11
+ import re
12
+ import sys
13
+ import threading
14
+ import time
15
+ import traceback
16
+ import uuid
17
+ from datetime import datetime, timezone
18
+
19
+ import websockets
20
+
21
+
22
+ # ── Module configuration ──
23
+ MODULE_NAME = "watchdog"
24
+
25
+
26
def _fmt_elapsed(t0: float) -> str:
    """Render the time elapsed since ``t0`` as a short string.

    ``t0`` is compared against ``time.monotonic()``. Below one second the
    result is whole milliseconds ('NNNms'); below ten seconds it is seconds
    with one decimal ('N.Ns'); otherwise whole seconds ('NNs').
    """
    elapsed = time.monotonic() - t0
    if elapsed < 1:
        text = f"{elapsed * 1000:.0f}ms"
    elif elapsed < 10:
        text = f"{elapsed:.1f}s"
    else:
        text = f"{elapsed:.0f}s"
    return text
34
+
35
+
36
+ # ── Safe stdout/stderr: ignore BrokenPipeError after Launcher closes stdio ──
37
+
38
class _SafeWriter:
    """Proxy around a text stream that swallows I/O errors on write/flush.

    Intended for stdout/stderr once the parent process has closed the pipe:
    ``write`` and ``flush`` never raise ``OSError`` (``BrokenPipeError`` is a
    subclass of it). Every other attribute is forwarded to the wrapped stream.
    """

    def __init__(self, stream):
        self._stream = stream

    def write(self, s):
        """Write ``s`` and return the number of characters accepted.

        Mirrors the ``TextIOBase.write`` contract. When the underlying write
        fails, the data is dropped and ``len(s)`` is returned so callers see
        a completed write instead of an error.
        """
        try:
            written = self._stream.write(s)
        except OSError:
            return len(s)
        # Some stream-like objects return None from write(); normalise it.
        return len(s) if written is None else written

    def flush(self):
        """Flush the wrapped stream, ignoring ``OSError``."""
        try:
            self._stream.flush()
        except OSError:
            pass

    def __getattr__(self, name):
        # __getattr__ only runs for names missing on the proxy itself. If
        # "_stream" is the missing name (instance created without __init__),
        # looking it up again would recurse forever, so fail cleanly instead.
        if name == "_stream":
            raise AttributeError(name)
        return getattr(self._stream, name)
57
+
58
+ sys.stdout = _SafeWriter(sys.stdout)
59
+ sys.stderr = _SafeWriter(sys.stderr)
60
+
61
+
62
+ # ── Timestamped print + log file writer ──
63
+
64
+ _builtin_print = builtins.print
65
+ _start_ts = time.monotonic()
66
+ _last_ts = time.monotonic()
67
+ _ANSI_RE = re.compile(r"\033\[[0-9;]*m")
68
+ _log_lock = threading.Lock()
69
+ _log_latest_path = None
70
+ _log_daily_path = None
71
+ _log_daily_date = ""
72
+ _log_dir = None
73
+ _crash_log_path = None
74
+
75
def _strip_ansi(s: str) -> str:
    """Return ``s`` with every match of the module-level ``_ANSI_RE`` removed."""
    return re.sub(_ANSI_RE, "", s)
77
+
78
def _resolve_daily_log_path():
    """Point ``_log_daily_path`` at today's ``<log_dir>/YYYY-MM/YYYY-MM-DD.log``.

    Does nothing when no log directory is configured, or when the cached
    path already belongs to the current local date. Otherwise the month
    directory is created if needed and both module globals are updated.
    """
    global _log_daily_path, _log_daily_date
    if _log_dir:
        date_str = datetime.now().strftime("%Y-%m-%d")
        already_current = date_str == _log_daily_date and _log_daily_path
        if not already_current:
            folder = os.path.join(_log_dir, date_str[:7])
            os.makedirs(folder, exist_ok=True)
            _log_daily_path = os.path.join(folder, f"{date_str}.log")
            _log_daily_date = date_str
90
+
91
def _write_log(plain_line: str):
    """Append ``plain_line`` to the latest log and to the daily log.

    Logging is best-effort: any failure to resolve, open or write a log
    file is swallowed so that a ``print`` call can never fail because of
    the log mirror. All file access happens under ``_log_lock``.
    """
    with _log_lock:
        # Resolving the daily path may need to create a month directory.
        # Guard it like the writes below, otherwise an OSError at a date
        # rollover would propagate out of every print() in the process.
        try:
            _resolve_daily_log_path()
        except OSError:
            pass
        for path in (_log_latest_path, _log_daily_path):
            if not path:
                continue
            try:
                with open(path, "a", encoding="utf-8") as f:
                    f.write(plain_line)
            except Exception:
                # Deliberately ignored: a broken log file must not break output.
                pass
107
+
108
+
109
def _write_crash(exc_type, exc_value, exc_tb, thread_name=None, severity="critical", handled=False):
    """Serialise one crash as a JSON line and append it to the crash logs.

    The record goes to ``_crash_log_path`` (when set) and to the daily
    archive ``<log_dir>/crashes/YYYY-MM/YYYY-MM-DD.jsonl`` (when a log
    directory is set). When a traceback is available, the innermost frame
    is recorded under ``"context"``. Write failures are ignored.
    """
    record = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "module": MODULE_NAME,
        "thread": thread_name or threading.current_thread().name,
        "exception_type": exc_type.__name__ if exc_type else "Unknown",
        "exception_message": str(exc_value),
        "traceback": "".join(traceback.format_exception(exc_type, exc_value, exc_tb)),
        "severity": severity,
        "handled": handled,
        "process_id": os.getpid(),
        "platform": sys.platform,
        "runtime_version": f"Python {sys.version.split()[0]}",
    }

    frames = traceback.extract_tb(exc_tb) if exc_tb else None
    if frames:
        innermost = frames[-1]
        record["context"] = {
            "function": innermost.name,
            "file": os.path.basename(innermost.filename),
            "line": innermost.lineno,
        }

    payload = json.dumps(record, ensure_ascii=False) + "\n"

    def _append(path):
        with open(path, "a", encoding="utf-8") as fh:
            fh.write(payload)

    if _crash_log_path:
        try:
            _append(_crash_log_path)
        except Exception:
            pass

    if _log_dir:
        try:
            day = datetime.now().strftime("%Y-%m-%d")
            archive_dir = os.path.join(_log_dir, "crashes", day[:7])
            os.makedirs(archive_dir, exist_ok=True)
            _append(os.path.join(archive_dir, f"{day}.jsonl"))
        except Exception:
            pass
154
+
155
+
156
def _print_crash_summary(exc_type, exc_tb, thread_name=None):
    """Print a one-line crash summary, highlighted in red, to the console.

    The location is ``file:line`` of the innermost traceback frame, or
    ``unknown`` when no frame is available. Uses the original built-in
    ``print`` so the summary is not mirrored into the log files. When a
    crash log path is set, a second line names that file.
    """
    red, reset = "\033[91m", "\033[0m"

    location = "unknown"
    frames = traceback.extract_tb(exc_tb) if exc_tb else None
    if frames:
        innermost = frames[-1]
        location = f"{os.path.basename(innermost.filename)}:{innermost.lineno}"

    prefix = f"[{MODULE_NAME}]"
    if not thread_name:
        _builtin_print(f"{prefix} {red}崩溃: {exc_type.__name__} in {location}{reset}")
    else:
        _builtin_print(f"{prefix} {red}线程 {thread_name} 崩溃: "
                       f"{exc_type.__name__} in {location}{reset}")
    if _crash_log_path:
        _builtin_print(f"{prefix} 崩溃日志: {_crash_log_path}")
179
+
180
def _setup_exception_hooks():
    """Install process-wide hooks that record uncaught exceptions.

    ``sys.excepthook`` is replaced by a wrapper that writes a crash record,
    prints a summary, and then calls the previously installed hook. When
    the interpreter provides ``threading.excepthook``, that hook is replaced
    too, so uncaught exceptions in threads are recorded the same way.
    """
    previous_hook = sys.excepthook

    def _on_uncaught(exc_type, exc_value, exc_tb):
        _write_crash(exc_type, exc_value, exc_tb, severity="critical", handled=False)
        _print_crash_summary(exc_type, exc_tb)
        previous_hook(exc_type, exc_value, exc_tb)

    sys.excepthook = _on_uncaught

    if not hasattr(threading, "excepthook"):
        return

    def _on_thread_crash(args):
        thread = args.thread
        _write_crash(
            args.exc_type, args.exc_value, args.exc_traceback,
            thread_name=thread.name if thread else "unknown",
            severity="error", handled=False,
        )
        _print_crash_summary(
            args.exc_type, args.exc_traceback,
            thread_name=thread.name if thread else None,
        )

    threading.excepthook = _on_thread_crash
200
+
201
def _tprint(*args, **kwargs):
    """Print like the built-in ``print`` and mirror the text to the log files.

    The console output is passed through unchanged. Only the copy written
    to the log files gets a ``[elapsed] HH:MM:SS.mmm +delta`` prefix, where
    ``elapsed`` is the time since module import and ``delta`` the time since
    the previous call. ANSI colour codes are stripped from the log copy.

    Accepts the same arguments as ``print``; ``sep=None`` and ``end=None``
    mean "use the default", as they do for the built-in.
    """
    global _last_ts
    now = time.monotonic()
    elapsed = now - _start_ts
    delta = now - _last_ts
    _last_ts = now

    def _span(seconds):
        # Shared formatting for the elapsed and delta columns.
        if seconds < 1:
            return f"{seconds * 1000:.0f}ms"
        if seconds < 100:
            return f"{seconds:.1f}s"
        return f"{seconds:.0f}s"

    elapsed_str = _span(elapsed)
    # Sub-millisecond gaps are left blank to keep the log column quiet.
    delta_str = "" if delta < 0.001 else "+" + _span(delta)

    ts = datetime.now().strftime("%H:%M:%S.%f")[:-3]

    _builtin_print(*args, **kwargs)

    if _log_latest_path or _log_daily_path:
        # print() treats sep=None / end=None as the defaults; do the same
        # here, otherwise None.join(...) raises whenever logging is enabled.
        sep = kwargs.get("sep")
        if sep is None:
            sep = " "
        end = kwargs.get("end")
        if end is None:
            end = "\n"
        text = sep.join(str(a) for a in args)
        prefix = f"[{elapsed_str:>6}] {ts} {delta_str:>8} "
        _write_log(prefix + _strip_ansi(text) + end)
235
+
236
+ builtins.print = _tprint
237
+
238
+ # Ensure project root is on sys.path
239
+ _project_root = os.environ.get("KITE_PROJECT") or os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
240
+ if _project_root not in sys.path:
241
+ sys.path.insert(0, _project_root)
242
+
243
+ from extensions.services.watchdog.monitor import HealthMonitor
244
+
245
+
246
+ def _read_stdin_kite_message(expected_type: str, timeout: float = 10) -> dict | None:
247
+ """Read a single kite message of expected type from stdin with timeout."""
248
+ result = [None]
249
+
250
+ def _read():
251
+ try:
252
+ line = sys.stdin.readline().strip()
253
+ if line:
254
+ msg = json.loads(line)
255
+ if isinstance(msg, dict) and msg.get("kite") == expected_type:
256
+ result[0] = msg
257
+ except Exception:
258
+ pass
259
+
260
+ t = threading.Thread(target=_read, daemon=True)
261
+ t.start()
262
+ t.join(timeout=timeout)
263
+ return result[0]
264
+
265
+
266
+ # Global WS reference for publish_event callback
267
+ _ws_global = None
268
+
269
+
270
+
271
async def main():
    """Run the watchdog: connect to the Kernel, register, then serve messages.

    Steps, in order:
      1. Set up the log and crash-log files under ``$KITE_MODULE_DATA/log``
         (when that variable is set) and install the exception hooks.
      2. Read the auth token from the first stdin line (JSON boot info).
      3. Get the Kernel port from ``KITE_KERNEL_PORT``, or from a
         ``kernel_port`` kite message on stdin when the variable is unset.
      4. Open the Kernel WebSocket, subscribe to events, register the
         module, publish ``module.ready`` and start the health monitor.
      5. Dispatch incoming notifications and RPC requests until the
         connection ends.

    Exits with status 1 when the token or port is missing, or when the
    connection or message loop raises.
    """
    global _ws_global
    global _log_dir, _log_latest_path, _crash_log_path

    # Initialize log file paths
    module_data = os.environ.get("KITE_MODULE_DATA")
    if module_data:
        _log_dir = os.path.join(module_data, "log")
        os.makedirs(_log_dir, exist_ok=True)
        suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")

        # Opening with "w" truncates the file from the previous run.
        _log_latest_path = os.path.join(_log_dir, f"latest{suffix}.log")
        try:
            with open(_log_latest_path, "w", encoding="utf-8"):
                pass
        except Exception:
            _log_latest_path = None

        _crash_log_path = os.path.join(_log_dir, f"crashes{suffix}.jsonl")
        try:
            with open(_crash_log_path, "w", encoding="utf-8"):
                pass
        except Exception:
            _crash_log_path = None

        _resolve_daily_log_path()

    _setup_exception_hooks()

    _t0 = time.monotonic()

    # Read boot_info from stdin (only token)
    token = ""
    try:
        line = sys.stdin.readline().strip()
        if line:
            boot_info = json.loads(line)
            token = boot_info.get("token", "")
    except Exception:
        pass

    # Read kernel_port: env first (fast path), stdin fallback (parallel start).
    # A malformed or empty value counts as "not provided" so the explicit
    # error below is reached instead of an uncaught ValueError.
    try:
        kernel_port = int(os.environ.get("KITE_KERNEL_PORT") or 0)
    except ValueError:
        kernel_port = 0
    if not kernel_port:
        msg = _read_stdin_kite_message("kernel_port", timeout=10)
        if msg:
            try:
                kernel_port = int(msg.get("kernel_port", 0))
            except (TypeError, ValueError):
                kernel_port = 0

    if not token or not kernel_port:
        print("[watchdog] ERROR: Missing token or kernel_port")
        sys.exit(1)

    print(f"[watchdog] Token received ({len(token)} chars), kernel port: {kernel_port} ({_fmt_elapsed(_t0)})")

    # Connect to Kernel WebSocket
    ws_url = f"ws://127.0.0.1:{kernel_port}/ws?token={token}&id=watchdog"
    # The token is a credential: log the endpoint with the token masked so it
    # does not end up on the console or in the log files.
    print(f"[watchdog] Connecting to Kernel: ws://127.0.0.1:{kernel_port}/ws?token=***&id=watchdog")

    # Events this module listens to; used for both the subscription and the
    # registry entry so the two cannot drift apart.
    subscribed_events = [
        "system.ready",
        "module.started",
        "module.stopped",
        "module.exiting",
        "module.ready",
        "module.shutdown",
    ]

    try:
        async with websockets.connect(ws_url, open_timeout=5, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
            _ws_global = ws
            print(f"[watchdog] Connected to Kernel ({_fmt_elapsed(_t0)})")

            # Subscribe to events
            await _rpc_call(ws, "event.subscribe", {"events": list(subscribed_events)})
            print(f"[watchdog] Subscribed to events ({_fmt_elapsed(_t0)})")

            # Register to Kernel Registry via RPC
            await _rpc_call(ws, "registry.register", {
                "module_id": "watchdog",
                "module_type": "service",
                "events_publish": {
                    "watchdog.module.unhealthy": {},
                    "watchdog.module.recovered": {},
                    "watchdog.alert": {},
                },
                "events_subscribe": list(subscribed_events),
            })
            print(f"[watchdog] Registered to Kernel ({_fmt_elapsed(_t0)})")

            # Create monitor with RPC callback
            monitor = HealthMonitor(
                own_token=token,
                kernel_port=kernel_port,
            )

            # The event loop keeps only weak references to tasks, so a
            # fire-and-forget task can be garbage-collected before it runs.
            # Hold each publish task until it completes.
            pending_publishes = set()

            def _schedule_publish(event):
                task = asyncio.create_task(_publish_event(ws, event))
                pending_publishes.add(task)
                task.add_done_callback(pending_publishes.discard)
                return task

            monitor.publish_event = _schedule_publish
            monitor.rpc_call = lambda method, params: _rpc_call(ws, method, params)

            # Publish module.ready
            await _rpc_call(ws, "event.publish", {
                "event_id": str(uuid.uuid4()),
                "event": "module.ready",
                "data": {
                    "module_id": "watchdog",
                    "graceful_shutdown": True,
                },
            })
            print(f"[watchdog] module.ready published ({_fmt_elapsed(_t0)})")

            # Start monitor loop in background; the local name keeps the
            # task referenced for as long as the message loop runs.
            monitor_task = asyncio.create_task(monitor.run())

            # Message loop: handle incoming RPC + events
            async for raw in ws:
                try:
                    msg = json.loads(raw)
                except (json.JSONDecodeError, TypeError):
                    continue

                try:
                    has_method = "method" in msg
                    has_id = "id" in msg

                    if has_method and not has_id:
                        # Event Notification
                        await _handle_event_notification(msg, monitor)
                    elif has_method and has_id:
                        # Incoming RPC request
                        await _handle_rpc_request(ws, msg, monitor)
                    # RPC responses (no "method") are ignored: requests are
                    # sent without waiting for their replies.
                except Exception as e:
                    print(f"[watchdog] 消息处理异常(已忽略): {e}")

    except Exception as e:
        _write_crash(type(e), e, e.__traceback__, severity="critical", handled=True)
        _print_crash_summary(type(e), e.__traceback__)
        sys.exit(1)
413
+
414
+
415
+
416
async def _rpc_call(ws, method: str, params: dict = None):
    """Send one JSON-RPC 2.0 request over ``ws`` without waiting for a reply.

    Each request gets a fresh UUID string as its ``id``. ``params`` is
    included only when it is truthy, so ``None`` and ``{}`` both produce a
    request with no ``params`` member.
    """
    envelope = {"jsonrpc": "2.0", "id": str(uuid.uuid4()), "method": method}
    envelope.update({"params": params} if params else {})
    payload = json.dumps(envelope)
    await ws.send(payload)
422
+
423
+
424
async def _publish_event(ws, event: dict):
    """Send ``event`` to the Kernel as an ``event.publish`` RPC request.

    A new UUID is generated as ``event_id``. The event name defaults to an
    empty string and the data to an empty dict when missing from ``event``.
    """
    publish_params = {
        "event_id": str(uuid.uuid4()),
        "event": event.get("event", ""),
        "data": event.get("data", {}),
    }
    await _rpc_call(ws, "event.publish", publish_params)
431
+
432
+
433
async def _handle_event_notification(msg: dict, monitor: HealthMonitor):
    """Route one incoming event notification.

    A ``module.shutdown`` event whose data names ``watchdog`` as the target
    starts this module's own shutdown sequence. Every other notification is
    passed to the monitor unchanged.
    """
    params = msg.get("params", {})
    is_own_shutdown = (
        params.get("event", "") == "module.shutdown"
        and params.get("data", {}).get("module_id") == "watchdog"
    )
    if not is_own_shutdown:
        # Forward to monitor
        await monitor.handle_event(msg)
        return
    await _handle_shutdown(monitor)
446
+
447
+
448
async def _handle_rpc_request(ws, msg: dict, monitor: HealthMonitor):
    """Answer one incoming JSON-RPC request on ``ws``.

    Supported methods are ``health`` and ``status``. An unknown method gets
    a -32601 error response. A failure while running the handler or while
    sending its result gets a -32603 error response carrying the exception
    text.
    """
    rpc_id = msg.get("id", "")
    method = msg.get("method", "")

    dispatch = {
        "health": _rpc_health,
        "status": _rpc_status,
    }
    target = dispatch.get(method)
    if not target:
        not_found = {
            "jsonrpc": "2.0", "id": rpc_id,
            "error": {"code": -32601, "message": f"Method not found: {method}"},
        }
        await ws.send(json.dumps(not_found))
        return

    try:
        outcome = await target(monitor)
        await ws.send(json.dumps({"jsonrpc": "2.0", "id": rpc_id, "result": outcome}))
    except Exception as exc:
        failure = {
            "jsonrpc": "2.0", "id": rpc_id,
            "error": {"code": -32603, "message": str(exc)},
        }
        await ws.send(json.dumps(failure))
473
+
474
+
475
async def _rpc_health(monitor: HealthMonitor) -> dict:
    """Handle the ``health`` RPC: report liveness and basic counters.

    Returns a dict with a fixed ``"healthy"`` status, the number of entries
    in ``monitor.modules``, and the module's uptime in whole seconds.
    """
    return {
        "status": "healthy",
        "details": {
            "monitored_modules": len(monitor.modules),
            # _start_ts is a time.monotonic() reading, so uptime must be
            # measured on the same clock; subtracting it from time.time()
            # mixes two unrelated time bases.
            "uptime_seconds": round(time.monotonic() - _start_ts),
        },
    }
484
+
485
+
486
async def _rpc_status(monitor: HealthMonitor) -> dict:
    """Handle the ``status`` RPC by returning ``monitor.get_status()``."""
    status = monitor.get_status()
    return status
489
+
490
+
491
async def _handle_shutdown(monitor: HealthMonitor):
    """Run the shutdown sequence: acknowledge, stop the monitor, confirm, exit.

    Both events are published over the module-level WebSocket reference.
    The function does not return; it ends by raising ``SystemExit(0)``.
    """
    print("[watchdog] Received shutdown request")

    # Step 1: acknowledge the request and announce the expected cleanup time
    ack_event = {
        "event": "module.shutdown.ack",
        "data": {"module_id": "watchdog", "estimated_cleanup": 2},
    }
    await _publish_event(_ws_global, ack_event)

    # Step 2: cleanup
    monitor.stop()

    # Step 3: report that cleanup is finished
    ready_event = {
        "event": "module.shutdown.ready",
        "data": {"module_id": "watchdog"},
    }
    await _publish_event(_ws_global, ready_event)
    print("[watchdog] Shutdown ready, exiting")

    # Step 4: exit
    raise SystemExit(0)
509
+
510
+
511
+ if __name__ == "__main__":
512
+ asyncio.run(main())
513
+