@agentunion/kite 1.2.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +208 -0
- package/README.md +48 -0
- package/cli.js +1 -1
- package/extensions/agents/assistant/entry.py +30 -81
- package/extensions/agents/assistant/module.md +1 -1
- package/extensions/agents/assistant/server.py +83 -122
- package/extensions/channels/acp_channel/entry.py +30 -81
- package/extensions/channels/acp_channel/module.md +1 -1
- package/extensions/channels/acp_channel/server.py +83 -122
- package/extensions/event_hub_bench/entry.py +81 -121
- package/extensions/services/backup/entry.py +213 -85
- package/extensions/services/model_service/entry.py +213 -85
- package/extensions/services/watchdog/entry.py +513 -460
- package/extensions/services/watchdog/monitor.py +55 -69
- package/extensions/services/web/entry.py +11 -108
- package/extensions/services/web/server.py +120 -77
- package/{core/registry → kernel}/entry.py +65 -37
- package/{core/event_hub/hub.py → kernel/event_hub.py} +61 -81
- package/kernel/module.md +33 -0
- package/{core/registry/store.py → kernel/registry_store.py} +13 -4
- package/kernel/rpc_router.py +388 -0
- package/kernel/server.py +267 -0
- package/launcher/__init__.py +10 -0
- package/launcher/__main__.py +6 -0
- package/launcher/count_lines.py +258 -0
- package/{core/launcher → launcher}/entry.py +693 -767
- package/launcher/logging_setup.py +289 -0
- package/{core/launcher → launcher}/module_scanner.py +11 -6
- package/main.py +11 -350
- package/package.json +6 -9
- package/__init__.py +0 -1
- package/__main__.py +0 -15
- package/core/event_hub/BENCHMARK.md +0 -94
- package/core/event_hub/__init__.py +0 -0
- package/core/event_hub/bench.py +0 -459
- package/core/event_hub/bench_extreme.py +0 -308
- package/core/event_hub/bench_perf.py +0 -350
- package/core/event_hub/entry.py +0 -436
- package/core/event_hub/module.md +0 -20
- package/core/event_hub/server.py +0 -269
- package/core/kite_log.py +0 -241
- package/core/launcher/__init__.py +0 -0
- package/core/registry/__init__.py +0 -0
- package/core/registry/module.md +0 -30
- package/core/registry/server.py +0 -339
- package/extensions/services/backup/server.py +0 -244
- package/extensions/services/model_service/server.py +0 -236
- package/extensions/services/watchdog/server.py +0 -229
- /package/{core → kernel}/__init__.py +0 -0
- /package/{core/event_hub → kernel}/dedup.py +0 -0
- /package/{core/event_hub → kernel}/router.py +0 -0
- /package/{core/launcher → launcher}/module.md +0 -0
- /package/{core/launcher → launcher}/process_manager.py +0 -0
|
@@ -1,460 +1,513 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Watchdog entry point.
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import
|
|
8
|
-
import
|
|
9
|
-
import
|
|
10
|
-
import
|
|
11
|
-
import
|
|
12
|
-
import sys
|
|
13
|
-
import threading
|
|
14
|
-
import time
|
|
15
|
-
import traceback
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
import
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
d
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
sys.
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
"
|
|
114
|
-
"
|
|
115
|
-
"
|
|
116
|
-
"
|
|
117
|
-
"
|
|
118
|
-
"
|
|
119
|
-
"
|
|
120
|
-
"
|
|
121
|
-
"
|
|
122
|
-
"
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
tb_entries
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
"
|
|
132
|
-
"
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
os.
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
def _print_crash_summary(exc_type, exc_tb, thread_name=None):
    """Print a short, red-highlighted crash notice on the console.

    The notice names the exception type and the innermost traceback frame
    as ``file:line`` ("unknown" when no traceback frames are available).
    When ``thread_name`` is given, the thread is named in the message.  If
    a crash log path is configured, a second line points to it.  Output
    goes through the original (unwrapped) ``print``.
    """
    RED = "\033[91m"
    RESET = "\033[0m"

    # Default location; overwritten only when a frame can be extracted.
    location = "unknown"
    if exc_tb:
        frames = traceback.extract_tb(exc_tb)
        if frames:
            innermost = frames[-1]
            location = f"{os.path.basename(innermost.filename)}:{innermost.lineno}"

    prefix = f"[{MODULE_NAME}]"
    if thread_name:
        headline = f"线程 {thread_name} 崩溃: {exc_type.__name__} in {location}"
    else:
        headline = f"崩溃: {exc_type.__name__} in {location}"
    _builtin_print(f"{prefix} {RED}{headline}{RESET}")

    if _crash_log_path:
        _builtin_print(f"{prefix} 崩溃日志: {_crash_log_path}")
|
|
179
|
-
|
|
180
|
-
def _setup_exception_hooks():
|
|
181
|
-
"""Set up global exception hooks."""
|
|
182
|
-
_orig_excepthook = sys.excepthook
|
|
183
|
-
|
|
184
|
-
def _excepthook(exc_type, exc_value, exc_tb):
|
|
185
|
-
_write_crash(exc_type, exc_value, exc_tb, severity="critical", handled=False)
|
|
186
|
-
_print_crash_summary(exc_type, exc_tb)
|
|
187
|
-
_orig_excepthook(exc_type, exc_value, exc_tb)
|
|
188
|
-
|
|
189
|
-
sys.excepthook = _excepthook
|
|
190
|
-
|
|
191
|
-
if hasattr(threading, "excepthook"):
|
|
192
|
-
def _thread_excepthook(args):
|
|
193
|
-
_write_crash(args.exc_type, args.exc_value, args.exc_traceback,
|
|
194
|
-
thread_name=args.thread.name if args.thread else "unknown",
|
|
195
|
-
severity="error", handled=False)
|
|
196
|
-
_print_crash_summary(args.exc_type, args.exc_traceback,
|
|
197
|
-
thread_name=args.thread.name if args.thread else None)
|
|
198
|
-
|
|
199
|
-
threading.excepthook = _thread_excepthook
|
|
200
|
-
|
|
201
|
-
def _tprint(*args, **kwargs):
    """Print to the console and mirror the text into the log files.

    The console output is produced by the original ``print`` with the
    caller's arguments untouched (no prefix is added there).  When a log
    file path is configured, the same text -- ANSI colour sequences
    stripped -- is appended via ``_write_log`` with the prefix
    ``[elapsed] HH:MM:SS.mmm +delta``, where *elapsed* is the time since
    module start and *delta* the time since the previous call.

    ``sep`` and ``end`` follow ``print`` semantics: an explicit ``None``
    means "use the default" instead of crashing the log mirror.
    """
    global _last_ts

    def _span(seconds):
        """Format a duration: ms below 1 s, one decimal below 100 s, else whole seconds."""
        if seconds < 1:
            return f"{seconds * 1000:.0f}ms"
        if seconds < 100:
            return f"{seconds:.1f}s"
        return f"{seconds:.0f}s"

    now = time.monotonic()
    elapsed = now - _start_ts
    delta = now - _last_ts
    _last_ts = now

    elapsed_str = _span(elapsed)
    # Sub-millisecond gaps are shown as an empty delta column.
    delta_str = "" if delta < 0.001 else "+" + _span(delta)

    # Wall-clock stamp taken before printing, trimmed to milliseconds.
    ts = datetime.now().strftime("%H:%M:%S.%f")[:-3]

    _builtin_print(*args, **kwargs)

    if _log_latest_path or _log_daily_path:
        sep = kwargs.get("sep")
        if sep is None:
            sep = " "
        end = kwargs.get("end")
        if end is None:
            end = "\n"
        text = sep.join(str(a) for a in args)
        prefix = f"[{elapsed_str:>6}] {ts} {delta_str:>8} "
        _write_log(prefix + _strip_ansi(text) + end)
|
|
235
|
-
|
|
236
|
-
builtins.print = _tprint
|
|
237
|
-
|
|
238
|
-
# Ensure project root is on sys.path
|
|
239
|
-
_project_root = os.environ.get("KITE_PROJECT") or os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
|
240
|
-
if _project_root not in sys.path:
|
|
241
|
-
sys.path.insert(0, _project_root)
|
|
242
|
-
|
|
243
|
-
from extensions.services.watchdog.monitor import HealthMonitor
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
#
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
if
|
|
460
|
-
|
|
1
|
+
"""
|
|
2
|
+
Watchdog entry point.
|
|
3
|
+
Connects to Kernel via WebSocket JSON-RPC 2.0, registers, subscribes to events,
|
|
4
|
+
runs health monitor loop, handles incoming RPC requests.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import builtins
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
import re
|
|
12
|
+
import sys
|
|
13
|
+
import threading
|
|
14
|
+
import time
|
|
15
|
+
import traceback
|
|
16
|
+
import uuid
|
|
17
|
+
from datetime import datetime, timezone
|
|
18
|
+
|
|
19
|
+
import websockets
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# ── Module configuration ──
|
|
23
|
+
MODULE_NAME = "watchdog"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _fmt_elapsed(t0: float) -> str:
|
|
27
|
+
"""Format elapsed time since t0: <1s → 'NNNms', >=1s → 'N.Ns', >=10s → 'NNs'."""
|
|
28
|
+
d = time.monotonic() - t0
|
|
29
|
+
if d < 1:
|
|
30
|
+
return f"{d * 1000:.0f}ms"
|
|
31
|
+
if d < 10:
|
|
32
|
+
return f"{d:.1f}s"
|
|
33
|
+
return f"{d:.0f}s"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ── Safe stdout/stderr: ignore BrokenPipeError after Launcher closes stdio ──
|
|
37
|
+
|
|
38
|
+
class _SafeWriter:
|
|
39
|
+
"""Wraps a stream to silently swallow BrokenPipeError on write/flush."""
|
|
40
|
+
def __init__(self, stream):
|
|
41
|
+
self._stream = stream
|
|
42
|
+
|
|
43
|
+
def write(self, s):
|
|
44
|
+
try:
|
|
45
|
+
self._stream.write(s)
|
|
46
|
+
except (BrokenPipeError, OSError):
|
|
47
|
+
pass
|
|
48
|
+
|
|
49
|
+
def flush(self):
|
|
50
|
+
try:
|
|
51
|
+
self._stream.flush()
|
|
52
|
+
except (BrokenPipeError, OSError):
|
|
53
|
+
pass
|
|
54
|
+
|
|
55
|
+
def __getattr__(self, name):
|
|
56
|
+
return getattr(self._stream, name)
|
|
57
|
+
|
|
58
|
+
sys.stdout = _SafeWriter(sys.stdout)
|
|
59
|
+
sys.stderr = _SafeWriter(sys.stderr)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# ── Timestamped print + log file writer ──
|
|
63
|
+
|
|
64
|
+
_builtin_print = builtins.print
|
|
65
|
+
_start_ts = time.monotonic()
|
|
66
|
+
_last_ts = time.monotonic()
|
|
67
|
+
_ANSI_RE = re.compile(r"\033\[[0-9;]*m")
|
|
68
|
+
_log_lock = threading.Lock()
|
|
69
|
+
_log_latest_path = None
|
|
70
|
+
_log_daily_path = None
|
|
71
|
+
_log_daily_date = ""
|
|
72
|
+
_log_dir = None
|
|
73
|
+
_crash_log_path = None
|
|
74
|
+
|
|
75
|
+
def _strip_ansi(s: str) -> str:
    """Return ``s`` with every match of the module's ANSI pattern removed."""
    return re.sub(_ANSI_RE, "", s)
|
|
77
|
+
|
|
78
|
+
def _resolve_daily_log_path():
    """Point the daily log at ``<log dir>/<YYYY-MM>/<YYYY-MM-DD>.log`` for today.

    Does nothing when no log directory is configured, or when the cached
    path was already computed for today's (local) date.  Otherwise the
    month directory is created if needed and the module-level
    ``_log_daily_path`` / ``_log_daily_date`` are updated.
    """
    global _log_daily_path, _log_daily_date
    if not _log_dir:
        return

    stamp = datetime.now().strftime("%Y-%m-%d")
    cache_is_current = bool(_log_daily_path) and stamp == _log_daily_date
    if cache_is_current:
        return

    # First seven characters of YYYY-MM-DD are the YYYY-MM month folder.
    month_folder = os.path.join(_log_dir, stamp[:7])
    os.makedirs(month_folder, exist_ok=True)
    _log_daily_path = os.path.join(month_folder, f"{stamp}.log")
    _log_daily_date = stamp
|
|
90
|
+
|
|
91
|
+
def _write_log(plain_line: str):
    """Append ``plain_line`` to the latest log and to today's daily log.

    Both writes happen under the module's log lock.  The daily path is
    re-resolved between the two writes so a date change is picked up.
    A failure to write either file is ignored (logging is best-effort).
    """

    def _append(path):
        # Best-effort append: a failing log write must not break the caller.
        try:
            with open(path, "a", encoding="utf-8") as handle:
                handle.write(plain_line)
        except Exception:
            pass

    with _log_lock:
        if _log_latest_path:
            _append(_log_latest_path)
        _resolve_daily_log_path()
        if _log_daily_path:
            _append(_log_daily_path)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _write_crash(exc_type, exc_value, exc_tb, thread_name=None, severity="critical", handled=False):
    """Append one JSON crash record to the crash log and the daily archive.

    The record holds a UTC timestamp, module name, thread name (the current
    thread's when ``thread_name`` is falsy), exception type and message,
    the formatted traceback, ``severity``, ``handled``, process id,
    platform and Python version.  When the traceback has frames, a
    ``context`` entry describes the innermost one (function, file basename,
    line).  The line is appended to the configured crash log, if any, and
    to ``<log dir>/crashes/<YYYY-MM>/<YYYY-MM-DD>.jsonl`` when a log
    directory is configured.  Write failures are ignored.
    """
    record = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "module": MODULE_NAME,
        "thread": thread_name or threading.current_thread().name,
        "exception_type": exc_type.__name__ if exc_type else "Unknown",
        "exception_message": str(exc_value),
        "traceback": "".join(traceback.format_exception(exc_type, exc_value, exc_tb)),
        "severity": severity,
        "handled": handled,
        "process_id": os.getpid(),
        "platform": sys.platform,
        "runtime_version": f"Python {sys.version.split()[0]}",
    }

    frames = traceback.extract_tb(exc_tb) if exc_tb else None
    if frames:
        innermost = frames[-1]
        record["context"] = {
            "function": innermost.name,
            "file": os.path.basename(innermost.filename),
            "line": innermost.lineno,
        }

    serialized = json.dumps(record, ensure_ascii=False) + "\n"

    def _append(path):
        with open(path, "a", encoding="utf-8") as handle:
            handle.write(serialized)

    if _crash_log_path:
        try:
            _append(_crash_log_path)
        except Exception:
            pass

    if not _log_dir:
        return
    try:
        # The archive is bucketed by local date, unlike the UTC timestamp above.
        day = datetime.now().strftime("%Y-%m-%d")
        month_dir = os.path.join(_log_dir, "crashes", day[:7])
        os.makedirs(month_dir, exist_ok=True)
        _append(os.path.join(month_dir, f"{day}.jsonl"))
    except Exception:
        pass
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _print_crash_summary(exc_type, exc_tb, thread_name=None):
    """Print a short, red-highlighted crash notice on the console.

    The notice names the exception type and the innermost traceback frame
    as ``file:line`` ("unknown" when no traceback frames are available).
    When ``thread_name`` is given, the thread is named in the message.  If
    a crash log path is configured, a second line points to it.  Output
    goes through the original (unwrapped) ``print``.
    """
    RED = "\033[91m"
    RESET = "\033[0m"

    # Default location; overwritten only when a frame can be extracted.
    location = "unknown"
    if exc_tb:
        frames = traceback.extract_tb(exc_tb)
        if frames:
            innermost = frames[-1]
            location = f"{os.path.basename(innermost.filename)}:{innermost.lineno}"

    prefix = f"[{MODULE_NAME}]"
    if thread_name:
        headline = f"线程 {thread_name} 崩溃: {exc_type.__name__} in {location}"
    else:
        headline = f"崩溃: {exc_type.__name__} in {location}"
    _builtin_print(f"{prefix} {RED}{headline}{RESET}")

    if _crash_log_path:
        _builtin_print(f"{prefix} 崩溃日志: {_crash_log_path}")
|
|
179
|
+
|
|
180
|
+
def _setup_exception_hooks():
|
|
181
|
+
"""Set up global exception hooks."""
|
|
182
|
+
_orig_excepthook = sys.excepthook
|
|
183
|
+
|
|
184
|
+
def _excepthook(exc_type, exc_value, exc_tb):
|
|
185
|
+
_write_crash(exc_type, exc_value, exc_tb, severity="critical", handled=False)
|
|
186
|
+
_print_crash_summary(exc_type, exc_tb)
|
|
187
|
+
_orig_excepthook(exc_type, exc_value, exc_tb)
|
|
188
|
+
|
|
189
|
+
sys.excepthook = _excepthook
|
|
190
|
+
|
|
191
|
+
if hasattr(threading, "excepthook"):
|
|
192
|
+
def _thread_excepthook(args):
|
|
193
|
+
_write_crash(args.exc_type, args.exc_value, args.exc_traceback,
|
|
194
|
+
thread_name=args.thread.name if args.thread else "unknown",
|
|
195
|
+
severity="error", handled=False)
|
|
196
|
+
_print_crash_summary(args.exc_type, args.exc_traceback,
|
|
197
|
+
thread_name=args.thread.name if args.thread else None)
|
|
198
|
+
|
|
199
|
+
threading.excepthook = _thread_excepthook
|
|
200
|
+
|
|
201
|
+
def _tprint(*args, **kwargs):
    """Print to the console and mirror the text into the log files.

    The console output is produced by the original ``print`` with the
    caller's arguments untouched (no prefix is added there).  When a log
    file path is configured, the same text -- ANSI colour sequences
    stripped -- is appended via ``_write_log`` with the prefix
    ``[elapsed] HH:MM:SS.mmm +delta``, where *elapsed* is the time since
    module start and *delta* the time since the previous call.

    ``sep`` and ``end`` follow ``print`` semantics: an explicit ``None``
    means "use the default" instead of crashing the log mirror.
    """
    global _last_ts

    def _span(seconds):
        """Format a duration: ms below 1 s, one decimal below 100 s, else whole seconds."""
        if seconds < 1:
            return f"{seconds * 1000:.0f}ms"
        if seconds < 100:
            return f"{seconds:.1f}s"
        return f"{seconds:.0f}s"

    now = time.monotonic()
    elapsed = now - _start_ts
    delta = now - _last_ts
    _last_ts = now

    elapsed_str = _span(elapsed)
    # Sub-millisecond gaps are shown as an empty delta column.
    delta_str = "" if delta < 0.001 else "+" + _span(delta)

    # Wall-clock stamp taken before printing, trimmed to milliseconds.
    ts = datetime.now().strftime("%H:%M:%S.%f")[:-3]

    _builtin_print(*args, **kwargs)

    if _log_latest_path or _log_daily_path:
        sep = kwargs.get("sep")
        if sep is None:
            sep = " "
        end = kwargs.get("end")
        if end is None:
            end = "\n"
        text = sep.join(str(a) for a in args)
        prefix = f"[{elapsed_str:>6}] {ts} {delta_str:>8} "
        _write_log(prefix + _strip_ansi(text) + end)
|
|
235
|
+
|
|
236
|
+
builtins.print = _tprint
|
|
237
|
+
|
|
238
|
+
# Ensure project root is on sys.path
|
|
239
|
+
_project_root = os.environ.get("KITE_PROJECT") or os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
|
240
|
+
if _project_root not in sys.path:
|
|
241
|
+
sys.path.insert(0, _project_root)
|
|
242
|
+
|
|
243
|
+
from extensions.services.watchdog.monitor import HealthMonitor
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _read_stdin_kite_message(expected_type: str, timeout: float = 10) -> dict | None:
|
|
247
|
+
"""Read a single kite message of expected type from stdin with timeout."""
|
|
248
|
+
result = [None]
|
|
249
|
+
|
|
250
|
+
def _read():
|
|
251
|
+
try:
|
|
252
|
+
line = sys.stdin.readline().strip()
|
|
253
|
+
if line:
|
|
254
|
+
msg = json.loads(line)
|
|
255
|
+
if isinstance(msg, dict) and msg.get("kite") == expected_type:
|
|
256
|
+
result[0] = msg
|
|
257
|
+
except Exception:
|
|
258
|
+
pass
|
|
259
|
+
|
|
260
|
+
t = threading.Thread(target=_read, daemon=True)
|
|
261
|
+
t.start()
|
|
262
|
+
t.join(timeout=timeout)
|
|
263
|
+
return result[0]
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
# Global WS reference for publish_event callback
|
|
267
|
+
_ws_global = None
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
async def main():
    """Run the watchdog service until the Kernel connection ends.

    Steps, in order:
      1. When ``KITE_MODULE_DATA`` is set, prepare ``<data>/log`` and start
         empty ``latest{suffix}.log`` / ``crashes{suffix}.jsonl`` files
         (suffix from ``KITE_INSTANCE_SUFFIX``), then resolve the daily log.
      2. Install the global exception hooks.
      3. Read the boot-info JSON line from stdin and take its ``token``.
      4. Take the kernel port from ``KITE_KERNEL_PORT`` or, failing that,
         from a ``kernel_port`` kite message on stdin (10 s timeout).
      5. Connect to the Kernel WebSocket, subscribe to lifecycle events,
         register, publish ``module.ready``, start the monitor task and
         dispatch incoming messages.

    Exits the process with status 1 when the token or port is missing, or
    when the connection/session raises (the error is first recorded as a
    handled crash).
    """
    global _ws_global
    global _log_dir, _log_latest_path, _crash_log_path

    def _parse_port(value) -> int:
        """Convert a port value to int; missing or malformed values become 0."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    # Initialize log file paths
    module_data = os.environ.get("KITE_MODULE_DATA")
    if module_data:
        _log_dir = os.path.join(module_data, "log")
        os.makedirs(_log_dir, exist_ok=True)
        suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")

        # Opening with "w" truncates: each run starts with an empty latest log.
        _log_latest_path = os.path.join(_log_dir, f"latest{suffix}.log")
        try:
            with open(_log_latest_path, "w", encoding="utf-8"):
                pass
        except Exception:
            _log_latest_path = None

        # Same for the per-run crash file (the dated archive is append-only).
        _crash_log_path = os.path.join(_log_dir, f"crashes{suffix}.jsonl")
        try:
            with open(_crash_log_path, "w", encoding="utf-8"):
                pass
        except Exception:
            _crash_log_path = None

        _resolve_daily_log_path()

    _setup_exception_hooks()

    _t0 = time.monotonic()

    # Read boot_info from stdin (only token)
    token = ""
    try:
        line = sys.stdin.readline().strip()
        if line:
            boot_info = json.loads(line)
            token = boot_info.get("token", "")
    except Exception:
        pass

    # Read kernel_port: env first (fast path), stdin fallback (parallel start).
    # A malformed value is treated as "not provided" so the fallback still runs.
    kernel_port = _parse_port(os.environ.get("KITE_KERNEL_PORT", "0"))
    if not kernel_port:
        msg = _read_stdin_kite_message("kernel_port", timeout=10)
        if msg:
            kernel_port = _parse_port(msg.get("kernel_port", 0))

    if not token or not kernel_port:
        print("[watchdog] ERROR: Missing token or kernel_port")
        sys.exit(1)

    print(f"[watchdog] Token received ({len(token)} chars), kernel port: {kernel_port} ({_fmt_elapsed(_t0)})")

    # Connect to Kernel WebSocket
    ws_url = f"ws://127.0.0.1:{kernel_port}/ws?token={token}&id=watchdog"
    # The token is a credential: never write it to the console or log files.
    print(f"[watchdog] Connecting to Kernel: ws://127.0.0.1:{kernel_port}/ws?token=***&id=watchdog")

    # Lifecycle events used both for the subscription and the registration.
    lifecycle_events = [
        "system.ready",
        "module.started",
        "module.stopped",
        "module.exiting",
        "module.ready",
        "module.shutdown",
    ]

    try:
        async with websockets.connect(ws_url, open_timeout=5, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
            _ws_global = ws
            print(f"[watchdog] Connected to Kernel ({_fmt_elapsed(_t0)})")

            # Subscribe to events
            await _rpc_call(ws, "event.subscribe", {"events": lifecycle_events})
            print(f"[watchdog] Subscribed to events ({_fmt_elapsed(_t0)})")

            # Register to Kernel Registry via RPC
            await _rpc_call(ws, "registry.register", {
                "module_id": "watchdog",
                "module_type": "service",
                "events_publish": {
                    "watchdog.module.unhealthy": {},
                    "watchdog.module.recovered": {},
                    "watchdog.alert": {},
                },
                "events_subscribe": lifecycle_events,
            })
            print(f"[watchdog] Registered to Kernel ({_fmt_elapsed(_t0)})")

            # Create monitor with RPC callback
            monitor = HealthMonitor(
                own_token=token,
                kernel_port=kernel_port,
            )

            # The event loop keeps only weak references to tasks; hold strong
            # ones here so a publish cannot be garbage-collected mid-flight.
            background_tasks = set()

            def _publish_in_background(event):
                task = asyncio.create_task(_publish_event(ws, event))
                background_tasks.add(task)
                task.add_done_callback(background_tasks.discard)
                return task

            monitor.publish_event = _publish_in_background
            monitor.rpc_call = lambda method, params: _rpc_call(ws, method, params)

            # Publish module.ready
            await _rpc_call(ws, "event.publish", {
                "event_id": str(uuid.uuid4()),
                "event": "module.ready",
                "data": {
                    "module_id": "watchdog",
                    "graceful_shutdown": True,
                },
            })
            print(f"[watchdog] module.ready published ({_fmt_elapsed(_t0)})")

            # Start monitor loop in background (local variable keeps it referenced)
            monitor_task = asyncio.create_task(monitor.run())

            # Message loop: handle incoming RPC + events
            async for raw in ws:
                try:
                    msg = json.loads(raw)
                except (json.JSONDecodeError, TypeError):
                    continue

                try:
                    has_method = "method" in msg
                    has_id = "id" in msg

                    if has_method and not has_id:
                        # Event Notification
                        await _handle_event_notification(msg, monitor)
                    elif has_method and has_id:
                        # Incoming RPC request
                        await _handle_rpc_request(ws, msg, monitor)
                    # Ignore RPC responses (we don't await them in this simple impl)
                except Exception as e:
                    # One bad message must not end the session.
                    print(f"[watchdog] 消息处理异常(已忽略): {e}")

    except Exception as e:
        _write_crash(type(e), e, e.__traceback__, severity="critical", handled=True)
        _print_crash_summary(type(e), e.__traceback__)
        sys.exit(1)
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
async def _rpc_call(ws, method: str, params: dict = None):
|
|
417
|
+
"""Send a JSON-RPC 2.0 request (fire-and-forget, no response awaited)."""
|
|
418
|
+
msg = {"jsonrpc": "2.0", "id": str(uuid.uuid4()), "method": method}
|
|
419
|
+
if params:
|
|
420
|
+
msg["params"] = params
|
|
421
|
+
await ws.send(json.dumps(msg))
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
async def _publish_event(ws, event: dict):
    """Publish ``event`` to the Kernel through the ``event.publish`` RPC.

    The outgoing payload gets a fresh UUID ``event_id``; ``event`` and
    ``data`` are taken from the given dict, defaulting to ``""`` and ``{}``.
    """
    payload = {
        "event_id": str(uuid.uuid4()),
        "event": event.get("event", ""),
        "data": event.get("data", {}),
    }
    await _rpc_call(ws, "event.publish", payload)
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
async def _handle_event_notification(msg: dict, monitor: HealthMonitor):
    """Dispatch one incoming event notification (a message with no ``id``).

    A ``module.shutdown`` event whose data names ``watchdog`` starts the
    shutdown sequence; every other event is passed to the monitor as the
    full original message.
    """
    params = msg.get("params", {})
    event_type = params.get("event", "")
    data = params.get("data", {})

    targets_watchdog = event_type == "module.shutdown" and data.get("module_id") == "watchdog"
    if targets_watchdog:
        await _handle_shutdown(monitor)
    else:
        # Forward the whole message, not just its params.
        await monitor.handle_event(msg)
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
async def _handle_rpc_request(ws, msg: dict, monitor: HealthMonitor):
    """Answer one incoming JSON-RPC request on ``ws``.

    Known methods are ``health`` and ``status``.  The handler's result is
    sent back as a JSON-RPC result; if the handler or the send raises, an
    error with code -32603 is sent instead.  Unknown methods get error
    code -32601.
    """
    rpc_id = msg.get("id", "")
    method = msg.get("method", "")
    params = msg.get("params", {})

    # NOTE(review): keys are the bare names; presumably the Kernel strips a
    # "watchdog." prefix before forwarding -- confirm against the router.
    dispatch = {
        "health": lambda _params: _rpc_health(monitor),
        "status": lambda _params: _rpc_status(monitor),
    }
    handler = dispatch.get(method)

    if not handler:
        not_found = {
            "jsonrpc": "2.0", "id": rpc_id,
            "error": {"code": -32601, "message": f"Method not found: {method}"},
        }
        await ws.send(json.dumps(not_found))
        return

    try:
        result = await handler(params)
        await ws.send(json.dumps({"jsonrpc": "2.0", "id": rpc_id, "result": result}))
    except Exception as e:
        failure = {
            "jsonrpc": "2.0", "id": rpc_id,
            "error": {"code": -32603, "message": str(e)},
        }
        await ws.send(json.dumps(failure))
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
async def _rpc_health(monitor: HealthMonitor) -> dict:
    """RPC handler for the ``health`` method.

    Returns a fixed ``"healthy"`` status plus the number of entries in
    ``monitor.modules`` and the module's uptime in whole seconds.
    """
    # _start_ts is a time.monotonic() reading, so the uptime must be taken
    # against the same clock; subtracting it from time.time() is meaningless.
    uptime = time.monotonic() - _start_ts
    return {
        "status": "healthy",
        "details": {
            "monitored_modules": len(monitor.modules),
            "uptime_seconds": round(uptime),
        },
    }
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
async def _rpc_status(monitor: HealthMonitor) -> dict:
    """RPC handler for the ``status`` method: return the monitor's own status report."""
    status = monitor.get_status()
    return status
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
async def _handle_shutdown(monitor: HealthMonitor):
    """Run the shutdown handshake, then terminate the process.

    Sequence: publish ``module.shutdown.ack`` (with an estimated cleanup
    value of 2), stop the monitor, publish ``module.shutdown.ready``, and
    exit with status 0.  Events go out on the module-level WebSocket.
    """
    print("[watchdog] Received shutdown request")

    # Step 1: acknowledge the request
    ack = {
        "event": "module.shutdown.ack",
        "data": {"module_id": "watchdog", "estimated_cleanup": 2},
    }
    await _publish_event(_ws_global, ack)

    # Step 2: cleanup
    monitor.stop()

    # Step 3: announce that cleanup is done
    ready = {
        "event": "module.shutdown.ready",
        "data": {"module_id": "watchdog"},
    }
    await _publish_event(_ws_global, ready)
    print("[watchdog] Shutdown ready, exiting")

    # Step 4: exit (SystemExit is not an Exception, so it passes through
    # ``except Exception`` handlers in the callers)
    raise SystemExit(0)
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
if __name__ == "__main__":
|
|
512
|
+
asyncio.run(main())
|
|
513
|
+
|