@agentunion/kite 1.3.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +287 -1
- package/cli.js +76 -0
- package/extensions/agents/assistant/entry.py +111 -1
- package/extensions/agents/assistant/server.py +263 -197
- package/extensions/channels/acp_channel/entry.py +111 -1
- package/extensions/channels/acp_channel/module.md +23 -22
- package/extensions/channels/acp_channel/server.py +263 -197
- package/extensions/event_hub_bench/entry.py +107 -1
- package/extensions/services/backup/entry.py +408 -72
- package/extensions/services/backup/module.md +24 -22
- package/extensions/services/model_service/entry.py +255 -71
- package/extensions/services/model_service/module.md +21 -22
- package/extensions/services/watchdog/entry.py +344 -90
- package/extensions/services/watchdog/monitor.py +237 -21
- package/extensions/services/web/WEBSOCKET_STATUS.md +143 -0
- package/extensions/services/web/config_example.py +35 -0
- package/extensions/services/web/config_loader.py +110 -0
- package/extensions/services/web/entry.py +114 -26
- package/extensions/services/web/module.md +35 -24
- package/extensions/services/web/pairing.py +250 -0
- package/extensions/services/web/pairing_codes.jsonl +16 -0
- package/extensions/services/web/relay.py +643 -0
- package/extensions/services/web/relay_config.json5 +67 -0
- package/extensions/services/web/routes/routes_management_ws.py +127 -0
- package/extensions/services/web/routes/routes_rpc.py +89 -0
- package/extensions/services/web/routes/routes_test.py +61 -0
- package/extensions/services/web/server.py +445 -99
- package/extensions/services/web/static/css/style.css +138 -2
- package/extensions/services/web/static/index.html +295 -2
- package/extensions/services/web/static/js/app.js +1579 -5
- package/extensions/services/web/static/js/kernel-client-example.js +161 -0
- package/extensions/services/web/static/js/kernel-client.js +383 -0
- package/extensions/services/web/static/js/registry-tests.js +558 -0
- package/extensions/services/web/static/js/token-manager.js +175 -0
- package/extensions/services/web/static/pairing.html +248 -0
- package/extensions/services/web/static/test_registry.html +262 -0
- package/extensions/services/web/web_config.json5 +29 -0
- package/kernel/entry.py +120 -32
- package/kernel/event_hub.py +159 -16
- package/kernel/module.md +36 -33
- package/kernel/registry_store.py +70 -20
- package/kernel/rpc_router.py +134 -57
- package/kernel/server.py +292 -15
- package/kite_cli/__init__.py +3 -0
- package/kite_cli/__main__.py +5 -0
- package/kite_cli/commands/__init__.py +1 -0
- package/kite_cli/commands/clean.py +101 -0
- package/kite_cli/commands/doctor.py +35 -0
- package/kite_cli/commands/history.py +111 -0
- package/kite_cli/commands/info.py +96 -0
- package/kite_cli/commands/install.py +313 -0
- package/kite_cli/commands/list.py +143 -0
- package/kite_cli/commands/log.py +81 -0
- package/kite_cli/commands/rollback.py +88 -0
- package/kite_cli/commands/search.py +73 -0
- package/kite_cli/commands/uninstall.py +85 -0
- package/kite_cli/commands/update.py +118 -0
- package/kite_cli/core/__init__.py +1 -0
- package/kite_cli/core/checker.py +142 -0
- package/kite_cli/core/dependency.py +229 -0
- package/kite_cli/core/downloader.py +209 -0
- package/kite_cli/core/install_info.py +40 -0
- package/kite_cli/core/tool_installer.py +397 -0
- package/kite_cli/core/validator.py +78 -0
- package/kite_cli/main.py +289 -0
- package/kite_cli/utils/__init__.py +1 -0
- package/kite_cli/utils/i18n.py +252 -0
- package/kite_cli/utils/interactive.py +63 -0
- package/kite_cli/utils/operation_log.py +77 -0
- package/kite_cli/utils/paths.py +34 -0
- package/kite_cli/utils/version.py +308 -0
- package/launcher/count_lines.py +34 -0
- package/launcher/entry.py +905 -166
- package/launcher/logging_setup.py +104 -0
- package/launcher/module.md +37 -37
- package/launcher/process_manager.py +12 -1
- package/package.json +2 -1
- package/scripts/plan_manager.py +315 -0
|
@@ -20,7 +20,113 @@ import websockets
|
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
# ── Module configuration ──
|
|
23
|
-
|
|
23
|
+
|
|
24
|
+
def _load_module_config() -> dict:
    """Load this module's configuration from the YAML frontmatter of module.md.

    module.md is looked up next to this source file. Its frontmatter (the text
    between the leading ``---`` markers) must be a YAML mapping with a
    non-empty ``name`` made only of letters, digits, ``_`` and ``-``.

    Returns:
        Dict with keys ``name`` (str), ``preferred_port`` (int, 0 when absent
        or not convertible to int) and ``advertise_ip`` (str, "0.0.0.0" when
        absent).

    Raises:
        SystemExit: With code 1, after printing a diagnostic, when module.md
            is missing or unreadable, has no frontmatter, cannot be parsed,
            or carries a missing/empty/non-compliant name.
    """
    import re  # function-scope import, as in the original block

    this_dir = os.path.dirname(os.path.abspath(__file__))
    module_md = os.path.join(this_dir, "module.md")

    # Diagnostics show the module directory relative to KITE_PROJECT, but only
    # when the module really lives inside it. The check is done on a path
    # component boundary so that a sibling such as "/srv/kite-old" is not
    # mistaken for a child of "/srv/kite".
    rel_path = this_dir
    project_root = os.environ.get("KITE_PROJECT", "")
    if project_root:
        root = os.path.normcase(os.path.abspath(project_root))
        here = os.path.normcase(this_dir)
        if here == root or here.startswith(root.rstrip(os.sep) + os.sep):
            rel_path = os.path.relpath(this_dir, project_root)

    def _fail(reason):
        """Print the standard three-line configuration error and exit(1)."""
        print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
        print(f" Path: {rel_path}/module.md")
        print(f" Reason: {reason}")
        sys.exit(1)

    try:
        with open(module_md, encoding="utf-8") as f:
            text = f.read()
    except FileNotFoundError:
        _fail("File not found")
    except (OSError, UnicodeError) as e:
        print(f"[{rel_path}] ERROR: Failed to read module.md: {e}")
        sys.exit(1)

    # Extract YAML frontmatter (between --- markers)
    m = re.match(r'^---\s*\n(.*?)\n---', text, re.DOTALL)
    if not m:
        _fail("Missing YAML frontmatter")

    try:
        import yaml
        fm = yaml.safe_load(m.group(1)) or {}
    except ImportError:
        print(f"[{rel_path}] ERROR: PyYAML not installed, cannot parse module.md")
        sys.exit(1)
    except Exception as e:
        _fail(f"YAML parse error: {e}")

    # safe_load may legally return a list or a scalar; only a mapping can
    # carry the fields read below.
    if not isinstance(fm, dict):
        _fail(f"Frontmatter must be a mapping, got {type(fm).__name__}")

    # Validate 'name' field (required)
    if "name" not in fm:
        _fail("Missing 'name' field")

    # A bare "name:" parses as None; str(None) would otherwise become the
    # perfectly valid-looking module name "None".
    name_value = fm["name"]
    raw_name = "" if name_value is None else str(name_value).strip()
    if not raw_name:
        _fail("Empty module name")

    invalid_chars = "".join(sorted(set(re.findall(r'[^a-zA-Z0-9_\-]', raw_name))))
    if invalid_chars:
        _fail(f"Invalid characters in name '{raw_name}': {invalid_chars!r}")

    result = {
        "name": raw_name,
        "preferred_port": 0,
        "advertise_ip": "0.0.0.0",
    }

    # Optional fields
    if "preferred_port" in fm:
        try:
            result["preferred_port"] = int(fm["preferred_port"])
        except (ValueError, TypeError):
            # Deliberate best-effort: an unusable port keeps the default 0.
            pass

    if "advertise_ip" in fm:
        result["advertise_ip"] = str(fm["advertise_ip"])

    return result
|
|
127
|
+
|
|
128
|
+
_module_config = _load_module_config()
|
|
129
|
+
MODULE_NAME = _module_config["name"]
|
|
24
130
|
|
|
25
131
|
|
|
26
132
|
def _fmt_elapsed(t0: float) -> str:
|
|
@@ -265,11 +371,26 @@ def _read_stdin_kite_message(expected_type: str, timeout: float = 10) -> dict |
|
|
|
265
371
|
|
|
266
372
|
# Global WS reference for publish_event callback
|
|
267
373
|
_ws_global = None
|
|
374
|
+
_shutting_down = False
|
|
375
|
+
_exit_code = 0 # Exit code for main() to use
|
|
376
|
+
_monitor = None
|
|
377
|
+
_monitor_task = None
|
|
378
|
+
|
|
379
|
+
# RPC request-response infrastructure
|
|
380
|
+
_rpc_waiters: dict[str, asyncio.Event] = {} # rpc_id -> Event
|
|
381
|
+
_rpc_results: dict[str, dict] = {} # rpc_id -> response dict
|
|
268
382
|
|
|
269
383
|
|
|
384
|
+
def _is_auth_failure(e: Exception) -> bool:
    """Return True when *e* carries a received close frame with code 4001 or 4003.

    The exception is inspected for an ``rcvd`` attribute and, on it, a ``code``
    attribute; a missing ``rcvd``, a ``None`` ``rcvd`` or a missing ``code``
    all yield False.
    """
    close_frame = getattr(e, 'rcvd', None)
    if close_frame is None:
        return False
    return getattr(close_frame, 'code', 0) in (4001, 4003)
|
|
390
|
+
|
|
270
391
|
|
|
271
392
|
async def main():
|
|
272
|
-
global _ws_global
|
|
393
|
+
global _ws_global, _shutting_down, _monitor
|
|
273
394
|
# Initialize log file paths
|
|
274
395
|
global _log_dir, _log_latest_path, _crash_log_path
|
|
275
396
|
module_data = os.environ.get("KITE_MODULE_DATA")
|
|
@@ -321,57 +442,128 @@ async def main():
|
|
|
321
442
|
|
|
322
443
|
print(f"[watchdog] Token received ({len(token)} chars), kernel port: {kernel_port} ({_fmt_elapsed(_t0)})")
|
|
323
444
|
|
|
324
|
-
#
|
|
325
|
-
|
|
326
|
-
|
|
445
|
+
# Create monitor (once, persists across reconnects)
|
|
446
|
+
_monitor = HealthMonitor(
|
|
447
|
+
own_token=token,
|
|
448
|
+
kernel_port=kernel_port,
|
|
449
|
+
)
|
|
327
450
|
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
_ws_global = ws
|
|
331
|
-
print(f"[watchdog] Connected to Kernel ({_fmt_elapsed(_t0)})")
|
|
332
|
-
|
|
333
|
-
# Subscribe to events
|
|
334
|
-
await _rpc_call(ws, "event.subscribe", {
|
|
335
|
-
"events": [
|
|
336
|
-
"system.ready",
|
|
337
|
-
"module.started",
|
|
338
|
-
"module.stopped",
|
|
339
|
-
"module.exiting",
|
|
340
|
-
"module.ready",
|
|
341
|
-
"module.shutdown",
|
|
342
|
-
],
|
|
343
|
-
})
|
|
344
|
-
print(f"[watchdog] Subscribed to events ({_fmt_elapsed(_t0)})")
|
|
345
|
-
|
|
346
|
-
# Register to Kernel Registry via RPC
|
|
347
|
-
await _rpc_call(ws, "registry.register", {
|
|
348
|
-
"module_id": "watchdog",
|
|
349
|
-
"module_type": "service",
|
|
350
|
-
"events_publish": {
|
|
351
|
-
"watchdog.module.unhealthy": {},
|
|
352
|
-
"watchdog.module.recovered": {},
|
|
353
|
-
"watchdog.alert": {},
|
|
354
|
-
},
|
|
355
|
-
"events_subscribe": [
|
|
356
|
-
"system.ready",
|
|
357
|
-
"module.started",
|
|
358
|
-
"module.stopped",
|
|
359
|
-
"module.exiting",
|
|
360
|
-
"module.ready",
|
|
361
|
-
"module.shutdown",
|
|
362
|
-
],
|
|
363
|
-
})
|
|
364
|
-
print(f"[watchdog] Registered to Kernel ({_fmt_elapsed(_t0)})")
|
|
451
|
+
# Start reconnect loop
|
|
452
|
+
await _ws_loop(token, kernel_port, _t0)
|
|
365
453
|
|
|
366
|
-
# Create monitor with RPC callback
|
|
367
|
-
monitor = HealthMonitor(
|
|
368
|
-
own_token=token,
|
|
369
|
-
kernel_port=kernel_port,
|
|
370
|
-
)
|
|
371
|
-
monitor.publish_event = lambda event: asyncio.create_task(_publish_event(ws, event))
|
|
372
|
-
monitor.rpc_call = lambda method, params: _rpc_call(ws, method, params)
|
|
373
454
|
|
|
374
|
-
|
|
455
|
+
async def _ws_loop(token: str, kernel_port: int, _t0: float):
    """Keep a Kernel session alive, reconnecting with exponential backoff.

    Runs ``_ws_connect`` repeatedly until ``_shutting_down`` is set. The delay
    between attempts starts at 0.3 s and doubles up to 5 s. The loop gives up
    (sets ``_exit_code = 1`` and ``_shutting_down = True``) on an
    authentication failure or after 10 consecutive failures.

    Args:
        token: Token passed through to ``_ws_connect``.
        kernel_port: Kernel port passed through to ``_ws_connect``; also used
            in the diagnostic hint.
        _t0: Start timestamp passed through to ``_ws_connect``.
    """
    global _shutting_down, _exit_code
    initial_delay = 0.3
    max_delay = 5.0
    max_retries = 10
    retry_delay = initial_delay
    attempt = 0
    clock = asyncio.get_running_loop().time
    while not _shutting_down:
        started = clock()
        try:
            await _ws_connect(token, kernel_port, _t0)
            retry_delay = initial_delay
            attempt = 0
        except asyncio.CancelledError:
            return
        except Exception as e:
            # _ws_connect publishes the socket in _ws_global once connected and
            # this loop clears it after every iteration, so a non-None value
            # means this session was established before it failed. If it also
            # stayed up longer than the maximum backoff it was a healthy
            # session that later dropped: earlier failures are no longer
            # "consecutive" and must not count towards max_retries.
            if _ws_global is not None and clock() - started >= max_delay:
                retry_delay = initial_delay
                attempt = 0
            attempt += 1
            if _is_auth_failure(e):
                print(f"[watchdog] Kernel 认证失败,退出")
                _exit_code = 1
                _shutting_down = True
                return
            if attempt >= max_retries:
                print(f"[watchdog] 重连失败 {max_retries} 次,退出")
                _exit_code = 1
                _shutting_down = True
                return
            _write_crash(type(e), e, e.__traceback__, severity="error", handled=True)
            print(f"[watchdog] 连接错误: {e}, {retry_delay:.1f}s 后重试 ({attempt}/{max_retries})")
            if attempt == 5:
                # One-off hint, printed in yellow (the ANSI colour is opened by
                # the first line and reset by the last).
                print(f"\033[33m[watchdog] 提示: 已连续 {attempt} 次无法连接 Kernel (端口 {kernel_port})")
                if kernel_port < 1024:
                    print(f"[watchdog] ⚠ 端口 {kernel_port} 异常偏低,可能是 Kernel 端口绑定失败或配置错误")
                print(f"[watchdog] 请检查: 1) Kernel 进程是否存活 2) kernel/module.md 中 preferred_port 配置是否正确\033[0m")
        _ws_global_clear()
        if _shutting_down:
            return
        await asyncio.sleep(retry_delay)
        retry_delay = min(retry_delay * 2, max_delay)
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
def _ws_global_clear():
    """Reset the module-level ``_ws_global`` WebSocket reference to None."""
    global _ws_global
    _ws_global = None
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
async def _ws_connect(token: str, kernel_port: int, _t0: float):
|
|
501
|
+
"""Single WebSocket session: connect → subscribe → register → ready → receive loop."""
|
|
502
|
+
global _ws_global, _monitor, _monitor_task
|
|
503
|
+
|
|
504
|
+
ws_url = f"ws://127.0.0.1:{kernel_port}/ws?token={token}&id=watchdog"
|
|
505
|
+
print(f"[watchdog] Connecting to Kernel: {ws_url}")
|
|
506
|
+
|
|
507
|
+
async with websockets.connect(ws_url, open_timeout=5, ping_interval=20, ping_timeout=20, close_timeout=10) as ws:
|
|
508
|
+
_ws_global = ws
|
|
509
|
+
print(f"[watchdog] Connected to Kernel ({_fmt_elapsed(_t0)})")
|
|
510
|
+
|
|
511
|
+
# Subscribe to events
|
|
512
|
+
await _rpc_call(ws, "event.subscribe", {
|
|
513
|
+
"events": [
|
|
514
|
+
"system.ready",
|
|
515
|
+
"module.started",
|
|
516
|
+
"module.stopped",
|
|
517
|
+
"module.exiting",
|
|
518
|
+
"module.ready",
|
|
519
|
+
"module.shutdown",
|
|
520
|
+
"module.offline",
|
|
521
|
+
],
|
|
522
|
+
})
|
|
523
|
+
print(f"[watchdog] Subscribed to events ({_fmt_elapsed(_t0)})")
|
|
524
|
+
|
|
525
|
+
# Register to Kernel Registry via RPC
|
|
526
|
+
await _rpc_call(ws, "registry.register", {
|
|
527
|
+
"module_id": "watchdog",
|
|
528
|
+
"module_type": "service",
|
|
529
|
+
"tools": {
|
|
530
|
+
"rpc": {
|
|
531
|
+
"module": {
|
|
532
|
+
"health": {"method": "health", "description": "健康检查"},
|
|
533
|
+
"status": {"method": "status", "description": "状态查询"}
|
|
534
|
+
}
|
|
535
|
+
}
|
|
536
|
+
},
|
|
537
|
+
"events_publish": {
|
|
538
|
+
"watchdog": {
|
|
539
|
+
"module": {
|
|
540
|
+
"unhealthy": {"description": "模块不健康"},
|
|
541
|
+
"recovered": {"description": "模块恢复"},
|
|
542
|
+
"resource_critical": {"description": "资源严重不足"},
|
|
543
|
+
"resource_warning": {"description": "资源警告"},
|
|
544
|
+
"resource_recovered": {"description": "资源恢复正常"}
|
|
545
|
+
},
|
|
546
|
+
"alert": {"description": "监控告警"}
|
|
547
|
+
}
|
|
548
|
+
},
|
|
549
|
+
"events_subscribe": [
|
|
550
|
+
"system.ready",
|
|
551
|
+
"module.started",
|
|
552
|
+
"module.stopped",
|
|
553
|
+
"module.exiting",
|
|
554
|
+
"module.ready",
|
|
555
|
+
"module.shutdown",
|
|
556
|
+
"module.offline",
|
|
557
|
+
],
|
|
558
|
+
})
|
|
559
|
+
print(f"[watchdog] Registered to Kernel ({_fmt_elapsed(_t0)})")
|
|
560
|
+
|
|
561
|
+
# Set up monitor callbacks (reconnect-safe)
|
|
562
|
+
_monitor.publish_event = lambda event: asyncio.create_task(_publish_event(ws, event))
|
|
563
|
+
_monitor.rpc_call = lambda method, params: _rpc_call_with_response(ws, method, params)
|
|
564
|
+
|
|
565
|
+
# Publish module.ready (every reconnect)
|
|
566
|
+
if not _shutting_down:
|
|
375
567
|
await _rpc_call(ws, "event.publish", {
|
|
376
568
|
"event_id": str(uuid.uuid4()),
|
|
377
569
|
"event": "module.ready",
|
|
@@ -382,34 +574,43 @@ async def main():
|
|
|
382
574
|
})
|
|
383
575
|
print(f"[watchdog] module.ready published ({_fmt_elapsed(_t0)})")
|
|
384
576
|
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
# Message loop: handle incoming RPC + events
|
|
389
|
-
async for raw in ws:
|
|
390
|
-
try:
|
|
391
|
-
msg = json.loads(raw)
|
|
392
|
-
except (json.JSONDecodeError, TypeError):
|
|
393
|
-
continue
|
|
394
|
-
|
|
395
|
-
try:
|
|
396
|
-
has_method = "method" in msg
|
|
397
|
-
has_id = "id" in msg
|
|
398
|
-
|
|
399
|
-
if has_method and not has_id:
|
|
400
|
-
# Event Notification
|
|
401
|
-
await _handle_event_notification(msg, monitor)
|
|
402
|
-
elif has_method and has_id:
|
|
403
|
-
# Incoming RPC request
|
|
404
|
-
await _handle_rpc_request(ws, msg, monitor)
|
|
405
|
-
# Ignore RPC responses (we don't await them in this simple impl)
|
|
406
|
-
except Exception as e:
|
|
407
|
-
print(f"[watchdog] 消息处理异常(已忽略): {e}")
|
|
577
|
+
# Start monitor loop if not already running
|
|
578
|
+
if _monitor_task is None or _monitor_task.done():
|
|
579
|
+
_monitor_task = asyncio.create_task(_monitor.run())
|
|
408
580
|
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
581
|
+
# Start heartbeat loop
|
|
582
|
+
heartbeat_task = asyncio.create_task(_heartbeat_loop(ws))
|
|
583
|
+
|
|
584
|
+
# Message loop: handle incoming RPC + events
|
|
585
|
+
# CRITICAL: RPC 死锁防范
|
|
586
|
+
# - 入站 RPC 请求必须用 create_task() 异步执行,不可 await
|
|
587
|
+
# - 原因:如果 handler 内部调用 rpc_call_with_response() 发出站请求,出站响应需要本接收循环来分发
|
|
588
|
+
# - 如果接收循环被 await handler 阻塞,出站响应永远收不到 → 超时死锁
|
|
589
|
+
# - 事件通知和 RPC 响应可以同步处理(它们不会反向调用 rpc_call)
|
|
590
|
+
async for raw in ws:
|
|
591
|
+
try:
|
|
592
|
+
msg = json.loads(raw)
|
|
593
|
+
except (json.JSONDecodeError, TypeError):
|
|
594
|
+
continue
|
|
595
|
+
|
|
596
|
+
try:
|
|
597
|
+
has_method = "method" in msg
|
|
598
|
+
has_id = "id" in msg
|
|
599
|
+
|
|
600
|
+
if has_method and not has_id:
|
|
601
|
+
# Event Notification
|
|
602
|
+
await _handle_event_notification(msg, _monitor)
|
|
603
|
+
elif has_method and has_id:
|
|
604
|
+
# Incoming RPC request — run in background to prevent deadlock
|
|
605
|
+
asyncio.create_task(_handle_rpc_request(ws, msg, _monitor))
|
|
606
|
+
elif has_id and not has_method:
|
|
607
|
+
# RPC response — route to waiter
|
|
608
|
+
msg_id = msg["id"]
|
|
609
|
+
if msg_id in _rpc_waiters:
|
|
610
|
+
_rpc_results[msg_id] = msg
|
|
611
|
+
_rpc_waiters[msg_id].set()
|
|
612
|
+
except Exception as e:
|
|
613
|
+
print(f"[watchdog] 消息处理异常(已忽略): {e}")
|
|
413
614
|
|
|
414
615
|
|
|
415
616
|
|
|
@@ -421,6 +622,40 @@ async def _rpc_call(ws, method: str, params: dict = None):
|
|
|
421
622
|
await ws.send(json.dumps(msg))
|
|
422
623
|
|
|
423
624
|
|
|
625
|
+
async def _heartbeat_loop(ws, interval: float = 30):
    """Send ``registry.heartbeat`` for the watchdog at a fixed interval.

    The loop ends when ``_shutting_down`` is set, or after the first error
    raised while sleeping or sending (the error is printed, not re-raised).

    Args:
        ws: WebSocket connection handed to ``_rpc_call``.
        interval: Seconds to wait before each heartbeat (default 30).
    """
    while not _shutting_down:
        try:
            await asyncio.sleep(interval)
            if _shutting_down:
                # Shutdown began while sleeping: stop instead of idling forever.
                break
            await _rpc_call(ws, "registry.heartbeat", {"module_id": "watchdog"})
        except Exception as e:
            print(f"[watchdog] Heartbeat error: {e}")
            break
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
async def _rpc_call_with_response(ws, method: str, params: dict = None, timeout: float = 5) -> dict:
    """Send a JSON-RPC 2.0 request and wait for the matching response.

    A fresh UUID is used as the request id and an ``asyncio.Event`` is
    registered under it in ``_rpc_waiters``; whoever receives the response is
    expected to store it in ``_rpc_results`` under the same id and set the
    event.

    Args:
        ws: Connection object exposing an awaitable ``send(str)``.
        method: JSON-RPC method name.
        params: Optional parameters; omitted from the request when falsy.
        timeout: Seconds to wait for the response.

    Returns:
        The stored response dict, ``{}`` if the event fired without a stored
        result, or a JSON-RPC style error dict (code -32000) on timeout.

    Raises:
        Whatever ``ws.send`` raises; the waiter registration is removed first.
    """
    rpc_id = str(uuid.uuid4())
    msg = {"jsonrpc": "2.0", "id": rpc_id, "method": method}
    if params:
        msg["params"] = params

    evt = asyncio.Event()
    # Register before sending so a fast response cannot arrive unobserved.
    _rpc_waiters[rpc_id] = evt

    try:
        # The send is inside the try so that a failed send does not leave a
        # stale entry in _rpc_waiters.
        await ws.send(json.dumps(msg))
        await asyncio.wait_for(evt.wait(), timeout=timeout)
        return _rpc_results.pop(rpc_id, {})
    except asyncio.TimeoutError:
        return {"error": {"code": -32000, "message": f"RPC timeout: {method}"}}
    finally:
        _rpc_waiters.pop(rpc_id, None)
        _rpc_results.pop(rpc_id, None)
|
|
657
|
+
|
|
658
|
+
|
|
424
659
|
async def _publish_event(ws, event: dict):
|
|
425
660
|
"""Publish an event via RPC event.publish."""
|
|
426
661
|
await _rpc_call(ws, "event.publish", {
|
|
@@ -436,13 +671,17 @@ async def _handle_event_notification(msg: dict, monitor: HealthMonitor):
|
|
|
436
671
|
event_type = params.get("event", "")
|
|
437
672
|
data = params.get("data", {})
|
|
438
673
|
|
|
439
|
-
#
|
|
440
|
-
if event_type == "module.shutdown"
|
|
441
|
-
|
|
442
|
-
|
|
674
|
+
# Debug: log all shutdown events
|
|
675
|
+
if event_type == "module.shutdown":
|
|
676
|
+
target = data.get("module_id", "")
|
|
677
|
+
reason = data.get("reason", "")
|
|
678
|
+
# Handle both targeted shutdown (module_id == "watchdog") and broadcast shutdown (no module_id or launcher_lost)
|
|
679
|
+
if target == "watchdog" or not target or reason == "launcher_lost":
|
|
680
|
+
await _handle_shutdown(monitor)
|
|
681
|
+
return
|
|
443
682
|
|
|
444
|
-
# Forward to monitor
|
|
445
|
-
await monitor.handle_event(
|
|
683
|
+
# Forward to monitor (extract params from JSON-RPC notification)
|
|
684
|
+
await monitor.handle_event(params)
|
|
446
685
|
|
|
447
686
|
|
|
448
687
|
async def _handle_rpc_request(ws, msg: dict, monitor: HealthMonitor):
|
|
@@ -489,23 +728,38 @@ async def _rpc_status(monitor: HealthMonitor) -> dict:
|
|
|
489
728
|
|
|
490
729
|
|
|
491
730
|
async def _handle_shutdown(monitor: HealthMonitor):
    """Handle a module.shutdown event: ack → exiting → cleanup → ready → exit.

    Sets ``_shutting_down``, publishes ``module.shutdown.ack`` and
    ``module.exiting`` over ``_ws_global``, stops the monitor, publishes
    ``module.shutdown.ready`` and finally exits with ``_exit_code``.

    Publishing is best-effort: a failed notification is logged and the
    sequence continues, so a dropped connection cannot leave the process
    alive with ``_shutting_down`` already set.

    Args:
        monitor: The health monitor to stop during cleanup.

    Raises:
        SystemExit: Always, with ``_exit_code``.
    """
    global _shutting_down

    async def _notify(event: dict) -> None:
        """Publish one shutdown-phase event, logging instead of raising."""
        try:
            await _publish_event(_ws_global, event)
        except Exception as e:
            print(f"[watchdog] Failed to publish {event['event']}: {e}")

    print("[watchdog] Received shutdown request")
    _shutting_down = True

    # Step 1: acknowledge receipt immediately
    await _notify({
        "event": "module.shutdown.ack",
        "data": {"module_id": "watchdog"},
    })

    # Step 2: announce that cleanup is starting
    await _notify({
        "event": "module.exiting",
        "data": {
            "module_id": "watchdog",
            "type": "passive",
            "reason": "shutdown_requested",
            "restart": "auto",
            "action": "none",
            "timeout": 2.0,
            "restart_delay": 0.0,
        },
    })

    # Step 3: cleanup
    monitor.stop()

    # Step 4: announce that cleanup is complete
    await _notify({
        "event": "module.shutdown.ready",
        "data": {"module_id": "watchdog"},
    })
    print("[watchdog] Shutdown ready, exiting")

    # Step 5: exit
    sys.exit(_exit_code)
|
|
509
763
|
|
|
510
764
|
|
|
511
765
|
if __name__ == "__main__":
|