@agentunion/kite 1.3.2 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/CHANGELOG.md +200 -0
  2. package/cli.js +76 -0
  3. package/extensions/agents/assistant/entry.py +111 -1
  4. package/extensions/agents/assistant/server.py +263 -215
  5. package/extensions/channels/acp_channel/entry.py +111 -1
  6. package/extensions/channels/acp_channel/module.md +23 -22
  7. package/extensions/channels/acp_channel/server.py +263 -215
  8. package/extensions/event_hub_bench/entry.py +107 -1
  9. package/extensions/services/backup/entry.py +299 -21
  10. package/extensions/services/backup/module.md +24 -22
  11. package/extensions/services/model_service/entry.py +145 -19
  12. package/extensions/services/model_service/module.md +21 -22
  13. package/extensions/services/watchdog/entry.py +188 -25
  14. package/extensions/services/watchdog/monitor.py +144 -34
  15. package/extensions/services/web/WEBSOCKET_STATUS.md +143 -0
  16. package/extensions/services/web/config_example.py +35 -0
  17. package/extensions/services/web/config_loader.py +110 -0
  18. package/extensions/services/web/entry.py +114 -26
  19. package/extensions/services/web/module.md +35 -24
  20. package/extensions/services/web/pairing.py +250 -0
  21. package/extensions/services/web/pairing_codes.jsonl +16 -0
  22. package/extensions/services/web/relay.py +643 -0
  23. package/extensions/services/web/relay_config.json5 +67 -0
  24. package/extensions/services/web/routes/routes_management_ws.py +127 -0
  25. package/extensions/services/web/routes/routes_rpc.py +89 -0
  26. package/extensions/services/web/routes/routes_test.py +61 -0
  27. package/extensions/services/web/routes/schemas.py +0 -22
  28. package/extensions/services/web/server.py +421 -98
  29. package/extensions/services/web/static/css/style.css +67 -28
  30. package/extensions/services/web/static/index.html +234 -44
  31. package/extensions/services/web/static/js/app.js +1335 -48
  32. package/extensions/services/web/static/js/kernel-client-example.js +161 -0
  33. package/extensions/services/web/static/js/kernel-client.js +383 -0
  34. package/extensions/services/web/static/js/registry-tests.js +558 -0
  35. package/extensions/services/web/static/js/token-manager.js +175 -0
  36. package/extensions/services/web/static/pairing.html +248 -0
  37. package/extensions/services/web/static/test_registry.html +262 -0
  38. package/extensions/services/web/web_config.json5 +29 -0
  39. package/kernel/entry.py +120 -32
  40. package/kernel/event_hub.py +141 -16
  41. package/kernel/module.md +36 -33
  42. package/kernel/registry_store.py +48 -15
  43. package/kernel/rpc_router.py +120 -53
  44. package/kernel/server.py +219 -12
  45. package/kite_cli/__init__.py +3 -0
  46. package/kite_cli/__main__.py +5 -0
  47. package/kite_cli/commands/__init__.py +1 -0
  48. package/kite_cli/commands/clean.py +101 -0
  49. package/kite_cli/commands/doctor.py +35 -0
  50. package/kite_cli/commands/history.py +111 -0
  51. package/kite_cli/commands/info.py +96 -0
  52. package/kite_cli/commands/install.py +313 -0
  53. package/kite_cli/commands/list.py +143 -0
  54. package/kite_cli/commands/log.py +81 -0
  55. package/kite_cli/commands/rollback.py +88 -0
  56. package/kite_cli/commands/search.py +73 -0
  57. package/kite_cli/commands/uninstall.py +85 -0
  58. package/kite_cli/commands/update.py +118 -0
  59. package/kite_cli/core/__init__.py +1 -0
  60. package/kite_cli/core/checker.py +142 -0
  61. package/kite_cli/core/dependency.py +229 -0
  62. package/kite_cli/core/downloader.py +209 -0
  63. package/kite_cli/core/install_info.py +40 -0
  64. package/kite_cli/core/tool_installer.py +397 -0
  65. package/kite_cli/core/validator.py +78 -0
  66. package/kite_cli/main.py +289 -0
  67. package/kite_cli/utils/__init__.py +1 -0
  68. package/kite_cli/utils/i18n.py +252 -0
  69. package/kite_cli/utils/interactive.py +63 -0
  70. package/kite_cli/utils/operation_log.py +77 -0
  71. package/kite_cli/utils/paths.py +34 -0
  72. package/kite_cli/utils/version.py +308 -0
  73. package/launcher/entry.py +819 -158
  74. package/launcher/logging_setup.py +104 -0
  75. package/launcher/module.md +37 -37
  76. package/package.json +2 -1
  77. package/scripts/plan_manager.py +315 -0
  78. package/extensions/services/web/routes/routes_modules.py +0 -249
@@ -23,7 +23,113 @@ import websockets
23
23
 
24
24
 
25
25
  # ── Module configuration ──
26
- MODULE_NAME = "model_service"
26
+
27
+ def _load_module_config() -> dict:
28
+ """Load module configuration from module.md frontmatter.
29
+
30
+ Returns:
31
+ Dict with keys: name, preferred_port, advertise_ip
32
+
33
+ Raises:
34
+ SystemExit: If module.md is invalid or name is non-compliant
35
+ """
36
+ _this_dir = os.path.dirname(os.path.abspath(__file__))
37
+ module_md = os.path.join(_this_dir, "module.md")
38
+
39
+ # Calculate relative path for error messages
40
+ project_root = os.environ.get("KITE_PROJECT", "")
41
+ if project_root and _this_dir.startswith(project_root):
42
+ rel_path = os.path.relpath(_this_dir, project_root)
43
+ else:
44
+ rel_path = _this_dir
45
+
46
+ # Default values (will be overridden if valid config exists)
47
+ result = {
48
+ "name": "",
49
+ "preferred_port": 0,
50
+ "advertise_ip": "0.0.0.0"
51
+ }
52
+
53
+ # Check if module.md exists
54
+ if not os.path.exists(module_md):
55
+ print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
56
+ print(f" Path: {rel_path}/module.md")
57
+ print(f" Reason: File not found")
58
+ sys.exit(1)
59
+
60
+ try:
61
+ with open(module_md, encoding="utf-8") as f:
62
+ text = f.read()
63
+
64
+ # Extract YAML frontmatter (between --- markers)
65
+ import re
66
+ m = re.match(r'^---\s*\n(.*?)\n---', text, re.DOTALL)
67
+ if not m:
68
+ print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
69
+ print(f" Path: {rel_path}/module.md")
70
+ print(f" Reason: Missing YAML frontmatter")
71
+ sys.exit(1)
72
+
73
+ # Parse YAML frontmatter
74
+ try:
75
+ import yaml
76
+ fm = yaml.safe_load(m.group(1)) or {}
77
+ except ImportError:
78
+ print(f"[{rel_path}] ERROR: PyYAML not installed, cannot parse module.md")
79
+ sys.exit(1)
80
+ except Exception as e:
81
+ print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
82
+ print(f" Path: {rel_path}/module.md")
83
+ print(f" Reason: YAML parse error: {e}")
84
+ sys.exit(1)
85
+
86
+ # Validate 'name' field (required)
87
+ if "name" not in fm:
88
+ print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
89
+ print(f" Path: {rel_path}/module.md")
90
+ print(f" Reason: Missing 'name' field")
91
+ sys.exit(1)
92
+
93
+ raw_name = str(fm["name"]).strip()
94
+
95
+ if not raw_name:
96
+ print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
97
+ print(f" Path: {rel_path}/module.md")
98
+ print(f" Reason: Empty module name")
99
+ sys.exit(1)
100
+
101
+ # Validate name characters
102
+ sanitized = re.sub(r'[^a-zA-Z0-9_\-]', '', raw_name)
103
+
104
+ if sanitized != raw_name:
105
+ invalid_chars = ''.join(sorted(set(c for c in raw_name if c not in sanitized)))
106
+ print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
107
+ print(f" Path: {rel_path}/module.md")
108
+ print(f" Reason: Invalid characters in name '{raw_name}': {repr(invalid_chars)}")
109
+ sys.exit(1)
110
+
111
+ result["name"] = sanitized
112
+
113
+ # Extract optional fields
114
+ if "preferred_port" in fm:
115
+ try:
116
+ result["preferred_port"] = int(fm["preferred_port"])
117
+ except (ValueError, TypeError):
118
+ pass
119
+
120
+ if "advertise_ip" in fm:
121
+ result["advertise_ip"] = str(fm["advertise_ip"])
122
+
123
+ except SystemExit:
124
+ raise # Re-raise exit to prevent catching by outer except
125
+ except Exception as e:
126
+ print(f"[{rel_path}] ERROR: Failed to read module.md: {e}")
127
+ sys.exit(1)
128
+
129
+ return result
130
+
131
+ _module_config = _load_module_config()
132
+ MODULE_NAME = _module_config["name"]
27
133
 
28
134
 
29
135
  class _SafeWriter:
@@ -264,6 +370,7 @@ def _read_stdin_kite_message(expected_type: str, timeout: float = 10) -> dict |
264
370
  # Global WS reference for publish_event callback
265
371
  _ws_global = None
266
372
  _shutting_down = False
373
+ _exit_code = 0 # Exit code for main() to use
267
374
 
268
375
 
269
376
  def _is_auth_failure(e: Exception) -> bool:
@@ -333,7 +440,7 @@ async def main():
333
440
 
334
441
  async def _ws_loop(token: str, kernel_port: int, _t0: float):
335
442
  """Connect to Kernel with exponential backoff reconnection."""
336
- global _shutting_down
443
+ global _shutting_down, _exit_code
337
444
  retry_delay = 0.3
338
445
  max_delay = 5.0
339
446
  max_retries = 10
@@ -349,10 +456,14 @@ async def _ws_loop(token: str, kernel_port: int, _t0: float):
349
456
  attempt += 1
350
457
  if _is_auth_failure(e):
351
458
  print(f"[model_service] Kernel 认证失败,退出")
352
- sys.exit(1)
459
+ _exit_code = 1
460
+ _shutting_down = True
461
+ return
353
462
  if attempt >= max_retries:
354
463
  print(f"[model_service] 重连失败 {max_retries} 次,退出")
355
- sys.exit(1)
464
+ _exit_code = 1
465
+ _shutting_down = True
466
+ return
356
467
  _write_crash(type(e), e, e.__traceback__, severity="error", handled=True)
357
468
  print(f"[model_service] 连接错误: {e}, {retry_delay:.1f}s 后重试 ({attempt}/{max_retries})")
358
469
  _ws_global_clear()
@@ -374,7 +485,7 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
374
485
  ws_url = f"ws://127.0.0.1:{kernel_port}/ws?token={token}&id=model_service"
375
486
  print(f"[model_service] Connecting to Kernel: {ws_url}")
376
487
 
377
- async with websockets.connect(ws_url, open_timeout=5, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
488
+ async with websockets.connect(ws_url, open_timeout=5, ping_interval=20, ping_timeout=20, close_timeout=10) as ws:
378
489
  _ws_global = ws
379
490
  print(f"[model_service] Connected to Kernel ({_fmt_elapsed(_t0)})")
380
491
 
@@ -393,7 +504,9 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
393
504
  "module_id": "model_service",
394
505
  "module_type": "service",
395
506
  "events_publish": {
396
- "model_service.test": {"description": "Test event from model_service module"},
507
+ "model_service": {
508
+ "test": {"description": "Test event from model_service module"},
509
+ }
397
510
  },
398
511
  "events_subscribe": [
399
512
  "module.started",
@@ -419,6 +532,11 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
419
532
  test_task = asyncio.create_task(_test_event_loop(ws))
420
533
 
421
534
  # Message loop: handle incoming RPC + events
535
+ # CRITICAL: RPC 死锁防范
536
+ # - 入站 RPC 请求必须用 create_task() 异步执行,不可 await
537
+ # - 原因:如果 handler 内部调用 rpc_call() 发出站请求,出站响应需要本接收循环来分发
538
+ # - 如果接收循环被 await handler 阻塞,出站响应永远收不到 → 超时死锁
539
+ # - 事件通知和 RPC 响应可以同步处理(它们不会反向调用 rpc_call)
422
540
  async for raw in ws:
423
541
  try:
424
542
  msg = json.loads(raw)
@@ -433,8 +551,8 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
433
551
  # Event Notification
434
552
  await _handle_event_notification(msg)
435
553
  elif has_method and has_id:
436
- # Incoming RPC request
437
- await _handle_rpc_request(ws, msg)
554
+ # Incoming RPC request — run in background to prevent deadlock
555
+ asyncio.create_task(_handle_rpc_request(ws, msg))
438
556
  # Ignore RPC responses (we don't await them in this simple impl)
439
557
  except Exception as e:
440
558
  print(f"[model_service] 消息处理异常(已忽略): {e}")
@@ -523,29 +641,37 @@ async def _rpc_status() -> dict:
523
641
 
524
642
 
525
643
  async def _handle_shutdown():
526
- """Handle module.shutdown event — exiting → ack → cleanup → ready → exit."""
644
+ """Handle module.shutdown event — ack → exiting → cleanup → ready → exit."""
527
645
  global _shutting_down
528
646
  print("[model_service] Received shutdown request")
529
647
  _shutting_down = True
530
- # Step 0: Send module.exiting
648
+ # Step 1: Send ack (立即确认收到)
531
649
  await _publish_event(_ws_global, {
532
- "event": "module.exiting",
533
- "data": {"module_id": "model_service", "action": "none"},
650
+ "event": "module.shutdown.ack",
651
+ "data": {"module_id": "model_service"},
534
652
  })
535
- # Step 1: Send ack
653
+ # Step 2: Send module.exiting (开始清理)
536
654
  await _publish_event(_ws_global, {
537
- "event": "module.shutdown.ack",
538
- "data": {"module_id": "model_service", "estimated_cleanup": 2},
655
+ "event": "module.exiting",
656
+ "data": {
657
+ "module_id": "model_service",
658
+ "type": "passive",
659
+ "reason": "shutdown_requested",
660
+ "restart": "auto",
661
+ "action": "none",
662
+ "timeout": 2.0,
663
+ "restart_delay": 0.0,
664
+ },
539
665
  })
540
- # Step 2: Cleanup (nothing to clean up for model_service)
541
- # Step 3: Send ready
666
+ # Step 3: Cleanup (nothing to clean up for model_service)
667
+ # Step 4: Send ready (清理完成)
542
668
  await _publish_event(_ws_global, {
543
669
  "event": "module.shutdown.ready",
544
670
  "data": {"module_id": "model_service"},
545
671
  })
546
672
  print("[model_service] Shutdown ready, exiting")
547
- # Step 4: Exit
548
- sys.exit(0)
673
+ # Step 5: Exit
674
+ sys.exit(_exit_code)
549
675
 
550
676
 
551
677
  async def _test_event_loop(ws):
@@ -1,22 +1,21 @@
1
- ---
2
- name: model_service
3
- display_name: Model Service
4
- version: "1.0"
5
- type: service
6
- state: enabled
7
- runtime: python
8
- entry: entry.py
9
- events:
10
- - model_service.test
11
- subscriptions:
12
- - module.started
13
- - module.stopped
14
- - module.shutdown
15
- ---
16
-
17
- # Model Service(大模型服务)
18
-
19
- 大模型服务模块,提供统一的 LLM 调用接口。
20
-
21
- - 模型调用 — 封装多种大模型 API 的统一调用接口
22
- - 事件通知 — 通过 Event Hub 发布模型服务状态事件
1
+ ---
2
+ name: model_service
3
+ display_name: Model Service
4
+ version: '1.0'
5
+ type: service
6
+ state: manual
7
+ runtime: python
8
+ entry: entry.py
9
+ events:
10
+ - model_service.test
11
+ subscriptions:
12
+ - module.started
13
+ - module.stopped
14
+ - module.shutdown
15
+ ---
16
+ # Model Service(大模型服务)
17
+
18
+ 大模型服务模块,提供统一的 LLM 调用接口。
19
+
20
+ - 模型调用 — 封装多种大模型 API 的统一调用接口
21
+ - 事件通知 — 通过 Event Hub 发布模型服务状态事件
@@ -20,7 +20,113 @@ import websockets
20
20
 
21
21
 
22
22
  # ── Module configuration ──
23
- MODULE_NAME = "watchdog"
23
+
24
+ def _load_module_config() -> dict:
25
+ """Load module configuration from module.md frontmatter.
26
+
27
+ Returns:
28
+ Dict with keys: name, preferred_port, advertise_ip
29
+
30
+ Raises:
31
+ SystemExit: If module.md is invalid or name is non-compliant
32
+ """
33
+ _this_dir = os.path.dirname(os.path.abspath(__file__))
34
+ module_md = os.path.join(_this_dir, "module.md")
35
+
36
+ # Calculate relative path for error messages
37
+ project_root = os.environ.get("KITE_PROJECT", "")
38
+ if project_root and _this_dir.startswith(project_root):
39
+ rel_path = os.path.relpath(_this_dir, project_root)
40
+ else:
41
+ rel_path = _this_dir
42
+
43
+ # Default values (will be overridden if valid config exists)
44
+ result = {
45
+ "name": "",
46
+ "preferred_port": 0,
47
+ "advertise_ip": "0.0.0.0"
48
+ }
49
+
50
+ # Check if module.md exists
51
+ if not os.path.exists(module_md):
52
+ print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
53
+ print(f" Path: {rel_path}/module.md")
54
+ print(f" Reason: File not found")
55
+ sys.exit(1)
56
+
57
+ try:
58
+ with open(module_md, encoding="utf-8") as f:
59
+ text = f.read()
60
+
61
+ # Extract YAML frontmatter (between --- markers)
62
+ import re
63
+ m = re.match(r'^---\s*\n(.*?)\n---', text, re.DOTALL)
64
+ if not m:
65
+ print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
66
+ print(f" Path: {rel_path}/module.md")
67
+ print(f" Reason: Missing YAML frontmatter")
68
+ sys.exit(1)
69
+
70
+ # Parse YAML frontmatter
71
+ try:
72
+ import yaml
73
+ fm = yaml.safe_load(m.group(1)) or {}
74
+ except ImportError:
75
+ print(f"[{rel_path}] ERROR: PyYAML not installed, cannot parse module.md")
76
+ sys.exit(1)
77
+ except Exception as e:
78
+ print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
79
+ print(f" Path: {rel_path}/module.md")
80
+ print(f" Reason: YAML parse error: {e}")
81
+ sys.exit(1)
82
+
83
+ # Validate 'name' field (required)
84
+ if "name" not in fm:
85
+ print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
86
+ print(f" Path: {rel_path}/module.md")
87
+ print(f" Reason: Missing 'name' field")
88
+ sys.exit(1)
89
+
90
+ raw_name = str(fm["name"]).strip()
91
+
92
+ if not raw_name:
93
+ print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
94
+ print(f" Path: {rel_path}/module.md")
95
+ print(f" Reason: Empty module name")
96
+ sys.exit(1)
97
+
98
+ # Validate name characters
99
+ sanitized = re.sub(r'[^a-zA-Z0-9_\-]', '', raw_name)
100
+
101
+ if sanitized != raw_name:
102
+ invalid_chars = ''.join(sorted(set(c for c in raw_name if c not in sanitized)))
103
+ print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
104
+ print(f" Path: {rel_path}/module.md")
105
+ print(f" Reason: Invalid characters in name '{raw_name}': {repr(invalid_chars)}")
106
+ sys.exit(1)
107
+
108
+ result["name"] = sanitized
109
+
110
+ # Extract optional fields
111
+ if "preferred_port" in fm:
112
+ try:
113
+ result["preferred_port"] = int(fm["preferred_port"])
114
+ except (ValueError, TypeError):
115
+ pass
116
+
117
+ if "advertise_ip" in fm:
118
+ result["advertise_ip"] = str(fm["advertise_ip"])
119
+
120
+ except SystemExit:
121
+ raise # Re-raise exit to prevent catching by outer except
122
+ except Exception as e:
123
+ print(f"[{rel_path}] ERROR: Failed to read module.md: {e}")
124
+ sys.exit(1)
125
+
126
+ return result
127
+
128
+ _module_config = _load_module_config()
129
+ MODULE_NAME = _module_config["name"]
24
130
 
25
131
 
26
132
  def _fmt_elapsed(t0: float) -> str:
@@ -266,6 +372,7 @@ def _read_stdin_kite_message(expected_type: str, timeout: float = 10) -> dict |
266
372
  # Global WS reference for publish_event callback
267
373
  _ws_global = None
268
374
  _shutting_down = False
375
+ _exit_code = 0 # Exit code for main() to use
269
376
  _monitor = None
270
377
  _monitor_task = None
271
378
 
@@ -347,7 +454,7 @@ async def main():
347
454
 
348
455
  async def _ws_loop(token: str, kernel_port: int, _t0: float):
349
456
  """Connect to Kernel with exponential backoff reconnection."""
350
- global _shutting_down
457
+ global _shutting_down, _exit_code
351
458
  retry_delay = 0.3
352
459
  max_delay = 5.0
353
460
  max_retries = 10
@@ -363,12 +470,21 @@ async def _ws_loop(token: str, kernel_port: int, _t0: float):
363
470
  attempt += 1
364
471
  if _is_auth_failure(e):
365
472
  print(f"[watchdog] Kernel 认证失败,退出")
366
- sys.exit(1)
473
+ _exit_code = 1
474
+ _shutting_down = True
475
+ return
367
476
  if attempt >= max_retries:
368
477
  print(f"[watchdog] 重连失败 {max_retries} 次,退出")
369
- sys.exit(1)
478
+ _exit_code = 1
479
+ _shutting_down = True
480
+ return
370
481
  _write_crash(type(e), e, e.__traceback__, severity="error", handled=True)
371
482
  print(f"[watchdog] 连接错误: {e}, {retry_delay:.1f}s 后重试 ({attempt}/{max_retries})")
483
+ if attempt == 5:
484
+ print(f"\033[33m[watchdog] 提示: 已连续 {attempt} 次无法连接 Kernel (端口 {kernel_port})")
485
+ if kernel_port < 1024:
486
+ print(f"[watchdog] ⚠ 端口 {kernel_port} 异常偏低,可能是 Kernel 端口绑定失败或配置错误")
487
+ print(f"[watchdog] 请检查: 1) Kernel 进程是否存活 2) kernel/module.md 中 preferred_port 配置是否正确\033[0m")
372
488
  _ws_global_clear()
373
489
  if _shutting_down:
374
490
  return
@@ -388,7 +504,7 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
388
504
  ws_url = f"ws://127.0.0.1:{kernel_port}/ws?token={token}&id=watchdog"
389
505
  print(f"[watchdog] Connecting to Kernel: {ws_url}")
390
506
 
391
- async with websockets.connect(ws_url, open_timeout=5, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
507
+ async with websockets.connect(ws_url, open_timeout=5, ping_interval=20, ping_timeout=20, close_timeout=10) as ws:
392
508
  _ws_global = ws
393
509
  print(f"[watchdog] Connected to Kernel ({_fmt_elapsed(_t0)})")
394
510
 
@@ -410,10 +526,25 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
410
526
  await _rpc_call(ws, "registry.register", {
411
527
  "module_id": "watchdog",
412
528
  "module_type": "service",
529
+ "tools": {
530
+ "rpc": {
531
+ "module": {
532
+ "health": {"method": "health", "description": "健康检查"},
533
+ "status": {"method": "status", "description": "状态查询"}
534
+ }
535
+ }
536
+ },
413
537
  "events_publish": {
414
- "watchdog.module.unhealthy": {},
415
- "watchdog.module.recovered": {},
416
- "watchdog.alert": {},
538
+ "watchdog": {
539
+ "module": {
540
+ "unhealthy": {"description": "模块不健康"},
541
+ "recovered": {"description": "模块恢复"},
542
+ "resource_critical": {"description": "资源严重不足"},
543
+ "resource_warning": {"description": "资源警告"},
544
+ "resource_recovered": {"description": "资源恢复正常"}
545
+ },
546
+ "alert": {"description": "监控告警"}
547
+ }
417
548
  },
418
549
  "events_subscribe": [
419
550
  "system.ready",
@@ -447,7 +578,15 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
447
578
  if _monitor_task is None or _monitor_task.done():
448
579
  _monitor_task = asyncio.create_task(_monitor.run())
449
580
 
581
+ # Start heartbeat loop
582
+ heartbeat_task = asyncio.create_task(_heartbeat_loop(ws))
583
+
450
584
  # Message loop: handle incoming RPC + events
585
+ # CRITICAL: RPC 死锁防范
586
+ # - 入站 RPC 请求必须用 create_task() 异步执行,不可 await
587
+ # - 原因:如果 handler 内部调用 rpc_call_with_response() 发出站请求,出站响应需要本接收循环来分发
588
+ # - 如果接收循环被 await handler 阻塞,出站响应永远收不到 → 超时死锁
589
+ # - 事件通知和 RPC 响应可以同步处理(它们不会反向调用 rpc_call)
451
590
  async for raw in ws:
452
591
  try:
453
592
  msg = json.loads(raw)
@@ -462,8 +601,8 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
462
601
  # Event Notification
463
602
  await _handle_event_notification(msg, _monitor)
464
603
  elif has_method and has_id:
465
- # Incoming RPC request
466
- await _handle_rpc_request(ws, msg, _monitor)
604
+ # Incoming RPC request — run in background to prevent deadlock
605
+ asyncio.create_task(_handle_rpc_request(ws, msg, _monitor))
467
606
  elif has_id and not has_method:
468
607
  # RPC response — route to waiter
469
608
  msg_id = msg["id"]
@@ -483,6 +622,18 @@ async def _rpc_call(ws, method: str, params: dict = None):
483
622
  await ws.send(json.dumps(msg))
484
623
 
485
624
 
625
+ async def _heartbeat_loop(ws):
626
+ """Send registry.heartbeat every 30 seconds to prevent TTL expiration."""
627
+ while True:
628
+ try:
629
+ await asyncio.sleep(30)
630
+ if not _shutting_down:
631
+ await _rpc_call(ws, "registry.heartbeat", {"module_id": "watchdog"})
632
+ except Exception as e:
633
+ print(f"[watchdog] Heartbeat error: {e}")
634
+ break
635
+
636
+
486
637
  async def _rpc_call_with_response(ws, method: str, params: dict = None, timeout: float = 5) -> dict:
487
638
  """Send a JSON-RPC 2.0 request and await the response."""
488
639
  rpc_id = str(uuid.uuid4())
@@ -520,10 +671,14 @@ async def _handle_event_notification(msg: dict, monitor: HealthMonitor):
520
671
  event_type = params.get("event", "")
521
672
  data = params.get("data", {})
522
673
 
523
- # Special handling for module.shutdown targeting watchdog
524
- if event_type == "module.shutdown" and data.get("module_id") == "watchdog":
525
- await _handle_shutdown(monitor)
526
- return
674
+ # Debug: log all shutdown events
675
+ if event_type == "module.shutdown":
676
+ target = data.get("module_id", "")
677
+ reason = data.get("reason", "")
678
+ # Handle both targeted shutdown (module_id == "watchdog") and broadcast shutdown (no module_id or launcher_lost)
679
+ if target == "watchdog" or not target or reason == "launcher_lost":
680
+ await _handle_shutdown(monitor)
681
+ return
527
682
 
528
683
  # Forward to monitor (extract params from JSON-RPC notification)
529
684
  await monitor.handle_event(params)
@@ -573,30 +728,38 @@ async def _rpc_status(monitor: HealthMonitor) -> dict:
573
728
 
574
729
 
575
730
  async def _handle_shutdown(monitor: HealthMonitor):
576
- """Handle module.shutdown event — exiting → ack → cleanup → ready → exit."""
731
+ """Handle module.shutdown event — ack → exiting → cleanup → ready → exit."""
577
732
  global _shutting_down
578
733
  print("[watchdog] Received shutdown request")
579
734
  _shutting_down = True
580
- # Step 0: Send module.exiting
735
+ # Step 1: Send ack (立即确认收到)
581
736
  await _publish_event(_ws_global, {
582
- "event": "module.exiting",
583
- "data": {"module_id": "watchdog", "action": "none"},
737
+ "event": "module.shutdown.ack",
738
+ "data": {"module_id": "watchdog"},
584
739
  })
585
- # Step 1: Send ack
740
+ # Step 2: Send module.exiting (开始清理)
586
741
  await _publish_event(_ws_global, {
587
- "event": "module.shutdown.ack",
588
- "data": {"module_id": "watchdog", "estimated_cleanup": 2},
742
+ "event": "module.exiting",
743
+ "data": {
744
+ "module_id": "watchdog",
745
+ "type": "passive",
746
+ "reason": "shutdown_requested",
747
+ "restart": "auto",
748
+ "action": "none",
749
+ "timeout": 2.0,
750
+ "restart_delay": 0.0,
751
+ },
589
752
  })
590
- # Step 2: Cleanup
753
+ # Step 3: Cleanup
591
754
  monitor.stop()
592
- # Step 3: Send ready
755
+ # Step 4: Send ready (清理完成)
593
756
  await _publish_event(_ws_global, {
594
757
  "event": "module.shutdown.ready",
595
758
  "data": {"module_id": "watchdog"},
596
759
  })
597
760
  print("[watchdog] Shutdown ready, exiting")
598
- # Step 4: Exit
599
- sys.exit(0)
761
+ # Step 5: Exit
762
+ sys.exit(_exit_code)
600
763
 
601
764
 
602
765
  if __name__ == "__main__":