@agentunion/kite 1.3.2 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +200 -0
- package/cli.js +76 -0
- package/extensions/agents/assistant/entry.py +111 -1
- package/extensions/agents/assistant/server.py +263 -215
- package/extensions/channels/acp_channel/entry.py +111 -1
- package/extensions/channels/acp_channel/module.md +23 -22
- package/extensions/channels/acp_channel/server.py +263 -215
- package/extensions/event_hub_bench/entry.py +107 -1
- package/extensions/services/backup/entry.py +299 -21
- package/extensions/services/backup/module.md +24 -22
- package/extensions/services/model_service/entry.py +145 -19
- package/extensions/services/model_service/module.md +21 -22
- package/extensions/services/watchdog/entry.py +188 -25
- package/extensions/services/watchdog/monitor.py +144 -34
- package/extensions/services/web/WEBSOCKET_STATUS.md +143 -0
- package/extensions/services/web/config_example.py +35 -0
- package/extensions/services/web/config_loader.py +110 -0
- package/extensions/services/web/entry.py +114 -26
- package/extensions/services/web/module.md +35 -24
- package/extensions/services/web/pairing.py +250 -0
- package/extensions/services/web/pairing_codes.jsonl +16 -0
- package/extensions/services/web/relay.py +643 -0
- package/extensions/services/web/relay_config.json5 +67 -0
- package/extensions/services/web/routes/routes_management_ws.py +127 -0
- package/extensions/services/web/routes/routes_rpc.py +89 -0
- package/extensions/services/web/routes/routes_test.py +61 -0
- package/extensions/services/web/routes/schemas.py +0 -22
- package/extensions/services/web/server.py +421 -98
- package/extensions/services/web/static/css/style.css +67 -28
- package/extensions/services/web/static/index.html +234 -44
- package/extensions/services/web/static/js/app.js +1335 -48
- package/extensions/services/web/static/js/kernel-client-example.js +161 -0
- package/extensions/services/web/static/js/kernel-client.js +383 -0
- package/extensions/services/web/static/js/registry-tests.js +558 -0
- package/extensions/services/web/static/js/token-manager.js +175 -0
- package/extensions/services/web/static/pairing.html +248 -0
- package/extensions/services/web/static/test_registry.html +262 -0
- package/extensions/services/web/web_config.json5 +29 -0
- package/kernel/entry.py +120 -32
- package/kernel/event_hub.py +141 -16
- package/kernel/module.md +36 -33
- package/kernel/registry_store.py +48 -15
- package/kernel/rpc_router.py +120 -53
- package/kernel/server.py +219 -12
- package/kite_cli/__init__.py +3 -0
- package/kite_cli/__main__.py +5 -0
- package/kite_cli/commands/__init__.py +1 -0
- package/kite_cli/commands/clean.py +101 -0
- package/kite_cli/commands/doctor.py +35 -0
- package/kite_cli/commands/history.py +111 -0
- package/kite_cli/commands/info.py +96 -0
- package/kite_cli/commands/install.py +313 -0
- package/kite_cli/commands/list.py +143 -0
- package/kite_cli/commands/log.py +81 -0
- package/kite_cli/commands/rollback.py +88 -0
- package/kite_cli/commands/search.py +73 -0
- package/kite_cli/commands/uninstall.py +85 -0
- package/kite_cli/commands/update.py +118 -0
- package/kite_cli/core/__init__.py +1 -0
- package/kite_cli/core/checker.py +142 -0
- package/kite_cli/core/dependency.py +229 -0
- package/kite_cli/core/downloader.py +209 -0
- package/kite_cli/core/install_info.py +40 -0
- package/kite_cli/core/tool_installer.py +397 -0
- package/kite_cli/core/validator.py +78 -0
- package/kite_cli/main.py +289 -0
- package/kite_cli/utils/__init__.py +1 -0
- package/kite_cli/utils/i18n.py +252 -0
- package/kite_cli/utils/interactive.py +63 -0
- package/kite_cli/utils/operation_log.py +77 -0
- package/kite_cli/utils/paths.py +34 -0
- package/kite_cli/utils/version.py +308 -0
- package/launcher/entry.py +819 -158
- package/launcher/logging_setup.py +104 -0
- package/launcher/module.md +37 -37
- package/package.json +2 -1
- package/scripts/plan_manager.py +315 -0
- package/extensions/services/web/routes/routes_modules.py +0 -249
|
@@ -23,7 +23,113 @@ import websockets
|
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
# ── Module configuration ──
|
|
26
|
-
|
|
26
|
+
|
|
27
|
+
def _load_module_config() -> dict:
|
|
28
|
+
"""Load module configuration from module.md frontmatter.
|
|
29
|
+
|
|
30
|
+
Returns:
|
|
31
|
+
Dict with keys: name, preferred_port, advertise_ip
|
|
32
|
+
|
|
33
|
+
Raises:
|
|
34
|
+
SystemExit: If module.md is invalid or name is non-compliant
|
|
35
|
+
"""
|
|
36
|
+
_this_dir = os.path.dirname(os.path.abspath(__file__))
|
|
37
|
+
module_md = os.path.join(_this_dir, "module.md")
|
|
38
|
+
|
|
39
|
+
# Calculate relative path for error messages
|
|
40
|
+
project_root = os.environ.get("KITE_PROJECT", "")
|
|
41
|
+
if project_root and _this_dir.startswith(project_root):
|
|
42
|
+
rel_path = os.path.relpath(_this_dir, project_root)
|
|
43
|
+
else:
|
|
44
|
+
rel_path = _this_dir
|
|
45
|
+
|
|
46
|
+
# Default values (will be overridden if valid config exists)
|
|
47
|
+
result = {
|
|
48
|
+
"name": "",
|
|
49
|
+
"preferred_port": 0,
|
|
50
|
+
"advertise_ip": "0.0.0.0"
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
# Check if module.md exists
|
|
54
|
+
if not os.path.exists(module_md):
|
|
55
|
+
print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
|
|
56
|
+
print(f" Path: {rel_path}/module.md")
|
|
57
|
+
print(f" Reason: File not found")
|
|
58
|
+
sys.exit(1)
|
|
59
|
+
|
|
60
|
+
try:
|
|
61
|
+
with open(module_md, encoding="utf-8") as f:
|
|
62
|
+
text = f.read()
|
|
63
|
+
|
|
64
|
+
# Extract YAML frontmatter (between --- markers)
|
|
65
|
+
import re
|
|
66
|
+
m = re.match(r'^---\s*\n(.*?)\n---', text, re.DOTALL)
|
|
67
|
+
if not m:
|
|
68
|
+
print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
|
|
69
|
+
print(f" Path: {rel_path}/module.md")
|
|
70
|
+
print(f" Reason: Missing YAML frontmatter")
|
|
71
|
+
sys.exit(1)
|
|
72
|
+
|
|
73
|
+
# Parse YAML frontmatter
|
|
74
|
+
try:
|
|
75
|
+
import yaml
|
|
76
|
+
fm = yaml.safe_load(m.group(1)) or {}
|
|
77
|
+
except ImportError:
|
|
78
|
+
print(f"[{rel_path}] ERROR: PyYAML not installed, cannot parse module.md")
|
|
79
|
+
sys.exit(1)
|
|
80
|
+
except Exception as e:
|
|
81
|
+
print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
|
|
82
|
+
print(f" Path: {rel_path}/module.md")
|
|
83
|
+
print(f" Reason: YAML parse error: {e}")
|
|
84
|
+
sys.exit(1)
|
|
85
|
+
|
|
86
|
+
# Validate 'name' field (required)
|
|
87
|
+
if "name" not in fm:
|
|
88
|
+
print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
|
|
89
|
+
print(f" Path: {rel_path}/module.md")
|
|
90
|
+
print(f" Reason: Missing 'name' field")
|
|
91
|
+
sys.exit(1)
|
|
92
|
+
|
|
93
|
+
raw_name = str(fm["name"]).strip()
|
|
94
|
+
|
|
95
|
+
if not raw_name:
|
|
96
|
+
print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
|
|
97
|
+
print(f" Path: {rel_path}/module.md")
|
|
98
|
+
print(f" Reason: Empty module name")
|
|
99
|
+
sys.exit(1)
|
|
100
|
+
|
|
101
|
+
# Validate name characters
|
|
102
|
+
sanitized = re.sub(r'[^a-zA-Z0-9_\-]', '', raw_name)
|
|
103
|
+
|
|
104
|
+
if sanitized != raw_name:
|
|
105
|
+
invalid_chars = ''.join(sorted(set(c for c in raw_name if c not in sanitized)))
|
|
106
|
+
print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
|
|
107
|
+
print(f" Path: {rel_path}/module.md")
|
|
108
|
+
print(f" Reason: Invalid characters in name '{raw_name}': {repr(invalid_chars)}")
|
|
109
|
+
sys.exit(1)
|
|
110
|
+
|
|
111
|
+
result["name"] = sanitized
|
|
112
|
+
|
|
113
|
+
# Extract optional fields
|
|
114
|
+
if "preferred_port" in fm:
|
|
115
|
+
try:
|
|
116
|
+
result["preferred_port"] = int(fm["preferred_port"])
|
|
117
|
+
except (ValueError, TypeError):
|
|
118
|
+
pass
|
|
119
|
+
|
|
120
|
+
if "advertise_ip" in fm:
|
|
121
|
+
result["advertise_ip"] = str(fm["advertise_ip"])
|
|
122
|
+
|
|
123
|
+
except SystemExit:
|
|
124
|
+
raise # Re-raise exit to prevent catching by outer except
|
|
125
|
+
except Exception as e:
|
|
126
|
+
print(f"[{rel_path}] ERROR: Failed to read module.md: {e}")
|
|
127
|
+
sys.exit(1)
|
|
128
|
+
|
|
129
|
+
return result
|
|
130
|
+
|
|
131
|
+
_module_config = _load_module_config()
|
|
132
|
+
MODULE_NAME = _module_config["name"]
|
|
27
133
|
|
|
28
134
|
|
|
29
135
|
class _SafeWriter:
|
|
@@ -264,6 +370,7 @@ def _read_stdin_kite_message(expected_type: str, timeout: float = 10) -> dict |
|
|
|
264
370
|
# Global WS reference for publish_event callback
|
|
265
371
|
_ws_global = None
|
|
266
372
|
_shutting_down = False
|
|
373
|
+
_exit_code = 0 # Exit code for main() to use
|
|
267
374
|
|
|
268
375
|
|
|
269
376
|
def _is_auth_failure(e: Exception) -> bool:
|
|
@@ -333,7 +440,7 @@ async def main():
|
|
|
333
440
|
|
|
334
441
|
async def _ws_loop(token: str, kernel_port: int, _t0: float):
|
|
335
442
|
"""Connect to Kernel with exponential backoff reconnection."""
|
|
336
|
-
global _shutting_down
|
|
443
|
+
global _shutting_down, _exit_code
|
|
337
444
|
retry_delay = 0.3
|
|
338
445
|
max_delay = 5.0
|
|
339
446
|
max_retries = 10
|
|
@@ -349,10 +456,14 @@ async def _ws_loop(token: str, kernel_port: int, _t0: float):
|
|
|
349
456
|
attempt += 1
|
|
350
457
|
if _is_auth_failure(e):
|
|
351
458
|
print(f"[model_service] Kernel 认证失败,退出")
|
|
352
|
-
|
|
459
|
+
_exit_code = 1
|
|
460
|
+
_shutting_down = True
|
|
461
|
+
return
|
|
353
462
|
if attempt >= max_retries:
|
|
354
463
|
print(f"[model_service] 重连失败 {max_retries} 次,退出")
|
|
355
|
-
|
|
464
|
+
_exit_code = 1
|
|
465
|
+
_shutting_down = True
|
|
466
|
+
return
|
|
356
467
|
_write_crash(type(e), e, e.__traceback__, severity="error", handled=True)
|
|
357
468
|
print(f"[model_service] 连接错误: {e}, {retry_delay:.1f}s 后重试 ({attempt}/{max_retries})")
|
|
358
469
|
_ws_global_clear()
|
|
@@ -374,7 +485,7 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
|
|
|
374
485
|
ws_url = f"ws://127.0.0.1:{kernel_port}/ws?token={token}&id=model_service"
|
|
375
486
|
print(f"[model_service] Connecting to Kernel: {ws_url}")
|
|
376
487
|
|
|
377
|
-
async with websockets.connect(ws_url, open_timeout=5, ping_interval=
|
|
488
|
+
async with websockets.connect(ws_url, open_timeout=5, ping_interval=20, ping_timeout=20, close_timeout=10) as ws:
|
|
378
489
|
_ws_global = ws
|
|
379
490
|
print(f"[model_service] Connected to Kernel ({_fmt_elapsed(_t0)})")
|
|
380
491
|
|
|
@@ -393,7 +504,9 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
|
|
|
393
504
|
"module_id": "model_service",
|
|
394
505
|
"module_type": "service",
|
|
395
506
|
"events_publish": {
|
|
396
|
-
"model_service
|
|
507
|
+
"model_service": {
|
|
508
|
+
"test": {"description": "Test event from model_service module"},
|
|
509
|
+
}
|
|
397
510
|
},
|
|
398
511
|
"events_subscribe": [
|
|
399
512
|
"module.started",
|
|
@@ -419,6 +532,11 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
|
|
|
419
532
|
test_task = asyncio.create_task(_test_event_loop(ws))
|
|
420
533
|
|
|
421
534
|
# Message loop: handle incoming RPC + events
|
|
535
|
+
# CRITICAL: RPC 死锁防范
|
|
536
|
+
# - 入站 RPC 请求必须用 create_task() 异步执行,不可 await
|
|
537
|
+
# - 原因:如果 handler 内部调用 rpc_call() 发出站请求,出站响应需要本接收循环来分发
|
|
538
|
+
# - 如果接收循环被 await handler 阻塞,出站响应永远收不到 → 超时死锁
|
|
539
|
+
# - 事件通知和 RPC 响应可以同步处理(它们不会反向调用 rpc_call)
|
|
422
540
|
async for raw in ws:
|
|
423
541
|
try:
|
|
424
542
|
msg = json.loads(raw)
|
|
@@ -433,8 +551,8 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
|
|
|
433
551
|
# Event Notification
|
|
434
552
|
await _handle_event_notification(msg)
|
|
435
553
|
elif has_method and has_id:
|
|
436
|
-
# Incoming RPC request
|
|
437
|
-
|
|
554
|
+
# Incoming RPC request — run in background to prevent deadlock
|
|
555
|
+
asyncio.create_task(_handle_rpc_request(ws, msg))
|
|
438
556
|
# Ignore RPC responses (we don't await them in this simple impl)
|
|
439
557
|
except Exception as e:
|
|
440
558
|
print(f"[model_service] 消息处理异常(已忽略): {e}")
|
|
@@ -523,29 +641,37 @@ async def _rpc_status() -> dict:
|
|
|
523
641
|
|
|
524
642
|
|
|
525
643
|
async def _handle_shutdown():
|
|
526
|
-
"""Handle module.shutdown event —
|
|
644
|
+
"""Handle module.shutdown event — ack → exiting → cleanup → ready → exit."""
|
|
527
645
|
global _shutting_down
|
|
528
646
|
print("[model_service] Received shutdown request")
|
|
529
647
|
_shutting_down = True
|
|
530
|
-
# Step
|
|
648
|
+
# Step 1: Send ack (立即确认收到)
|
|
531
649
|
await _publish_event(_ws_global, {
|
|
532
|
-
"event": "module.
|
|
533
|
-
"data": {"module_id": "model_service"
|
|
650
|
+
"event": "module.shutdown.ack",
|
|
651
|
+
"data": {"module_id": "model_service"},
|
|
534
652
|
})
|
|
535
|
-
# Step
|
|
653
|
+
# Step 2: Send module.exiting (开始清理)
|
|
536
654
|
await _publish_event(_ws_global, {
|
|
537
|
-
"event": "module.
|
|
538
|
-
"data": {
|
|
655
|
+
"event": "module.exiting",
|
|
656
|
+
"data": {
|
|
657
|
+
"module_id": "model_service",
|
|
658
|
+
"type": "passive",
|
|
659
|
+
"reason": "shutdown_requested",
|
|
660
|
+
"restart": "auto",
|
|
661
|
+
"action": "none",
|
|
662
|
+
"timeout": 2.0,
|
|
663
|
+
"restart_delay": 0.0,
|
|
664
|
+
},
|
|
539
665
|
})
|
|
540
|
-
# Step
|
|
541
|
-
# Step
|
|
666
|
+
# Step 3: Cleanup (nothing to clean up for model_service)
|
|
667
|
+
# Step 4: Send ready (清理完成)
|
|
542
668
|
await _publish_event(_ws_global, {
|
|
543
669
|
"event": "module.shutdown.ready",
|
|
544
670
|
"data": {"module_id": "model_service"},
|
|
545
671
|
})
|
|
546
672
|
print("[model_service] Shutdown ready, exiting")
|
|
547
|
-
# Step
|
|
548
|
-
sys.exit(
|
|
673
|
+
# Step 5: Exit
|
|
674
|
+
sys.exit(_exit_code)
|
|
549
675
|
|
|
550
676
|
|
|
551
677
|
async def _test_event_loop(ws):
|
|
@@ -1,22 +1,21 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: model_service
|
|
3
|
-
display_name: Model Service
|
|
4
|
-
version:
|
|
5
|
-
type: service
|
|
6
|
-
state:
|
|
7
|
-
runtime: python
|
|
8
|
-
entry: entry.py
|
|
9
|
-
events:
|
|
10
|
-
|
|
11
|
-
subscriptions:
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
---
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
-
|
|
22
|
-
- 事件通知 — 通过 Event Hub 发布模型服务状态事件
|
|
1
|
+
---
|
|
2
|
+
name: model_service
|
|
3
|
+
display_name: Model Service
|
|
4
|
+
version: '1.0'
|
|
5
|
+
type: service
|
|
6
|
+
state: manual
|
|
7
|
+
runtime: python
|
|
8
|
+
entry: entry.py
|
|
9
|
+
events:
|
|
10
|
+
- model_service.test
|
|
11
|
+
subscriptions:
|
|
12
|
+
- module.started
|
|
13
|
+
- module.stopped
|
|
14
|
+
- module.shutdown
|
|
15
|
+
---
|
|
16
|
+
# Model Service(大模型服务)
|
|
17
|
+
|
|
18
|
+
大模型服务模块,提供统一的 LLM 调用接口。
|
|
19
|
+
|
|
20
|
+
- 模型调用 — 封装多种大模型 API 的统一调用接口
|
|
21
|
+
- 事件通知 — 通过 Event Hub 发布模型服务状态事件
|
|
@@ -20,7 +20,113 @@ import websockets
|
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
# ── Module configuration ──
|
|
23
|
-
|
|
23
|
+
|
|
24
|
+
def _load_module_config() -> dict:
|
|
25
|
+
"""Load module configuration from module.md frontmatter.
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
Dict with keys: name, preferred_port, advertise_ip
|
|
29
|
+
|
|
30
|
+
Raises:
|
|
31
|
+
SystemExit: If module.md is invalid or name is non-compliant
|
|
32
|
+
"""
|
|
33
|
+
_this_dir = os.path.dirname(os.path.abspath(__file__))
|
|
34
|
+
module_md = os.path.join(_this_dir, "module.md")
|
|
35
|
+
|
|
36
|
+
# Calculate relative path for error messages
|
|
37
|
+
project_root = os.environ.get("KITE_PROJECT", "")
|
|
38
|
+
if project_root and _this_dir.startswith(project_root):
|
|
39
|
+
rel_path = os.path.relpath(_this_dir, project_root)
|
|
40
|
+
else:
|
|
41
|
+
rel_path = _this_dir
|
|
42
|
+
|
|
43
|
+
# Default values (will be overridden if valid config exists)
|
|
44
|
+
result = {
|
|
45
|
+
"name": "",
|
|
46
|
+
"preferred_port": 0,
|
|
47
|
+
"advertise_ip": "0.0.0.0"
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
# Check if module.md exists
|
|
51
|
+
if not os.path.exists(module_md):
|
|
52
|
+
print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
|
|
53
|
+
print(f" Path: {rel_path}/module.md")
|
|
54
|
+
print(f" Reason: File not found")
|
|
55
|
+
sys.exit(1)
|
|
56
|
+
|
|
57
|
+
try:
|
|
58
|
+
with open(module_md, encoding="utf-8") as f:
|
|
59
|
+
text = f.read()
|
|
60
|
+
|
|
61
|
+
# Extract YAML frontmatter (between --- markers)
|
|
62
|
+
import re
|
|
63
|
+
m = re.match(r'^---\s*\n(.*?)\n---', text, re.DOTALL)
|
|
64
|
+
if not m:
|
|
65
|
+
print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
|
|
66
|
+
print(f" Path: {rel_path}/module.md")
|
|
67
|
+
print(f" Reason: Missing YAML frontmatter")
|
|
68
|
+
sys.exit(1)
|
|
69
|
+
|
|
70
|
+
# Parse YAML frontmatter
|
|
71
|
+
try:
|
|
72
|
+
import yaml
|
|
73
|
+
fm = yaml.safe_load(m.group(1)) or {}
|
|
74
|
+
except ImportError:
|
|
75
|
+
print(f"[{rel_path}] ERROR: PyYAML not installed, cannot parse module.md")
|
|
76
|
+
sys.exit(1)
|
|
77
|
+
except Exception as e:
|
|
78
|
+
print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
|
|
79
|
+
print(f" Path: {rel_path}/module.md")
|
|
80
|
+
print(f" Reason: YAML parse error: {e}")
|
|
81
|
+
sys.exit(1)
|
|
82
|
+
|
|
83
|
+
# Validate 'name' field (required)
|
|
84
|
+
if "name" not in fm:
|
|
85
|
+
print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
|
|
86
|
+
print(f" Path: {rel_path}/module.md")
|
|
87
|
+
print(f" Reason: Missing 'name' field")
|
|
88
|
+
sys.exit(1)
|
|
89
|
+
|
|
90
|
+
raw_name = str(fm["name"]).strip()
|
|
91
|
+
|
|
92
|
+
if not raw_name:
|
|
93
|
+
print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
|
|
94
|
+
print(f" Path: {rel_path}/module.md")
|
|
95
|
+
print(f" Reason: Empty module name")
|
|
96
|
+
sys.exit(1)
|
|
97
|
+
|
|
98
|
+
# Validate name characters
|
|
99
|
+
sanitized = re.sub(r'[^a-zA-Z0-9_\-]', '', raw_name)
|
|
100
|
+
|
|
101
|
+
if sanitized != raw_name:
|
|
102
|
+
invalid_chars = ''.join(sorted(set(c for c in raw_name if c not in sanitized)))
|
|
103
|
+
print(f"[{rel_path}] ERROR: Invalid module configuration in module.md")
|
|
104
|
+
print(f" Path: {rel_path}/module.md")
|
|
105
|
+
print(f" Reason: Invalid characters in name '{raw_name}': {repr(invalid_chars)}")
|
|
106
|
+
sys.exit(1)
|
|
107
|
+
|
|
108
|
+
result["name"] = sanitized
|
|
109
|
+
|
|
110
|
+
# Extract optional fields
|
|
111
|
+
if "preferred_port" in fm:
|
|
112
|
+
try:
|
|
113
|
+
result["preferred_port"] = int(fm["preferred_port"])
|
|
114
|
+
except (ValueError, TypeError):
|
|
115
|
+
pass
|
|
116
|
+
|
|
117
|
+
if "advertise_ip" in fm:
|
|
118
|
+
result["advertise_ip"] = str(fm["advertise_ip"])
|
|
119
|
+
|
|
120
|
+
except SystemExit:
|
|
121
|
+
raise # Re-raise exit to prevent catching by outer except
|
|
122
|
+
except Exception as e:
|
|
123
|
+
print(f"[{rel_path}] ERROR: Failed to read module.md: {e}")
|
|
124
|
+
sys.exit(1)
|
|
125
|
+
|
|
126
|
+
return result
|
|
127
|
+
|
|
128
|
+
_module_config = _load_module_config()
|
|
129
|
+
MODULE_NAME = _module_config["name"]
|
|
24
130
|
|
|
25
131
|
|
|
26
132
|
def _fmt_elapsed(t0: float) -> str:
|
|
@@ -266,6 +372,7 @@ def _read_stdin_kite_message(expected_type: str, timeout: float = 10) -> dict |
|
|
|
266
372
|
# Global WS reference for publish_event callback
|
|
267
373
|
_ws_global = None
|
|
268
374
|
_shutting_down = False
|
|
375
|
+
_exit_code = 0 # Exit code for main() to use
|
|
269
376
|
_monitor = None
|
|
270
377
|
_monitor_task = None
|
|
271
378
|
|
|
@@ -347,7 +454,7 @@ async def main():
|
|
|
347
454
|
|
|
348
455
|
async def _ws_loop(token: str, kernel_port: int, _t0: float):
|
|
349
456
|
"""Connect to Kernel with exponential backoff reconnection."""
|
|
350
|
-
global _shutting_down
|
|
457
|
+
global _shutting_down, _exit_code
|
|
351
458
|
retry_delay = 0.3
|
|
352
459
|
max_delay = 5.0
|
|
353
460
|
max_retries = 10
|
|
@@ -363,12 +470,21 @@ async def _ws_loop(token: str, kernel_port: int, _t0: float):
|
|
|
363
470
|
attempt += 1
|
|
364
471
|
if _is_auth_failure(e):
|
|
365
472
|
print(f"[watchdog] Kernel 认证失败,退出")
|
|
366
|
-
|
|
473
|
+
_exit_code = 1
|
|
474
|
+
_shutting_down = True
|
|
475
|
+
return
|
|
367
476
|
if attempt >= max_retries:
|
|
368
477
|
print(f"[watchdog] 重连失败 {max_retries} 次,退出")
|
|
369
|
-
|
|
478
|
+
_exit_code = 1
|
|
479
|
+
_shutting_down = True
|
|
480
|
+
return
|
|
370
481
|
_write_crash(type(e), e, e.__traceback__, severity="error", handled=True)
|
|
371
482
|
print(f"[watchdog] 连接错误: {e}, {retry_delay:.1f}s 后重试 ({attempt}/{max_retries})")
|
|
483
|
+
if attempt == 5:
|
|
484
|
+
print(f"\033[33m[watchdog] 提示: 已连续 {attempt} 次无法连接 Kernel (端口 {kernel_port})")
|
|
485
|
+
if kernel_port < 1024:
|
|
486
|
+
print(f"[watchdog] ⚠ 端口 {kernel_port} 异常偏低,可能是 Kernel 端口绑定失败或配置错误")
|
|
487
|
+
print(f"[watchdog] 请检查: 1) Kernel 进程是否存活 2) kernel/module.md 中 preferred_port 配置是否正确\033[0m")
|
|
372
488
|
_ws_global_clear()
|
|
373
489
|
if _shutting_down:
|
|
374
490
|
return
|
|
@@ -388,7 +504,7 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
|
|
|
388
504
|
ws_url = f"ws://127.0.0.1:{kernel_port}/ws?token={token}&id=watchdog"
|
|
389
505
|
print(f"[watchdog] Connecting to Kernel: {ws_url}")
|
|
390
506
|
|
|
391
|
-
async with websockets.connect(ws_url, open_timeout=5, ping_interval=
|
|
507
|
+
async with websockets.connect(ws_url, open_timeout=5, ping_interval=20, ping_timeout=20, close_timeout=10) as ws:
|
|
392
508
|
_ws_global = ws
|
|
393
509
|
print(f"[watchdog] Connected to Kernel ({_fmt_elapsed(_t0)})")
|
|
394
510
|
|
|
@@ -410,10 +526,25 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
|
|
|
410
526
|
await _rpc_call(ws, "registry.register", {
|
|
411
527
|
"module_id": "watchdog",
|
|
412
528
|
"module_type": "service",
|
|
529
|
+
"tools": {
|
|
530
|
+
"rpc": {
|
|
531
|
+
"module": {
|
|
532
|
+
"health": {"method": "health", "description": "健康检查"},
|
|
533
|
+
"status": {"method": "status", "description": "状态查询"}
|
|
534
|
+
}
|
|
535
|
+
}
|
|
536
|
+
},
|
|
413
537
|
"events_publish": {
|
|
414
|
-
"watchdog
|
|
415
|
-
|
|
416
|
-
|
|
538
|
+
"watchdog": {
|
|
539
|
+
"module": {
|
|
540
|
+
"unhealthy": {"description": "模块不健康"},
|
|
541
|
+
"recovered": {"description": "模块恢复"},
|
|
542
|
+
"resource_critical": {"description": "资源严重不足"},
|
|
543
|
+
"resource_warning": {"description": "资源警告"},
|
|
544
|
+
"resource_recovered": {"description": "资源恢复正常"}
|
|
545
|
+
},
|
|
546
|
+
"alert": {"description": "监控告警"}
|
|
547
|
+
}
|
|
417
548
|
},
|
|
418
549
|
"events_subscribe": [
|
|
419
550
|
"system.ready",
|
|
@@ -447,7 +578,15 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
|
|
|
447
578
|
if _monitor_task is None or _monitor_task.done():
|
|
448
579
|
_monitor_task = asyncio.create_task(_monitor.run())
|
|
449
580
|
|
|
581
|
+
# Start heartbeat loop
|
|
582
|
+
heartbeat_task = asyncio.create_task(_heartbeat_loop(ws))
|
|
583
|
+
|
|
450
584
|
# Message loop: handle incoming RPC + events
|
|
585
|
+
# CRITICAL: RPC 死锁防范
|
|
586
|
+
# - 入站 RPC 请求必须用 create_task() 异步执行,不可 await
|
|
587
|
+
# - 原因:如果 handler 内部调用 rpc_call_with_response() 发出站请求,出站响应需要本接收循环来分发
|
|
588
|
+
# - 如果接收循环被 await handler 阻塞,出站响应永远收不到 → 超时死锁
|
|
589
|
+
# - 事件通知和 RPC 响应可以同步处理(它们不会反向调用 rpc_call)
|
|
451
590
|
async for raw in ws:
|
|
452
591
|
try:
|
|
453
592
|
msg = json.loads(raw)
|
|
@@ -462,8 +601,8 @@ async def _ws_connect(token: str, kernel_port: int, _t0: float):
|
|
|
462
601
|
# Event Notification
|
|
463
602
|
await _handle_event_notification(msg, _monitor)
|
|
464
603
|
elif has_method and has_id:
|
|
465
|
-
# Incoming RPC request
|
|
466
|
-
|
|
604
|
+
# Incoming RPC request — run in background to prevent deadlock
|
|
605
|
+
asyncio.create_task(_handle_rpc_request(ws, msg, _monitor))
|
|
467
606
|
elif has_id and not has_method:
|
|
468
607
|
# RPC response — route to waiter
|
|
469
608
|
msg_id = msg["id"]
|
|
@@ -483,6 +622,18 @@ async def _rpc_call(ws, method: str, params: dict = None):
|
|
|
483
622
|
await ws.send(json.dumps(msg))
|
|
484
623
|
|
|
485
624
|
|
|
625
|
+
async def _heartbeat_loop(ws):
|
|
626
|
+
"""Send registry.heartbeat every 30 seconds to prevent TTL expiration."""
|
|
627
|
+
while True:
|
|
628
|
+
try:
|
|
629
|
+
await asyncio.sleep(30)
|
|
630
|
+
if not _shutting_down:
|
|
631
|
+
await _rpc_call(ws, "registry.heartbeat", {"module_id": "watchdog"})
|
|
632
|
+
except Exception as e:
|
|
633
|
+
print(f"[watchdog] Heartbeat error: {e}")
|
|
634
|
+
break
|
|
635
|
+
|
|
636
|
+
|
|
486
637
|
async def _rpc_call_with_response(ws, method: str, params: dict = None, timeout: float = 5) -> dict:
|
|
487
638
|
"""Send a JSON-RPC 2.0 request and await the response."""
|
|
488
639
|
rpc_id = str(uuid.uuid4())
|
|
@@ -520,10 +671,14 @@ async def _handle_event_notification(msg: dict, monitor: HealthMonitor):
|
|
|
520
671
|
event_type = params.get("event", "")
|
|
521
672
|
data = params.get("data", {})
|
|
522
673
|
|
|
523
|
-
#
|
|
524
|
-
if event_type == "module.shutdown"
|
|
525
|
-
|
|
526
|
-
|
|
674
|
+
# Debug: log all shutdown events
|
|
675
|
+
if event_type == "module.shutdown":
|
|
676
|
+
target = data.get("module_id", "")
|
|
677
|
+
reason = data.get("reason", "")
|
|
678
|
+
# Handle both targeted shutdown (module_id == "watchdog") and broadcast shutdown (no module_id or launcher_lost)
|
|
679
|
+
if target == "watchdog" or not target or reason == "launcher_lost":
|
|
680
|
+
await _handle_shutdown(monitor)
|
|
681
|
+
return
|
|
527
682
|
|
|
528
683
|
# Forward to monitor (extract params from JSON-RPC notification)
|
|
529
684
|
await monitor.handle_event(params)
|
|
@@ -573,30 +728,38 @@ async def _rpc_status(monitor: HealthMonitor) -> dict:
|
|
|
573
728
|
|
|
574
729
|
|
|
575
730
|
async def _handle_shutdown(monitor: HealthMonitor):
|
|
576
|
-
"""Handle module.shutdown event —
|
|
731
|
+
"""Handle module.shutdown event — ack → exiting → cleanup → ready → exit."""
|
|
577
732
|
global _shutting_down
|
|
578
733
|
print("[watchdog] Received shutdown request")
|
|
579
734
|
_shutting_down = True
|
|
580
|
-
# Step
|
|
735
|
+
# Step 1: Send ack (立即确认收到)
|
|
581
736
|
await _publish_event(_ws_global, {
|
|
582
|
-
"event": "module.
|
|
583
|
-
"data": {"module_id": "watchdog"
|
|
737
|
+
"event": "module.shutdown.ack",
|
|
738
|
+
"data": {"module_id": "watchdog"},
|
|
584
739
|
})
|
|
585
|
-
# Step
|
|
740
|
+
# Step 2: Send module.exiting (开始清理)
|
|
586
741
|
await _publish_event(_ws_global, {
|
|
587
|
-
"event": "module.
|
|
588
|
-
"data": {
|
|
742
|
+
"event": "module.exiting",
|
|
743
|
+
"data": {
|
|
744
|
+
"module_id": "watchdog",
|
|
745
|
+
"type": "passive",
|
|
746
|
+
"reason": "shutdown_requested",
|
|
747
|
+
"restart": "auto",
|
|
748
|
+
"action": "none",
|
|
749
|
+
"timeout": 2.0,
|
|
750
|
+
"restart_delay": 0.0,
|
|
751
|
+
},
|
|
589
752
|
})
|
|
590
|
-
# Step
|
|
753
|
+
# Step 3: Cleanup
|
|
591
754
|
monitor.stop()
|
|
592
|
-
# Step
|
|
755
|
+
# Step 4: Send ready (清理完成)
|
|
593
756
|
await _publish_event(_ws_global, {
|
|
594
757
|
"event": "module.shutdown.ready",
|
|
595
758
|
"data": {"module_id": "watchdog"},
|
|
596
759
|
})
|
|
597
760
|
print("[watchdog] Shutdown ready, exiting")
|
|
598
|
-
# Step
|
|
599
|
-
sys.exit(
|
|
761
|
+
# Step 5: Exit
|
|
762
|
+
sys.exit(_exit_code)
|
|
600
763
|
|
|
601
764
|
|
|
602
765
|
if __name__ == "__main__":
|