@agentunion/kite 1.3.1 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +87 -1
- package/extensions/agents/assistant/server.py +30 -12
- package/extensions/channels/acp_channel/server.py +30 -12
- package/extensions/services/backup/entry.py +123 -65
- package/extensions/services/model_service/entry.py +123 -65
- package/extensions/services/watchdog/entry.py +171 -80
- package/extensions/services/watchdog/monitor.py +112 -6
- package/extensions/services/web/routes/routes_modules.py +249 -0
- package/extensions/services/web/routes/schemas.py +22 -0
- package/extensions/services/web/server.py +37 -14
- package/extensions/services/web/static/css/style.css +97 -0
- package/extensions/services/web/static/index.html +105 -2
- package/extensions/services/web/static/js/app.js +288 -1
- package/kernel/event_hub.py +21 -3
- package/kernel/registry_store.py +22 -5
- package/kernel/rpc_router.py +15 -5
- package/kernel/server.py +75 -5
- package/launcher/count_lines.py +34 -0
- package/launcher/entry.py +92 -14
- package/launcher/process_manager.py +12 -1
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,4 +1,90 @@
|
|
|
1
|
-
# v1.3.1
|
|
1
|
+
# v1.3.2
|
|
2
|
+
|
|
3
|
+
**发布日期**:2026-03-04
|
|
4
|
+
**上一版本**:v1.3.1
|
|
5
|
+
**版本级别**:patch
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## 核心变更
|
|
10
|
+
|
|
11
|
+
本次发布主要包含功能增强和稳定性改进。新增 Web 模块管理 API,支持通过 HTTP 接口扫描、查看和编辑模块元数据与配置。Kernel 模块增加断线重连防抖机制(5 秒窗口),避免短暂网络波动导致的模块状态误判。多个扩展模块和前端界面进行了优化更新。
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## 变更统计
|
|
16
|
+
|
|
17
|
+
| 类型 | 数量 | 说明 |
|
|
18
|
+
|------|------|------|
|
|
19
|
+
| 新增 | 1 | `routes_modules.py` (249 行) |
|
|
20
|
+
| 修改 | 18 | kernel、launcher、扩展模块、web 前端 |
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## 关键变更
|
|
25
|
+
|
|
26
|
+
### 新增功能
|
|
27
|
+
|
|
28
|
+
**`extensions/services/web/routes/routes_modules.py`** (249 行)
|
|
29
|
+
- 模块管理 API:扫描项目中的所有模块
|
|
30
|
+
- 元数据查看与编辑(`module.md` 前置元数据)
|
|
31
|
+
- 配置查看与编辑(`config.yaml`)
|
|
32
|
+
- 只读字段保护(`name`, `type`, `runtime`, `entry`)
|
|
33
|
+
|
|
34
|
+
### 核心改进
|
|
35
|
+
|
|
36
|
+
**`kernel/server.py`** — 断线重连防抖
|
|
37
|
+
```diff
|
|
38
|
+
+ # Debounce timers for disconnected modules (module_id -> asyncio.Task)
|
|
39
|
+
+ self._debounce_tasks: dict[str, asyncio.Task] = {}
|
|
40
|
+
+ # Cancel debounce timer if module is reconnecting within 5s window
|
|
41
|
+
+ old_debounce = server._debounce_tasks.pop(module_id, None)
|
|
42
|
+
+ if old_debounce:
|
|
43
|
+
+ old_debounce.cancel()
|
|
44
|
+
```
|
|
45
|
+
- 5 秒防抖窗口,避免短暂断线误判
|
|
46
|
+
- Launcher 离线 35 秒后触发全局重启
|
|
47
|
+
|
|
48
|
+
**`kernel/registry_store.py`** — 连接状态管理
|
|
49
|
+
- 新增 `set_connected()` / `set_disconnected()` 方法
|
|
50
|
+
- 模块注册表支持实时连接状态跟踪
|
|
51
|
+
|
|
52
|
+
**`launcher/entry.py`** — 进程管理优化
|
|
53
|
+
- 改进模块启动和停止逻辑
|
|
54
|
+
- 优化日志输出和错误处理
|
|
55
|
+
|
|
56
|
+
### 前端更新
|
|
57
|
+
|
|
58
|
+
**Web UI** (`static/index.html`, `static/js/app.js`, `static/css/style.css`)
|
|
59
|
+
- 界面样式优化
|
|
60
|
+
- 交互逻辑改进
|
|
61
|
+
- 新增模块管理相关 UI 组件
|
|
62
|
+
|
|
63
|
+
### 扩展模块适配
|
|
64
|
+
|
|
65
|
+
更新以下模块以适配 Kernel 连接状态管理:
|
|
66
|
+
- `extensions/agents/assistant/server.py`
|
|
67
|
+
- `extensions/channels/acp_channel/server.py`
|
|
68
|
+
- `extensions/services/backup/entry.py`
|
|
69
|
+
- `extensions/services/model_service/entry.py`
|
|
70
|
+
- `extensions/services/watchdog/entry.py`
|
|
71
|
+
- `extensions/services/watchdog/monitor.py`
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## 包信息
|
|
76
|
+
|
|
77
|
+
- **文件数**:88(与 v1.3.1 相同)
|
|
78
|
+
- **打包大小**:~208 kB
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## 稳定性改进
|
|
83
|
+
|
|
84
|
+
- ✅ **断线重连防抖** — 避免短暂网络波动导致模块误判为离线
|
|
85
|
+
- ✅ **连接状态跟踪** — Registry 实时记录模块连接状态
|
|
86
|
+
- ✅ **Launcher 保护** — 35 秒容错窗口,避免误触发全局重启
|
|
87
|
+
# v1.3.1
|
|
2
88
|
|
|
3
89
|
**发布日期**:2026-03-04
|
|
4
90
|
**上一版本**:v1.2.0
|
|
@@ -21,7 +21,6 @@ class AssistantServer:
|
|
|
21
21
|
self._ws_task: asyncio.Task | None = None
|
|
22
22
|
self._test_task: asyncio.Task | None = None
|
|
23
23
|
self._ws: object | None = None
|
|
24
|
-
self._ready_sent = False
|
|
25
24
|
self._shutting_down = False
|
|
26
25
|
self._start_time = time.time()
|
|
27
26
|
|
|
@@ -42,15 +41,29 @@ class AssistantServer:
|
|
|
42
41
|
|
|
43
42
|
async def _ws_loop(self):
|
|
44
43
|
"""Connect to Kernel, subscribe, register, and listen. Reconnect on failure."""
|
|
45
|
-
retry_delay = 0.
|
|
46
|
-
max_delay =
|
|
44
|
+
retry_delay = 0.3
|
|
45
|
+
max_delay = 5.0
|
|
46
|
+
max_retries = 10
|
|
47
|
+
attempt = 0
|
|
47
48
|
while not self._shutting_down:
|
|
48
49
|
try:
|
|
49
50
|
await self._ws_connect()
|
|
51
|
+
retry_delay = 0.3
|
|
52
|
+
attempt = 0
|
|
50
53
|
except asyncio.CancelledError:
|
|
51
54
|
return
|
|
52
55
|
except Exception as e:
|
|
53
|
-
|
|
56
|
+
attempt += 1
|
|
57
|
+
# Auth failure — don't retry
|
|
58
|
+
if hasattr(e, 'rcvd') and e.rcvd is not None:
|
|
59
|
+
code = e.rcvd.code if hasattr(e.rcvd, 'code') else 0
|
|
60
|
+
if code in (4001, 4003):
|
|
61
|
+
print(f"[assistant] Kernel 认证失败 (code {code}),退出")
|
|
62
|
+
import sys; sys.exit(1)
|
|
63
|
+
if attempt >= max_retries:
|
|
64
|
+
print(f"[assistant] Kernel 重连失败 {max_retries} 次,退出")
|
|
65
|
+
import sys; sys.exit(1)
|
|
66
|
+
print(f"[assistant] Kernel connection error: {e}, retrying in {retry_delay:.1f}s ({attempt}/{max_retries})")
|
|
54
67
|
self._ws = None
|
|
55
68
|
if self._shutting_down:
|
|
56
69
|
return
|
|
@@ -87,8 +100,8 @@ class AssistantServer:
|
|
|
87
100
|
],
|
|
88
101
|
})
|
|
89
102
|
|
|
90
|
-
# Step 3: Publish module.ready (
|
|
91
|
-
if not self.
|
|
103
|
+
# Step 3: Publish module.ready (every reconnect)
|
|
104
|
+
if not self._shutting_down:
|
|
92
105
|
await self._rpc_call(ws, "event.publish", {
|
|
93
106
|
"event_id": str(uuid.uuid4()),
|
|
94
107
|
"event": "module.ready",
|
|
@@ -97,14 +110,10 @@ class AssistantServer:
|
|
|
97
110
|
"graceful_shutdown": True,
|
|
98
111
|
},
|
|
99
112
|
})
|
|
100
|
-
self._ready_sent = True
|
|
101
113
|
elapsed = time.monotonic() - self.boot_t0 if self.boot_t0 else 0
|
|
102
114
|
elapsed_str = f" ({elapsed:.1f}s)" if elapsed else ""
|
|
103
115
|
print(f"[assistant] module.ready published{elapsed_str}")
|
|
104
116
|
|
|
105
|
-
# Reset retry delay on successful connection
|
|
106
|
-
retry_delay = 0.5
|
|
107
|
-
|
|
108
117
|
# Receive loop
|
|
109
118
|
async for raw in ws:
|
|
110
119
|
try:
|
|
@@ -123,7 +132,9 @@ class AssistantServer:
|
|
|
123
132
|
if event_name == "module.shutdown":
|
|
124
133
|
data = params.get("data", {})
|
|
125
134
|
target = data.get("module_id", "")
|
|
126
|
-
|
|
135
|
+
reason = data.get("reason", "")
|
|
136
|
+
# Handle both targeted shutdown (module_id == "assistant") and broadcast shutdown (no module_id or launcher_lost)
|
|
137
|
+
if target == "assistant" or not target or reason == "launcher_lost":
|
|
127
138
|
await self._handle_shutdown(ws)
|
|
128
139
|
return
|
|
129
140
|
elif not has_method and has_id:
|
|
@@ -133,10 +144,17 @@ class AssistantServer:
|
|
|
133
144
|
print(f"[assistant] 事件处理异常(已忽略): {e}")
|
|
134
145
|
|
|
135
146
|
async def _handle_shutdown(self, ws):
|
|
136
|
-
"""Handle module.shutdown: ack → cleanup → ready → exit."""
|
|
147
|
+
"""Handle module.shutdown: exiting → ack → cleanup → ready → exit."""
|
|
137
148
|
print("[assistant] Received module.shutdown")
|
|
138
149
|
self._shutting_down = True
|
|
139
150
|
|
|
151
|
+
# Step 0: Send module.exiting
|
|
152
|
+
await self._rpc_call(ws, "event.publish", {
|
|
153
|
+
"event_id": str(uuid.uuid4()),
|
|
154
|
+
"event": "module.exiting",
|
|
155
|
+
"data": {"module_id": "assistant", "action": "none"},
|
|
156
|
+
})
|
|
157
|
+
|
|
140
158
|
# Step 1: Send ack
|
|
141
159
|
await self._rpc_call(ws, "event.publish", {
|
|
142
160
|
"event_id": str(uuid.uuid4()),
|
|
@@ -21,7 +21,6 @@ class AcpChannelServer:
|
|
|
21
21
|
self._ws_task: asyncio.Task | None = None
|
|
22
22
|
self._test_task: asyncio.Task | None = None
|
|
23
23
|
self._ws: object | None = None
|
|
24
|
-
self._ready_sent = False
|
|
25
24
|
self._shutting_down = False
|
|
26
25
|
self._start_time = time.time()
|
|
27
26
|
|
|
@@ -42,15 +41,29 @@ class AcpChannelServer:
|
|
|
42
41
|
|
|
43
42
|
async def _ws_loop(self):
|
|
44
43
|
"""Connect to Kernel, subscribe, register, and listen. Reconnect on failure."""
|
|
45
|
-
retry_delay = 0.
|
|
46
|
-
max_delay =
|
|
44
|
+
retry_delay = 0.3
|
|
45
|
+
max_delay = 5.0
|
|
46
|
+
max_retries = 10
|
|
47
|
+
attempt = 0
|
|
47
48
|
while not self._shutting_down:
|
|
48
49
|
try:
|
|
49
50
|
await self._ws_connect()
|
|
51
|
+
retry_delay = 0.3
|
|
52
|
+
attempt = 0
|
|
50
53
|
except asyncio.CancelledError:
|
|
51
54
|
return
|
|
52
55
|
except Exception as e:
|
|
53
|
-
|
|
56
|
+
attempt += 1
|
|
57
|
+
# Auth failure — don't retry
|
|
58
|
+
if hasattr(e, 'rcvd') and e.rcvd is not None:
|
|
59
|
+
code = e.rcvd.code if hasattr(e.rcvd, 'code') else 0
|
|
60
|
+
if code in (4001, 4003):
|
|
61
|
+
print(f"[acp_channel] Kernel 认证失败 (code {code}),退出")
|
|
62
|
+
import sys; sys.exit(1)
|
|
63
|
+
if attempt >= max_retries:
|
|
64
|
+
print(f"[acp_channel] Kernel 重连失败 {max_retries} 次,退出")
|
|
65
|
+
import sys; sys.exit(1)
|
|
66
|
+
print(f"[acp_channel] Kernel connection error: {e}, retrying in {retry_delay:.1f}s ({attempt}/{max_retries})")
|
|
54
67
|
self._ws = None
|
|
55
68
|
if self._shutting_down:
|
|
56
69
|
return
|
|
@@ -87,8 +100,8 @@ class AcpChannelServer:
|
|
|
87
100
|
],
|
|
88
101
|
})
|
|
89
102
|
|
|
90
|
-
# Step 3: Publish module.ready (
|
|
91
|
-
if not self.
|
|
103
|
+
# Step 3: Publish module.ready (every reconnect)
|
|
104
|
+
if not self._shutting_down:
|
|
92
105
|
await self._rpc_call(ws, "event.publish", {
|
|
93
106
|
"event_id": str(uuid.uuid4()),
|
|
94
107
|
"event": "module.ready",
|
|
@@ -97,14 +110,10 @@ class AcpChannelServer:
|
|
|
97
110
|
"graceful_shutdown": True,
|
|
98
111
|
},
|
|
99
112
|
})
|
|
100
|
-
self._ready_sent = True
|
|
101
113
|
elapsed = time.monotonic() - self.boot_t0 if self.boot_t0 else 0
|
|
102
114
|
elapsed_str = f" ({elapsed:.1f}s)" if elapsed else ""
|
|
103
115
|
print(f"[acp_channel] module.ready published{elapsed_str}")
|
|
104
116
|
|
|
105
|
-
# Reset retry delay on successful connection
|
|
106
|
-
retry_delay = 0.5
|
|
107
|
-
|
|
108
117
|
# Receive loop
|
|
109
118
|
async for raw in ws:
|
|
110
119
|
try:
|
|
@@ -123,7 +132,9 @@ class AcpChannelServer:
|
|
|
123
132
|
if event_name == "module.shutdown":
|
|
124
133
|
data = params.get("data", {})
|
|
125
134
|
target = data.get("module_id", "")
|
|
126
|
-
|
|
135
|
+
reason = data.get("reason", "")
|
|
136
|
+
# Handle both targeted shutdown (module_id == "acp_channel") and broadcast shutdown (no module_id or launcher_lost)
|
|
137
|
+
if target == "acp_channel" or not target or reason == "launcher_lost":
|
|
127
138
|
await self._handle_shutdown(ws)
|
|
128
139
|
return
|
|
129
140
|
elif not has_method and has_id:
|
|
@@ -133,10 +144,17 @@ class AcpChannelServer:
|
|
|
133
144
|
print(f"[acp_channel] 事件处理异常(已忽略): {e}")
|
|
134
145
|
|
|
135
146
|
async def _handle_shutdown(self, ws):
|
|
136
|
-
"""Handle module.shutdown: ack → cleanup → ready → exit."""
|
|
147
|
+
"""Handle module.shutdown: exiting → ack → cleanup → ready → exit."""
|
|
137
148
|
print("[acp_channel] Received module.shutdown")
|
|
138
149
|
self._shutting_down = True
|
|
139
150
|
|
|
151
|
+
# Step 0: Send module.exiting
|
|
152
|
+
await self._rpc_call(ws, "event.publish", {
|
|
153
|
+
"event_id": str(uuid.uuid4()),
|
|
154
|
+
"event": "module.exiting",
|
|
155
|
+
"data": {"module_id": "acp_channel", "action": "none"},
|
|
156
|
+
})
|
|
157
|
+
|
|
140
158
|
# Step 1: Send ack
|
|
141
159
|
await self._rpc_call(ws, "event.publish", {
|
|
142
160
|
"event_id": str(uuid.uuid4()),
|
|
@@ -263,10 +263,19 @@ def _read_stdin_kite_message(expected_type: str, timeout: float = 10) -> dict |
|
|
|
263
263
|
|
|
264
264
|
# Global WS reference for publish_event callback
|
|
265
265
|
_ws_global = None
|
|
266
|
+
_shutting_down = False
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _is_auth_failure(e: Exception) -> bool:
|
|
270
|
+
"""Check if a WebSocket exception indicates authentication failure."""
|
|
271
|
+
if hasattr(e, 'rcvd') and e.rcvd is not None:
|
|
272
|
+
code = e.rcvd.code if hasattr(e.rcvd, 'code') else 0
|
|
273
|
+
return code in (4001, 4003)
|
|
274
|
+
return False
|
|
266
275
|
|
|
267
276
|
|
|
268
277
|
async def main():
|
|
269
|
-
global _ws_global
|
|
278
|
+
global _ws_global, _shutting_down
|
|
270
279
|
# Initialize log file paths
|
|
271
280
|
global _log_dir, _log_latest_path, _crash_log_path
|
|
272
281
|
module_data = os.environ.get("KITE_MODULE_DATA")
|
|
@@ -318,41 +327,84 @@ async def main():
|
|
|
318
327
|
|
|
319
328
|
print(f"[backup] Token received ({len(token)} chars), kernel port: {kernel_port} ({_fmt_elapsed(_t0)})")
|
|
320
329
|
|
|
321
|
-
#
|
|
330
|
+
# Start reconnect loop
|
|
331
|
+
await _ws_loop(token, kernel_port, _t0)
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
async def _ws_loop(token: str, kernel_port: int, _t0: float):
|
|
335
|
+
"""Connect to Kernel with exponential backoff reconnection."""
|
|
336
|
+
global _shutting_down
|
|
337
|
+
retry_delay = 0.3
|
|
338
|
+
max_delay = 5.0
|
|
339
|
+
max_retries = 10
|
|
340
|
+
attempt = 0
|
|
341
|
+
while not _shutting_down:
|
|
342
|
+
try:
|
|
343
|
+
await _ws_connect(token, kernel_port, _t0)
|
|
344
|
+
retry_delay = 0.3
|
|
345
|
+
attempt = 0
|
|
346
|
+
except asyncio.CancelledError:
|
|
347
|
+
return
|
|
348
|
+
except Exception as e:
|
|
349
|
+
attempt += 1
|
|
350
|
+
if _is_auth_failure(e):
|
|
351
|
+
print(f"[backup] Kernel 认证失败,退出")
|
|
352
|
+
sys.exit(1)
|
|
353
|
+
if attempt >= max_retries:
|
|
354
|
+
print(f"[backup] 重连失败 {max_retries} 次,退出")
|
|
355
|
+
sys.exit(1)
|
|
356
|
+
_write_crash(type(e), e, e.__traceback__, severity="error", handled=True)
|
|
357
|
+
print(f"[backup] 连接错误: {e}, {retry_delay:.1f}s 后重试 ({attempt}/{max_retries})")
|
|
358
|
+
_ws_global_clear()
|
|
359
|
+
if _shutting_down:
|
|
360
|
+
return
|
|
361
|
+
await asyncio.sleep(retry_delay)
|
|
362
|
+
retry_delay = min(retry_delay * 2, max_delay)
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def _ws_global_clear():
|
|
366
|
+
global _ws_global
|
|
367
|
+
_ws_global = None
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
async def _ws_connect(token: str, kernel_port: int, _t0: float):
|
|
371
|
+
"""Single WebSocket session: connect → subscribe → register → ready → receive loop."""
|
|
372
|
+
global _ws_global
|
|
373
|
+
|
|
322
374
|
ws_url = f"ws://127.0.0.1:{kernel_port}/ws?token={token}&id=backup"
|
|
323
375
|
print(f"[backup] Connecting to Kernel: {ws_url}")
|
|
324
376
|
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
"
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
"
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
"
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
print(f"[backup] Registered to Kernel ({_fmt_elapsed(_t0)})")
|
|
377
|
+
async with websockets.connect(ws_url, open_timeout=5, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
|
|
378
|
+
_ws_global = ws
|
|
379
|
+
print(f"[backup] Connected to Kernel ({_fmt_elapsed(_t0)})")
|
|
380
|
+
|
|
381
|
+
# Subscribe to events
|
|
382
|
+
await _rpc_call(ws, "event.subscribe", {
|
|
383
|
+
"events": [
|
|
384
|
+
"module.started",
|
|
385
|
+
"module.stopped",
|
|
386
|
+
"module.shutdown",
|
|
387
|
+
],
|
|
388
|
+
})
|
|
389
|
+
print(f"[backup] Subscribed to events ({_fmt_elapsed(_t0)})")
|
|
390
|
+
|
|
391
|
+
# Register to Kernel Registry via RPC
|
|
392
|
+
await _rpc_call(ws, "registry.register", {
|
|
393
|
+
"module_id": "backup",
|
|
394
|
+
"module_type": "service",
|
|
395
|
+
"events_publish": {
|
|
396
|
+
"backup.test": {"description": "Test event from backup module"},
|
|
397
|
+
},
|
|
398
|
+
"events_subscribe": [
|
|
399
|
+
"module.started",
|
|
400
|
+
"module.stopped",
|
|
401
|
+
"module.shutdown",
|
|
402
|
+
],
|
|
403
|
+
})
|
|
404
|
+
print(f"[backup] Registered to Kernel ({_fmt_elapsed(_t0)})")
|
|
354
405
|
|
|
355
|
-
|
|
406
|
+
# Publish module.ready (every reconnect)
|
|
407
|
+
if not _shutting_down:
|
|
356
408
|
await _rpc_call(ws, "event.publish", {
|
|
357
409
|
"event_id": str(uuid.uuid4()),
|
|
358
410
|
"event": "module.ready",
|
|
@@ -363,34 +415,29 @@ async def main():
|
|
|
363
415
|
})
|
|
364
416
|
print(f"[backup] module.ready published ({_fmt_elapsed(_t0)})")
|
|
365
417
|
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
except Exception as e:
|
|
391
|
-
_write_crash(type(e), e, e.__traceback__, severity="critical", handled=True)
|
|
392
|
-
_print_crash_summary(type(e), e.__traceback__)
|
|
393
|
-
sys.exit(1)
|
|
418
|
+
# Start test event loop in background
|
|
419
|
+
test_task = asyncio.create_task(_test_event_loop(ws))
|
|
420
|
+
|
|
421
|
+
# Message loop: handle incoming RPC + events
|
|
422
|
+
async for raw in ws:
|
|
423
|
+
try:
|
|
424
|
+
msg = json.loads(raw)
|
|
425
|
+
except (json.JSONDecodeError, TypeError):
|
|
426
|
+
continue
|
|
427
|
+
|
|
428
|
+
try:
|
|
429
|
+
has_method = "method" in msg
|
|
430
|
+
has_id = "id" in msg
|
|
431
|
+
|
|
432
|
+
if has_method and not has_id:
|
|
433
|
+
# Event Notification
|
|
434
|
+
await _handle_event_notification(msg)
|
|
435
|
+
elif has_method and has_id:
|
|
436
|
+
# Incoming RPC request
|
|
437
|
+
await _handle_rpc_request(ws, msg)
|
|
438
|
+
# Ignore RPC responses (we don't await them in this simple impl)
|
|
439
|
+
except Exception as e:
|
|
440
|
+
print(f"[backup] 消息处理异常(已忽略): {e}")
|
|
394
441
|
|
|
395
442
|
|
|
396
443
|
async def _rpc_call(ws, method: str, params: dict = None):
|
|
@@ -416,10 +463,14 @@ async def _handle_event_notification(msg: dict):
|
|
|
416
463
|
event_type = params.get("event", "")
|
|
417
464
|
data = params.get("data", {})
|
|
418
465
|
|
|
419
|
-
# Special handling for module.shutdown
|
|
420
|
-
if event_type == "module.shutdown"
|
|
421
|
-
|
|
422
|
-
|
|
466
|
+
# Special handling for module.shutdown
|
|
467
|
+
if event_type == "module.shutdown":
|
|
468
|
+
target = data.get("module_id", "")
|
|
469
|
+
reason = data.get("reason", "")
|
|
470
|
+
# Handle both targeted shutdown (module_id == "backup") and broadcast shutdown (no module_id or launcher_lost)
|
|
471
|
+
if target == "backup" or not target or reason == "launcher_lost":
|
|
472
|
+
await _handle_shutdown()
|
|
473
|
+
return
|
|
423
474
|
|
|
424
475
|
# Log other events
|
|
425
476
|
print(f"[backup] Event received: {event_type}")
|
|
@@ -472,8 +523,15 @@ async def _rpc_status() -> dict:
|
|
|
472
523
|
|
|
473
524
|
|
|
474
525
|
async def _handle_shutdown():
|
|
475
|
-
"""Handle module.shutdown event — ack
|
|
526
|
+
"""Handle module.shutdown event — exiting → ack → cleanup → ready → exit."""
|
|
527
|
+
global _shutting_down
|
|
476
528
|
print("[backup] Received shutdown request")
|
|
529
|
+
_shutting_down = True
|
|
530
|
+
# Step 0: Send module.exiting
|
|
531
|
+
await _publish_event(_ws_global, {
|
|
532
|
+
"event": "module.exiting",
|
|
533
|
+
"data": {"module_id": "backup", "action": "none"},
|
|
534
|
+
})
|
|
477
535
|
# Step 1: Send ack
|
|
478
536
|
await _publish_event(_ws_global, {
|
|
479
537
|
"event": "module.shutdown.ack",
|