@agentunion/kite 1.3.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,4 +1,90 @@
1
- # v1.3.0
1
+ # v1.3.2
2
+
3
+ **发布日期**:2026-03-04
4
+ **上一版本**:v1.3.1
5
+ **版本级别**:patch
6
+
7
+ ---
8
+
9
+ ## 核心变更
10
+
11
+ 本次发布主要包含功能增强和稳定性改进。新增 Web 模块管理 API,支持通过 HTTP 接口扫描、查看和编辑模块元数据与配置。Kernel 模块增加断线重连防抖机制(5 秒窗口),避免短暂网络波动导致的模块状态误判。多个扩展模块和前端界面进行了优化更新。
12
+
13
+ ---
14
+
15
+ ## 变更统计
16
+
17
+ | 类型 | 数量 | 说明 |
18
+ |------|------|------|
19
+ | 新增 | 1 | `routes_modules.py` (249 行) |
20
+ | 修改 | 18 | kernel、launcher、扩展模块、web 前端 |
21
+
22
+ ---
23
+
24
+ ## 关键变更
25
+
26
+ ### 新增功能
27
+
28
+ **`extensions/services/web/routes/routes_modules.py`** (249 行)
29
+ - 模块管理 API:扫描项目中的所有模块
30
+ - 元数据查看与编辑(`module.md` 前置元数据)
31
+ - 配置查看与编辑(`config.yaml`)
32
+ - 只读字段保护(`name`, `type`, `runtime`, `entry`)
33
+
34
+ ### 核心改进
35
+
36
+ **`kernel/server.py`** — 断线重连防抖
37
+ ```diff
38
+ + # Debounce timers for disconnected modules (module_id -> asyncio.Task)
39
+ + self._debounce_tasks: dict[str, asyncio.Task] = {}
40
+ + # Cancel debounce timer if module is reconnecting within 5s window
41
+ + old_debounce = server._debounce_tasks.pop(module_id, None)
42
+ + if old_debounce:
43
+ + old_debounce.cancel()
44
+ ```
45
+ - 5 秒防抖窗口,避免短暂断线误判
46
+ - Launcher 离线 35 秒后触发全局重启
47
+
48
+ **`kernel/registry_store.py`** — 连接状态管理
49
+ - 新增 `set_connected()` / `set_disconnected()` 方法
50
+ - 模块注册表支持实时连接状态跟踪
51
+
52
+ **`launcher/entry.py`** — 进程管理优化
53
+ - 改进模块启动和停止逻辑
54
+ - 优化日志输出和错误处理
55
+
56
+ ### 前端更新
57
+
58
+ **Web UI** (`static/index.html`, `static/js/app.js`, `static/css/style.css`)
59
+ - 界面样式优化
60
+ - 交互逻辑改进
61
+ - 新增模块管理相关 UI 组件
62
+
63
+ ### 扩展模块适配
64
+
65
+ 更新以下模块以适配 Kernel 连接状态管理:
66
+ - `extensions/agents/assistant/server.py`
67
+ - `extensions/channels/acp_channel/server.py`
68
+ - `extensions/services/backup/entry.py`
69
+ - `extensions/services/model_service/entry.py`
70
+ - `extensions/services/watchdog/entry.py`
71
+ - `extensions/services/watchdog/monitor.py`
72
+
73
+ ---
74
+
75
+ ## 包信息
76
+
77
+ - **文件数**:88(与 v1.3.1 相同)
78
+ - **打包大小**:~208 kB
79
+
80
+ ---
81
+
82
+ ## 稳定性改进
83
+
84
+ - ✅ **断线重连防抖** — 避免短暂网络波动导致模块误判为离线
85
+ - ✅ **连接状态跟踪** — Registry 实时记录模块连接状态
86
+ - ✅ **Launcher 保护** — 35 秒容错窗口,避免误触发全局重启
87
+ # v1.3.1
2
88
 
3
89
  **发布日期**:2026-03-04
4
90
  **上一版本**:v1.2.0
@@ -21,7 +21,6 @@ class AssistantServer:
21
21
  self._ws_task: asyncio.Task | None = None
22
22
  self._test_task: asyncio.Task | None = None
23
23
  self._ws: object | None = None
24
- self._ready_sent = False
25
24
  self._shutting_down = False
26
25
  self._start_time = time.time()
27
26
 
@@ -42,15 +41,29 @@ class AssistantServer:
42
41
 
43
42
  async def _ws_loop(self):
44
43
  """Connect to Kernel, subscribe, register, and listen. Reconnect on failure."""
45
- retry_delay = 0.5
46
- max_delay = 30
44
+ retry_delay = 0.3
45
+ max_delay = 5.0
46
+ max_retries = 10
47
+ attempt = 0
47
48
  while not self._shutting_down:
48
49
  try:
49
50
  await self._ws_connect()
51
+ retry_delay = 0.3
52
+ attempt = 0
50
53
  except asyncio.CancelledError:
51
54
  return
52
55
  except Exception as e:
53
- print(f"[assistant] Kernel connection error: {e}, retrying in {retry_delay:.1f}s")
56
+ attempt += 1
57
+ # Auth failure — don't retry
58
+ if hasattr(e, 'rcvd') and e.rcvd is not None:
59
+ code = e.rcvd.code if hasattr(e.rcvd, 'code') else 0
60
+ if code in (4001, 4003):
61
+ print(f"[assistant] Kernel 认证失败 (code {code}),退出")
62
+ import sys; sys.exit(1)
63
+ if attempt >= max_retries:
64
+ print(f"[assistant] Kernel 重连失败 {max_retries} 次,退出")
65
+ import sys; sys.exit(1)
66
+ print(f"[assistant] Kernel connection error: {e}, retrying in {retry_delay:.1f}s ({attempt}/{max_retries})")
54
67
  self._ws = None
55
68
  if self._shutting_down:
56
69
  return
@@ -87,8 +100,8 @@ class AssistantServer:
87
100
  ],
88
101
  })
89
102
 
90
- # Step 3: Publish module.ready (once)
91
- if not self._ready_sent:
103
+ # Step 3: Publish module.ready (every reconnect)
104
+ if not self._shutting_down:
92
105
  await self._rpc_call(ws, "event.publish", {
93
106
  "event_id": str(uuid.uuid4()),
94
107
  "event": "module.ready",
@@ -97,14 +110,10 @@ class AssistantServer:
97
110
  "graceful_shutdown": True,
98
111
  },
99
112
  })
100
- self._ready_sent = True
101
113
  elapsed = time.monotonic() - self.boot_t0 if self.boot_t0 else 0
102
114
  elapsed_str = f" ({elapsed:.1f}s)" if elapsed else ""
103
115
  print(f"[assistant] module.ready published{elapsed_str}")
104
116
 
105
- # Reset retry delay on successful connection
106
- retry_delay = 0.5
107
-
108
117
  # Receive loop
109
118
  async for raw in ws:
110
119
  try:
@@ -123,7 +132,9 @@ class AssistantServer:
123
132
  if event_name == "module.shutdown":
124
133
  data = params.get("data", {})
125
134
  target = data.get("module_id", "")
126
- if target == "assistant":
135
+ reason = data.get("reason", "")
136
+ # Handle both targeted shutdown (module_id == "assistant") and broadcast shutdown (no module_id or launcher_lost)
137
+ if target == "assistant" or not target or reason == "launcher_lost":
127
138
  await self._handle_shutdown(ws)
128
139
  return
129
140
  elif not has_method and has_id:
@@ -133,10 +144,17 @@ class AssistantServer:
133
144
  print(f"[assistant] 事件处理异常(已忽略): {e}")
134
145
 
135
146
  async def _handle_shutdown(self, ws):
136
- """Handle module.shutdown: ack → cleanup → ready → exit."""
147
+ """Handle module.shutdown: exiting → ack → cleanup → ready → exit."""
137
148
  print("[assistant] Received module.shutdown")
138
149
  self._shutting_down = True
139
150
 
151
+ # Step 0: Send module.exiting
152
+ await self._rpc_call(ws, "event.publish", {
153
+ "event_id": str(uuid.uuid4()),
154
+ "event": "module.exiting",
155
+ "data": {"module_id": "assistant", "action": "none"},
156
+ })
157
+
140
158
  # Step 1: Send ack
141
159
  await self._rpc_call(ws, "event.publish", {
142
160
  "event_id": str(uuid.uuid4()),
@@ -21,7 +21,6 @@ class AcpChannelServer:
21
21
  self._ws_task: asyncio.Task | None = None
22
22
  self._test_task: asyncio.Task | None = None
23
23
  self._ws: object | None = None
24
- self._ready_sent = False
25
24
  self._shutting_down = False
26
25
  self._start_time = time.time()
27
26
 
@@ -42,15 +41,29 @@ class AcpChannelServer:
42
41
 
43
42
  async def _ws_loop(self):
44
43
  """Connect to Kernel, subscribe, register, and listen. Reconnect on failure."""
45
- retry_delay = 0.5
46
- max_delay = 30
44
+ retry_delay = 0.3
45
+ max_delay = 5.0
46
+ max_retries = 10
47
+ attempt = 0
47
48
  while not self._shutting_down:
48
49
  try:
49
50
  await self._ws_connect()
51
+ retry_delay = 0.3
52
+ attempt = 0
50
53
  except asyncio.CancelledError:
51
54
  return
52
55
  except Exception as e:
53
- print(f"[acp_channel] Kernel connection error: {e}, retrying in {retry_delay:.1f}s")
56
+ attempt += 1
57
+ # Auth failure — don't retry
58
+ if hasattr(e, 'rcvd') and e.rcvd is not None:
59
+ code = e.rcvd.code if hasattr(e.rcvd, 'code') else 0
60
+ if code in (4001, 4003):
61
+ print(f"[acp_channel] Kernel 认证失败 (code {code}),退出")
62
+ import sys; sys.exit(1)
63
+ if attempt >= max_retries:
64
+ print(f"[acp_channel] Kernel 重连失败 {max_retries} 次,退出")
65
+ import sys; sys.exit(1)
66
+ print(f"[acp_channel] Kernel connection error: {e}, retrying in {retry_delay:.1f}s ({attempt}/{max_retries})")
54
67
  self._ws = None
55
68
  if self._shutting_down:
56
69
  return
@@ -87,8 +100,8 @@ class AcpChannelServer:
87
100
  ],
88
101
  })
89
102
 
90
- # Step 3: Publish module.ready (once)
91
- if not self._ready_sent:
103
+ # Step 3: Publish module.ready (every reconnect)
104
+ if not self._shutting_down:
92
105
  await self._rpc_call(ws, "event.publish", {
93
106
  "event_id": str(uuid.uuid4()),
94
107
  "event": "module.ready",
@@ -97,14 +110,10 @@ class AcpChannelServer:
97
110
  "graceful_shutdown": True,
98
111
  },
99
112
  })
100
- self._ready_sent = True
101
113
  elapsed = time.monotonic() - self.boot_t0 if self.boot_t0 else 0
102
114
  elapsed_str = f" ({elapsed:.1f}s)" if elapsed else ""
103
115
  print(f"[acp_channel] module.ready published{elapsed_str}")
104
116
 
105
- # Reset retry delay on successful connection
106
- retry_delay = 0.5
107
-
108
117
  # Receive loop
109
118
  async for raw in ws:
110
119
  try:
@@ -123,7 +132,9 @@ class AcpChannelServer:
123
132
  if event_name == "module.shutdown":
124
133
  data = params.get("data", {})
125
134
  target = data.get("module_id", "")
126
- if target == "acp_channel":
135
+ reason = data.get("reason", "")
136
+ # Handle both targeted shutdown (module_id == "acp_channel") and broadcast shutdown (no module_id or launcher_lost)
137
+ if target == "acp_channel" or not target or reason == "launcher_lost":
127
138
  await self._handle_shutdown(ws)
128
139
  return
129
140
  elif not has_method and has_id:
@@ -133,10 +144,17 @@ class AcpChannelServer:
133
144
  print(f"[acp_channel] 事件处理异常(已忽略): {e}")
134
145
 
135
146
  async def _handle_shutdown(self, ws):
136
- """Handle module.shutdown: ack → cleanup → ready → exit."""
147
+ """Handle module.shutdown: exiting → ack → cleanup → ready → exit."""
137
148
  print("[acp_channel] Received module.shutdown")
138
149
  self._shutting_down = True
139
150
 
151
+ # Step 0: Send module.exiting
152
+ await self._rpc_call(ws, "event.publish", {
153
+ "event_id": str(uuid.uuid4()),
154
+ "event": "module.exiting",
155
+ "data": {"module_id": "acp_channel", "action": "none"},
156
+ })
157
+
140
158
  # Step 1: Send ack
141
159
  await self._rpc_call(ws, "event.publish", {
142
160
  "event_id": str(uuid.uuid4()),
@@ -263,10 +263,19 @@ def _read_stdin_kite_message(expected_type: str, timeout: float = 10) -> dict |
263
263
 
264
264
  # Global WS reference for publish_event callback
265
265
  _ws_global = None
266
+ _shutting_down = False
267
+
268
+
269
+ def _is_auth_failure(e: Exception) -> bool:
270
+ """Check if a WebSocket exception indicates authentication failure."""
271
+ if hasattr(e, 'rcvd') and e.rcvd is not None:
272
+ code = e.rcvd.code if hasattr(e.rcvd, 'code') else 0
273
+ return code in (4001, 4003)
274
+ return False
266
275
 
267
276
 
268
277
  async def main():
269
- global _ws_global
278
+ global _ws_global, _shutting_down
270
279
  # Initialize log file paths
271
280
  global _log_dir, _log_latest_path, _crash_log_path
272
281
  module_data = os.environ.get("KITE_MODULE_DATA")
@@ -318,41 +327,84 @@ async def main():
318
327
 
319
328
  print(f"[backup] Token received ({len(token)} chars), kernel port: {kernel_port} ({_fmt_elapsed(_t0)})")
320
329
 
321
- # Connect to Kernel WebSocket
330
+ # Start reconnect loop
331
+ await _ws_loop(token, kernel_port, _t0)
332
+
333
+
334
+ async def _ws_loop(token: str, kernel_port: int, _t0: float):
335
+ """Connect to Kernel with exponential backoff reconnection."""
336
+ global _shutting_down
337
+ retry_delay = 0.3
338
+ max_delay = 5.0
339
+ max_retries = 10
340
+ attempt = 0
341
+ while not _shutting_down:
342
+ try:
343
+ await _ws_connect(token, kernel_port, _t0)
344
+ retry_delay = 0.3
345
+ attempt = 0
346
+ except asyncio.CancelledError:
347
+ return
348
+ except Exception as e:
349
+ attempt += 1
350
+ if _is_auth_failure(e):
351
+ print(f"[backup] Kernel 认证失败,退出")
352
+ sys.exit(1)
353
+ if attempt >= max_retries:
354
+ print(f"[backup] 重连失败 {max_retries} 次,退出")
355
+ sys.exit(1)
356
+ _write_crash(type(e), e, e.__traceback__, severity="error", handled=True)
357
+ print(f"[backup] 连接错误: {e}, {retry_delay:.1f}s 后重试 ({attempt}/{max_retries})")
358
+ _ws_global_clear()
359
+ if _shutting_down:
360
+ return
361
+ await asyncio.sleep(retry_delay)
362
+ retry_delay = min(retry_delay * 2, max_delay)
363
+
364
+
365
+ def _ws_global_clear():
366
+ global _ws_global
367
+ _ws_global = None
368
+
369
+
370
+ async def _ws_connect(token: str, kernel_port: int, _t0: float):
371
+ """Single WebSocket session: connect → subscribe → register → ready → receive loop."""
372
+ global _ws_global
373
+
322
374
  ws_url = f"ws://127.0.0.1:{kernel_port}/ws?token={token}&id=backup"
323
375
  print(f"[backup] Connecting to Kernel: {ws_url}")
324
376
 
325
- try:
326
- async with websockets.connect(ws_url, open_timeout=5, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
327
- _ws_global = ws
328
- print(f"[backup] Connected to Kernel ({_fmt_elapsed(_t0)})")
329
-
330
- # Subscribe to events
331
- await _rpc_call(ws, "event.subscribe", {
332
- "events": [
333
- "module.started",
334
- "module.stopped",
335
- "module.shutdown",
336
- ],
337
- })
338
- print(f"[backup] Subscribed to events ({_fmt_elapsed(_t0)})")
339
-
340
- # Register to Kernel Registry via RPC
341
- await _rpc_call(ws, "registry.register", {
342
- "module_id": "backup",
343
- "module_type": "service",
344
- "events_publish": {
345
- "backup.test": {"description": "Test event from backup module"},
346
- },
347
- "events_subscribe": [
348
- "module.started",
349
- "module.stopped",
350
- "module.shutdown",
351
- ],
352
- })
353
- print(f"[backup] Registered to Kernel ({_fmt_elapsed(_t0)})")
377
+ async with websockets.connect(ws_url, open_timeout=5, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
378
+ _ws_global = ws
379
+ print(f"[backup] Connected to Kernel ({_fmt_elapsed(_t0)})")
380
+
381
+ # Subscribe to events
382
+ await _rpc_call(ws, "event.subscribe", {
383
+ "events": [
384
+ "module.started",
385
+ "module.stopped",
386
+ "module.shutdown",
387
+ ],
388
+ })
389
+ print(f"[backup] Subscribed to events ({_fmt_elapsed(_t0)})")
390
+
391
+ # Register to Kernel Registry via RPC
392
+ await _rpc_call(ws, "registry.register", {
393
+ "module_id": "backup",
394
+ "module_type": "service",
395
+ "events_publish": {
396
+ "backup.test": {"description": "Test event from backup module"},
397
+ },
398
+ "events_subscribe": [
399
+ "module.started",
400
+ "module.stopped",
401
+ "module.shutdown",
402
+ ],
403
+ })
404
+ print(f"[backup] Registered to Kernel ({_fmt_elapsed(_t0)})")
354
405
 
355
- # Publish module.ready
406
+ # Publish module.ready (every reconnect)
407
+ if not _shutting_down:
356
408
  await _rpc_call(ws, "event.publish", {
357
409
  "event_id": str(uuid.uuid4()),
358
410
  "event": "module.ready",
@@ -363,34 +415,29 @@ async def main():
363
415
  })
364
416
  print(f"[backup] module.ready published ({_fmt_elapsed(_t0)})")
365
417
 
366
- # Start test event loop in background
367
- test_task = asyncio.create_task(_test_event_loop(ws))
368
-
369
- # Message loop: handle incoming RPC + events
370
- async for raw in ws:
371
- try:
372
- msg = json.loads(raw)
373
- except (json.JSONDecodeError, TypeError):
374
- continue
375
-
376
- try:
377
- has_method = "method" in msg
378
- has_id = "id" in msg
379
-
380
- if has_method and not has_id:
381
- # Event Notification
382
- await _handle_event_notification(msg)
383
- elif has_method and has_id:
384
- # Incoming RPC request
385
- await _handle_rpc_request(ws, msg)
386
- # Ignore RPC responses (we don't await them in this simple impl)
387
- except Exception as e:
388
- print(f"[backup] 消息处理异常(已忽略): {e}")
389
-
390
- except Exception as e:
391
- _write_crash(type(e), e, e.__traceback__, severity="critical", handled=True)
392
- _print_crash_summary(type(e), e.__traceback__)
393
- sys.exit(1)
418
+ # Start test event loop in background
419
+ test_task = asyncio.create_task(_test_event_loop(ws))
420
+
421
+ # Message loop: handle incoming RPC + events
422
+ async for raw in ws:
423
+ try:
424
+ msg = json.loads(raw)
425
+ except (json.JSONDecodeError, TypeError):
426
+ continue
427
+
428
+ try:
429
+ has_method = "method" in msg
430
+ has_id = "id" in msg
431
+
432
+ if has_method and not has_id:
433
+ # Event Notification
434
+ await _handle_event_notification(msg)
435
+ elif has_method and has_id:
436
+ # Incoming RPC request
437
+ await _handle_rpc_request(ws, msg)
438
+ # Ignore RPC responses (we don't await them in this simple impl)
439
+ except Exception as e:
440
+ print(f"[backup] 消息处理异常(已忽略): {e}")
394
441
 
395
442
 
396
443
  async def _rpc_call(ws, method: str, params: dict = None):
@@ -416,10 +463,14 @@ async def _handle_event_notification(msg: dict):
416
463
  event_type = params.get("event", "")
417
464
  data = params.get("data", {})
418
465
 
419
- # Special handling for module.shutdown targeting backup
420
- if event_type == "module.shutdown" and data.get("module_id") == "backup":
421
- await _handle_shutdown()
422
- return
466
+ # Special handling for module.shutdown
467
+ if event_type == "module.shutdown":
468
+ target = data.get("module_id", "")
469
+ reason = data.get("reason", "")
470
+ # Handle both targeted shutdown (module_id == "backup") and broadcast shutdown (no module_id or launcher_lost)
471
+ if target == "backup" or not target or reason == "launcher_lost":
472
+ await _handle_shutdown()
473
+ return
423
474
 
424
475
  # Log other events
425
476
  print(f"[backup] Event received: {event_type}")
@@ -472,8 +523,15 @@ async def _rpc_status() -> dict:
472
523
 
473
524
 
474
525
  async def _handle_shutdown():
475
- """Handle module.shutdown event — ack, cleanup, ready, exit."""
526
+ """Handle module.shutdown event — exiting → ack → cleanup → ready → exit."""
527
+ global _shutting_down
476
528
  print("[backup] Received shutdown request")
529
+ _shutting_down = True
530
+ # Step 0: Send module.exiting
531
+ await _publish_event(_ws_global, {
532
+ "event": "module.exiting",
533
+ "data": {"module_id": "backup", "action": "none"},
534
+ })
477
535
  # Step 1: Send ack
478
536
  await _publish_event(_ws_global, {
479
537
  "event": "module.shutdown.ack",