@coclaw/openclaw-coclaw 0.18.0 → 0.19.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -44,10 +44,12 @@ const DC_REQ_SCAN_MS = 60 * 60 * 1000;
44
44
 
45
45
  /**
46
46
  * 判断一个出方向 res payload 是否表示 agent RPC 进入 phase-2 终态。
47
- * 终态 = res 帧 + status !== 'accepted'。覆盖三种情形:
47
+ * 终态 = res 帧 + status !== 'accepted'。OpenClaw 上游可能下发的终态 status:
48
48
  * - status='ok':成功
49
49
  * - status='error':执行失败
50
+ * - status='timeout':上游 agent.wait 等待 runId 终态超时(含 dedupe 命中的 timeout 快照)
50
51
  * - 参数校验失败:ok=false 且无 status(协议文档"特殊情况")
52
+ * 仅做兜底分类,不再追求枚举完备——上游若新增其他 non-accepted status,原样作为 reason 返回。
51
53
  *
52
54
  * @param {object} payload - 待判断的消息
53
55
  * @returns {string | null} 终态时返回 lag.summary 的 reason 字符串,否则 null
@@ -265,6 +267,8 @@ export class RealtimeBridge {
265
267
  this.__gatewayRetryTimer = null;
266
268
  }
267
269
  this.__gatewayAttempts = 0;
270
+ // 主动关闭时立即清 lag probe,不依赖 close 事件回调时序,避免 close 事件延迟期间 probe 误报
271
+ this.__clearAllLagProbes();
268
272
  if (!this.gatewayWs) {
269
273
  return;
270
274
  }
@@ -381,7 +385,8 @@ export class RealtimeBridge {
381
385
  * 两阶段 agent RPC:发送请求后等待 accepted 再等待最终响应。
382
386
  * agent() RPC 返回两次响应(同一 id):
383
387
  * 1. { status: "accepted", runId }
384
- * 2. { status: "ok", result: { payloads: [{ text }] } }
388
+ * 2. 终态帧,status 取值见 classifyAgentLagStop 注释(ok/error/timeout/参数校验失败);
389
+ * 其中 status='ok' 时附带 result.payloads,其余分支可能没有 result。
385
390
  *
386
391
  * @param {string} method - RPC 方法名(通常为 'agent')
387
392
  * @param {object} params - RPC 参数
@@ -727,122 +732,137 @@ export class RealtimeBridge {
727
732
  let wasReady = false; // 本 WS 曾经握手成功(区分"握手失败"与"成功后断开")
728
733
  let lastChallengeNonce = ''; // 最近一次 challenge 的 nonce,legacy 回退时复用
729
734
 
735
+ // 注意:listener 用 sync wrapper + IIFE.catch 形式,避免 async listener 抛出的
736
+ // promise 变 unhandledRejection 击穿 gateway 进程。await sendTo / settle / broadcast
737
+ // 等路径若抛错必须在此兜底。
730
738
  ws.addEventListener('message', (event) => {
731
- let payload = null;
732
- try {
733
- payload = JSON.parse(String(event.data ?? '{}'));
734
- }
735
- catch {
736
- return;
737
- }
738
- if (!payload || typeof payload !== 'object') {
739
- return;
740
- }
741
- if (payload.type === 'event' && payload.event === 'connect.challenge') {
742
- const nonce = payload?.payload?.nonce ?? '';
743
- lastChallengeNonce = nonce;
744
- this.__logDebug(`gateway event <- connect.challenge legacyMode=${this.__gatewayLegacyMode}`);
745
- // 已经学到此 gateway 是 legacy(上一条 WS 回退过)→ 直接发 legacy 握手
746
- if (this.__gatewayLegacyMode) {
747
- pendingLegacyAttempted = true;
748
- this.__sendGatewayConnectRequest(ws, nonce, { legacy: true });
739
+ (async () => {
740
+ // stale guard:与 server sock open/message 已加的 guard 对称。
741
+ // gateway ws 关闭后若仍有迟到的 message(connect.challenge / res / event),
742
+ // 处理路径会写 this.gatewayConnectReqId / this.gatewayReady / 转发 res 等共享状态,
743
+ // 污染当前 ws 的握手或路由
744
+ if (this.gatewayWs !== ws) {
745
+ return;
749
746
  }
750
- else {
751
- this.__sendGatewayConnectRequest(ws, nonce);
747
+ let payload = null;
748
+ try {
749
+ payload = JSON.parse(String(event.data ?? '{}'));
752
750
  }
753
- return;
754
- }
755
- if (payload.type === 'res' && this.gatewayConnectReqId && payload.id === this.gatewayConnectReqId) {
756
- if (payload.ok === true) {
757
- this.gatewayReady = true;
758
- wasReady = true;
759
- this.__gatewayAttempts = 0; // 成功握手 → 重置失败计数,让后续瞬态断开有完整重试预算
760
- remoteLog('ws.connected peer=gateway');
761
- this.__logDebug(`gateway connect ok <- id=${payload.id}`);
762
- this.gatewayConnectReqId = null;
763
- this.__ensureSessionsPromise = this.__ensureAllAgentSessions();
764
- this.__pushInstanceInfo();
751
+ catch {
752
+ return;
753
+ }
754
+ if (!payload || typeof payload !== 'object') {
755
+ return;
765
756
  }
766
- else {
767
- const reason = payload?.error?.message ?? 'unknown';
768
- // v3 → legacy 同 WS 回退:仅在签名/协议相关错误、且本 WS 尚未尝试 legacy 时触发
769
- const shouldFallback =
757
+ if (payload.type === 'event' && payload.event === 'connect.challenge') {
758
+ const nonce = payload?.payload?.nonce ?? '';
759
+ lastChallengeNonce = nonce;
760
+ this.__logDebug(`gateway event <- connect.challenge legacyMode=${this.__gatewayLegacyMode}`);
761
+ // 已经学到此 gateway 是 legacy(上一条 WS 回退过)→ 直接发 legacy 握手
762
+ if (this.__gatewayLegacyMode) {
763
+ pendingLegacyAttempted = true;
764
+ this.__sendGatewayConnectRequest(ws, nonce, { legacy: true });
765
+ }
766
+ else {
767
+ this.__sendGatewayConnectRequest(ws, nonce);
768
+ }
769
+ return;
770
+ }
771
+ if (payload.type === 'res' && this.gatewayConnectReqId && payload.id === this.gatewayConnectReqId) {
772
+ if (payload.ok === true) {
773
+ this.gatewayReady = true;
774
+ wasReady = true;
775
+ this.__gatewayAttempts = 0; // 成功握手 → 重置失败计数,让后续瞬态断开有完整重试预算
776
+ remoteLog('ws.connected peer=gateway');
777
+ this.__logDebug(`gateway connect ok <- id=${payload.id}`);
778
+ this.gatewayConnectReqId = null;
779
+ this.__ensureSessionsPromise = this.__ensureAllAgentSessions();
780
+ this.__pushInstanceInfo();
781
+ }
782
+ else {
783
+ const reason = payload?.error?.message ?? 'unknown';
784
+ // v3 → legacy 同 WS 回退:仅在签名/协议相关错误、且本 WS 尚未尝试 legacy 时触发
785
+ const shouldFallback =
770
786
  !pendingLegacyAttempted
771
787
  && !this.__gatewayLegacyMode
772
788
  && GATEWAY_HANDSHAKE_FALLBACK_PATTERN.test(reason);
773
- if (shouldFallback) {
774
- pendingLegacyAttempted = true;
775
- this.__gatewayLegacyMode = true;
776
- // v3 的失败原因已由这条 remoteLog 单独上报,不写入 __gatewayLastReason;
777
- // 后者保持"最后一次真正失败的原因"语义,供 gave-up 时使用。
778
- remoteLog(`gateway.handshake.fallback v3→legacy reason=${reason}`);
779
- this.logger.info?.(`[coclaw] gateway v3 handshake failed (${reason}), falling back to legacy`);
780
- this.__sendGatewayConnectRequest(ws, lastChallengeNonce, { legacy: true });
789
+ if (shouldFallback) {
790
+ pendingLegacyAttempted = true;
791
+ this.__gatewayLegacyMode = true;
792
+ // v3 的失败原因已由这条 remoteLog 单独上报,不写入 __gatewayLastReason;
793
+ // 后者保持"最后一次真正失败的原因"语义,供 gave-up 时使用。
794
+ remoteLog(`gateway.handshake.fallback v3→legacy reason=${reason}`);
795
+ this.logger.info?.(`[coclaw] gateway v3 handshake failed (${reason}), falling back to legacy`);
796
+ this.__sendGatewayConnectRequest(ws, lastChallengeNonce, { legacy: true });
797
+ return;
798
+ }
799
+ this.gatewayReady = false;
800
+ this.gatewayConnectReqId = null;
801
+ connectFailReported = true;
802
+ this.__gatewayLastReason = reason;
803
+ remoteLog(`ws.connect-failed peer=gateway msg=${reason}`);
804
+ this.logger.warn?.(`[coclaw] gateway connect failed: ${reason}`);
805
+ try { ws.close(1008, 'gateway_connect_failed'); }
806
+ /* c8 ignore next */
807
+ catch {}
808
+ }
809
+ return;
810
+ }
811
+ if (payload.type === 'res' && typeof payload.id === 'string') {
812
+ const settle = this.gatewayPendingRequests.get(payload.id);
813
+ if (settle) {
814
+ settle({
815
+ ok: payload.ok === true,
816
+ response: payload,
817
+ error: payload?.error?.message ?? payload?.error?.code,
818
+ });
781
819
  return;
782
820
  }
783
- this.gatewayReady = false;
784
- this.gatewayConnectReqId = null;
785
- connectFailReported = true;
786
- this.__gatewayLastReason = reason;
787
- remoteLog(`ws.connect-failed peer=gateway msg=${reason}`);
788
- this.logger.warn?.(`[coclaw] gateway connect failed: ${reason}`);
789
- try { ws.close(1008, 'gateway_connect_failed'); }
790
- /* c8 ignore next */
791
- catch {}
792
821
  }
793
- return;
794
- }
795
- if (payload.type === 'res' && typeof payload.id === 'string') {
796
- const settle = this.gatewayPendingRequests.get(payload.id);
797
- if (settle) {
798
- settle({
799
- ok: payload.ok === true,
800
- response: payload,
801
- error: payload?.error?.message ?? payload?.error?.code,
802
- });
822
+ /* c8 ignore next 3 -- connect 完成前的消息过滤 */
823
+ if (!this.gatewayReady) {
803
824
  return;
804
825
  }
805
- }
806
- /* c8 ignore next 3 -- connect 完成前的消息过滤 */
807
- if (!this.gatewayReady) {
808
- return;
809
- }
810
- if (payload.type === 'res' || payload.type === 'event') {
826
+ if (payload.type === 'res' || payload.type === 'event') {
811
827
  // (a) 过滤 gateway 的管理层广播事件,这些对 WebChat / plugin 客户端无意义:
812
828
  // - health: 全量状态快照(~3KB, ~60s 一次 + RPC 触发),给 Admin UI 的监控仪表盘用
813
829
  // - tick: gateway WS 保活心跳(30s 一次),UI 隔着 DC 不需要,DC 自己有 probe 机制
814
830
  // 不转发可避免后台时 rpc DC 队列被灌满。上游支持按需订阅前先在插件侧拦截。
815
- if (payload.type === 'event'
831
+ if (payload.type === 'event'
816
832
  && (payload.event === 'health' || payload.event === 'tick')) {
817
- return;
818
- }
819
- // (b) agent RPC 进入 phase-2 终态时停 lag 探针(必须放在 (c) 单播分支之前,
820
- // 避免命中后探针不停导致 60s 兜底 + 噪声日志)
821
- const lagReason = classifyAgentLagStop(payload);
822
- if (lagReason !== null) {
823
- this.__stopLagProbe(payload.id, lagReason);
824
- }
825
- // (c) UI 转发 RPC 的 res 单播:按 reqId 查路由表,命中则定向 sendTo
826
- if (payload.type === 'res' && typeof payload.id === 'string') {
827
- const info = this.__dcPendingRequests.get(payload.id);
828
- if (info) {
833
+ return;
834
+ }
835
+ // (b) agent RPC 进入 phase-2 终态时停 lag 探针(必须放在 (c) 单播分支之前,
836
+ // 避免命中后探针不停导致 60s 兜底 + 噪声日志)
837
+ const lagReason = classifyAgentLagStop(payload);
838
+ if (lagReason !== null) {
839
+ this.__stopLagProbe(payload.id, lagReason);
840
+ }
841
+ // (c) UI 转发 RPC 的 res 单播:按 reqId 查路由表,命中则定向 sendTo
842
+ if (payload.type === 'res' && typeof payload.id === 'string') {
843
+ const info = this.__dcPendingRequests.get(payload.id);
844
+ if (info) {
829
845
  // 终态才清条目;accepted 类中间态保留等下一帧
830
- if (isFinalResMsg(payload)) {
831
- this.__dcPendingRequests.delete(payload.id);
832
- }
833
- const delivered = this.webrtcPeer?.sendTo(info.connId, payload);
834
- if (!delivered) {
846
+ if (isFinalResMsg(payload)) {
847
+ this.__dcPendingRequests.delete(payload.id);
848
+ }
849
+ // sendTo 阶段 1 改为 async(admission 决策 await);外层 listener 已是 async
850
+ const delivered = await this.webrtcPeer?.sendTo(info.connId, payload);
851
+ if (!delivered) {
835
852
  // PC 已断 / DC 未 open / 队列拒收:本地 log 丢弃,不退回广播
836
- this.__logDebug(
837
- `dc res undeliverable: id=${payload.id} connId=${info.connId}`
838
- );
853
+ this.__logDebug(
854
+ `dc res undeliverable: id=${payload.id} connId=${info.connId}`
855
+ );
856
+ }
857
+ return;
839
858
  }
840
- return;
841
859
  }
860
+ // (d) 兜底广播:覆盖 event 类型 / 映射未命中场景
861
+ this.webrtcPeer?.broadcast(payload);
842
862
  }
843
- // (d) 兜底广播:覆盖 event 类型 / 映射未命中场景
844
- this.webrtcPeer?.broadcast(payload);
845
- }
863
+ })().catch((err) => {
864
+ this.logger.warn?.(`[coclaw] gateway ws message handler error: ${err?.message ?? err}`);
865
+ });
846
866
  });
847
867
 
848
868
  ws.addEventListener('open', () => {
@@ -850,19 +870,23 @@ export class RealtimeBridge {
850
870
  });
851
871
  ws.addEventListener('close', (ev) => {
852
872
  // 握手失败路径已经打过 ws.connect-failed,这里抑制重复的 disconnected 日志;
853
- // 成功后的意外断开、握手途中的异常断开仍按原样上报。
873
+ // 成功后的意外断开、握手途中的异常断开仍按原样上报。per-WS log 用闭包局部
874
+ // connectFailReported,无需身份校验
854
875
  if (!connectFailReported) {
855
876
  remoteLog(`ws.disconnected peer=gateway code=${ev?.code ?? '?'}`);
856
877
  }
857
878
  this.logger.info?.(`[coclaw] gateway ws closed (code=${ev?.code ?? '?'} reason=${ev?.reason ?? 'n/a'})`);
879
+ // stale guard:旧 ws 的迟到 close 不应清新 ws 的 lag probes / pending requests / DC 路由 /
880
+ // 也不应触发新一轮重试调度。非当前 ws → 直接早返,仅留 per-WS 日志。
881
+ if (this.gatewayWs !== ws) {
882
+ return;
883
+ }
858
884
  // gateway WS 一断,正在跑的 agent RPC 不会再有 phase-2 res,主动结算所有 lag 探针,
859
885
  // 避免它们空跑到 60s 兜底,期间还会持续打 spike 噪声。
860
886
  this.__clearAllLagProbes();
861
- if (this.gatewayWs === ws) {
862
- this.gatewayWs = null;
863
- this.gatewayReady = false;
864
- this.gatewayConnectReqId = null;
865
- }
887
+ this.gatewayWs = null;
888
+ this.gatewayReady = false;
889
+ this.gatewayConnectReqId = null;
866
890
  /* c8 ignore next 3 -- gateway 意外断开时结算未完成 RPC,避免等超时 */
867
891
  for (const [, settle] of this.gatewayPendingRequests) {
868
892
  settle({ ok: false, error: 'gateway_closed' });
@@ -943,6 +967,24 @@ export class RealtimeBridge {
943
967
  }
944
968
 
945
969
  async __handleGatewayRequestFromDc(payload, connId) {
970
+ // 入口校验:peer 可能发出残缺 / 类型错误的帧;不应向 gateway 转发 id/method 缺失的请求
971
+ const hasValidId = typeof payload?.id === 'string' && payload.id.length > 0;
972
+ const hasValidMethod = typeof payload?.method === 'string' && payload.method.length > 0;
973
+ if (!hasValidId || !hasValidMethod) {
974
+ this.logger.warn?.(
975
+ `[coclaw] dc gateway req invalid: id=${typeof payload?.id} method=${typeof payload?.method}`,
976
+ );
977
+ // 有合法 id 时回 INVALID_REQUEST 让发起方尽快放弃等待;id 不合法时只能 drop
978
+ if (hasValidId) {
979
+ this.webrtcPeer?.broadcast({
980
+ type: 'res',
981
+ id: payload.id,
982
+ ok: false,
983
+ error: { code: 'INVALID_REQUEST', message: 'missing or invalid id/method' },
984
+ });
985
+ }
986
+ return;
987
+ }
946
988
  const ready = await this.__waitGatewayReady();
947
989
  if (!ready || !this.gatewayWs || this.gatewayWs.readyState !== 1) {
948
990
  // OFFLINE 路径在写映射前触发,无脏映射;保留广播语义(属系统状态公告)
@@ -1133,6 +1175,11 @@ export class RealtimeBridge {
1133
1175
  this.connectTimer.unref?.();
1134
1176
 
1135
1177
  sock.addEventListener('open', () => {
1178
+ // 旧 sock 迟到的 open 不应接管当前会话:避免在 reconnect 后旧 sock 再注入 sender / 重置心跳,
1179
+ // 对称于 close handler 的 sock !== this.serverWs guard
1180
+ if (this.serverWs !== sock || this.intentionallyClosed) {
1181
+ return;
1182
+ }
1136
1183
  this.__clearConnectTimer();
1137
1184
  this.logger.info?.(`[coclaw] realtime bridge connected: ${maskedTarget}`);
1138
1185
  remoteLog('ws.connected peer=server');
@@ -1149,6 +1196,10 @@ export class RealtimeBridge {
1149
1196
  });
1150
1197
 
1151
1198
  sock.addEventListener('message', async (event) => {
1199
+ // 旧 sock 迟到的 message 不应重置当前 sock 的心跳节奏;同 open 路径处理
1200
+ if (this.serverWs !== sock || this.intentionallyClosed) {
1201
+ return;
1202
+ }
1152
1203
  this.__resetServerHbTimeout(sock);
1153
1204
  try {
1154
1205
  const payload = JSON.parse(String(event.data ?? '{}'));
@@ -1183,12 +1234,14 @@ export class RealtimeBridge {
1183
1234
  });
1184
1235
 
1185
1236
  sock.addEventListener('close', async (event) => {
1186
- this.__clearServerHeartbeat();
1187
- this.__clearConnectTimer();
1188
- // serverWs 已指向新实例(如 refresh 后),跳过旧 sock 的清理
1237
+ // 若 serverWs 已指向新实例(如 refresh 后),跳过旧 sock 的清理。
1238
+ // __clearServerHeartbeat / __clearConnectTimer 都是 per-bridge 全局单槽,
1239
+ // sock close 若跑在 guard 前会清掉新 sock 的 heartbeat
1189
1240
  if (this.serverWs !== null && this.serverWs !== sock) {
1190
1241
  return;
1191
1242
  }
1243
+ this.__clearServerHeartbeat();
1244
+ this.__clearConnectTimer();
1192
1245
  setRemoteLogSender(null);
1193
1246
  const wasIntentional = this.intentionallyClosed;
1194
1247
  this.serverWs = null;
@@ -7,6 +7,7 @@
7
7
 
8
8
  import { randomUUID } from 'node:crypto';
9
9
  import fs from 'node:fs/promises';
10
+ import nodeFs from 'node:fs';
10
11
  import nodePath from 'node:path';
11
12
 
12
13
  /**
@@ -60,4 +61,39 @@ async function atomicWriteJsonFile(filePath, value, opts) {
60
61
  await atomicWriteFile(filePath, text, opts);
61
62
  }
62
63
 
63
- export { atomicWriteFile, atomicWriteJsonFile };
64
+ /**
65
+ * Synchronous variant of atomicWriteFile, meant only for synchronous startup-time paths such as device-identity.
66
+ * Intended to behave like atomicWriteFile (write-to-tmp + rename + finally cleanup): the parent directory is created recursively, content goes to a uniquely named temp file, and that file is renamed onto the target.
67
+ *
68
+ * @param {string} filePath - Path of the target file
69
+ * @param {string | Buffer} content - Content to write
70
+ * @param {object} [opts]
71
+ * @param {number} [opts.mode=0o600] - Permission bits used for the written file
72
+ * @param {number} [opts.dirMode] - Permission bits passed to mkdir for the parent directory (only when not null/undefined)
73
+ * @param {string} [opts.encoding='utf8'] - Encoding used when writing
74
+ */
75
+ function atomicWriteFileSync(filePath, content, opts) {
76
+ const mode = opts?.mode ?? 0o600;
77
+ const encoding = opts?.encoding ?? 'utf8';
78
+ const mkdirOpts = { recursive: true };
79
+ if (opts?.dirMode != null) {
80
+ mkdirOpts.mode = opts.dirMode;
81
+ }
82
+
83
+ nodeFs.mkdirSync(nodePath.dirname(filePath), mkdirOpts);
84
+
85
+ const tmp = `${filePath}.${randomUUID()}.tmp`;
86
+ try {
87
+ nodeFs.writeFileSync(tmp, content, { encoding, mode });
88
+ /* c8 ignore next -- chmod is not expected to fail on a normal filesystem */
89
+ try { nodeFs.chmodSync(tmp, mode); } catch { /* ignore */ }
90
+ nodeFs.renameSync(tmp, filePath);
91
+ /* c8 ignore next -- chmod is not expected to fail on a normal filesystem */
92
+ try { nodeFs.chmodSync(filePath, mode); } catch { /* ignore */ }
93
+ } finally {
94
+ // Make sure the temp file is never left behind (after a successful rename tmp no longer exists; rmSync with force=true silently ignores that)
95
+ try { nodeFs.rmSync(tmp, { force: true }); } catch { /* ignore */ }
96
+ }
97
+ }
98
+
99
+ export { atomicWriteFile, atomicWriteJsonFile, atomicWriteFileSync };