@coclaw/openclaw-coclaw 0.17.0 → 0.17.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@coclaw/openclaw-coclaw",
3
- "version": "0.17.0",
3
+ "version": "0.17.2",
4
4
  "type": "module",
5
5
  "license": "Apache-2.0",
6
6
  "description": "OpenClaw CoClaw channel plugin for remote chat",
@@ -58,7 +58,7 @@
58
58
  "release:versions": "npm view @coclaw/openclaw-coclaw versions --json --registry=https://registry.npmjs.org/ && npm view @coclaw/openclaw-coclaw versions --json"
59
59
  },
60
60
  "dependencies": {
61
- "@coclaw/pion-node": "^0.1.3",
61
+ "@coclaw/pion-node": "^0.3.0",
62
62
  "werift": "^0.19.0",
63
63
  "ws": "^8.19.0"
64
64
  },
@@ -659,6 +659,14 @@ export class RealtimeBridge {
659
659
  return;
660
660
  }
661
661
  if (payload.type === 'res' || payload.type === 'event') {
662
+ // 过滤 gateway 的管理层广播事件,这些对 WebChat / plugin 客户端无意义:
663
+ // - health: 全量状态快照(~3KB, ~60s 一次 + RPC 触发),给 Admin UI 的监控仪表盘用
664
+ // - tick: gateway WS 保活心跳(30s 一次),UI 隔着 DC 不需要,DC 自己有 probe 机制
665
+ // 不转发可避免后台时 rpc DC 队列被灌满。上游支持按需订阅前先在插件侧拦截。
666
+ if (payload.type === 'event'
667
+ && (payload.event === 'health' || payload.event === 'tick')) {
668
+ return;
669
+ }
662
670
  this.webrtcPeer?.broadcast(payload);
663
671
  }
664
672
  });
@@ -0,0 +1,375 @@
1
+ /**
2
+ * 文件回退队列:内存优先,超过预算后追加写入 JSONL 文件。
3
+ * 业务无关纯工具:存储任意字符串(调用方需保证不含裸 `\n`,否则行分隔语义被破坏)。
4
+ *
5
+ * 行为约定详见 docs/rpc-dc-file-queue.md。
6
+ * - FIFO、单一生产者/消费者;多消费者时每条只交付给其中一个。
7
+ * - 构造时清理目录残留(不跨生命周期复用)。
8
+ * - 消费侧:`for await (const item of queue) { ... }`;`destroy()` 让迭代结束。
9
+ */
10
+
11
+ import fs from 'node:fs/promises';
12
+ import { createReadStream, createWriteStream, rmSync } from 'node:fs';
13
+ import nodePath from 'node:path';
14
+ import readline from 'node:readline';
15
+
16
+ import { createMutex } from './mutex.js';
17
+
18
+ const DEFAULT_MEM_BUDGET = 8 * 1024 * 1024;
19
+ const DEFAULT_DISK_CAP = 1024 * 1024 * 1024;
20
+
21
/**
 * FIFO queue of strings that lives in memory until `memBudget` bytes are held,
 * then spills further items to an append-only, newline-delimited file.
 *
 * Contract:
 * - Items are opaque strings; they must not contain a raw `\n`, because `\n`
 *   is the on-disk record separator. Every other character (including `\r`,
 *   U+2028 and U+2029) round-trips unchanged.
 * - Once spilled, every new item goes to disk until the file has been fully
 *   drained, which preserves FIFO order.
 * - Consume with `for await (const item of queue)`; `destroy()` ends iteration.
 * - All state transitions run under a single mutex.
 */
class FileBackedQueue {
  /**
   * @param {object} opts
   * @param {string} opts.dir - Root directory for queue files.
   * @param {string} opts.id - Queue identifier (used as the sub-directory name).
   * @param {number} [opts.memBudget=8MB] - Upper bound of bytes held in memory.
   * @param {number} [opts.diskCap=1GB] - Hard cap for disk + memory bytes.
   * @param {(reason: string, size: number) => void} [opts.onDrop] - Called when an item is rejected.
   * @param {{ warn?: Function, info?: Function, error?: Function }} [opts.logger=console]
   * @throws {TypeError} When `dir` or `id` is missing or not a string.
   */
  constructor(opts) {
    const {
      dir,
      id,
      memBudget = DEFAULT_MEM_BUDGET,
      diskCap = DEFAULT_DISK_CAP,
      onDrop,
      logger = console,
    } = opts ?? {};

    if (!dir || typeof dir !== 'string') throw new TypeError('dir is required');
    if (!id || typeof id !== 'string') throw new TypeError('id is required');

    this.dir = dir;
    this.id = id;
    this.memBudget = memBudget;
    this.diskCap = diskCap;
    this.onDrop = onDrop;
    this.logger = logger;

    this.subdir = nodePath.join(dir, id);
    this.filePath = nodePath.join(this.subdir, 'queue.jsonl');

    this.memQueue = [];
    this.memBytes = 0;
    this.diskBytes = 0; // unread payload bytes on disk (separators excluded)
    this.writtenBytes = 0; // cumulative bytes appended to the file (separators included)
    this.readOffset = 0; // file offset where the next refill starts
    this.spilled = false;
    this.destroyed = false;
    this.writeStream = null;
    this.writeErr = null;
    this.waiters = [];
    this.mutex = createMutex();

    // Defensive cleanup: data is never reused across instance lifetimes.
    try {
      rmSync(this.subdir, { recursive: true, force: true });
    } catch (err) {
      /* c8 ignore next 2 -- rmSync with force rarely fails on posix */
      this.logger?.warn?.('fbq.construct cleanup error', err);
    }
  }

  /**
   * Enqueue one string.
   * @param {string} jsonStr
   * @returns {Promise<boolean>} `true` when accepted, `false` when dropped.
   * @throws {TypeError} When `jsonStr` is not a string.
   */
  async enqueue(jsonStr) {
    return await this.mutex.withLock(async () => {
      if (this.destroyed) return false;
      if (typeof jsonStr !== 'string') throw new TypeError('jsonStr must be a string');

      const size = Buffer.byteLength(jsonStr, 'utf8');

      if (this.memBytes + this.diskBytes + size > this.diskCap) {
        this.__dispatchDrop('disk-cap', size);
        return false;
      }

      // Memory path: not spilled yet and the new item still fits the budget.
      if (!this.spilled && this.memBytes + size <= this.memBudget) {
        this.memQueue.push(jsonStr);
        this.memBytes += size;
        this.__wakeOne();
        return true;
      }

      // Spill path: open the write stream lazily.
      if (!this.spilled) {
        await this.__openWriteStream();
        if (this.writeErr) {
          this.__dispatchDrop('fs-error', size);
          return false;
        }
        this.spilled = true;
      }

      try {
        await this.__writeLine(`${jsonStr}\n`);
      } catch (err) {
        this.logger?.warn?.('fbq.enqueue fs-error', err);
        this.__dispatchDrop('fs-error', size);
        // Nothing unread is on disk (e.g. the very first spilled write failed):
        // staying in spilled mode with a dead stream would reject every later
        // item, so fall back to memory mode and let the next spill retry.
        if (this.readOffset >= this.writtenBytes) {
          await this.__dropFile();
        }
        return false;
      }

      this.diskBytes += size;
      this.writtenBytes += size + 1;
      this.__wakeOne();
      return true;
    });
  }

  /**
   * @returns {{ memCount: number, memBytes: number, diskBytes: number, spilled: boolean }}
   */
  stats() {
    return {
      memCount: this.memQueue.length,
      memBytes: this.memBytes,
      diskBytes: this.diskBytes,
      spilled: this.spilled,
    };
  }

  /**
   * Discard all queued data while keeping the instance usable.
   * @returns {Promise<void>}
   */
  async clear() {
    return await this.mutex.withLock(async () => {
      if (this.destroyed) return;
      await this.__closeWriteStream();
      try {
        await fs.rm(this.filePath, { force: true });
      } catch (err) {
        /* c8 ignore next 2 -- rm with force rarely fails */
        this.logger?.warn?.('fbq.clear rm error', err);
      }
      this.memQueue = [];
      this.memBytes = 0;
      this.diskBytes = 0;
      this.writtenBytes = 0;
      this.readOffset = 0;
      this.spilled = false;
      this.writeErr = null;
    });
  }

  /**
   * Stop writing, close the file descriptor, remove the directory and end all
   * iterators. Idempotent.
   * @returns {Promise<void>}
   */
  async destroy() {
    return await this.mutex.withLock(async () => {
      if (this.destroyed) return;
      this.destroyed = true;

      // Wake every waiter; on its next loop turn it observes `destroyed` and
      // reports `done`.
      const toWake = this.waiters.splice(0);
      for (const w of toWake) w.resolve();

      await this.__closeWriteStream();
      try {
        await fs.rm(this.subdir, { recursive: true, force: true });
      } catch (err) {
        /* c8 ignore next 2 -- rm with force rarely fails */
        this.logger?.warn?.('fbq.destroy rm error', err);
      }

      this.memQueue = [];
      this.memBytes = 0;
      this.diskBytes = 0;
      this.writtenBytes = 0;
      this.readOffset = 0;
      this.spilled = false;
    });
  }

  [Symbol.asyncIterator]() {
    const self = this;
    return {
      next() { return self.__nextIter(); },
      return() { return Promise.resolve({ done: true, value: undefined }); },
      [Symbol.asyncIterator]() { return this; },
    };
  }

  /**
   * Produce the next iterator result: refill from disk when memory is empty,
   * otherwise park until an enqueue or destroy wakes this consumer.
   */
  async __nextIter() {
    while (true) {
      let waitPromise = null;
      const result = await this.mutex.withLock(async () => {
        if (this.memQueue.length === 0 && this.spilled && !this.destroyed) {
          await this.__refillImpl();
        }
        if (this.memQueue.length > 0) {
          const item = this.memQueue.shift();
          this.memBytes -= Buffer.byteLength(item, 'utf8');
          return { value: item, done: false };
        }
        if (this.destroyed) return { done: true, value: undefined };
        // Register the waiter while still holding the lock so that a
        // concurrent enqueue cannot slip in between the check and the wait.
        waitPromise = new Promise((resolve, reject) => {
          this.waiters.push({ resolve, reject });
        });
        return null;
      });
      if (result !== null) return result;
      await waitPromise;
    }
  }

  /** Wake the longest-waiting consumer, if any. */
  __wakeOne() {
    if (this.waiters.length > 0) {
      const w = this.waiters.shift();
      w.resolve();
    }
  }

  /** Notify `onDrop` (exceptions from it are logged, not propagated) and log the drop. */
  __dispatchDrop(reason, size) {
    try {
      this.onDrop?.(reason, size);
    } catch (err) {
      /* c8 ignore next 2 -- onDrop throwing is caller's bug */
      this.logger?.warn?.('fbq.onDrop threw', err);
    }
    this.logger?.warn?.('fbq.drop', { reason, size });
  }

  /**
   * Create the sub-directory and open the append stream. A failure of `mkdir`
   * is reported through `this.writeErr`; open errors of the stream itself
   * surface asynchronously through the stream's `'error'` event.
   */
  async __openWriteStream() {
    this.writeErr = null;
    try {
      await fs.mkdir(this.subdir, { recursive: true });
    } catch (err) {
      this.writeErr = err;
      return;
    }
    const stream = createWriteStream(this.filePath, { flags: 'a' });
    this.writeStream = stream;
    stream.on('error', (err) => {
      // Only the current stream may poison the write state; a late error from
      // a stream that was already retired must not affect its successor.
      if (this.writeStream === stream) this.writeErr = err;
      this.logger?.warn?.('fbq.writeStream error', err);
    });
  }

  /** Append one already-terminated line; rejects when the stream has failed. */
  async __writeLine(str) {
    if (this.writeErr) throw this.writeErr;
    return await new Promise((resolve, reject) => {
      this.writeStream.write(str, (err) => {
        if (err) reject(err);
        else resolve();
      });
    });
  }

  /** Detach and close the write stream, waiting until it is really closed. */
  async __closeWriteStream() {
    if (!this.writeStream) return;
    const stream = this.writeStream;
    this.writeStream = null;
    if (stream.destroyed || stream.writableEnded) return;
    // Wait on events rather than `end(cb)`: on an errored stream the `end`
    // callback may never fire, which would deadlock the mutex holder.
    // 'close' fires after a normal end; 'error' is the fallback for a broken
    // stream. Resolving a promise twice is harmless.
    await new Promise((resolve) => {
      stream.once('close', resolve);
      stream.once('error', resolve);
      try {
        stream.end();
        /* c8 ignore next 3 -- stream.end throwing synchronously is very rare */
      } catch {
        resolve();
      }
    });
  }

  /**
   * Move as many complete lines as the memory budget allows from the spill
   * file into `memQueue`. The caller must hold the mutex and must have
   * verified `!destroyed`.
   */
  async __refillImpl() {
    if (!this.spilled) return;

    let actualEnd;
    try {
      const st = await fs.stat(this.filePath);
      actualEnd = st.size;
    } catch (err) {
      this.logger?.warn?.('fbq.refill stat error', err);
      // The spill file is gone, so unread disk data cannot be recovered.
      // Return to memory mode instead of staying spilled forever.
      if (err?.code === 'ENOENT') {
        await this.__dropFile();
      }
      return;
    }

    if (this.readOffset >= actualEnd) {
      await this.__dropFile();
      return;
    }

    let scan;
    try {
      scan = await this.__scanLines(this.readOffset, actualEnd);
    } catch (err) {
      /* c8 ignore next 3 -- read errors are rare; bail out conservatively */
      this.logger?.warn?.('fbq.refill read error', err);
      return;
    }

    if (scan.partialBytes > 0) {
      // The last record has no terminating '\n': treat it as a torn write,
      // discard it and skip to the end of everything that was acknowledged.
      this.logger?.warn?.('fbq.refill partial tail discarded', {
        size: scan.partialBytes,
      });
      this.readOffset = this.writtenBytes;
    } else {
      this.readOffset += scan.consumedBytes;
    }

    for (const { text, size } of scan.lines) {
      this.memQueue.push(text);
      this.memBytes += size;
    }
    this.diskBytes -= scan.payloadBytes;

    if (this.readOffset >= this.writtenBytes) {
      await this.__dropFile();
    }
  }

  /**
   * Read `[start, endExclusive)` of the spill file and split it strictly on
   * the `\n` byte. `node:readline` is deliberately not used: it also breaks
   * lines on `\r` (and other line terminators), which would split or truncate
   * valid items and desynchronise the byte offsets.
   *
   * The first line is always accepted so that an item larger than the budget
   * can still make progress; scanning stops before any later line that would
   * push memory use over `memBudget`.
   *
   * @param {number} start - First byte offset to read.
   * @param {number} endExclusive - Offset one past the last byte to read.
   * @returns {Promise<{
   *   lines: Array<{ text: string, size: number }>,
   *   consumedBytes: number,
   *   payloadBytes: number,
   *   partialBytes: number,
   * }>} `consumedBytes` counts file bytes including separators; `partialBytes`
   *   is the length of an unterminated tail found at the end of the range
   *   (0 when scanning stopped early because of the budget).
   */
  async __scanLines(start, endExclusive) {
    const lines = [];
    let consumedBytes = 0;
    let payloadBytes = 0;
    let pending = []; // buffers of the line currently being assembled
    let pendingLen = 0;
    let budgetHit = false;

    const stream = createReadStream(this.filePath, { start, end: endExclusive - 1 });
    try {
      for await (const chunk of stream) {
        let pos = 0;
        while (!budgetHit && pos < chunk.length) {
          const nl = chunk.indexOf(0x0a, pos);
          if (nl === -1) {
            pending.push(chunk.subarray(pos));
            pendingLen += chunk.length - pos;
            break;
          }
          pending.push(chunk.subarray(pos, nl));
          const raw = pending.length === 1 ? pending[0] : Buffer.concat(pending);
          pending = [];
          pendingLen = 0;

          const text = raw.toString('utf8');
          const size = Buffer.byteLength(text, 'utf8');
          if (lines.length > 0 && this.memBytes + payloadBytes + size > this.memBudget) {
            budgetHit = true;
          } else {
            lines.push({ text, size });
            payloadBytes += size;
            consumedBytes += raw.length + 1;
            pos = nl + 1;
          }
        }
        if (budgetHit) break;
      }
    } finally {
      stream.destroy();
    }

    return {
      lines,
      consumedBytes,
      payloadBytes,
      partialBytes: budgetHit ? 0 : pendingLen,
    };
  }

  /** Close and delete the spill file and return to pure in-memory mode. */
  async __dropFile() {
    await this.__closeWriteStream();
    try {
      await fs.rm(this.filePath, { force: true });
    } catch (err) {
      /* c8 ignore next 2 -- rm with force rarely fails */
      this.logger?.warn?.('fbq.dropFile error', err);
    }
    this.spilled = false;
    this.writtenBytes = 0;
    this.readOffset = 0;
    this.diskBytes = 0;
    this.writeErr = null;
  }
}
374
+
375
+ export { FileBackedQueue };
@@ -62,6 +62,10 @@ export class RpcSendQueue {
62
62
  send(jsonStr) {
63
63
  if (this.closed || this.dc.readyState !== 'open') return false;
64
64
 
65
+ // 诊断日志:打印每次入队的事件,跟踪 gateway 还会推哪些事件
66
+ // 需要时临时打开,平时保持注释避免日志噪音
67
+ // this.logger.info?.(`[rpc-queue${this.__tagSuffix()}] send-payload ${jsonStr}`);
68
+
65
69
  const chunks = buildChunks(jsonStr, this.maxMessageSize, this.getNextMsgId);
66
70
  const totalBytes = chunks
67
71
  ? chunks.reduce((n, c) => n + c.length, 0)
@@ -80,6 +84,8 @@ export class RpcSendQueue {
80
84
  this.droppedCount += 1;
81
85
  this.droppedBytes += totalBytes;
82
86
  this.logger.warn?.(`[rpc-queue${this.__tagSuffix()}] drop reason=queue-full size=${totalBytes} queueBytes=${this.queueBytes}`);
87
+ // 诊断日志:定位后台长时间占队的事件来源。需要时临时打开
88
+ // this.logger.info?.(`[rpc-queue${this.__tagSuffix()}] dropped-payload ${jsonStr}`);
83
89
  if (!this.queueOverflowActive) {
84
90
  this.queueOverflowActive = true;
85
91
  remoteLog(`rpc-queue.overflow-start${this.__tagSuffix()} queueBytes=${this.queueBytes}`);
@@ -69,6 +69,17 @@ export class WebRtcPeer {
69
69
  clearTimeout(session.__failedTimer);
70
70
  session.__failedTimer = null;
71
71
  }
72
+ // 清理 plugin-probe 定时器(避免 session 已关闭仍触发 timeout 日志,
73
+ // 或 500ms 调度窗口内 session 被替换时对着新 session 误发探针)
74
+ if (session.__pluginProbeSchedTimer) {
75
+ clearTimeout(session.__pluginProbeSchedTimer);
76
+ session.__pluginProbeSchedTimer = null;
77
+ }
78
+ if (session.__pluginProbeTimer) {
79
+ clearTimeout(session.__pluginProbeTimer);
80
+ session.__pluginProbeTimer = null;
81
+ session.__pluginProbeInFlight = null;
82
+ }
72
83
  this.__sessions.delete(connId);
73
84
  // 显式关闭 rpc 发送队列:dc.onclose 路径中 `sessions.get(connId)` 已返回 undefined 而短路,
74
85
  // 此处不主动 close 会丢失 drop 汇总 remoteLog 诊断
@@ -83,6 +94,9 @@ export class WebRtcPeer {
83
94
  if ('onselectedcandidatepairchange' in session.pc) {
84
95
  session.pc.onselectedcandidatepairchange = null;
85
96
  }
97
+ if ('oniceconnectionstatechange' in session.pc) {
98
+ session.pc.oniceconnectionstatechange = null;
99
+ }
86
100
  await session.pc.close();
87
101
  this.__remoteLog(`rtc.closed conn=${connId}`);
88
102
  this.logger.info?.(`${this.__rtcTag} [${connId}] closed`);
@@ -175,6 +189,7 @@ export class WebRtcPeer {
175
189
  toConnId: connId,
176
190
  payload: { sdp: answer.sdp },
177
191
  });
192
+ this.__remoteLog(`rtc.restart-answer-sent conn=${connId}`);
178
193
  this.logger.info?.(`${this.__rtcTag} ICE restart answer sent to ${connId}`);
179
194
  return;
180
195
  } catch (err) {
@@ -237,7 +252,14 @@ export class WebRtcPeer {
237
252
  const turnUrl = iceServers.find((s) => s.urls?.startsWith('turn:'))?.urls ?? 'none';
238
253
  this.__remoteLog(`rtc.ice-config conn=${connId} stun=${stunUrl} turn=${turnUrl}`);
239
254
 
240
- const pc = new this.__PeerConnection({ iceServers });
255
+ // settings 仅对 pion 生效:werift 路径不吃 settings 字段(大概率静默忽略,
256
+ // 但按 __impl 分层更干净)。只收紧 pion 的 SCTP RTO 退避上限到 10s,
257
+ // 让 APK 后台唤醒后的深度退避窗口能落在 UI 的 15s 超时内。
258
+ const pcConfig = { iceServers };
259
+ if (this.__impl === 'pion') {
260
+ pcConfig.settings = { sctpRtoMax: 10000 };
261
+ }
262
+ const pc = new this.__PeerConnection(pcConfig);
241
263
 
242
264
  const remoteMaxMessageSize = this.__resolveMaxMessageSize(pc, msg.payload.sdp);
243
265
 
@@ -245,11 +267,24 @@ export class WebRtcPeer {
245
267
  this.__sessions.set(connId, session);
246
268
 
247
269
  // ICE candidate → 发给 UI,并统计各类型 candidate 数量
270
+ // gather complete 时一并输出 host 候选的 IP:port 列表(诊断 docker/vbridge 误 gather)
248
271
  const candidateCounts = { host: 0, srflx: 0, relay: 0 };
272
+ const hostAddrs = [];
273
+ let gatheringEmitted = false;
274
+ const flushGatherDiag = () => {
275
+ if (gatheringEmitted) return;
276
+ gatheringEmitted = true;
277
+ const hostInfo = hostAddrs.length ? ` hosts=${hostAddrs.join(',')}` : '';
278
+ this.__remoteLog(`rtc.ice-gathered conn=${connId} host=${candidateCounts.host} srflx=${candidateCounts.srflx} relay=${candidateCounts.relay}${hostInfo}`);
279
+ candidateCounts.host = 0;
280
+ candidateCounts.srflx = 0;
281
+ candidateCounts.relay = 0;
282
+ hostAddrs.length = 0;
283
+ };
249
284
  pc.onicecandidate = ({ candidate }) => {
250
285
  if (!candidate) {
251
- // gathering 完成,输出汇总
252
- this.__remoteLog(`rtc.ice-gathered conn=${connId} host=${candidateCounts.host} srflx=${candidateCounts.srflx} relay=${candidateCounts.relay}`);
286
+ // 浏览器路径:gathering 完成通过 null candidate 通知
287
+ flushGatherDiag();
253
288
  return;
254
289
  }
255
290
  // 从 candidate 字符串中提取类型(typ host / typ srflx / typ relay)
@@ -257,6 +292,14 @@ export class WebRtcPeer {
257
292
  if (typMatch && candidateCounts[typMatch[1]] !== undefined) {
258
293
  candidateCounts[typMatch[1]]++;
259
294
  }
295
+ // host 候选记录 addr:port,用于观察 pion 是否把 docker0 / br-* / loopback 等接口当成 host
296
+ // candidate 格式: "candidate:<foundation> <comp> <proto> <prio> <ADDR> <PORT> typ host ..."
297
+ if (typMatch?.[1] === 'host') {
298
+ const parts = candidate.candidate.split(' ');
299
+ if (parts.length >= 6) {
300
+ hostAddrs.push(`${parts[4]}:${parts[5]}`);
301
+ }
302
+ }
260
303
  this.__onSend({
261
304
  type: 'rtc:ice',
262
305
  toConnId: connId,
@@ -267,6 +310,29 @@ export class WebRtcPeer {
267
310
  },
268
311
  });
269
312
  };
313
+ // pion-node 不会在 gather complete 时 fire onicecandidate(null),用 icegatheringstatechange 兜底。
314
+ // gathering→ 重置 flag 支持 ICE restart;complete→ flush 汇总
315
+ if ('onicegatheringstatechange' in pc) {
316
+ pc.onicegatheringstatechange = () => {
317
+ const state = pc.iceGatheringState;
318
+ if (state === 'gathering') {
319
+ gatheringEmitted = false;
320
+ } else if (state === 'complete') {
321
+ flushGatherDiag();
322
+ }
323
+ };
324
+ }
325
+
326
+ // ICE agent 状态(pion 暴露的独立事件):能看到 checking / connected / failed 等纯 ICE 侧跳转,
327
+ // 与复合 connectionState 互补。对诊断"pion 说 connected 但 UI 看不到数据"非常关键。
328
+ // 仅在 pion-node 实现中可用;其他实现赋值是 no-op。
329
+ if ('oniceconnectionstatechange' in pc) {
330
+ pc.oniceconnectionstatechange = () => {
331
+ const cur = this.__sessions.get(connId);
332
+ if (!cur || cur.pc !== pc) return;
333
+ this.__remoteLog(`rtc.iceState conn=${connId} ${pc.iceConnectionState ?? '?'}`);
334
+ };
335
+ }
270
336
 
271
337
  // 连接状态变更(校验 pc 归属,防止旧 PC 异步回调删除新 session)
272
338
  pc.onconnectionstatechange = () => {
@@ -285,6 +351,7 @@ export class WebRtcPeer {
285
351
  }
286
352
 
287
353
  if (state === 'connected') {
354
+ const prevDumpState = cur.__lastDumpState;
288
355
  // 重置 dump 去重水位(disconnected → connected → disconnected 仍能再 dump)
289
356
  cur.__lastDumpState = null;
290
357
  // werift: iceTransports[0].connection.nominated
@@ -298,6 +365,22 @@ export class WebRtcPeer {
298
365
  this.logger.info?.(`${this.__rtcTag} [${connId}] ICE nominated: local=${localInfo} remote=${remoteInfo}`);
299
366
  }
300
367
  // pion: pair 通过独立的 selectedcandidatepairchange 事件上报
368
+ // ICE restart 恢复(disconnected/failed → connected)时做诊断动作:
369
+ // - dump 当前 session DC 状态,对照"UI 看不到 connected 时 plugin 侧看到什么"
370
+ // - 发一次 plugin-probe,实测 DC 是否双向可用
371
+ // 只对 pion 生效:werift/ndc 为兼容路径,不涉及本次调查的病态场景。
372
+ if (this.__impl === 'pion' && (prevDumpState === 'disconnected' || prevDumpState === 'failed')) {
373
+ this.__dumpSessionState(connId, cur, 'connected');
374
+ // 挂到 session 上,使 closeByConnId 能在 500ms 窗口内取消;
375
+ // 否则 session 被替换(同 connId 新 offer)时会对着新 session 误发探针。
376
+ if (cur.__pluginProbeSchedTimer) clearTimeout(cur.__pluginProbeSchedTimer);
377
+ cur.__pluginProbeSchedTimer = setTimeout(() => {
378
+ cur.__pluginProbeSchedTimer = null;
379
+ this.__sendPluginProbe(connId);
380
+ }, 500);
381
+ // unref() 避免定时器阻塞 gateway 进程退出(gateway 由其他连接保活)。
382
+ cur.__pluginProbeSchedTimer.unref?.();
383
+ }
301
384
  } else if (state === 'disconnected' || state === 'failed' || state === 'closed') {
302
385
  // 诊断 dump:失败/断连/关闭时输出当前 PC 上 DC 状态,定位"PC 假活/DC 死"现象
303
386
  // - closed 由 closeByConnId 接管清理,dump 收敛诊断噪声
@@ -427,6 +510,11 @@ export class WebRtcPeer {
427
510
  catch { /* DC 已关闭,忽略 */ }
428
511
  return;
429
512
  }
513
+ // 来自 UI 的 plugin-probe 回复:验证 plugin → UI 方向确实传达并被回传
514
+ if (payload.type === 'plugin-probe-ack') {
515
+ this.__handlePluginProbeAck(connId, payload.id);
516
+ return;
517
+ }
430
518
  if (payload.type === 'req') {
431
519
  // coclaw.files.* 方法本地处理,不转发 gateway
432
520
  if (payload.method?.startsWith('coclaw.files.') && this.__onFileRpc) {
@@ -488,16 +576,57 @@ export class WebRtcPeer {
488
576
  */
489
577
  __dumpSessionState(connId, session, state) {
490
578
  const rpcState = session.rpcChannel?.readyState ?? 'none';
491
- const fileSummary = session.fileChannels.size === 0
492
- ? 'none'
493
- /* c8 ignore next -- ?? fallback for missing readyState */
494
- : [...session.fileChannels].map((dc) => `${dc.label}=${dc.readyState ?? '?'}`).join(',');
579
+ const fileSummary = this.__summarizeFileChannels(session.fileChannels);
495
580
  const q = session.rpcSendQueue;
496
581
  const queueInfo = q
497
582
  ? `queueLen=${q.queue.length} queueBytes=${q.queueBytes} dropped=${q.droppedCount}`
498
583
  : 'queue=none';
499
584
  this.__remoteLog(`rtc.dump conn=${connId} state=${state} sessions=${this.__sessions.size} rpc=${rpcState} ${queueInfo} fileCount=${session.fileChannels.size} files=[${fileSummary}]`);
500
585
  this.logger.info?.(`${this.__rtcTag} [${connId}] dump state=${state} rpc=${rpcState} ${queueInfo} fileCount=${session.fileChannels.size} files=${fileSummary}`);
586
+ // 仅 pion 路径追加 SCTP 采样:cwnd 是否塌回 1×MTU + bytesSent 增量是否 ~0
587
+ // 是判定"是否陷入深度 RTO 退避"的关键。fire-and-forget + 内部 try/catch
588
+ // 双保险,不阻塞 dump 主流程;rtc.sctp 独立一行避免污染既有 rtc.dump 格式。
589
+ if (this.__impl === 'pion' && typeof session.pc.getSctpStats === 'function') {
590
+ this.__dumpSctpStats(connId, session, state).catch(() => {});
591
+ }
592
+ }
593
+
594
+ /**
595
+ * 按 readyState 聚合 file DC。closed 态只给计数,非 closed 态附带 label —
596
+ * 长会话内已关闭的 DC 会累积到 FIFO 上限,全量拼 label 会让 dump 膨胀,
597
+ * 而断连时真正有诊断价值的是"还没关干净"的 DC。
598
+ */
599
+ __summarizeFileChannels(fileChannels) {
600
+ if (fileChannels.size === 0) return 'none';
601
+ const byState = new Map();
602
+ for (const dc of fileChannels) {
603
+ /* c8 ignore next -- ?? fallback for missing readyState */
604
+ const st = dc.readyState ?? '?';
605
+ if (!byState.has(st)) byState.set(st, []);
606
+ byState.get(st).push(dc.label);
607
+ }
608
+ const parts = [];
609
+ for (const [st, labels] of byState) {
610
+ if (st === 'closed') parts.push(`closed:${labels.length}`);
611
+ else parts.push(`${st}:${labels.length}(${labels.join(',')})`);
612
+ }
613
+ return parts.join(' ');
614
+ }
615
+
616
+ async __dumpSctpStats(connId, session, state) {
617
+ try {
618
+ const stats = await session.pc.getSctpStats();
619
+ if (!stats) {
620
+ this.__remoteLog(`rtc.sctp conn=${connId} state=${state} sctp=none`);
621
+ return;
622
+ }
623
+ this.__remoteLog(
624
+ `rtc.sctp conn=${connId} state=${state} cwnd=${stats.congestionWindow} srtt=${Math.round(stats.srttMs)}ms sent=${stats.bytesSent} recv=${stats.bytesReceived} mtu=${stats.mtu}`,
625
+ );
626
+ } catch (err) {
627
+ this.__remoteLog(`rtc.sctp conn=${connId} state=${state} error=${err.message}`);
628
+ this.logger.warn?.(`${this.__rtcTag} [${connId}] getSctpStats error: ${err.message}`);
629
+ }
501
630
  }
502
631
 
503
632
  /**
@@ -558,6 +687,59 @@ export class WebRtcPeer {
558
687
  this.__remoteLog(`rtc.peer-transport conn=${connId} type=${payload.candidateType} proto=${payload.protocol} relay=${payload.relayProtocol ?? '-'}`);
559
688
  }
560
689
 
690
+ /**
691
+ * 主动探针:在 rpc DC 上发一个 plugin-probe,期待 UI 回 plugin-probe-ack。
692
+ * 用于区分"pion 报告 connected 但 UI 其实没收到数据"与"UI 真的收到了但没记录事件"。
693
+ * 绕过 RpcSendQueue(与 probe-ack 对称),仅测量传输层,不受应用层积压影响。
694
+ * 同一 session 同时只保留一条 in-flight 探针;超时仅打日志,不影响业务恢复。
695
+ */
696
+ __sendPluginProbe(connId) {
697
+ const session = this.__sessions.get(connId);
698
+ if (!session) return;
699
+ const dc = session.rpcChannel;
700
+ if (!dc || dc.readyState !== 'open') return;
701
+ // 已有 in-flight 则跳过(避免重复)
702
+ if (session.__pluginProbeInFlight) return;
703
+
704
+ const id = (session.__pluginProbeIdSeq = (session.__pluginProbeIdSeq ?? 0) + 1);
705
+ const startMs = Date.now();
706
+ const timer = setTimeout(() => {
707
+ if (session.__pluginProbeInFlight?.id === id) {
708
+ session.__pluginProbeInFlight = null;
709
+ session.__pluginProbeTimer = null;
710
+ this.__remoteLog(`rtc.plugin-probe conn=${connId} id=${id} timeout`);
711
+ }
712
+ }, 5000);
713
+ timer.unref?.();
714
+ session.__pluginProbeInFlight = { id, startMs };
715
+ session.__pluginProbeTimer = timer;
716
+
717
+ try {
718
+ dc.send(JSON.stringify({ type: 'plugin-probe', id }));
719
+ this.__remoteLog(`rtc.plugin-probe conn=${connId} id=${id} sent`);
720
+ } catch (err) {
721
+ clearTimeout(timer);
722
+ session.__pluginProbeInFlight = null;
723
+ session.__pluginProbeTimer = null;
724
+ this.__remoteLog(`rtc.plugin-probe conn=${connId} id=${id} send-failed msg=${err?.message ?? err}`);
725
+ }
726
+ }
727
+
728
+ /** 收到 UI 的 plugin-probe-ack:计算 RTT 并释放 in-flight 槽位 */
729
+ __handlePluginProbeAck(connId, id) {
730
+ const session = this.__sessions.get(connId);
731
+ if (!session) return;
732
+ const inFlight = session.__pluginProbeInFlight;
733
+ if (!inFlight || inFlight.id !== id) return; // 过期 ack,忽略
734
+ const rtt = Date.now() - inFlight.startMs;
735
+ if (session.__pluginProbeTimer) {
736
+ clearTimeout(session.__pluginProbeTimer);
737
+ session.__pluginProbeTimer = null;
738
+ }
739
+ session.__pluginProbeInFlight = null;
740
+ this.__remoteLog(`rtc.plugin-probe conn=${connId} id=${id} acked rtt=${rtt}`);
741
+ }
742
+
561
743
  __remoteLog(msg) {
562
744
  remoteLog(this.__impl ? `${msg} rtc=${this.__impl}` : msg);
563
745
  }