@coclaw/openclaw-coclaw 0.17.2 → 0.17.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@coclaw/openclaw-coclaw",
3
- "version": "0.17.2",
3
+ "version": "0.17.4",
4
4
  "type": "module",
5
5
  "license": "Apache-2.0",
6
6
  "description": "OpenClaw CoClaw channel plugin for remote chat",
@@ -31,6 +31,7 @@
31
31
  "src/**/*.js",
32
32
  "!src/**/*.test.js",
33
33
  "!src/mock-server.helper.js",
34
+ "!src/homedir-mock.helper.js",
34
35
  "openclaw.plugin.json",
35
36
  "LICENSE"
36
37
  ],
@@ -1,134 +1,199 @@
1
1
  /**
2
2
  * worker-verify.js — 升级后验证
3
3
  *
4
- * 三步验证策略(任一失败即判定升级失败):
5
- * 1. Gateway 存活:轮询 `openclaw gateway status`,超时 60s
6
- * 2. 插件已加载:`openclaw plugins list` 包含指定插件
7
- * 3. 升级模块健康:`openclaw gateway call coclaw.upgradeHealth` 返回版本号
4
+ * 策略:触发 gateway restart → 轮询 coclaw.upgradeHealth RPC 直到返回版本
5
+ * 严格等于 toVersion。单次调用失败(gateway 未就绪 / plugin 未注册 / JSON 非法 /
6
+ * 版本不对)一律按"稍后重试"处理,在总超时窗口内持续尝试。
8
7
  *
9
- * 3 步同时验证了插件代码能正常执行、gateway method 注册链路正常,
10
- * 确保插件仍具备自我升级能力。
8
+ * 磁盘 package.json 的版本仅作为诊断写入本地日志,不参与判定——openclaw
9
+ * `plugins.installs[id].installPath` 可能在 id-migration 等极端场景发生漂移,
10
+ * 而 upgradeHealth 是 gateway 进程内"新代码真的被加载"的权威信号。
11
+ *
12
+ * worker 运行在独立子进程中,禁止使用 remoteLog;诊断信息全部通过 logger
13
+ * (本地日志)输出,由 updater 记录到 upgrade-log.jsonl。
11
14
  */
12
15
  import { execFile as nodeExecFile } from 'node:child_process';
16
+ import { readFile } from 'node:fs/promises';
17
+ import nodePath from 'node:path';
13
18
 
14
- const GATEWAY_READY_TIMEOUT_MS = 60_000;
15
- const POLL_INTERVAL_MS = 2000;
16
19
  const CMD_TIMEOUT_MS = 30_000;
20
+ const HEALTH_POLL_INTERVAL_MS = 3_000;
21
+ // 本机 openclaw 冷启动可能需访问外部资源(AWS 诊断、ollama 探测等)
22
+ // 及插件 bootstrap,合计 30~60s 常见;5 分钟给足余量
23
+ const HEALTH_TOTAL_TIMEOUT_MS = 5 * 60 * 1000;
17
24
 
18
25
  /**
19
- * 执行命令并返回 stdout
26
+ * 执行命令并返回 stdout;错误对象附带 stderr 以便诊断
20
27
  * @param {string} cmd
21
28
  * @param {string[]} args
22
29
  * @param {object} [opts]
23
30
  * @param {Function} [opts.execFileFn]
31
+ * @param {number} [opts.cmdTimeoutMs]
24
32
  * @returns {Promise<string>}
25
33
  */
26
34
  function runCmd(cmd, args, opts) {
27
35
  /* c8 ignore next -- ?./?? fallback */
28
36
  const doExecFile = opts?.execFileFn ?? nodeExecFile;
37
+ /* c8 ignore next -- ?./?? fallback */
38
+ const timeout = opts?.cmdTimeoutMs ?? CMD_TIMEOUT_MS;
29
39
  return new Promise((resolve, reject) => {
30
- doExecFile(cmd, args, { timeout: CMD_TIMEOUT_MS, shell: process.platform === 'win32' }, (err, stdout) => {
31
- if (err) reject(err);
40
+ doExecFile(cmd, args, { timeout, shell: process.platform === 'win32' }, (err, stdout, stderr) => {
41
+ if (err) {
42
+ /* c8 ignore next -- ?? fallback:execFile 实现不保证 stderr 一定字符串化 */
43
+ err.stderr = String(stderr ?? '');
44
+ reject(err);
45
+ }
32
46
  else resolve(String(stdout).trim());
33
47
  });
34
48
  });
35
49
  }
36
50
 
37
51
  /**
38
- * 步骤 1:等待 gateway 恢复运行
52
+ * 触发一次 gateway 重启;失败不抛(后续轮询 RPC 会兜底验证 gateway 是否就绪)
39
53
  * @param {object} [opts]
40
54
  * @param {Function} [opts.execFileFn]
41
- * @param {number} [opts.timeoutMs]
42
- * @param {number} [opts.pollIntervalMs]
43
55
  * @returns {Promise<void>}
44
56
  */
45
- export async function waitForGateway(opts) {
46
- // 主动触发重启,不依赖 OpenClaw 的文件变更自动重启策略
57
+ export async function triggerGatewayRestart(opts) {
47
58
  try {
48
59
  await runCmd('openclaw', ['gateway', 'restart'], opts);
49
60
  }
50
61
  catch {
51
- // restart 命令失败不阻断流程,仍尝试等待
52
- }
53
-
54
- /* c8 ignore next 2 -- ?./?? fallback */
55
- const timeout = opts?.timeoutMs ?? GATEWAY_READY_TIMEOUT_MS;
56
- const interval = opts?.pollIntervalMs ?? POLL_INTERVAL_MS;
57
- const start = Date.now();
58
-
59
- while (Date.now() - start < timeout) {
60
- try {
61
- const output = await runCmd('openclaw', ['gateway', 'status'], opts);
62
- if (output.includes('running')) return;
63
- }
64
- catch {
65
- // gateway 未就绪,继续轮询
66
- }
67
- await sleep(interval);
62
+ // restart 命令本身失败不阻断:openclaw 可能已在重启/daemon 自恢复;
63
+ // 无论如何都进入后续 upgradeHealth 轮询,由它判定 gateway 最终是否可用
68
64
  }
69
-
70
- throw new Error('Gateway did not become ready within timeout');
71
65
  }
72
66
 
73
67
  /**
74
- * 步骤 2:验证插件已加载
75
- * @param {string} pluginId - 插件 ID
76
- * @param {object} [opts]
77
- * @param {Function} [opts.execFileFn]
78
- * @returns {Promise<void>}
68
+ * 读取磁盘 package.json 的版本号(诊断用途,不参与判定)
69
+ * @param {string} pluginDir
70
+ * @returns {Promise<string | null>}
79
71
  */
80
- export async function verifyPluginLoaded(pluginId, opts) {
81
- const output = await runCmd('openclaw', ['plugins', 'list'], opts);
82
- if (!output.includes(pluginId)) {
83
- throw new Error(`Plugin ${pluginId} not found in plugins list`);
72
+ export async function readDiskPackageVersion(pluginDir) {
73
+ try {
74
+ const pkgPath = nodePath.join(pluginDir, 'package.json');
75
+ const raw = await readFile(pkgPath, 'utf8');
76
+ const pkg = JSON.parse(raw);
77
+ return typeof pkg?.version === 'string' ? pkg.version : null;
78
+ }
79
+ catch {
80
+ return null;
84
81
  }
85
82
  }
86
83
 
87
84
  /**
88
- * 步骤 3:验证升级模块健康
85
+ * 单次调用 coclaw.upgradeHealth;永不抛异常,失败归一化为 { ok: false, reason }
89
86
  * @param {object} [opts]
90
- * @param {Function} [opts.execFileFn]
91
- * @returns {Promise<string>} 返回版本号
87
+ * @returns {Promise<{ ok: true, version: string } | { ok: false, reason: string }>}
92
88
  */
93
- export async function verifyUpgradeHealth(opts) {
94
- const output = await runCmd(
95
- 'openclaw',
96
- ['gateway', 'call', 'coclaw.upgradeHealth', '--json'],
97
- opts,
98
- );
89
+ async function callUpgradeHealthOnce(opts) {
99
90
  try {
100
- const result = JSON.parse(output);
101
- if (!result.version) {
102
- throw new Error('upgradeHealth response missing version');
91
+ const output = await runCmd(
92
+ 'openclaw',
93
+ ['gateway', 'call', 'coclaw.upgradeHealth', '--json'],
94
+ opts,
95
+ );
96
+ let payload;
97
+ try {
98
+ payload = JSON.parse(output);
103
99
  }
104
- return result.version;
100
+ catch {
101
+ return { ok: false, reason: `invalid-json: ${output.slice(0, 120)}` };
102
+ }
103
+ if (!payload?.version) return { ok: false, reason: 'missing-version' };
104
+ return { ok: true, version: String(payload.version) };
105
105
  }
106
106
  catch (err) {
107
- if (err.message?.includes('upgradeHealth')) throw err;
108
- throw new Error(`Failed to parse upgradeHealth response: ${output}`);
107
+ const stderr = typeof err?.stderr === 'string' ? err.stderr.trim() : '';
108
+ /* c8 ignore next -- ?? fallback */
109
+ const msg = err?.message ?? String(err);
110
+ const reason = (stderr || msg || 'unknown').slice(0, 200);
111
+ return { ok: false, reason };
109
112
  }
110
113
  }
111
114
 
112
115
  /**
113
- * 执行完整验证流程
114
- * @param {string} pluginId - 插件 ID
116
+ * 轮询 upgradeHealth 直到版本严格等于 toVersion,或总超时
117
+ * @param {string} toVersion
115
118
  * @param {object} [opts]
116
119
  * @param {Function} [opts.execFileFn]
117
- * @param {number} [opts.timeoutMs]
120
+ * @param {number} [opts.totalTimeoutMs]
118
121
  * @param {number} [opts.pollIntervalMs]
119
- * @returns {Promise<{ ok: boolean, version?: string, error?: string }>}
122
+ * @param {number} [opts.cmdTimeoutMs]
123
+ * @returns {Promise<{ ok: true, version: string, attempts: number, elapsedMs: number }
124
+ * | { ok: false, attempts: number, elapsedMs: number, lastReason: string, lastVersion: string }>}
120
125
  */
121
- export async function verifyUpgrade(pluginId, opts) {
122
- try {
123
- await waitForGateway(opts);
124
- await verifyPluginLoaded(pluginId, opts);
125
- const version = await verifyUpgradeHealth(opts);
126
- return { ok: true, version };
126
+ export async function pollUpgradeHealth(toVersion, opts) {
127
+ /* c8 ignore next -- ?? fallback */
128
+ const totalTimeout = opts?.totalTimeoutMs ?? HEALTH_TOTAL_TIMEOUT_MS;
129
+ /* c8 ignore next -- ?? fallback */
130
+ const pollInterval = opts?.pollIntervalMs ?? HEALTH_POLL_INTERVAL_MS;
131
+ const start = Date.now();
132
+ let attempts = 0;
133
+ let lastReason = '';
134
+ let lastVersion = '';
135
+
136
+ while (Date.now() - start < totalTimeout) {
137
+ attempts += 1;
138
+ const result = await callUpgradeHealthOnce(opts);
139
+ if (result.ok) {
140
+ if (result.version === toVersion) {
141
+ return {
142
+ ok: true,
143
+ version: result.version,
144
+ attempts,
145
+ elapsedMs: Date.now() - start,
146
+ };
147
+ }
148
+ lastVersion = result.version;
149
+ lastReason = `version-mismatch got=${result.version} want=${toVersion}`;
150
+ }
151
+ else {
152
+ lastReason = result.reason;
153
+ }
154
+ // 剩余时间不足以再等一个 interval 就直接退出,避免最后一次毫无意义的 sleep
155
+ if (Date.now() - start + pollInterval >= totalTimeout) break;
156
+ await sleep(pollInterval);
127
157
  }
128
- catch (err) {
129
- /* c8 ignore next -- ?./?? fallback */
130
- return { ok: false, error: String(err?.message ?? err) };
158
+
159
+ return {
160
+ ok: false,
161
+ attempts,
162
+ elapsedMs: Date.now() - start,
163
+ lastReason,
164
+ lastVersion,
165
+ };
166
+ }
167
+
168
+ /**
169
+ * 完整验证流程:触发 gateway restart → 读磁盘版本(诊断)→ 轮询 upgradeHealth
170
+ * @param {string} pluginDir - 插件安装目录(来自 openclaw.json 的权威 installPath)
171
+ * @param {string} toVersion - 目标版本
172
+ * @param {object} [opts]
173
+ * @param {Function} [opts.execFileFn]
174
+ * @param {number} [opts.totalTimeoutMs]
175
+ * @param {number} [opts.pollIntervalMs]
176
+ * @param {number} [opts.cmdTimeoutMs]
177
+ * @param {Function} [log] - 本地日志函数
178
+ * @returns {Promise<{ ok: true, version: string } | { ok: false, error: string }>}
179
+ */
180
+ export async function verifyUpgrade(pluginDir, toVersion, opts, log) {
181
+ const logFn = typeof log === 'function' ? log : () => {};
182
+
183
+ await triggerGatewayRestart(opts);
184
+
185
+ const onDiskVersion = await readDiskPackageVersion(pluginDir);
186
+ logFn(`[upgrade-worker] On-disk package.json version: ${onDiskVersion ?? '(unreadable)'} (expected ${toVersion})`);
187
+
188
+ const result = await pollUpgradeHealth(toVersion, opts);
189
+ if (result.ok) {
190
+ logFn(`[upgrade-worker] upgradeHealth verified: version=${result.version} attempts=${result.attempts} elapsed=${result.elapsedMs}ms`);
191
+ return { ok: true, version: result.version };
131
192
  }
193
+
194
+ const error = `verify timeout: attempts=${result.attempts} elapsed=${result.elapsedMs}ms lastVersion=${result.lastVersion || '(none)'} lastReason=${result.lastReason || '(none)'}`;
195
+ logFn(`[upgrade-worker] ${error}`);
196
+ return { ok: false, error };
132
197
  }
133
198
 
134
199
  function sleep(ms) {
@@ -15,7 +15,7 @@
15
15
  import { execFile as nodeExecFile } from 'node:child_process';
16
16
  import { parseArgs } from 'node:util';
17
17
  import { createBackup, restoreFromBackup, removeBackup } from './worker-backup.js';
18
- import { verifyUpgrade, waitForGateway } from './worker-verify.js';
18
+ import { verifyUpgrade, triggerGatewayRestart } from './worker-verify.js';
19
19
  import { addSkippedVersion, updateLastUpgrade, appendLog } from './state.js';
20
20
  import { getCurrentNpmRegistry, pickFallbackRegistry } from './registry-fallback.js';
21
21
 
@@ -160,7 +160,7 @@ export async function runUpgrade({ pluginDir, fromVersion, toVersion, pluginId,
160
160
 
161
161
  // 3. 等待 gateway 重启并验证
162
162
  log('[upgrade-worker] Verifying upgrade...');
163
- const result = await verifyUpgrade(pluginId, opts);
163
+ const result = await verifyUpgrade(pluginDir, toVersion, opts, log);
164
164
 
165
165
  if (result.ok) {
166
166
  // 4a. 成功
@@ -212,15 +212,9 @@ async function handleRollback({ pluginDir, fromVersion, toVersion, pluginId, pkg
212
212
  }
213
213
  }
214
214
 
215
- // 等待 gateway 重启
216
- log('[upgrade-worker] Waiting for gateway to restart after rollback...');
217
- try {
218
- await waitForGateway(opts);
219
- log('[upgrade-worker] Gateway restarted after rollback');
220
- }
221
- catch {
222
- log('[upgrade-worker] Gateway did not restart after rollback');
223
- }
215
+ // 触发 gateway 重启让老版本回到运行态(尽力而为,不验证结果)
216
+ log('[upgrade-worker] Triggering gateway restart after rollback...');
217
+ await triggerGatewayRestart(opts);
224
218
 
225
219
  // 记录状态(顺序执行因共享 state 文件,但各自 try/catch 避免单个失败阻断其余)
226
220
  // 仅验证失败(新版本确实被加载并发现有问题)才标记为 skipped;
@@ -22,6 +22,12 @@ const CONNECT_TIMEOUT_MS = 10_000;
22
22
  const SERVER_HB_PING_MS = 25_000;
23
23
  const SERVER_HB_TIMEOUT_MS = 45_000;
24
24
  const SERVER_HB_MAX_MISS = 4; // 连续 4 次无响应才断连(~3 分钟)
25
+ // gateway 握手失败的指数退避表:每个元素是"上一次失败"之后、"下一次尝试"之前的等待时间。
26
+ // 最多 5 次重试(加上首次尝试共 6 次),全部失败后进入 gave-up 终态,不再自动尝试。
27
+ const GATEWAY_RETRY_DELAYS_MS = [5_000, 10_000, 20_000, 20_000, 20_000];
28
+ // v3 握手失败时,只有错误消息匹配此正则才回退到不带 device 的 legacy 握手。
29
+ // 严格限定在"签名/设备/scope/协议"相关错误,避免对网络/内部错误做无意义的降级尝试。
30
+ const GATEWAY_HANDSHAKE_FALLBACK_PATTERN = /signature|device|scope|protocol/i;
25
31
 
26
32
  function toServerWsUrl(baseUrl, token) {
27
33
  const url = new URL(baseUrl);
@@ -112,6 +118,12 @@ export class RealtimeBridge {
112
118
  this.__fileHandler = null;
113
119
  this.__ndcPreloadResult = null;
114
120
  this.__ndcCleanup = null;
121
+ // gateway 握手重试状态(刷屏治理 + 兼容性回退)
122
+ this.__gatewayAttempts = 0; // 已失败的连续握手次数(握手成功时归零)
123
+ this.__gatewayRetryTimer = null; // 下一次尝试的 setTimeout 句柄
124
+ this.__gatewayGaveUp = false; // 重试次数耗尽 → 终态,不再自动尝试
125
+ this.__gatewayLegacyMode = false; // 学到"本 gateway 不接受带 device 的 v3"
126
+ this.__gatewayLastReason = null; // 最近一次失败原因(用于 gave-up 上报)
115
127
  }
116
128
 
117
129
  __resolveWebSocket() {
@@ -192,6 +204,14 @@ export class RealtimeBridge {
192
204
  }
193
205
 
194
206
  __closeGatewayWs() {
207
+ // 当 server WS 失效主动关闭 gateway 时,取消任何 pending 重试定时器、把连续失败计数归零:
208
+ // 新 server 会话应从新预算开始重试 gateway,避免旧会话的零散失败累计吞掉未来的重试机会。
209
+ // 不清 __gatewayGaveUp / __gatewayLegacyMode —— 那是跨会话的终态/学习,只由 stop() 复位。
210
+ if (this.__gatewayRetryTimer) {
211
+ clearTimeout(this.__gatewayRetryTimer);
212
+ this.__gatewayRetryTimer = null;
213
+ }
214
+ this.__gatewayAttempts = 0;
195
215
  if (!this.gatewayWs) {
196
216
  return;
197
217
  }
@@ -558,12 +578,13 @@ export class RealtimeBridge {
558
578
  };
559
579
  }
560
580
 
561
- __sendGatewayConnectRequest(ws, nonce) {
562
- this.gatewayConnectReqId = `coclaw-connect-${Date.now()}`;
563
- this.__logDebug(`gateway connect request -> id=${this.gatewayConnectReqId}`);
581
+ __sendGatewayConnectRequest(ws, nonce, { legacy = false } = {}) {
582
+ // rpcSeq 保证 ID 唯一,避免 v3→legacy 同毫秒内两次调用产生相同 id
583
+ this.gatewayRpcSeq += 1;
584
+ this.gatewayConnectReqId = `coclaw-connect-${Date.now()}-${this.gatewayRpcSeq}`;
585
+ this.__logDebug(`gateway connect request -> id=${this.gatewayConnectReqId} legacy=${legacy}`);
564
586
  try {
565
587
  const authToken = this.__resolveGatewayAuthToken();
566
- const device = this.__buildDeviceField(nonce, authToken);
567
588
  const params = {
568
589
  minProtocol: 3,
569
590
  maxProtocol: 3,
@@ -577,8 +598,12 @@ export class RealtimeBridge {
577
598
  role: 'operator',
578
599
  scopes: ['operator.admin'],
579
600
  auth: authToken ? { token: authToken } : undefined,
580
- device,
581
601
  };
602
+ // legacy 回退仅省略 device 字段;其他字段保持与 v3 一致。
603
+ // 当 gateway 不支持/不接受 device 字段时,auth.token 足以完成旧版握手。
604
+ if (!legacy) {
605
+ params.device = this.__buildDeviceField(nonce, authToken);
606
+ }
582
607
  ws.send(JSON.stringify({
583
608
  type: 'req',
584
609
  id: this.gatewayConnectReqId,
@@ -592,7 +617,42 @@ export class RealtimeBridge {
592
617
  }
593
618
  }
594
619
 
620
+ /**
621
+ * 握手失败一次:累加计数;未耗尽则按退避表调度下次尝试,耗尽则进入 gave-up 终态。
622
+ * 调度 / 尝试 / 终态 guard 由 __ensureGatewayConnection 一致执行。
623
+ * @param {string} reason - 本次失败原因,用于 gave-up 时汇总上报
624
+ */
625
+ __onGatewayAttemptFailed(reason) {
626
+ if (!this.started || this.__gatewayGaveUp || this.__gatewayRetryTimer) {
627
+ return;
628
+ }
629
+ this.__gatewayLastReason = reason;
630
+ this.__gatewayAttempts += 1;
631
+ if (this.__gatewayAttempts > GATEWAY_RETRY_DELAYS_MS.length) {
632
+ this.__gatewayGaveUp = true;
633
+ remoteLog(`gateway.handshake.gave-up attempts=${this.__gatewayAttempts} lastReason=${reason}`);
634
+ this.logger.warn?.(`[coclaw] gateway handshake gave up after ${this.__gatewayAttempts} attempts (last reason: ${reason})`);
635
+ return;
636
+ }
637
+ const delay = GATEWAY_RETRY_DELAYS_MS[this.__gatewayAttempts - 1];
638
+ this.__gatewayRetryTimer = setTimeout(() => {
639
+ this.__gatewayRetryTimer = null;
640
+ this.__ensureGatewayConnection();
641
+ }, delay);
642
+ this.__gatewayRetryTimer.unref?.();
643
+ }
644
+
595
645
  __ensureGatewayConnection() {
646
+ // 停机守卫:防止 stop() 之后某个已进入调度队列的 retry timer callback 再触发新 WS
647
+ if (!this.started) {
648
+ return;
649
+ }
650
+ // 刷屏治理:已进入终态 / 已调度下次尝试 → 不启动新 WS。
651
+ // 这两个 guard 保证在 __waitGatewayReady 或 server WS 重连的连续触发下
652
+ // 只会按退避表节奏新建连接。
653
+ if (this.__gatewayGaveUp || this.__gatewayRetryTimer) {
654
+ return;
655
+ }
596
656
  if (this.gatewayWs || !this.serverWs || this.serverWs.readyState !== 1) {
597
657
  return;
598
658
  }
@@ -606,6 +666,12 @@ export class RealtimeBridge {
606
666
  this.gatewayReady = false;
607
667
  this.gatewayConnectReqId = null;
608
668
 
669
+ // per-WS 闭包状态,只在本条 WS 的生命周期内有效。
670
+ let connectFailReported = false; // 已经打过 ws.connect-failed;close 时抑制重复的 ws.disconnected
671
+ let pendingLegacyAttempted = false; // 本 WS 已尝试过 legacy 握手,避免重复降级
672
+ let wasReady = false; // 本 WS 曾经握手成功(区分"握手失败"与"成功后断开")
673
+ let lastChallengeNonce = ''; // 最近一次 challenge 的 nonce,legacy 回退时复用
674
+
609
675
  ws.addEventListener('message', (event) => {
610
676
  let payload = null;
611
677
  try {
@@ -619,13 +685,23 @@ export class RealtimeBridge {
619
685
  }
620
686
  if (payload.type === 'event' && payload.event === 'connect.challenge') {
621
687
  const nonce = payload?.payload?.nonce ?? '';
622
- this.__logDebug('gateway event <- connect.challenge');
623
- this.__sendGatewayConnectRequest(ws, nonce);
688
+ lastChallengeNonce = nonce;
689
+ this.__logDebug(`gateway event <- connect.challenge legacyMode=${this.__gatewayLegacyMode}`);
690
+ // 已经学到此 gateway 是 legacy(上一条 WS 回退过)→ 直接发 legacy 握手
691
+ if (this.__gatewayLegacyMode) {
692
+ pendingLegacyAttempted = true;
693
+ this.__sendGatewayConnectRequest(ws, nonce, { legacy: true });
694
+ }
695
+ else {
696
+ this.__sendGatewayConnectRequest(ws, nonce);
697
+ }
624
698
  return;
625
699
  }
626
700
  if (payload.type === 'res' && this.gatewayConnectReqId && payload.id === this.gatewayConnectReqId) {
627
701
  if (payload.ok === true) {
628
702
  this.gatewayReady = true;
703
+ wasReady = true;
704
+ this.__gatewayAttempts = 0; // 成功握手 → 重置失败计数,让后续瞬态断开有完整重试预算
629
705
  remoteLog('ws.connected peer=gateway');
630
706
  this.__logDebug(`gateway connect ok <- id=${payload.id}`);
631
707
  this.gatewayConnectReqId = null;
@@ -633,10 +709,28 @@ export class RealtimeBridge {
633
709
  this.__pushInstanceInfo();
634
710
  }
635
711
  else {
712
+ const reason = payload?.error?.message ?? 'unknown';
713
+ // v3 → legacy 同 WS 回退:仅在签名/协议相关错误、且本 WS 尚未尝试 legacy 时触发
714
+ const shouldFallback =
715
+ !pendingLegacyAttempted
716
+ && !this.__gatewayLegacyMode
717
+ && GATEWAY_HANDSHAKE_FALLBACK_PATTERN.test(reason);
718
+ if (shouldFallback) {
719
+ pendingLegacyAttempted = true;
720
+ this.__gatewayLegacyMode = true;
721
+ // v3 的失败原因已由这条 remoteLog 单独上报,不写入 __gatewayLastReason;
722
+ // 后者保持"最后一次真正失败的原因"语义,供 gave-up 时使用。
723
+ remoteLog(`gateway.handshake.fallback v3→legacy reason=${reason}`);
724
+ this.logger.info?.(`[coclaw] gateway v3 handshake failed (${reason}), falling back to legacy`);
725
+ this.__sendGatewayConnectRequest(ws, lastChallengeNonce, { legacy: true });
726
+ return;
727
+ }
636
728
  this.gatewayReady = false;
637
729
  this.gatewayConnectReqId = null;
638
- remoteLog(`ws.connect-failed peer=gateway msg=${payload?.error?.message ?? 'unknown'}`);
639
- this.logger.warn?.(`[coclaw] gateway connect failed: ${payload?.error?.message ?? 'unknown'}`);
730
+ connectFailReported = true;
731
+ this.__gatewayLastReason = reason;
732
+ remoteLog(`ws.connect-failed peer=gateway msg=${reason}`);
733
+ this.logger.warn?.(`[coclaw] gateway connect failed: ${reason}`);
640
734
  try { ws.close(1008, 'gateway_connect_failed'); }
641
735
  /* c8 ignore next */
642
736
  catch {}
@@ -675,21 +769,46 @@ export class RealtimeBridge {
675
769
  this.__logDebug('gateway ws open, waiting for connect.challenge');
676
770
  });
677
771
  ws.addEventListener('close', (ev) => {
678
- remoteLog(`ws.disconnected peer=gateway code=${ev?.code ?? '?'}`);
772
+ // 握手失败路径已经打过 ws.connect-failed,这里抑制重复的 disconnected 日志;
773
+ // 成功后的意外断开、握手途中的异常断开仍按原样上报。
774
+ if (!connectFailReported) {
775
+ remoteLog(`ws.disconnected peer=gateway code=${ev?.code ?? '?'}`);
776
+ }
679
777
  this.logger.info?.(`[coclaw] gateway ws closed (code=${ev?.code ?? '?'} reason=${ev?.reason ?? 'n/a'})`);
680
- this.gatewayWs = null;
681
- this.gatewayReady = false;
682
- this.gatewayConnectReqId = null;
778
+ if (this.gatewayWs === ws) {
779
+ this.gatewayWs = null;
780
+ this.gatewayReady = false;
781
+ this.gatewayConnectReqId = null;
782
+ }
683
783
  /* c8 ignore next 3 -- gateway 意外断开时结算未完成 RPC,避免等超时 */
684
784
  for (const [, settle] of this.gatewayPendingRequests) {
685
785
  settle({ ok: false, error: 'gateway_closed' });
686
786
  }
687
787
  this.gatewayPendingRequests.clear();
788
+ // 调度下一次尝试:仅在 bridge 仍活着、未 gave-up、server WS 健康时;
789
+ // 其他场景(如 bridge stop、server WS 已断)由上游流程兜底,不参与 gateway 重试。
790
+ if (this.started && !this.__gatewayGaveUp
791
+ && this.serverWs && this.serverWs.readyState === 1
792
+ && (wasReady || connectFailReported)) {
793
+ if (wasReady) {
794
+ // 之前握成功过,视为瞬态掉线 → 重置计数,让新一轮拿到完整重试预算
795
+ this.__gatewayAttempts = 0;
796
+ }
797
+ this.__onGatewayAttemptFailed(
798
+ /* c8 ignore next -- connectFailReported 路径必然已设 __gatewayLastReason */
799
+ wasReady ? 'disconnected' : (this.__gatewayLastReason ?? 'connect-failed')
800
+ );
801
+ }
688
802
  });
689
803
  ws.addEventListener('error', (err) => {
690
804
  /* c8 ignore next -- ?./?? fallback */
691
805
  remoteLog(`ws.error peer=gateway msg=${String(err?.message ?? err)}`);
692
806
  this.logger.warn?.(`[coclaw] gateway ws error: ${String(err?.message ?? err)}`);
807
+ // 防御 ws 库在某些错误下只 emit error 不跟随 close 的情况:主动关闭让 close handler
808
+ // 接管清理和重试调度,避免 gatewayWs 引用卡在僵尸状态阻塞后续 __ensureGatewayConnection。
809
+ try { ws.close(1011, 'ws_error'); }
810
+ /* c8 ignore next */
811
+ catch {}
693
812
  });
694
813
  }
695
814
 
@@ -1049,6 +1168,15 @@ export class RealtimeBridge {
1049
1168
  clearTimeout(this.reconnectTimer);
1050
1169
  this.reconnectTimer = null;
1051
1170
  }
1171
+ // 清理 gateway 重试状态:refresh()(stop+start 同一实例)后应以全新状态启动
1172
+ if (this.__gatewayRetryTimer) {
1173
+ clearTimeout(this.__gatewayRetryTimer);
1174
+ this.__gatewayRetryTimer = null;
1175
+ }
1176
+ this.__gatewayAttempts = 0;
1177
+ this.__gatewayGaveUp = false;
1178
+ this.__gatewayLegacyMode = false;
1179
+ this.__gatewayLastReason = null;
1052
1180
  this.__closeGatewayWs();
1053
1181
  if (this.webrtcPeer) {
1054
1182
  await this.webrtcPeer.closeAll().catch(() => {});
@@ -4,12 +4,13 @@
4
4
  *
5
5
  * 行为约定详见 docs/rpc-dc-file-queue.md。
6
6
  * - FIFO、单一生产者/消费者;多消费者时每条只交付给其中一个。
7
- * - 构造时清理目录残留(不跨生命周期复用)。
7
+ * - 构造纯字段初始化,不碰 FS;使用前需 `await q.init()`。
8
8
  * - 消费侧:`for await (const item of queue) { ... }`;`destroy()` 让迭代结束。
9
+ * - FS 异常下进入 `fsBroken` 粘性降级:mem 路径继续工作,溢出消息 drop。
9
10
  */
10
11
 
11
12
  import fs from 'node:fs/promises';
12
- import { createReadStream, createWriteStream, rmSync } from 'node:fs';
13
+ import { createReadStream, createWriteStream } from 'node:fs';
13
14
  import nodePath from 'node:path';
14
15
  import readline from 'node:readline';
15
16
 
@@ -18,13 +19,22 @@ import { createMutex } from './mutex.js';
18
19
  const DEFAULT_MEM_BUDGET = 8 * 1024 * 1024;
19
20
  const DEFAULT_DISK_CAP = 1024 * 1024 * 1024;
20
21
 
22
+ // JS 对象开销估算(string header + array slot 等),仅用于 admission 决策不影响 memBytes 报告
23
+ const ENTRY_OVERHEAD = 64;
24
+
25
+ // id 字符集:UUID / 字母数字 / 点 / 下划线 / 减号,且不能是 "." 或 ".."
26
+ const ID_RE = /^[A-Za-z0-9._-]+$/;
27
+
28
+ // 压缩阈值:head 越过 64 且占 memQueue 一半以上时切片回收
29
+ const COMPACT_HEAD_THRESHOLD = 64;
30
+
21
31
  class FileBackedQueue {
22
32
  /**
23
33
  * @param {object} opts
24
34
  * @param {string} opts.dir - 队列文件根目录
25
- * @param {string} opts.id - 队列标识(用于子目录命名)
35
+ * @param {string} opts.id - 队列标识,字符集受限,防路径穿越
26
36
  * @param {number} [opts.memBudget=8MB] - 内存持有字节数上限
27
- * @param {number} [opts.diskCap=1GB] - 磁盘+内存总字节数硬上限
37
+ * @param {number} [opts.diskCap=1GB] - 磁盘+内存总字节数硬上限(含 `\n`)
28
38
  * @param {(reason: string, size: number) => void} [opts.onDrop] - 拒入队时的回调
29
39
  * @param {{ warn?: Function, info?: Function, error?: Function }} [opts.logger=console]
30
40
  */
@@ -40,6 +50,17 @@ class FileBackedQueue {
40
50
 
41
51
  if (!dir || typeof dir !== 'string') throw new TypeError('dir is required');
42
52
  if (!id || typeof id !== 'string') throw new TypeError('id is required');
53
+ if (id === '.' || id === '..' || !ID_RE.test(id)) {
54
+ throw new TypeError('id contains invalid characters');
55
+ }
56
+ // 基础设施 fail-fast:容量参数必须是有限正数,避免 NaN/Infinity/非数字绕过 admission。
57
+ // NaN 与任何数比较皆为 false → admission 永远通过 → diskCap 变相失效。
58
+ if (!Number.isFinite(memBudget) || memBudget <= 0) {
59
+ throw new TypeError('memBudget must be a finite positive number');
60
+ }
61
+ if (!Number.isFinite(diskCap) || diskCap <= 0) {
62
+ throw new TypeError('diskCap must be a finite positive number');
63
+ }
43
64
 
44
65
  this.dir = dir;
45
66
  this.id = id;
@@ -48,28 +69,52 @@ class FileBackedQueue {
48
69
  this.onDrop = onDrop;
49
70
  this.logger = logger;
50
71
 
51
- this.subdir = nodePath.join(dir, id);
52
- this.filePath = nodePath.join(this.subdir, 'queue.jsonl');
72
+ this.filePath = nodePath.join(dir, `${id}.jsonl`);
53
73
 
74
+ // 单文件 ring-ish 结构:head 指针 + 数组;shift 为 O(1) 摊销
54
75
  this.memQueue = [];
76
+ this.head = 0;
55
77
  this.memBytes = 0;
56
- this.diskBytes = 0; // 磁盘上未消费的 payload 字节(不含分隔 \n)
57
78
  this.writtenBytes = 0; // 已写入文件的累计字节(含 \n)
58
79
  this.readOffset = 0; // 下次 refill 的起始偏移
59
80
  this.spilled = false;
81
+ this.initialized = false;
60
82
  this.destroyed = false;
83
+ this.fsBroken = false; // 粘性:一旦 FS 出错,不再尝试 reopen
61
84
  this.writeStream = null;
62
85
  this.writeErr = null;
63
86
  this.waiters = [];
64
87
  this.mutex = createMutex();
88
+ }
65
89
 
66
- // 防御性清理:不跨生命周期复用旧数据
67
- try {
68
- rmSync(this.subdir, { recursive: true, force: true });
69
- } catch (err) {
70
- /* c8 ignore next 2 -- rmSync with force rarely fails on posix */
71
- this.logger?.warn?.('fbq.construct cleanup error', err);
72
- }
90
+ /**
91
+ * 派生的未消费磁盘字节数(含 \n),用于 admission 与 stats。
92
+ */
93
+ get diskBytes() {
94
+ return this.writtenBytes - this.readOffset;
95
+ }
96
+
97
+ /**
98
+ * 异步初始化:清理残留文件,标记可用。幂等。
99
+ */
100
+ async init() {
101
+ return await this.mutex.withLock(async () => {
102
+ if (this.destroyed) return;
103
+ if (this.initialized) return;
104
+ try {
105
+ await fs.rm(this.filePath, { force: true });
106
+ } catch (err) {
107
+ // best-effort:init 的 rm 可能因 ENOTDIR / EACCES 等失败。
108
+ // 权威残留清理在 __openWriteStream 中(首次 spill 前)再做一次,
109
+ // 确保不会用 'a' flag 追加到旧数据上污染 FIFO。
110
+ this.logger?.warn?.('fbq.init rm warning', err);
111
+ }
112
+ this.initialized = true;
113
+ });
114
+ }
115
+
116
+ async [Symbol.asyncDispose]() {
117
+ await this.destroy();
73
118
  }
74
119
 
75
120
  /**
@@ -80,28 +125,45 @@ class FileBackedQueue {
80
125
  async enqueue(jsonStr) {
81
126
  return await this.mutex.withLock(async () => {
82
127
  if (this.destroyed) return false;
128
+ if (!this.initialized) throw new TypeError('queue not initialized');
83
129
  if (typeof jsonStr !== 'string') throw new TypeError('jsonStr must be a string');
84
130
 
85
131
  const size = Buffer.byteLength(jsonStr, 'utf8');
86
132
 
87
- if (this.memBytes + this.diskBytes + size > this.diskCap) {
133
+ // admission:按物理占用(mem + 已写文件总字节,含 \n)判定,保证 diskCap 是真正的硬上限。
134
+ // 用 writtenBytes(不减 readOffset)的含义:文件前缀已读但未被 __dropFile 回收前仍算占用。
135
+ // 代价:持续背压下消费者还没追到写端时新消息可能被 drop,直到完全 drain 触发 __dropFile 重置。
136
+ if (this.memBytes + this.writtenBytes + size + 1 > this.diskCap) {
88
137
  this.__dispatchDrop('disk-cap', size);
89
138
  return false;
90
139
  }
91
140
 
92
- // 内存路径:未溢出且加上新条目仍在预算内
93
- if (!this.spilled && this.memBytes + size <= this.memBudget) {
94
- this.memQueue.push(jsonStr);
95
- this.memBytes += size;
96
- this.__wakeOne();
97
- return true;
141
+ // 内存路径:未溢出且 admission 通过(考虑 overhead;首条无论多大都收)
142
+ if (!this.spilled) {
143
+ const pendingCount = this.memQueue.length - this.head;
144
+ const cost = this.memBytes + pendingCount * ENTRY_OVERHEAD + size + ENTRY_OVERHEAD;
145
+ if (pendingCount === 0 || cost <= this.memBudget) {
146
+ this.memQueue.push(jsonStr);
147
+ this.memBytes += size;
148
+ this.__wakeOne();
149
+ return true;
150
+ }
151
+ }
152
+
153
+ // 溢出路径:FS 已破直接 drop,不再尝试 reopen
154
+ if (this.fsBroken) {
155
+ this.__dispatchDrop('fs-error', size);
156
+ return false;
98
157
  }
99
158
 
100
- // 溢出路径:lazy 打开写流
101
159
  if (!this.spilled) {
102
160
  await this.__openWriteStream();
103
161
  if (this.writeErr) {
162
+ const err = this.writeErr;
104
163
  this.__dispatchDrop('fs-error', size);
164
+ // 前置 mkdir/rm 失败也进入粘性降级:与 stream 'error' 路径语义一致,
165
+ // 避免后续每次 overflow 都重试同一个持续性 FS 故障。
166
+ await this.__handleFsError(err);
105
167
  return false;
106
168
  }
107
169
  this.spilled = true;
@@ -109,32 +171,39 @@ class FileBackedQueue {
109
171
 
110
172
  try {
111
173
  await this.__writeLine(jsonStr + '\n');
112
- this.diskBytes += size;
113
174
  this.writtenBytes += size + 1;
114
175
  this.__wakeOne();
115
176
  return true;
116
177
  } catch (err) {
117
178
  this.logger?.warn?.('fbq.enqueue fs-error', err);
118
179
  this.__dispatchDrop('fs-error', size);
180
+ // 直接在当前锁内触发粘性降级:真实 Node stream 下 cb err 通常也会 emit 'error'
181
+ // (监听器会另外排一次 handleFsError,但 fsBroken 已置 → no-op);测试里的 monkey-patch
182
+ // 只触发 cb、不发 'error',这里主动降级保证行为一致。
183
+ await this.__handleFsError(err);
119
184
  return false;
120
185
  }
121
186
  });
122
187
  }
123
188
 
124
189
  /**
125
- * @returns {{ memCount: number, memBytes: number, diskBytes: number, spilled: boolean }}
190
+ * @returns {{ memCount: number, memBytes: number, diskBytes: number, writtenBytes: number, spilled: boolean, fsBroken: boolean }}
191
+ * - diskBytes:未消费 backlog(writtenBytes - readOffset)
192
+ * - writtenBytes:本次生命周期累计已写字节(admission 依据的物理占用),drain 或 FS 降级后重置为 0
126
193
  */
127
194
  stats() {
128
195
  return {
129
- memCount: this.memQueue.length,
196
+ memCount: this.memQueue.length - this.head,
130
197
  memBytes: this.memBytes,
131
198
  diskBytes: this.diskBytes,
199
+ writtenBytes: this.writtenBytes,
132
200
  spilled: this.spilled,
201
+ fsBroken: this.fsBroken,
133
202
  };
134
203
  }
135
204
 
136
205
  /**
137
- * 清空数据但保留实例可用。
206
+ * 清空数据但保留实例可用;显式清 fsBroken,允许再次尝试落盘。
138
207
  */
139
208
  async clear() {
140
209
  return await this.mutex.withLock(async () => {
@@ -147,17 +216,18 @@ class FileBackedQueue {
147
216
  this.logger?.warn?.('fbq.clear rm error', err);
148
217
  }
149
218
  this.memQueue = [];
219
+ this.head = 0;
150
220
  this.memBytes = 0;
151
- this.diskBytes = 0;
152
221
  this.writtenBytes = 0;
153
222
  this.readOffset = 0;
154
223
  this.spilled = false;
224
+ this.fsBroken = false;
155
225
  this.writeErr = null;
156
226
  });
157
227
  }
158
228
 
159
229
  /**
160
- * 停写、关 FD、删目录、结束所有迭代器。幂等。
230
+ * 停写、关 FD、删文件、结束所有迭代器。幂等。
161
231
  */
162
232
  async destroy() {
163
233
  return await this.mutex.withLock(async () => {
@@ -170,15 +240,15 @@ class FileBackedQueue {
170
240
 
171
241
  await this.__closeWriteStream();
172
242
  try {
173
- await fs.rm(this.subdir, { recursive: true, force: true });
243
+ await fs.rm(this.filePath, { force: true });
174
244
  } catch (err) {
175
245
  /* c8 ignore next 2 -- rm with force rarely fails */
176
246
  this.logger?.warn?.('fbq.destroy rm error', err);
177
247
  }
178
248
 
179
249
  this.memQueue = [];
250
+ this.head = 0;
180
251
  this.memBytes = 0;
181
- this.diskBytes = 0;
182
252
  this.writtenBytes = 0;
183
253
  this.readOffset = 0;
184
254
  this.spilled = false;
@@ -198,12 +268,20 @@ class FileBackedQueue {
198
268
  while (true) {
199
269
  let waitPromise = null;
200
270
  const result = await this.mutex.withLock(async () => {
201
- if (this.memQueue.length === 0 && this.spilled && !this.destroyed) {
271
+ const pendingCount = this.memQueue.length - this.head;
272
+ if (pendingCount === 0 && this.spilled && !this.destroyed) {
202
273
  await this.__refillImpl();
203
274
  }
204
- if (this.memQueue.length > 0) {
205
- const item = this.memQueue.shift();
275
+ if (this.memQueue.length - this.head > 0) {
276
+ const item = this.memQueue[this.head];
277
+ this.memQueue[this.head] = undefined;
278
+ this.head += 1;
206
279
  this.memBytes -= Buffer.byteLength(item, 'utf8');
280
+ // 惰性压缩:避免 head 一直向前、数组永不回收
281
+ if (this.head > COMPACT_HEAD_THRESHOLD && this.head * 2 >= this.memQueue.length) {
282
+ this.memQueue = this.memQueue.slice(this.head);
283
+ this.head = 0;
284
+ }
207
285
  return { value: item, done: false };
208
286
  }
209
287
  if (this.destroyed) return { done: true, value: undefined };
@@ -224,6 +302,11 @@ class FileBackedQueue {
224
302
  }
225
303
  }
226
304
 
305
+ __wakeAll() {
306
+ const toWake = this.waiters.splice(0);
307
+ for (const w of toWake) w.resolve();
308
+ }
309
+
227
310
  __dispatchDrop(reason, size) {
228
311
  try {
229
312
  this.onDrop?.(reason, size);
@@ -237,20 +320,32 @@ class FileBackedQueue {
237
320
  async __openWriteStream() {
238
321
  this.writeErr = null;
239
322
  try {
240
- await fs.mkdir(this.subdir, { recursive: true });
323
+ // 目录 0o700 / 文件 0o600:POSIX best-effort。
324
+ // - 新建目录/文件会按此 mode(再经 umask)创建
325
+ // - 已存在的目录 mkdir(recursive) 不会被 chmod 收紧,以该目录原权限为准
326
+ // - Windows 下 mode 参数语义很弱(无 owner/group/other 概念),实际访问控制依赖父目录 NTFS ACL
327
+ // 仍比默认 0o644 更保守;atomic-write.js 也是同一策略。
328
+ await fs.mkdir(nodePath.dirname(this.filePath), { recursive: true, mode: 0o700 });
329
+ // 权威残留清理:即便 init 的 rm 被吞掉,这里开流前再 rm 一次,
330
+ // 避免 'a' flag 追加到旧数据上污染 FIFO。
331
+ await fs.rm(this.filePath, { force: true });
241
332
  } catch (err) {
242
333
  this.writeErr = err;
243
334
  return;
244
335
  }
245
- this.writeStream = createWriteStream(this.filePath, { flags: 'a' });
336
+ this.writeStream = createWriteStream(this.filePath, { flags: 'a', mode: 0o600 });
246
337
  this.writeStream.on('error', (err) => {
247
338
  this.writeErr = err;
248
339
  this.logger?.warn?.('fbq.writeStream error', err);
340
+ // 异步错误:排队到 mutex 做粘性降级清理,避免状态半截卡死
341
+ this.mutex.withLock(() => this.__handleFsError(err)).catch(() => {});
249
342
  });
250
343
  }
251
344
 
252
345
  async __writeLine(str) {
253
- if (this.writeErr) throw this.writeErr;
346
+ // 不再前置 writeErr 检查:一旦 writeErr 被异步设置,__handleFsError 会立即排队清理并
347
+ // 把 fsBroken 置粘性;spill 路径入口已判 fsBroken,到这里 writeErr 必为 null。
348
+ // 写失败通过 write 回调的 err 反映,catch 块处理。
254
349
  return await new Promise((resolve, reject) => {
255
350
  this.writeStream.write(str, (err) => {
256
351
  if (err) reject(err);
@@ -278,6 +373,25 @@ class FileBackedQueue {
278
373
  });
279
374
  }
280
375
 
376
+ // mutex 内调用:FS 错误粘性降级
377
+ async __handleFsError(_err) {
378
+ if (this.destroyed || this.fsBroken) return;
379
+ this.fsBroken = true;
380
+ await this.__closeWriteStream();
381
+ try {
382
+ await fs.rm(this.filePath, { force: true });
383
+ } catch (err) {
384
+ /* c8 ignore next 2 -- rm with force rarely fails */
385
+ this.logger?.warn?.('fbq.handleFsError rm error', err);
386
+ }
387
+ this.spilled = false;
388
+ this.writtenBytes = 0;
389
+ this.readOffset = 0;
390
+ this.writeErr = null;
391
+ // 唤醒全部消费者,让它们重新观察状态
392
+ this.__wakeAll();
393
+ }
394
+
281
395
  // 调用方必须已持有 mutex,且已确认 !destroyed
282
396
  async __refillImpl() {
283
397
  if (!this.spilled) return;
@@ -287,8 +401,10 @@ class FileBackedQueue {
287
401
  const st = await fs.stat(this.filePath);
288
402
  actualEnd = st.size;
289
403
  } catch (err) {
290
- /* c8 ignore next 3 -- stat 在正常持有期间不会失败 */
404
+ // 读侧 FS 错误(外部删文件、权限丢失等)走粘性降级,
405
+ // 避免 spilled=true / fsBroken=false 的悬空态让消费者永远挂 waiter。
291
406
  this.logger?.warn?.('fbq.refill stat error', err);
407
+ await this.__handleFsError(err);
292
408
  return;
293
409
  }
294
410
 
@@ -302,6 +418,9 @@ class FileBackedQueue {
302
418
  let cumPayload = 0; // 仅 payload
303
419
  let stoppedAtEof = true;
304
420
 
421
+ const pendingCount = this.memQueue.length - this.head;
422
+ const baseCost = this.memBytes + pendingCount * ENTRY_OVERHEAD;
423
+
305
424
  const stream = createReadStream(this.filePath, {
306
425
  start: this.readOffset,
307
426
  end: actualEnd - 1,
@@ -311,7 +430,9 @@ class FileBackedQueue {
311
430
  try {
312
431
  for await (const line of rl) {
313
432
  const sz = Buffer.byteLength(line, 'utf8');
314
- if (newLines.length > 0 && this.memBytes + cumPayload + sz > this.memBudget) {
433
+ // overhead 一致性:admission 侧已用 overhead,refill 侧同步考虑
434
+ const newLinesCost = newLines.length * ENTRY_OVERHEAD;
435
+ if (newLines.length > 0 && baseCost + cumPayload + newLinesCost + sz + ENTRY_OVERHEAD > this.memBudget) {
315
436
  stoppedAtEof = false;
316
437
  break;
317
438
  }
@@ -320,10 +441,12 @@ class FileBackedQueue {
320
441
  cumPayload += sz;
321
442
  }
322
443
  } catch (err) {
323
- /* c8 ignore next 4 -- read 错误罕见,保守退出 */
444
+ /* c8 ignore next 6 -- read 错误极罕见(stat 已通过、fd 已打开),路径保留用于粘性降级 */
445
+ // read 错误同 stat:统一走粘性降级而非静默 return
324
446
  this.logger?.warn?.('fbq.refill read error', err);
325
447
  rl.close();
326
448
  stream.destroy();
449
+ await this.__handleFsError(err);
327
450
  return;
328
451
  } finally {
329
452
  rl.close();
@@ -349,7 +472,6 @@ class FileBackedQueue {
349
472
  this.memQueue.push(line);
350
473
  this.memBytes += Buffer.byteLength(line, 'utf8');
351
474
  }
352
- this.diskBytes -= cumPayload;
353
475
 
354
476
  if (this.readOffset >= this.writtenBytes) {
355
477
  await this.__dropFile();
@@ -367,7 +489,6 @@ class FileBackedQueue {
367
489
  this.spilled = false;
368
490
  this.writtenBytes = 0;
369
491
  this.readOffset = 0;
370
- this.diskBytes = 0;
371
492
  this.writeErr = null;
372
493
  }
373
494
  }
@@ -1,47 +0,0 @@
1
- /**
2
- * 跨平台 mock os.homedir()
3
- *
4
- * Node.js os.homedir() 在不同平台读取不同环境变量:
5
- * - POSIX: HOME
6
- * - Windows: USERPROFILE(优先)、HOMEDRIVE+HOMEPATH
7
- *
8
- * 测试中需同时设置两端变量,确保 os.homedir() 返回期望路径。
9
- */
10
-
11
- const HOME_VARS = ['HOME', 'USERPROFILE'];
12
-
13
- /**
14
- * 保存当前 home 相关环境变量
15
- * @returns {Record<string, string | undefined>}
16
- */
17
- export function saveHomedir() {
18
- const saved = {};
19
- for (const key of HOME_VARS) {
20
- saved[key] = process.env[key];
21
- }
22
- return saved;
23
- }
24
-
25
- /**
26
- * 将 home 相关环境变量统一设置为指定路径
27
- * @param {string} dir - 目标路径
28
- */
29
- export function setHomedir(dir) {
30
- for (const key of HOME_VARS) {
31
- process.env[key] = dir;
32
- }
33
- }
34
-
35
- /**
36
- * 恢复之前保存的 home 相关环境变量
37
- * @param {Record<string, string | undefined>} saved
38
- */
39
- export function restoreHomedir(saved) {
40
- for (const key of HOME_VARS) {
41
- if (saved[key] === undefined) {
42
- delete process.env[key];
43
- } else {
44
- process.env[key] = saved[key];
45
- }
46
- }
47
- }