@coclaw/openclaw-coclaw 0.17.2 → 0.17.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/src/auto-upgrade/worker-verify.js +137 -72
- package/src/auto-upgrade/worker.js +5 -11
- package/src/realtime-bridge.js +141 -13
- package/src/utils/file-backed-queue.js +162 -41
- package/src/homedir-mock.helper.js +0 -47
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@coclaw/openclaw-coclaw",
|
|
3
|
-
"version": "0.17.2",
|
|
3
|
+
"version": "0.17.4",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"description": "OpenClaw CoClaw channel plugin for remote chat",
|
|
@@ -31,6 +31,7 @@
|
|
|
31
31
|
"src/**/*.js",
|
|
32
32
|
"!src/**/*.test.js",
|
|
33
33
|
"!src/mock-server.helper.js",
|
|
34
|
+
"!src/homedir-mock.helper.js",
|
|
34
35
|
"openclaw.plugin.json",
|
|
35
36
|
"LICENSE"
|
|
36
37
|
],
|
|
package/src/auto-upgrade/worker-verify.js
CHANGED
|
@@ -1,134 +1,199 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* worker-verify.js — 升级后验证
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
* 3. 升级模块健康:`openclaw gateway call coclaw.upgradeHealth` 返回版本号
|
|
4
|
+
* 策略:触发 gateway restart → 轮询 coclaw.upgradeHealth RPC 直到返回版本
|
|
5
|
+
* 严格等于 toVersion。单次调用失败(gateway 未就绪 / plugin 未注册 / JSON 非法 /
|
|
6
|
+
* 版本不对)一律按"稍后重试"处理,在总超时窗口内持续尝试。
|
|
8
7
|
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
8
|
+
* 磁盘 package.json 的版本仅作为诊断写入本地日志,不参与判定——openclaw 侧
|
|
9
|
+
* `plugins.installs[id].installPath` 可能在 id-migration 等极端场景发生漂移,
|
|
10
|
+
* 而 upgradeHealth 是 gateway 进程内"新代码真的被加载"的权威信号。
|
|
11
|
+
*
|
|
12
|
+
* worker 运行在独立子进程中,禁止使用 remoteLog;诊断信息全部通过 logger
|
|
13
|
+
* (本地日志)输出,由 updater 记录到 upgrade-log.jsonl。
|
|
11
14
|
*/
|
|
12
15
|
import { execFile as nodeExecFile } from 'node:child_process';
|
|
16
|
+
import { readFile } from 'node:fs/promises';
|
|
17
|
+
import nodePath from 'node:path';
|
|
13
18
|
|
|
14
|
-
const GATEWAY_READY_TIMEOUT_MS = 60_000;
|
|
15
|
-
const POLL_INTERVAL_MS = 2000;
|
|
16
19
|
const CMD_TIMEOUT_MS = 30_000;
|
|
20
|
+
const HEALTH_POLL_INTERVAL_MS = 3_000;
|
|
21
|
+
// 本机 openclaw 冷启动可能需访问外部资源(AWS 诊断、ollama 探测等)
|
|
22
|
+
// 及插件 bootstrap,合计 30~60s 常见;5 分钟给足余量
|
|
23
|
+
const HEALTH_TOTAL_TIMEOUT_MS = 5 * 60 * 1000;
|
|
17
24
|
|
|
18
25
|
/**
|
|
19
|
-
* 执行命令并返回 stdout
|
|
26
|
+
* 执行命令并返回 stdout;错误对象附带 stderr 以便诊断
|
|
20
27
|
* @param {string} cmd
|
|
21
28
|
* @param {string[]} args
|
|
22
29
|
* @param {object} [opts]
|
|
23
30
|
* @param {Function} [opts.execFileFn]
|
|
31
|
+
* @param {number} [opts.cmdTimeoutMs]
|
|
24
32
|
* @returns {Promise<string>}
|
|
25
33
|
*/
|
|
26
34
|
function runCmd(cmd, args, opts) {
|
|
27
35
|
/* c8 ignore next -- ?./?? fallback */
|
|
28
36
|
const doExecFile = opts?.execFileFn ?? nodeExecFile;
|
|
37
|
+
/* c8 ignore next -- ?./?? fallback */
|
|
38
|
+
const timeout = opts?.cmdTimeoutMs ?? CMD_TIMEOUT_MS;
|
|
29
39
|
return new Promise((resolve, reject) => {
|
|
30
|
-
doExecFile(cmd, args, { timeout
|
|
31
|
-
if (err) reject(err);
|
|
40
|
+
doExecFile(cmd, args, { timeout, shell: process.platform === 'win32' }, (err, stdout, stderr) => {
|
|
41
|
+
if (err) {
|
|
42
|
+
/* c8 ignore next -- ?? fallback:execFile 实现不保证 stderr 一定字符串化 */
|
|
43
|
+
err.stderr = String(stderr ?? '');
|
|
44
|
+
reject(err);
|
|
45
|
+
}
|
|
32
46
|
else resolve(String(stdout).trim());
|
|
33
47
|
});
|
|
34
48
|
});
|
|
35
49
|
}
|
|
36
50
|
|
|
37
51
|
/**
|
|
38
|
-
*
|
|
52
|
+
* 触发一次 gateway 重启;失败不抛(后续轮询 RPC 会兜底验证 gateway 是否就绪)
|
|
39
53
|
* @param {object} [opts]
|
|
40
54
|
* @param {Function} [opts.execFileFn]
|
|
41
|
-
* @param {number} [opts.timeoutMs]
|
|
42
|
-
* @param {number} [opts.pollIntervalMs]
|
|
43
55
|
* @returns {Promise<void>}
|
|
44
56
|
*/
|
|
45
|
-
export async function waitForGateway(opts) {
|
|
46
|
-
// 主动触发重启,不依赖 OpenClaw 的文件变更自动重启策略
|
|
57
|
+
export async function triggerGatewayRestart(opts) {
|
|
47
58
|
try {
|
|
48
59
|
await runCmd('openclaw', ['gateway', 'restart'], opts);
|
|
49
60
|
}
|
|
50
61
|
catch {
|
|
51
|
-
// restart
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
/* c8 ignore next 2 -- ?./?? fallback */
|
|
55
|
-
const timeout = opts?.timeoutMs ?? GATEWAY_READY_TIMEOUT_MS;
|
|
56
|
-
const interval = opts?.pollIntervalMs ?? POLL_INTERVAL_MS;
|
|
57
|
-
const start = Date.now();
|
|
58
|
-
|
|
59
|
-
while (Date.now() - start < timeout) {
|
|
60
|
-
try {
|
|
61
|
-
const output = await runCmd('openclaw', ['gateway', 'status'], opts);
|
|
62
|
-
if (output.includes('running')) return;
|
|
63
|
-
}
|
|
64
|
-
catch {
|
|
65
|
-
// gateway 未就绪,继续轮询
|
|
66
|
-
}
|
|
67
|
-
await sleep(interval);
|
|
62
|
+
// restart 命令本身失败不阻断:openclaw 可能已在重启/daemon 自恢复;
|
|
63
|
+
// 无论如何都进入后续 upgradeHealth 轮询,由它判定 gateway 最终是否可用
|
|
68
64
|
}
|
|
69
|
-
|
|
70
|
-
throw new Error('Gateway did not become ready within timeout');
|
|
71
65
|
}
|
|
72
66
|
|
|
73
67
|
/**
|
|
74
|
-
*
|
|
75
|
-
* @param {string}
|
|
76
|
-
* @
|
|
77
|
-
* @param {Function} [opts.execFileFn]
|
|
78
|
-
* @returns {Promise<void>}
|
|
68
|
+
* 读取磁盘 package.json 的版本号(诊断用途,不参与判定)
|
|
69
|
+
* @param {string} pluginDir
|
|
70
|
+
* @returns {Promise<string | null>}
|
|
79
71
|
*/
|
|
80
|
-
export async function
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
72
|
+
export async function readDiskPackageVersion(pluginDir) {
|
|
73
|
+
try {
|
|
74
|
+
const pkgPath = nodePath.join(pluginDir, 'package.json');
|
|
75
|
+
const raw = await readFile(pkgPath, 'utf8');
|
|
76
|
+
const pkg = JSON.parse(raw);
|
|
77
|
+
return typeof pkg?.version === 'string' ? pkg.version : null;
|
|
78
|
+
}
|
|
79
|
+
catch {
|
|
80
|
+
return null;
|
|
84
81
|
}
|
|
85
82
|
}
|
|
86
83
|
|
|
87
84
|
/**
|
|
88
|
-
*
|
|
85
|
+
* 单次调用 coclaw.upgradeHealth;永不抛异常,失败归一化为 { ok: false, reason }
|
|
89
86
|
* @param {object} [opts]
|
|
90
|
-
* @
|
|
91
|
-
* @returns {Promise<string>} 返回版本号
|
|
87
|
+
* @returns {Promise<{ ok: true, version: string } | { ok: false, reason: string }>}
|
|
92
88
|
*/
|
|
93
|
-
|
|
94
|
-
const output = await runCmd(
|
|
95
|
-
'openclaw',
|
|
96
|
-
['gateway', 'call', 'coclaw.upgradeHealth', '--json'],
|
|
97
|
-
opts,
|
|
98
|
-
);
|
|
89
|
+
async function callUpgradeHealthOnce(opts) {
|
|
99
90
|
try {
|
|
100
|
-
const
|
|
101
|
-
|
|
102
|
-
|
|
91
|
+
const output = await runCmd(
|
|
92
|
+
'openclaw',
|
|
93
|
+
['gateway', 'call', 'coclaw.upgradeHealth', '--json'],
|
|
94
|
+
opts,
|
|
95
|
+
);
|
|
96
|
+
let payload;
|
|
97
|
+
try {
|
|
98
|
+
payload = JSON.parse(output);
|
|
103
99
|
}
|
|
104
|
-
|
|
100
|
+
catch {
|
|
101
|
+
return { ok: false, reason: `invalid-json: ${output.slice(0, 120)}` };
|
|
102
|
+
}
|
|
103
|
+
if (!payload?.version) return { ok: false, reason: 'missing-version' };
|
|
104
|
+
return { ok: true, version: String(payload.version) };
|
|
105
105
|
}
|
|
106
106
|
catch (err) {
|
|
107
|
-
|
|
108
|
-
|
|
107
|
+
const stderr = typeof err?.stderr === 'string' ? err.stderr.trim() : '';
|
|
108
|
+
/* c8 ignore next -- ?? fallback */
|
|
109
|
+
const msg = err?.message ?? String(err);
|
|
110
|
+
const reason = (stderr || msg || 'unknown').slice(0, 200);
|
|
111
|
+
return { ok: false, reason };
|
|
109
112
|
}
|
|
110
113
|
}
|
|
111
114
|
|
|
112
115
|
/**
|
|
113
|
-
*
|
|
114
|
-
* @param {string}
|
|
116
|
+
* 轮询 upgradeHealth 直到版本严格等于 toVersion,或总超时
|
|
117
|
+
* @param {string} toVersion
|
|
115
118
|
* @param {object} [opts]
|
|
116
119
|
* @param {Function} [opts.execFileFn]
|
|
117
|
-
* @param {number} [opts.
|
|
120
|
+
* @param {number} [opts.totalTimeoutMs]
|
|
118
121
|
* @param {number} [opts.pollIntervalMs]
|
|
119
|
-
* @
|
|
122
|
+
* @param {number} [opts.cmdTimeoutMs]
|
|
123
|
+
* @returns {Promise<{ ok: true, version: string, attempts: number, elapsedMs: number }
|
|
124
|
+
* | { ok: false, attempts: number, elapsedMs: number, lastReason: string, lastVersion: string }>}
|
|
120
125
|
*/
|
|
121
|
-
export async function
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
126
|
+
export async function pollUpgradeHealth(toVersion, opts) {
|
|
127
|
+
/* c8 ignore next -- ?? fallback */
|
|
128
|
+
const totalTimeout = opts?.totalTimeoutMs ?? HEALTH_TOTAL_TIMEOUT_MS;
|
|
129
|
+
/* c8 ignore next -- ?? fallback */
|
|
130
|
+
const pollInterval = opts?.pollIntervalMs ?? HEALTH_POLL_INTERVAL_MS;
|
|
131
|
+
const start = Date.now();
|
|
132
|
+
let attempts = 0;
|
|
133
|
+
let lastReason = '';
|
|
134
|
+
let lastVersion = '';
|
|
135
|
+
|
|
136
|
+
while (Date.now() - start < totalTimeout) {
|
|
137
|
+
attempts += 1;
|
|
138
|
+
const result = await callUpgradeHealthOnce(opts);
|
|
139
|
+
if (result.ok) {
|
|
140
|
+
if (result.version === toVersion) {
|
|
141
|
+
return {
|
|
142
|
+
ok: true,
|
|
143
|
+
version: result.version,
|
|
144
|
+
attempts,
|
|
145
|
+
elapsedMs: Date.now() - start,
|
|
146
|
+
};
|
|
147
|
+
}
|
|
148
|
+
lastVersion = result.version;
|
|
149
|
+
lastReason = `version-mismatch got=${result.version} want=${toVersion}`;
|
|
150
|
+
}
|
|
151
|
+
else {
|
|
152
|
+
lastReason = result.reason;
|
|
153
|
+
}
|
|
154
|
+
// 剩余时间不足以再等一个 interval 就直接退出,避免最后一次毫无意义的 sleep
|
|
155
|
+
if (Date.now() - start + pollInterval >= totalTimeout) break;
|
|
156
|
+
await sleep(pollInterval);
|
|
127
157
|
}
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
158
|
+
|
|
159
|
+
return {
|
|
160
|
+
ok: false,
|
|
161
|
+
attempts,
|
|
162
|
+
elapsedMs: Date.now() - start,
|
|
163
|
+
lastReason,
|
|
164
|
+
lastVersion,
|
|
165
|
+
};
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
/**
|
|
169
|
+
* 完整验证流程:触发 gateway restart → 读磁盘版本(诊断)→ 轮询 upgradeHealth
|
|
170
|
+
* @param {string} pluginDir - 插件安装目录(来自 openclaw.json 的权威 installPath)
|
|
171
|
+
* @param {string} toVersion - 目标版本
|
|
172
|
+
* @param {object} [opts]
|
|
173
|
+
* @param {Function} [opts.execFileFn]
|
|
174
|
+
* @param {number} [opts.totalTimeoutMs]
|
|
175
|
+
* @param {number} [opts.pollIntervalMs]
|
|
176
|
+
* @param {number} [opts.cmdTimeoutMs]
|
|
177
|
+
* @param {Function} [log] - 本地日志函数
|
|
178
|
+
* @returns {Promise<{ ok: true, version: string } | { ok: false, error: string }>}
|
|
179
|
+
*/
|
|
180
|
+
export async function verifyUpgrade(pluginDir, toVersion, opts, log) {
|
|
181
|
+
const logFn = typeof log === 'function' ? log : () => {};
|
|
182
|
+
|
|
183
|
+
await triggerGatewayRestart(opts);
|
|
184
|
+
|
|
185
|
+
const onDiskVersion = await readDiskPackageVersion(pluginDir);
|
|
186
|
+
logFn(`[upgrade-worker] On-disk package.json version: ${onDiskVersion ?? '(unreadable)'} (expected ${toVersion})`);
|
|
187
|
+
|
|
188
|
+
const result = await pollUpgradeHealth(toVersion, opts);
|
|
189
|
+
if (result.ok) {
|
|
190
|
+
logFn(`[upgrade-worker] upgradeHealth verified: version=${result.version} attempts=${result.attempts} elapsed=${result.elapsedMs}ms`);
|
|
191
|
+
return { ok: true, version: result.version };
|
|
131
192
|
}
|
|
193
|
+
|
|
194
|
+
const error = `verify timeout: attempts=${result.attempts} elapsed=${result.elapsedMs}ms lastVersion=${result.lastVersion || '(none)'} lastReason=${result.lastReason || '(none)'}`;
|
|
195
|
+
logFn(`[upgrade-worker] ${error}`);
|
|
196
|
+
return { ok: false, error };
|
|
132
197
|
}
|
|
133
198
|
|
|
134
199
|
function sleep(ms) {
|
|
package/src/auto-upgrade/worker.js
CHANGED
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
import { execFile as nodeExecFile } from 'node:child_process';
|
|
16
16
|
import { parseArgs } from 'node:util';
|
|
17
17
|
import { createBackup, restoreFromBackup, removeBackup } from './worker-backup.js';
|
|
18
|
-
import { verifyUpgrade, waitForGateway } from './worker-verify.js';
|
|
18
|
+
import { verifyUpgrade, triggerGatewayRestart } from './worker-verify.js';
|
|
19
19
|
import { addSkippedVersion, updateLastUpgrade, appendLog } from './state.js';
|
|
20
20
|
import { getCurrentNpmRegistry, pickFallbackRegistry } from './registry-fallback.js';
|
|
21
21
|
|
|
@@ -160,7 +160,7 @@ export async function runUpgrade({ pluginDir, fromVersion, toVersion, pluginId,
|
|
|
160
160
|
|
|
161
161
|
// 3. 等待 gateway 重启并验证
|
|
162
162
|
log('[upgrade-worker] Verifying upgrade...');
|
|
163
|
-
const result = await verifyUpgrade(
|
|
163
|
+
const result = await verifyUpgrade(pluginDir, toVersion, opts, log);
|
|
164
164
|
|
|
165
165
|
if (result.ok) {
|
|
166
166
|
// 4a. 成功
|
|
@@ -212,15 +212,9 @@ async function handleRollback({ pluginDir, fromVersion, toVersion, pluginId, pkg
|
|
|
212
212
|
}
|
|
213
213
|
}
|
|
214
214
|
|
|
215
|
-
//
|
|
216
|
-
log('[upgrade-worker]
|
|
217
|
-
|
|
218
|
-
await waitForGateway(opts);
|
|
219
|
-
log('[upgrade-worker] Gateway restarted after rollback');
|
|
220
|
-
}
|
|
221
|
-
catch {
|
|
222
|
-
log('[upgrade-worker] Gateway did not restart after rollback');
|
|
223
|
-
}
|
|
215
|
+
// 触发 gateway 重启让老版本回到运行态(尽力而为,不验证结果)
|
|
216
|
+
log('[upgrade-worker] Triggering gateway restart after rollback...');
|
|
217
|
+
await triggerGatewayRestart(opts);
|
|
224
218
|
|
|
225
219
|
// 记录状态(顺序执行因共享 state 文件,但各自 try/catch 避免单个失败阻断其余)
|
|
226
220
|
// 仅验证失败(新版本确实被加载并发现有问题)才标记为 skipped;
|
package/src/realtime-bridge.js
CHANGED
|
@@ -22,6 +22,12 @@ const CONNECT_TIMEOUT_MS = 10_000;
|
|
|
22
22
|
const SERVER_HB_PING_MS = 25_000;
|
|
23
23
|
const SERVER_HB_TIMEOUT_MS = 45_000;
|
|
24
24
|
const SERVER_HB_MAX_MISS = 4; // 连续 4 次无响应才断连(~3 分钟)
|
|
25
|
+
// gateway 握手失败的指数退避表:每个元素是"上一次失败"之后、"下一次尝试"之前的等待时间。
|
|
26
|
+
// 最多 5 次重试(加上首次尝试共 6 次),全部失败后进入 gave-up 终态,不再自动尝试。
|
|
27
|
+
const GATEWAY_RETRY_DELAYS_MS = [5_000, 10_000, 20_000, 20_000, 20_000];
|
|
28
|
+
// v3 握手失败时,只有错误消息匹配此正则才回退到不带 device 的 legacy 握手。
|
|
29
|
+
// 严格限定在"签名/设备/scope/协议"相关错误,避免对网络/内部错误做无意义的降级尝试。
|
|
30
|
+
const GATEWAY_HANDSHAKE_FALLBACK_PATTERN = /signature|device|scope|protocol/i;
|
|
25
31
|
|
|
26
32
|
function toServerWsUrl(baseUrl, token) {
|
|
27
33
|
const url = new URL(baseUrl);
|
|
@@ -112,6 +118,12 @@ export class RealtimeBridge {
|
|
|
112
118
|
this.__fileHandler = null;
|
|
113
119
|
this.__ndcPreloadResult = null;
|
|
114
120
|
this.__ndcCleanup = null;
|
|
121
|
+
// gateway 握手重试状态(刷屏治理 + 兼容性回退)
|
|
122
|
+
this.__gatewayAttempts = 0; // 已失败的连续握手次数(握手成功时归零)
|
|
123
|
+
this.__gatewayRetryTimer = null; // 下一次尝试的 setTimeout 句柄
|
|
124
|
+
this.__gatewayGaveUp = false; // 重试次数耗尽 → 终态,不再自动尝试
|
|
125
|
+
this.__gatewayLegacyMode = false; // 学到"本 gateway 不接受带 device 的 v3"
|
|
126
|
+
this.__gatewayLastReason = null; // 最近一次失败原因(用于 gave-up 上报)
|
|
115
127
|
}
|
|
116
128
|
|
|
117
129
|
__resolveWebSocket() {
|
|
@@ -192,6 +204,14 @@ export class RealtimeBridge {
|
|
|
192
204
|
}
|
|
193
205
|
|
|
194
206
|
__closeGatewayWs() {
|
|
207
|
+
// 当 server WS 失效主动关闭 gateway 时,取消任何 pending 重试定时器、把连续失败计数归零:
|
|
208
|
+
// 新 server 会话应从新预算开始重试 gateway,避免旧会话的零散失败累计吞掉未来的重试机会。
|
|
209
|
+
// 不清 __gatewayGaveUp / __gatewayLegacyMode —— 那是跨会话的终态/学习,只由 stop() 复位。
|
|
210
|
+
if (this.__gatewayRetryTimer) {
|
|
211
|
+
clearTimeout(this.__gatewayRetryTimer);
|
|
212
|
+
this.__gatewayRetryTimer = null;
|
|
213
|
+
}
|
|
214
|
+
this.__gatewayAttempts = 0;
|
|
195
215
|
if (!this.gatewayWs) {
|
|
196
216
|
return;
|
|
197
217
|
}
|
|
@@ -558,12 +578,13 @@ export class RealtimeBridge {
|
|
|
558
578
|
};
|
|
559
579
|
}
|
|
560
580
|
|
|
561
|
-
__sendGatewayConnectRequest(ws, nonce) {
|
|
562
|
-
|
|
563
|
-
this.
|
|
581
|
+
__sendGatewayConnectRequest(ws, nonce, { legacy = false } = {}) {
|
|
582
|
+
// 用 rpcSeq 保证 ID 唯一,避免 v3→legacy 同毫秒内两次调用产生相同 id
|
|
583
|
+
this.gatewayRpcSeq += 1;
|
|
584
|
+
this.gatewayConnectReqId = `coclaw-connect-${Date.now()}-${this.gatewayRpcSeq}`;
|
|
585
|
+
this.__logDebug(`gateway connect request -> id=${this.gatewayConnectReqId} legacy=${legacy}`);
|
|
564
586
|
try {
|
|
565
587
|
const authToken = this.__resolveGatewayAuthToken();
|
|
566
|
-
const device = this.__buildDeviceField(nonce, authToken);
|
|
567
588
|
const params = {
|
|
568
589
|
minProtocol: 3,
|
|
569
590
|
maxProtocol: 3,
|
|
@@ -577,8 +598,12 @@ export class RealtimeBridge {
|
|
|
577
598
|
role: 'operator',
|
|
578
599
|
scopes: ['operator.admin'],
|
|
579
600
|
auth: authToken ? { token: authToken } : undefined,
|
|
580
|
-
device,
|
|
581
601
|
};
|
|
602
|
+
// legacy 回退仅省略 device 字段;其他字段保持与 v3 一致。
|
|
603
|
+
// 当 gateway 不支持/不接受 device 字段时,auth.token 足以完成旧版握手。
|
|
604
|
+
if (!legacy) {
|
|
605
|
+
params.device = this.__buildDeviceField(nonce, authToken);
|
|
606
|
+
}
|
|
582
607
|
ws.send(JSON.stringify({
|
|
583
608
|
type: 'req',
|
|
584
609
|
id: this.gatewayConnectReqId,
|
|
@@ -592,7 +617,42 @@ export class RealtimeBridge {
|
|
|
592
617
|
}
|
|
593
618
|
}
|
|
594
619
|
|
|
620
|
+
/**
|
|
621
|
+
* 握手失败一次:累加计数;未耗尽则按退避表调度下次尝试,耗尽则进入 gave-up 终态。
|
|
622
|
+
* 调度 / 尝试 / 终态 guard 由 __ensureGatewayConnection 一致执行。
|
|
623
|
+
* @param {string} reason - 本次失败原因,用于 gave-up 时汇总上报
|
|
624
|
+
*/
|
|
625
|
+
__onGatewayAttemptFailed(reason) {
|
|
626
|
+
if (!this.started || this.__gatewayGaveUp || this.__gatewayRetryTimer) {
|
|
627
|
+
return;
|
|
628
|
+
}
|
|
629
|
+
this.__gatewayLastReason = reason;
|
|
630
|
+
this.__gatewayAttempts += 1;
|
|
631
|
+
if (this.__gatewayAttempts > GATEWAY_RETRY_DELAYS_MS.length) {
|
|
632
|
+
this.__gatewayGaveUp = true;
|
|
633
|
+
remoteLog(`gateway.handshake.gave-up attempts=${this.__gatewayAttempts} lastReason=${reason}`);
|
|
634
|
+
this.logger.warn?.(`[coclaw] gateway handshake gave up after ${this.__gatewayAttempts} attempts (last reason: ${reason})`);
|
|
635
|
+
return;
|
|
636
|
+
}
|
|
637
|
+
const delay = GATEWAY_RETRY_DELAYS_MS[this.__gatewayAttempts - 1];
|
|
638
|
+
this.__gatewayRetryTimer = setTimeout(() => {
|
|
639
|
+
this.__gatewayRetryTimer = null;
|
|
640
|
+
this.__ensureGatewayConnection();
|
|
641
|
+
}, delay);
|
|
642
|
+
this.__gatewayRetryTimer.unref?.();
|
|
643
|
+
}
|
|
644
|
+
|
|
595
645
|
__ensureGatewayConnection() {
|
|
646
|
+
// 停机守卫:防止 stop() 之后某个已进入调度队列的 retry timer callback 再触发新 WS
|
|
647
|
+
if (!this.started) {
|
|
648
|
+
return;
|
|
649
|
+
}
|
|
650
|
+
// 刷屏治理:已进入终态 / 已调度下次尝试 → 不启动新 WS。
|
|
651
|
+
// 这两个 guard 保证在 __waitGatewayReady 或 server WS 重连的连续触发下
|
|
652
|
+
// 只会按退避表节奏新建连接。
|
|
653
|
+
if (this.__gatewayGaveUp || this.__gatewayRetryTimer) {
|
|
654
|
+
return;
|
|
655
|
+
}
|
|
596
656
|
if (this.gatewayWs || !this.serverWs || this.serverWs.readyState !== 1) {
|
|
597
657
|
return;
|
|
598
658
|
}
|
|
@@ -606,6 +666,12 @@ export class RealtimeBridge {
|
|
|
606
666
|
this.gatewayReady = false;
|
|
607
667
|
this.gatewayConnectReqId = null;
|
|
608
668
|
|
|
669
|
+
// per-WS 闭包状态,只在本条 WS 的生命周期内有效。
|
|
670
|
+
let connectFailReported = false; // 已经打过 ws.connect-failed;close 时抑制重复的 ws.disconnected
|
|
671
|
+
let pendingLegacyAttempted = false; // 本 WS 已尝试过 legacy 握手,避免重复降级
|
|
672
|
+
let wasReady = false; // 本 WS 曾经握手成功(区分"握手失败"与"成功后断开")
|
|
673
|
+
let lastChallengeNonce = ''; // 最近一次 challenge 的 nonce,legacy 回退时复用
|
|
674
|
+
|
|
609
675
|
ws.addEventListener('message', (event) => {
|
|
610
676
|
let payload = null;
|
|
611
677
|
try {
|
|
@@ -619,13 +685,23 @@ export class RealtimeBridge {
|
|
|
619
685
|
}
|
|
620
686
|
if (payload.type === 'event' && payload.event === 'connect.challenge') {
|
|
621
687
|
const nonce = payload?.payload?.nonce ?? '';
|
|
622
|
-
|
|
623
|
-
this.
|
|
688
|
+
lastChallengeNonce = nonce;
|
|
689
|
+
this.__logDebug(`gateway event <- connect.challenge legacyMode=${this.__gatewayLegacyMode}`);
|
|
690
|
+
// 已经学到此 gateway 是 legacy(上一条 WS 回退过)→ 直接发 legacy 握手
|
|
691
|
+
if (this.__gatewayLegacyMode) {
|
|
692
|
+
pendingLegacyAttempted = true;
|
|
693
|
+
this.__sendGatewayConnectRequest(ws, nonce, { legacy: true });
|
|
694
|
+
}
|
|
695
|
+
else {
|
|
696
|
+
this.__sendGatewayConnectRequest(ws, nonce);
|
|
697
|
+
}
|
|
624
698
|
return;
|
|
625
699
|
}
|
|
626
700
|
if (payload.type === 'res' && this.gatewayConnectReqId && payload.id === this.gatewayConnectReqId) {
|
|
627
701
|
if (payload.ok === true) {
|
|
628
702
|
this.gatewayReady = true;
|
|
703
|
+
wasReady = true;
|
|
704
|
+
this.__gatewayAttempts = 0; // 成功握手 → 重置失败计数,让后续瞬态断开有完整重试预算
|
|
629
705
|
remoteLog('ws.connected peer=gateway');
|
|
630
706
|
this.__logDebug(`gateway connect ok <- id=${payload.id}`);
|
|
631
707
|
this.gatewayConnectReqId = null;
|
|
@@ -633,10 +709,28 @@ export class RealtimeBridge {
|
|
|
633
709
|
this.__pushInstanceInfo();
|
|
634
710
|
}
|
|
635
711
|
else {
|
|
712
|
+
const reason = payload?.error?.message ?? 'unknown';
|
|
713
|
+
// v3 → legacy 同 WS 回退:仅在签名/协议相关错误、且本 WS 尚未尝试 legacy 时触发
|
|
714
|
+
const shouldFallback =
|
|
715
|
+
!pendingLegacyAttempted
|
|
716
|
+
&& !this.__gatewayLegacyMode
|
|
717
|
+
&& GATEWAY_HANDSHAKE_FALLBACK_PATTERN.test(reason);
|
|
718
|
+
if (shouldFallback) {
|
|
719
|
+
pendingLegacyAttempted = true;
|
|
720
|
+
this.__gatewayLegacyMode = true;
|
|
721
|
+
// v3 的失败原因已由这条 remoteLog 单独上报,不写入 __gatewayLastReason;
|
|
722
|
+
// 后者保持"最后一次真正失败的原因"语义,供 gave-up 时使用。
|
|
723
|
+
remoteLog(`gateway.handshake.fallback v3→legacy reason=${reason}`);
|
|
724
|
+
this.logger.info?.(`[coclaw] gateway v3 handshake failed (${reason}), falling back to legacy`);
|
|
725
|
+
this.__sendGatewayConnectRequest(ws, lastChallengeNonce, { legacy: true });
|
|
726
|
+
return;
|
|
727
|
+
}
|
|
636
728
|
this.gatewayReady = false;
|
|
637
729
|
this.gatewayConnectReqId = null;
|
|
638
|
-
|
|
639
|
-
this.
|
|
730
|
+
connectFailReported = true;
|
|
731
|
+
this.__gatewayLastReason = reason;
|
|
732
|
+
remoteLog(`ws.connect-failed peer=gateway msg=${reason}`);
|
|
733
|
+
this.logger.warn?.(`[coclaw] gateway connect failed: ${reason}`);
|
|
640
734
|
try { ws.close(1008, 'gateway_connect_failed'); }
|
|
641
735
|
/* c8 ignore next */
|
|
642
736
|
catch {}
|
|
@@ -675,21 +769,46 @@ export class RealtimeBridge {
|
|
|
675
769
|
this.__logDebug('gateway ws open, waiting for connect.challenge');
|
|
676
770
|
});
|
|
677
771
|
ws.addEventListener('close', (ev) => {
|
|
678
|
-
|
|
772
|
+
// 握手失败路径已经打过 ws.connect-failed,这里抑制重复的 disconnected 日志;
|
|
773
|
+
// 成功后的意外断开、握手途中的异常断开仍按原样上报。
|
|
774
|
+
if (!connectFailReported) {
|
|
775
|
+
remoteLog(`ws.disconnected peer=gateway code=${ev?.code ?? '?'}`);
|
|
776
|
+
}
|
|
679
777
|
this.logger.info?.(`[coclaw] gateway ws closed (code=${ev?.code ?? '?'} reason=${ev?.reason ?? 'n/a'})`);
|
|
680
|
-
this.gatewayWs
|
|
681
|
-
|
|
682
|
-
|
|
778
|
+
if (this.gatewayWs === ws) {
|
|
779
|
+
this.gatewayWs = null;
|
|
780
|
+
this.gatewayReady = false;
|
|
781
|
+
this.gatewayConnectReqId = null;
|
|
782
|
+
}
|
|
683
783
|
/* c8 ignore next 3 -- gateway 意外断开时结算未完成 RPC,避免等超时 */
|
|
684
784
|
for (const [, settle] of this.gatewayPendingRequests) {
|
|
685
785
|
settle({ ok: false, error: 'gateway_closed' });
|
|
686
786
|
}
|
|
687
787
|
this.gatewayPendingRequests.clear();
|
|
788
|
+
// 调度下一次尝试:仅在 bridge 仍活着、未 gave-up、server WS 健康时;
|
|
789
|
+
// 其他场景(如 bridge stop、server WS 已断)由上游流程兜底,不参与 gateway 重试。
|
|
790
|
+
if (this.started && !this.__gatewayGaveUp
|
|
791
|
+
&& this.serverWs && this.serverWs.readyState === 1
|
|
792
|
+
&& (wasReady || connectFailReported)) {
|
|
793
|
+
if (wasReady) {
|
|
794
|
+
// 之前握成功过,视为瞬态掉线 → 重置计数,让新一轮拿到完整重试预算
|
|
795
|
+
this.__gatewayAttempts = 0;
|
|
796
|
+
}
|
|
797
|
+
this.__onGatewayAttemptFailed(
|
|
798
|
+
/* c8 ignore next -- connectFailReported 路径必然已设 __gatewayLastReason */
|
|
799
|
+
wasReady ? 'disconnected' : (this.__gatewayLastReason ?? 'connect-failed')
|
|
800
|
+
);
|
|
801
|
+
}
|
|
688
802
|
});
|
|
689
803
|
ws.addEventListener('error', (err) => {
|
|
690
804
|
/* c8 ignore next -- ?./?? fallback */
|
|
691
805
|
remoteLog(`ws.error peer=gateway msg=${String(err?.message ?? err)}`);
|
|
692
806
|
this.logger.warn?.(`[coclaw] gateway ws error: ${String(err?.message ?? err)}`);
|
|
807
|
+
// 防御 ws 库在某些错误下只 emit error 不跟随 close 的情况:主动关闭让 close handler
|
|
808
|
+
// 接管清理和重试调度,避免 gatewayWs 引用卡在僵尸状态阻塞后续 __ensureGatewayConnection。
|
|
809
|
+
try { ws.close(1011, 'ws_error'); }
|
|
810
|
+
/* c8 ignore next */
|
|
811
|
+
catch {}
|
|
693
812
|
});
|
|
694
813
|
}
|
|
695
814
|
|
|
@@ -1049,6 +1168,15 @@ export class RealtimeBridge {
|
|
|
1049
1168
|
clearTimeout(this.reconnectTimer);
|
|
1050
1169
|
this.reconnectTimer = null;
|
|
1051
1170
|
}
|
|
1171
|
+
// 清理 gateway 重试状态:refresh()(stop+start 同一实例)后应以全新状态启动
|
|
1172
|
+
if (this.__gatewayRetryTimer) {
|
|
1173
|
+
clearTimeout(this.__gatewayRetryTimer);
|
|
1174
|
+
this.__gatewayRetryTimer = null;
|
|
1175
|
+
}
|
|
1176
|
+
this.__gatewayAttempts = 0;
|
|
1177
|
+
this.__gatewayGaveUp = false;
|
|
1178
|
+
this.__gatewayLegacyMode = false;
|
|
1179
|
+
this.__gatewayLastReason = null;
|
|
1052
1180
|
this.__closeGatewayWs();
|
|
1053
1181
|
if (this.webrtcPeer) {
|
|
1054
1182
|
await this.webrtcPeer.closeAll().catch(() => {});
|
|
package/src/utils/file-backed-queue.js
CHANGED
|
@@ -4,12 +4,13 @@
|
|
|
4
4
|
*
|
|
5
5
|
* 行为约定详见 docs/rpc-dc-file-queue.md。
|
|
6
6
|
* - FIFO、单一生产者/消费者;多消费者时每条只交付给其中一个。
|
|
7
|
-
* -
|
|
7
|
+
* - 构造纯字段初始化,不碰 FS;使用前需 `await q.init()`。
|
|
8
8
|
* - 消费侧:`for await (const item of queue) { ... }`;`destroy()` 让迭代结束。
|
|
9
|
+
* - FS 异常下进入 `fsBroken` 粘性降级:mem 路径继续工作,溢出消息 drop。
|
|
9
10
|
*/
|
|
10
11
|
|
|
11
12
|
import fs from 'node:fs/promises';
|
|
12
|
-
import { createReadStream, createWriteStream
|
|
13
|
+
import { createReadStream, createWriteStream } from 'node:fs';
|
|
13
14
|
import nodePath from 'node:path';
|
|
14
15
|
import readline from 'node:readline';
|
|
15
16
|
|
|
@@ -18,13 +19,22 @@ import { createMutex } from './mutex.js';
|
|
|
18
19
|
const DEFAULT_MEM_BUDGET = 8 * 1024 * 1024;
|
|
19
20
|
const DEFAULT_DISK_CAP = 1024 * 1024 * 1024;
|
|
20
21
|
|
|
22
|
+
// JS 对象开销估算(string header + array slot 等),仅用于 admission 决策不影响 memBytes 报告
|
|
23
|
+
const ENTRY_OVERHEAD = 64;
|
|
24
|
+
|
|
25
|
+
// id 字符集:UUID / 字母数字 / 点 / 下划线 / 减号,且不能是 "." 或 ".."
|
|
26
|
+
const ID_RE = /^[A-Za-z0-9._-]+$/;
|
|
27
|
+
|
|
28
|
+
// 压缩阈值:head 越过 64 且占 memQueue 一半以上时切片回收
|
|
29
|
+
const COMPACT_HEAD_THRESHOLD = 64;
|
|
30
|
+
|
|
21
31
|
class FileBackedQueue {
|
|
22
32
|
/**
|
|
23
33
|
* @param {object} opts
|
|
24
34
|
* @param {string} opts.dir - 队列文件根目录
|
|
25
|
-
* @param {string} opts.id -
|
|
35
|
+
* @param {string} opts.id - 队列标识,字符集受限,防路径穿越
|
|
26
36
|
* @param {number} [opts.memBudget=8MB] - 内存持有字节数上限
|
|
27
|
-
* @param {number} [opts.diskCap=1GB] -
|
|
37
|
+
* @param {number} [opts.diskCap=1GB] - 磁盘+内存总字节数硬上限(含 `\n`)
|
|
28
38
|
* @param {(reason: string, size: number) => void} [opts.onDrop] - 拒入队时的回调
|
|
29
39
|
* @param {{ warn?: Function, info?: Function, error?: Function }} [opts.logger=console]
|
|
30
40
|
*/
|
|
@@ -40,6 +50,17 @@ class FileBackedQueue {
|
|
|
40
50
|
|
|
41
51
|
if (!dir || typeof dir !== 'string') throw new TypeError('dir is required');
|
|
42
52
|
if (!id || typeof id !== 'string') throw new TypeError('id is required');
|
|
53
|
+
if (id === '.' || id === '..' || !ID_RE.test(id)) {
|
|
54
|
+
throw new TypeError('id contains invalid characters');
|
|
55
|
+
}
|
|
56
|
+
// 基础设施 fail-fast:容量参数必须是有限正数,避免 NaN/Infinity/非数字绕过 admission。
|
|
57
|
+
// NaN 与任何数比较皆为 false → admission 永远通过 → diskCap 变相失效。
|
|
58
|
+
if (!Number.isFinite(memBudget) || memBudget <= 0) {
|
|
59
|
+
throw new TypeError('memBudget must be a finite positive number');
|
|
60
|
+
}
|
|
61
|
+
if (!Number.isFinite(diskCap) || diskCap <= 0) {
|
|
62
|
+
throw new TypeError('diskCap must be a finite positive number');
|
|
63
|
+
}
|
|
43
64
|
|
|
44
65
|
this.dir = dir;
|
|
45
66
|
this.id = id;
|
|
@@ -48,28 +69,52 @@ class FileBackedQueue {
|
|
|
48
69
|
this.onDrop = onDrop;
|
|
49
70
|
this.logger = logger;
|
|
50
71
|
|
|
51
|
-
this.
|
|
52
|
-
this.filePath = nodePath.join(this.subdir, 'queue.jsonl');
|
|
72
|
+
this.filePath = nodePath.join(dir, `${id}.jsonl`);
|
|
53
73
|
|
|
74
|
+
// 单文件 ring-ish 结构:head 指针 + 数组;shift 为 O(1) 摊销
|
|
54
75
|
this.memQueue = [];
|
|
76
|
+
this.head = 0;
|
|
55
77
|
this.memBytes = 0;
|
|
56
|
-
this.diskBytes = 0; // 磁盘上未消费的 payload 字节(不含分隔 \n)
|
|
57
78
|
this.writtenBytes = 0; // 已写入文件的累计字节(含 \n)
|
|
58
79
|
this.readOffset = 0; // 下次 refill 的起始偏移
|
|
59
80
|
this.spilled = false;
|
|
81
|
+
this.initialized = false;
|
|
60
82
|
this.destroyed = false;
|
|
83
|
+
this.fsBroken = false; // 粘性:一旦 FS 出错,不再尝试 reopen
|
|
61
84
|
this.writeStream = null;
|
|
62
85
|
this.writeErr = null;
|
|
63
86
|
this.waiters = [];
|
|
64
87
|
this.mutex = createMutex();
|
|
88
|
+
}
|
|
65
89
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
90
|
+
/**
|
|
91
|
+
* 派生的未消费磁盘字节数(含 \n),用于 admission 与 stats。
|
|
92
|
+
*/
|
|
93
|
+
get diskBytes() {
|
|
94
|
+
return this.writtenBytes - this.readOffset;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/**
|
|
98
|
+
* 异步初始化:清理残留文件,标记可用。幂等。
|
|
99
|
+
*/
|
|
100
|
+
async init() {
|
|
101
|
+
return await this.mutex.withLock(async () => {
|
|
102
|
+
if (this.destroyed) return;
|
|
103
|
+
if (this.initialized) return;
|
|
104
|
+
try {
|
|
105
|
+
await fs.rm(this.filePath, { force: true });
|
|
106
|
+
} catch (err) {
|
|
107
|
+
// best-effort:init 的 rm 可能因 ENOTDIR / EACCES 等失败。
|
|
108
|
+
// 权威残留清理在 __openWriteStream 中(首次 spill 前)再做一次,
|
|
109
|
+
// 确保不会用 'a' flag 追加到旧数据上污染 FIFO。
|
|
110
|
+
this.logger?.warn?.('fbq.init rm warning', err);
|
|
111
|
+
}
|
|
112
|
+
this.initialized = true;
|
|
113
|
+
});
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
async [Symbol.asyncDispose]() {
|
|
117
|
+
await this.destroy();
|
|
73
118
|
}
|
|
74
119
|
|
|
75
120
|
/**
|
|
@@ -80,28 +125,45 @@ class FileBackedQueue {
|
|
|
80
125
|
async enqueue(jsonStr) {
|
|
81
126
|
return await this.mutex.withLock(async () => {
|
|
82
127
|
if (this.destroyed) return false;
|
|
128
|
+
if (!this.initialized) throw new TypeError('queue not initialized');
|
|
83
129
|
if (typeof jsonStr !== 'string') throw new TypeError('jsonStr must be a string');
|
|
84
130
|
|
|
85
131
|
const size = Buffer.byteLength(jsonStr, 'utf8');
|
|
86
132
|
|
|
87
|
-
|
|
133
|
+
// admission:按物理占用(mem + 已写文件总字节,含 \n)判定,保证 diskCap 是真正的硬上限。
|
|
134
|
+
// 用 writtenBytes(不减 readOffset)的含义:文件前缀已读但未被 __dropFile 回收前仍算占用。
|
|
135
|
+
// 代价:持续背压下消费者还没追到写端时新消息可能被 drop,直到完全 drain 触发 __dropFile 重置。
|
|
136
|
+
if (this.memBytes + this.writtenBytes + size + 1 > this.diskCap) {
|
|
88
137
|
this.__dispatchDrop('disk-cap', size);
|
|
89
138
|
return false;
|
|
90
139
|
}
|
|
91
140
|
|
|
92
|
-
//
|
|
93
|
-
if (!this.spilled
|
|
94
|
-
this.memQueue.
|
|
95
|
-
this.memBytes
|
|
96
|
-
this.
|
|
97
|
-
|
|
141
|
+
// 内存路径:未溢出且 admission 通过(考虑 overhead;首条无论多大都收)
|
|
142
|
+
if (!this.spilled) {
|
|
143
|
+
const pendingCount = this.memQueue.length - this.head;
|
|
144
|
+
const cost = this.memBytes + pendingCount * ENTRY_OVERHEAD + size + ENTRY_OVERHEAD;
|
|
145
|
+
if (pendingCount === 0 || cost <= this.memBudget) {
|
|
146
|
+
this.memQueue.push(jsonStr);
|
|
147
|
+
this.memBytes += size;
|
|
148
|
+
this.__wakeOne();
|
|
149
|
+
return true;
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
// 溢出路径:FS 已破直接 drop,不再尝试 reopen
|
|
154
|
+
if (this.fsBroken) {
|
|
155
|
+
this.__dispatchDrop('fs-error', size);
|
|
156
|
+
return false;
|
|
98
157
|
}
|
|
99
158
|
|
|
100
|
-
// 溢出路径:lazy 打开写流
|
|
101
159
|
if (!this.spilled) {
|
|
102
160
|
await this.__openWriteStream();
|
|
103
161
|
if (this.writeErr) {
|
|
162
|
+
const err = this.writeErr;
|
|
104
163
|
this.__dispatchDrop('fs-error', size);
|
|
164
|
+
// 前置 mkdir/rm 失败也进入粘性降级:与 stream 'error' 路径语义一致,
|
|
165
|
+
// 避免后续每次 overflow 都重试同一个持续性 FS 故障。
|
|
166
|
+
await this.__handleFsError(err);
|
|
105
167
|
return false;
|
|
106
168
|
}
|
|
107
169
|
this.spilled = true;
|
|
@@ -109,32 +171,39 @@ class FileBackedQueue {
|
|
|
109
171
|
|
|
110
172
|
try {
|
|
111
173
|
await this.__writeLine(jsonStr + '\n');
|
|
112
|
-
this.diskBytes += size;
|
|
113
174
|
this.writtenBytes += size + 1;
|
|
114
175
|
this.__wakeOne();
|
|
115
176
|
return true;
|
|
116
177
|
} catch (err) {
|
|
117
178
|
this.logger?.warn?.('fbq.enqueue fs-error', err);
|
|
118
179
|
this.__dispatchDrop('fs-error', size);
|
|
180
|
+
// 直接在当前锁内触发粘性降级:真实 Node stream 下 cb err 通常也会 emit 'error'
|
|
181
|
+
// (监听器会另外排一次 handleFsError,但 fsBroken 已置 → no-op);测试里的 monkey-patch
|
|
182
|
+
// 只触发 cb、不发 'error',这里主动降级保证行为一致。
|
|
183
|
+
await this.__handleFsError(err);
|
|
119
184
|
return false;
|
|
120
185
|
}
|
|
121
186
|
});
|
|
122
187
|
}
|
|
123
188
|
|
|
124
189
|
/**
|
|
125
|
-
* @returns {{ memCount: number, memBytes: number, diskBytes: number, spilled: boolean }}
|
|
190
|
+
* @returns {{ memCount: number, memBytes: number, diskBytes: number, writtenBytes: number, spilled: boolean, fsBroken: boolean }}
|
|
191
|
+
* - diskBytes:未消费 backlog(writtenBytes - readOffset)
|
|
192
|
+
* - writtenBytes:本次生命周期累计已写字节(admission 依据的物理占用),drain 或 FS 降级后重置为 0
|
|
126
193
|
*/
|
|
127
194
|
stats() {
|
|
128
195
|
return {
|
|
129
|
-
memCount: this.memQueue.length,
|
|
196
|
+
memCount: this.memQueue.length - this.head,
|
|
130
197
|
memBytes: this.memBytes,
|
|
131
198
|
diskBytes: this.diskBytes,
|
|
199
|
+
writtenBytes: this.writtenBytes,
|
|
132
200
|
spilled: this.spilled,
|
|
201
|
+
fsBroken: this.fsBroken,
|
|
133
202
|
};
|
|
134
203
|
}
|
|
135
204
|
|
|
136
205
|
/**
|
|
137
|
-
*
|
|
206
|
+
* 清空数据但保留实例可用;显式清 fsBroken,允许再次尝试落盘。
|
|
138
207
|
*/
|
|
139
208
|
async clear() {
|
|
140
209
|
return await this.mutex.withLock(async () => {
|
|
@@ -147,17 +216,18 @@ class FileBackedQueue {
|
|
|
147
216
|
this.logger?.warn?.('fbq.clear rm error', err);
|
|
148
217
|
}
|
|
149
218
|
this.memQueue = [];
|
|
219
|
+
this.head = 0;
|
|
150
220
|
this.memBytes = 0;
|
|
151
|
-
this.diskBytes = 0;
|
|
152
221
|
this.writtenBytes = 0;
|
|
153
222
|
this.readOffset = 0;
|
|
154
223
|
this.spilled = false;
|
|
224
|
+
this.fsBroken = false;
|
|
155
225
|
this.writeErr = null;
|
|
156
226
|
});
|
|
157
227
|
}
|
|
158
228
|
|
|
159
229
|
/**
|
|
160
|
-
* 停写、关 FD
|
|
230
|
+
* 停写、关 FD、删文件、结束所有迭代器。幂等。
|
|
161
231
|
*/
|
|
162
232
|
async destroy() {
|
|
163
233
|
return await this.mutex.withLock(async () => {
|
|
@@ -170,15 +240,15 @@ class FileBackedQueue {
|
|
|
170
240
|
|
|
171
241
|
await this.__closeWriteStream();
|
|
172
242
|
try {
|
|
173
|
-
await fs.rm(this.
|
|
243
|
+
await fs.rm(this.filePath, { force: true });
|
|
174
244
|
} catch (err) {
|
|
175
245
|
/* c8 ignore next 2 -- rm with force rarely fails */
|
|
176
246
|
this.logger?.warn?.('fbq.destroy rm error', err);
|
|
177
247
|
}
|
|
178
248
|
|
|
179
249
|
this.memQueue = [];
|
|
250
|
+
this.head = 0;
|
|
180
251
|
this.memBytes = 0;
|
|
181
|
-
this.diskBytes = 0;
|
|
182
252
|
this.writtenBytes = 0;
|
|
183
253
|
this.readOffset = 0;
|
|
184
254
|
this.spilled = false;
|
|
@@ -198,12 +268,20 @@ class FileBackedQueue {
|
|
|
198
268
|
while (true) {
|
|
199
269
|
let waitPromise = null;
|
|
200
270
|
const result = await this.mutex.withLock(async () => {
|
|
201
|
-
|
|
271
|
+
const pendingCount = this.memQueue.length - this.head;
|
|
272
|
+
if (pendingCount === 0 && this.spilled && !this.destroyed) {
|
|
202
273
|
await this.__refillImpl();
|
|
203
274
|
}
|
|
204
|
-
if (this.memQueue.length > 0) {
|
|
205
|
-
const item = this.memQueue.
|
|
275
|
+
if (this.memQueue.length - this.head > 0) {
|
|
276
|
+
const item = this.memQueue[this.head];
|
|
277
|
+
this.memQueue[this.head] = undefined;
|
|
278
|
+
this.head += 1;
|
|
206
279
|
this.memBytes -= Buffer.byteLength(item, 'utf8');
|
|
280
|
+
// 惰性压缩:避免 head 一直向前、数组永不回收
|
|
281
|
+
if (this.head > COMPACT_HEAD_THRESHOLD && this.head * 2 >= this.memQueue.length) {
|
|
282
|
+
this.memQueue = this.memQueue.slice(this.head);
|
|
283
|
+
this.head = 0;
|
|
284
|
+
}
|
|
207
285
|
return { value: item, done: false };
|
|
208
286
|
}
|
|
209
287
|
if (this.destroyed) return { done: true, value: undefined };
|
|
@@ -224,6 +302,11 @@ class FileBackedQueue {
|
|
|
224
302
|
}
|
|
225
303
|
}
|
|
226
304
|
|
|
305
|
+
__wakeAll() {
|
|
306
|
+
const toWake = this.waiters.splice(0);
|
|
307
|
+
for (const w of toWake) w.resolve();
|
|
308
|
+
}
|
|
309
|
+
|
|
227
310
|
__dispatchDrop(reason, size) {
|
|
228
311
|
try {
|
|
229
312
|
this.onDrop?.(reason, size);
|
|
@@ -237,20 +320,32 @@ class FileBackedQueue {
|
|
|
237
320
|
async __openWriteStream() {
|
|
238
321
|
this.writeErr = null;
|
|
239
322
|
try {
|
|
240
|
-
|
|
323
|
+
// 目录 0o700 / 文件 0o600:POSIX best-effort。
|
|
324
|
+
// - 新建目录/文件会按此 mode(再经 umask)创建
|
|
325
|
+
// - 已存在的目录 mkdir(recursive) 不会被 chmod 收紧,以该目录原权限为准
|
|
326
|
+
// - Windows 下 mode 参数语义很弱(无 owner/group/other 概念),实际访问控制依赖父目录 NTFS ACL
|
|
327
|
+
// 仍比默认 0o644 更保守;atomic-write.js 也是同一策略。
|
|
328
|
+
await fs.mkdir(nodePath.dirname(this.filePath), { recursive: true, mode: 0o700 });
|
|
329
|
+
// 权威残留清理:即便 init 的 rm 被吞掉,这里开流前再 rm 一次,
|
|
330
|
+
// 避免 'a' flag 追加到旧数据上污染 FIFO。
|
|
331
|
+
await fs.rm(this.filePath, { force: true });
|
|
241
332
|
} catch (err) {
|
|
242
333
|
this.writeErr = err;
|
|
243
334
|
return;
|
|
244
335
|
}
|
|
245
|
-
this.writeStream = createWriteStream(this.filePath, { flags: 'a' });
|
|
336
|
+
this.writeStream = createWriteStream(this.filePath, { flags: 'a', mode: 0o600 });
|
|
246
337
|
this.writeStream.on('error', (err) => {
|
|
247
338
|
this.writeErr = err;
|
|
248
339
|
this.logger?.warn?.('fbq.writeStream error', err);
|
|
340
|
+
// 异步错误:排队到 mutex 做粘性降级清理,避免状态半截卡死
|
|
341
|
+
this.mutex.withLock(() => this.__handleFsError(err)).catch(() => {});
|
|
249
342
|
});
|
|
250
343
|
}
|
|
251
344
|
|
|
252
345
|
async __writeLine(str) {
|
|
253
|
-
|
|
346
|
+
// 不再前置 writeErr 检查:一旦 writeErr 被异步设置,__handleFsError 会立即排队清理并
|
|
347
|
+
// 把 fsBroken 置粘性;spill 路径入口已判 fsBroken,到这里 writeErr 必为 null。
|
|
348
|
+
// 写失败通过 write 回调的 err 反映,catch 块处理。
|
|
254
349
|
return await new Promise((resolve, reject) => {
|
|
255
350
|
this.writeStream.write(str, (err) => {
|
|
256
351
|
if (err) reject(err);
|
|
@@ -278,6 +373,25 @@ class FileBackedQueue {
|
|
|
278
373
|
});
|
|
279
374
|
}
|
|
280
375
|
|
|
376
|
+
// mutex 内调用:FS 错误粘性降级
|
|
377
|
+
async __handleFsError(_err) {
|
|
378
|
+
if (this.destroyed || this.fsBroken) return;
|
|
379
|
+
this.fsBroken = true;
|
|
380
|
+
await this.__closeWriteStream();
|
|
381
|
+
try {
|
|
382
|
+
await fs.rm(this.filePath, { force: true });
|
|
383
|
+
} catch (err) {
|
|
384
|
+
/* c8 ignore next 2 -- rm with force rarely fails */
|
|
385
|
+
this.logger?.warn?.('fbq.handleFsError rm error', err);
|
|
386
|
+
}
|
|
387
|
+
this.spilled = false;
|
|
388
|
+
this.writtenBytes = 0;
|
|
389
|
+
this.readOffset = 0;
|
|
390
|
+
this.writeErr = null;
|
|
391
|
+
// 唤醒全部消费者,让它们重新观察状态
|
|
392
|
+
this.__wakeAll();
|
|
393
|
+
}
|
|
394
|
+
|
|
281
395
|
// 调用方必须已持有 mutex,且已确认 !destroyed
|
|
282
396
|
async __refillImpl() {
|
|
283
397
|
if (!this.spilled) return;
|
|
@@ -287,8 +401,10 @@ class FileBackedQueue {
|
|
|
287
401
|
const st = await fs.stat(this.filePath);
|
|
288
402
|
actualEnd = st.size;
|
|
289
403
|
} catch (err) {
|
|
290
|
-
|
|
404
|
+
// 读侧 FS 错误(外部删文件、权限丢失等)走粘性降级,
|
|
405
|
+
// 避免 spilled=true / fsBroken=false 的悬空态让消费者永远挂 waiter。
|
|
291
406
|
this.logger?.warn?.('fbq.refill stat error', err);
|
|
407
|
+
await this.__handleFsError(err);
|
|
292
408
|
return;
|
|
293
409
|
}
|
|
294
410
|
|
|
@@ -302,6 +418,9 @@ class FileBackedQueue {
|
|
|
302
418
|
let cumPayload = 0; // 仅 payload
|
|
303
419
|
let stoppedAtEof = true;
|
|
304
420
|
|
|
421
|
+
const pendingCount = this.memQueue.length - this.head;
|
|
422
|
+
const baseCost = this.memBytes + pendingCount * ENTRY_OVERHEAD;
|
|
423
|
+
|
|
305
424
|
const stream = createReadStream(this.filePath, {
|
|
306
425
|
start: this.readOffset,
|
|
307
426
|
end: actualEnd - 1,
|
|
@@ -311,7 +430,9 @@ class FileBackedQueue {
|
|
|
311
430
|
try {
|
|
312
431
|
for await (const line of rl) {
|
|
313
432
|
const sz = Buffer.byteLength(line, 'utf8');
|
|
314
|
-
|
|
433
|
+
// overhead 一致性:admission 侧已用 overhead,refill 侧同步考虑
|
|
434
|
+
const newLinesCost = newLines.length * ENTRY_OVERHEAD;
|
|
435
|
+
if (newLines.length > 0 && baseCost + cumPayload + newLinesCost + sz + ENTRY_OVERHEAD > this.memBudget) {
|
|
315
436
|
stoppedAtEof = false;
|
|
316
437
|
break;
|
|
317
438
|
}
|
|
@@ -320,10 +441,12 @@ class FileBackedQueue {
|
|
|
320
441
|
cumPayload += sz;
|
|
321
442
|
}
|
|
322
443
|
} catch (err) {
|
|
323
|
-
/* c8 ignore next
|
|
444
|
+
/* c8 ignore next 6 -- read 错误极罕见(stat 已通过、fd 已打开),路径保留用于粘性降级 */
|
|
445
|
+
// read 错误同 stat:统一走粘性降级而非静默 return
|
|
324
446
|
this.logger?.warn?.('fbq.refill read error', err);
|
|
325
447
|
rl.close();
|
|
326
448
|
stream.destroy();
|
|
449
|
+
await this.__handleFsError(err);
|
|
327
450
|
return;
|
|
328
451
|
} finally {
|
|
329
452
|
rl.close();
|
|
@@ -349,7 +472,6 @@ class FileBackedQueue {
|
|
|
349
472
|
this.memQueue.push(line);
|
|
350
473
|
this.memBytes += Buffer.byteLength(line, 'utf8');
|
|
351
474
|
}
|
|
352
|
-
this.diskBytes -= cumPayload;
|
|
353
475
|
|
|
354
476
|
if (this.readOffset >= this.writtenBytes) {
|
|
355
477
|
await this.__dropFile();
|
|
@@ -367,7 +489,6 @@ class FileBackedQueue {
|
|
|
367
489
|
this.spilled = false;
|
|
368
490
|
this.writtenBytes = 0;
|
|
369
491
|
this.readOffset = 0;
|
|
370
|
-
this.diskBytes = 0;
|
|
371
492
|
this.writeErr = null;
|
|
372
493
|
}
|
|
373
494
|
}
|
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* 跨平台 mock os.homedir()
|
|
3
|
-
*
|
|
4
|
-
* Node.js os.homedir() 在不同平台读取不同环境变量:
|
|
5
|
-
* - POSIX: HOME
|
|
6
|
-
* - Windows: USERPROFILE(优先)、HOMEDRIVE+HOMEPATH
|
|
7
|
-
*
|
|
8
|
-
* 测试中需同时设置两端变量,确保 os.homedir() 返回期望路径。
|
|
9
|
-
*/
|
|
10
|
-
|
|
11
|
-
const HOME_VARS = ['HOME', 'USERPROFILE'];
|
|
12
|
-
|
|
13
|
-
/**
|
|
14
|
-
* 保存当前 home 相关环境变量
|
|
15
|
-
* @returns {Record<string, string | undefined>}
|
|
16
|
-
*/
|
|
17
|
-
export function saveHomedir() {
|
|
18
|
-
const saved = {};
|
|
19
|
-
for (const key of HOME_VARS) {
|
|
20
|
-
saved[key] = process.env[key];
|
|
21
|
-
}
|
|
22
|
-
return saved;
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
/**
|
|
26
|
-
* 将 home 相关环境变量统一设置为指定路径
|
|
27
|
-
* @param {string} dir - 目标路径
|
|
28
|
-
*/
|
|
29
|
-
export function setHomedir(dir) {
|
|
30
|
-
for (const key of HOME_VARS) {
|
|
31
|
-
process.env[key] = dir;
|
|
32
|
-
}
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
/**
|
|
36
|
-
* 恢复之前保存的 home 相关环境变量
|
|
37
|
-
* @param {Record<string, string | undefined>} saved
|
|
38
|
-
*/
|
|
39
|
-
export function restoreHomedir(saved) {
|
|
40
|
-
for (const key of HOME_VARS) {
|
|
41
|
-
if (saved[key] === undefined) {
|
|
42
|
-
delete process.env[key];
|
|
43
|
-
} else {
|
|
44
|
-
process.env[key] = saved[key];
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
|
-
}
|