@coclaw/openclaw-coclaw 0.17.3 → 0.17.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -14,6 +14,15 @@ const INITIAL_DELAY_MS = 60 * 60 * 1000; // 60 分钟
|
|
|
14
14
|
const CHECK_INTERVAL_MS = 60 * 60 * 1000; // 1 小时
|
|
15
15
|
const CHANNEL_ID = 'coclaw';
|
|
16
16
|
const LOCK_FILENAME = 'upgrade.lock';
|
|
17
|
+
// 锁年龄兜底:worker 最坏耗时约 36 分钟,TTL 给到约 3 倍余量。
|
|
18
|
+
// 超龄一律视为过期清理,兜住 worker 被强杀未清锁 / PID 被 OS 复用给长命进程的场景,
|
|
19
|
+
// 避免自动升级被永久卡住。
|
|
20
|
+
// 刻意取 110 分钟而非 120 分钟:巡检间隔 60min,锁写入与巡检有秒级抖动;
|
|
21
|
+
// 若 TTL 正好等于巡检间隔的整数倍,锁年龄会在第 N 次巡检时刚好 "未过期",
|
|
22
|
+
// 要等到第 N+1 次巡检才清,白白多浪费一轮。110min 保证第 2 次巡检即过期。
|
|
23
|
+
// 代价是 worker 真卡超 110 分钟会多起一个并行 worker,此概率在当前超时矩阵下极低,
|
|
24
|
+
// 且底层升级命令失败会走回滚,不会破坏插件。
|
|
25
|
+
const LOCK_TTL_MS = 110 * 60 * 1000; // 110 分钟
|
|
17
26
|
|
|
18
27
|
// ── upgrade.lock:保证同时最多一个 worker 进程 ──
|
|
19
28
|
|
|
@@ -21,10 +30,34 @@ export function getLockPath() {
|
|
|
21
30
|
return nodePath.join(resolveStateDir(), CHANNEL_ID, LOCK_FILENAME);
|
|
22
31
|
}
|
|
23
32
|
|
|
33
|
+
/**
 * Remove a stale upgrade lock file and report the outcome.
 *
 * "Stale lock removed" is logged at info level only once `fs.rm` has actually
 * resolved. A rejection is logged at warn level and forwarded to `remoteLog`.
 * Because `{ force: true }` already tolerates a missing file, a rejection here
 * is a genuine fault (the original author lists permission problems, a
 * read-only filesystem, or the path having been replaced by a directory).
 *
 * The function never throws, so callers need no extra catch. Reporting is
 * kept outside the removal `try` and each reporting call is individually
 * guarded: a throwing logger must neither be misreported as a removal failure
 * nor escape to the caller.
 *
 * @param {string} lockPath - path of the lock file to delete
 * @param {'missing-pid'|'ttl-exceeded'|'pid-dead'} reason - cleanup reason token,
 *   also emitted as the `reason=` field of the remoteLog line
 * @param {object} [logger] - optional logger; `info` / `warn` are used when present
 * @returns {Promise<void>}
 */
async function removeStaleLock(lockPath, reason, logger) {
	let removed = false;
	let failure;
	try {
		await fs.rm(lockPath, { force: true });
		removed = true;
	}
	catch (err) {
		failure = err;
	}

	if (removed) {
		try {
			logger?.info?.(`[auto-upgrade] Stale lock removed (${reason})`);
		}
		catch {
			// Logging is best-effort; the lock is gone, which is what matters.
		}
		return;
	}

	const detail = failure?.message;
	try {
		logger?.warn?.(`[auto-upgrade] Stale lock removal failed (${reason}): ${detail}`);
	}
	catch {
		// Keep going: the remote report below must still be attempted.
	}
	try {
		remoteLog(`upgrade.lock-cleanup-failed reason=${reason} msg=${detail}`);
	}
	catch {
		// Remote reporting is best-effort as well; never propagate to the caller.
	}
}
|
|
56
|
+
|
|
24
57
|
/**
|
|
25
58
|
* 检查升级锁是否被持有(worker 进程是否存活)
|
|
26
59
|
*
|
|
27
|
-
*
|
|
60
|
+
* 若锁文件存在但判定为过期(PID 已死 / JSON 无效 / 超龄),顺手清理残留文件。
|
|
28
61
|
* @param {object} [opts]
|
|
29
62
|
* @param {object} [opts.logger]
|
|
30
63
|
* @returns {Promise<boolean>}
|
|
@@ -42,8 +75,14 @@ export async function isUpgradeLocked(opts) {
|
|
|
42
75
|
try {
|
|
43
76
|
const lock = JSON.parse(raw);
|
|
44
77
|
if (!lock.pid) {
|
|
45
|
-
|
|
46
|
-
|
|
78
|
+
await removeStaleLock(lockPath, 'missing-pid', logger);
|
|
79
|
+
return false;
|
|
80
|
+
}
|
|
81
|
+
// 超龄兜底:PID 复用误判、worker 被强杀未清锁等场景下一律视为过期。
|
|
82
|
+
// ts 不可解析也当过期(writeUpgradeLock 必写 ISO 时间戳,缺字段即异常状态)。
|
|
83
|
+
const lockTs = Date.parse(lock.ts);
|
|
84
|
+
if (!Number.isFinite(lockTs) || Date.now() - lockTs > LOCK_TTL_MS) {
|
|
85
|
+
await removeStaleLock(lockPath, 'ttl-exceeded', logger);
|
|
47
86
|
return false;
|
|
48
87
|
}
|
|
49
88
|
// signal 0 不发信号,仅检查进程存活性;进程不存在时抛异常
|
|
@@ -52,8 +91,7 @@ export async function isUpgradeLocked(opts) {
|
|
|
52
91
|
}
|
|
53
92
|
catch {
|
|
54
93
|
// JSON 无效 / PID 已死 → 清理过期锁
|
|
55
|
-
|
|
56
|
-
await fs.rm(lockPath, { force: true }).catch(() => {});
|
|
94
|
+
await removeStaleLock(lockPath, 'pid-dead', logger);
|
|
57
95
|
return false;
|
|
58
96
|
}
|
|
59
97
|
}
|
|
@@ -1,134 +1,221 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* worker-verify.js — 升级后验证
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
* 3. 升级模块健康:`openclaw gateway call coclaw.upgradeHealth` 返回版本号
|
|
4
|
+
* 策略:触发 gateway restart → 轮询 coclaw.upgradeHealth RPC 直到返回版本
|
|
5
|
+
* ≥ toVersion(等于或更新)。单次调用失败(gateway 未就绪 / plugin 未注册 /
|
|
6
|
+
* JSON 非法 / 版本不够新)一律按"稍后重试"处理,在总超时窗口内持续尝试。
|
|
8
7
|
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
8
|
+
* 允许 > toVersion 的原因:scheduler 观察到 latest=x 并发起升级后,到实际
|
|
9
|
+
* 执行 `plugins update` 之间 npm dist-tag 可能已指向 x+1;严格等 x 会把
|
|
10
|
+
* 这种"升级到了更新版本"误判为失败并回滚。
|
|
11
|
+
*
|
|
12
|
+
* 磁盘 package.json 的版本仅作为诊断写入本地日志,不参与判定——openclaw 侧
|
|
13
|
+
* `plugins.installs[id].installPath` 可能在 id-migration 等极端场景发生漂移,
|
|
14
|
+
* 而 upgradeHealth 是 gateway 进程内"新代码真的被加载"的权威信号。
|
|
15
|
+
*
|
|
16
|
+
* worker 运行在独立子进程中,禁止使用 remoteLog;诊断信息全部通过 logger
|
|
17
|
+
* (本地日志)输出,由 updater 记录到 upgrade-log.jsonl。
|
|
11
18
|
*/
|
|
12
19
|
import { execFile as nodeExecFile } from 'node:child_process';
|
|
20
|
+
import { readFile } from 'node:fs/promises';
|
|
21
|
+
import nodePath from 'node:path';
|
|
22
|
+
|
|
23
|
+
// Kept local on purpose: the worker runs in its own child process and does not
// reuse gateway modules across the process boundary.
// NOTE(review): updater-check.js is said to carry a sibling copy of this
// comparison — confirm whether it should receive the same fixes.
/**
 * Report whether version `a` has strictly higher semver precedence than `b`.
 *
 * - Build metadata (`+...`) is ignored, as semver requires.
 * - The numeric core `x.y.z` is compared first.
 * - With equal cores, a release outranks a pre-release, and two pre-releases
 *   are ordered identifier by identifier: numeric identifiers numerically,
 *   numeric below alphanumeric, alphanumeric by ASCII order, and the longer
 *   identifier list wins when all shared identifiers are equal.
 *
 * @param {string} a - candidate version
 * @param {string} b - reference version
 * @returns {boolean} true only when `a` is strictly newer than `b`
 */
function isNewerVersion(a, b) {
	const dissect = (v) => {
		// Drop build metadata first so a '-' inside it is not mistaken for a pre-release.
		const plus = v.indexOf('+');
		const noBuild = plus === -1 ? v : v.slice(0, plus);
		const dash = noBuild.indexOf('-');
		const core = dash === -1 ? noBuild : noBuild.slice(0, dash);
		return {
			core: core.split('.').map(Number),
			pre: dash === -1 ? null : noBuild.slice(dash + 1).split('.'),
		};
	};
	const va = dissect(a);
	const vb = dissect(b);

	for (let i = 0; i < 3; i++) {
		/* c8 ignore next 2 -- ?? fallback: well-formed semver never has a missing segment */
		const x = va.core[i] ?? 0;
		const y = vb.core[i] ?? 0;
		if (x > y) return true;
		if (x < y) return false;
	}

	// Same x.y.z: release > pre-release (semver rule).
	if (va.pre === null || vb.pre === null) {
		return va.pre === null && vb.pre !== null;
	}

	// Both are pre-releases of the same core: compare identifiers left to right.
	const isNumeric = (id) => /^\d+$/.test(id);
	const len = Math.max(va.pre.length, vb.pre.length);
	for (let i = 0; i < len; i++) {
		const x = va.pre[i];
		const y = vb.pre[i];
		if (x === undefined) return false; // `a` ran out of identifiers first → lower
		if (y === undefined) return true;
		const xNum = isNumeric(x);
		const yNum = isNumeric(y);
		if (xNum && yNum) {
			const diff = Number(x) - Number(y);
			if (diff !== 0) return diff > 0;
		}
		else if (xNum !== yNum) {
			// Numeric identifiers always rank below alphanumeric ones.
			return yNum;
		}
		else if (x !== y) {
			return x > y;
		}
	}
	return false;
}
|
|
13
39
|
|
|
14
|
-
const GATEWAY_READY_TIMEOUT_MS = 60_000;
|
|
15
|
-
const POLL_INTERVAL_MS = 2000;
|
|
16
40
|
const CMD_TIMEOUT_MS = 30_000;
|
|
41
|
+
const HEALTH_POLL_INTERVAL_MS = 3_000;
|
|
42
|
+
// 本机 openclaw 冷启动可能需访问外部资源(AWS 诊断、ollama 探测等)
|
|
43
|
+
// 及插件 bootstrap,合计 30~60s 常见;5 分钟给足余量
|
|
44
|
+
const HEALTH_TOTAL_TIMEOUT_MS = 5 * 60 * 1000;
|
|
17
45
|
|
|
18
46
|
/**
|
|
19
|
-
* 执行命令并返回 stdout
|
|
47
|
+
* 执行命令并返回 stdout;错误对象附带 stderr 以便诊断
|
|
20
48
|
* @param {string} cmd
|
|
21
49
|
* @param {string[]} args
|
|
22
50
|
* @param {object} [opts]
|
|
23
51
|
* @param {Function} [opts.execFileFn]
|
|
52
|
+
* @param {number} [opts.cmdTimeoutMs]
|
|
24
53
|
* @returns {Promise<string>}
|
|
25
54
|
*/
|
|
26
55
|
function runCmd(cmd, args, opts) {
	// Resolve the injectable exec implementation and the per-command timeout.
	/* c8 ignore next -- ?./?? fallback */
	const execImpl = opts?.execFileFn ?? nodeExecFile;
	/* c8 ignore next -- ?./?? fallback */
	const timeoutMs = opts?.cmdTimeoutMs ?? CMD_TIMEOUT_MS;
	const execOptions = { timeout: timeoutMs, shell: process.platform === 'win32' };

	return new Promise((resolve, reject) => {
		const onDone = (err, stdout, stderr) => {
			if (!err) {
				resolve(String(stdout).trim());
				return;
			}
			// Attach stderr to the error so callers can surface it in diagnostics.
			/* c8 ignore next -- ?? fallback: exec implementations may not hand over a string stderr */
			err.stderr = String(stderr ?? '');
			reject(err);
		};
		execImpl(cmd, args, execOptions, onDone);
	});
}
|
|
36
71
|
|
|
37
72
|
/**
|
|
38
|
-
*
|
|
73
|
+
* 触发一次 gateway 重启;失败不抛(后续轮询 RPC 会兜底验证 gateway 是否就绪)
|
|
39
74
|
* @param {object} [opts]
|
|
40
75
|
* @param {Function} [opts.execFileFn]
|
|
41
|
-
* @param {number} [opts.timeoutMs]
|
|
42
|
-
* @param {number} [opts.pollIntervalMs]
|
|
43
76
|
* @returns {Promise<void>}
|
|
44
77
|
*/
|
|
45
|
-
export async function
|
|
46
|
-
// 主动触发重启,不依赖 OpenClaw 的文件变更自动重启策略
|
|
78
|
+
export async function triggerGatewayRestart(opts) {
|
|
47
79
|
try {
|
|
48
80
|
await runCmd('openclaw', ['gateway', 'restart'], opts);
|
|
49
81
|
}
|
|
50
82
|
catch {
|
|
51
|
-
// restart
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
/* c8 ignore next 2 -- ?./?? fallback */
|
|
55
|
-
const timeout = opts?.timeoutMs ?? GATEWAY_READY_TIMEOUT_MS;
|
|
56
|
-
const interval = opts?.pollIntervalMs ?? POLL_INTERVAL_MS;
|
|
57
|
-
const start = Date.now();
|
|
58
|
-
|
|
59
|
-
while (Date.now() - start < timeout) {
|
|
60
|
-
try {
|
|
61
|
-
const output = await runCmd('openclaw', ['gateway', 'status'], opts);
|
|
62
|
-
if (output.includes('running')) return;
|
|
63
|
-
}
|
|
64
|
-
catch {
|
|
65
|
-
// gateway 未就绪,继续轮询
|
|
66
|
-
}
|
|
67
|
-
await sleep(interval);
|
|
83
|
+
// restart 命令本身失败不阻断:openclaw 可能已在重启/daemon 自恢复;
|
|
84
|
+
// 无论如何都进入后续 upgradeHealth 轮询,由它判定 gateway 最终是否可用
|
|
68
85
|
}
|
|
69
|
-
|
|
70
|
-
throw new Error('Gateway did not become ready within timeout');
|
|
71
86
|
}
|
|
72
87
|
|
|
73
88
|
/**
|
|
74
|
-
*
|
|
75
|
-
* @param {string}
|
|
76
|
-
* @
|
|
77
|
-
* @param {Function} [opts.execFileFn]
|
|
78
|
-
* @returns {Promise<void>}
|
|
89
|
+
* 读取磁盘 package.json 的版本号(诊断用途,不参与判定)
|
|
90
|
+
* @param {string} pluginDir
|
|
91
|
+
* @returns {Promise<string | null>}
|
|
79
92
|
*/
|
|
80
|
-
export async function
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
93
|
+
export async function readDiskPackageVersion(pluginDir) {
	// Any read or parse failure collapses to null: the value is diagnostic only.
	let manifest;
	try {
		const text = await readFile(nodePath.join(pluginDir, 'package.json'), 'utf8');
		manifest = JSON.parse(text);
	}
	catch {
		return null;
	}
	// Only a string `version` field counts; everything else is reported as null.
	const version = manifest?.version;
	if (typeof version !== 'string') return null;
	return version;
}
|
|
86
104
|
|
|
87
105
|
/**
|
|
88
|
-
*
|
|
106
|
+
* 单次调用 coclaw.upgradeHealth;永不抛异常,失败归一化为 { ok: false, reason }
|
|
89
107
|
* @param {object} [opts]
|
|
90
|
-
* @
|
|
91
|
-
* @returns {Promise<string>} 返回版本号
|
|
108
|
+
* @returns {Promise<{ ok: true, version: string } | { ok: false, reason: string }>}
|
|
92
109
|
*/
|
|
93
|
-
|
|
94
|
-
const output = await runCmd(
|
|
95
|
-
'openclaw',
|
|
96
|
-
['gateway', 'call', 'coclaw.upgradeHealth', '--json'],
|
|
97
|
-
opts,
|
|
98
|
-
);
|
|
110
|
+
async function callUpgradeHealthOnce(opts) {
	const rpcArgs = ['gateway', 'call', 'coclaw.upgradeHealth', '--json'];
	try {
		const raw = await runCmd('openclaw', rpcArgs, opts);

		// Parse separately so malformed output gets its own reason token.
		let parsed;
		let parseFailed = false;
		try {
			parsed = JSON.parse(raw);
		}
		catch {
			parseFailed = true;
		}
		if (parseFailed) {
			return { ok: false, reason: `invalid-json: ${raw.slice(0, 120)}` };
		}

		if (parsed?.version) {
			return { ok: true, version: String(parsed.version) };
		}
		return { ok: false, reason: 'missing-version' };
	}
	catch (err) {
		// Command failure: prefer stderr, then the error message, capped at 200 chars.
		let stderrText = '';
		if (typeof err?.stderr === 'string') stderrText = err.stderr.trim();
		/* c8 ignore next -- ?? fallback */
		const message = err?.message ?? String(err);
		return { ok: false, reason: (stderrText || message || 'unknown').slice(0, 200) };
	}
}
|
|
111
135
|
|
|
112
136
|
/**
|
|
113
|
-
*
|
|
114
|
-
* @param {string}
|
|
137
|
+
* 轮询 upgradeHealth 直到版本 ≥ toVersion,或总超时
|
|
138
|
+
* @param {string} toVersion
|
|
115
139
|
* @param {object} [opts]
|
|
116
140
|
* @param {Function} [opts.execFileFn]
|
|
117
|
-
* @param {number} [opts.
|
|
141
|
+
* @param {number} [opts.totalTimeoutMs]
|
|
118
142
|
* @param {number} [opts.pollIntervalMs]
|
|
119
|
-
* @
|
|
143
|
+
* @param {number} [opts.cmdTimeoutMs]
|
|
144
|
+
* @returns {Promise<{ ok: true, version: string, attempts: number, elapsedMs: number }
|
|
145
|
+
* | { ok: false, attempts: number, elapsedMs: number, lastReason: string, lastVersion: string }>}
|
|
120
146
|
*/
|
|
121
|
-
export async function
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
147
|
+
export async function pollUpgradeHealth(toVersion, opts) {
	/* c8 ignore next -- ?? fallback */
	const budgetMs = opts?.totalTimeoutMs ?? HEALTH_TOTAL_TIMEOUT_MS;
	/* c8 ignore next -- ?? fallback */
	const gapMs = opts?.pollIntervalMs ?? HEALTH_POLL_INTERVAL_MS;
	const startedAt = Date.now();
	const elapsed = () => Date.now() - startedAt;

	let attempts = 0;
	let lastReason = '';
	let lastVersion = '';

	while (elapsed() < budgetMs) {
		attempts += 1;
		const probe = await callUpgradeHealthOnce(opts);
		if (!probe.ok) {
			lastReason = probe.reason;
		}
		else {
			const seen = probe.version;
			// Equal or newer both count as success, covering a dist-tag that
			// moved forward during the upgrade window.
			if (seen === toVersion || isNewerVersion(seen, toVersion)) {
				return {
					ok: true,
					version: seen,
					attempts,
					elapsedMs: elapsed(),
				};
			}
			lastVersion = seen;
			lastReason = `version-too-old got=${seen} want>=${toVersion}`;
		}
		// Not enough budget left for another interval: stop now rather than
		// sleeping once more for nothing.
		if (elapsed() + gapMs >= budgetMs) break;
		await sleep(gapMs);
	}

	return {
		ok: false,
		attempts,
		elapsedMs: elapsed(),
		lastReason,
		lastVersion,
	};
}
|
|
189
|
+
|
|
190
|
+
/**
 * Full verification flow: trigger a gateway restart, read the on-disk version
 * (diagnostics only), then poll upgradeHealth.
 *
 * @param {string} pluginDir - plugin install directory (the authoritative installPath from openclaw.json)
 * @param {string} toVersion - target version
 * @param {object} [opts]
 * @param {Function} [opts.execFileFn]
 * @param {number} [opts.totalTimeoutMs]
 * @param {number} [opts.pollIntervalMs]
 * @param {number} [opts.cmdTimeoutMs]
 * @param {Function} [log] - local log function; ignored when not a function
 * @returns {Promise<{ ok: true, version: string } | { ok: false, error: string }>}
 */
export async function verifyUpgrade(pluginDir, toVersion, opts, log) {
	const emit = (line) => {
		if (typeof log === 'function') log(line);
	};

	await triggerGatewayRestart(opts);

	// The disk version is only logged; it does not influence the verdict.
	const diskVersion = await readDiskPackageVersion(pluginDir);
	emit(`[upgrade-worker] On-disk package.json version: ${diskVersion ?? '(unreadable)'} (expected ${toVersion})`);

	const health = await pollUpgradeHealth(toVersion, opts);
	if (!health.ok) {
		const error = `verify timeout: attempts=${health.attempts} elapsed=${health.elapsedMs}ms lastVersion=${health.lastVersion || '(none)'} lastReason=${health.lastReason || '(none)'}`;
		emit(`[upgrade-worker] ${error}`);
		return { ok: false, error };
	}

	emit(`[upgrade-worker] upgradeHealth verified: version=${health.version} attempts=${health.attempts} elapsed=${health.elapsedMs}ms`);
	return { ok: true, version: health.version };
}
|
|
133
220
|
|
|
134
221
|
function sleep(ms) {
|
|
@@ -15,13 +15,17 @@
|
|
|
15
15
|
import { execFile as nodeExecFile } from 'node:child_process';
|
|
16
16
|
import { parseArgs } from 'node:util';
|
|
17
17
|
import { createBackup, restoreFromBackup, removeBackup } from './worker-backup.js';
|
|
18
|
-
import { verifyUpgrade,
|
|
18
|
+
import { verifyUpgrade, triggerGatewayRestart } from './worker-verify.js';
|
|
19
19
|
import { addSkippedVersion, updateLastUpgrade, appendLog } from './state.js';
|
|
20
20
|
import { getCurrentNpmRegistry, pickFallbackRegistry } from './registry-fallback.js';
|
|
21
21
|
|
|
22
22
|
const SEMVER_RE = /^\d+\.\d+\.\d+(-[\w.-]+)?$/;
|
|
23
23
|
// 单次 plugins update 上限:包含 npm install 大型 native deps,慢网络 + 弱机器需较长时间
|
|
24
24
|
const UPDATE_TIMEOUT_MS = 10 * 60 * 1000;
|
|
25
|
+
// 回滚兜底重装旧版本走的是同一条 npm 下载链路,且触发前置本身是"备份已丢"的异常态,
|
|
26
|
+
// 此时尽量兜住比快速失败更重要,与 UPDATE_TIMEOUT_MS 对齐
|
|
27
|
+
const FALLBACK_INSTALL_TIMEOUT_MS = 10 * 60 * 1000;
|
|
28
|
+
const FALLBACK_UNINSTALL_TIMEOUT_MS = 60 * 1000;
|
|
25
29
|
|
|
26
30
|
/**
|
|
27
31
|
* 执行 openclaw plugins update
|
|
@@ -73,7 +77,7 @@ async function fallbackInstallOldVersion(pkgName, version, pluginId, opts) {
|
|
|
73
77
|
}
|
|
74
78
|
/* c8 ignore next -- ?./?? fallback */
|
|
75
79
|
const doExecFile = opts?.execFileFn ?? nodeExecFile;
|
|
76
|
-
const run = (args, timeout
|
|
80
|
+
const run = (args, timeout) => new Promise((resolve, reject) => {
|
|
77
81
|
doExecFile('openclaw', args, { timeout, shell: process.platform === 'win32' }, (err) => {
|
|
78
82
|
if (err) reject(err);
|
|
79
83
|
else resolve();
|
|
@@ -82,13 +86,13 @@ async function fallbackInstallOldVersion(pkgName, version, pluginId, opts) {
|
|
|
82
86
|
|
|
83
87
|
// 先卸载:install 不支持覆盖已安装插件
|
|
84
88
|
try {
|
|
85
|
-
await run(['plugins', 'uninstall', pluginId],
|
|
89
|
+
await run(['plugins', 'uninstall', pluginId], FALLBACK_UNINSTALL_TIMEOUT_MS);
|
|
86
90
|
} catch {
|
|
87
91
|
// uninstall 失败不阻断,继续尝试 install
|
|
88
92
|
}
|
|
89
93
|
|
|
90
94
|
try {
|
|
91
|
-
await run(['plugins', 'install', `${pkgName}@${version}`]);
|
|
95
|
+
await run(['plugins', 'install', `${pkgName}@${version}`], FALLBACK_INSTALL_TIMEOUT_MS);
|
|
92
96
|
} catch (err) {
|
|
93
97
|
throw new Error(`fallback install failed: ${err.message}`);
|
|
94
98
|
}
|
|
@@ -160,7 +164,7 @@ export async function runUpgrade({ pluginDir, fromVersion, toVersion, pluginId,
|
|
|
160
164
|
|
|
161
165
|
// 3. 等待 gateway 重启并验证
|
|
162
166
|
log('[upgrade-worker] Verifying upgrade...');
|
|
163
|
-
const result = await verifyUpgrade(
|
|
167
|
+
const result = await verifyUpgrade(pluginDir, toVersion, opts, log);
|
|
164
168
|
|
|
165
169
|
if (result.ok) {
|
|
166
170
|
// 4a. 成功
|
|
@@ -171,8 +175,11 @@ export async function runUpgrade({ pluginDir, fromVersion, toVersion, pluginId,
|
|
|
171
175
|
catch (e) {
|
|
172
176
|
log(`[upgrade-worker] Backup cleanup failed (non-fatal): ${e.message}`);
|
|
173
177
|
}
|
|
174
|
-
|
|
175
|
-
|
|
178
|
+
// 记录真实装上的版本而非目标版本——dist-tag 前移窗口下两者可能不同。
|
|
179
|
+
// 不加 fallback:若 result.ok 时 version 缺失,说明上游契约被破坏,
|
|
180
|
+
// 宁可让状态里直接暴露 undefined 便于排障,也不要用 toVersion 糊过去
|
|
181
|
+
await updateLastUpgrade({ from: fromVersion, to: result.version, result: 'ok' });
|
|
182
|
+
await appendLog({ from: fromVersion, to: result.version, result: 'ok' });
|
|
176
183
|
log('[upgrade-worker] Upgrade complete');
|
|
177
184
|
} else {
|
|
178
185
|
// 4b. 失败,回滚
|
|
@@ -212,15 +219,9 @@ async function handleRollback({ pluginDir, fromVersion, toVersion, pluginId, pkg
|
|
|
212
219
|
}
|
|
213
220
|
}
|
|
214
221
|
|
|
215
|
-
//
|
|
216
|
-
log('[upgrade-worker]
|
|
217
|
-
|
|
218
|
-
await waitForGateway(opts);
|
|
219
|
-
log('[upgrade-worker] Gateway restarted after rollback');
|
|
220
|
-
}
|
|
221
|
-
catch {
|
|
222
|
-
log('[upgrade-worker] Gateway did not restart after rollback');
|
|
223
|
-
}
|
|
222
|
+
// 触发 gateway 重启让老版本回到运行态(尽力而为,不验证结果)
|
|
223
|
+
log('[upgrade-worker] Triggering gateway restart after rollback...');
|
|
224
|
+
await triggerGatewayRestart(opts);
|
|
224
225
|
|
|
225
226
|
// 记录状态(顺序执行因共享 state 文件,但各自 try/catch 避免单个失败阻断其余)
|
|
226
227
|
// 仅验证失败(新版本确实被加载并发现有问题)才标记为 skipped;
|
package/src/realtime-bridge.js
CHANGED
|
@@ -1087,7 +1087,7 @@ export class RealtimeBridge {
|
|
|
1087
1087
|
// 1. 尝试 pion(最高优先级)
|
|
1088
1088
|
const preloadPionFn = this.__preloadPion
|
|
1089
1089
|
?? (await import('./webrtc/pion-preloader.js')).preloadPion;
|
|
1090
|
-
const pionResult = await preloadPionFn().catch((err) => {
|
|
1090
|
+
const pionResult = await preloadPionFn({ logger: this.logger }).catch((err) => {
|
|
1091
1091
|
this.logger.warn?.(`[coclaw] pion preload unexpected failure: ${err?.message}`);
|
|
1092
1092
|
return null;
|
|
1093
1093
|
});
|
|
@@ -4,12 +4,13 @@
|
|
|
4
4
|
*
|
|
5
5
|
* 行为约定详见 docs/rpc-dc-file-queue.md。
|
|
6
6
|
* - FIFO、单一生产者/消费者;多消费者时每条只交付给其中一个。
|
|
7
|
-
* -
|
|
7
|
+
* - 构造纯字段初始化,不碰 FS;使用前需 `await q.init()`。
|
|
8
8
|
* - 消费侧:`for await (const item of queue) { ... }`;`destroy()` 让迭代结束。
|
|
9
|
+
* - FS 异常下进入 `fsBroken` 粘性降级:mem 路径继续工作,溢出消息 drop。
|
|
9
10
|
*/
|
|
10
11
|
|
|
11
12
|
import fs from 'node:fs/promises';
|
|
12
|
-
import { createReadStream, createWriteStream
|
|
13
|
+
import { createReadStream, createWriteStream } from 'node:fs';
|
|
13
14
|
import nodePath from 'node:path';
|
|
14
15
|
import readline from 'node:readline';
|
|
15
16
|
|
|
@@ -18,13 +19,22 @@ import { createMutex } from './mutex.js';
|
|
|
18
19
|
const DEFAULT_MEM_BUDGET = 8 * 1024 * 1024;
|
|
19
20
|
const DEFAULT_DISK_CAP = 1024 * 1024 * 1024;
|
|
20
21
|
|
|
22
|
+
// JS 对象开销估算(string header + array slot 等),仅用于 admission 决策不影响 memBytes 报告
|
|
23
|
+
const ENTRY_OVERHEAD = 64;
|
|
24
|
+
|
|
25
|
+
// id 字符集:UUID / 字母数字 / 点 / 下划线 / 减号,且不能是 "." 或 ".."
|
|
26
|
+
const ID_RE = /^[A-Za-z0-9._-]+$/;
|
|
27
|
+
|
|
28
|
+
// 压缩阈值:head 越过 64 且占 memQueue 一半以上时切片回收
|
|
29
|
+
const COMPACT_HEAD_THRESHOLD = 64;
|
|
30
|
+
|
|
21
31
|
class FileBackedQueue {
|
|
22
32
|
/**
|
|
23
33
|
* @param {object} opts
|
|
24
34
|
* @param {string} opts.dir - 队列文件根目录
|
|
25
|
-
* @param {string} opts.id -
|
|
35
|
+
* @param {string} opts.id - 队列标识,字符集受限,防路径穿越
|
|
26
36
|
* @param {number} [opts.memBudget=8MB] - 内存持有字节数上限
|
|
27
|
-
* @param {number} [opts.diskCap=1GB] -
|
|
37
|
+
* @param {number} [opts.diskCap=1GB] - 磁盘+内存总字节数硬上限(含 `\n`)
|
|
28
38
|
* @param {(reason: string, size: number) => void} [opts.onDrop] - 拒入队时的回调
|
|
29
39
|
* @param {{ warn?: Function, info?: Function, error?: Function }} [opts.logger=console]
|
|
30
40
|
*/
|
|
@@ -40,6 +50,17 @@ class FileBackedQueue {
|
|
|
40
50
|
|
|
41
51
|
if (!dir || typeof dir !== 'string') throw new TypeError('dir is required');
|
|
42
52
|
if (!id || typeof id !== 'string') throw new TypeError('id is required');
|
|
53
|
+
if (id === '.' || id === '..' || !ID_RE.test(id)) {
|
|
54
|
+
throw new TypeError('id contains invalid characters');
|
|
55
|
+
}
|
|
56
|
+
// 基础设施 fail-fast:容量参数必须是有限正数,避免 NaN/Infinity/非数字绕过 admission。
|
|
57
|
+
// NaN 与任何数比较皆为 false → admission 永远通过 → diskCap 变相失效。
|
|
58
|
+
if (!Number.isFinite(memBudget) || memBudget <= 0) {
|
|
59
|
+
throw new TypeError('memBudget must be a finite positive number');
|
|
60
|
+
}
|
|
61
|
+
if (!Number.isFinite(diskCap) || diskCap <= 0) {
|
|
62
|
+
throw new TypeError('diskCap must be a finite positive number');
|
|
63
|
+
}
|
|
43
64
|
|
|
44
65
|
this.dir = dir;
|
|
45
66
|
this.id = id;
|
|
@@ -48,28 +69,52 @@ class FileBackedQueue {
|
|
|
48
69
|
this.onDrop = onDrop;
|
|
49
70
|
this.logger = logger;
|
|
50
71
|
|
|
51
|
-
this.
|
|
52
|
-
this.filePath = nodePath.join(this.subdir, 'queue.jsonl');
|
|
72
|
+
this.filePath = nodePath.join(dir, `${id}.jsonl`);
|
|
53
73
|
|
|
74
|
+
// 单文件 ring-ish 结构:head 指针 + 数组;shift 为 O(1) 摊销
|
|
54
75
|
this.memQueue = [];
|
|
76
|
+
this.head = 0;
|
|
55
77
|
this.memBytes = 0;
|
|
56
|
-
this.diskBytes = 0; // 磁盘上未消费的 payload 字节(不含分隔 \n)
|
|
57
78
|
this.writtenBytes = 0; // 已写入文件的累计字节(含 \n)
|
|
58
79
|
this.readOffset = 0; // 下次 refill 的起始偏移
|
|
59
80
|
this.spilled = false;
|
|
81
|
+
this.initialized = false;
|
|
60
82
|
this.destroyed = false;
|
|
83
|
+
this.fsBroken = false; // 粘性:一旦 FS 出错,不再尝试 reopen
|
|
61
84
|
this.writeStream = null;
|
|
62
85
|
this.writeErr = null;
|
|
63
86
|
this.waiters = [];
|
|
64
87
|
this.mutex = createMutex();
|
|
88
|
+
}
|
|
65
89
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
90
|
+
/**
|
|
91
|
+
* 派生的未消费磁盘字节数(含 \n),用于 admission 与 stats。
|
|
92
|
+
*/
|
|
93
|
+
get diskBytes() {
|
|
94
|
+
return this.writtenBytes - this.readOffset;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/**
|
|
98
|
+
* 异步初始化:清理残留文件,标记可用。幂等。
|
|
99
|
+
*/
|
|
100
|
+
async init() {
|
|
101
|
+
return await this.mutex.withLock(async () => {
|
|
102
|
+
if (this.destroyed) return;
|
|
103
|
+
if (this.initialized) return;
|
|
104
|
+
try {
|
|
105
|
+
await fs.rm(this.filePath, { force: true });
|
|
106
|
+
} catch (err) {
|
|
107
|
+
// best-effort:init 的 rm 可能因 ENOTDIR / EACCES 等失败。
|
|
108
|
+
// 权威残留清理在 __openWriteStream 中(首次 spill 前)再做一次,
|
|
109
|
+
// 确保不会用 'a' flag 追加到旧数据上污染 FIFO。
|
|
110
|
+
this.logger?.warn?.('fbq.init rm warning', err);
|
|
111
|
+
}
|
|
112
|
+
this.initialized = true;
|
|
113
|
+
});
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
async [Symbol.asyncDispose]() {
|
|
117
|
+
await this.destroy();
|
|
73
118
|
}
|
|
74
119
|
|
|
75
120
|
/**
|
|
@@ -80,28 +125,45 @@ class FileBackedQueue {
|
|
|
80
125
|
async enqueue(jsonStr) {
|
|
81
126
|
return await this.mutex.withLock(async () => {
|
|
82
127
|
if (this.destroyed) return false;
|
|
128
|
+
if (!this.initialized) throw new TypeError('queue not initialized');
|
|
83
129
|
if (typeof jsonStr !== 'string') throw new TypeError('jsonStr must be a string');
|
|
84
130
|
|
|
85
131
|
const size = Buffer.byteLength(jsonStr, 'utf8');
|
|
86
132
|
|
|
87
|
-
|
|
133
|
+
// admission:按物理占用(mem + 已写文件总字节,含 \n)判定,保证 diskCap 是真正的硬上限。
|
|
134
|
+
// 用 writtenBytes(不减 readOffset)的含义:文件前缀已读但未被 __dropFile 回收前仍算占用。
|
|
135
|
+
// 代价:持续背压下消费者还没追到写端时新消息可能被 drop,直到完全 drain 触发 __dropFile 重置。
|
|
136
|
+
if (this.memBytes + this.writtenBytes + size + 1 > this.diskCap) {
|
|
88
137
|
this.__dispatchDrop('disk-cap', size);
|
|
89
138
|
return false;
|
|
90
139
|
}
|
|
91
140
|
|
|
92
|
-
//
|
|
93
|
-
if (!this.spilled
|
|
94
|
-
this.memQueue.
|
|
95
|
-
this.memBytes
|
|
96
|
-
this.
|
|
97
|
-
|
|
141
|
+
// 内存路径:未溢出且 admission 通过(考虑 overhead;首条无论多大都收)
|
|
142
|
+
if (!this.spilled) {
|
|
143
|
+
const pendingCount = this.memQueue.length - this.head;
|
|
144
|
+
const cost = this.memBytes + pendingCount * ENTRY_OVERHEAD + size + ENTRY_OVERHEAD;
|
|
145
|
+
if (pendingCount === 0 || cost <= this.memBudget) {
|
|
146
|
+
this.memQueue.push(jsonStr);
|
|
147
|
+
this.memBytes += size;
|
|
148
|
+
this.__wakeOne();
|
|
149
|
+
return true;
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
// 溢出路径:FS 已破直接 drop,不再尝试 reopen
|
|
154
|
+
if (this.fsBroken) {
|
|
155
|
+
this.__dispatchDrop('fs-error', size);
|
|
156
|
+
return false;
|
|
98
157
|
}
|
|
99
158
|
|
|
100
|
-
// 溢出路径:lazy 打开写流
|
|
101
159
|
if (!this.spilled) {
|
|
102
160
|
await this.__openWriteStream();
|
|
103
161
|
if (this.writeErr) {
|
|
162
|
+
const err = this.writeErr;
|
|
104
163
|
this.__dispatchDrop('fs-error', size);
|
|
164
|
+
// 前置 mkdir/rm 失败也进入粘性降级:与 stream 'error' 路径语义一致,
|
|
165
|
+
// 避免后续每次 overflow 都重试同一个持续性 FS 故障。
|
|
166
|
+
await this.__handleFsError(err);
|
|
105
167
|
return false;
|
|
106
168
|
}
|
|
107
169
|
this.spilled = true;
|
|
@@ -109,32 +171,39 @@ class FileBackedQueue {
|
|
|
109
171
|
|
|
110
172
|
try {
|
|
111
173
|
await this.__writeLine(jsonStr + '\n');
|
|
112
|
-
this.diskBytes += size;
|
|
113
174
|
this.writtenBytes += size + 1;
|
|
114
175
|
this.__wakeOne();
|
|
115
176
|
return true;
|
|
116
177
|
} catch (err) {
|
|
117
178
|
this.logger?.warn?.('fbq.enqueue fs-error', err);
|
|
118
179
|
this.__dispatchDrop('fs-error', size);
|
|
180
|
+
// 直接在当前锁内触发粘性降级:真实 Node stream 下 cb err 通常也会 emit 'error'
|
|
181
|
+
// (监听器会另外排一次 handleFsError,但 fsBroken 已置 → no-op);测试里的 monkey-patch
|
|
182
|
+
// 只触发 cb、不发 'error',这里主动降级保证行为一致。
|
|
183
|
+
await this.__handleFsError(err);
|
|
119
184
|
return false;
|
|
120
185
|
}
|
|
121
186
|
});
|
|
122
187
|
}
|
|
123
188
|
|
|
124
189
|
/**
|
|
125
|
-
* @returns {{ memCount: number, memBytes: number, diskBytes: number, spilled: boolean }}
|
|
190
|
+
* @returns {{ memCount: number, memBytes: number, diskBytes: number, writtenBytes: number, spilled: boolean, fsBroken: boolean }}
|
|
191
|
+
* - diskBytes:未消费 backlog(writtenBytes - readOffset)
|
|
192
|
+
* - writtenBytes:本次生命周期累计已写字节(admission 依据的物理占用),drain 或 FS 降级后重置为 0
|
|
126
193
|
*/
|
|
127
194
|
stats() {
|
|
128
195
|
return {
|
|
129
|
-
memCount: this.memQueue.length,
|
|
196
|
+
memCount: this.memQueue.length - this.head,
|
|
130
197
|
memBytes: this.memBytes,
|
|
131
198
|
diskBytes: this.diskBytes,
|
|
199
|
+
writtenBytes: this.writtenBytes,
|
|
132
200
|
spilled: this.spilled,
|
|
201
|
+
fsBroken: this.fsBroken,
|
|
133
202
|
};
|
|
134
203
|
}
|
|
135
204
|
|
|
136
205
|
/**
|
|
137
|
-
*
|
|
206
|
+
* 清空数据但保留实例可用;显式清 fsBroken,允许再次尝试落盘。
|
|
138
207
|
*/
|
|
139
208
|
async clear() {
|
|
140
209
|
return await this.mutex.withLock(async () => {
|
|
@@ -147,17 +216,18 @@ class FileBackedQueue {
|
|
|
147
216
|
this.logger?.warn?.('fbq.clear rm error', err);
|
|
148
217
|
}
|
|
149
218
|
this.memQueue = [];
|
|
219
|
+
this.head = 0;
|
|
150
220
|
this.memBytes = 0;
|
|
151
|
-
this.diskBytes = 0;
|
|
152
221
|
this.writtenBytes = 0;
|
|
153
222
|
this.readOffset = 0;
|
|
154
223
|
this.spilled = false;
|
|
224
|
+
this.fsBroken = false;
|
|
155
225
|
this.writeErr = null;
|
|
156
226
|
});
|
|
157
227
|
}
|
|
158
228
|
|
|
159
229
|
/**
|
|
160
|
-
* 停写、关 FD
|
|
230
|
+
* 停写、关 FD、删文件、结束所有迭代器。幂等。
|
|
161
231
|
*/
|
|
162
232
|
async destroy() {
|
|
163
233
|
return await this.mutex.withLock(async () => {
|
|
@@ -170,15 +240,15 @@ class FileBackedQueue {
|
|
|
170
240
|
|
|
171
241
|
await this.__closeWriteStream();
|
|
172
242
|
try {
|
|
173
|
-
await fs.rm(this.
|
|
243
|
+
await fs.rm(this.filePath, { force: true });
|
|
174
244
|
} catch (err) {
|
|
175
245
|
/* c8 ignore next 2 -- rm with force rarely fails */
|
|
176
246
|
this.logger?.warn?.('fbq.destroy rm error', err);
|
|
177
247
|
}
|
|
178
248
|
|
|
179
249
|
this.memQueue = [];
|
|
250
|
+
this.head = 0;
|
|
180
251
|
this.memBytes = 0;
|
|
181
|
-
this.diskBytes = 0;
|
|
182
252
|
this.writtenBytes = 0;
|
|
183
253
|
this.readOffset = 0;
|
|
184
254
|
this.spilled = false;
|
|
@@ -198,12 +268,20 @@ class FileBackedQueue {
|
|
|
198
268
|
while (true) {
|
|
199
269
|
let waitPromise = null;
|
|
200
270
|
const result = await this.mutex.withLock(async () => {
|
|
201
|
-
|
|
271
|
+
const pendingCount = this.memQueue.length - this.head;
|
|
272
|
+
if (pendingCount === 0 && this.spilled && !this.destroyed) {
|
|
202
273
|
await this.__refillImpl();
|
|
203
274
|
}
|
|
204
|
-
if (this.memQueue.length > 0) {
|
|
205
|
-
const item = this.memQueue.
|
|
275
|
+
if (this.memQueue.length - this.head > 0) {
|
|
276
|
+
const item = this.memQueue[this.head];
|
|
277
|
+
this.memQueue[this.head] = undefined;
|
|
278
|
+
this.head += 1;
|
|
206
279
|
this.memBytes -= Buffer.byteLength(item, 'utf8');
|
|
280
|
+
// 惰性压缩:避免 head 一直向前、数组永不回收
|
|
281
|
+
if (this.head > COMPACT_HEAD_THRESHOLD && this.head * 2 >= this.memQueue.length) {
|
|
282
|
+
this.memQueue = this.memQueue.slice(this.head);
|
|
283
|
+
this.head = 0;
|
|
284
|
+
}
|
|
207
285
|
return { value: item, done: false };
|
|
208
286
|
}
|
|
209
287
|
if (this.destroyed) return { done: true, value: undefined };
|
|
@@ -224,6 +302,11 @@ class FileBackedQueue {
|
|
|
224
302
|
}
|
|
225
303
|
}
|
|
226
304
|
|
|
305
|
+
__wakeAll() {
|
|
306
|
+
const toWake = this.waiters.splice(0);
|
|
307
|
+
for (const w of toWake) w.resolve();
|
|
308
|
+
}
|
|
309
|
+
|
|
227
310
|
__dispatchDrop(reason, size) {
|
|
228
311
|
try {
|
|
229
312
|
this.onDrop?.(reason, size);
|
|
@@ -237,20 +320,32 @@ class FileBackedQueue {
|
|
|
237
320
|
async __openWriteStream() { // Prepare the parent directory, remove any existing file, then open an append-mode write stream on this.filePath.
|
|
238
321
|
this.writeErr = null;
|
|
239
322
|
try {
|
|
240
|
-
|
|
323
|
+
// Directory 0o700 / file 0o600: POSIX best-effort.
|
|
324
|
+
// - Newly created directories/files are created with this mode (then filtered by umask)
|
|
325
|
+
// - mkdir(recursive) does not chmod an already-existing directory to tighten it; that directory's original permissions stand
|
|
326
|
+
// - On Windows the mode argument has very weak semantics (no owner/group/other concept); actual access control relies on the parent directory's NTFS ACL
|
|
327
|
+
// Still more conservative than the default 0o644; atomic-write.js follows the same policy.
|
|
328
|
+
await fs.mkdir(nodePath.dirname(this.filePath), { recursive: true, mode: 0o700 });
|
|
329
|
+
// Authoritative leftover cleanup: even if the rm in init was swallowed, rm once more here before opening the stream,
|
|
330
|
+
// so the 'a' flag never appends onto stale data and pollutes the FIFO.
|
|
331
|
+
await fs.rm(this.filePath, { force: true });
|
|
241
332
|
} catch (err) {
|
|
242
333
|
this.writeErr = err;
|
|
243
334
|
return; // setup failed: the error is recorded in writeErr and no stream is opened
|
|
244
335
|
}
|
|
245
|
-
this.writeStream = createWriteStream(this.filePath, { flags: 'a' });
|
|
336
|
+
this.writeStream = createWriteStream(this.filePath, { flags: 'a', mode: 0o600 });
|
|
246
337
|
this.writeStream.on('error', (err) => {
|
|
247
338
|
this.writeErr = err;
|
|
248
339
|
this.logger?.warn?.('fbq.writeStream error', err);
|
|
340
|
+
// Asynchronous error: queue the sticky-degradation cleanup on the mutex so the state is not left stuck half-updated
|
|
341
|
+
this.mutex.withLock(() => this.__handleFsError(err)).catch(() => {});
|
|
249
342
|
});
|
|
250
343
|
}
|
|
251
344
|
|
|
252
345
|
async __writeLine(str) {
|
|
253
|
-
|
|
346
|
+
// 不再前置 writeErr 检查:一旦 writeErr 被异步设置,__handleFsError 会立即排队清理并
|
|
347
|
+
// 把 fsBroken 置粘性;spill 路径入口已判 fsBroken,到这里 writeErr 必为 null。
|
|
348
|
+
// 写失败通过 write 回调的 err 反映,catch 块处理。
|
|
254
349
|
return await new Promise((resolve, reject) => {
|
|
255
350
|
this.writeStream.write(str, (err) => {
|
|
256
351
|
if (err) reject(err);
|
|
@@ -278,6 +373,25 @@ class FileBackedQueue {
|
|
|
278
373
|
});
|
|
279
374
|
}
|
|
280
375
|
|
|
376
|
+
// Called inside the mutex: sticky degradation on FS errors (marks fsBroken, closes the write stream, removes the file, resets spill bookkeeping, wakes all waiters).
|
|
377
|
+
async __handleFsError(_err) {
|
|
378
|
+
if (this.destroyed || this.fsBroken) return; // no-op once destroyed or already degraded
|
|
379
|
+
this.fsBroken = true;
|
|
380
|
+
await this.__closeWriteStream();
|
|
381
|
+
try {
|
|
382
|
+
await fs.rm(this.filePath, { force: true });
|
|
383
|
+
} catch (err) {
|
|
384
|
+
/* c8 ignore next 2 -- rm with force rarely fails */
|
|
385
|
+
this.logger?.warn?.('fbq.handleFsError rm error', err);
|
|
386
|
+
}
|
|
387
|
+
this.spilled = false;
|
|
388
|
+
this.writtenBytes = 0;
|
|
389
|
+
this.readOffset = 0;
|
|
390
|
+
this.writeErr = null;
|
|
391
|
+
// Wake every consumer so they re-observe the state
|
|
392
|
+
this.__wakeAll();
|
|
393
|
+
}
|
|
394
|
+
|
|
281
395
|
// 调用方必须已持有 mutex,且已确认 !destroyed
|
|
282
396
|
async __refillImpl() {
|
|
283
397
|
if (!this.spilled) return;
|
|
@@ -287,8 +401,10 @@ class FileBackedQueue {
|
|
|
287
401
|
const st = await fs.stat(this.filePath);
|
|
288
402
|
actualEnd = st.size;
|
|
289
403
|
} catch (err) {
|
|
290
|
-
|
|
404
|
+
// 读侧 FS 错误(外部删文件、权限丢失等)走粘性降级,
|
|
405
|
+
// 避免 spilled=true / fsBroken=false 的悬空态让消费者永远挂 waiter。
|
|
291
406
|
this.logger?.warn?.('fbq.refill stat error', err);
|
|
407
|
+
await this.__handleFsError(err);
|
|
292
408
|
return;
|
|
293
409
|
}
|
|
294
410
|
|
|
@@ -302,6 +418,9 @@ class FileBackedQueue {
|
|
|
302
418
|
let cumPayload = 0; // 仅 payload
|
|
303
419
|
let stoppedAtEof = true;
|
|
304
420
|
|
|
421
|
+
const pendingCount = this.memQueue.length - this.head;
|
|
422
|
+
const baseCost = this.memBytes + pendingCount * ENTRY_OVERHEAD;
|
|
423
|
+
|
|
305
424
|
const stream = createReadStream(this.filePath, {
|
|
306
425
|
start: this.readOffset,
|
|
307
426
|
end: actualEnd - 1,
|
|
@@ -311,7 +430,9 @@ class FileBackedQueue {
|
|
|
311
430
|
try {
|
|
312
431
|
for await (const line of rl) {
|
|
313
432
|
const sz = Buffer.byteLength(line, 'utf8');
|
|
314
|
-
|
|
433
|
+
// overhead 一致性:admission 侧已用 overhead,refill 侧同步考虑
|
|
434
|
+
const newLinesCost = newLines.length * ENTRY_OVERHEAD;
|
|
435
|
+
if (newLines.length > 0 && baseCost + cumPayload + newLinesCost + sz + ENTRY_OVERHEAD > this.memBudget) {
|
|
315
436
|
stoppedAtEof = false;
|
|
316
437
|
break;
|
|
317
438
|
}
|
|
@@ -320,10 +441,12 @@ class FileBackedQueue {
|
|
|
320
441
|
cumPayload += sz;
|
|
321
442
|
}
|
|
322
443
|
} catch (err) {
|
|
323
|
-
/* c8 ignore next
|
|
444
|
+
/* c8 ignore next 6 -- read 错误极罕见(stat 已通过、fd 已打开),路径保留用于粘性降级 */
|
|
445
|
+
// read 错误同 stat:统一走粘性降级而非静默 return
|
|
324
446
|
this.logger?.warn?.('fbq.refill read error', err);
|
|
325
447
|
rl.close();
|
|
326
448
|
stream.destroy();
|
|
449
|
+
await this.__handleFsError(err);
|
|
327
450
|
return;
|
|
328
451
|
} finally {
|
|
329
452
|
rl.close();
|
|
@@ -349,7 +472,6 @@ class FileBackedQueue {
|
|
|
349
472
|
this.memQueue.push(line);
|
|
350
473
|
this.memBytes += Buffer.byteLength(line, 'utf8');
|
|
351
474
|
}
|
|
352
|
-
this.diskBytes -= cumPayload;
|
|
353
475
|
|
|
354
476
|
if (this.readOffset >= this.writtenBytes) {
|
|
355
477
|
await this.__dropFile();
|
|
@@ -367,7 +489,6 @@ class FileBackedQueue {
|
|
|
367
489
|
this.spilled = false;
|
|
368
490
|
this.writtenBytes = 0;
|
|
369
491
|
this.readOffset = 0;
|
|
370
|
-
this.diskBytes = 0;
|
|
371
492
|
this.writeErr = null;
|
|
372
493
|
}
|
|
373
494
|
}
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
import { remoteLog as defaultRemoteLog } from '../remote-log.js';
|
|
2
2
|
|
|
3
|
-
const DEFAULT_START_TIMEOUT_MS = 10_000;
|
|
3
|
+
const DEFAULT_IPC_REQUEST_TIMEOUT_MS = 20_000; // 20 s; used when deps.ipcRequestTimeout is not supplied
|
|
4
|
+
|
|
5
|
+
// Matches logs that pion-node treats internally as severe: IPC request timeouts, and late responses from the Go side (the main request was already rejected, so the response is orphaned)
|
|
6
|
+
const SEVERE_LOG_PATTERN = /request timeout|orphan response/;
|
|
4
7
|
|
|
5
8
|
/**
|
|
6
9
|
* 预加载 Pion WebRTC 实现:启动 pion-ipc Go 进程,返回绑定了 ipc 的 PeerConnection。
|
|
@@ -13,13 +16,15 @@ const DEFAULT_START_TIMEOUT_MS = 10_000;
|
|
|
13
16
|
* @param {object} [deps] - 可注入依赖(测试用)
|
|
14
17
|
* @param {Function} [deps.dynamicImport] - (specifier) => import(specifier)
|
|
15
18
|
* @param {Function} [deps.remoteLog] - (text) => void
|
|
16
|
-
* @param {
|
|
19
|
+
* @param {object} [deps.logger] - plugin 本地 pino-style logger(.info/.warn/.error),用于本地调试可见性
|
|
20
|
+
* @param {number} [deps.ipcRequestTimeout] - 每次 IPC 请求的超时(ms,也用于启动 ping),默认 20s
|
|
17
21
|
* @returns {Promise<{ PeerConnection: Function, cleanup: Function, impl: string, ipc: object }|null>}
|
|
18
22
|
*/
|
|
19
23
|
export async function preloadPion(deps = {}) {
|
|
20
24
|
const log = deps.remoteLog ?? defaultRemoteLog;
|
|
25
|
+
const localLogger = deps.logger ?? null;
|
|
21
26
|
const dynamicImport = deps.dynamicImport ?? ((spec) => import(spec));
|
|
22
|
-
const
|
|
27
|
+
const ipcRequestTimeout = deps.ipcRequestTimeout ?? DEFAULT_IPC_REQUEST_TIMEOUT_MS;
|
|
23
28
|
|
|
24
29
|
log('pion.preload');
|
|
25
30
|
|
|
@@ -42,9 +47,18 @@ export async function preloadPion(deps = {}) {
|
|
|
42
47
|
}
|
|
43
48
|
|
|
44
49
|
// 启动 IPC 进程(内部会 ping 验证就绪,binary 由 pion-node 自动解析)
|
|
50
|
+
// logger 回调双打:始终走 remoteLog;同时送本地 logger,严重事件(IPC 超时、orphan 响应)
|
|
51
|
+
// 升级到 error 级别,便于本地调试时一眼可见;其他运维类消息走 info。
|
|
45
52
|
ipc = new PionIpc({
|
|
46
|
-
logger: (msg) =>
|
|
47
|
-
|
|
53
|
+
logger: (msg) => {
|
|
54
|
+
log(`pion.ipc ${msg}`);
|
|
55
|
+
if (SEVERE_LOG_PATTERN.test(msg)) {
|
|
56
|
+
localLogger?.error?.(`[pion-ipc] ${msg}`);
|
|
57
|
+
} else {
|
|
58
|
+
localLogger?.info?.(`[pion-ipc] ${msg}`);
|
|
59
|
+
}
|
|
60
|
+
},
|
|
61
|
+
timeout: ipcRequestTimeout,
|
|
48
62
|
autoRestart: true,
|
|
49
63
|
});
|
|
50
64
|
|