@coclaw/openclaw-coclaw 0.17.4 → 0.17.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -14,6 +14,15 @@ const INITIAL_DELAY_MS = 60 * 60 * 1000; // 60 分钟
|
|
|
14
14
|
const CHECK_INTERVAL_MS = 60 * 60 * 1000; // 1 小时
|
|
15
15
|
const CHANNEL_ID = 'coclaw';
|
|
16
16
|
const LOCK_FILENAME = 'upgrade.lock';
|
|
17
|
+
// 锁年龄兜底:worker 最坏耗时约 36 分钟,TTL 给到约 3 倍余量。
|
|
18
|
+
// 超龄一律视为过期清理,兜住 worker 被强杀未清锁 / PID 被 OS 复用给长命进程的场景,
|
|
19
|
+
// 避免自动升级被永久卡住。
|
|
20
|
+
// 刻意取 110 分钟而非 120 分钟:巡检间隔 60min,锁写入与巡检有秒级抖动;
|
|
21
|
+
// 若 TTL 正好等于巡检间隔的整数倍,锁年龄会在第 N 次巡检时刚好 "未过期",
|
|
22
|
+
// 要等到第 N+1 次巡检才清,白白多浪费一轮。110min 保证第 2 次巡检即过期。
|
|
23
|
+
// 代价是 worker 真卡超 110 分钟会多起一个并行 worker,此概率在当前超时矩阵下极低,
|
|
24
|
+
// 且底层升级命令失败会走回滚,不会破坏插件。
|
|
25
|
+
const LOCK_TTL_MS = 110 * 60 * 1000; // 110 分钟
|
|
17
26
|
|
|
18
27
|
// ── upgrade.lock:保证同时最多一个 worker 进程 ──
|
|
19
28
|
|
|
@@ -21,10 +30,34 @@ export function getLockPath() {
|
|
|
21
30
|
return nodePath.join(resolveStateDir(), CHANNEL_ID, LOCK_FILENAME);
|
|
22
31
|
}
|
|
23
32
|
|
|
33
|
+
/**
 * Best-effort removal of a stale upgrade lock file.
 *
 * On success an info line "Stale lock removed" is logged. On failure a warning is
 * logged locally and the failure is also reported through remoteLog, so that
 * systemic problems (the original note lists permissions, read-only filesystems,
 * or the path having been replaced by a directory) do not go unnoticed. The
 * original note also warns that such a failure, combined with a same-cause failure
 * when writing the lock, could leave the lock stuck in a loop of "judged stale on
 * every round, but no new pid can be written".
 *
 * `fs.rm` is called with `{ force: true }`, which does not reject merely because
 * the file is missing, so a rejection from it indicates a genuine failure.
 * Rejections of the removal are handled here rather than propagated to the caller.
 *
 * @param {string} lockPath - path of the lock file to delete
 * @param {'missing-pid'|'ttl-exceeded'|'pid-dead'} reason - cleanup reason token;
 *   also emitted as the `reason=` key=value field of the remoteLog line
 * @param {object} [logger] - optional logger; `info` / `warn` are used when present
 * @returns {Promise<void>}
 */
async function removeStaleLock(lockPath, reason, logger) {
	try {
		await fs.rm(lockPath, { force: true });
		// Kept inside the try so the success line is only emitted after rm resolved.
		logger?.info?.(`[auto-upgrade] Stale lock removed (${reason})`);
		return;
	} catch (cause) {
		const detail = cause?.message;
		logger?.warn?.(`[auto-upgrade] Stale lock removal failed (${reason}): ${detail}`);
		remoteLog(`upgrade.lock-cleanup-failed reason=${reason} msg=${detail}`);
	}
}
|
|
56
|
+
|
|
24
57
|
/**
|
|
25
58
|
* 检查升级锁是否被持有(worker 进程是否存活)
|
|
26
59
|
*
|
|
27
|
-
*
|
|
60
|
+
* 若锁文件存在但判定为过期(PID 已死 / JSON 无效 / 超龄),顺手清理残留文件。
|
|
28
61
|
* @param {object} [opts]
|
|
29
62
|
* @param {object} [opts.logger]
|
|
30
63
|
* @returns {Promise<boolean>}
|
|
@@ -42,8 +75,14 @@ export async function isUpgradeLocked(opts) {
|
|
|
42
75
|
try {
|
|
43
76
|
const lock = JSON.parse(raw);
|
|
44
77
|
if (!lock.pid) {
|
|
45
|
-
|
|
46
|
-
|
|
78
|
+
await removeStaleLock(lockPath, 'missing-pid', logger);
|
|
79
|
+
return false;
|
|
80
|
+
}
|
|
81
|
+
// 超龄兜底:PID 复用误判、worker 被强杀未清锁等场景下一律视为过期。
|
|
82
|
+
// ts 不可解析也当过期(writeUpgradeLock 必写 ISO 时间戳,缺字段即异常状态)。
|
|
83
|
+
const lockTs = Date.parse(lock.ts);
|
|
84
|
+
if (!Number.isFinite(lockTs) || Date.now() - lockTs > LOCK_TTL_MS) {
|
|
85
|
+
await removeStaleLock(lockPath, 'ttl-exceeded', logger);
|
|
47
86
|
return false;
|
|
48
87
|
}
|
|
49
88
|
// signal 0 不发信号,仅检查进程存活性;进程不存在时抛异常
|
|
@@ -52,8 +91,7 @@ export async function isUpgradeLocked(opts) {
|
|
|
52
91
|
}
|
|
53
92
|
catch {
|
|
54
93
|
// JSON 无效 / PID 已死 → 清理过期锁
|
|
55
|
-
|
|
56
|
-
await fs.rm(lockPath, { force: true }).catch(() => {});
|
|
94
|
+
await removeStaleLock(lockPath, 'pid-dead', logger);
|
|
57
95
|
return false;
|
|
58
96
|
}
|
|
59
97
|
}
|
|
@@ -2,8 +2,12 @@
|
|
|
2
2
|
* worker-verify.js — 升级后验证
|
|
3
3
|
*
|
|
4
4
|
* 策略:触发 gateway restart → 轮询 coclaw.upgradeHealth RPC 直到返回版本
|
|
5
|
-
*
|
|
6
|
-
*
|
|
5
|
+
* ≥ toVersion(等于或更新)。单次调用失败(gateway 未就绪 / plugin 未注册 /
|
|
6
|
+
* JSON 非法 / 版本不够新)一律按"稍后重试"处理,在总超时窗口内持续尝试。
|
|
7
|
+
*
|
|
8
|
+
* 允许 > toVersion 的原因:scheduler 观察到 latest=x 并发起升级后,到实际
|
|
9
|
+
* 执行 `plugins update` 之间 npm dist-tag 可能已指向 x+1;严格等 x 会把
|
|
10
|
+
* 这种"升级到了更新版本"误判为失败并回滚。
|
|
7
11
|
*
|
|
8
12
|
* 磁盘 package.json 的版本仅作为诊断写入本地日志,不参与判定——openclaw 侧
|
|
9
13
|
* `plugins.installs[id].installPath` 可能在 id-migration 等极端场景发生漂移,
|
|
@@ -16,6 +20,23 @@ import { execFile as nodeExecFile } from 'node:child_process';
|
|
|
16
20
|
import { readFile } from 'node:fs/promises';
|
|
17
21
|
import nodePath from 'node:path';
|
|
18
22
|
|
|
23
|
+
// Version comparison used by the post-upgrade verification. Per the original note it
// started as a copy of the logic in updater-check.js: the worker runs in its own child
// process and does not reuse gateway modules across processes. This copy additionally
// orders pre-release identifiers, ignores build metadata and tolerates non-string input.
/**
 * Returns true when version `a` has strictly higher semver precedence than `b`.
 *
 * - The numeric core (major.minor.patch) is compared first; missing segments count as 0.
 * - With an equal core, a release outranks a pre-release.
 * - With an equal core and two pre-releases, the dot-separated identifiers are compared
 *   left to right: numeric identifiers numerically, alphanumeric ones in ASCII order,
 *   numeric identifiers rank below alphanumeric ones, and a longer identifier list wins
 *   when every shared identifier is equal.
 * - Build metadata (`+...`) is ignored.
 * - Non-string input yields false instead of throwing, so callers can treat a missing
 *   version as "not new enough".
 *
 * @param {string} a - candidate version
 * @param {string} b - reference version
 * @returns {boolean} true only if `a` is strictly newer than `b`
 */
function isNewerVersion(a, b) {
	if (typeof a !== 'string' || typeof b !== 'string') return false;

	// Split "x.y.z-pre+build" into numeric core segments and pre-release identifiers.
	const dissect = (v) => {
		const noBuild = v.split('+')[0];
		const dash = noBuild.indexOf('-');
		const core = dash === -1 ? noBuild : noBuild.slice(0, dash);
		return {
			nums: core.split('.').map(Number),
			pre: dash === -1 ? null : noBuild.slice(dash + 1).split('.'),
		};
	};
	const va = dissect(a);
	const vb = dissect(b);

	for (let i = 0; i < 3; i++) {
		/* c8 ignore next 2 -- ?? fallback: well-formed semver never has a missing segment */
		if ((va.nums[i] ?? 0) > (vb.nums[i] ?? 0)) return true;
		if ((va.nums[i] ?? 0) < (vb.nums[i] ?? 0)) return false;
	}

	// Same x.y.z: a release outranks a pre-release; two releases are equal.
	if (va.pre === null || vb.pre === null) {
		return va.pre === null && vb.pre !== null;
	}

	// Both are pre-releases of the same x.y.z: compare identifier by identifier.
	const isNumeric = (id) => /^\d+$/.test(id);
	const count = Math.max(va.pre.length, vb.pre.length);
	for (let i = 0; i < count; i++) {
		const x = va.pre[i];
		const y = vb.pre[i];
		if (x === undefined) return false; // a ran out first -> a is lower
		if (y === undefined) return true; // b ran out first -> a is higher
		if (x === y) continue;
		const xNum = isNumeric(x);
		const yNum = isNumeric(y);
		if (xNum && yNum) {
			if (Number(x) !== Number(y)) return Number(x) > Number(y);
			continue; // e.g. "01" vs "1": numerically equal, keep going
		}
		if (xNum !== yNum) return yNum; // numeric identifiers rank below alphanumeric
		return x > y; // both alphanumeric: ASCII order
	}
	return false;
}
|
|
39
|
+
|
|
19
40
|
const CMD_TIMEOUT_MS = 30_000;
|
|
20
41
|
const HEALTH_POLL_INTERVAL_MS = 3_000;
|
|
21
42
|
// 本机 openclaw 冷启动可能需访问外部资源(AWS 诊断、ollama 探测等)
|
|
@@ -113,7 +134,7 @@ async function callUpgradeHealthOnce(opts) {
|
|
|
113
134
|
}
|
|
114
135
|
|
|
115
136
|
/**
|
|
116
|
-
* 轮询 upgradeHealth
|
|
137
|
+
* 轮询 upgradeHealth 直到版本 ≥ toVersion,或总超时
|
|
117
138
|
* @param {string} toVersion
|
|
118
139
|
* @param {object} [opts]
|
|
119
140
|
* @param {Function} [opts.execFileFn]
|
|
@@ -137,7 +158,8 @@ export async function pollUpgradeHealth(toVersion, opts) {
|
|
|
137
158
|
attempts += 1;
|
|
138
159
|
const result = await callUpgradeHealthOnce(opts);
|
|
139
160
|
if (result.ok) {
|
|
140
|
-
|
|
161
|
+
// 等于或更新均视为成功,覆盖"升级窗口期 dist-tag 前移"的情形
|
|
162
|
+
if (result.version === toVersion || isNewerVersion(result.version, toVersion)) {
|
|
141
163
|
return {
|
|
142
164
|
ok: true,
|
|
143
165
|
version: result.version,
|
|
@@ -146,7 +168,7 @@ export async function pollUpgradeHealth(toVersion, opts) {
|
|
|
146
168
|
};
|
|
147
169
|
}
|
|
148
170
|
lastVersion = result.version;
|
|
149
|
-
lastReason = `version-
|
|
171
|
+
lastReason = `version-too-old got=${result.version} want>=${toVersion}`;
|
|
150
172
|
}
|
|
151
173
|
else {
|
|
152
174
|
lastReason = result.reason;
|
|
@@ -22,6 +22,10 @@ import { getCurrentNpmRegistry, pickFallbackRegistry } from './registry-fallback
|
|
|
22
22
|
const SEMVER_RE = /^\d+\.\d+\.\d+(-[\w.-]+)?$/;
|
|
23
23
|
// 单次 plugins update 上限:包含 npm install 大型 native deps,慢网络 + 弱机器需较长时间
|
|
24
24
|
const UPDATE_TIMEOUT_MS = 10 * 60 * 1000;
|
|
25
|
+
// 回滚兜底重装旧版本走的是同一条 npm 下载链路,且触发前置本身是"备份已丢"的异常态,
|
|
26
|
+
// 此时尽量兜住比快速失败更重要,与 UPDATE_TIMEOUT_MS 对齐
|
|
27
|
+
const FALLBACK_INSTALL_TIMEOUT_MS = 10 * 60 * 1000;
|
|
28
|
+
const FALLBACK_UNINSTALL_TIMEOUT_MS = 60 * 1000;
|
|
25
29
|
|
|
26
30
|
/**
|
|
27
31
|
* 执行 openclaw plugins update
|
|
@@ -73,7 +77,7 @@ async function fallbackInstallOldVersion(pkgName, version, pluginId, opts) {
|
|
|
73
77
|
}
|
|
74
78
|
/* c8 ignore next -- ?./?? fallback */
|
|
75
79
|
const doExecFile = opts?.execFileFn ?? nodeExecFile;
|
|
76
|
-
const run = (args, timeout
|
|
80
|
+
const run = (args, timeout) => new Promise((resolve, reject) => {
|
|
77
81
|
doExecFile('openclaw', args, { timeout, shell: process.platform === 'win32' }, (err) => {
|
|
78
82
|
if (err) reject(err);
|
|
79
83
|
else resolve();
|
|
@@ -82,13 +86,13 @@ async function fallbackInstallOldVersion(pkgName, version, pluginId, opts) {
|
|
|
82
86
|
|
|
83
87
|
// 先卸载:install 不支持覆盖已安装插件
|
|
84
88
|
try {
|
|
85
|
-
await run(['plugins', 'uninstall', pluginId],
|
|
89
|
+
await run(['plugins', 'uninstall', pluginId], FALLBACK_UNINSTALL_TIMEOUT_MS);
|
|
86
90
|
} catch {
|
|
87
91
|
// uninstall 失败不阻断,继续尝试 install
|
|
88
92
|
}
|
|
89
93
|
|
|
90
94
|
try {
|
|
91
|
-
await run(['plugins', 'install', `${pkgName}@${version}`]);
|
|
95
|
+
await run(['plugins', 'install', `${pkgName}@${version}`], FALLBACK_INSTALL_TIMEOUT_MS);
|
|
92
96
|
} catch (err) {
|
|
93
97
|
throw new Error(`fallback install failed: ${err.message}`);
|
|
94
98
|
}
|
|
@@ -171,8 +175,11 @@ export async function runUpgrade({ pluginDir, fromVersion, toVersion, pluginId,
|
|
|
171
175
|
catch (e) {
|
|
172
176
|
log(`[upgrade-worker] Backup cleanup failed (non-fatal): ${e.message}`);
|
|
173
177
|
}
|
|
174
|
-
|
|
175
|
-
|
|
178
|
+
// 记录真实装上的版本而非目标版本——dist-tag 前移窗口下两者可能不同。
|
|
179
|
+
// 不加 fallback:若 result.ok 时 version 缺失,说明上游契约被破坏,
|
|
180
|
+
// 宁可让状态里直接暴露 undefined 便于排障,也不要用 toVersion 糊过去
|
|
181
|
+
await updateLastUpgrade({ from: fromVersion, to: result.version, result: 'ok' });
|
|
182
|
+
await appendLog({ from: fromVersion, to: result.version, result: 'ok' });
|
|
176
183
|
log('[upgrade-worker] Upgrade complete');
|
|
177
184
|
} else {
|
|
178
185
|
// 4b. 失败,回滚
|
package/src/realtime-bridge.js
CHANGED
|
@@ -1087,7 +1087,7 @@ export class RealtimeBridge {
|
|
|
1087
1087
|
// 1. 尝试 pion(最高优先级)
|
|
1088
1088
|
const preloadPionFn = this.__preloadPion
|
|
1089
1089
|
?? (await import('./webrtc/pion-preloader.js')).preloadPion;
|
|
1090
|
-
const pionResult = await preloadPionFn().catch((err) => {
|
|
1090
|
+
const pionResult = await preloadPionFn({ logger: this.logger }).catch((err) => {
|
|
1091
1091
|
this.logger.warn?.(`[coclaw] pion preload unexpected failure: ${err?.message}`);
|
|
1092
1092
|
return null;
|
|
1093
1093
|
});
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
import { remoteLog as defaultRemoteLog } from '../remote-log.js';
|
|
2
2
|
|
|
3
|
-
const
|
|
3
|
+
const DEFAULT_IPC_REQUEST_TIMEOUT_MS = 20_000;
|
|
4
|
+
|
|
5
|
+
// 匹配 pion-node 内部视为严重的 log:IPC 请求超时、以及 Go 侧迟到的响应(主请求已 reject,响应变孤儿)
|
|
6
|
+
const SEVERE_LOG_PATTERN = /request timeout|orphan response/;
|
|
4
7
|
|
|
5
8
|
/**
|
|
6
9
|
* 预加载 Pion WebRTC 实现:启动 pion-ipc Go 进程,返回绑定了 ipc 的 PeerConnection。
|
|
@@ -13,13 +16,15 @@ const DEFAULT_START_TIMEOUT_MS = 10_000;
|
|
|
13
16
|
* @param {object} [deps] - 可注入依赖(测试用)
|
|
14
17
|
* @param {Function} [deps.dynamicImport] - (specifier) => import(specifier)
|
|
15
18
|
* @param {Function} [deps.remoteLog] - (text) => void
|
|
16
|
-
* @param {
|
|
19
|
+
* @param {object} [deps.logger] - plugin 本地 pino-style logger(.info/.warn/.error),用于本地调试可见性
|
|
20
|
+
* @param {number} [deps.ipcRequestTimeout] - 每次 IPC 请求的超时(ms,也用于启动 ping),默认 20s
|
|
17
21
|
* @returns {Promise<{ PeerConnection: Function, cleanup: Function, impl: string, ipc: object }|null>}
|
|
18
22
|
*/
|
|
19
23
|
export async function preloadPion(deps = {}) {
|
|
20
24
|
const log = deps.remoteLog ?? defaultRemoteLog;
|
|
25
|
+
const localLogger = deps.logger ?? null;
|
|
21
26
|
const dynamicImport = deps.dynamicImport ?? ((spec) => import(spec));
|
|
22
|
-
const
|
|
27
|
+
const ipcRequestTimeout = deps.ipcRequestTimeout ?? DEFAULT_IPC_REQUEST_TIMEOUT_MS;
|
|
23
28
|
|
|
24
29
|
log('pion.preload');
|
|
25
30
|
|
|
@@ -42,9 +47,18 @@ export async function preloadPion(deps = {}) {
|
|
|
42
47
|
}
|
|
43
48
|
|
|
44
49
|
// 启动 IPC 进程(内部会 ping 验证就绪,binary 由 pion-node 自动解析)
|
|
50
|
+
// logger 回调双打:始终走 remoteLog;同时送本地 logger,严重事件(IPC 超时、orphan 响应)
|
|
51
|
+
// 升级到 error 级别,便于本地调试时一眼可见;其他运维类消息走 info。
|
|
45
52
|
ipc = new PionIpc({
|
|
46
|
-
logger: (msg) =>
|
|
47
|
-
|
|
53
|
+
logger: (msg) => {
|
|
54
|
+
log(`pion.ipc ${msg}`);
|
|
55
|
+
if (SEVERE_LOG_PATTERN.test(msg)) {
|
|
56
|
+
localLogger?.error?.(`[pion-ipc] ${msg}`);
|
|
57
|
+
} else {
|
|
58
|
+
localLogger?.info?.(`[pion-ipc] ${msg}`);
|
|
59
|
+
}
|
|
60
|
+
},
|
|
61
|
+
timeout: ipcRequestTimeout,
|
|
48
62
|
autoRestart: true,
|
|
49
63
|
});
|
|
50
64
|
|
|
@@ -83,11 +83,11 @@ export class RpcSendQueue {
|
|
|
83
83
|
if (this.queueBytes >= MAX_QUEUE_BYTES) {
|
|
84
84
|
this.droppedCount += 1;
|
|
85
85
|
this.droppedBytes += totalBytes;
|
|
86
|
-
|
|
87
|
-
//
|
|
88
|
-
// this.logger.info?.(`[rpc-queue${this.__tagSuffix()}] dropped-payload ${jsonStr}`);
|
|
86
|
+
// 仅状态翻转点打 log(warn + remoteLog 各一次);overflow 持续期间所有 drop 静默累加,
|
|
87
|
+
// 避免 UI 离线 + ICE 失败导致 DC 永远不 drain 时的日志刷屏
|
|
89
88
|
if (!this.queueOverflowActive) {
|
|
90
89
|
this.queueOverflowActive = true;
|
|
90
|
+
this.logger.warn?.(`[rpc-queue${this.__tagSuffix()}] overflow-start queueBytes=${this.queueBytes}`);
|
|
91
91
|
remoteLog(`rpc-queue.overflow-start${this.__tagSuffix()} queueBytes=${this.queueBytes}`);
|
|
92
92
|
}
|
|
93
93
|
return false;
|
|
@@ -173,9 +173,10 @@ export class RpcSendQueue {
|
|
|
173
173
|
}
|
|
174
174
|
this.queue.shift();
|
|
175
175
|
this.queueBytes -= chunk.length;
|
|
176
|
-
// 满 → 未满
|
|
176
|
+
// 满 → 未满 状态转换:打一条带累计数的 log,与 overflow-start 对称
|
|
177
177
|
if (this.queueOverflowActive && this.queueBytes < MAX_QUEUE_BYTES) {
|
|
178
178
|
this.queueOverflowActive = false;
|
|
179
|
+
this.logger.info?.(`[rpc-queue${this.__tagSuffix()}] overflow-end dropped=${this.droppedCount} droppedBytes=${this.droppedBytes}`);
|
|
179
180
|
remoteLog(`rpc-queue.overflow-end${this.__tagSuffix()} dropped=${this.droppedCount} droppedBytes=${this.droppedBytes}`);
|
|
180
181
|
}
|
|
181
182
|
}
|