coze_lab 0.1.36 → 0.1.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -8,8 +8,8 @@ Configure local AI agents (Claude Code, Codex, OpenClaw) to report traces to Coz
8
8
  # First-time setup — triggers browser OAuth authorization
9
9
  npx coze_lab --agent=<type>
10
10
 
11
- # Cloud setup for a managed agent
12
- npx coze_lab --cloud --agent-id=<agentId>
11
+ # Per-agent setup. Cloud/local is inferred from coze-bridge config.
12
+ npx coze_lab --agent-id=<agentId>
13
13
 
14
14
  # Auth-only commands (no agent configuration)
15
15
  npx coze_lab --login # Device Code login only
@@ -23,8 +23,8 @@ npx coze_lab --logout # Clear cached credentials
23
23
  | Parameter | Required | Values / Effect |
24
24
  |-----------|----------|-----------------|
25
25
  | `--agent` | ✓ (for setup) | `claude-code`, `codex`, `openclaw` |
26
- | `--agent-id` | — | Resolve `~/.coze/agents/<agentId>/config.json` and write per-agent config |
27
- | `--cloud` | — | Cloud mode: read trace token from env and emit `COZE_LAB_RESULT=...` |
26
+ | `--agent-id` | — | Resolve `~/.coze/agents/<agentId>/config.json` and write per-agent config. `deployType=cloud` enables cloud mode automatically |
27
+ | `--cloud` | — | Backward-compatible override for old callers. New callers should rely on `--agent-id` + config `deployType` |
28
28
  | `--codex-home` | — | Override Codex config home for non-cloud/custom runs |
29
29
  | `--login` | — | Run the Device Code login flow only |
30
30
  | `--status` | — | Print local token status (valid / expiring / expired) |
@@ -56,7 +56,7 @@ npx coze_lab --logout # Clear cached credentials
56
56
  | `codex` | `~/.codex/hooks/cozeloop_hook.py` | `~/.codex/hooks.json` | `~/.codex/hooks/cozeloop.env` |
57
57
  | `openclaw` | — (Node.js plugin) | `~/.openclaw/openclaw.json` | inline in config |
58
58
 
59
- For cloud Codex with `--cloud --agent-id=<agentId>`, Codex hooks are written to
59
+ For cloud Codex with `--agent-id=<agentId>` and config `deployType=cloud`, Codex hooks are written to
60
60
  `~/.coze/agents/<agentId>/codex-home` by default. The directory is created if it
61
61
  does not already exist, so callers do not need to pass `--codex-home` for the
62
62
  standard coze-bridge layout.
package/index.js CHANGED
@@ -11,7 +11,7 @@ const PACKAGE_VERSION = require('./package.json').version;
11
11
  const REFRESH_THRESHOLD_MS = 10 * 60 * 1000;
12
12
 
13
13
  // ─── 1. Cloud structured output ──────────────────────────────────────────────
14
- // 云端(--cloud)模式:在 stdout 输出一行机器可读结果 COZE_LAB_RESULT={...},
14
+ // 云端模式:在 stdout 输出一行机器可读结果 COZE_LAB_RESULT={...},
15
15
  // 供管理后台解析判定(inject/verify/logid/message),不依赖中文文案。
16
16
  let CLOUD_MODE = false;
17
17
  const cloudResult = { version: PACKAGE_VERSION, inject: 'skip', verify: 'skip', logid: '', message: '', token_source: '' };
@@ -156,7 +156,7 @@ function parseArgs() {
156
156
 
157
157
  const VALID_AGENTS = ['claude-code', 'codex', 'openclaw'];
158
158
 
159
- // resolveAgent 读 ~/.coze/agents/<agentId>/config.json,返回 { framework, workspace, agentId, root }。
159
+ // resolveAgent 读 ~/.coze/agents/<agentId>/config.json,返回 { framework, workspace, deployType, agentId, root }。
160
160
  // soft=true 时,config 不存在 / 解析失败 / framework 非法均返回 null(不退出),供云端回退到显式 --agent。
161
161
  function resolveAgent(agentId, soft) {
162
162
  const root = path.join(os.homedir(), '.coze', 'agents', agentId);
@@ -186,7 +186,8 @@ function resolveAgent(agentId, soft) {
186
186
  `支持的类型: ${VALID_AGENTS.join(', ')}`,
187
187
  ]);
188
188
  }
189
- return { framework, workspace: cfg.workspace || '', agentId, root };
189
+ const deployType = cfg.deployType === 'cloud' ? 'cloud' : 'local';
190
+ return { framework, workspace: cfg.workspace || '', deployType, agentId, root };
190
191
  }
191
192
 
192
193
  function validateArgs(args) {
@@ -196,28 +197,32 @@ function validateArgs(args) {
196
197
  if (args['refresh']) return { refresh: true };
197
198
  if (args['verify']) return { verify: true, pairCode: args['pair-code'] };
198
199
 
199
- // --cloud + --agent-id:优先读云端 ~/.coze/agents/<id>/config.json 拿 framework + workspace
200
- // (云端 config.json 稳定存在);读不到再回退到命令行显式 --agent(workspace main 推断)。
201
- if (args['cloud'] && args['agent-id']) {
202
- const resolved = resolveAgent(args['agent-id'], true /* soft */);
200
+ // --agent-id:优先读 coze-bridge 的 ~/.coze/agents/<id>/config.json 拿 framework/workspace/deployType。
201
+ // deployType=cloud 时自动进入云端注入路径;显式 --cloud 仍保留,供 config 缺失时兜底。
202
+ if (args['agent-id']) {
203
+ const explicitCloud = !!args['cloud'];
204
+ const resolved = resolveAgent(args['agent-id'], explicitCloud /* soft */);
203
205
  if (resolved) {
206
+ const cloud = explicitCloud || resolved.deployType === 'cloud';
204
207
  return {
205
208
  agent: resolved.framework,
206
209
  agentId: resolved.agentId,
207
210
  workspace: resolved.workspace,
208
211
  agentRoot: resolved.root,
212
+ deployType: resolved.deployType,
209
213
  'codex-home': args['codex-home'],
210
214
  pairCode: args['pair-code'],
211
- cloud: true,
215
+ cloud,
212
216
  force: !!args['force'],
213
217
  };
214
218
  }
215
- // config.json 缺失:回退到显式 --agent
219
+ // 显式 --cloud 且 config.json 缺失:回退到显式 --agent(workspace 在 main 推断)。
216
220
  if (!args['agent'] || !VALID_AGENTS.includes(args['agent'])) {
217
221
  errorBox([
218
222
  `ERROR: 未找到 agent "${args['agent-id']}" 的 config.json,且未显式指定 --agent`,
219
223
  '',
220
- '请确认 agentId 正确,或显式拼上 framework:',
224
+ '新调用方应确认 coze-bridge 已在目标环境写入该 agent config。',
225
+ '如需兼容旧手工命令,可显式拼上 framework 和 --cloud:',
221
226
  ` npx coze_lab --cloud --agent-id=${args['agent-id']} --agent=claude-code|codex|openclaw`,
222
227
  ]);
223
228
  }
@@ -225,6 +230,7 @@ function validateArgs(args) {
225
230
  agent: args['agent'],
226
231
  agentId: args['agent-id'],
227
232
  workspace: args['workspace'] || '',
233
+ deployType: 'cloud',
228
234
  'codex-home': args['codex-home'],
229
235
  pairCode: args['pair-code'],
230
236
  cloud: true,
@@ -232,21 +238,6 @@ function validateArgs(args) {
232
238
  };
233
239
  }
234
240
 
235
- // 本地 --agent-id:读 ~/.coze/agents/<id>/config.json 的 framework 自动路由。
236
- if (args['agent-id']) {
237
- const resolved = resolveAgent(args['agent-id']);
238
- return {
239
- agent: resolved.framework,
240
- agentId: resolved.agentId,
241
- workspace: resolved.workspace,
242
- agentRoot: resolved.root,
243
- 'codex-home': args['codex-home'],
244
- pairCode: args['pair-code'],
245
- cloud: !!args['cloud'],
246
- force: !!args['force'],
247
- };
248
- }
249
-
250
241
  if (!args['agent']) {
251
242
  errorBox([
252
243
  'ERROR: --agent 或 --agent-id 至少提供一个',
@@ -377,13 +368,19 @@ function checkPython() {
377
368
  const COZELOOP_MIN_SPEC = 'cozeloop>=0.1.28';
378
369
  // 探测脚本:import 成功且具备 set_finish_time 能力 → exit 0;否则非 0。
379
370
  const COZELOOP_CAPABLE_PROBE = `import cozeloop,sys; sys.exit(0 if hasattr(cozeloop.Span,'set_finish_time') else 3)`;
380
- function checkCozeloopSdk(pythonCmd) {
371
+ function checkCozeloopSdk(pythonCmd, options = {}) {
372
+ const cloud = !!options.cloud;
381
373
  try {
382
374
  execSync(`${pythonCmd} -c "${COZELOOP_CAPABLE_PROBE}"`, { stdio: 'pipe' });
383
375
  ok('cozeloop SDK — OK');
384
376
  return;
385
377
  } catch { /* 未装或版本过旧 — 下面安装/升级 */ }
386
378
 
379
+ if (cloud) {
380
+ warn(`cozeloop SDK 不可用或版本过旧,云端注入阶段跳过 pip 安装;Hook 运行时会在当前解释器内尝试安装/升级 ${COZELOOP_MIN_SPEC}`);
381
+ return;
382
+ }
383
+
387
384
  info(`cozeloop SDK 不可用或版本过旧,正在安装/升级 (pip install -U '${COZELOOP_MIN_SPEC}')...`);
388
385
  try {
389
386
  execSync(`${pythonCmd} -m pip install --quiet --upgrade '${COZELOOP_MIN_SPEC}'`, { stdio: 'pipe' });
@@ -1128,6 +1125,84 @@ except Exception as e:
1128
1125
  return { success, status: result.code || 0, body, traceId: '', pairCode: pair, apiBaseUrl: apiBase, tokenSource, logid: parsed?.logid || '' };
1129
1126
  }
1130
1127
 
1128
+ async function verifyCloudTraceReport(token, workspaceId, pairCode, tokenSource) {
1129
+ const apiBase = getCozeloopSdkApiBaseUrl(true);
1130
+ const tracesUrl = getCozeloopIngestUrlFromBase(apiBase || COZE_API);
1131
+ const traceId = crypto.randomBytes(16).toString('hex');
1132
+ const spanId = crypto.randomBytes(8).toString('hex');
1133
+ const nowMicros = Date.now() * 1000;
1134
+ const pair = pairCode || crypto.randomBytes(6).toString('hex');
1135
+ const ingestBody = {
1136
+ spans: [{
1137
+ started_at_micros: nowMicros,
1138
+ log_id: '',
1139
+ span_id: spanId,
1140
+ parent_id: '0',
1141
+ trace_id: traceId,
1142
+ duration_micros: 1,
1143
+ service_name: '',
1144
+ workspace_id: String(workspaceId),
1145
+ span_name: 'cozelab-onboard-selfcheck',
1146
+ span_type: 'main',
1147
+ status_code: 0,
1148
+ input: 'cozelab-onboard selfcheck',
1149
+ output: 'ok',
1150
+ object_storage: '',
1151
+ system_tags_string: {
1152
+ runtime: JSON.stringify({
1153
+ language: 'nodejs',
1154
+ library: 'coze_lab',
1155
+ scene: process.env.COZELOOP_SCENE || 'custom',
1156
+ loop_sdk_version: `coze_lab@${PACKAGE_VERSION}`,
1157
+ }),
1158
+ },
1159
+ system_tags_long: {},
1160
+ system_tags_double: {},
1161
+ tags_string: {
1162
+ pair_code: pair,
1163
+ source: 'cozelab-onboard',
1164
+ token_source: tokenSource || '',
1165
+ },
1166
+ tags_long: {},
1167
+ tags_double: {},
1168
+ tags_bool: {},
1169
+ }],
1170
+ };
1171
+
1172
+ info(`trace 上报 URL: ${tracesUrl} (cloud fast ingest, api_base_url=${apiBase || '(default)'})`);
1173
+ try {
1174
+ const res = await httpsPost(
1175
+ tracesUrl,
1176
+ ingestBody,
1177
+ {
1178
+ Authorization: `Bearer ${token}`,
1179
+ 'User-Agent': `coze_lab/${PACKAGE_VERSION} node/${process.versions.node}`,
1180
+ 'X-Coze-Client-User-Agent': JSON.stringify({
1181
+ version: PACKAGE_VERSION,
1182
+ lang: 'nodejs',
1183
+ lang_version: process.versions.node,
1184
+ os_name: process.platform,
1185
+ scene: 'cozeloop',
1186
+ source: 'openapi',
1187
+ }),
1188
+ },
1189
+ );
1190
+ const success = res.status >= 200 && res.status < 300;
1191
+ if (success) {
1192
+ ok(`trace 上报成功 (traceId=${traceId}, pair_code=${pair})`);
1193
+ info(`查询方可用 pair_code=${pair} 在 CozeLoop 回查确认该 trace 已落库。`);
1194
+ } else {
1195
+ warn(`trace 上报失败: HTTP ${res.status}`);
1196
+ const snippet = (res.body || '').slice(0, 300);
1197
+ if (snippet) console.log(snippet);
1198
+ }
1199
+ return { success, status: res.status, body: res.body || '', traceId, pairCode: pair, apiBaseUrl: apiBase, tokenSource, logid: extractLogid(res.body || '') };
1200
+ } catch (e) {
1201
+ warn(`trace 上报失败: ${e.message}`);
1202
+ return { success: false, status: 0, body: e.message, traceId, pairCode: pair, apiBaseUrl: apiBase, tokenSource, logid: extractLogid(e.message) };
1203
+ }
1204
+ }
1205
+
1131
1206
  // 真实发一条最小 OTLP trace 到 CozeLoop,验证上报链路是否打通。
1132
1207
  // 只看 HTTP 状态码(2xx=通),不回查 trace 是否落库——回查由外部查询方完成。
1133
1208
  // pairCode 写进 span 的 pair_code attribute,供查询方按该字段过滤回查;缺省自动生成。
@@ -1203,7 +1278,7 @@ async function verifyTraceReport(token, workspaceId, pairCode, tracesUrl) {
1203
1278
  // 所以额外检测【实际加载的插件是否含刷新逻辑 getRefreshedToken】,无则告警。
1204
1279
  // - cloud:disableLocalCredentials=true,插件只用写死的 token、不刷新,token 失效需重注入,
1205
1280
  // 刷新能力检测对 cloud 无意义(跳过)。
1206
- async function verifyOpenClawTraceLink(cloud) {
1281
+ async function verifyOpenClawTraceLink(cloud, pairCode) {
1207
1282
  const home = resolveHomeDir(cloud);
1208
1283
  const configPath = path.join(home, '.openclaw', 'openclaw.json');
1209
1284
  let pcfg = null;
@@ -1228,7 +1303,7 @@ async function verifyOpenClawTraceLink(cloud) {
1228
1303
  const traceId = crypto.randomBytes(16).toString('hex');
1229
1304
  const spanId = crypto.randomBytes(8).toString('hex');
1230
1305
  const nowMicros = Date.now() * 1000;
1231
- const pair = crypto.randomBytes(6).toString('hex');
1306
+ const pair = pairCode || crypto.randomBytes(6).toString('hex');
1232
1307
  const ingestBody = {
1233
1308
  spans: [{
1234
1309
  started_at_micros: nowMicros,
@@ -1697,7 +1772,7 @@ async function main() {
1697
1772
  if (cfg?.plugins?.entries?.['openclaw-cozeloop-trace']?.config?.authorization) {
1698
1773
  console.log('');
1699
1774
  info('检测到 openclaw cozeloop-trace 插件,校验其实际 token...');
1700
- const ocRes = await verifyOpenClawTraceLink(false);
1775
+ const ocRes = await verifyOpenClawTraceLink(false, args.pairCode);
1701
1776
  ocOk = ocRes.success;
1702
1777
  }
1703
1778
  } catch { /* 读不了就跳过 openclaw 校验 */ }
@@ -1721,7 +1796,7 @@ async function main() {
1721
1796
  }
1722
1797
 
1723
1798
  // Step 1: Authorize.
1724
- // 云端(--cloud):token 取自 sandbox 注入的环境变量,跳过 OAuth / credentials.json。
1799
+ // 云端模式:token 取自 sandbox 注入的环境变量,跳过 OAuth / credentials.json。
1725
1800
  // 优先使用 COZELOOP_API_TOKEN;兼容使用 COZE_API_TOKEN,并以真实 selfcheck 为准。
1726
1801
  // 本地:load cached → refresh → device code。
1727
1802
  // 注意:workspace_id 始终用写死的 WORKSPACE_ID(团队固定上报 workspace),不读环境。
@@ -1734,7 +1809,7 @@ async function main() {
1734
1809
  tokenSource = tokenInfo.source;
1735
1810
  if (!token) {
1736
1811
  errorBox([
1737
- 'ERROR: --cloud 模式要求环境变量 COZELOOP_API_TOKEN 或 COZE_API_TOKEN',
1812
+ 'ERROR: 云端模式要求环境变量 COZELOOP_API_TOKEN 或 COZE_API_TOKEN',
1738
1813
  '',
1739
1814
  '云端 sandbox 应在进程环境中注入可用于 trace ingest 的 token。',
1740
1815
  '未检测到该变量,无法配置 trace 上报。',
@@ -1762,7 +1837,7 @@ async function main() {
1762
1837
  let pythonCmd = null;
1763
1838
  if (agent !== 'openclaw') {
1764
1839
  pythonCmd = checkPython();
1765
- checkCozeloopSdk(pythonCmd);
1840
+ checkCozeloopSdk(pythonCmd, { cloud: args.cloud });
1766
1841
  }
1767
1842
  checkVersionWhitelist(agent, version);
1768
1843
  console.log('');
@@ -1837,9 +1912,9 @@ async function main() {
1837
1912
  // (插件不读这个临时 token)。openclaw 必须用插件实际配置的 authorization 打 ingest,
1838
1913
  // 才能真实反映运行时上报会不会 401。cloud/local 配置位置一致,统一走这条。
1839
1914
  const verifyResult = agent === 'openclaw'
1840
- ? await verifyOpenClawTraceLink(args.cloud)
1915
+ ? await verifyOpenClawTraceLink(args.cloud, args.pairCode)
1841
1916
  : args.cloud
1842
- ? await verifyTraceReportViaSdk(token, WORKSPACE_ID, args.pairCode, pythonCmd || 'python3', tokenSource)
1917
+ ? await verifyCloudTraceReport(token, WORKSPACE_ID, args.pairCode, tokenSource)
1843
1918
  : await verifyTraceReport(token, WORKSPACE_ID, args.pairCode, getOtelTracesUrl(false));
1844
1919
  if (verifyResult.success) {
1845
1920
  cloudResult.verify = 'ok';
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "coze_lab",
3
- "version": "0.1.36",
3
+ "version": "0.1.37",
4
4
  "description": "Configure local AI agents (Claude Code, Codex, OpenClaw) to report traces to CozeLoop",
5
5
  "keywords": [
6
6
  "cozeloop",
@@ -109,6 +109,8 @@ _COZELOOP_CLIENT_ID = "08972682140163281554629748278108.app.coze"
109
109
  _COZE_API = "https://api.coze.cn"
110
110
  _OTEL_SUFFIX = "/v1/loop/opentelemetry"
111
111
  _REFRESH_THRESHOLD = 10 * 60 # refresh when < 10 minutes remain
112
+ _FORCE_REFRESH_COOLDOWN_MS = 60 * 1000
113
+ _REFRESH_LOCK_STALE = 30
112
114
  _DEFAULT_WORKSPACE_ID = "7649231955045072915" # hardcoded spaceID fallback
113
115
 
114
116
 
@@ -255,6 +257,12 @@ def debug_log(message: str):
255
257
  def _get_credentials_path() -> Path:
256
258
  return Path.home() / ".cozeloop" / "credentials.json"
257
259
 
260
+ def _get_refresh_state_path() -> Path:
261
+ return Path.home() / ".cozeloop" / "refresh-state.json"
262
+
263
+ def _get_refresh_lock_path() -> Path:
264
+ return Path.home() / ".cozeloop" / "refresh-state.lock"
265
+
258
266
  def _load_credentials() -> Optional[Dict]:
259
267
  path = _get_credentials_path()
260
268
  if not path.exists():
@@ -270,6 +278,55 @@ def _save_credentials(creds: Dict):
270
278
  path.write_text(json.dumps(creds, indent=2))
271
279
  os.chmod(path, 0o600)
272
280
 
281
def _load_refresh_state() -> Dict[str, Any]:
    """Load the forced-refresh state as a dict.

    Returns:
        The parsed JSON object, or ``{}`` when the file is missing, unreadable,
        not valid JSON, or holds a JSON value that is not an object. Callers
        use ``.get()`` on the result, so anything but a dict is rejected here.
    """
    path = _get_refresh_state_path()
    try:
        state = json.loads(path.read_text())
    except Exception:
        # Best-effort: a missing or corrupt file just means "no state yet".
        return {}
    return state if isinstance(state, dict) else {}
289
+
290
def _save_refresh_state(patch: Dict[str, Any]):
    """Merge ``patch`` into the forced-refresh state file (best-effort).

    The merged state is written to a per-process temporary file created with
    mode 0o600 and then moved over the real file with ``os.replace``. Readers
    consult the state file without holding the refresh lock, so an in-place
    write could be observed half-written and parsed as "no state".

    Args:
        patch: Keys to add to or overwrite in the stored state.

    Any failure is swallowed: losing the state only weakens throttling.
    """
    path = _get_refresh_state_path()
    tmp_path = path.with_name(f"{path.name}.{os.getpid()}.tmp")
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        state = _load_refresh_state()
        if not isinstance(state, dict):
            # A non-object state file cannot be merged into; start over.
            state = {}
        state.update(patch)
        fd = os.open(str(tmp_path), os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as handle:
            handle.write(json.dumps(state, indent=2))
        os.chmod(tmp_path, 0o600)
        os.replace(str(tmp_path), str(path))
    except Exception:
        # Non-fatal by design; just avoid leaving the temporary file behind.
        try:
            tmp_path.unlink()
        except Exception:
            pass
300
+
301
def _with_refresh_lock(fn):
    """Run ``fn()`` while holding the refresh lock file and return its result.

    The lock is taken by exclusively creating the lock file. Up to 24 attempts
    are made, sleeping 0.25 s after a failed attempt. A lock file whose mtime
    is older than ``_REFRESH_LOCK_STALE`` seconds is removed and the attempt is
    retried immediately.

    Only a ``FileExistsError`` raised while creating the lock file counts as
    contention. Exceptions raised by ``fn`` itself propagate to the caller
    after the lock is released and never cause ``fn`` to be run again.

    Returns:
        ``fn()``'s return value, or None when the lock could not be acquired.
    """
    lock_path = _get_refresh_lock_path()
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    for _ in range(24):
        try:
            fd = os.open(str(lock_path), os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o600)
        except FileExistsError:
            try:
                if time.time() - lock_path.stat().st_mtime > _REFRESH_LOCK_STALE:
                    # Holder appears to be gone; drop its lock and retry now.
                    lock_path.unlink()
                    continue
            except Exception:
                pass
            time.sleep(0.25)
            continue

        # Lock acquired: run the callback outside the contention handler.
        try:
            return fn()
        finally:
            try:
                os.close(fd)
            except Exception:
                pass
            try:
                lock_path.unlink()
            except Exception:
                pass
    hook_log("forced refresh skipped because refresh lock is busy")
    return None
329
+
273
330
  def _refresh_token(refresh_token: str) -> Optional[str]:
274
331
  """Call Coze refresh token API. Returns new access_token or None on failure."""
275
332
  try:
@@ -302,6 +359,56 @@ def _refresh_token(refresh_token: str) -> Optional[str]:
302
359
  debug_log(f"Token refresh failed: {e}")
303
360
  return None
304
361
 
362
def force_refresh_token_after_upload_failure(reason: str = "upload_failure", current_token: Optional[str] = None) -> Optional[str]:
    """Try to obtain a usable access token after an upload failed.

    Nothing is attempted when ``COZELAB_ONBOARD_CLOUD`` is "1"/"true"/"yes"
    (case-insensitive) or when the stored credentials have no refresh_token.
    If the recorded ``last_forced_refresh_ms`` is less than
    ``_FORCE_REFRESH_COOLDOWN_MS`` old, no refresh call is made: the stored
    access token is returned when it differs from ``current_token``, otherwise
    None. The same check is repeated under the refresh lock before refreshing.

    Args:
        reason: Failure description; its first 120 characters are saved and logged.
        current_token: The token the failed upload used.

    Returns:
        A token to retry with, or None when none is available.
    """
    cloud_flag = os.environ.get("COZELAB_ONBOARD_CLOUD", "").lower()
    if cloud_flag in ("1", "true", "yes"):
        hook_log("upload failure token refresh skipped in cloud mode")
        return None

    stored = _load_credentials()
    if not (stored and stored.get("refresh_token")):
        hook_log("upload failure token refresh skipped; no local refresh_token")
        return None

    # Pre-check without the lock so throttled calls return quickly.
    stored_token = stored.get("access_token")
    checked_at = int(time.time() * 1000)
    previous = int(_load_refresh_state().get("last_forced_refresh_ms", 0) or 0)
    if previous and checked_at - previous < _FORCE_REFRESH_COOLDOWN_MS:
        if not stored_token or stored_token == current_token:
            hook_log(f"forced refresh throttled ageMs={checked_at - previous}")
            return None
        hook_log("forced refresh throttled; reuse newer cached token")
        return stored_token

    def _locked_refresh():
        # Re-read credentials and state: both may have changed while waiting.
        fresh = _load_credentials()
        if not (fresh and fresh.get("refresh_token")):
            hook_log("forced refresh skipped in lock; credentials missing")
            return None
        fresh_token = fresh.get("access_token")
        locked_at = int(time.time() * 1000)
        recorded = int(_load_refresh_state().get("last_forced_refresh_ms", 0) or 0)
        if recorded and locked_at - recorded < _FORCE_REFRESH_COOLDOWN_MS:
            if not fresh_token or fresh_token == current_token:
                hook_log(f"forced refresh skipped in lock ageMs={locked_at - recorded}")
                return None
            hook_log("forced refresh already done by another process; reuse token")
            return fresh_token

        # The timestamp is recorded before the refresh call, so a failed
        # refresh also starts the cooldown.
        _save_refresh_state({
            "last_forced_refresh_ms": locked_at,
            "last_reason": (reason or "upload_failure")[:120],
        })
        hook_log(f"forced token refresh after upload failure reason={(reason or 'upload_failure')[:120]}")
        renewed = _refresh_token(fresh["refresh_token"])
        hook_log("forced token refresh OK" if renewed else "forced token refresh FAILED")
        return renewed

    return _with_refresh_lock(_locked_refresh)
411
+
305
412
  def _normalize_api_base_url(url: str) -> str:
306
413
  base = (url or "").strip().rstrip("/")
307
414
  if not base:
@@ -960,7 +1067,7 @@ def _build_history_messages(history_turns: List[Dict[str, Any]]) -> list:
960
1067
 
961
1068
  # --- CozeLoop Trace Reporting ---
962
1069
 
963
- def send_turns_to_cozeloop(turns: List[Dict[str, Any]], session_id: str, history_turns: Optional[List[Dict[str, Any]]] = None):
1070
+ def send_turns_to_cozeloop(turns: List[Dict[str, Any]], session_id: str, history_turns: Optional[List[Dict[str, Any]]] = None, retry_on_auth_failure: bool = True):
964
1071
  """Send conversation turns to CozeLoop.
965
1072
 
966
1073
  Span hierarchy:
@@ -1460,6 +1567,12 @@ def send_turns_to_cozeloop(turns: List[Dict[str, Any]], session_id: str, history
1460
1567
 
1461
1568
  if upload_events:
1462
1569
  debug_log(f"Upload failed, state not advanced. Last failure: {upload_events[-1][:500]}")
1570
+ if retry_on_auth_failure:
1571
+ new_token = force_refresh_token_after_upload_failure(upload_events[-1], token)
1572
+ if new_token:
1573
+ os.environ["COZELOOP_API_TOKEN"] = new_token
1574
+ hook_log("retry upload once after forced token refresh")
1575
+ return send_turns_to_cozeloop(turns, session_id, history_turns, retry_on_auth_failure=False)
1463
1576
  return None
1464
1577
 
1465
1578
  return True
@@ -45,6 +45,8 @@ from typing import Optional, List, Dict, Any
45
45
  _COZELOOP_CLIENT_ID = "08972682140163281554629748278108.app.coze"
46
46
  _COZE_API = "https://api.coze.cn"
47
47
  _REFRESH_THRESHOLD = 10 * 60
48
+ _FORCE_REFRESH_COOLDOWN_MS = 60 * 1000
49
+ _REFRESH_LOCK_STALE = 30
48
50
  _DEFAULT_WORKSPACE_ID = "7649231955045072915" # hardcoded spaceID fallback
49
51
  _OTEL_SUFFIX = "/v1/loop/opentelemetry"
50
52
 
@@ -168,6 +170,12 @@ def _make_finish_event_processor(upload_events: Optional[List[str]] = None):
168
170
  def _get_credentials_path() -> Path:
169
171
  return Path.home() / ".cozeloop" / "credentials.json"
170
172
 
173
+ def _get_refresh_state_path() -> Path:
174
+ return Path.home() / ".cozeloop" / "refresh-state.json"
175
+
176
+ def _get_refresh_lock_path() -> Path:
177
+ return Path.home() / ".cozeloop" / "refresh-state.lock"
178
+
171
179
  def _load_credentials():
172
180
  path = _get_credentials_path()
173
181
  if not path.exists():
@@ -183,6 +191,55 @@ def _save_credentials(creds):
183
191
  path.write_text(json.dumps(creds, indent=2))
184
192
  os.chmod(path, 0o600)
185
193
 
194
def _load_refresh_state():
    """Load the forced-refresh state as a dict.

    Returns the parsed JSON object, or ``{}`` when the file is missing,
    unreadable, not valid JSON, or holds a JSON value that is not an object.
    Callers use ``.get()`` on the result, so anything but a dict is rejected.
    """
    path = _get_refresh_state_path()
    try:
        state = json.loads(path.read_text())
    except Exception:
        # Best-effort: a missing or corrupt file just means "no state yet".
        return {}
    return state if isinstance(state, dict) else {}
202
+
203
def _save_refresh_state(patch):
    """Merge ``patch`` into the forced-refresh state file (best-effort).

    The merged state is written to a per-process temporary file created with
    mode 0o600 and then moved over the real file with ``os.replace``. Readers
    consult the state file without holding the refresh lock, so an in-place
    write could be observed half-written and parsed as "no state".

    Any failure is swallowed: losing the state only weakens throttling.
    """
    path = _get_refresh_state_path()
    tmp_path = path.with_name(f"{path.name}.{os.getpid()}.tmp")
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        state = _load_refresh_state()
        if not isinstance(state, dict):
            # A non-object state file cannot be merged into; start over.
            state = {}
        state.update(patch)
        fd = os.open(str(tmp_path), os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as handle:
            handle.write(json.dumps(state, indent=2))
        os.chmod(tmp_path, 0o600)
        os.replace(str(tmp_path), str(path))
    except Exception:
        # Non-fatal by design; just avoid leaving the temporary file behind.
        try:
            tmp_path.unlink()
        except Exception:
            pass
213
+
214
def _with_refresh_lock(fn):
    """Run ``fn()`` while holding the refresh lock file and return its result.

    The lock is taken by exclusively creating the lock file. Up to 24 attempts
    are made, sleeping 0.25 s after a failed attempt. A lock file whose mtime
    is older than ``_REFRESH_LOCK_STALE`` seconds is removed and the attempt is
    retried immediately.

    Only a ``FileExistsError`` raised while creating the lock file counts as
    contention. Exceptions raised by ``fn`` itself propagate to the caller
    after the lock is released and never cause ``fn`` to be run again.

    Returns ``fn()``'s value, or None when the lock could not be acquired.
    """
    lock_path = _get_refresh_lock_path()
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    for _ in range(24):
        try:
            fd = os.open(str(lock_path), os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o600)
        except FileExistsError:
            try:
                if time.time() - lock_path.stat().st_mtime > _REFRESH_LOCK_STALE:
                    # Holder appears to be gone; drop its lock and retry now.
                    lock_path.unlink()
                    continue
            except Exception:
                pass
            time.sleep(0.25)
            continue

        # Lock acquired: run the callback outside the contention handler.
        try:
            return fn()
        finally:
            try:
                os.close(fd)
            except Exception:
                pass
            try:
                lock_path.unlink()
            except Exception:
                pass
    hook_log("forced refresh skipped because refresh lock is busy")
    return None
242
+
186
243
  def _refresh_token(refresh_tok: str):
187
244
  try:
188
245
  payload = json.dumps({
@@ -213,6 +270,55 @@ def _refresh_token(refresh_tok: str):
213
270
  pass
214
271
  return None
215
272
 
273
def force_refresh_token_after_upload_failure(reason: str = "upload_failure", current_token: Optional[str] = None):
    """Try to obtain a usable access token after an upload failed.

    Nothing is attempted when ``COZELAB_ONBOARD_CLOUD`` is "1"/"true"/"yes"
    (case-insensitive) or when the stored credentials have no refresh_token.
    If the recorded ``last_forced_refresh_ms`` is less than
    ``_FORCE_REFRESH_COOLDOWN_MS`` old, no refresh call is made: the stored
    access token is returned when it differs from ``current_token``, otherwise
    None. The same check is repeated under the refresh lock before refreshing.

    Args:
        reason: Failure description; its first 120 characters are saved and logged.
        current_token: The token the failed upload used.

    Returns:
        A token to retry with, or None when none is available.
    """
    cloud_flag = os.environ.get("COZELAB_ONBOARD_CLOUD", "").lower()
    if cloud_flag in ("1", "true", "yes"):
        hook_log("upload failure token refresh skipped in cloud mode")
        return None

    stored = _load_credentials()
    if not (stored and stored.get("refresh_token")):
        hook_log("upload failure token refresh skipped; no local refresh_token")
        return None

    # Pre-check without the lock so throttled calls return quickly.
    stored_token = stored.get("access_token")
    checked_at = int(time.time() * 1000)
    previous = int(_load_refresh_state().get("last_forced_refresh_ms", 0) or 0)
    if previous and checked_at - previous < _FORCE_REFRESH_COOLDOWN_MS:
        if not stored_token or stored_token == current_token:
            hook_log(f"forced refresh throttled ageMs={checked_at - previous}")
            return None
        hook_log("forced refresh throttled; reuse newer cached token")
        return stored_token

    def _locked_refresh():
        # Re-read credentials and state: both may have changed while waiting.
        fresh = _load_credentials()
        if not (fresh and fresh.get("refresh_token")):
            hook_log("forced refresh skipped in lock; credentials missing")
            return None
        fresh_token = fresh.get("access_token")
        locked_at = int(time.time() * 1000)
        recorded = int(_load_refresh_state().get("last_forced_refresh_ms", 0) or 0)
        if recorded and locked_at - recorded < _FORCE_REFRESH_COOLDOWN_MS:
            if not fresh_token or fresh_token == current_token:
                hook_log(f"forced refresh skipped in lock ageMs={locked_at - recorded}")
                return None
            hook_log("forced refresh already done by another process; reuse token")
            return fresh_token

        # The timestamp is recorded before the refresh call, so a failed
        # refresh also starts the cooldown.
        _save_refresh_state({
            "last_forced_refresh_ms": locked_at,
            "last_reason": (reason or "upload_failure")[:120],
        })
        hook_log(f"forced token refresh after upload failure reason={(reason or 'upload_failure')[:120]}")
        renewed = _refresh_token(fresh["refresh_token"])
        hook_log("forced token refresh OK" if renewed else "forced token refresh FAILED")
        return renewed

    return _with_refresh_lock(_locked_refresh)
321
+
216
322
 
217
323
  def _normalize_api_base_url(url: str) -> str:
218
324
  base = (url or "").strip().rstrip("/")
@@ -965,7 +1071,8 @@ def _make_model_message(role: str, content: str = "", tool_calls: list = None,
965
1071
 
966
1072
 
967
1073
  def send_turns_to_cozeloop(turns: List[Dict[str, Any]], session_id: str, model_name: str = "codex",
968
- history_context: Optional[List[Dict[str, Any]]] = None) -> Optional[List[Dict[str, Any]]]:
1074
+ history_context: Optional[List[Dict[str, Any]]] = None,
1075
+ retry_on_auth_failure: bool = True) -> Optional[List[Dict[str, Any]]]:
969
1076
  """Send conversation turns to CozeLoop for tracing.
970
1077
 
971
1078
  Span hierarchy:
@@ -1387,6 +1494,18 @@ def send_turns_to_cozeloop(turns: List[Dict[str, Any]], session_id: str, model_n
1387
1494
 
1388
1495
  if upload_events:
1389
1496
  hook_log(f"upload failed state not advanced failures={len(upload_events)} detail={upload_events[-1][:500]}")
1497
+ if retry_on_auth_failure:
1498
+ new_token = force_refresh_token_after_upload_failure(upload_events[-1], token)
1499
+ if new_token:
1500
+ os.environ["COZELOOP_API_TOKEN"] = new_token
1501
+ hook_log("retry upload once after forced token refresh")
1502
+ return send_turns_to_cozeloop(
1503
+ turns,
1504
+ session_id,
1505
+ model_name,
1506
+ history_context,
1507
+ retry_on_auth_failure=False,
1508
+ )
1390
1509
  return None
1391
1510
 
1392
1511
  return ctx
@@ -5,7 +5,7 @@ import { ATTR_SERVICE_NAME, ATTR_SERVICE_INSTANCE_ID } from "@opentelemetry/sema
5
5
  import { hostname } from "os";
6
6
  import { basename, join } from "path";
7
7
  import { createRequire } from "node:module";
8
- import { readFileSync, writeFileSync, mkdirSync, appendFileSync } from "fs";
8
+ import { readFileSync, writeFileSync, mkdirSync, appendFileSync, openSync, closeSync, unlinkSync, statSync } from "fs";
9
9
  import { homedir } from "os";
10
10
  import http from "http";
11
11
  import https from "https";
@@ -25,7 +25,11 @@ const CLIENT_USER_AGENT = {
25
25
  const _CLIENT_ID = "08972682140163281554629748278108.app.coze";
26
26
  const _COZE_API = "https://api.coze.cn";
27
27
  const _REFRESH_THRESHOLD_MS = 10 * 60 * 1000;
28
+ const _FORCE_REFRESH_COOLDOWN_MS = 60 * 1000;
29
+ const _REFRESH_LOCK_STALE_MS = 30 * 1000;
28
30
  const _CREDS_PATH = join(homedir(), ".cozeloop", "credentials.json");
31
+ const _REFRESH_STATE_PATH = join(homedir(), ".cozeloop", "refresh-state.json");
32
+ const _REFRESH_LOCK_PATH = join(homedir(), ".cozeloop", "refresh-state.lock");
29
33
 
30
34
  function _loadCreds() {
31
35
  try { return JSON.parse(readFileSync(_CREDS_PATH, "utf8")); }
@@ -39,6 +43,60 @@ function _saveCreds(c) {
39
43
  } catch { /* non-fatal */ }
40
44
  }
41
45
 
46
// Read the forced-refresh throttle state from _REFRESH_STATE_PATH.
// Returns {} when the file is missing, unreadable, not valid JSON, or holds
// anything other than a JSON object (e.g. `null` or an array), so callers can
// read properties off the result without a null check.
function _loadRefreshState() {
  try {
    const parsed = JSON.parse(readFileSync(_REFRESH_STATE_PATH, "utf8"));
    const isRecord = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
    return isRecord ? parsed : {};
  } catch {
    // Best-effort: no readable state means "never force-refreshed".
    return {};
  }
}
50
+
51
// Merge `patch` over the current refresh state and write the result to
// _REFRESH_STATE_PATH as pretty-printed JSON (file mode 0o600 on creation).
// Best-effort: any error is ignored.
function _saveRefreshState(patch) {
  try {
    const stateDir = join(homedir(), ".cozeloop");
    mkdirSync(stateDir, { recursive: true });
    const merged = { ..._loadRefreshState(), ...patch };
    const serialized = JSON.stringify(merged, null, 2);
    writeFileSync(_REFRESH_STATE_PATH, serialized, { mode: 0o600 });
  } catch {
    /* non-fatal */
  }
}
58
+
59
// Build the "Bearer <token>" header value from a credentials object.
// Returns null when creds is nullish or has no truthy access_token.
function _authorizationFromCreds(creds) {
  const accessToken = creds?.access_token;
  if (!accessToken) {
    return null;
  }
  return `Bearer ${accessToken}`;
}
62
+
63
// Promise that resolves (with no value) after roughly `ms` milliseconds.
function _sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
66
+
67
// Run `fn` while holding the refresh lock file and resolve to its result.
//
// The lock is taken by exclusively creating _REFRESH_LOCK_PATH ("wx"). Up to
// 24 attempts are made, sleeping 250 ms after a failed attempt; a lock file
// older than _REFRESH_LOCK_STALE_MS is removed and the attempt retried at once.
//
// Only an EEXIST error from creating the lock file counts as contention.
// An error thrown by `fn` is logged and turned into null (as before), but it
// never causes `fn` to be run again, even if it carries code "EEXIST".
//
//   logFile - passed through to fileLog
//   fn      - async callback executed under the lock
//
// Resolves to fn's result, or null when the lock could not be acquired or
// fn threw.
async function _withRefreshLock(logFile, fn) {
  mkdirSync(join(homedir(), ".cozeloop"), { recursive: true });
  for (let attempt = 0; attempt < 24; attempt++) {
    let fd = null;
    try {
      fd = openSync(_REFRESH_LOCK_PATH, "wx", 0o600);
    } catch (err) {
      if (err?.code !== "EEXIST") {
        fileLog(logFile, `[auth] refresh lock error=${err?.message || err}`);
        return null;
      }
      try {
        const ageMs = Date.now() - statSync(_REFRESH_LOCK_PATH).mtimeMs;
        if (ageMs > _REFRESH_LOCK_STALE_MS) {
          // Holder appears to be gone; drop its lock and retry now.
          unlinkSync(_REFRESH_LOCK_PATH);
          continue;
        }
      } catch { /* ignore */ }
      await _sleep(250);
      continue;
    }

    // Lock acquired: run the callback outside the contention handler.
    try {
      return await fn();
    } catch (err) {
      fileLog(logFile, `[auth] refresh lock error=${err?.message || err}`);
      return null;
    } finally {
      try { closeSync(fd); } catch { /* ignore */ }
      try { unlinkSync(_REFRESH_LOCK_PATH); } catch { /* ignore */ }
    }
  }
  fileLog(logFile, "[auth] refresh lock busy; skip forced refresh");
  return null;
}
99
+
42
100
  async function _refreshToken(refreshTok) {
43
101
  return new Promise((resolve) => {
44
102
  const body = JSON.stringify({ grant_type: "refresh_token", client_id: _CLIENT_ID, refresh_token: refreshTok });
@@ -72,8 +130,65 @@ async function _refreshToken(refreshTok) {
72
130
  });
73
131
  }
74
132
 
133
/**
 * Force an access-token refresh after an upload failed with the current token.
 *
 * Flow:
 *  1. Give up when no cached `refresh_token` exists.
 *  2. When a forced refresh was recorded less than `_FORCE_REFRESH_COOLDOWN_MS`
 *     ago, do not refresh again: hand back the cached authorization if it
 *     differs from the one that just failed, otherwise `null`.
 *  3. Otherwise take the refresh lock, repeat both checks (another process may
 *     have acted in the meantime), record the attempt, and call
 *     `_refreshToken`.
 *
 * The attempt timestamp is stored before the refresh request is made, so a
 * failed refresh also starts the cooldown.
 *
 * @param {string} currentAuthorization - The `Authorization` value that failed.
 * @param {{logFile?: string, reason?: string}} [opts] - Log destination and a
 *   free-form failure reason (truncated to 120 characters when stored/logged).
 * @returns {Promise<string|null>} A `Bearer …` value to retry with, or `null`.
 */
async function _forceRefreshToken(currentAuthorization, opts = {}) {
  const { logFile } = opts;

  const initialCreds = _loadCreds();
  if (!initialCreds?.refresh_token) {
    fileLog(logFile, "[auth] upload failed but no local refresh_token is available");
    return null;
  }

  // Cheap pre-check outside the lock so throttled callers never contend for it.
  const cachedAuth = _authorizationFromCreds(initialCreds);
  const checkedAt = Date.now();
  const lastForced = Number(_loadRefreshState().last_forced_refresh_ms || 0);
  if (lastForced && checkedAt - lastForced < _FORCE_REFRESH_COOLDOWN_MS) {
    const canReuse = cachedAuth && cachedAuth !== currentAuthorization;
    if (!canReuse) {
      fileLog(logFile, `[auth] forced refresh throttled ageMs=${checkedAt - lastForced}`);
      return null;
    }
    fileLog(logFile, `[auth] forced refresh throttled; reuse newer cached token tokenLength=${cachedAuth.length}`);
    return cachedAuth;
  }

  const refreshUnderLock = async () => {
    const lockedCreds = _loadCreds();
    if (!lockedCreds?.refresh_token) {
      fileLog(logFile, "[auth] forced refresh skipped; credentials disappeared");
      return null;
    }

    // Re-evaluate the cooldown now that the lock is held.
    const lockedCachedAuth = _authorizationFromCreds(lockedCreds);
    const lockedLastForced = Number(_loadRefreshState().last_forced_refresh_ms || 0);
    const lockedNow = Date.now();
    if (lockedLastForced && lockedNow - lockedLastForced < _FORCE_REFRESH_COOLDOWN_MS) {
      const canReuse = lockedCachedAuth && lockedCachedAuth !== currentAuthorization;
      if (!canReuse) {
        fileLog(logFile, `[auth] forced refresh skipped in lock ageMs=${lockedNow - lockedLastForced}`);
        return null;
      }
      fileLog(logFile, `[auth] forced refresh already done by another process; reuse tokenLength=${lockedCachedAuth.length}`);
      return lockedCachedAuth;
    }

    const reason = String(opts.reason || "upload_failure").slice(0, 120);
    _saveRefreshState({
      last_forced_refresh_ms: lockedNow,
      last_reason: reason,
    });
    fileLog(logFile, `[auth] forced token refresh after local upload failure reason=${reason}`);

    const newToken = await _refreshToken(lockedCreds.refresh_token);
    if (!newToken) {
      fileLog(logFile, "[auth] forced token refresh FAILED");
      return null;
    }
    const freshAuth = `Bearer ${newToken}`;
    fileLog(logFile, `[auth] forced token refresh OK tokenLength=${freshAuth.length}`);
    return freshAuth;
  };

  return _withRefreshLock(logFile, refreshUnderLock);
}
188
+
75
189
  async function getRefreshedToken(currentAuthorization, opts = {}) {
76
190
  if (opts.disableLocalCredentials) return currentAuthorization;
191
+ if (opts.force) return _forceRefreshToken(currentAuthorization, opts);
77
192
  const creds = _loadCreds();
78
193
  if (!creds) return currentAuthorization; // no creds file, keep as-is
79
194
  const remaining = (creds.expires_at ?? 0) - Date.now();
@@ -287,6 +402,7 @@ class CozeloopIngestExporter {
287
402
  this.logFile = config.logFile;
288
403
  this.workspaceId = config.workspaceId;
289
404
  this.serviceName = config.serviceName;
405
+ this.onAuthFailure = config.onAuthFailure;
290
406
  this.shutdownRequested = false;
291
407
  fileLog(this.logFile, `[ingest] exporter ready url=${this.url} workspaceId=${this.workspaceId}`);
292
408
  }
@@ -310,7 +426,43 @@ class CozeloopIngestExporter {
310
426
  serviceName: this.serviceName,
311
427
  })),
312
428
  };
313
- fileLog(this.logFile, `[ingest] POST url=${this.url} spans=${body.spans.length}`);
429
+ try {
430
+ await this.postBody(body, false);
431
+ }
432
+ catch (err) {
433
+ const freshAuth = await this.refreshAuthorizationAfterFailure(err);
434
+ if (!freshAuth) {
435
+ throw err;
436
+ }
437
+ this.headers = { ...this.headers, Authorization: freshAuth };
438
+ fileLog(this.logFile, `[ingest] retry after token refresh url=${this.url} spans=${body.spans.length}`);
439
+ try {
440
+ await this.postBody(body, true);
441
+ }
442
+ catch (retryErr) {
443
+ throw new Error(`retry after token refresh failed: ${retryErr?.message || retryErr}`);
444
+ }
445
+ }
446
+ }
447
+ async refreshAuthorizationAfterFailure(err) {
448
+ if (typeof this.onAuthFailure !== "function")
449
+ return null;
450
+ fileLog(this.logFile, `[auth] local upload failure triggers refresh attempt err=${String(err?.message || err).slice(0, 300)}`);
451
+ try {
452
+ const current = this.headers?.Authorization || "";
453
+ const fresh = await this.onAuthFailure(current, err);
454
+ if (fresh && fresh !== current) {
455
+ return fresh;
456
+ }
457
+ fileLog(this.logFile, "[auth] no newer authorization available for retry");
458
+ }
459
+ catch (refreshErr) {
460
+ fileLog(this.logFile, `[auth] refresh attempt threw error=${refreshErr?.message || refreshErr}`);
461
+ }
462
+ return null;
463
+ }
464
+ async postBody(body, retry) {
465
+ fileLog(this.logFile, `[ingest] ${retry ? "RETRY " : ""}POST url=${this.url} spans=${body.spans.length}`);
314
466
  const res = await postJson(this.url, body, this.headers);
315
467
  if (res.status < 200 || res.status >= 300) {
316
468
  const snippet = String(res.body || "").slice(0, 300);
@@ -331,7 +483,7 @@ class CozeloopIngestExporter {
331
483
  }
332
484
  }
333
485
  }
334
- fileLog(this.logFile, `[ingest] OK HTTP ${res.status} spans=${body.spans.length}`);
486
+ fileLog(this.logFile, `[ingest] OK HTTP ${res.status} spans=${body.spans.length}${retry ? " retry=1" : ""}`);
335
487
  }
336
488
  async forceFlush() {
337
489
  return;
@@ -416,6 +568,20 @@ export class CozeloopExporter {
416
568
  this.config.authorization = "";
417
569
  }
418
570
  }
571
+ async refreshAuthAfterUploadFailure(currentAuthorization, err) {
572
+ const fresh = await getRefreshedToken(currentAuthorization || this.config.authorization, {
573
+ disableLocalCredentials: this.config.disableLocalCredentials,
574
+ force: true,
575
+ reason: err?.message || "upload_failure",
576
+ logFile: this.config.logFile,
577
+ });
578
+ if (fresh && fresh !== currentAuthorization) {
579
+ this.api.logger.info("[CozeloopTrace] Token refreshed after upload failure; retrying export...");
580
+ this.config.authorization = fresh;
581
+ return fresh;
582
+ }
583
+ return null;
584
+ }
419
585
  async ensureInitialized() {
420
586
  if (this.initialized)
421
587
  return;
@@ -443,6 +609,7 @@ export class CozeloopExporter {
443
609
  logFile: this.config.logFile,
444
610
  workspaceId,
445
611
  serviceName: this.config.serviceName,
612
+ onAuthFailure: (currentAuthorization, err) => this.refreshAuthAfterUploadFailure(currentAuthorization, err),
446
613
  headers: {
447
614
  "Authorization": authorization,
448
615
  "User-Agent": `openclaw-cozeloop-trace/${PLUGIN_VERSION} node/${process.versions.node}`,
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "id": "openclaw-cozeloop-trace",
3
3
  "name": "OpenClaw CozeLoop Trace",
4
- "version": "0.1.18",
4
+ "version": "0.1.19",
5
5
  "description": "Report OpenClaw execution traces to CozeLoop via OpenTelemetry",
6
6
  "type": "plugin",
7
7
  "entry": "./dist/index.js",
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cozeloop/openclaw-cozeloop-trace",
3
- "version": "0.1.18",
3
+ "version": "0.1.19",
4
4
  "description": "OpenClaw Plugin for reporting traces to CozeLoop via OpenTelemetry",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",