botmux 2.71.0 → 2.71.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/worker.js CHANGED
@@ -15,7 +15,7 @@
15
15
  import { randomBytes } from 'node:crypto';
16
16
  import { mkdirSync, writeFileSync, unlinkSync, existsSync, statSync, readdirSync, readlinkSync, readFileSync, watch as fsWatch, createWriteStream } from 'node:fs';
17
17
  import { isAbsolute, join } from 'node:path';
18
- import { drainTranscript, joinAssistantText, findJsonlContainingFingerprint, findJsonlsContainingExactContent, findLatestJsonl, extractLastAssistantTurn, extractTurnStartText, splitTranscriptEventsByCutoff } from './services/claude-transcript.js';
18
+ import { drainTranscript, joinAssistantText, trailingAssistantText, findJsonlContainingFingerprint, findJsonlsContainingExactContent, findLatestJsonl, extractLastAssistantTurn, extractTurnStartText, splitTranscriptEventsByCutoff } from './services/claude-transcript.js';
19
19
  import { BridgeTurnQueue, makeFingerprint, normaliseForFingerprint } from './services/bridge-turn-queue.js';
20
20
  import { shouldSuppressBridgeEmit } from './services/bridge-fallback-gate.js';
21
21
  import { shouldWriteNow } from './utils/input-gate.js';
@@ -71,6 +71,26 @@ let cliPidMarker = null; // path to .botmux-cli-pids/<pid>
71
71
  let sandboxStopWatcher = null; // stop fn for the sandbox outbox watcher
72
72
  let sandboxCleanup = null; // unmount overlays + rm the per-session sandbox tree
73
73
  let sandboxTeardownDone = false; // guards the exit-time best-effort teardown from double-running / running on suspend-for-resume
74
+ /** Counts consecutive in-worker restart cycles (see case 'restart'). Used by
75
+ * the SECONDARY guard so an adapter whose checkResumeTargetExists misses
76
+ * (returns undefined) or whose resume target vanishes between the check and
77
+ * spawn never crash-loops: 2nd consecutive restart → drop resume semantics,
78
+ * spawn fresh. Reset to 0 whenever spawnCli proceeds with a successful
79
+ * (non-forced) config, so healthy restarts (e.g. user `/restart`) are
80
+ * unaffected. */
81
+ let consecutiveInWorkerRestarts = 0;
82
+ /** Guard: user_notify for "resume → fresh fallback" is sent once per worker
83
+ * lifecycle so a 4× crash loop does not spam the Lark thread with 4 copies
84
+ * of the same warning. */
85
+ let resumeFallbackNotified = false;
86
+ /** The effectiveResume flag used by the most recent spawnCli call. Written
87
+ * immediately after the two-tier fallback check so late-attach timers
88
+ * (hermes, cursor, etc.) can read THE SAME semantics the spawn used,
89
+ * instead of re-deriving from lastInitConfig.resume (which never reflects
90
+ * Tier-1/Tier-2 fresh demotion). Updated in spawnCli BEFORE any bridge
91
+ * setup so even the tick that fires between spawnCli-start and the
92
+ * adapter's hermesBridgeAttach reads the correct mode. */
93
+ let lastSpawnEffectiveResume = false;
74
94
  let idleDetector = null;
75
95
  let isTmuxMode = false;
76
96
  /** Adopt-bridge mode using TmuxPipeBackend: not a tmux attach client, all
@@ -1416,7 +1436,13 @@ function emitReadyTurns() {
1416
1436
  }
1417
1437
  const set = new Set(turn.assistantUuids);
1418
1438
  const matched = drained.events.filter(e => e.uuid && set.has(e.uuid));
1419
- const assistantText = joinAssistantText(matched);
1439
+ // Non-adopt fallback posts the turn's FINAL answer (text after the last
1440
+ // tool_use), not the whole-turn narration collage — joining every interim
1441
+ // block both reads as noise in Lark and inflates finalText past the
1442
+ // material-longer gate, re-posting turns the model already `botmux send`ed.
1443
+ // Adopt keeps the full join: transcript drain is that mode's only channel,
1444
+ // so interim narration is the user's only window into the turn.
1445
+ const assistantText = adoptMode ? joinAssistantText(matched) : trailingAssistantText(drained.events, turn.assistantUuids);
1420
1446
  if (assistantText.length === 0)
1421
1447
  continue;
1422
1448
  const lastUuid = turn.assistantUuids[turn.assistantUuids.length - 1];
@@ -1541,8 +1567,12 @@ function codexBridgeStartTimer() {
1541
1567
  codexBridgeTimer = setInterval(() => {
1542
1568
  try {
1543
1569
  if (structuredBridgeIsHermes()) {
1570
+ // Use lastSpawnEffectiveResume (written by spawnCli AFTER the
1571
+ // two-tier fallback), NOT lastInitConfig.resume. Otherwise a
1572
+ // Tier-1/Tier-2 demotion to fresh would still baseline the empty
1573
+ // hermes store as "existing" and swallow the first turn.
1544
1574
  if (!hermesBridgeBaselineDone)
1545
- hermesBridgeAttach(lastInitConfig?.resume ? 'baseline-existing' : 'fresh-empty');
1575
+ hermesBridgeAttach(lastSpawnEffectiveResume ? 'baseline-existing' : 'fresh-empty');
1546
1576
  hermesBridgeIngest();
1547
1577
  if (isPromptReady)
1548
1578
  emitReadyCodexTurns();
@@ -2768,6 +2798,13 @@ function markPromptReady() {
2768
2798
  return;
2769
2799
  }
2770
2800
  isPromptReady = true;
2801
+ // CLI 实际启动成功(回到 prompt):复位连续重启计数。
2802
+ // 任何能到这一步的 spawn 都算"成功"——后续即便再崩溃(不是 resume 目标不存在
2803
+ // 的问题),下一轮也该有新的 2 次重试预算,而不是被历史重启计数卡住。
2804
+ if (consecutiveInWorkerRestarts > 0) {
2805
+ log(`CLI reached prompt successfully — resetting consecutive restart count (was ${consecutiveInWorkerRestarts})`);
2806
+ consecutiveInWorkerRestarts = 0;
2807
+ }
2771
2808
  // CLI is back at its prompt — every previously written input has been
2772
2809
  // consumed, so nothing is in flight anymore. A later crash must not
2773
2810
  // replay these.
@@ -3499,33 +3536,86 @@ function spawnCli(cfg) {
3499
3536
  log(`[sandbox] redirecting Claude bridge dataDir → overlay upper: ${redirected}`);
3500
3537
  claudeDataDir = redirected;
3501
3538
  }
3502
- // Resume-fresh fallback: if we'd `--resume` a Claude-family session but its
3503
- // conversation jsonl is GONE, the CLI exits 1 ("No conversation found") and the
3504
- // auto-restarter keeps re-resuming — a single transient crash amplifies into a
3505
- // crash-loop. This bites sandboxed sessions especially: their jsonl lives in the
3506
- // ephemeral overlay upper, which crash-cleanup reclaims. Detect the missing
3507
- // conversation and spawn FRESH instead — loses prior context but never loops.
3539
+ // ── Resume pre-flight check + two-tier fallback ──────────────────────────
3540
+ // Tier 1 (adapter probe): adapter.checkResumeTargetExists returns false
3541
+ // → skip --resume, spawn FRESH.
3542
+ // Tier 2 (restart count): 2nd consecutive in-worker restart → force FRESH,
3543
+ // regardless of probe result. This covers adapters without a probe AND
3544
+ // probe/spawn races (target vanishes between the check and spawn).
3545
+ //
3546
+ // Supersedes the claude-family-only inline probe (PR #189) with a
3547
+ // general adapter-owned check (cleaner boundary) + a numeric safety net.
3548
+ //
3549
+ // User impact: losing context is better than a 4× daemon-side crash loop
3550
+ // that leaves the bot stuck in "crashed N times" state until the human
3551
+ // re-closes the session.
3508
3552
  let effectiveResume = cfg.resume ?? false;
3509
- if (effectiveResume && claudeDataDir && cfg.cliSessionId) {
3510
- const resumeJsonl = claudeJsonlPathForSession(cfg.cliSessionId, cfg.workingDir, claudeDataDir);
3511
- if (!existsSync(resumeJsonl)) {
3512
- log(`[resume] conversation gone (${resumeJsonl}) — spawning FRESH instead of --resume (avoids "No conversation found" crash-loop)`);
3513
- effectiveResume = false;
3553
+ let effectiveCliSessionId = cfg.cliSessionId;
3554
+ let effectiveAdapterSessionId = adapterSessionId;
3555
+ const tier2ForceFresh = effectiveResume && consecutiveInWorkerRestarts >= 2;
3556
+ let tier1ProbeFalse = false;
3557
+ if (effectiveResume && !tier2ForceFresh) {
3558
+ const probe = cliAdapter.checkResumeTargetExists?.({
3559
+ sessionId: effectiveAdapterSessionId,
3560
+ cliSessionId: effectiveCliSessionId,
3561
+ workingDir: cfg.workingDir,
3562
+ dataDir: claudeDataDir,
3563
+ });
3564
+ if (probe === false)
3565
+ tier1ProbeFalse = true;
3566
+ }
3567
+ const fallBackToFresh = effectiveResume && (tier1ProbeFalse || tier2ForceFresh);
3568
+ if (fallBackToFresh) {
3569
+ const reason = tier2ForceFresh
3570
+ ? `consecutive restart x${consecutiveInWorkerRestarts} — 2nd failed resume attempt`
3571
+ : 'adapter confirmed resume target does not exist on disk';
3572
+ log(`Resume fallback: dropping --resume (${reason}) → fresh session ${cfg.sessionId}`);
3573
+ effectiveResume = false;
3574
+ effectiveCliSessionId = undefined;
3575
+ effectiveAdapterSessionId = cfg.sessionId;
3576
+ // Recompute the claude-family JSONL path: it now targets the FRESH
3577
+ // sessionId (fresh spawn creates <newSid>.jsonl, not the old one).
3578
+ if (claudeDataDir) {
3579
+ backend.claudeJsonlPath =
3580
+ claudeJsonlPathForSession(effectiveAdapterSessionId, cfg.workingDir, claudeDataDir);
3581
+ }
3582
+ // Single human-visible warning. Spam guard: at most once per worker
3583
+ // lifecycle (a 4× crash loop otherwise duplicates the notice).
3584
+ if (!resumeFallbackNotified) {
3585
+ resumeFallbackNotified = true;
3586
+ send({
3587
+ type: 'user_notify',
3588
+ turnId: currentBotmuxTurnId,
3589
+ message: `⚠️ 历史会话(${(cfg.cliSessionId ?? cfg.originalSessionId ?? cfg.sessionId).substring(0, 16)}…)` +
3590
+ `无法恢复,已为你**新起一个干净会话**(原因:${reason})。\n` +
3591
+ `之前的上下文不会带到本轮,需要的话请简述背景。`,
3592
+ });
3514
3593
  }
3594
+ // Reset the counter so the fresh spawn gets a clean 2-attempt budget in
3595
+ // case IT crashes later for an unrelated reason.
3596
+ consecutiveInWorkerRestarts = 0;
3515
3597
  }
3516
- if (claudeDataDir) {
3598
+ else if (claudeDataDir) {
3517
3599
  // Watch where the spawned CLI will actually write: the resumed conversation
3518
3600
  // when resuming, else the fresh session id (a stale cliSessionId would point
3519
3601
  // the bridge at the gone jsonl).
3520
- const bridgeWatchId = effectiveResume ? (cfg.cliSessionId ?? adapterSessionId) : adapterSessionId;
3602
+ const bridgeWatchId = effectiveResume
3603
+ ? (effectiveCliSessionId ?? effectiveAdapterSessionId)
3604
+ : effectiveAdapterSessionId;
3521
3605
  backend.claudeJsonlPath =
3522
3606
  claudeJsonlPathForSession(bridgeWatchId, cfg.workingDir, claudeDataDir);
3523
3607
  }
3608
+ // Publish the resolved resume semantics so any late-attach timer (hermes,
3609
+ // cursor, …) driven by codexBridgeStartTimer sees the SAME mode the spawn
3610
+ // used. Without this, Tier-1/Tier-2 fresh demotion would still use
3611
+ // `lastInitConfig.resume` (= true) and baseline an empty store, swallowing
3612
+ // the fresh session's first turn.
3613
+ lastSpawnEffectiveResume = effectiveResume;
3524
3614
  const args = cliAdapter.buildArgs({
3525
- sessionId: adapterSessionId,
3615
+ sessionId: effectiveAdapterSessionId,
3526
3616
  resume: effectiveResume,
3527
3617
  workingDir: cfg.workingDir,
3528
- resumeSessionId: cfg.cliSessionId,
3618
+ resumeSessionId: effectiveCliSessionId,
3529
3619
  initialPrompt: cfg.prompt || undefined,
3530
3620
  botName: cfg.botName,
3531
3621
  botOpenId: cfg.botOpenId,
@@ -3815,9 +3905,6 @@ function spawnCli(cfg) {
3815
3905
  };
3816
3906
  setTimeout(resolveCliPidLate, 120);
3817
3907
  }
3818
- // On tmux re-attach, keep awaitingFirstPrompt = true so screen updates are
3819
- // suppressed until the idle detector fires markNewTurn() — this prevents the
3820
- // full tmux scrollback history from leaking into the streaming card.
3821
3908
  // Bridge fallback: claude-code only. Tail Claude's transcript JSONL so a
3822
3909
  // turn the model finishes WITHOUT calling `botmux send` still gets its
3823
3910
  // assistant text forwarded to Lark (the gate in emitReadyTurns suppresses
@@ -3826,11 +3913,14 @@ function spawnCli(cfg) {
3826
3913
  // the file Claude creates on first submit isn't absorbed as history,
3827
3914
  // and baseline-existing on resume so prior-run turns ARE absorbed (we
3828
3915
  // don't want to re-emit yesterday's conversation as fresh turns).
3829
- if (claudeDataDir && adapterSessionId) {
3830
- // Use effectiveResume (not cfg.resume): the resume-fresh fallback above may
3831
- // have demoted a resume to fresh because the conversation jsonl was gone, so
3832
- // the bridge must watch the FRESH session's jsonl in fresh-empty mode.
3833
- const claudeBridgeSessionId = effectiveResume ? (cfg.cliSessionId ?? adapterSessionId) : adapterSessionId;
3916
+ //
3917
+ // NOTE: use effectiveResume / effectiveAdapterSessionId / effectiveCliSessionId
3918
+ // here, NOT cfg.* — the two-tier fallback above may have flipped
3919
+ // resume → FRESH, in which case the baseline mode and session id MUST
3920
+ // follow the flip. The same variables also cover Tier-2 (count-based)
3921
+ // fallbacks that fire for non-Claude CLIs (below).
3922
+ if (claudeDataDir && effectiveAdapterSessionId) {
3923
+ const claudeBridgeSessionId = effectiveCliSessionId ?? effectiveAdapterSessionId;
3834
3924
  const claudeJsonl = claudeJsonlPathForSession(claudeBridgeSessionId, cfg.workingDir, claudeDataDir);
3835
3925
  startBridgeWatcher(claudeJsonl, {
3836
3926
  cliPid: cliPid ?? undefined,
@@ -3845,17 +3935,21 @@ function spawnCli(cfg) {
3845
3935
  // discovered after the first submit; CoCo's events path is deterministic
3846
3936
  // from botmux sessionId. Hermes and MTR use SQLite stores, so baseline the
3847
3937
  // relevant cursor at spawn and poll for rows after each queued prompt flushes.
3938
+ //
3939
+ // Mode uses effectiveResume: when the resume probe flipped us to FRESH, we
3940
+ // must NOT baseline the "restored" cursor against an empty / absent store
3941
+ // (would otherwise swallow the fresh session's first turn).
3848
3942
  if (cfg.cliId === 'hermes') {
3849
- hermesBridgeAttach(cfg.resume ? 'baseline-existing' : 'fresh-empty');
3943
+ hermesBridgeAttach(effectiveResume ? 'baseline-existing' : 'fresh-empty');
3850
3944
  }
3851
3945
  else if (cfg.cliId === 'codex') {
3852
- if (cfg.cliSessionId) {
3853
- const rolloutPath = findCodexRolloutBySessionId(cfg.cliSessionId);
3946
+ if (effectiveCliSessionId) {
3947
+ const rolloutPath = findCodexRolloutBySessionId(effectiveCliSessionId);
3854
3948
  if (rolloutPath) {
3855
3949
  codexBridgeAttach(rolloutPath, 'baseline-existing');
3856
3950
  }
3857
3951
  else {
3858
- codexBridgePendingSessionId = cfg.cliSessionId;
3952
+ codexBridgePendingSessionId = effectiveCliSessionId;
3859
3953
  codexBridgeStartTimer();
3860
3954
  }
3861
3955
  }
@@ -3868,13 +3962,13 @@ function spawnCli(cfg) {
3868
3962
  // spawn (no cliSessionId yet) we just arm the poller; writeInput will
3869
3963
  // surface the cliSessionId on the first successful submit and trigger
3870
3964
  // codexBridgeNotifyCliSessionId → rollout attach.
3871
- if (cfg.cliSessionId) {
3872
- const rolloutPath = findTraexRolloutBySessionId(cfg.cliSessionId);
3965
+ if (effectiveCliSessionId) {
3966
+ const rolloutPath = findTraexRolloutBySessionId(effectiveCliSessionId);
3873
3967
  if (rolloutPath) {
3874
3968
  codexBridgeAttach(rolloutPath, 'baseline-existing');
3875
3969
  }
3876
3970
  else {
3877
- codexBridgePendingSessionId = cfg.cliSessionId;
3971
+ codexBridgePendingSessionId = effectiveCliSessionId;
3878
3972
  codexBridgeStartTimer();
3879
3973
  }
3880
3974
  }
@@ -3883,16 +3977,16 @@ function spawnCli(cfg) {
3883
3977
  }
3884
3978
  }
3885
3979
  else if (cfg.cliId === 'coco') {
3886
- const eventsPath = cocoEventsPathForSession(cfg.sessionId);
3887
- codexBridgeAttach(eventsPath, cfg.resume ? 'baseline-existing' : 'fresh-empty');
3980
+ const eventsPath = cocoEventsPathForSession(effectiveAdapterSessionId);
3981
+ codexBridgeAttach(eventsPath, effectiveResume ? 'baseline-existing' : 'fresh-empty');
3888
3982
  codexBridgeStartTimer();
3889
3983
  }
3890
3984
  else if (cfg.cliId === 'mtr') {
3891
- const mtrSessionId = cfg.cliSessionId ?? mtrSessionIdForBotmuxSession(cfg.sessionId);
3985
+ const mtrSessionId = effectiveCliSessionId ?? mtrSessionIdForBotmuxSession(effectiveAdapterSessionId);
3892
3986
  codexBridgePendingSessionId = mtrSessionId;
3893
3987
  const source = findMtrSessionById(mtrSessionId);
3894
3988
  if (source) {
3895
- mtrBridgeAttach(source, cfg.resume ? 'baseline-existing' : 'fresh-empty');
3989
+ mtrBridgeAttach(source, effectiveResume ? 'baseline-existing' : 'fresh-empty');
3896
3990
  }
3897
3991
  else {
3898
3992
  codexBridgeStartTimer();
@@ -4776,6 +4870,17 @@ process.on('message', async (raw) => {
4776
4870
  break;
4777
4871
  }
4778
4872
  log('Restart requested');
4873
+ // Tier-2 guard: 2nd consecutive in-worker restart forces FRESH.
4874
+ // Increment BEFORE spawnCli so the guard trips at count==2 (i.e. the
4875
+ // third attempted spawn in a 1-success → 2-failure sequence):
4876
+ // initial spawn (count=0) → fail → claude_exit → daemon sends restart
4877
+ // 1st restart (count=1) → resume still fails → restart
4878
+ // 2nd restart (count=2) → tier-2 kicks in → FRESH
4879
+ // Tier 1 probe (adapter.checkResumeTargetExists) is re-run on each
4880
+ // spawn, so even count=1 often short-circuits; tier-2 only catches
4881
+ // silent/race failures and adapters that don't implement the probe.
4882
+ consecutiveInWorkerRestarts++;
4883
+ log(`Restart count: ${consecutiveInWorkerRestarts} (>=2 forces FRESH)`);
4779
4884
  // Must destroySession(), not kill(): for persistent backends (tmux/herdr)
4780
4885
  // kill() only detaches — the backing session + CLI process keep running,
4781
4886
  // so the resume:true spawnCli below would re-attach to the SAME live CLI