botmux 2.70.0 → 2.71.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/dist/adapters/cli/claude-code.d.ts.map +1 -1
  2. package/dist/adapters/cli/claude-code.js +36 -0
  3. package/dist/adapters/cli/claude-code.js.map +1 -1
  4. package/dist/adapters/cli/codex.d.ts.map +1 -1
  5. package/dist/adapters/cli/codex.js +1 -0
  6. package/dist/adapters/cli/codex.js.map +1 -1
  7. package/dist/adapters/cli/types.d.ts +33 -0
  8. package/dist/adapters/cli/types.d.ts.map +1 -1
  9. package/dist/core/command-handler.d.ts +6 -4
  10. package/dist/core/command-handler.d.ts.map +1 -1
  11. package/dist/core/command-handler.js +59 -12
  12. package/dist/core/command-handler.js.map +1 -1
  13. package/dist/core/types.d.ts +16 -0
  14. package/dist/core/types.d.ts.map +1 -1
  15. package/dist/core/types.js.map +1 -1
  16. package/dist/core/worker-pool.d.ts.map +1 -1
  17. package/dist/core/worker-pool.js +20 -1
  18. package/dist/core/worker-pool.js.map +1 -1
  19. package/dist/daemon.d.ts.map +1 -1
  20. package/dist/daemon.js +125 -1
  21. package/dist/daemon.js.map +1 -1
  22. package/dist/i18n/en.d.ts.map +1 -1
  23. package/dist/i18n/en.js +4 -3
  24. package/dist/i18n/en.js.map +1 -1
  25. package/dist/i18n/zh.d.ts.map +1 -1
  26. package/dist/i18n/zh.js +4 -3
  27. package/dist/i18n/zh.js.map +1 -1
  28. package/dist/im/lark/card-builder.d.ts +6 -4
  29. package/dist/im/lark/card-builder.d.ts.map +1 -1
  30. package/dist/im/lark/card-builder.js +14 -7
  31. package/dist/im/lark/card-builder.js.map +1 -1
  32. package/dist/im/lark/card-handler.d.ts.map +1 -1
  33. package/dist/im/lark/card-handler.js +35 -4
  34. package/dist/im/lark/card-handler.js.map +1 -1
  35. package/dist/types.d.ts +8 -1
  36. package/dist/types.d.ts.map +1 -1
  37. package/dist/worker.js +152 -24
  38. package/dist/worker.js.map +1 -1
  39. package/package.json +1 -1
package/dist/worker.js CHANGED
@@ -71,6 +71,26 @@ let cliPidMarker = null; // path to .botmux-cli-pids/<pid>
71
71
  let sandboxStopWatcher = null; // stop fn for the sandbox outbox watcher
72
72
  let sandboxCleanup = null; // unmount overlays + rm the per-session sandbox tree
73
73
  let sandboxTeardownDone = false; // guards the exit-time best-effort teardown from double-running / running on suspend-for-resume
74
+ /** Counts consecutive in-worker restart cycles (see case 'restart'). Used by
75
+ * the SECONDARY guard so an adapter whose checkResumeTargetExists misses
76
+ * (returns undefined) or whose resume target vanishes between the check and
77
+ * spawn never crash-loops: 2nd consecutive restart → drop resume semantics,
78
+ * spawn fresh. Reset to 0 whenever spawnCli proceeds with a successful
79
+ * (non-forced) config, so healthy restarts (e.g. user `/restart`) are
80
+ * unaffected. */
81
+ let consecutiveInWorkerRestarts = 0;
82
+ /** Guard: user_notify for "resume → fresh fallback" is sent once per worker
83
+ * lifecycle so a 4× crash loop does not spam the Lark thread with 4 copies
84
+ * of the same warning. */
85
+ let resumeFallbackNotified = false;
86
+ /** The effectiveResume flag used by the most recent spawnCli call. Written
87
+ * immediately after the two-tier fallback check so late-attach timers
88
+ * (hermes, cursor, etc.) can read THE SAME semantics the spawn used,
89
+ * instead of re-deriving from lastInitConfig.resume (which never reflects
90
+ * Tier-1/Tier-2 fresh demotion). Updated in spawnCli BEFORE any bridge
91
+ * setup so even the tick that fires between spawnCli-start and the
92
+ * adapter's hermesBridgeAttach reads the correct mode. */
93
+ let lastSpawnEffectiveResume = false;
74
94
  let idleDetector = null;
75
95
  let isTmuxMode = false;
76
96
  /** Adopt-bridge mode using TmuxPipeBackend: not a tmux attach client, all
@@ -1541,8 +1561,12 @@ function codexBridgeStartTimer() {
1541
1561
  codexBridgeTimer = setInterval(() => {
1542
1562
  try {
1543
1563
  if (structuredBridgeIsHermes()) {
1564
+ // Use lastSpawnEffectiveResume (written by spawnCli AFTER the
1565
+ // two-tier fallback), NOT lastInitConfig.resume. Otherwise a
1566
+ // Tier-1/Tier-2 demotion to fresh would still baseline the empty
1567
+ // hermes store as "existing" and swallow the first turn.
1544
1568
  if (!hermesBridgeBaselineDone)
1545
- hermesBridgeAttach(lastInitConfig?.resume ? 'baseline-existing' : 'fresh-empty');
1569
+ hermesBridgeAttach(lastSpawnEffectiveResume ? 'baseline-existing' : 'fresh-empty');
1546
1570
  hermesBridgeIngest();
1547
1571
  if (isPromptReady)
1548
1572
  emitReadyCodexTurns();
@@ -2768,6 +2792,13 @@ function markPromptReady() {
2768
2792
  return;
2769
2793
  }
2770
2794
  isPromptReady = true;
2795
+ // CLI 实际启动成功(回到 prompt):复位连续重启计数。
2796
+ // 任何能到这一步的 spawn 都算"成功"——后续即便再崩溃(不是 resume 目标不存在
2797
+ // 的问题),下一轮也该有新的 2 次重试预算,而不是被历史重启计数卡住。
2798
+ if (consecutiveInWorkerRestarts > 0) {
2799
+ log(`CLI reached prompt successfully — resetting consecutive restart count (was ${consecutiveInWorkerRestarts})`);
2800
+ consecutiveInWorkerRestarts = 0;
2801
+ }
2771
2802
  // CLI is back at its prompt — every previously written input has been
2772
2803
  // consumed, so nothing is in flight anymore. A later crash must not
2773
2804
  // replay these.
@@ -3499,15 +3530,86 @@ function spawnCli(cfg) {
3499
3530
  log(`[sandbox] redirecting Claude bridge dataDir → overlay upper: ${redirected}`);
3500
3531
  claudeDataDir = redirected;
3501
3532
  }
3502
- if (claudeDataDir) {
3533
+ // ── Resume pre-flight check + two-tier fallback ──────────────────────────
3534
+ // Tier 1 (adapter probe): adapter.checkResumeTargetExists returns false
3535
+ // → skip --resume, spawn FRESH.
3536
+ // Tier 2 (restart count): 2nd consecutive in-worker restart → force FRESH,
3537
+ // regardless of probe result. This covers adapters without a probe AND
3538
+ // probe/spawn races (target vanishes between the check and spawn).
3539
+ //
3540
+ // Supersedes the claude-family-only inline probe (PR #189) with a
3541
+ // general adapter-owned check (cleaner boundary) + a numeric safety net.
3542
+ //
3543
+ // User impact: losing context is better than a 4× daemon-side crash loop
3544
+ // that leaves the bot stuck in "crashed N times" state until the human
3545
+ // re-closes the session.
3546
+ let effectiveResume = cfg.resume ?? false;
3547
+ let effectiveCliSessionId = cfg.cliSessionId;
3548
+ let effectiveAdapterSessionId = adapterSessionId;
3549
+ const tier2ForceFresh = effectiveResume && consecutiveInWorkerRestarts >= 2;
3550
+ let tier1ProbeFalse = false;
3551
+ if (effectiveResume && !tier2ForceFresh) {
3552
+ const probe = cliAdapter.checkResumeTargetExists?.({
3553
+ sessionId: effectiveAdapterSessionId,
3554
+ cliSessionId: effectiveCliSessionId,
3555
+ workingDir: cfg.workingDir,
3556
+ dataDir: claudeDataDir,
3557
+ });
3558
+ if (probe === false)
3559
+ tier1ProbeFalse = true;
3560
+ }
3561
+ const fallBackToFresh = effectiveResume && (tier1ProbeFalse || tier2ForceFresh);
3562
+ if (fallBackToFresh) {
3563
+ const reason = tier2ForceFresh
3564
+ ? `consecutive restart x${consecutiveInWorkerRestarts} — 2nd failed resume attempt`
3565
+ : 'adapter confirmed resume target does not exist on disk';
3566
+ log(`Resume fallback: dropping --resume (${reason}) → fresh session ${cfg.sessionId}`);
3567
+ effectiveResume = false;
3568
+ effectiveCliSessionId = undefined;
3569
+ effectiveAdapterSessionId = cfg.sessionId;
3570
+ // Recompute the claude-family JSONL path: it now targets the FRESH
3571
+ // sessionId (fresh spawn creates <newSid>.jsonl, not the old one).
3572
+ if (claudeDataDir) {
3573
+ backend.claudeJsonlPath =
3574
+ claudeJsonlPathForSession(effectiveAdapterSessionId, cfg.workingDir, claudeDataDir);
3575
+ }
3576
+ // Single human-visible warning. Spam guard: at most once per worker
3577
+ // lifecycle (a 4× crash loop otherwise duplicates the notice).
3578
+ if (!resumeFallbackNotified) {
3579
+ resumeFallbackNotified = true;
3580
+ send({
3581
+ type: 'user_notify',
3582
+ turnId: currentBotmuxTurnId,
3583
+ message: `⚠️ 历史会话(${(cfg.cliSessionId ?? cfg.originalSessionId ?? cfg.sessionId).substring(0, 16)}…)` +
3584
+ `无法恢复,已为你**新起一个干净会话**(原因:${reason})。\n` +
3585
+ `之前的上下文不会带到本轮,需要的话请简述背景。`,
3586
+ });
3587
+ }
3588
+ // Reset the counter so the fresh spawn gets a clean 2-attempt budget in
3589
+ // case IT crashes later for an unrelated reason.
3590
+ consecutiveInWorkerRestarts = 0;
3591
+ }
3592
+ else if (claudeDataDir) {
3593
+ // Watch where the spawned CLI will actually write: the resumed conversation
3594
+ // when resuming, else the fresh session id (a stale cliSessionId would point
3595
+ // the bridge at the gone jsonl).
3596
+ const bridgeWatchId = effectiveResume
3597
+ ? (effectiveCliSessionId ?? effectiveAdapterSessionId)
3598
+ : effectiveAdapterSessionId;
3503
3599
  backend.claudeJsonlPath =
3504
- claudeJsonlPathForSession(cfg.cliSessionId ?? adapterSessionId, cfg.workingDir, claudeDataDir);
3505
- }
3600
+ claudeJsonlPathForSession(bridgeWatchId, cfg.workingDir, claudeDataDir);
3601
+ }
3602
+ // Publish the resolved resume semantics so any late-attach timer (hermes,
3603
+ // cursor, …) driven by codexBridgeStartTimer sees the SAME mode the spawn
3604
+ // used. Without this, Tier-1/Tier-2 fresh demotion would still use
3605
+ // `lastInitConfig.resume` (= true) and baseline an empty store, swallowing
3606
+ // the fresh session's first turn.
3607
+ lastSpawnEffectiveResume = effectiveResume;
3506
3608
  const args = cliAdapter.buildArgs({
3507
- sessionId: adapterSessionId,
3508
- resume: cfg.resume ?? false,
3609
+ sessionId: effectiveAdapterSessionId,
3610
+ resume: effectiveResume,
3509
3611
  workingDir: cfg.workingDir,
3510
- resumeSessionId: cfg.cliSessionId,
3612
+ resumeSessionId: effectiveCliSessionId,
3511
3613
  initialPrompt: cfg.prompt || undefined,
3512
3614
  botName: cfg.botName,
3513
3615
  botOpenId: cfg.botOpenId,
@@ -3797,9 +3899,6 @@ function spawnCli(cfg) {
3797
3899
  };
3798
3900
  setTimeout(resolveCliPidLate, 120);
3799
3901
  }
3800
- // On tmux re-attach, keep awaitingFirstPrompt = true so screen updates are
3801
- // suppressed until the idle detector fires markNewTurn() — this prevents the
3802
- // full tmux scrollback history from leaking into the streaming card.
3803
3902
  // Bridge fallback: claude-code only. Tail Claude's transcript JSONL so a
3804
3903
  // turn the model finishes WITHOUT calling `botmux send` still gets its
3805
3904
  // assistant text forwarded to Lark (the gate in emitReadyTurns suppresses
@@ -3808,13 +3907,19 @@ function spawnCli(cfg) {
3808
3907
  // the file Claude creates on first submit isn't absorbed as history,
3809
3908
  // and baseline-existing on resume so prior-run turns ARE absorbed (we
3810
3909
  // don't want to re-emit yesterday's conversation as fresh turns).
3811
- if (claudeDataDir && adapterSessionId) {
3812
- const claudeBridgeSessionId = cfg.cliSessionId ?? adapterSessionId;
3910
+ //
3911
+ // NOTE: use effectiveResume / effectiveAdapterSessionId / effectiveCliSessionId
3912
+ // here, NOT cfg.* — the two-tier fallback above may have flipped
3913
+ // resume → FRESH, in which case the baseline mode and session id MUST
3914
+ // follow the flip. The same variables also cover Tier-2 (count-based)
3915
+ // fallbacks that fire for non-Claude CLIs (below).
3916
+ if (claudeDataDir && effectiveAdapterSessionId) {
3917
+ const claudeBridgeSessionId = effectiveCliSessionId ?? effectiveAdapterSessionId;
3813
3918
  const claudeJsonl = claudeJsonlPathForSession(claudeBridgeSessionId, cfg.workingDir, claudeDataDir);
3814
3919
  startBridgeWatcher(claudeJsonl, {
3815
3920
  cliPid: cliPid ?? undefined,
3816
3921
  cliCwd: cfg.workingDir,
3817
- mode: cfg.resume ? 'baseline-existing' : 'fresh-empty',
3922
+ mode: effectiveResume ? 'baseline-existing' : 'fresh-empty',
3818
3923
  dataDir: claudeDataDir,
3819
3924
  });
3820
3925
  }
@@ -3824,17 +3929,21 @@ function spawnCli(cfg) {
3824
3929
  // discovered after the first submit; CoCo's events path is deterministic
3825
3930
  // from botmux sessionId. Hermes and MTR use SQLite stores, so baseline the
3826
3931
  // relevant cursor at spawn and poll for rows after each queued prompt flushes.
3932
+ //
3933
+ // Mode uses effectiveResume: when the resume probe flipped us to FRESH, we
3934
+ // must NOT baseline the "restored" cursor against an empty / absent store
3935
+ // (would otherwise swallow the fresh session's first turn).
3827
3936
  if (cfg.cliId === 'hermes') {
3828
- hermesBridgeAttach(cfg.resume ? 'baseline-existing' : 'fresh-empty');
3937
+ hermesBridgeAttach(effectiveResume ? 'baseline-existing' : 'fresh-empty');
3829
3938
  }
3830
3939
  else if (cfg.cliId === 'codex') {
3831
- if (cfg.cliSessionId) {
3832
- const rolloutPath = findCodexRolloutBySessionId(cfg.cliSessionId);
3940
+ if (effectiveCliSessionId) {
3941
+ const rolloutPath = findCodexRolloutBySessionId(effectiveCliSessionId);
3833
3942
  if (rolloutPath) {
3834
3943
  codexBridgeAttach(rolloutPath, 'baseline-existing');
3835
3944
  }
3836
3945
  else {
3837
- codexBridgePendingSessionId = cfg.cliSessionId;
3946
+ codexBridgePendingSessionId = effectiveCliSessionId;
3838
3947
  codexBridgeStartTimer();
3839
3948
  }
3840
3949
  }
@@ -3847,13 +3956,13 @@ function spawnCli(cfg) {
3847
3956
  // spawn (no cliSessionId yet) we just arm the poller; writeInput will
3848
3957
  // surface the cliSessionId on the first successful submit and trigger
3849
3958
  // codexBridgeNotifyCliSessionId → rollout attach.
3850
- if (cfg.cliSessionId) {
3851
- const rolloutPath = findTraexRolloutBySessionId(cfg.cliSessionId);
3959
+ if (effectiveCliSessionId) {
3960
+ const rolloutPath = findTraexRolloutBySessionId(effectiveCliSessionId);
3852
3961
  if (rolloutPath) {
3853
3962
  codexBridgeAttach(rolloutPath, 'baseline-existing');
3854
3963
  }
3855
3964
  else {
3856
- codexBridgePendingSessionId = cfg.cliSessionId;
3965
+ codexBridgePendingSessionId = effectiveCliSessionId;
3857
3966
  codexBridgeStartTimer();
3858
3967
  }
3859
3968
  }
@@ -3862,16 +3971,16 @@ function spawnCli(cfg) {
3862
3971
  }
3863
3972
  }
3864
3973
  else if (cfg.cliId === 'coco') {
3865
- const eventsPath = cocoEventsPathForSession(cfg.sessionId);
3866
- codexBridgeAttach(eventsPath, cfg.resume ? 'baseline-existing' : 'fresh-empty');
3974
+ const eventsPath = cocoEventsPathForSession(effectiveAdapterSessionId);
3975
+ codexBridgeAttach(eventsPath, effectiveResume ? 'baseline-existing' : 'fresh-empty');
3867
3976
  codexBridgeStartTimer();
3868
3977
  }
3869
3978
  else if (cfg.cliId === 'mtr') {
3870
- const mtrSessionId = cfg.cliSessionId ?? mtrSessionIdForBotmuxSession(cfg.sessionId);
3979
+ const mtrSessionId = effectiveCliSessionId ?? mtrSessionIdForBotmuxSession(effectiveAdapterSessionId);
3871
3980
  codexBridgePendingSessionId = mtrSessionId;
3872
3981
  const source = findMtrSessionById(mtrSessionId);
3873
3982
  if (source) {
3874
- mtrBridgeAttach(source, cfg.resume ? 'baseline-existing' : 'fresh-empty');
3983
+ mtrBridgeAttach(source, effectiveResume ? 'baseline-existing' : 'fresh-empty');
3875
3984
  }
3876
3985
  else {
3877
3986
  codexBridgeStartTimer();
@@ -4738,6 +4847,14 @@ process.on('message', async (raw) => {
4738
4847
  isPromptReady = false;
4739
4848
  idleDetector?.reset();
4740
4849
  log(`Passthrough slash command: ${msg.content}`);
4850
+ // Follow-up rides on the SAME IPC (see DaemonToWorker.raw_input) so it
4851
+ // cannot race the 200ms text→Enter window above. Enqueue only after the
4852
+ // Enter landed: sendToPty queues it as the next turn (type-ahead /
4853
+ // pendingMessages), exactly like a Lark message arriving while busy.
4854
+ if (msg.followUpContent) {
4855
+ sendToPty(msg.followUpContent);
4856
+ log(`Enqueued follow-up after raw input (${msg.followUpContent.length} chars)`);
4857
+ }
4741
4858
  }
4742
4859
  break;
4743
4860
  }
@@ -4747,6 +4864,17 @@ process.on('message', async (raw) => {
4747
4864
  break;
4748
4865
  }
4749
4866
  log('Restart requested');
4867
+ // Tier-2 guard: 2nd consecutive in-worker restart forces FRESH.
4868
+ // Increment BEFORE spawnCli so the guard trips at count==2 (i.e. the
4869
+ // third attempted spawn, reached after two consecutive failed spawns):
4870
+ // initial spawn (count=0) → fail → claude_exit → daemon sends restart
4871
+ // 1st restart (count=1) → resume still fails → restart
4872
+ // 2nd restart (count=2) → tier-2 kicks in → FRESH
4873
+ // Tier 1 probe (adapter.checkResumeTargetExists) is re-run on each
4874
+ // spawn, so even count=1 often short-circuits; tier-2 only catches
4875
+ // silent/race failures and adapters that don't implement the probe.
4876
+ consecutiveInWorkerRestarts++;
4877
+ log(`Restart count: ${consecutiveInWorkerRestarts} (>=2 forces FRESH)`);
4750
4878
  // Must destroySession(), not kill(): for persistent backends (tmux/herdr)
4751
4879
  // kill() only detaches — the backing session + CLI process keep running,
4752
4880
  // so the resume:true spawnCli below would re-attach to the SAME live CLI