botmux 2.71.0 → 2.71.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/cli/claude-code.d.ts.map +1 -1
- package/dist/adapters/cli/claude-code.js +35 -0
- package/dist/adapters/cli/claude-code.js.map +1 -1
- package/dist/adapters/cli/types.d.ts +28 -0
- package/dist/adapters/cli/types.d.ts.map +1 -1
- package/dist/core/command-handler.d.ts +1 -1
- package/dist/core/command-handler.d.ts.map +1 -1
- package/dist/core/command-handler.js +5 -1
- package/dist/core/command-handler.js.map +1 -1
- package/dist/core/reply-target.d.ts +10 -0
- package/dist/core/reply-target.d.ts.map +1 -1
- package/dist/core/reply-target.js +12 -0
- package/dist/core/reply-target.js.map +1 -1
- package/dist/core/worker-pool.d.ts.map +1 -1
- package/dist/core/worker-pool.js +6 -2
- package/dist/core/worker-pool.js.map +1 -1
- package/dist/im/lark/card-handler.d.ts +1 -1
- package/dist/im/lark/card-handler.d.ts.map +1 -1
- package/dist/im/lark/card-handler.js +10 -2
- package/dist/im/lark/card-handler.js.map +1 -1
- package/dist/services/claude-transcript.d.ts +17 -0
- package/dist/services/claude-transcript.d.ts.map +1 -1
- package/dist/services/claude-transcript.js +72 -0
- package/dist/services/claude-transcript.js.map +1 -1
- package/dist/worker.js +142 -37
- package/dist/worker.js.map +1 -1
- package/package.json +1 -1
package/dist/worker.js
CHANGED
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
import { randomBytes } from 'node:crypto';
|
|
16
16
|
import { mkdirSync, writeFileSync, unlinkSync, existsSync, statSync, readdirSync, readlinkSync, readFileSync, watch as fsWatch, createWriteStream } from 'node:fs';
|
|
17
17
|
import { isAbsolute, join } from 'node:path';
|
|
18
|
-
import { drainTranscript, joinAssistantText, findJsonlContainingFingerprint, findJsonlsContainingExactContent, findLatestJsonl, extractLastAssistantTurn, extractTurnStartText, splitTranscriptEventsByCutoff } from './services/claude-transcript.js';
|
|
18
|
+
import { drainTranscript, joinAssistantText, trailingAssistantText, findJsonlContainingFingerprint, findJsonlsContainingExactContent, findLatestJsonl, extractLastAssistantTurn, extractTurnStartText, splitTranscriptEventsByCutoff } from './services/claude-transcript.js';
|
|
19
19
|
import { BridgeTurnQueue, makeFingerprint, normaliseForFingerprint } from './services/bridge-turn-queue.js';
|
|
20
20
|
import { shouldSuppressBridgeEmit } from './services/bridge-fallback-gate.js';
|
|
21
21
|
import { shouldWriteNow } from './utils/input-gate.js';
|
|
@@ -71,6 +71,26 @@ let cliPidMarker = null; // path to .botmux-cli-pids/<pid>
|
|
|
71
71
|
let sandboxStopWatcher = null; // stop fn for the sandbox outbox watcher
|
|
72
72
|
let sandboxCleanup = null; // unmount overlays + rm the per-session sandbox tree
|
|
73
73
|
let sandboxTeardownDone = false; // guards the exit-time best-effort teardown from double-running / running on suspend-for-resume
|
|
74
|
+
/** Counts consecutive in-worker restart cycles (see case 'restart'). Used by
|
|
75
|
+
* the SECONDARY guard so an adapter whose checkResumeTargetExists misses
|
|
76
|
+
* (returns undefined) or whose resume target vanishes between the check and
|
|
77
|
+
* spawn never crash-loops: 2nd consecutive restart → drop resume semantics,
|
|
78
|
+
* spawn fresh. Reset to 0 once the spawned CLI actually reaches its prompt
|
|
79
|
+
* (see markPromptReady) or right after a fallback to fresh, so healthy
|
|
80
|
+
* restarts (e.g. user `/restart`) are unaffected. */
|
|
81
|
+
let consecutiveInWorkerRestarts = 0;
|
|
82
|
+
/** Guard: user_notify for "resume → fresh fallback" is sent once per worker
|
|
83
|
+
* lifecycle so a 4× crash loop does not spam the Lark thread with 4 copies
|
|
84
|
+
* of the same warning. */
|
|
85
|
+
let resumeFallbackNotified = false;
|
|
86
|
+
/** The effectiveResume flag used by the most recent spawnCli call. Written
|
|
87
|
+
* immediately after the two-tier fallback check so late-attach timers
|
|
88
|
+
* (hermes, cursor, etc.) can read THE SAME semantics the spawn used,
|
|
89
|
+
* instead of re-deriving from lastInitConfig.resume (which never reflects
|
|
90
|
+
* Tier-1/Tier-2 fresh demotion). Updated in spawnCli BEFORE any bridge
|
|
91
|
+
* setup so even the tick that fires between spawnCli-start and the
|
|
92
|
+
* adapter's hermesBridgeAttach reads the correct mode. */
|
|
93
|
+
let lastSpawnEffectiveResume = false;
|
|
74
94
|
let idleDetector = null;
|
|
75
95
|
let isTmuxMode = false;
|
|
76
96
|
/** Adopt-bridge mode using TmuxPipeBackend: not a tmux attach client, all
|
|
@@ -1416,7 +1436,13 @@ function emitReadyTurns() {
|
|
|
1416
1436
|
}
|
|
1417
1437
|
const set = new Set(turn.assistantUuids);
|
|
1418
1438
|
const matched = drained.events.filter(e => e.uuid && set.has(e.uuid));
|
|
1419
|
-
|
|
1439
|
+
// Non-adopt fallback posts the turn's FINAL answer (text after the last
|
|
1440
|
+
// tool_use), not the whole-turn narration collage — joining every interim
|
|
1441
|
+
// block both reads as noise in Lark and inflates finalText past the
|
|
1442
|
+
// material-longer gate, re-posting turns the model already `botmux send`ed.
|
|
1443
|
+
// Adopt keeps the full join: transcript drain is that mode's only channel,
|
|
1444
|
+
// so interim narration is the user's only window into the turn.
|
|
1445
|
+
const assistantText = adoptMode ? joinAssistantText(matched) : trailingAssistantText(drained.events, turn.assistantUuids);
|
|
1420
1446
|
if (assistantText.length === 0)
|
|
1421
1447
|
continue;
|
|
1422
1448
|
const lastUuid = turn.assistantUuids[turn.assistantUuids.length - 1];
|
|
@@ -1541,8 +1567,12 @@ function codexBridgeStartTimer() {
|
|
|
1541
1567
|
codexBridgeTimer = setInterval(() => {
|
|
1542
1568
|
try {
|
|
1543
1569
|
if (structuredBridgeIsHermes()) {
|
|
1570
|
+
// Use lastSpawnEffectiveResume (written by spawnCli AFTER the
|
|
1571
|
+
// two-tier fallback), NOT lastInitConfig.resume. Otherwise a
|
|
1572
|
+
// Tier-1/Tier-2 demotion to fresh would still baseline the empty
|
|
1573
|
+
// hermes store as "existing" and swallow the first turn.
|
|
1544
1574
|
if (!hermesBridgeBaselineDone)
|
|
1545
|
-
hermesBridgeAttach(
|
|
1575
|
+
hermesBridgeAttach(lastSpawnEffectiveResume ? 'baseline-existing' : 'fresh-empty');
|
|
1546
1576
|
hermesBridgeIngest();
|
|
1547
1577
|
if (isPromptReady)
|
|
1548
1578
|
emitReadyCodexTurns();
|
|
@@ -2768,6 +2798,13 @@ function markPromptReady() {
|
|
|
2768
2798
|
return;
|
|
2769
2799
|
}
|
|
2770
2800
|
isPromptReady = true;
|
|
2801
|
+
// CLI 实际启动成功(回到 prompt):复位连续重启计数。
|
|
2802
|
+
// 任何能到这一步的 spawn 都算"成功"——后续即便再崩溃(不是 resume 目标不存在
|
|
2803
|
+
// 的问题),下一轮也该有新的 2 次重试预算,而不是被历史重启计数卡住。
|
|
2804
|
+
if (consecutiveInWorkerRestarts > 0) {
|
|
2805
|
+
log(`CLI reached prompt successfully — resetting consecutive restart count (was ${consecutiveInWorkerRestarts})`);
|
|
2806
|
+
consecutiveInWorkerRestarts = 0;
|
|
2807
|
+
}
|
|
2771
2808
|
// CLI is back at its prompt — every previously written input has been
|
|
2772
2809
|
// consumed, so nothing is in flight anymore. A later crash must not
|
|
2773
2810
|
// replay these.
|
|
@@ -3499,33 +3536,86 @@ function spawnCli(cfg) {
|
|
|
3499
3536
|
log(`[sandbox] redirecting Claude bridge dataDir → overlay upper: ${redirected}`);
|
|
3500
3537
|
claudeDataDir = redirected;
|
|
3501
3538
|
}
|
|
3502
|
-
// Resume-
|
|
3503
|
-
//
|
|
3504
|
-
//
|
|
3505
|
-
//
|
|
3506
|
-
//
|
|
3507
|
-
//
|
|
3539
|
+
// ── Resume pre-flight check + two-tier fallback ──────────────────────────
|
|
3540
|
+
// Tier 1 (adapter probe): adapter.checkResumeTargetExists returns false
|
|
3541
|
+
// → skip --resume, spawn FRESH.
|
|
3542
|
+
// Tier 2 (restart count): 2nd consecutive in-worker restart → force FRESH,
|
|
3543
|
+
// regardless of probe result. This covers adapters without a probe AND
|
|
3544
|
+
// probe/spawn races (target vanishes between the check and spawn).
|
|
3545
|
+
//
|
|
3546
|
+
// Supersedes the claude-family-only inline probe (PR #189) with a
|
|
3547
|
+
// general adapter-owned check (cleaner boundary) + a numeric safety net.
|
|
3548
|
+
//
|
|
3549
|
+
// User impact: losing context is better than a 4× daemon-side crash loop
|
|
3550
|
+
// that leaves the bot stuck in "crashed N times" state until the human
|
|
3551
|
+
// re-closes the session.
|
|
3508
3552
|
let effectiveResume = cfg.resume ?? false;
|
|
3509
|
-
|
|
3510
|
-
|
|
3511
|
-
|
|
3512
|
-
|
|
3513
|
-
|
|
3553
|
+
let effectiveCliSessionId = cfg.cliSessionId;
|
|
3554
|
+
let effectiveAdapterSessionId = adapterSessionId;
|
|
3555
|
+
const tier2ForceFresh = effectiveResume && consecutiveInWorkerRestarts >= 2;
|
|
3556
|
+
let tier1ProbeFalse = false;
|
|
3557
|
+
if (effectiveResume && !tier2ForceFresh) {
|
|
3558
|
+
const probe = cliAdapter.checkResumeTargetExists?.({
|
|
3559
|
+
sessionId: effectiveAdapterSessionId,
|
|
3560
|
+
cliSessionId: effectiveCliSessionId,
|
|
3561
|
+
workingDir: cfg.workingDir,
|
|
3562
|
+
dataDir: claudeDataDir,
|
|
3563
|
+
});
|
|
3564
|
+
if (probe === false)
|
|
3565
|
+
tier1ProbeFalse = true;
|
|
3566
|
+
}
|
|
3567
|
+
const fallBackToFresh = effectiveResume && (tier1ProbeFalse || tier2ForceFresh);
|
|
3568
|
+
if (fallBackToFresh) {
|
|
3569
|
+
const reason = tier2ForceFresh
|
|
3570
|
+
? `consecutive restart x${consecutiveInWorkerRestarts} — 2nd failed resume attempt`
|
|
3571
|
+
: 'adapter confirmed resume target does not exist on disk';
|
|
3572
|
+
log(`Resume fallback: dropping --resume (${reason}) → fresh session ${cfg.sessionId}`);
|
|
3573
|
+
effectiveResume = false;
|
|
3574
|
+
effectiveCliSessionId = undefined;
|
|
3575
|
+
effectiveAdapterSessionId = cfg.sessionId;
|
|
3576
|
+
// Recompute the claude-family JSONL path: it now targets the FRESH
|
|
3577
|
+
// sessionId (fresh spawn creates <newSid>.jsonl, not the old one).
|
|
3578
|
+
if (claudeDataDir) {
|
|
3579
|
+
backend.claudeJsonlPath =
|
|
3580
|
+
claudeJsonlPathForSession(effectiveAdapterSessionId, cfg.workingDir, claudeDataDir);
|
|
3581
|
+
}
|
|
3582
|
+
// Single human-visible warning. Spam guard: at most once per worker
|
|
3583
|
+
// lifecycle (a 4× crash loop otherwise duplicates the notice).
|
|
3584
|
+
if (!resumeFallbackNotified) {
|
|
3585
|
+
resumeFallbackNotified = true;
|
|
3586
|
+
send({
|
|
3587
|
+
type: 'user_notify',
|
|
3588
|
+
turnId: currentBotmuxTurnId,
|
|
3589
|
+
message: `⚠️ 历史会话(${(cfg.cliSessionId ?? cfg.originalSessionId ?? cfg.sessionId).substring(0, 16)}…)` +
|
|
3590
|
+
`无法恢复,已为你**新起一个干净会话**(原因:${reason})。\n` +
|
|
3591
|
+
`之前的上下文不会带到本轮,需要的话请简述背景。`,
|
|
3592
|
+
});
|
|
3514
3593
|
}
|
|
3594
|
+
// Reset the counter so the fresh spawn gets a clean 2-attempt budget in
|
|
3595
|
+
// case IT crashes later for an unrelated reason.
|
|
3596
|
+
consecutiveInWorkerRestarts = 0;
|
|
3515
3597
|
}
|
|
3516
|
-
if (claudeDataDir) {
|
|
3598
|
+
else if (claudeDataDir) {
|
|
3517
3599
|
// Watch where the spawned CLI will actually write: the resumed conversation
|
|
3518
3600
|
// when resuming, else the fresh session id (a stale cliSessionId would point
|
|
3519
3601
|
// the bridge at the gone jsonl).
|
|
3520
|
-
const bridgeWatchId = effectiveResume
|
|
3602
|
+
const bridgeWatchId = effectiveResume
|
|
3603
|
+
? (effectiveCliSessionId ?? effectiveAdapterSessionId)
|
|
3604
|
+
: effectiveAdapterSessionId;
|
|
3521
3605
|
backend.claudeJsonlPath =
|
|
3522
3606
|
claudeJsonlPathForSession(bridgeWatchId, cfg.workingDir, claudeDataDir);
|
|
3523
3607
|
}
|
|
3608
|
+
// Publish the resolved resume semantics so any late-attach timer (hermes,
|
|
3609
|
+
// cursor, …) driven by codexBridgeStartTimer sees the SAME mode the spawn
|
|
3610
|
+
// used. Without this, Tier-1/Tier-2 fresh demotion would still use
|
|
3611
|
+
// `lastInitConfig.resume` (= true) and baseline an empty store, swallowing
|
|
3612
|
+
// the fresh session's first turn.
|
|
3613
|
+
lastSpawnEffectiveResume = effectiveResume;
|
|
3524
3614
|
const args = cliAdapter.buildArgs({
|
|
3525
|
-
sessionId:
|
|
3615
|
+
sessionId: effectiveAdapterSessionId,
|
|
3526
3616
|
resume: effectiveResume,
|
|
3527
3617
|
workingDir: cfg.workingDir,
|
|
3528
|
-
resumeSessionId:
|
|
3618
|
+
resumeSessionId: effectiveCliSessionId,
|
|
3529
3619
|
initialPrompt: cfg.prompt || undefined,
|
|
3530
3620
|
botName: cfg.botName,
|
|
3531
3621
|
botOpenId: cfg.botOpenId,
|
|
@@ -3815,9 +3905,6 @@ function spawnCli(cfg) {
|
|
|
3815
3905
|
};
|
|
3816
3906
|
setTimeout(resolveCliPidLate, 120);
|
|
3817
3907
|
}
|
|
3818
|
-
// On tmux re-attach, keep awaitingFirstPrompt = true so screen updates are
|
|
3819
|
-
// suppressed until the idle detector fires markNewTurn() — this prevents the
|
|
3820
|
-
// full tmux scrollback history from leaking into the streaming card.
|
|
3821
3908
|
// Bridge fallback: claude-code only. Tail Claude's transcript JSONL so a
|
|
3822
3909
|
// turn the model finishes WITHOUT calling `botmux send` still gets its
|
|
3823
3910
|
// assistant text forwarded to Lark (the gate in emitReadyTurns suppresses
|
|
@@ -3826,11 +3913,14 @@ function spawnCli(cfg) {
|
|
|
3826
3913
|
// the file Claude creates on first submit isn't absorbed as history,
|
|
3827
3914
|
// and baseline-existing on resume so prior-run turns ARE absorbed (we
|
|
3828
3915
|
// don't want to re-emit yesterday's conversation as fresh turns).
|
|
3829
|
-
|
|
3830
|
-
|
|
3831
|
-
|
|
3832
|
-
|
|
3833
|
-
|
|
3916
|
+
//
|
|
3917
|
+
// NOTE: use effectiveResume / effectiveAdapterSessionId / effectiveCliSessionId
|
|
3918
|
+
// here, NOT cfg.* — the two-tier fallback above may have flipped
|
|
3919
|
+
// resume → FRESH, in which case the baseline mode and session id MUST
|
|
3920
|
+
// follow the flip. The same variables also cover Tier-2 (count-based)
|
|
3921
|
+
// fallbacks that fire for non-Claude CLIs (below).
|
|
3922
|
+
if (claudeDataDir && effectiveAdapterSessionId) {
|
|
3923
|
+
const claudeBridgeSessionId = effectiveCliSessionId ?? effectiveAdapterSessionId;
|
|
3834
3924
|
const claudeJsonl = claudeJsonlPathForSession(claudeBridgeSessionId, cfg.workingDir, claudeDataDir);
|
|
3835
3925
|
startBridgeWatcher(claudeJsonl, {
|
|
3836
3926
|
cliPid: cliPid ?? undefined,
|
|
@@ -3845,17 +3935,21 @@ function spawnCli(cfg) {
|
|
|
3845
3935
|
// discovered after the first submit; CoCo's events path is deterministic
|
|
3846
3936
|
// from botmux sessionId. Hermes and MTR use SQLite stores, so baseline the
|
|
3847
3937
|
// relevant cursor at spawn and poll for rows after each queued prompt flushes.
|
|
3938
|
+
//
|
|
3939
|
+
// Mode uses effectiveResume: when the resume probe flipped us to FRESH, we
|
|
3940
|
+
// must NOT baseline the "restored" cursor against an empty / absent store
|
|
3941
|
+
// (would otherwise swallow the fresh session's first turn).
|
|
3848
3942
|
if (cfg.cliId === 'hermes') {
|
|
3849
|
-
hermesBridgeAttach(
|
|
3943
|
+
hermesBridgeAttach(effectiveResume ? 'baseline-existing' : 'fresh-empty');
|
|
3850
3944
|
}
|
|
3851
3945
|
else if (cfg.cliId === 'codex') {
|
|
3852
|
-
if (
|
|
3853
|
-
const rolloutPath = findCodexRolloutBySessionId(
|
|
3946
|
+
if (effectiveCliSessionId) {
|
|
3947
|
+
const rolloutPath = findCodexRolloutBySessionId(effectiveCliSessionId);
|
|
3854
3948
|
if (rolloutPath) {
|
|
3855
3949
|
codexBridgeAttach(rolloutPath, 'baseline-existing');
|
|
3856
3950
|
}
|
|
3857
3951
|
else {
|
|
3858
|
-
codexBridgePendingSessionId =
|
|
3952
|
+
codexBridgePendingSessionId = effectiveCliSessionId;
|
|
3859
3953
|
codexBridgeStartTimer();
|
|
3860
3954
|
}
|
|
3861
3955
|
}
|
|
@@ -3868,13 +3962,13 @@ function spawnCli(cfg) {
|
|
|
3868
3962
|
// spawn (no cliSessionId yet) we just arm the poller; writeInput will
|
|
3869
3963
|
// surface the cliSessionId on the first successful submit and trigger
|
|
3870
3964
|
// codexBridgeNotifyCliSessionId → rollout attach.
|
|
3871
|
-
if (
|
|
3872
|
-
const rolloutPath = findTraexRolloutBySessionId(
|
|
3965
|
+
if (effectiveCliSessionId) {
|
|
3966
|
+
const rolloutPath = findTraexRolloutBySessionId(effectiveCliSessionId);
|
|
3873
3967
|
if (rolloutPath) {
|
|
3874
3968
|
codexBridgeAttach(rolloutPath, 'baseline-existing');
|
|
3875
3969
|
}
|
|
3876
3970
|
else {
|
|
3877
|
-
codexBridgePendingSessionId =
|
|
3971
|
+
codexBridgePendingSessionId = effectiveCliSessionId;
|
|
3878
3972
|
codexBridgeStartTimer();
|
|
3879
3973
|
}
|
|
3880
3974
|
}
|
|
@@ -3883,16 +3977,16 @@ function spawnCli(cfg) {
|
|
|
3883
3977
|
}
|
|
3884
3978
|
}
|
|
3885
3979
|
else if (cfg.cliId === 'coco') {
|
|
3886
|
-
const eventsPath = cocoEventsPathForSession(
|
|
3887
|
-
codexBridgeAttach(eventsPath,
|
|
3980
|
+
const eventsPath = cocoEventsPathForSession(effectiveAdapterSessionId);
|
|
3981
|
+
codexBridgeAttach(eventsPath, effectiveResume ? 'baseline-existing' : 'fresh-empty');
|
|
3888
3982
|
codexBridgeStartTimer();
|
|
3889
3983
|
}
|
|
3890
3984
|
else if (cfg.cliId === 'mtr') {
|
|
3891
|
-
const mtrSessionId =
|
|
3985
|
+
const mtrSessionId = effectiveCliSessionId ?? mtrSessionIdForBotmuxSession(effectiveAdapterSessionId);
|
|
3892
3986
|
codexBridgePendingSessionId = mtrSessionId;
|
|
3893
3987
|
const source = findMtrSessionById(mtrSessionId);
|
|
3894
3988
|
if (source) {
|
|
3895
|
-
mtrBridgeAttach(source,
|
|
3989
|
+
mtrBridgeAttach(source, effectiveResume ? 'baseline-existing' : 'fresh-empty');
|
|
3896
3990
|
}
|
|
3897
3991
|
else {
|
|
3898
3992
|
codexBridgeStartTimer();
|
|
@@ -4776,6 +4870,17 @@ process.on('message', async (raw) => {
|
|
|
4776
4870
|
break;
|
|
4777
4871
|
}
|
|
4778
4872
|
log('Restart requested');
|
|
4873
|
+
// Tier-2 guard: 2nd consecutive in-worker restart forces FRESH.
|
|
4874
|
+
// Increment BEFORE spawnCli so the guard trips at count==2 (i.e. the
|
|
4875
|
+
// third attempted spawn, after the initial spawn and the 1st restart both failed):
|
|
4876
|
+
// initial spawn (count=0) → fail → claude_exit → daemon sends restart
|
|
4877
|
+
// 1st restart (count=1) → resume still fails → restart
|
|
4878
|
+
// 2nd restart (count=2) → tier-2 kicks in → FRESH
|
|
4879
|
+
// Tier 1 probe (adapter.checkResumeTargetExists) is re-run on each
|
|
4880
|
+
// spawn, so even count=1 often short-circuits; tier-2 only catches
|
|
4881
|
+
// silent/race failures and adapters that don't implement the probe.
|
|
4882
|
+
consecutiveInWorkerRestarts++;
|
|
4883
|
+
log(`Restart count: ${consecutiveInWorkerRestarts} (>=2 forces FRESH)`);
|
|
4779
4884
|
// Must destroySession(), not kill(): for persistent backends (tmux/herdr)
|
|
4780
4885
|
// kill() only detaches — the backing session + CLI process keep running,
|
|
4781
4886
|
// so the resume:true spawnCli below would re-attach to the SAME live CLI
|