@integrity-labs/agt-cli 0.27.150-test.15 → 0.27.150

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,10 +15,11 @@ import {
15
15
  provisionAutoKanbanProgressHook,
16
16
  provisionIsolationHook,
17
17
  provisionOrientHook,
18
+ provisionSessionStateHook,
18
19
  provisionStopHook,
19
20
  requireHost,
20
21
  safeWriteJsonAtomic
21
- } from "../chunk-24FTY53Z.js";
22
+ } from "../chunk-CUHP2SVW.js";
22
23
  import {
23
24
  getProjectDir as getProjectDir2,
24
25
  getReadyTasks,
@@ -37,10 +38,13 @@ import {
37
38
  getSessionState,
38
39
  injectMessage,
39
40
  injectMessageWithStatus,
41
+ isAgentIdle,
40
42
  isAgentPromptReady,
41
43
  isSessionHealthy,
44
+ isStaleForToday,
42
45
  paneLogPath,
43
46
  parseEnvIntegrations,
47
+ peekCurrentSession,
44
48
  prepareForRespawn,
45
49
  probeMcpEnvSubstitution,
46
50
  readPaneLogTail,
@@ -49,12 +53,14 @@ import {
49
53
  rotateSessionForWedge,
50
54
  sanitizeMcpJson,
51
55
  sendToAgent,
56
+ sessionTranscriptDir,
52
57
  startPersistentSession,
53
58
  stopAllSessionsAndWait,
54
59
  stopPersistentSession,
55
60
  takeWatchdogGiveUpCount,
56
- takeZombieDetection
57
- } from "../chunk-7GKJZBTB.js";
61
+ takeZombieDetection,
62
+ transcriptActivityAgeSeconds
63
+ } from "../chunk-JLS7NQFE.js";
58
64
  import {
59
65
  KANBAN_CHECK_COMMAND,
60
66
  MAX_AVATAR_ENV_URL_BYTES,
@@ -83,13 +89,7 @@ import {
83
89
  resolveDmTarget,
84
90
  worseConnectivityOutcome,
85
91
  wrapScheduledTaskPrompt
86
- } from "../chunk-WOOYOAPG.js";
87
- import {
88
- isAgentIdle,
89
- isStaleForToday,
90
- peekCurrentSession,
91
- sessionTranscriptDir
92
- } from "../chunk-354FAVQR.js";
92
+ } from "../chunk-A75AOK6E.js";
93
93
  import {
94
94
  parsePsRows,
95
95
  reapOrphanChannelMcps
@@ -1515,6 +1515,7 @@ function isUrgentUpgrade(opts) {
1515
1515
  }
1516
1516
  var RESTART_IDLE_THRESHOLD_SECONDS = 120;
1517
1517
  var RESTART_INBOUND_QUIET_SECONDS = 300;
1518
+ var RESTART_TRANSCRIPT_STALE_SECONDS = 60;
1518
1519
  var GATEABLE_RESTART_REASONS = /* @__PURE__ */ new Set([
1519
1520
  "model-change",
1520
1521
  "channel-set-change",
@@ -1533,9 +1534,11 @@ function decideRestartGate(opts) {
1533
1534
  }
1534
1535
  const paneThreshold = opts.idleThresholdSeconds ?? RESTART_IDLE_THRESHOLD_SECONDS;
1535
1536
  const inboundThreshold = opts.inboundQuietSeconds ?? RESTART_INBOUND_QUIET_SECONDS;
1536
- const paneBusy = opts.paneLogAgeSeconds !== null && opts.paneLogAgeSeconds < paneThreshold;
1537
+ const transcriptThreshold = opts.transcriptStaleSeconds ?? RESTART_TRANSCRIPT_STALE_SECONDS;
1538
+ const transcriptAge = opts.transcriptAgeSeconds ?? null;
1539
+ const progressBusy = transcriptAge !== null ? transcriptAge < transcriptThreshold : opts.paneLogAgeSeconds !== null && opts.paneLogAgeSeconds < paneThreshold;
1537
1540
  const inboundBusy = opts.inboundAgeSeconds !== null && opts.inboundAgeSeconds < inboundThreshold;
1538
- if (paneBusy || inboundBusy) return "defer-idle";
1541
+ if (progressBusy || inboundBusy) return "defer-idle";
1539
1542
  return "proceed";
1540
1543
  }
1541
1544
 
@@ -3377,13 +3380,18 @@ function clearAgentState(agentId, codeName) {
3377
3380
  // src/lib/wedge-detection.ts
3378
3381
  var DEFAULTS = {
3379
3382
  inboundWaitSeconds: 120,
3380
- // ENG-6238: the hard cap is now an ABSOLUTE BACKSTOP, not the primary
3381
- // discriminator — a still-producing session (e.g. a runaway loop) is only
3382
- // reaped here. Raised 300→1200 because the soft path now reliably catches
3383
- // the frozen-turn wedge via the transcript signal, so the hard cap no longer
3384
- // needs to fire early (which is exactly what false-killed kylie's long
3385
- // legitimate turns, ENG-6238).
3386
- inboundHardWaitSeconds: 1200,
3383
+ // ENG-6264: DISABLED by default (0). A session that's actively producing
3384
+ // tokens is never force-respawned — a working agent must not be killed just
3385
+ // because a message has been queued behind its turn, no matter how long.
3386
+ // ENG-6238 made this an absolute backstop (1200s) to still catch a
3387
+ // producing-but-never-draining runaway loop, but that re-introduced the exact
3388
+ // failure we set out to kill: cutting off real work on a long turn. Runaway
3389
+ // token burn is owned by the cost guardrail (ENG-5556); a producing-but-silent
3390
+ // loop still trips the synthetic-probe alarm. So the backstop is now opt-in:
3391
+ // set AGT_WEDGE_INBOUND_HARD_WAIT_SECONDS to a positive value to re-enable it
3392
+ // (floored at inboundWaitSeconds). 0 = the frozen/hung wedge (transcript
3393
+ // static) is still caught by the soft path; only the *producing* path is spared.
3394
+ inboundHardWaitSeconds: 0,
3387
3395
  paneStaleSeconds: 120,
3388
3396
  transcriptStaleSeconds: 60,
3389
3397
  minCycles: 3
@@ -3402,10 +3410,12 @@ function resolveWedgeConfig(env = process.env) {
3402
3410
  DEFAULTS.inboundWaitSeconds,
3403
3411
  30
3404
3412
  );
3405
- const inboundHardWaitSeconds = Math.max(
3406
- inboundWaitSeconds,
3407
- parsePositiveInt(env.AGT_WEDGE_INBOUND_HARD_WAIT_SECONDS, DEFAULTS.inboundHardWaitSeconds, 30)
3413
+ const inboundHardWaitRaw = parsePositiveInt(
3414
+ env.AGT_WEDGE_INBOUND_HARD_WAIT_SECONDS,
3415
+ DEFAULTS.inboundHardWaitSeconds,
3416
+ 0
3408
3417
  );
3418
+ const inboundHardWaitSeconds = inboundHardWaitRaw <= 0 ? 0 : Math.max(inboundWaitSeconds, inboundHardWaitRaw);
3409
3419
  return {
3410
3420
  mode: parseMode(env.AGT_WEDGE_RESTART_MODE),
3411
3421
  inboundWaitSeconds,
@@ -3430,6 +3440,7 @@ function isWedgeCandidateCycle(signals, config2) {
3430
3440
  if (inboundAge === null) return false;
3431
3441
  if (inboundAge < config2.inboundWaitSeconds) return false;
3432
3442
  if (isSessionProducing(signals, config2)) {
3443
+ if (config2.inboundHardWaitSeconds <= 0) return false;
3433
3444
  return inboundAge >= config2.inboundHardWaitSeconds;
3434
3445
  }
3435
3446
  return true;
@@ -4417,11 +4428,16 @@ function paneLogAgeSecondsFor(codeName) {
4417
4428
  return 0;
4418
4429
  }
4419
4430
  }
4420
- function restartGateFor(codeName, breakerReason) {
4421
- if (!isGateableRestartReason(breakerReason)) return "bypass";
4431
+ function transcriptAgeSecondsFor(codeName) {
4432
+ const sessionId = getSessionState(codeName)?.currentSessionId ?? null;
4433
+ return transcriptActivityAgeSeconds(getProjectDir2(codeName), sessionId, /* @__PURE__ */ new Date());
4434
+ }
4435
+ function restartGateFor(codeName, reason) {
4436
+ if (!isGateableRestartReason(reason)) return "bypass";
4422
4437
  return decideRestartGate({
4423
4438
  window: cachedMaintenanceWindow,
4424
4439
  paneLogAgeSeconds: paneLogAgeSecondsFor(codeName),
4440
+ transcriptAgeSeconds: transcriptAgeSecondsFor(codeName),
4425
4441
  inboundAgeSeconds: inboundAgeSecondsFor(codeName),
4426
4442
  now: /* @__PURE__ */ new Date()
4427
4443
  });
@@ -4580,10 +4596,10 @@ async function runAgentConnectivityProbes(agent, integrations, projectDir) {
4580
4596
  );
4581
4597
  }
4582
4598
  }
4583
- function stopPersistentSessionAndForgetMcpBaseline(codeName, breakerReason) {
4584
- const gate = restartGateFor(codeName, breakerReason);
4599
+ function stopPersistentSessionAndForgetMcpBaseline(codeName, breakerReason, gateReason = breakerReason) {
4600
+ const gate = restartGateFor(codeName, gateReason);
4585
4601
  if (gate !== "bypass" && gate !== "proceed") {
4586
- log(`[maintenance-window] Deferring '${breakerReason}' restart for '${codeName}' (${gate})`);
4602
+ log(`[maintenance-window] Deferring '${gateReason}' restart for '${codeName}' (${gate})`);
4587
4603
  return;
4588
4604
  }
4589
4605
  cancelPendingSessionRestart(codeName);
@@ -4760,7 +4776,7 @@ var cachedMaintenanceWindow = null;
4760
4776
  var lastVersionCheckAt = 0;
4761
4777
  var VERSION_CHECK_INTERVAL_MS = 5 * 60 * 1e3;
4762
4778
  var lastResponsivenessProbeAt = 0;
4763
- var agtCliVersion = true ? "0.27.150-test.15" : "dev";
4779
+ var agtCliVersion = true ? "0.27.150" : "dev";
4764
4780
  function resolveBrewPath(execFileSync4) {
4765
4781
  try {
4766
4782
  const out = execFileSync4("which", ["brew"], { timeout: 5e3 }).toString().trim();
@@ -5958,7 +5974,7 @@ async function pollCycle() {
5958
5974
  }
5959
5975
  try {
5960
5976
  const { detectHostSecurity } = await import("../host-security-6PDFG7F5.js");
5961
- const { collectDiagnostics } = await import("../persistent-session-35PWSTLO.js");
5977
+ const { collectDiagnostics } = await import("../persistent-session-ZLEK4KBF.js");
5962
5978
  const diagCodeNames = [...agentState.persistentSessionAgents];
5963
5979
  const agentDiagnostics = diagCodeNames.length > 0 ? collectDiagnostics(diagCodeNames) : void 0;
5964
5980
  let tailscaleHostname;
@@ -6045,12 +6061,12 @@ async function pollCycle() {
6045
6061
  const {
6046
6062
  collectResponsivenessProbes,
6047
6063
  getResponsivenessIntervalMs
6048
- } = await import("../responsiveness-probe-MA4M2QM4.js");
6064
+ } = await import("../responsiveness-probe-3EUNCJDU.js");
6049
6065
  const probeIntervalMs = getResponsivenessIntervalMs();
6050
6066
  if (now - lastResponsivenessProbeAt > probeIntervalMs) {
6051
6067
  const probeCodeNames = [...agentState.persistentSessionAgents];
6052
6068
  if (probeCodeNames.length > 0) {
6053
- const { takeAcpxExecFailureCount, creditAcpxExecFailureCount } = await import("../persistent-session-35PWSTLO.js");
6069
+ const { takeAcpxExecFailureCount, creditAcpxExecFailureCount } = await import("../persistent-session-ZLEK4KBF.js");
6054
6070
  const drainedGiveUps = /* @__PURE__ */ new Map();
6055
6071
  const drainedAcpxFailures = /* @__PURE__ */ new Map();
6056
6072
  const probes = collectResponsivenessProbes(probeCodeNames).map((p) => {
@@ -6084,8 +6100,7 @@ async function pollCycle() {
6084
6100
  collectResponsivenessProbes,
6085
6101
  livePendingInboundOldestAgeSeconds,
6086
6102
  deadLetterPendingInbound
6087
- } = await import("../responsiveness-probe-MA4M2QM4.js");
6088
- const { transcriptActivityAgeSeconds } = await import("../daily-session-PNQX5URX.js");
6103
+ } = await import("../responsiveness-probe-3EUNCJDU.js");
6089
6104
  const { getProjectDir: wedgeProjectDir } = await import("../claude-scheduler-FATCLHDM.js");
6090
6105
  const wedgeNow = /* @__PURE__ */ new Date();
6091
6106
  const liveAgents = agentState.persistentSessionAgents;
@@ -6149,6 +6164,21 @@ async function pollCycle() {
6149
6164
  log(
6150
6165
  `[wedge] forced fresh respawn ${detail} \u2192 new session ${newId} (transcript preserved${deadNote})`
6151
6166
  );
6167
+ const wedgeAgentId = agentState.codeNameToAgentId.get(codeName);
6168
+ if (wedgeAgentId) {
6169
+ api.post("/host/wedge-respawn", {
6170
+ agent_id: wedgeAgentId,
6171
+ code_name: codeName,
6172
+ dead_lettered_count: deadLettered,
6173
+ pane_age_seconds: signals.paneActivityAgeSeconds,
6174
+ inbound_age_seconds: signals.pendingInboundOldestAgeSeconds,
6175
+ transcript_age_seconds: transcriptAge
6176
+ }).catch((err) => {
6177
+ log(
6178
+ `[wedge] failed to record respawn event for '${codeName}' (ENG-6265): ${err.message} \u2014 respawn proceeded; CloudWatch metric will under-count this event`
6179
+ );
6180
+ });
6181
+ }
6152
6182
  const inProgressCardIds = (kanbanBoardCache.get(codeName) ?? []).filter((item) => item.status === "in_progress" && isHybridActionable(item)).map((item) => item.id);
6153
6183
  const cardStates = wedgeRestartsByCard.get(codeName) ?? /* @__PURE__ */ new Map();
6154
6184
  const { next, newlyPoisoned } = recordWedgeForCards(
@@ -7868,7 +7898,19 @@ async function processAgent(agent, agentStates) {
7868
7898
  // isolated and the agent keeps running degraded instead of being
7869
7899
  // paused wholesale. reaperRestartBreakerReason() encodes that
7870
7900
  // single-vs-multi decision; undefined means "restart, don't count".
7871
- stopSession: (codeName, ctx) => stopPersistentSessionAndForgetMcpBaseline(codeName, reaperRestartBreakerReason(ctx.activeKeys)),
7901
+ //
7902
+ // ENG-6264: the breaker-count reason (above, undefined for a single dead
7903
+ // MCP) is decoupled from the GATE reason. Pre-6264 an undefined breaker
7904
+ // reason also made the restart non-gateable → 'bypass' → the session was
7905
+ // torn down mid-turn (the common single-MCP case interrupted busy
7906
+ // agents). Always pass 'mcp-presence-reaper' as the gate reason so the
7907
+ // restart defers-until-idle, while breakerReason still governs whether it
7908
+ // counts against the breaker.
7909
+ stopSession: (codeName, ctx) => stopPersistentSessionAndForgetMcpBaseline(
7910
+ codeName,
7911
+ reaperRestartBreakerReason(ctx.activeKeys),
7912
+ "mcp-presence-reaper"
7913
+ ),
7872
7914
  // ENG-5292: when the reaper gives up on a managed MCP (cap from
7873
7915
  // ENG-5279 + state-preservation from ENG-5285 both said "this
7874
7916
  // MCP keeps failing after 3 restart cycles"), mark the matching
@@ -9087,6 +9129,11 @@ ${truncateForLog(ctx.tail)}` : `; pane_tail_hash=sha256:${createHash3("sha256").
9087
9129
  } catch (err) {
9088
9130
  log(`[persistent-session] Failed to provision auto-progress hook for '${codeName}': ${err.message}`);
9089
9131
  }
9132
+ try {
9133
+ provisionSessionStateHook(codeName);
9134
+ } catch (err) {
9135
+ log(`[persistent-session] Failed to provision session-state hook for '${codeName}': ${err.message}`);
9136
+ }
9090
9137
  const sessionRunResult = await startRun({
9091
9138
  agent_id: agent.agent_id,
9092
9139
  source_type: "system",
@@ -10615,7 +10662,7 @@ async function processClaudePairSessions(agents) {
10615
10662
  killPairSession,
10616
10663
  pairTmuxSession,
10617
10664
  finalizeClaudePairOnboarding
10618
- } = await import("../claude-pair-runtime-GIUCD7IG.js");
10665
+ } = await import("../claude-pair-runtime-3ZIOY3Z5.js");
10619
10666
  for (const pairId of pendingResp.cancelled_pair_ids ?? []) {
10620
10667
  log(`[claude-pair] sweeping orphan tmux session for pair ${pairId.slice(0, 8)}`);
10621
10668
  const killed = await killPairSession(pairTmuxSession(pairId));