brainclaw 1.7.2 → 1.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,14 @@ import { spawnSync } from 'node:child_process';
5
5
  import yaml from 'yaml';
6
6
  import { MEMORY_DIR } from './io.js';
7
7
  import { detectHostExecutionProfile, } from './execution-profile.js';
8
+ import { getCapabilityProfile } from './agent-capability.js';
9
+ /**
10
+ * trp#427 — cold-start CLI `--version` probes need headroom; a 3s timeout
11
+ * produced a false negative for claude-code on first launch. The spawnable check (binary on
12
+ * PATH) is the robust signal, so this only affects version-string capture
13
+ * latency, not the installed/spawnable decision.
14
+ */
15
+ const VERSION_PROBE_TIMEOUT_MS = 8000;
8
16
  function tryCommand(command, args, timeout = 5000) {
9
17
  try {
10
18
  const r = spawnSync(command, args, { encoding: 'utf-8', timeout, windowsHide: true });
@@ -14,12 +22,40 @@ function tryCommand(command, args, timeout = 5000) {
14
22
  return { ok: false, stdout: '' };
15
23
  }
16
24
  }
25
/**
 * trp#427 — check whether a command name resolves on PATH using the platform
 * lookup tool (`where` on Windows, `which` everywhere else).
 *
 * The lookup tool itself is spawned synchronously (3s timeout), but the target
 * binary is never launched — that is the difference from a `--version` probe.
 *
 * @param {string} binary - Command name to look up.
 * @returns {boolean} `true` only when the lookup exits with status 0 and prints
 *   something other than whitespace; `false` for a falsy name, a failed lookup,
 *   or any error thrown while running it.
 */
function isBinaryOnPath(binary) {
    if (!binary) {
        return false;
    }
    try {
        const lookupTool = process.platform === 'win32' ? 'where' : 'which';
        const lookup = spawnSync(lookupTool, [binary], { encoding: 'utf-8', timeout: 3000, windowsHide: true });
        if (lookup.status !== 0) {
            return false;
        }
        // stdout can be null/undefined when the lookup tool could not be started.
        const resolvedPaths = (lookup.stdout ?? '').trim();
        return resolvedPaths.length > 0;
    }
    catch {
        return false;
    }
}
41
/**
 * trp#427 — decide whether brainclaw can spawn the named agent from the CLI.
 *
 * An agent counts as spawnable when all three hold:
 *   1. a capability profile exists for it,
 *   2. the profile's `runtime.canBeSpawnedCli` flag is truthy and it declares
 *      an `invoke_binary`,
 *   3. that binary resolves on PATH (see `isBinaryOnPath`).
 *
 * No `--version` probe is run here, so a slow cold-start CLI cannot make this
 * check report the agent as undispatchable.
 *
 * @param {string} agentName - Agent identifier passed to `getCapabilityProfile`.
 * @returns {boolean} Whether the agent's invoke binary is spawnable.
 */
export function detectSpawnable(agentName) {
    const capability = getCapabilityProfile(agentName);
    if (!capability) {
        return false;
    }
    if (!capability.runtime?.canBeSpawnedCli) {
        return false;
    }
    const invokeBinary = capability.invoke_binary;
    if (!invokeBinary) {
        return false;
    }
    return isBinaryOnPath(invokeBinary);
}
17
53
  const AGENT_DEFINITIONS = [
18
54
  {
19
55
  name: 'claude-code',
20
56
  detect: (_home, env) => {
21
57
  // Check if claude CLI is available
22
- const cli = tryCommand('claude', ['--version'], 3000);
58
+ const cli = tryCommand('claude', ['--version'], VERSION_PROBE_TIMEOUT_MS);
23
59
  if (cli.ok) {
24
60
  const ver = cli.stdout.trim().match(/(\d+\.\d+\.\d+)/)?.[1];
25
61
  return { installed: true, method: 'claude CLI', version: ver };
@@ -81,7 +117,7 @@ const AGENT_DEFINITIONS = [
81
117
  if (fs.existsSync(codexDir)) {
82
118
  return { installed: true, method: '~/.codex directory' };
83
119
  }
84
- const cli = tryCommand('codex', ['--version'], 3000);
120
+ const cli = tryCommand('codex', ['--version'], VERSION_PROBE_TIMEOUT_MS);
85
121
  if (cli.ok) {
86
122
  const ver = cli.stdout.trim().match(/(\d+\.\d+\.\d+)/)?.[1];
87
123
  return { installed: true, method: 'codex CLI', version: ver };
@@ -252,7 +288,7 @@ const AGENT_DEFINITIONS = [
252
288
  if (fs.existsSync(path.join(home, '.gemini', 'antigravity'))) {
253
289
  return { installed: true, method: '~/.gemini/antigravity directory' };
254
290
  }
255
- const cli = tryCommand('gemini', ['--version'], 3000);
291
+ const cli = tryCommand('gemini', ['--version'], VERSION_PROBE_TIMEOUT_MS);
256
292
  if (cli.ok) {
257
293
  return { installed: true, method: 'gemini CLI', version: cli.stdout.trim() };
258
294
  }
@@ -309,7 +345,7 @@ const AGENT_DEFINITIONS = [
309
345
  if (fs.existsSync(path.join(home, '.hermes'))) {
310
346
  return { installed: true, method: '~/.hermes directory' };
311
347
  }
312
- const cli = tryCommand('hermes', ['--version'], 3000);
348
+ const cli = tryCommand('hermes', ['--version'], VERSION_PROBE_TIMEOUT_MS);
313
349
  if (cli.ok) {
314
350
  return { installed: true, method: 'hermes CLI', version: cli.stdout.trim() };
315
351
  }
@@ -332,14 +368,23 @@ const AGENT_DEFINITIONS = [
332
368
  /**
333
369
  * Detect ALL installed agents on this machine (not just the running one).
334
370
  */
335
- export function buildAgentInventory(homeDir = os.homedir(), env = process.env) {
371
+ export function buildAgentInventory(homeDir = os.homedir(), env = process.env, opts = {}) {
372
+ const spawnableResolver = opts.spawnableResolver ?? detectSpawnable;
336
373
  const agents = AGENT_DEFINITIONS.map(def => {
337
374
  const detection = def.detect(homeDir, env);
375
+ const spawnable = spawnableResolver(def.name);
376
+ // trp#427: an agent brainclaw can spawn (invoke binary on PATH) IS installed,
377
+ // even when the cold-start `--version` probe timed out. This decouples the
378
+ // dispatch decision (getInstalledAgentNames) from probe latency.
379
+ const installed = detection.installed || spawnable;
338
380
  return {
339
381
  name: def.name,
340
- installed: detection.installed,
341
- detection_method: detection.method,
382
+ installed,
383
+ detection_method: detection.installed
384
+ ? detection.method
385
+ : (spawnable ? 'spawnable: invoke binary on PATH' : detection.method),
342
386
  version: detection.version,
387
+ spawnable,
343
388
  models: def.models,
344
389
  native_tools: def.native_tools,
345
390
  mcp_support: def.mcp_support,
@@ -415,6 +460,8 @@ export function renderAgentInventorySummary(inventory) {
415
460
  features.push('Rules');
416
461
  if (agent.hooks_support)
417
462
  features.push('Hooks');
463
+ if (agent.spawnable)
464
+ features.push('Spawnable');
418
465
  lines.push(` Features: ${features.join(', ') || 'none'}`);
419
466
  if (agent.instruction_file) {
420
467
  lines.push(` Instructions: ${agent.instruction_file}`);
@@ -34,11 +34,11 @@
34
34
  */
35
35
  import { spawnSync } from 'node:child_process';
36
36
  import { loadAgentRun, transitionAgentRun, listAgentRuns } from './agentruns.js';
37
- import { loadClaim } from './claims.js';
37
+ import { loadClaim, releaseClaim } from './claims.js';
38
38
  import { loadAssignment } from './assignments.js';
39
39
  import { createRuntimeEvent } from './events.js';
40
40
  import { nowISO } from './ids.js';
41
- import { readHeartbeat, readLogTail, signalExists } from './runtime-signals.js';
41
+ import { readHeartbeat, readLogTail, signalExists, latestActivityMs } from './runtime-signals.js';
42
42
  // ── Constants ──────────────────────────────────────────────────────────────
43
43
  /**
44
44
  * Minimum age before a run is eligible for reconciliation. Below this, the
@@ -175,11 +175,59 @@ export function collectEvidence(run, cwd, options) {
175
175
  heartbeat_age_ms = now - hb.mtimeMs;
176
176
  }
177
177
  catch { /* defensive */ }
178
+ // pln#527 — filesystem-activity liveness (logs + worktree). Independent of the
179
+ // heartbeat: a worker can be actively editing files / streaming to stderr while
180
+ // its heartbeat is frozen (written once at step 0).
181
+ let fs_activity_age_ms;
182
+ try {
183
+ const lastFs = latestActivityMs(signalRoot, run.assignment_id, run.worktree_path);
184
+ if (lastFs !== undefined)
185
+ fs_activity_age_ms = now - lastFs;
186
+ }
187
+ catch { /* defensive */ }
178
188
  return {
179
189
  age_ms, has_post_start_commit, claim_released, assignment_completed, process_alive,
180
- completed_signal, failed_signal, heartbeat_exists, heartbeat_age_ms,
190
+ completed_signal, failed_signal, heartbeat_exists, heartbeat_age_ms, fs_activity_age_ms,
181
191
  };
182
192
  }
193
/**
 * pln#527 — report whether the evidence records filesystem activity more
 * recent than `windowMs`. Callers use this to veto a `stalled` verdict: a stale
 * heartbeat combined with fresh filesystem activity is treated as "working".
 *
 * @param {{ fs_activity_age_ms?: number }} evidence - Evidence record; the age
 *   is left undefined when no filesystem activity timestamp was available.
 * @param {number} windowMs - Exclusive upper bound on the activity age, in ms.
 * @returns {boolean} `true` when an age is present and strictly below `windowMs`.
 */
function fsActiveWithin(evidence, windowMs) {
    const activityAgeMs = evidence.fs_activity_age_ms;
    if (activityAgeMs === undefined) {
        return false;
    }
    return activityAgeMs < windowMs;
}
201
/**
 * trp#433 — garbage-collect the claim linked to a run that was just reconciled
 * to `failed`, so dead runs stop leaving active claims behind for manual cleanup.
 *
 * Behaviour:
 *   - does nothing when the run has no `claim_id`;
 *   - releases the claim only if it loads and its status is `'active'`, which
 *     makes repeated calls harmless;
 *   - after a release, records a `run_failed` runtime event tagged
 *     `reconciler` / `gc` / `claim-release`;
 *   - swallows every error: this cleanup must never break reconciliation.
 *
 * @param {object} run - Agent run record (`id`, `claim_id`, `session_id`, `assignment_id` are read).
 * @param {string} actor - Recorded as the `agent` of the emitted runtime event.
 * @param {string} cwd - Project directory forwarded to the claim/event helpers.
 * @returns {void}
 */
function cascadeReleaseOnFailure(run, actor, cwd) {
    const claimId = run.claim_id;
    if (!claimId) {
        return;
    }
    try {
        const linkedClaim = loadClaim(claimId, cwd);
        const isActive = Boolean(linkedClaim) && linkedClaim.status === 'active';
        if (!isActive) {
            return;
        }
        releaseClaim(claimId, cwd);
        const gcEvent = {
            agent: actor,
            session_id: run.session_id,
            event_type: 'run_failed',
            text: `Auto-released claim ${run.claim_id} after run ${run.id} was reconciled to failed (trp#433 GC cascade)`,
            tags: ['reconciler', 'gc', 'claim-release'],
            assignment_id: run.assignment_id,
            run_id: run.id,
            claim_id: run.claim_id,
            status_reason: 'gc_cascade_release_on_failure',
        };
        createRuntimeEvent(gcEvent, cwd);
    }
    catch {
        // Best-effort only — a failed release must not abort reconciliation.
    }
}
183
231
  function anyCompletionEvidence(evidence) {
184
232
  return evidence.completed_signal
185
233
  || evidence.has_post_start_commit
@@ -328,6 +376,7 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
328
376
  const failHere = (reason) => {
329
377
  try {
330
378
  transitionAgentRun(runId, 'failed', { actor, status_reason: reason }, cwd);
379
+ cascadeReleaseOnFailure(run, actor, cwd);
331
380
  return { run_id: runId, action: 'inferred_failed', reason, evidence, previous_status, current_status: 'failed' };
332
381
  }
333
382
  catch (err) {
@@ -342,9 +391,18 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
342
391
  if (evidence.failed_signal) {
343
392
  return failHere(`failed_silent: wrapper reported non-zero exit${logTailSuffix(run, cwd)}`);
344
393
  }
345
- // Heartbeat present but stale → reached the loop then went silent.
394
+ // Heartbeat present but stale → reached the loop then went silent — UNLESS the
395
+ // filesystem shows recent activity (pln#527): a frozen heartbeat with fresh
396
+ // log/worktree writes means the worker is mid-operation, not hung.
346
397
  if (evidence.heartbeat_exists && evidence.heartbeat_age_ms !== undefined && evidence.heartbeat_age_ms >= heartbeatStale) {
347
- return failHere(`stalled: heartbeat last seen ${Math.round(evidence.heartbeat_age_ms / 1000)}s ago${logTailSuffix(run, cwd)}`);
398
+ if (fsActiveWithin(evidence, heartbeatStale)) {
399
+ return {
400
+ run_id: runId, action: 'no_op',
401
+ reason: `heartbeat stale (${Math.round(evidence.heartbeat_age_ms / 1000)}s) but fs active ${Math.round((evidence.fs_activity_age_ms ?? 0) / 1000)}s ago — working, not stalled`,
402
+ evidence, previous_status, current_status: run.status,
403
+ };
404
+ }
405
+ return failHere(`stalled: heartbeat last seen ${Math.round(evidence.heartbeat_age_ms / 1000)}s ago, no fs activity${logTailSuffix(run, cwd)}`);
348
406
  }
349
407
  // Fresh heartbeat → alive; trust it over the untrustworthy wrapper pid.
350
408
  if (evidence.heartbeat_exists) {
@@ -416,6 +474,7 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
416
474
  const failRun = (reason) => {
417
475
  try {
418
476
  transitionAgentRun(run.id, 'failed', { actor, status_reason: reason }, cwd);
477
+ cascadeReleaseOnFailure(run, actor, cwd);
419
478
  return { run_id: run.id, action: 'inferred_failed', reason, evidence, previous_status: run.status, current_status: 'failed' };
420
479
  }
421
480
  catch (err) {
@@ -458,7 +517,14 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
458
517
  // 3. Heartbeat present but STALE → the worker reached its loop then went
459
518
  // silent (e.g. hung). pid-independent: a hung worker keeps the wrapper alive.
460
519
  if (evidence.heartbeat_exists && evidence.heartbeat_age_ms !== undefined && evidence.heartbeat_age_ms >= heartbeatStale) {
461
- return failRun(`stalled: heartbeat last seen ${Math.round(evidence.heartbeat_age_ms / 1000)}s ago${logTailSuffix(run, cwd)}`);
520
+ if (fsActiveWithin(evidence, heartbeatStale)) {
521
+ return {
522
+ run_id: run.id, action: 'no_op',
523
+ reason: `heartbeat stale (${Math.round(evidence.heartbeat_age_ms / 1000)}s) but fs active ${Math.round((evidence.fs_activity_age_ms ?? 0) / 1000)}s ago — working, not stalled`,
524
+ evidence, previous_status: run.status, current_status: run.status,
525
+ };
526
+ }
527
+ return failRun(`stalled: heartbeat last seen ${Math.round(evidence.heartbeat_age_ms / 1000)}s ago, no fs activity${logTailSuffix(run, cwd)}`);
462
528
  }
463
529
  // 4. Fresh heartbeat → the worker is alive and working; trust it OVER the
464
530
  // (untrustworthy) wrapper pid. This is the can_f792cacd fix: never fail a
@@ -44,13 +44,19 @@ function defaultRunGit(cwd, args) {
44
44
  return { ok: false, stdout: '' };
45
45
  }
46
46
  }
47
- /** True for coordination/store paths that are dirty as a side effect of dispatching. */
47
/**
 * Top-level directories whose dirtiness is a by-product of coordination or
 * agent tooling and is never part of a dispatch's code scope:
 *   - `.brainclaw`, `.git` — coordination store and VCS metadata.
 *   - `.claude`, `.cursor`, `.codex` — per-agent local config (trp#371); a
 *     worker leaving these dirty must not block an otherwise-safe dispatch of
 *     an unrelated code scope.
 */
const SYSTEM_DIRTY_DIRS = ['.brainclaw', '.git', '.claude', '.cursor', '.codex'];
/**
 * Tell whether a path is one of the system directories above, or lives inside one.
 *
 * Backslashes are converted to forward slashes first, so Windows-style paths
 * are handled. Only the first path segment is compared: `.git/config` matches,
 * while `.gitignore` and `src/.git/x` do not.
 *
 * @param {string} p - Path to classify.
 * @returns {boolean} `true` when the first segment is a system dirty directory.
 */
export function isSystemDirtyPath(p) {
    const posixPath = p.replace(/\\/g, '/');
    // None of the listed directories contains a slash, so comparing the first
    // segment is the same as "equals dir, or starts with dir + '/'".
    const [topLevel] = posixPath.split('/');
    return SYSTEM_DIRTY_DIRS.includes(topLevel);
}
55
61
  /**
56
62
  * Parse `git status --porcelain=v1 -z` output into a flat list of paths.
@@ -25,6 +25,7 @@ import { loadAgentRun, listAgentRuns } from './agentruns.js';
25
25
  import { loadClaim } from './claims.js';
26
26
  import { getLoop, listLoops } from './loops/store.js';
27
27
  import { isProcessAlive } from './agentrun-reconciler.js';
28
+ import { latestActivityMs } from './runtime-signals.js';
28
29
  const DEFAULT_TAIL = 20;
29
30
  const DEFAULT_STALL_MS = 5 * 60_000;
30
31
  // ── Internal helpers ──────────────────────────────────────────────────────
@@ -96,6 +97,37 @@ function resolveTarget(targetId, cwd) {
96
97
  const TERMINAL_RUN_STATUSES = new Set([
97
98
  'completed', 'failed', 'cancelled', 'timed_out', 'interrupted',
98
99
  ]);
100
/**
 * pln#527 (#5) — recognize known fatal boot signatures in a worker's stderr tail
 * so a dead run can be reported with a targeted diagnosis and remediation
 * instead of a generic silent_death. These are agent/CLI/config faults (NOT
 * brainclaw bugs) that a coordinator can fix and re-dispatch. Patterns sourced
 * from field traps (trp#292 codex service_tier / model mismatch).
 *
 * Signatures are checked in order and the first match wins:
 *   1. `service_tier` together with `flex` or `unsupported`;
 *   2. `unknown variant`;
 *   3. a standalone `400` together with `unsupported`, `requires a newer` or `model`.
 *
 * @param {string[] | string | null | undefined} tail - Captured stderr tail,
 *   either as an array of lines (joined with `\n`) or as one raw string.
 * @returns {{ summary: string, recommended_next_action: string } | undefined}
 *   The matching diagnosis, or `undefined` when the tail is empty/missing or
 *   no signature matches.
 */
export function recognizeStderrSignature(tail) {
    if (!tail || tail.length === 0)
        return undefined;
    // Accept a raw string as well as an array of lines: calling `.join` on a
    // string would throw a TypeError for this exported helper.
    const text = Array.isArray(tail) ? tail.join('\n') : String(tail);
    if (/service_tier/i.test(text) && /flex|unsupported/i.test(text)) {
        return {
            summary: 'codex rejected an unsupported `service_tier` (e.g. flex) — a config/model mismatch at boot, not a brainclaw fault',
            recommended_next_action: 'Fix ~/.codex/config.toml `service_tier` (remove it or set a supported value) or upgrade codex, then re-dispatch. See trap trp#292.',
        };
    }
    if (/unknown variant/i.test(text)) {
        return {
            summary: 'codex CLI rejected an unknown config variant — the installed codex does not support a value in ~/.codex/config.toml (e.g. model/approval)',
            recommended_next_action: 'Reconcile ~/.codex/config.toml with the installed codex (`codex --version`) or upgrade codex, then re-dispatch.',
        };
    }
    // NOTE(review): `model` is a broad co-signal next to a bare 400 — confirm the
    // false-positive rate is acceptable for noisy stderr tails.
    if (/\b400\b/.test(text) && /(unsupported|requires a newer|model)/i.test(text)) {
        return {
            summary: 'the model API returned 400 (unsupported model / needs a newer CLI) — the worker died at boot, before doing work',
            recommended_next_action: 'Check the configured model vs the installed CLI version; upgrade the agent CLI or pick a supported model, then re-dispatch.',
        };
    }
    return undefined;
}
99
131
  function computeDiagnosis(assignment, agentRun, runtime, options) {
100
132
  if (!assignment && !agentRun) {
101
133
  return {
@@ -127,17 +159,37 @@ function computeDiagnosis(assignment, agentRun, runtime, options) {
127
159
  const lastEventMs = new Date(agentRun.last_event_at ?? agentRun.started_at ?? agentRun.created_at).getTime();
128
160
  const stallAge = options.nowMs - lastEventMs;
129
161
  if (runtime.pid_alive === false) {
162
+ // pln#527 (#5) — surface a TARGETED diagnosis when the captured stderr matches
163
+ // a known fatal boot signature (codex model/service_tier mismatch, API 400)
164
+ // instead of a generic "silent_death".
165
+ const sig = recognizeStderrSignature(runtime.log_files.stderr?.tail);
130
166
  return {
131
167
  health: 'silent_death',
132
- summary: `agent_run.status="${agentRun.status}" but pid ${runtime.pid} is dead — worker exited without self-reporting; lazy reconciler will mark it failed after the stale window (default 30min)`,
133
- recommended_next_action: 'Read .stderr.log for the exit reason; then trigger reconciliation by calling bclaw_find(entity="agent_run") again, or cancel + reroute.',
168
+ summary: sig
169
+ ? `agent_run.status="${agentRun.status}", pid ${runtime.pid} dead — ${sig.summary}`
170
+ : `agent_run.status="${agentRun.status}" but pid ${runtime.pid} is dead — worker exited without self-reporting; lazy reconciler will mark it failed after the stale window (default 30min)`,
171
+ recommended_next_action: sig?.recommended_next_action
172
+ ?? 'Read .stderr.log for the exit reason; then trigger reconciliation by calling bclaw_find(entity="agent_run") again, or cancel + reroute.',
173
+ };
174
+ }
175
+ // pln#527 — a stale last_event_at is NOT "stalled" when the filesystem is still
176
+ // active (logs streaming / worktree files edited). Workers emit no heartbeat
177
+ // during a long single operation (codex→stderr, claude -p buffering stdout),
178
+ // so fs activity is the truer liveness signal and vetoes the false-stalled.
179
+ const fsAge = runtime.last_fs_activity_ms;
180
+ const fsActive = fsAge !== undefined && fsAge < options.stallMs;
181
+ if (runtime.pid_alive === true && stallAge > options.stallMs && fsActive) {
182
+ return {
183
+ health: 'healthy',
184
+ summary: `agent_run alive (pid=${runtime.pid}); last_event_at stale (${Math.round(stallAge / 1000)}s) but filesystem active ${Math.round((fsAge ?? 0) / 1000)}s ago — working through a long op without a heartbeat`,
185
+ recommended_next_action: 'No action — the worker is actively writing to logs/worktree. Re-check periodically until terminal.',
134
186
  };
135
187
  }
136
188
  if (runtime.pid_alive === true && stallAge > options.stallMs) {
137
189
  return {
138
190
  health: 'stalled',
139
- summary: `agent_run alive (pid=${runtime.pid}) but no activity for ${Math.round(stallAge / 1000)}s; last_event_at=${agentRun.last_event_at ?? '(never)'}`,
140
- recommended_next_action: 'Tail the stdout/stderr log to see whether the worker is doing useful work; if truly hung, kill the pid and reroute.',
191
+ summary: `agent_run alive (pid=${runtime.pid}) but no activity for ${Math.round(stallAge / 1000)}s AND no filesystem writes${fsAge !== undefined ? ` (last fs ${Math.round(fsAge / 1000)}s ago)` : ' (no logs/worktree mtime)'}; last_event_at=${agentRun.last_event_at ?? '(never)'}`,
192
+ recommended_next_action: 'Worker appears genuinely hung (no log/file writes). Tail stderr to confirm, then kill the pid and reroute.',
141
193
  };
142
194
  }
143
195
  if (runtime.pid_alive === true) {
@@ -186,6 +238,16 @@ export function getDispatchStatus(options) {
186
238
  const ackPath = assignmentId ? path.join(runtimeRoot, 'ack', `${assignmentId}.ack`) : undefined;
187
239
  const stdoutPath = assignmentId ? path.join(runtimeRoot, 'log', `${assignmentId}.stdout.log`) : undefined;
188
240
  const stderrPath = assignmentId ? path.join(runtimeRoot, 'log', `${assignmentId}.stderr.log`) : undefined;
241
+ // pln#527 — filesystem-activity age: max mtime across the captured logs + the
242
+ // run's worktree files (skipping junctions). The truer liveness signal when
243
+ // the heartbeat / last_event_at is stale during a long single operation.
244
+ const worktreeForFs = agentRun?.worktree_path ?? claim?.worktree_path;
245
+ let lastFsActivityMs;
246
+ if (assignmentId) {
247
+ const lastFs = latestActivityMs(projectRoot, assignmentId, worktreeForFs);
248
+ if (lastFs !== undefined)
249
+ lastFsActivityMs = nowMs - lastFs;
250
+ }
189
251
  const runtime = {
190
252
  pid: agentRun?.pid,
191
253
  pid_alive: isProcessAlive(agentRun?.pid),
@@ -197,6 +259,7 @@ export function getDispatchStatus(options) {
197
259
  stdout: stdoutPath ? readLogTail(stdoutPath, tailLines) : undefined,
198
260
  stderr: stderrPath ? readLogTail(stderrPath, tailLines) : undefined,
199
261
  },
262
+ last_fs_activity_ms: lastFsActivityMs,
200
263
  };
201
264
  const diagnosis = computeDiagnosis(assignment, agentRun, runtime, { stallMs, nowMs });
202
265
  return {
@@ -43,7 +43,7 @@ import { memoryDir } from './io.js';
43
43
  import { loadVersionedJsonFile } from './migration.js';
44
44
  import fs from 'node:fs';
45
45
  import path from 'node:path';
46
- import { buildInvokeCommand, resolveBriefMode, getCapabilityProfile, resolveConcurrencyLimit, resolveResourceKey, resolveModel, serializeConcurrencyLimit } from './agent-capability.js';
46
+ import { buildInvokeCommand, resolveBriefMode, getCapabilityProfile, dispatchHasMcp, resolveConcurrencyLimit, resolveResourceKey, resolveModel, serializeConcurrencyLimit } from './agent-capability.js';
47
47
  import { getRuntimeSignalPath } from './runtime-signals.js';
48
48
  import { attemptExecution } from './execution.js';
49
49
  import { createAssignment, transitionAssignment, generateAssignmentId, patchAssignmentMessageId } from './assignments.js';
@@ -156,6 +156,11 @@ export function analyzeSequence(cwd) {
156
156
  plan,
157
157
  lane: item.lane,
158
158
  reason: `All hard dependencies met${softNote}`,
159
+ // pln#529 — readiness ≠ code-availability for gated lanes.
160
+ ...(item.hard_after.length > 0 ? {
161
+ code_propagation_note: `Unblocked by hard_after [${item.hard_after.join(', ')}]. Ensure that work is committed AND on the dispatch base (HEAD), ` +
162
+ `or dispatch this lane with ref=<predecessor branch> — otherwise the worker spawns from HEAD without it.`,
163
+ } : {}),
159
164
  });
160
165
  }
161
166
  // Build capacity summary per agent (multi-instance aware)
@@ -238,6 +243,13 @@ export function buildProtocolSection(options) {
238
243
  }
239
244
  if (options?.worktreePath) {
240
245
  parts.push(`Worktree: ${options.worktreePath}`);
246
+ // pln#523: tell the worker how dependencies are provisioned so it does not
247
+ // stall trying to install them. node_modules (and per-package node_modules in
248
+ // monorepos) are junction-linked from the main repo — run builds/typecheck
249
+ // directly. If they are missing, do NOT `npm install` in the worktree: check
250
+ // `.brainclaw-worktree.json` → `symlink_warnings` (a link may have failed,
251
+ // e.g. cross-volume) and validate the build centrally with the coordinator.
252
+ parts.push('Dependencies: node_modules is linked from the main repo (incl. monorepo per-package). Build/typecheck directly; if deps are missing, do NOT npm install here — see .brainclaw-worktree.json symlink_warnings and validate centrally.');
241
253
  }
242
254
  parts.push('');
243
255
  // Assignment lifecycle protocol (Agent SDK)
@@ -254,6 +266,14 @@ export function buildProtocolSection(options) {
254
266
  parts.push(`${options.worktreePath ? '7' : '6'}. Release the claim: bclaw_release_claim(${claimRef}, planStatus: "done") — required for hard_after gating to unblock downstream tasks`);
255
267
  parts.push(`${options.worktreePath ? '8' : '7'}. If blocked: bclaw_assignment_update(status: "blocked", blocker: "...")`);
256
268
  parts.push(`${options.worktreePath ? '9' : '8'}. If failed: bclaw_assignment_update(status: "failed", error_message: "...")`);
269
+ // pln#479: compile-check contract for code workers — a per-worktree
270
+ // pre-commit gate may HARD-block a commit that fails tsc (opt-in).
271
+ if (options.worktreePath) {
272
+ parts.push('**Compile check**: before every commit, `tsc --noEmit` (or the project build) must pass — a per-worktree pre-commit gate may enforce this and reject the commit otherwise. Do not bypass with --no-verify unless you intend to hand off a known-broken state.');
273
+ }
274
+ // pln#526: standard fallback channel — works even when MCP is unreachable
275
+ // (sandboxed agents). The coordinator ingests it with `brainclaw harvest`.
276
+ parts.push(`Final fallback (if bclaw_assignment_update / MCP is unavailable, e.g. a sandboxed agent): write LANE-RESULT.json at the worktree root — {"assignment_id":"${options.assignmentId}","status":"completed|blocked|failed","summary":"<what you did>","files_changed":["..."],"artifacts":["..."]}. The coordinator harvests it via \`brainclaw harvest ${options.assignmentId}\`.`);
257
277
  }
258
278
  else if (options?.claimId) {
259
279
  parts.push('1. Call bclaw_session_start to register your session');
@@ -406,6 +426,25 @@ export function generateBrief(plan, item, cwd, briefMode, options) {
406
426
  if (mode === 'full') {
407
427
  parts.push(buildProtocolSection(options));
408
428
  }
429
+ // pln#528 — transport-aware addendum (debrief LeaseUp P1#2). When the agent is
430
+ // spawned sandboxed (no MCP + no git commit — e.g. codex --sandbox
431
+ // workspace-write), the MCP lifecycle lines in the Protocol section do NOT
432
+ // apply. Say so explicitly and make the FILE protocol authoritative, so the
433
+ // worker never receives instructions it cannot follow nor has to guess the
434
+ // fallback. (Note: resolveBriefMode still returns 'full' for codex per pln#496
435
+ // so the reconciler-independent path is preserved; this addendum disambiguates
436
+ // the transport rather than stripping the section — the full compact reversal
437
+ // is a separate human-owned call on the May-vs-June MCP-availability conflict.)
438
+ const briefProfile = options?.agent ? getCapabilityProfile(options.agent) : undefined;
439
+ if (briefProfile && !dispatchHasMcp(briefProfile)) {
440
+ parts.push('## ⚠ Transport: sandboxed run (no MCP, no commit)');
441
+ parts.push('Your runtime is sandboxed — the brainclaw MCP server is NOT reachable and `git commit` is unavailable (.git is outside the sandbox root). Any `bclaw_*` MCP instruction above does NOT apply to you. Report your outcome via the FILE protocol only — it is authoritative for this run:');
442
+ const asgn = options?.assignmentId ?? '<assignment_id>';
443
+ parts.push(`- When done, write LANE-RESULT.json at the worktree root: {"assignment_id":"${asgn}","status":"completed|blocked|failed","summary":"<what you did>","files_changed":["..."]}.`);
444
+ parts.push('- Capture decisions/traps as candidate JSON under .brainclaw/coordination/inbox/ (the coordinator harvests them).');
445
+ parts.push('- Do NOT call bclaw_* tools — they are unavailable here. The coordinator harvests your result and integrates/commits it.');
446
+ parts.push('');
447
+ }
409
448
  // Codex-specific constraints: focus and speed guidance for sandboxed runs.
410
449
  // Gated on agent identity (not brief mode) so future non-codex compact consumers
411
450
  // don't inherit sandbox-specific wording. (Codex review cnd#561)
@@ -413,7 +452,6 @@ export function generateBrief(plan, item, cwd, briefMode, options) {
413
452
  parts.push('## Constraints');
414
453
  parts.push('- Focus on specified files only — do not explore the broader codebase');
415
454
  parts.push('- Produce output quickly; if blocked, capture as trap candidate and move on');
416
- parts.push('- Sandbox blocks MCP writes: use filesystem writes for candidates, coordinator harvests');
417
455
  parts.push('');
418
456
  }
419
457
  return parts.join('\n');
@@ -437,12 +475,22 @@ export function generateDispatchBrief(options) {
437
475
  assignmentId: options.assignmentId,
438
476
  }));
439
477
  }
478
+ // pln#528 — transport-aware addendum for sandboxed agents (see generateBrief).
479
+ const taskBriefProfile = options.agent ? getCapabilityProfile(options.agent) : undefined;
480
+ if (taskBriefProfile && !dispatchHasMcp(taskBriefProfile)) {
481
+ parts.push('## ⚠ Transport: sandboxed run (no MCP, no commit)');
482
+ parts.push('Your runtime is sandboxed — the brainclaw MCP server is NOT reachable and `git commit` is unavailable (.git is outside the sandbox root). Any `bclaw_*` MCP instruction above does NOT apply to you. Report your outcome via the FILE protocol only — it is authoritative for this run:');
483
+ const asgn = options.assignmentId ?? '<assignment_id>';
484
+ parts.push(`- When done, write LANE-RESULT.json at the worktree root: {"assignment_id":"${asgn}","status":"completed|blocked|failed","summary":"<what you did>","files_changed":["..."]}.`);
485
+ parts.push('- Capture decisions/traps as candidate JSON under .brainclaw/coordination/inbox/ (the coordinator harvests them).');
486
+ parts.push('- Do NOT call bclaw_* tools — they are unavailable here. The coordinator harvests your result and integrates/commits it.');
487
+ parts.push('');
488
+ }
440
489
  // Codex-specific constraints: focus and speed guidance for sandboxed runs
441
490
  if (options.agent === 'codex') {
442
491
  parts.push('## Constraints');
443
492
  parts.push('- Focus on specified files only — do not explore the broader codebase');
444
493
  parts.push('- Produce output quickly; if blocked, capture as trap candidate and move on');
445
- parts.push('- Sandbox blocks MCP writes: use filesystem writes for candidates, coordinator harvests');
446
494
  parts.push('');
447
495
  }
448
496
  return parts.join('\n');
@@ -126,6 +126,42 @@ export function listEntities(name, cwd, filter = {}) {
126
126
  const paged = applyPaging(filtered, filter);
127
127
  return { entity: name, total: filtered.length, items: paged };
128
128
  }
/** Default serialized-items budget (chars) — keeps a bclaw_find payload well under the ~25k-token MCP cap (trp#449). */
export const DEFAULT_FIND_CHAR_BUDGET = 40000;
/**
 * pln#491 — bound a list payload so a verbose result set never overflows the MCP
 * token cap (which makes agents silently fall back to the CLI, trp#449).
 *
 * The incoming page is already count-limited by the caller; this additionally
 * caps SIZE: while `JSON.stringify(items)` is longer than `charBudget`, the
 * trailing ~25% of the page is dropped (at least one item per step, and at
 * least one item is always kept). The result then advertises `returned`,
 * `has_more` and — when more items exist — `next_offset` plus a `hint`, so the
 * caller paginates explicitly instead of guessing.
 *
 * @param {{entity: string, total: number, items: unknown[]}} result - One page
 *   of a list query; `total` is the size of the full filtered set.
 * @param {number} [offset=0] - Index of this page's first item within the full
 *   set. A missing or non-numeric value is treated as 0, so `has_more` /
 *   `next_offset` stay meaningful instead of collapsing to `false` / `NaN`.
 * @param {number} [charBudget=DEFAULT_FIND_CHAR_BUDGET] - Maximum length of the
 *   serialized `items` array, in characters.
 * @returns {object} A copy of `result` with the (possibly shortened) `items`,
 *   plus `returned`, `has_more`, and optionally `omitted_for_size`,
 *   `next_offset` and `hint`.
 */
export function boundListResult(result, offset = 0, charBudget = DEFAULT_FIND_CHAR_BUDGET) {
    // `undefined + n` is NaN and `NaN < total` is always false, which would hide
    // every further page from the caller — normalise before doing arithmetic.
    const numericOffset = Number(offset);
    const start = Number.isFinite(numericOffset) ? numericOffset : 0;
    let items = result.items;
    let omittedForSize = 0;
    while (items.length > 1 && JSON.stringify(items).length > charBudget) {
        // Shed a quarter of the page per pass (never fewer than one item).
        const drop = Math.max(1, Math.ceil(items.length * 0.25));
        items = items.slice(0, items.length - drop);
        omittedForSize = result.items.length - items.length;
    }
    const returned = items.length;
    const hasMore = start + returned < result.total;
    const bounded = {
        ...result,
        items,
        returned,
        has_more: hasMore,
        ...(omittedForSize > 0 ? { omitted_for_size: omittedForSize } : {}),
    };
    if (hasMore) {
        bounded.next_offset = start + returned;
        bounded.hint = omittedForSize > 0
            ? `Payload size-bounded: returned ${returned} of ${result.total} ${result.entity} item(s). Fetch more with filter.offset=${bounded.next_offset}, or narrow the filter (status/tag/author).`
            : `Returned ${returned} of ${result.total} ${result.entity} item(s). Page with filter.offset=${bounded.next_offset}, or narrow the filter.`;
    }
    return bounded;
}
129
165
  function loadAll(name, cwd) {
130
166
  switch (name) {
131
167
  case 'plan': return loadState(cwd).plan_items;
@@ -23,7 +23,7 @@ const plan = {
23
23
  name: 'plan',
24
24
  shortLabelPrefix: 'pln',
25
25
  schema: PlanItemSchema,
26
- updatable: ['text', 'priority', 'tags', 'assignee', 'estimated_effort', 'actual_effort', 'depends_on'],
26
+ updatable: ['text', 'priority', 'tags', 'assignee', 'estimated_effort', 'actual_effort', 'depends_on', 'related_paths'],
27
27
  statusField: 'status',
28
28
  transitions: {
29
29
  todo: ['in_progress', 'blocked', 'done', 'dropped'],
@@ -237,7 +237,7 @@ function renderSessionProtocol() {
237
237
  '',
238
238
  'Do NOT call `bclaw_loop(intent=open)` directly — it creates a loop structure without dispatch, so the reviewer/participant never gets the work. Use the goal entries above.',
239
239
  '',
240
- '_How to verify a dispatch actually worked:_ `execution_status="delivered_and_started"` only means the brief-ack sentinel was touched — it does NOT mean the worker is doing useful work. Always (1) `bclaw_find(entity="agent_run", filter={assignment_id})` to read the spawn record; (2) check OS pid liveness yourself (`Get-Process -Id <pid>` on Windows, `kill -0 <pid>` on POSIX); (3) if the worker is silent, read its captured streams at `.brainclaw/coordination/runtime/log/<assignment_id>.{stdout,stderr}.log`. Full FSM tables + diagnostic decision tree in `docs/concepts/dispatch-lifecycle.md`.',
240
+ '_How to verify a dispatch actually worked:_ `execution_status="delivered_and_started"` only means the brief-ack sentinel was touched — it does NOT mean the worker is doing useful work. (1) Call `bclaw_dispatch_status(target_id=<asgn_…|clm_…|lop_…|run_…>)` — the purpose-built facade: it resolves the linked entities, reads the runtime sentinels (`ack` / `heartbeat` / `completed` / `failed`) and the captured stdout/stderr tails, checks pid liveness, and returns a single health verdict plus a recommended next action. This is the `verify_with` target named in the coordinate/dispatch response — prefer it over assembling the picture by hand. (2) Do NOT diagnose liveness from the tracked pid yourself: on Windows an ack-wrapped spawn runs under a `cmd.exe` shell, so `agent_run.pid` is the wrapper (which exits early by design), NOT the real worker — `Get-Process -Id <pid>` reads it dead while the worker is alive and committing. Trust the sentinel-derived verdict instead; the reconciler already infers `completed` from a post-start commit on the worktree branch even when the worker never called `bclaw_assignment_update`. (3) Fallback only if the facade is unavailable: `bclaw_find(entity="agent_run", filter={assignment_id})` plus the captured streams at `.brainclaw/coordination/runtime/log/<assignment_id>.{stdout,stderr}.log` — note that `claude -p` buffers stdout until exit, so an empty log mid-run is expected; use the `heartbeat` sentinel as the live progress signal, not stdout. Full FSM tables + diagnostic decision tree in `docs/concepts/dispatch-lifecycle.md`.',
241
241
  ].join('\n');
242
242
  }
243
243
  function renderUserWorkflow() {
@@ -99,4 +99,76 @@ export function readLogTail(root, assignmentId, stream, maxBytes = 2000) {
99
99
  return '';
100
100
  }
101
101
  }
/**
 * pln#527 — directory names the filesystem-activity scan never descends into
 * (VCS data, the coordination store, dependency/build output). Skipping them
 * keeps the worktree mtime scan cheap and avoids wandering into
 * node_modules/dist junction targets in the main repo.
 */
const FS_ACTIVITY_SKIP_DIRS = new Set(['.git', '.brainclaw', 'node_modules', 'dist', '.venv', 'venv', 'vendor']);
/**
 * pln#527 — newest regular-file mtime (ms) found under a worktree.
 *
 * The scan is depth-bounded, ignores every directory entry that reports itself
 * as a symbolic link (so junctions are never followed), and skips the
 * directories listed in FS_ACTIVITY_SKIP_DIRS. It serves as a liveness signal
 * for workers that edit files but emit no heartbeat/stdout (e.g. `claude -p`
 * buffers stdout; a long single edit pass refreshes no sentinel).
 *
 * @param {string} worktreePath - Root directory to scan (depth 0).
 * @param {number} [maxDepth=4] - Deepest directory level that is still read.
 * @returns {number|undefined} Largest `mtimeMs` seen, or undefined when the
 *   path is absent/unreadable or contains no eligible file.
 */
export function latestWorktreeFileMtimeMs(worktreePath, maxDepth = 4) {
    let newest;
    // Explicit work stack instead of recursion; visiting order is irrelevant
    // because only the maximum mtime is kept.
    const pending = [{ dir: worktreePath, depth: 0 }];
    while (pending.length > 0) {
        const { dir, depth } = pending.pop();
        if (depth > maxDepth) {
            continue;
        }
        let children;
        try {
            children = fs.readdirSync(dir, { withFileTypes: true });
        }
        catch {
            continue; // missing or unreadable directory: nothing to observe here
        }
        for (const child of children) {
            if (child.isSymbolicLink()) {
                continue; // never follow junctions (node_modules/dist)
            }
            const childPath = path.join(dir, child.name);
            if (child.isDirectory()) {
                if (!FS_ACTIVITY_SKIP_DIRS.has(child.name)) {
                    pending.push({ dir: childPath, depth: depth + 1 });
                }
                continue;
            }
            if (!child.isFile()) {
                continue;
            }
            try {
                const mtime = fs.statSync(childPath).mtimeMs;
                if (newest === undefined || mtime > newest) {
                    newest = mtime;
                }
            }
            catch { /* ignore */ }
        }
    }
    return newest;
}
/**
 * pln#527 — timestamp (ms since epoch) of the most recent filesystem activity
 * attributable to a dispatched run: the newest mtime among its captured
 * stdout/stderr logs and, when a worktree path is given, any file found by
 * latestWorktreeFileMtimeMs. Lets the reconciler / dispatch_status tell
 * "no heartbeat BUT fs active" (working — e.g. codex streaming to stderr, or
 * claude -p editing files) apart from "no heartbeat AND fs inert" (genuinely
 * stalled), fixing the false-`stalled` verdict (debrief LeaseUp P1#1).
 *
 * @param {string} root - Passed through to getRuntimeLogPath.
 * @param {string} assignmentId - Assignment whose runtime logs are inspected.
 * @param {string} [worktreePath] - Worktree to scan; skipped when falsy.
 * @returns {number|undefined} Newest observed mtime, or undefined when nothing
 *   is observable.
 */
export function latestActivityMs(root, assignmentId, worktreePath) {
    const samples = [];
    for (const stream of ['stdout', 'stderr']) {
        try {
            samples.push(fs.statSync(getRuntimeLogPath(root, assignmentId, stream)).mtimeMs);
        }
        catch { /* no log */ }
    }
    if (worktreePath) {
        samples.push(latestWorktreeFileMtimeMs(worktreePath));
    }
    // Fold to the maximum, skipping sources that reported nothing.
    return samples.reduce((newest, ms) => (ms !== undefined && (newest === undefined || ms > newest) ? ms : newest), undefined);
}
102
174
  //# sourceMappingURL=runtime-signals.js.map