brainclaw 1.7.1 → 1.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -46,6 +46,10 @@ const PROFILES = {
46
46
  invoke_binary: 'claude',
47
47
  invoke_review_template: 'claude -p --allowedTools "Read,Glob,Grep" {prompt}',
48
48
  invoke_consult_template: 'claude -p --allowedTools "Read,Glob,Grep" {prompt}',
49
+ // pln#520 step 3: model is selectable via `--model` — no need for a
50
+ // per-model pseudo-identity. `claude-sonnet` below is now redundant
51
+ // (run `claude-code --model sonnet`) and kept only for back-compat.
52
+ model_flag: '--model',
49
53
  },
50
54
  'claude-sonnet': {
51
55
  name: 'claude-sonnet', category: 'code-agent', workflowModel: 'interactive',
@@ -323,6 +327,63 @@ export function getCapabilityProfile(name) {
323
327
  const resolved = resolveAgentAlias(name);
324
328
  return _customProfiles.get(resolved) ?? PROFILES[resolved];
325
329
  }
330
/**
 * Map an agent name to the key of the host resource it actually consumes.
 *
 * A concurrency cap protects the binary running on the machine (its API quota
 * and RAM/CPU footprint), not the agent label. When the capability profile
 * declares an `invoke_binary`, that binary name is the key, so every identity
 * that drives the same binary (e.g. `claude-code` and `claude-sonnet`, which
 * both run `claude`) is counted against one shared pool instead of separately.
 * Without a profile, or without an `invoke_binary` on it, the alias-resolved
 * agent name is used as the key.
 *
 * @param name Agent name or alias.
 * @returns The profile's `invoke_binary` when it is neither `null` nor
 *   `undefined`; otherwise the result of `resolveAgentAlias(name)`.
 */
export function resolveResourceKey(name) {
  const binary = getCapabilityProfile(name)?.invoke_binary;
  if (binary !== undefined && binary !== null) {
    return binary;
  }
  return resolveAgentAlias(name);
}
346
/**
 * Resolve the concurrency limit for an agent. `Infinity` means unlimited.
 *
 * Resolution chain (highest priority first), decoupled from agent identity:
 *   1. explicit `opts.override` (e.g. `brainclaw dispatch --max-concurrency N`)
 *   2. host opt-in cap via the `BRAINCLAW_MAX_CONCURRENCY` environment variable
 *      (protects one machine / quota)
 *   3. structural floor — agents whose profile is not CLI-spawnable (IDE /
 *      desktop agents), or that have no profile at all, are capped at the
 *      profile's `max_concurrent_tasks`, defaulting to 1
 *   4. CLI-spawnable agents: unlimited. There is no per-identity throttle; the
 *      operator opts into a cap via (1) or (2).
 *
 * Steps 1 and 2 are checked before the profile is consulted, so a positive
 * override or environment cap applies to every agent, CLI-spawnable or not.
 *
 * The override is coerced with `Number()` before validation so that a value
 * arriving as a numeric string is returned as a real number. Returning the raw
 * string would make `Number.isFinite` report it as non-finite downstream, and
 * the limit would then be rendered as "unlimited".
 *
 * When a finite cap applies it is meant to be enforced per host-binary
 * resource (see `resolveResourceKey`), so all variants of one binary share it.
 *
 * @param name Agent name or alias.
 * @param opts Optional settings; only `opts.override` is read. Values that do
 *   not coerce to a number greater than zero are ignored.
 * @returns A positive number, or `Infinity` for unlimited.
 */
export function resolveConcurrencyLimit(name, opts = {}) {
  const rawOverride = opts?.override;
  if (rawOverride !== undefined && rawOverride !== null) {
    const override = Number(rawOverride);
    // `NaN > 0` is false, so unparseable overrides fall through to the chain.
    if (override > 0) {
      return override;
    }
  }
  const envCap = Number(process.env.BRAINCLAW_MAX_CONCURRENCY);
  if (Number.isFinite(envCap) && envCap > 0) {
    return envCap;
  }
  const profile = getCapabilityProfile(name);
  if (!profile?.runtime?.canBeSpawnedCli) {
    return profile?.max_concurrent_tasks ?? 1;
  }
  return Infinity;
}
372
/**
 * Render a concurrency limit in a JSON-safe form.
 *
 * JSON has no representation for `Infinity`, so any value that is not a finite
 * number is emitted as `null`, which stands for "unlimited".
 *
 * @param limit A concurrency limit.
 * @returns `limit` unchanged when it is a finite number, otherwise `null`.
 */
export function serializeConcurrencyLimit(limit) {
  if (Number.isFinite(limit)) {
    return limit;
  }
  return null;
}
376
/**
 * Pick the model for a dispatch without tying it to the agent's identity.
 *
 * Candidates are tried in priority order, and the first one that is neither
 * `null` nor `undefined` wins:
 *   `opts.override` (e.g. `dispatch --model`) → `opts.lane` → `opts.identity`
 *   → the capability profile's `default_model`.
 *
 * @param name Agent name, used to look up the capability profile.
 * @param opts Optional `override`, `lane` and `identity` model values.
 * @returns The first specified candidate, or else whatever the profile's
 *   `default_model` holds. That is `undefined` when there is no profile or no
 *   default, in which case the agent's template default applies.
 */
export function resolveModel(name, opts = {}) {
  const profileDefault = getCapabilityProfile(name)?.default_model;
  const candidates = [opts.override, opts.lane, opts.identity];
  for (const candidate of candidates) {
    if (candidate !== undefined && candidate !== null) {
      return candidate;
    }
  }
  return profileDefault;
}
326
387
  /**
327
388
  * Escape a string for safe use as a double-quoted shell argument.
328
389
  * Escapes characters that have special meaning inside double-quotes
@@ -490,6 +551,12 @@ export function buildInvokeCommand(name, prompt, options = {}) {
490
551
  const rawTokens = parseTemplateString(templateStr);
491
552
  if (rawTokens.length === 0)
492
553
  return undefined;
554
+ // pln#520 step 3: inject the resolved model right after the binary so model
555
+ // choice is decoupled from agent identity. Only when the profile declares a
556
+ // `model_flag` and the template doesn't already pin a model (don't double it).
557
+ if (options.model && profile.model_flag && !rawTokens.includes(profile.model_flag)) {
558
+ rawTokens.splice(1, 0, profile.model_flag, options.model);
559
+ }
493
560
  const executable = rawTokens[0];
494
561
  const interpolatedTokens = rawTokens.slice(1).map((tok) => tok === '{prompt}' ? embeddedPrompt : tok);
495
562
  // ── 5. Build the args array ───────────────────────────────────────────────
@@ -5,6 +5,14 @@ import { spawnSync } from 'node:child_process';
5
5
  import yaml from 'yaml';
6
6
  import { MEMORY_DIR } from './io.js';
7
7
  import { detectHostExecutionProfile, } from './execution-profile.js';
8
+ import { getCapabilityProfile } from './agent-capability.js';
9
+ /**
10
+ * trp#427 — cold-start CLI `--version` probes need headroom; a 3s timeout
11
+ * false-negatived claude-code on first launch. The spawnable check (binary on
12
+ * PATH) is the robust signal, so this only affects version-string capture
13
+ * latency, not the installed/spawnable decision.
14
+ */
15
+ const VERSION_PROBE_TIMEOUT_MS = 8000;
8
16
  function tryCommand(command, args, timeout = 5000) {
9
17
  try {
10
18
  const r = spawnSync(command, args, { encoding: 'utf-8', timeout, windowsHide: true });
@@ -14,12 +22,40 @@ function tryCommand(command, args, timeout = 5000) {
14
22
  return { ok: false, stdout: '' };
15
23
  }
16
24
  }
25
+ /**
26
+ * trp#427 — fast PATH resolution for a binary (no process launch, unlike a
27
+ * `--version` probe). Uses `where` (Windows) / `which` (POSIX).
28
+ */
29
+ function isBinaryOnPath(binary) {
30
+ if (!binary)
31
+ return false;
32
+ try {
33
+ const cmd = process.platform === 'win32' ? 'where' : 'which';
34
+ const r = spawnSync(cmd, [binary], { encoding: 'utf-8', timeout: 3000, windowsHide: true });
35
+ return r.status === 0 && (r.stdout ?? '').trim().length > 0;
36
+ }
37
+ catch {
38
+ return false;
39
+ }
40
+ }
41
/**
 * trp#427 — decide whether an agent can be spawned from the command line.
 *
 * An agent is spawnable when all three hold: its capability profile marks it
 * as CLI-spawnable (`runtime.canBeSpawnedCli`), the profile declares an
 * `invoke_binary`, and that binary resolves on PATH. This check does not
 * depend on the `--version` health probe, so a CLI that is slow on cold start
 * is not misreported as "not installed" / undispatchable.
 *
 * @param agentName Agent name used to look up the capability profile.
 * @returns `false` when the profile is missing, not CLI-spawnable, or has no
 *   `invoke_binary`; otherwise the result of the PATH lookup for that binary.
 */
export function detectSpawnable(agentName) {
  const profile = getCapabilityProfile(agentName);
  const binary = profile?.invoke_binary;
  const cliSpawnable = profile?.runtime?.canBeSpawnedCli;
  if (cliSpawnable && binary) {
    return isBinaryOnPath(binary);
  }
  return false;
}
17
53
  const AGENT_DEFINITIONS = [
18
54
  {
19
55
  name: 'claude-code',
20
56
  detect: (_home, env) => {
21
57
  // Check if claude CLI is available
22
- const cli = tryCommand('claude', ['--version'], 3000);
58
+ const cli = tryCommand('claude', ['--version'], VERSION_PROBE_TIMEOUT_MS);
23
59
  if (cli.ok) {
24
60
  const ver = cli.stdout.trim().match(/(\d+\.\d+\.\d+)/)?.[1];
25
61
  return { installed: true, method: 'claude CLI', version: ver };
@@ -81,7 +117,7 @@ const AGENT_DEFINITIONS = [
81
117
  if (fs.existsSync(codexDir)) {
82
118
  return { installed: true, method: '~/.codex directory' };
83
119
  }
84
- const cli = tryCommand('codex', ['--version'], 3000);
120
+ const cli = tryCommand('codex', ['--version'], VERSION_PROBE_TIMEOUT_MS);
85
121
  if (cli.ok) {
86
122
  const ver = cli.stdout.trim().match(/(\d+\.\d+\.\d+)/)?.[1];
87
123
  return { installed: true, method: 'codex CLI', version: ver };
@@ -252,7 +288,7 @@ const AGENT_DEFINITIONS = [
252
288
  if (fs.existsSync(path.join(home, '.gemini', 'antigravity'))) {
253
289
  return { installed: true, method: '~/.gemini/antigravity directory' };
254
290
  }
255
- const cli = tryCommand('gemini', ['--version'], 3000);
291
+ const cli = tryCommand('gemini', ['--version'], VERSION_PROBE_TIMEOUT_MS);
256
292
  if (cli.ok) {
257
293
  return { installed: true, method: 'gemini CLI', version: cli.stdout.trim() };
258
294
  }
@@ -309,7 +345,7 @@ const AGENT_DEFINITIONS = [
309
345
  if (fs.existsSync(path.join(home, '.hermes'))) {
310
346
  return { installed: true, method: '~/.hermes directory' };
311
347
  }
312
- const cli = tryCommand('hermes', ['--version'], 3000);
348
+ const cli = tryCommand('hermes', ['--version'], VERSION_PROBE_TIMEOUT_MS);
313
349
  if (cli.ok) {
314
350
  return { installed: true, method: 'hermes CLI', version: cli.stdout.trim() };
315
351
  }
@@ -332,14 +368,23 @@ const AGENT_DEFINITIONS = [
332
368
  /**
333
369
  * Detect ALL installed agents on this machine (not just the running one).
334
370
  */
335
- export function buildAgentInventory(homeDir = os.homedir(), env = process.env) {
371
+ export function buildAgentInventory(homeDir = os.homedir(), env = process.env, opts = {}) {
372
+ const spawnableResolver = opts.spawnableResolver ?? detectSpawnable;
336
373
  const agents = AGENT_DEFINITIONS.map(def => {
337
374
  const detection = def.detect(homeDir, env);
375
+ const spawnable = spawnableResolver(def.name);
376
+ // trp#427: an agent brainclaw can spawn (invoke binary on PATH) IS installed,
377
+ // even when the cold-start `--version` probe timed out. This decouples the
378
+ // dispatch decision (getInstalledAgentNames) from probe latency.
379
+ const installed = detection.installed || spawnable;
338
380
  return {
339
381
  name: def.name,
340
- installed: detection.installed,
341
- detection_method: detection.method,
382
+ installed,
383
+ detection_method: detection.installed
384
+ ? detection.method
385
+ : (spawnable ? 'spawnable: invoke binary on PATH' : detection.method),
342
386
  version: detection.version,
387
+ spawnable,
343
388
  models: def.models,
344
389
  native_tools: def.native_tools,
345
390
  mcp_support: def.mcp_support,
@@ -415,6 +460,8 @@ export function renderAgentInventorySummary(inventory) {
415
460
  features.push('Rules');
416
461
  if (agent.hooks_support)
417
462
  features.push('Hooks');
463
+ if (agent.spawnable)
464
+ features.push('Spawnable');
418
465
  lines.push(` Features: ${features.join(', ') || 'none'}`);
419
466
  if (agent.instruction_file) {
420
467
  lines.push(` Instructions: ${agent.instruction_file}`);
@@ -38,6 +38,7 @@ import { loadClaim } from './claims.js';
38
38
  import { loadAssignment } from './assignments.js';
39
39
  import { createRuntimeEvent } from './events.js';
40
40
  import { nowISO } from './ids.js';
41
+ import { readHeartbeat, readLogTail, signalExists } from './runtime-signals.js';
41
42
  // ── Constants ──────────────────────────────────────────────────────────────
42
43
  /**
43
44
  * Minimum age before a run is eligible for reconciliation. Below this, the
@@ -52,6 +53,11 @@ export const DEFAULT_HEALTH_CHECK_GRACE_MS = 60_000;
52
53
  export const DEFAULT_STALE_AFTER_MS = 30 * 60_000;
53
54
  export const DEFAULT_DEAD_PID_READ_SWEEP_AGE_MS = 5 * 60_000;
54
55
  export const DEFAULT_DEAD_PID_READ_SWEEP_LIMIT = 50;
56
+ /**
57
+ * pln#520 step 1 — a heartbeat older than this (with no completion signal) means
58
+ * the worker reached its loop then went silent: `stalled`. Default 10 min.
59
+ */
60
+ export const DEFAULT_HEARTBEAT_STALE_MS = 10 * 60_000;
55
61
  const TERMINAL_STATUSES = new Set([
56
62
  'completed', 'failed', 'cancelled', 'timed_out', 'interrupted',
57
63
  ]);
@@ -152,15 +158,51 @@ export function collectEvidence(run, cwd, options) {
152
158
  }
153
159
  catch { /* defensive */ }
154
160
  const process_alive = isProcessAlive(run.pid);
155
- return { age_ms, has_post_start_commit, claim_released, assignment_completed, process_alive };
161
+ // pln#520 step 1 sentinel evidence. Signals live under the project
162
+ // coordination dir (the dispatcher's ackRoot), which is `cwd` for the
163
+ // reconciler. Keyed by assignment_id.
164
+ const signalRoot = cwd ?? process.cwd();
165
+ let completed_signal = false;
166
+ let failed_signal = false;
167
+ let heartbeat_exists = false;
168
+ let heartbeat_age_ms;
169
+ try {
170
+ completed_signal = signalExists(signalRoot, run.assignment_id, 'completed');
171
+ failed_signal = signalExists(signalRoot, run.assignment_id, 'failed');
172
+ const hb = readHeartbeat(signalRoot, run.assignment_id);
173
+ heartbeat_exists = hb.exists;
174
+ if (hb.exists && hb.mtimeMs !== undefined)
175
+ heartbeat_age_ms = now - hb.mtimeMs;
176
+ }
177
+ catch { /* defensive */ }
178
+ return {
179
+ age_ms, has_post_start_commit, claim_released, assignment_completed, process_alive,
180
+ completed_signal, failed_signal, heartbeat_exists, heartbeat_age_ms,
181
+ };
156
182
  }
157
183
  function anyCompletionEvidence(evidence) {
158
- return evidence.has_post_start_commit
184
+ return evidence.completed_signal
185
+ || evidence.has_post_start_commit
159
186
  || evidence.claim_released
160
187
  || evidence.assignment_completed;
161
188
  }
189
/**
 * pln#520 step 1 — build a ` | log tail: …` suffix from the worker's captured
 * output, so a failed_silent / stalled verdict carries the worker's last words
 * instead of just a status code.
 *
 * stderr is preferred; stdout is read only when stderr is blank. Whitespace
 * runs are collapsed to single spaces and the LAST 300 characters are kept,
 * because the end of the tail is the part closest to the failure.
 *
 * This is best-effort diagnostics. It is called while the failure reason is
 * being built, so an unreadable log must never abort reconciliation: any error
 * from the log read (or a non-string result) produces an empty suffix.
 *
 * @param run Agent run; only `run.assignment_id` is read.
 * @param cwd Root passed to the log reader; defaults to `process.cwd()`.
 * @returns The suffix string, or `''` when no log output is available.
 */
function logTailSuffix(run, cwd) {
  try {
    const root = cwd ?? process.cwd();
    const readTrimmed = (stream) => {
      const raw = readLogTail(root, run.assignment_id, stream, 500);
      return typeof raw === 'string' ? raw.trim() : '';
    };
    const tail = readTrimmed('stderr') || readTrimmed('stdout');
    if (!tail) {
      return '';
    }
    return ` | log tail: ${tail.replace(/\s+/g, ' ').slice(-300)}`;
  }
  catch {
    // Defensive: diagnostics are optional, the verdict itself must still land.
    return '';
  }
}
162
202
  function describeEvidence(evidence) {
163
203
  const reasons = [];
204
+ if (evidence.completed_signal)
205
+ reasons.push('wrapper wrote completed sentinel');
164
206
  if (evidence.has_post_start_commit)
165
207
  reasons.push('post-start commit on worktree branch');
166
208
  if (evidence.claim_released)
@@ -231,6 +273,7 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
231
273
  const evidence = {
232
274
  age_ms: 0, has_post_start_commit: false, claim_released: false,
233
275
  assignment_completed: false, process_alive: undefined,
276
+ completed_signal: false, failed_signal: false, heartbeat_exists: false,
234
277
  };
235
278
  return {
236
279
  run_id: runId, action: 'no_op', reason: 'run not found', evidence,
@@ -280,18 +323,12 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
280
323
  };
281
324
  }
282
325
  }
283
- // Failure inference: stale + dead process + no evidence.
284
- if (evidence.age_ms >= stale && evidence.process_alive === false) {
326
+ // pln#520 step 1 sentinel-based failure (fast + trustworthy, pid-independent).
327
+ const heartbeatStale = options.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
328
+ const failHere = (reason) => {
285
329
  try {
286
- transitionAgentRun(runId, 'failed', {
287
- actor,
288
- status_reason: 'silent_termination_no_evidence',
289
- }, cwd);
290
- return {
291
- run_id: runId, action: 'inferred_failed',
292
- reason: 'silent_termination_no_evidence',
293
- evidence, previous_status, current_status: 'failed',
294
- };
330
+ transitionAgentRun(runId, 'failed', { actor, status_reason: reason }, cwd);
331
+ return { run_id: runId, action: 'inferred_failed', reason, evidence, previous_status, current_status: 'failed' };
295
332
  }
296
333
  catch (err) {
297
334
  return {
@@ -300,6 +337,26 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
300
337
  evidence, previous_status, current_status: run.status,
301
338
  };
302
339
  }
340
+ };
341
+ // `failed` sentinel — the wrapper saw a non-zero agent exit.
342
+ if (evidence.failed_signal) {
343
+ return failHere(`failed_silent: wrapper reported non-zero exit${logTailSuffix(run, cwd)}`);
344
+ }
345
+ // Heartbeat present but stale → reached the loop then went silent.
346
+ if (evidence.heartbeat_exists && evidence.heartbeat_age_ms !== undefined && evidence.heartbeat_age_ms >= heartbeatStale) {
347
+ return failHere(`stalled: heartbeat last seen ${Math.round(evidence.heartbeat_age_ms / 1000)}s ago${logTailSuffix(run, cwd)}`);
348
+ }
349
+ // Fresh heartbeat → alive; trust it over the untrustworthy wrapper pid.
350
+ if (evidence.heartbeat_exists) {
351
+ return {
352
+ run_id: runId, action: 'no_op',
353
+ reason: `heartbeat fresh (${Math.round((evidence.heartbeat_age_ms ?? 0) / 1000)}s) — worker alive, pid untrusted`,
354
+ evidence, previous_status, current_status: run.status,
355
+ };
356
+ }
357
+ // Failure inference: stale + dead process + no evidence.
358
+ if (evidence.age_ms >= stale && evidence.process_alive === false) {
359
+ return failHere('silent_termination_no_evidence');
303
360
  }
304
361
  // Health-check window: past grace, not yet stale, no evidence either way.
305
362
  // Emit a non-mutating event so callers see the uncertainty without
@@ -339,6 +396,7 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
339
396
  const evidence = {
340
397
  age_ms: 0, has_post_start_commit: false, claim_released: false,
341
398
  assignment_completed: false, process_alive: undefined,
399
+ completed_signal: false, failed_signal: false, heartbeat_exists: false,
342
400
  };
343
401
  return {
344
402
  run_id: runId, action: 'no_op', reason: 'run not found', evidence,
@@ -352,19 +410,25 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
352
410
  evidence, previous_status: run.status, current_status: run.status,
353
411
  };
354
412
  }
355
- if (evidence.process_alive !== false) {
356
- return {
357
- run_id: run.id, action: 'no_op',
358
- reason: evidence.process_alive === true ? 'process alive' : 'pid liveness unknown',
359
- evidence, previous_status: run.status, current_status: run.status,
360
- };
361
- }
362
- // pid reads dead — but the tracked pid is NOT trustworthy (see doc above),
363
- // so a bare dead pid NEVER cancels. Evidence of real work wins; otherwise
364
- // surface the uncertainty non-destructively and leave the run `running` for
365
- // reconcileAgentRun's stale-threshold path to fail it only after a fair,
366
- // evidence-based delay.
367
413
  const actor = options.actor ?? 'reconciler';
414
+ const stale = options.staleAfterMs ?? DEFAULT_STALE_AFTER_MS;
415
+ const heartbeatStale = options.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
416
+ const failRun = (reason) => {
417
+ try {
418
+ transitionAgentRun(run.id, 'failed', { actor, status_reason: reason }, cwd);
419
+ return { run_id: run.id, action: 'inferred_failed', reason, evidence, previous_status: run.status, current_status: 'failed' };
420
+ }
421
+ catch (err) {
422
+ return {
423
+ run_id: run.id, action: 'no_op',
424
+ reason: `failure transition rejected: ${err instanceof Error ? err.message : String(err)}`,
425
+ evidence, previous_status: run.status, current_status: run.status,
426
+ };
427
+ }
428
+ };
429
+ // ── pln#520 step 1: SENTINELS are authoritative, independent of the
430
+ // untrustworthy wrapper pid. Check them first. ──────────────────────────
431
+ // 1. Completion evidence (mechanical `completed` sentinel or work evidence).
368
432
  if (anyCompletionEvidence(evidence)) {
369
433
  try {
370
434
  transitionAgentRun(run.id, 'completed', {
@@ -385,33 +449,43 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
385
449
  };
386
450
  }
387
451
  }
388
- // Stale + provably dead + still no evidence -> genuine silent failure. This
389
- // MUST converge HERE: the canonical read path (entity-operations.ts) and the
390
- // MCP pre-read sweep route `running` runs through this function, never
391
- // through reconcileAgentRun, so deferring would leave a crashed run `running`
392
- // forever (trp#292). The 30-min stale window — vs the immediate cancel before
393
- // pln#520 — gives a worker behind an untrusted pid ample time to leave
394
- // evidence first. Reported as `failed` (it died), not `cancelled`.
395
- const stale = options.staleAfterMs ?? DEFAULT_STALE_AFTER_MS;
452
+ // 2. `failed` sentinel — the wrapper saw a non-zero agent exit. This is the
453
+ // FAST, TRUSTWORTHY failed_silent detector (vs the pid heuristic that caused
454
+ // can_f792cacd false negatives). Carries the captured log tail.
455
+ if (evidence.failed_signal) {
456
+ return failRun(`failed_silent: wrapper reported non-zero exit${logTailSuffix(run, cwd)}`);
457
+ }
458
+ // 3. Heartbeat present but STALE → the worker reached its loop then went
459
+ // silent (e.g. hung). pid-independent: a hung worker keeps the wrapper alive.
460
+ if (evidence.heartbeat_exists && evidence.heartbeat_age_ms !== undefined && evidence.heartbeat_age_ms >= heartbeatStale) {
461
+ return failRun(`stalled: heartbeat last seen ${Math.round(evidence.heartbeat_age_ms / 1000)}s ago${logTailSuffix(run, cwd)}`);
462
+ }
463
+ // 4. Fresh heartbeat → the worker is alive and working; trust it OVER the
464
+ // (untrustworthy) wrapper pid. This is the can_f792cacd fix: never fail a
465
+ // live, heartbeating worker just because its wrapper pid reads dead.
466
+ if (evidence.heartbeat_exists) {
467
+ return {
468
+ run_id: run.id, action: 'no_op',
469
+ reason: `heartbeat fresh (${Math.round((evidence.heartbeat_age_ms ?? 0) / 1000)}s) — worker alive, pid untrusted`,
470
+ evidence, previous_status: run.status, current_status: run.status,
471
+ };
472
+ }
473
+ // ── No sentinel, no heartbeat: fall back to the pid-conservative path. The
474
+ // wrapper writes completed/failed on any normal exit, so reaching here means
475
+ // the worker has not exited and never heartbeat. Do NOT fast-fail on a dead
476
+ // pid (it's the wrapper's, not the worker's). ──────────────────────────────
477
+ if (evidence.process_alive !== false) {
478
+ return {
479
+ run_id: run.id, action: 'no_op',
480
+ reason: evidence.process_alive === true ? 'process alive' : 'pid liveness unknown',
481
+ evidence, previous_status: run.status, current_status: run.status,
482
+ };
483
+ }
484
+ // pid dead + no sentinel + no heartbeat: only converge after the long stale
485
+ // window (trp#292 — must converge HERE since the read path never routes
486
+ // through reconcileAgentRun), giving an untrusted-pid worker ample time.
396
487
  if (evidence.age_ms >= stale) {
397
- try {
398
- transitionAgentRun(run.id, 'failed', {
399
- actor,
400
- status_reason: 'silent_termination_no_evidence',
401
- }, cwd);
402
- return {
403
- run_id: run.id, action: 'inferred_failed',
404
- reason: 'silent_termination_no_evidence',
405
- evidence, previous_status: run.status, current_status: 'failed',
406
- };
407
- }
408
- catch (err) {
409
- return {
410
- run_id: run.id, action: 'no_op',
411
- reason: `failure transition rejected: ${err instanceof Error ? err.message : String(err)}`,
412
- evidence, previous_status: run.status, current_status: run.status,
413
- };
414
- }
488
+ return failRun('silent_termination_no_evidence');
415
489
  }
416
490
  emitUnverifiedEvent(run, evidence, actor, cwd);
417
491
  return {
@@ -457,7 +531,7 @@ export function reconcileAllOpenRuns(cwd, filter = {}, options = {}) {
457
531
  catch {
458
532
  results.push({
459
533
  run_id: run.id, action: 'no_op', reason: 'reconcile threw — skipped',
460
- evidence: { age_ms: 0, has_post_start_commit: false, claim_released: false, assignment_completed: false, process_alive: undefined },
534
+ evidence: { age_ms: 0, has_post_start_commit: false, claim_released: false, assignment_completed: false, process_alive: undefined, completed_signal: false, failed_signal: false, heartbeat_exists: false },
461
535
  previous_status: run.status, current_status: run.status,
462
536
  });
463
537
  }
@@ -11,7 +11,7 @@ import { inferProjectFromTarget, loadInstructions, resolveInstructions } from '.
11
11
  import { buildReputationSummary, findAgentReputationSummary } from './reputation.js';
12
12
  import { listRuntimeNotes } from './runtime.js';
13
13
  import { loadState, persistState } from './state.js';
14
- import { getCapabilityProfile } from './agent-capability.js';
14
+ import { resolveConcurrencyLimit, serializeConcurrencyLimit } from './agent-capability.js';
15
15
  import { loadAllSessions } from './identity.js';
16
16
  import { countActionable } from './messaging.js';
17
17
  import { listCandidates } from './candidates.js';
@@ -176,8 +176,7 @@ function buildOtherAgentsSummary(claims, notes, currentAgent, cwd) {
176
176
  for (const identity of listAgentIdentities(cwd)) {
177
177
  if (identity.agent_name === currentAgent)
178
178
  continue;
179
- const profile = getCapabilityProfile(identity.agent_name);
180
- const maxTasks = profile?.max_concurrent_tasks ?? 1;
179
+ const limit = serializeConcurrencyLimit(resolveConcurrencyLimit(identity.agent_name));
181
180
  agentMap.set(identity.agent_name, {
182
181
  name: identity.agent_name,
183
182
  trust_level: identity.trust_level ?? 'contributor',
@@ -185,23 +184,25 @@ function buildOtherAgentsSummary(claims, notes, currentAgent, cwd) {
185
184
  scopes: [],
186
185
  has_open_session: false,
187
186
  instance_count: sessionCounts.get(identity.agent_name) ?? 0,
188
- max_tasks: maxTasks,
189
- slots_remaining: maxTasks, // will be reduced when claims are counted
187
+ max_tasks: limit,
188
+ slots_remaining: limit, // will be reduced when claims are counted (null stays unlimited)
190
189
  });
191
190
  }
192
191
  // Enrich with active claims
193
192
  for (const claim of claims) {
194
193
  if (claim.agent === currentAgent)
195
194
  continue;
196
- const profile = getCapabilityProfile(claim.agent);
197
- const maxTasks = profile?.max_concurrent_tasks ?? 1;
195
+ const limit = serializeConcurrencyLimit(resolveConcurrencyLimit(claim.agent));
198
196
  const existing = agentMap.get(claim.agent) ?? {
199
197
  name: claim.agent, trust_level: 'contributor', claim_count: 0, scopes: [],
200
198
  has_open_session: false, instance_count: sessionCounts.get(claim.agent) ?? 0,
201
- max_tasks: maxTasks, slots_remaining: maxTasks,
199
+ max_tasks: limit, slots_remaining: limit,
202
200
  };
203
201
  existing.claim_count++;
204
- existing.slots_remaining = Math.max(0, existing.max_tasks - existing.claim_count);
202
+ // null max_tasks = unlimited → slots stay unlimited.
203
+ existing.slots_remaining = existing.max_tasks === null
204
+ ? null
205
+ : Math.max(0, existing.max_tasks - existing.claim_count);
205
206
  existing.scopes.push(claim.scope);
206
207
  if (!existing.last_active || claim.created_at > existing.last_active) {
207
208
  existing.last_active = claim.created_at;
@@ -44,13 +44,19 @@ function defaultRunGit(cwd, args) {
44
44
  return { ok: false, stdout: '' };
45
45
  }
46
46
  }
47
- /** True for coordination/store paths that are dirty as a side effect of dispatching. */
47
/**
 * Top-level directories that become dirty as a side effect of coordination or
 * agent tooling and are never part of a dispatch's code scope:
 *   - `.brainclaw`, `.git` — coordination store + VCS metadata.
 *   - `.claude`, `.cursor`, `.codex` — per-agent local config (trp#371). A
 *     worker leaving these dirty (Claude Code settings, etc.) must not block
 *     an otherwise-safe dispatch of an unrelated code scope.
 */
const SYSTEM_DIRTY_DIRS = ['.brainclaw', '.git', '.claude', '.cursor', '.codex'];
/**
 * True when `p` is one of the system directories above or lies inside one.
 *
 * Backslashes are normalised to forward slashes first, so Windows-style paths
 * match too. Only the first path segment is compared: `.git/config` matches,
 * while `.gitignore` and `src/.git/x` do not.
 *
 * @param p Path to classify.
 * @returns Whether the path is a coordination/store/agent-config path.
 */
export function isSystemDirtyPath(p) {
  const [topLevel] = p.replace(/\\/g, '/').split('/');
  return SYSTEM_DIRTY_DIRS.includes(topLevel);
}
55
61
  /**
56
62
  * Parse `git status --porcelain=v1 -z` output into a flat list of paths.