brainclaw 1.7.1 → 1.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -345,6 +345,14 @@ npm run test:coverage # with coverage report
345
345
 
346
346
  For older releases (v0.x and the early v1.0 launch series), `git log` on `master` is the source of truth — every release commit follows the `chore(release): bump version to <semver>` convention, and the matching feature/fix commits reference their plan id (e.g. `feat(mcp): self-heal ... (pln#478)`).
347
347
 
348
+ ### v1.7.2
349
+
350
+ - **Sequence MCP tools are agent-first by default** — sequence creation,
351
+ listing, update, and deletion tools are now in the default MCP catalog, with
352
+ explicit lane item schemas (`planId`, optional `stepId`, `rank`,
353
+ dependencies, lane metadata) and matching canonical CRUD validation for
354
+ `entity="sequence"`.
355
+
348
356
  ### v1.7.1
349
357
 
350
358
  - **MCP project context isolation fix** — `bclaw_switch` now keeps MCP switches
Binary file
package/dist/cli.js CHANGED
@@ -30,7 +30,7 @@ import { runInstruction } from './commands/instruction.js';
30
30
  import { runListAgents } from './commands/list-agents.js';
31
31
  import { runSurfaceTaskResource } from './commands/surface-task-resource.js';
32
32
  import { runListInstructions } from './commands/list-instructions.js';
33
- import { runDoctor } from './commands/doctor.js';
33
+ import { runDoctor, runDoctorSpawnCheck } from './commands/doctor.js';
34
34
  import { runRepair } from './commands/repair.js';
35
35
  import { runStale } from './commands/stale.js';
36
36
  import { runRebuild } from './commands/rebuild.js';
@@ -681,7 +681,13 @@ program
681
681
  .option('--repair', 'Rebuild dist/ when the MCP runtime is missing or stale')
682
682
  .option('--after-migration', 'Run the v1.0 post-migration health check only (exits non-zero on any failure)')
683
683
  .option('--dispatch', 'Run dispatch-health diagnostic only: reconcile open agent_runs and report stuck/unverified/silent failures (pln#496 step stp_8c072d75)')
684
- .action((options) => {
684
+ .option('--spawn-check', 'Real spawn round-trip per installed agent before dispatch (pln#520 step 2): validates delivery + handshake on this host, exits non-zero on any installed-agent failure')
685
+ .option('--spawn-check-timeout <ms>', 'Per-agent timeout for --spawn-check (default 15000)', parseInt)
686
+ .action(async (options) => {
687
+ if (options.spawnCheck) {
688
+ await runDoctorSpawnCheck({ cwd: options.cwd, json: options.json, timeoutMs: options.spawnCheckTimeout });
689
+ return;
690
+ }
685
691
  runDoctor({ ...options, afterMigration: options.afterMigration, dispatch: options.dispatch });
686
692
  });
687
693
  // --- repair (Phase 4 Sprint 2 Lane C / pln#397) ---
@@ -1441,6 +1447,8 @@ dispatchCmd
1441
1447
  .option('--agents <names>', 'Comma-separated list of agents to dispatch to')
1442
1448
  .option('--lanes <names>', 'Comma-separated list of lanes to dispatch')
1443
1449
  .option('--max <n>', 'Maximum assignments', parseInt)
1450
+ .option('--max-concurrency <n>', 'Opt-in cap on concurrent instances per host-binary (default: unlimited)', parseInt)
1451
+ .option('--model <name>', 'Model to run, decoupled from agent identity (e.g. --model sonnet)')
1444
1452
  .option('--dry', 'Preview assignments without sending messages')
1445
1453
  .option('--spawn', 'Autonomously launch CLI agents with invoke templates')
1446
1454
  .option('--agent <name>', 'Dispatcher agent name')
@@ -1450,6 +1458,8 @@ dispatchCmd
1450
1458
  agents: options.agents,
1451
1459
  lanes: options.lanes,
1452
1460
  max: options.max,
1461
+ maxConcurrency: options.maxConcurrency,
1462
+ model: options.model,
1453
1463
  dry: options.dry,
1454
1464
  spawn: options.spawn,
1455
1465
  agent: options.agent,
@@ -87,6 +87,8 @@ export async function runDispatch(options) {
87
87
  dryRun: options.dry,
88
88
  dispatcherAgent,
89
89
  autoExecute: options.spawn,
90
+ maxConcurrency: options.maxConcurrency,
91
+ model: options.model,
90
92
  }, effectiveCwd);
91
93
  if (!result) {
92
94
  console.log('No active sequence found.');
@@ -3,6 +3,7 @@ import fs from 'node:fs';
3
3
  import path from 'node:path';
4
4
  import * as childProcess from 'node:child_process';
5
5
  import { reconcileAllOpenRuns } from '../core/agentrun-reconciler.js';
6
+ import { runSpawnCheck, renderSpawnCheckReport } from '../core/spawn-check.js';
6
7
  import { loadAgentRun } from '../core/agentruns.js';
7
8
  import { listAgentIdentities, resolveCurrentAgentIdentity } from '../core/agent-registry.js';
8
9
  import { listCapabilities as listRegistryCapabilities, listTools as listRegistryTools } from '../core/registries.js';
@@ -565,6 +566,22 @@ function renderDispatchHealthHumanReport(report) {
565
566
  }
566
567
  return lines.join('\n');
567
568
  }
569
+ /**
570
+ * pln#520 step 2 — `brainclaw doctor --spawn-check`. Real spawn round-trip per
571
+ * installed agent on the current host. Exits non-zero if any installed agent
572
+ * fails (so it gates CI / a pre-dispatch pre-flight).
573
+ */
574
+ export async function runDoctorSpawnCheck(options = {}) {
575
+ const report = await runSpawnCheck(options);
576
+ if (options.json) {
577
+ console.log(JSON.stringify(report, null, 2));
578
+ }
579
+ else {
580
+ console.log(renderSpawnCheckReport(report));
581
+ }
582
+ if (report.exit_code !== 0)
583
+ process.exit(report.exit_code);
584
+ }
568
585
  export function runDoctor(options = {}) {
569
586
  if (options.dispatch) {
570
587
  const report = runDispatchHealthCheck(options);
@@ -57,6 +57,30 @@ export const SCHEMA_VERSION = '1.0.0';
57
57
  export const MCP_PROTOCOL_VERSIONS = ['2025-11-25', '2024-11-05'];
58
58
  export const MCP_SERVER_NOT_INITIALIZED = -32002;
59
59
  const MCP_RUNTIME_REPAIR_COMMAND = 'brainclaw doctor --repair';
60
+ const SEQUENCE_ITEM_INPUT_SCHEMA = {
61
+ type: 'object',
62
+ description: 'Sequence lane item. planId is required; stepId optionally narrows dispatch/readiness to a specific plan step.',
63
+ properties: {
64
+ planId: { type: 'string', minLength: 1, description: 'Plan item ID referenced by this sequence item.' },
65
+ stepId: { type: 'string', minLength: 1, description: 'Optional plan step ID inside planId for step-level dispatch/readiness.' },
66
+ rank: { type: 'number', minimum: 1, description: 'Positive integer ordering key. Ranks must be unique within a sequence.' },
67
+ hard_after: {
68
+ type: 'array',
69
+ items: { type: 'string' },
70
+ description: 'Sequence item planId values that must complete before this item becomes ready.',
71
+ },
72
+ soft_after: {
73
+ type: 'array',
74
+ items: { type: 'string' },
75
+ description: 'Advisory predecessor planId values; they inform ordering but do not block readiness.',
76
+ },
77
+ lane: { type: 'string', description: 'Optional lane label used for parallel dispatch grouping and filtering.' },
78
+ scope_hint: { type: 'string', description: 'Optional file/path scope hint for claim and brief generation.' },
79
+ rationale: { type: 'string', description: 'Optional explanation for this item or dependency placement.' },
80
+ },
81
+ required: ['planId', 'rank'],
82
+ additionalProperties: false,
83
+ };
60
84
  const { $defs: loopPhaseDefs, ...loopPhaseItemSchema } = generatedSchemas.LoopPhase;
61
85
  const loopSlotInputItemSchema = generatedSchemas.LoopSlotInput;
62
86
  export const MCP_READ_TOOLS = [
@@ -160,7 +184,7 @@ export const MCP_READ_TOOLS = [
160
184
  {
161
185
  name: 'bclaw_list_sequences',
162
186
  description: 'List coordination sequences with optional filters on status and id.',
163
- annotations: { tier: 'advanced', category: 'coordination', headlessApproval: 'auto' },
187
+ annotations: { tier: 'standard', category: 'coordination', headlessApproval: 'auto' },
164
188
  inputSchema: {
165
189
  type: 'object',
166
190
  properties: {
@@ -624,7 +648,7 @@ const MCP_WRITE_TOOLS = [
624
648
  {
625
649
  name: 'bclaw_create_sequence',
626
650
  description: 'Create a coordination sequence shared by agents.',
627
- annotations: { tier: 'advanced', category: 'coordination', headlessApproval: 'prompt' },
651
+ annotations: { tier: 'standard', category: 'coordination', headlessApproval: 'prompt' },
628
652
  inputSchema: {
629
653
  type: 'object',
630
654
  properties: {
@@ -632,7 +656,7 @@ const MCP_WRITE_TOOLS = [
632
656
  description: { type: 'string', description: 'Optional sequence description.' },
633
657
  status: { type: 'string', description: 'Status: draft, active, archived.' },
634
658
  owner: { type: 'string', description: 'Optional sequence owner.' },
635
- items: { type: 'array', description: 'Sequence items in rank order.', items: { type: 'object' } },
659
+ items: { type: 'array', description: 'Sequence items in rank order.', items: SEQUENCE_ITEM_INPUT_SCHEMA },
636
660
  tags: { type: 'array', items: { type: 'string' }, description: 'Optional tags.' },
637
661
  agent: { type: 'string', description: 'Agent name.' },
638
662
  agentId: { type: 'string', description: 'Registered agent id.' },
@@ -643,7 +667,7 @@ const MCP_WRITE_TOOLS = [
643
667
  {
644
668
  name: 'bclaw_update_sequence',
645
669
  description: 'Update a coordination sequence status, metadata, or items.',
646
- annotations: { tier: 'advanced', category: 'coordination', headlessApproval: 'prompt' },
670
+ annotations: { tier: 'standard', category: 'coordination', headlessApproval: 'prompt' },
647
671
  inputSchema: {
648
672
  type: 'object',
649
673
  properties: {
@@ -652,7 +676,7 @@ const MCP_WRITE_TOOLS = [
652
676
  description: { type: 'string', description: 'Optional new description.' },
653
677
  status: { type: 'string', description: 'Status: draft, active, archived.' },
654
678
  owner: { type: 'string', description: 'Optional sequence owner.' },
655
- items: { type: 'array', description: 'Optional replacement items array.', items: { type: 'object' } },
679
+ items: { type: 'array', description: 'Optional replacement items array.', items: SEQUENCE_ITEM_INPUT_SCHEMA },
656
680
  tags: { type: 'array', items: { type: 'string' }, description: 'Optional replacement tags.' },
657
681
  agent: { type: 'string', description: 'Agent name.' },
658
682
  agentId: { type: 'string', description: 'Registered agent id.' },
@@ -754,7 +778,7 @@ const MCP_WRITE_TOOLS = [
754
778
  {
755
779
  name: 'bclaw_delete_sequence',
756
780
  description: 'Delete a sequence by ID. Requires trusted or curator trust level.',
757
- annotations: { tier: 'advanced', category: 'coordination', headlessApproval: 'prompt' },
781
+ annotations: { tier: 'standard', category: 'coordination', headlessApproval: 'prompt' },
758
782
  inputSchema: {
759
783
  type: 'object',
760
784
  properties: {
@@ -1092,7 +1116,7 @@ const MCP_WRITE_TOOLS = [
1092
1116
  inputSchema: {
1093
1117
  type: 'object',
1094
1118
  properties: {
1095
- entity: { type: 'string', description: 'Entity name: plan | decision | constraint | trap | handoff | runtime_note | candidate | claim | action | assignment | agent_run | cross_project_link. Others not yet wired.' },
1119
+ entity: { type: 'string', description: 'Entity name: plan | decision | constraint | trap | handoff | runtime_note | candidate | sequence | claim | action | assignment | agent_run | cross_project_link. Others not yet wired.' },
1096
1120
  filter: { type: 'object', description: 'Filter keys: status, tag (single tag), tags (array, any-match), author, plan_id, source, auto_generated, limit, offset, includeLegacy (bool, default false), minAutoReflectConfidence (0-1, default 0.6). entity=agent_run also accepts assignment_id, claim_id, message_id.' },
1097
1121
  project: { type: 'string', description: 'Optional: name (or path/basename) of a linked project to query. Defaults to the current project. Only cross_project_links (config.yaml) and workspace store-chain children are accepted — list with `brainclaw link list`.' },
1098
1122
  },
@@ -46,6 +46,10 @@ const PROFILES = {
46
46
  invoke_binary: 'claude',
47
47
  invoke_review_template: 'claude -p --allowedTools "Read,Glob,Grep" {prompt}',
48
48
  invoke_consult_template: 'claude -p --allowedTools "Read,Glob,Grep" {prompt}',
49
+ // pln#520 step 3: model is selectable via `--model` — no need for a
50
+ // per-model pseudo-identity. `claude-sonnet` below is now redundant
51
+ // (run `claude-code --model sonnet`) and kept only for back-compat.
52
+ model_flag: '--model',
49
53
  },
50
54
  'claude-sonnet': {
51
55
  name: 'claude-sonnet', category: 'code-agent', workflowModel: 'interactive',
@@ -323,6 +327,63 @@ export function getCapabilityProfile(name) {
323
327
  const resolved = resolveAgentAlias(name);
324
328
  return _customProfiles.get(resolved) ?? PROFILES[resolved];
325
329
  }
330
+ /**
331
+ * pln#520 step 3 — concurrency is a resolvable execution-config value, NOT a
332
+ * structural constant baked into agent identity.
333
+ *
334
+ * The host resource a concurrency cap actually protects is the binary on the
335
+ * machine (its API quota / its RAM/CPU footprint), not the agent label.
336
+ * `resolveResourceKey` returns that shared key so callers count usage across
337
+ * every identity that drives one binary. This kills the can_dc4e4a11 bug:
338
+ * `claude-code` and `claude-sonnet` are the SAME `claude` binary on the SAME
339
+ * host but were counted separately (3 + 6 → up to 9 concurrent `claude`
340
+ * processes, oversubscribing the machine + API).
341
+ */
342
+ export function resolveResourceKey(name) {
343
+ const profile = getCapabilityProfile(name);
344
+ return profile?.invoke_binary ?? resolveAgentAlias(name);
345
+ }
346
+ /**
347
+ * Resolve the concurrency limit for an agent. `Infinity` = unlimited.
348
+ *
349
+ * Resolution chain (highest priority first), decoupled from agent identity:
350
+ * 1. explicit `override` (e.g. `brainclaw dispatch --max-concurrency N`)
351
+ * 2. host opt-in cap via `BRAINCLAW_MAX_CONCURRENCY` (protect one machine / quota)
352
+ * 3. structural floor — agents that cannot run headless in parallel
353
+ * (IDE / desktop agents, i.e. not CLI-spawnable) stay hard-capped at their
354
+ * profile `max_concurrent_tasks` (you can't spawn N IDE windows headlessly)
355
+ * 4. default for parallelizable CLI agents: UNLIMITED. There is no arbitrary
356
+ * per-identity throttle — the operator opts into a cap when they want one.
357
+ *
358
+ * When a finite cap applies it is enforced per host-binary resource
359
+ * (see `resolveResourceKey`), so all variants of one binary share the pool.
360
+ */
361
+ export function resolveConcurrencyLimit(name, opts = {}) {
362
+ if (opts.override !== undefined && opts.override > 0)
363
+ return opts.override;
364
+ const envCap = Number(process.env.BRAINCLAW_MAX_CONCURRENCY);
365
+ if (Number.isFinite(envCap) && envCap > 0)
366
+ return envCap;
367
+ const profile = getCapabilityProfile(name);
368
+ if (!profile?.runtime?.canBeSpawnedCli)
369
+ return profile?.max_concurrent_tasks ?? 1;
370
+ return Infinity;
371
+ }
372
+ /** JSON-safe rendering of a concurrency limit: `Infinity` → `null` (= unlimited). */
373
+ export function serializeConcurrencyLimit(limit) {
374
+ return Number.isFinite(limit) ? limit : null;
375
+ }
376
+ /**
377
+ * pln#520 step 3 — resolve the model for a dispatch, decoupled from agent
378
+ * identity. Chain (highest priority first): explicit override (e.g.
379
+ * `dispatch --model`) → lane model → identity model → profile default.
380
+ * Returns `undefined` when nothing in the chain specifies one (the agent's
381
+ * template default applies).
382
+ */
383
+ export function resolveModel(name, opts = {}) {
384
+ const profile = getCapabilityProfile(name);
385
+ return opts.override ?? opts.lane ?? opts.identity ?? profile?.default_model;
386
+ }
326
387
  /**
327
388
  * Escape a string for safe use as a double-quoted shell argument.
328
389
  * Escapes characters that have special meaning inside double-quotes
@@ -490,6 +551,12 @@ export function buildInvokeCommand(name, prompt, options = {}) {
490
551
  const rawTokens = parseTemplateString(templateStr);
491
552
  if (rawTokens.length === 0)
492
553
  return undefined;
554
+ // pln#520 step 3: inject the resolved model right after the binary so model
555
+ // choice is decoupled from agent identity. Only when the profile declares a
556
+ // `model_flag` and the template doesn't already pin a model (don't double it).
557
+ if (options.model && profile.model_flag && !rawTokens.includes(profile.model_flag)) {
558
+ rawTokens.splice(1, 0, profile.model_flag, options.model);
559
+ }
493
560
  const executable = rawTokens[0];
494
561
  const interpolatedTokens = rawTokens.slice(1).map((tok) => tok === '{prompt}' ? embeddedPrompt : tok);
495
562
  // ── 5. Build the args array ───────────────────────────────────────────────
@@ -38,6 +38,7 @@ import { loadClaim } from './claims.js';
38
38
  import { loadAssignment } from './assignments.js';
39
39
  import { createRuntimeEvent } from './events.js';
40
40
  import { nowISO } from './ids.js';
41
+ import { readHeartbeat, readLogTail, signalExists } from './runtime-signals.js';
41
42
  // ── Constants ──────────────────────────────────────────────────────────────
42
43
  /**
43
44
  * Minimum age before a run is eligible for reconciliation. Below this, the
@@ -52,6 +53,11 @@ export const DEFAULT_HEALTH_CHECK_GRACE_MS = 60_000;
52
53
  export const DEFAULT_STALE_AFTER_MS = 30 * 60_000;
53
54
  export const DEFAULT_DEAD_PID_READ_SWEEP_AGE_MS = 5 * 60_000;
54
55
  export const DEFAULT_DEAD_PID_READ_SWEEP_LIMIT = 50;
56
+ /**
57
+ * pln#520 step 1 — a heartbeat older than this (with no completion signal) means
58
+ * the worker reached its loop then went silent: `stalled`. Default 10 min.
59
+ */
60
+ export const DEFAULT_HEARTBEAT_STALE_MS = 10 * 60_000;
55
61
  const TERMINAL_STATUSES = new Set([
56
62
  'completed', 'failed', 'cancelled', 'timed_out', 'interrupted',
57
63
  ]);
@@ -152,15 +158,51 @@ export function collectEvidence(run, cwd, options) {
152
158
  }
153
159
  catch { /* defensive */ }
154
160
  const process_alive = isProcessAlive(run.pid);
155
- return { age_ms, has_post_start_commit, claim_released, assignment_completed, process_alive };
161
+ // pln#520 step 1 sentinel evidence. Signals live under the project
162
+ // coordination dir (the dispatcher's ackRoot), which is `cwd` for the
163
+ // reconciler. Keyed by assignment_id.
164
+ const signalRoot = cwd ?? process.cwd();
165
+ let completed_signal = false;
166
+ let failed_signal = false;
167
+ let heartbeat_exists = false;
168
+ let heartbeat_age_ms;
169
+ try {
170
+ completed_signal = signalExists(signalRoot, run.assignment_id, 'completed');
171
+ failed_signal = signalExists(signalRoot, run.assignment_id, 'failed');
172
+ const hb = readHeartbeat(signalRoot, run.assignment_id);
173
+ heartbeat_exists = hb.exists;
174
+ if (hb.exists && hb.mtimeMs !== undefined)
175
+ heartbeat_age_ms = now - hb.mtimeMs;
176
+ }
177
+ catch { /* defensive */ }
178
+ return {
179
+ age_ms, has_post_start_commit, claim_released, assignment_completed, process_alive,
180
+ completed_signal, failed_signal, heartbeat_exists, heartbeat_age_ms,
181
+ };
156
182
  }
157
183
  function anyCompletionEvidence(evidence) {
158
- return evidence.has_post_start_commit
184
+ return evidence.completed_signal
185
+ || evidence.has_post_start_commit
159
186
  || evidence.claim_released
160
187
  || evidence.assignment_completed;
161
188
  }
189
+ /**
190
+ * pln#520 step 1 — a short tail of the captured stderr (or stdout) for
191
+ * failed_silent / stalled diagnostics, so the verdict carries the worker's
192
+ * last words instead of just a status code.
193
+ */
194
+ function logTailSuffix(run, cwd) {
195
+ const root = cwd ?? process.cwd();
196
+ const tail = (readLogTail(root, run.assignment_id, 'stderr', 500).trim()
197
+ || readLogTail(root, run.assignment_id, 'stdout', 500).trim());
198
+ if (!tail)
199
+ return '';
200
+ return ` | log tail: ${tail.replace(/\s+/g, ' ').slice(0, 300)}`;
201
+ }
162
202
  function describeEvidence(evidence) {
163
203
  const reasons = [];
204
+ if (evidence.completed_signal)
205
+ reasons.push('wrapper wrote completed sentinel');
164
206
  if (evidence.has_post_start_commit)
165
207
  reasons.push('post-start commit on worktree branch');
166
208
  if (evidence.claim_released)
@@ -231,6 +273,7 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
231
273
  const evidence = {
232
274
  age_ms: 0, has_post_start_commit: false, claim_released: false,
233
275
  assignment_completed: false, process_alive: undefined,
276
+ completed_signal: false, failed_signal: false, heartbeat_exists: false,
234
277
  };
235
278
  return {
236
279
  run_id: runId, action: 'no_op', reason: 'run not found', evidence,
@@ -280,18 +323,12 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
280
323
  };
281
324
  }
282
325
  }
283
- // Failure inference: stale + dead process + no evidence.
284
- if (evidence.age_ms >= stale && evidence.process_alive === false) {
326
+ // pln#520 step 1 sentinel-based failure (fast + trustworthy, pid-independent).
327
+ const heartbeatStale = options.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
328
+ const failHere = (reason) => {
285
329
  try {
286
- transitionAgentRun(runId, 'failed', {
287
- actor,
288
- status_reason: 'silent_termination_no_evidence',
289
- }, cwd);
290
- return {
291
- run_id: runId, action: 'inferred_failed',
292
- reason: 'silent_termination_no_evidence',
293
- evidence, previous_status, current_status: 'failed',
294
- };
330
+ transitionAgentRun(runId, 'failed', { actor, status_reason: reason }, cwd);
331
+ return { run_id: runId, action: 'inferred_failed', reason, evidence, previous_status, current_status: 'failed' };
295
332
  }
296
333
  catch (err) {
297
334
  return {
@@ -300,6 +337,26 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
300
337
  evidence, previous_status, current_status: run.status,
301
338
  };
302
339
  }
340
+ };
341
+ // `failed` sentinel — the wrapper saw a non-zero agent exit.
342
+ if (evidence.failed_signal) {
343
+ return failHere(`failed_silent: wrapper reported non-zero exit${logTailSuffix(run, cwd)}`);
344
+ }
345
+ // Heartbeat present but stale → reached the loop then went silent.
346
+ if (evidence.heartbeat_exists && evidence.heartbeat_age_ms !== undefined && evidence.heartbeat_age_ms >= heartbeatStale) {
347
+ return failHere(`stalled: heartbeat last seen ${Math.round(evidence.heartbeat_age_ms / 1000)}s ago${logTailSuffix(run, cwd)}`);
348
+ }
349
+ // Fresh heartbeat → alive; trust it over the untrustworthy wrapper pid.
350
+ if (evidence.heartbeat_exists) {
351
+ return {
352
+ run_id: runId, action: 'no_op',
353
+ reason: `heartbeat fresh (${Math.round((evidence.heartbeat_age_ms ?? 0) / 1000)}s) — worker alive, pid untrusted`,
354
+ evidence, previous_status, current_status: run.status,
355
+ };
356
+ }
357
+ // Failure inference: stale + dead process + no evidence.
358
+ if (evidence.age_ms >= stale && evidence.process_alive === false) {
359
+ return failHere('silent_termination_no_evidence');
303
360
  }
304
361
  // Health-check window: past grace, not yet stale, no evidence either way.
305
362
  // Emit a non-mutating event so callers see the uncertainty without
@@ -339,6 +396,7 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
339
396
  const evidence = {
340
397
  age_ms: 0, has_post_start_commit: false, claim_released: false,
341
398
  assignment_completed: false, process_alive: undefined,
399
+ completed_signal: false, failed_signal: false, heartbeat_exists: false,
342
400
  };
343
401
  return {
344
402
  run_id: runId, action: 'no_op', reason: 'run not found', evidence,
@@ -352,19 +410,25 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
352
410
  evidence, previous_status: run.status, current_status: run.status,
353
411
  };
354
412
  }
355
- if (evidence.process_alive !== false) {
356
- return {
357
- run_id: run.id, action: 'no_op',
358
- reason: evidence.process_alive === true ? 'process alive' : 'pid liveness unknown',
359
- evidence, previous_status: run.status, current_status: run.status,
360
- };
361
- }
362
- // pid reads dead — but the tracked pid is NOT trustworthy (see doc above),
363
- // so a bare dead pid NEVER cancels. Evidence of real work wins; otherwise
364
- // surface the uncertainty non-destructively and leave the run `running` for
365
- // reconcileAgentRun's stale-threshold path to fail it only after a fair,
366
- // evidence-based delay.
367
413
  const actor = options.actor ?? 'reconciler';
414
+ const stale = options.staleAfterMs ?? DEFAULT_STALE_AFTER_MS;
415
+ const heartbeatStale = options.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
416
+ const failRun = (reason) => {
417
+ try {
418
+ transitionAgentRun(run.id, 'failed', { actor, status_reason: reason }, cwd);
419
+ return { run_id: run.id, action: 'inferred_failed', reason, evidence, previous_status: run.status, current_status: 'failed' };
420
+ }
421
+ catch (err) {
422
+ return {
423
+ run_id: run.id, action: 'no_op',
424
+ reason: `failure transition rejected: ${err instanceof Error ? err.message : String(err)}`,
425
+ evidence, previous_status: run.status, current_status: run.status,
426
+ };
427
+ }
428
+ };
429
+ // ── pln#520 step 1: SENTINELS are authoritative, independent of the
430
+ // untrustworthy wrapper pid. Check them first. ──────────────────────────
431
+ // 1. Completion evidence (mechanical `completed` sentinel or work evidence).
368
432
  if (anyCompletionEvidence(evidence)) {
369
433
  try {
370
434
  transitionAgentRun(run.id, 'completed', {
@@ -385,33 +449,43 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
385
449
  };
386
450
  }
387
451
  }
388
- // Stale + provably dead + still no evidence -> genuine silent failure. This
389
- // MUST converge HERE: the canonical read path (entity-operations.ts) and the
390
- // MCP pre-read sweep route `running` runs through this function, never
391
- // through reconcileAgentRun, so deferring would leave a crashed run `running`
392
- // forever (trp#292). The 30-min stale window — vs the immediate cancel before
393
- // pln#520 — gives a worker behind an untrusted pid ample time to leave
394
- // evidence first. Reported as `failed` (it died), not `cancelled`.
395
- const stale = options.staleAfterMs ?? DEFAULT_STALE_AFTER_MS;
452
+ // 2. `failed` sentinel — the wrapper saw a non-zero agent exit. This is the
453
+ // FAST, TRUSTWORTHY failed_silent detector (vs the pid heuristic that caused
454
+ // can_f792cacd false negatives). Carries the captured log tail.
455
+ if (evidence.failed_signal) {
456
+ return failRun(`failed_silent: wrapper reported non-zero exit${logTailSuffix(run, cwd)}`);
457
+ }
458
+ // 3. Heartbeat present but STALE — the worker reached its loop then went
459
+ // silent (e.g. hung). pid-independent: a hung worker keeps the wrapper alive.
460
+ if (evidence.heartbeat_exists && evidence.heartbeat_age_ms !== undefined && evidence.heartbeat_age_ms >= heartbeatStale) {
461
+ return failRun(`stalled: heartbeat last seen ${Math.round(evidence.heartbeat_age_ms / 1000)}s ago${logTailSuffix(run, cwd)}`);
462
+ }
463
+ // 4. Fresh heartbeat → the worker is alive and working; trust it OVER the
464
+ // (untrustworthy) wrapper pid. This is the can_f792cacd fix: never fail a
465
+ // live, heartbeating worker just because its wrapper pid reads dead.
466
+ if (evidence.heartbeat_exists) {
467
+ return {
468
+ run_id: run.id, action: 'no_op',
469
+ reason: `heartbeat fresh (${Math.round((evidence.heartbeat_age_ms ?? 0) / 1000)}s) — worker alive, pid untrusted`,
470
+ evidence, previous_status: run.status, current_status: run.status,
471
+ };
472
+ }
473
+ // ── No sentinel, no heartbeat: fall back to the pid-conservative path. The
474
+ // wrapper writes completed/failed on any normal exit, so reaching here means
475
+ // the worker has not exited and never heartbeat. Do NOT fast-fail on a dead
476
+ // pid (it's the wrapper's, not the worker's). ──────────────────────────────
477
+ if (evidence.process_alive !== false) {
478
+ return {
479
+ run_id: run.id, action: 'no_op',
480
+ reason: evidence.process_alive === true ? 'process alive' : 'pid liveness unknown',
481
+ evidence, previous_status: run.status, current_status: run.status,
482
+ };
483
+ }
484
+ // pid dead + no sentinel + no heartbeat: only converge after the long stale
485
+ // window (trp#292 — must converge HERE since the read path never routes
486
+ // through reconcileAgentRun), giving an untrusted-pid worker ample time.
396
487
  if (evidence.age_ms >= stale) {
397
- try {
398
- transitionAgentRun(run.id, 'failed', {
399
- actor,
400
- status_reason: 'silent_termination_no_evidence',
401
- }, cwd);
402
- return {
403
- run_id: run.id, action: 'inferred_failed',
404
- reason: 'silent_termination_no_evidence',
405
- evidence, previous_status: run.status, current_status: 'failed',
406
- };
407
- }
408
- catch (err) {
409
- return {
410
- run_id: run.id, action: 'no_op',
411
- reason: `failure transition rejected: ${err instanceof Error ? err.message : String(err)}`,
412
- evidence, previous_status: run.status, current_status: run.status,
413
- };
414
- }
488
+ return failRun('silent_termination_no_evidence');
415
489
  }
416
490
  emitUnverifiedEvent(run, evidence, actor, cwd);
417
491
  return {
@@ -457,7 +531,7 @@ export function reconcileAllOpenRuns(cwd, filter = {}, options = {}) {
457
531
  catch {
458
532
  results.push({
459
533
  run_id: run.id, action: 'no_op', reason: 'reconcile threw — skipped',
460
- evidence: { age_ms: 0, has_post_start_commit: false, claim_released: false, assignment_completed: false, process_alive: undefined },
534
+ evidence: { age_ms: 0, has_post_start_commit: false, claim_released: false, assignment_completed: false, process_alive: undefined, completed_signal: false, failed_signal: false, heartbeat_exists: false },
461
535
  previous_status: run.status, current_status: run.status,
462
536
  });
463
537
  }
@@ -11,7 +11,7 @@ import { inferProjectFromTarget, loadInstructions, resolveInstructions } from '.
11
11
  import { buildReputationSummary, findAgentReputationSummary } from './reputation.js';
12
12
  import { listRuntimeNotes } from './runtime.js';
13
13
  import { loadState, persistState } from './state.js';
14
- import { getCapabilityProfile } from './agent-capability.js';
14
+ import { resolveConcurrencyLimit, serializeConcurrencyLimit } from './agent-capability.js';
15
15
  import { loadAllSessions } from './identity.js';
16
16
  import { countActionable } from './messaging.js';
17
17
  import { listCandidates } from './candidates.js';
@@ -176,8 +176,7 @@ function buildOtherAgentsSummary(claims, notes, currentAgent, cwd) {
176
176
  for (const identity of listAgentIdentities(cwd)) {
177
177
  if (identity.agent_name === currentAgent)
178
178
  continue;
179
- const profile = getCapabilityProfile(identity.agent_name);
180
- const maxTasks = profile?.max_concurrent_tasks ?? 1;
179
+ const limit = serializeConcurrencyLimit(resolveConcurrencyLimit(identity.agent_name));
181
180
  agentMap.set(identity.agent_name, {
182
181
  name: identity.agent_name,
183
182
  trust_level: identity.trust_level ?? 'contributor',
@@ -185,23 +184,25 @@ function buildOtherAgentsSummary(claims, notes, currentAgent, cwd) {
185
184
  scopes: [],
186
185
  has_open_session: false,
187
186
  instance_count: sessionCounts.get(identity.agent_name) ?? 0,
188
- max_tasks: maxTasks,
189
- slots_remaining: maxTasks, // will be reduced when claims are counted
187
+ max_tasks: limit,
188
+ slots_remaining: limit, // will be reduced when claims are counted (null stays unlimited)
190
189
  });
191
190
  }
192
191
  // Enrich with active claims
193
192
  for (const claim of claims) {
194
193
  if (claim.agent === currentAgent)
195
194
  continue;
196
- const profile = getCapabilityProfile(claim.agent);
197
- const maxTasks = profile?.max_concurrent_tasks ?? 1;
195
+ const limit = serializeConcurrencyLimit(resolveConcurrencyLimit(claim.agent));
198
196
  const existing = agentMap.get(claim.agent) ?? {
199
197
  name: claim.agent, trust_level: 'contributor', claim_count: 0, scopes: [],
200
198
  has_open_session: false, instance_count: sessionCounts.get(claim.agent) ?? 0,
201
- max_tasks: maxTasks, slots_remaining: maxTasks,
199
+ max_tasks: limit, slots_remaining: limit,
202
200
  };
203
201
  existing.claim_count++;
204
- existing.slots_remaining = Math.max(0, existing.max_tasks - existing.claim_count);
202
+ // null max_tasks = unlimited — slots stay unlimited.
203
+ existing.slots_remaining = existing.max_tasks === null
204
+ ? null
205
+ : Math.max(0, existing.max_tasks - existing.claim_count);
205
206
  existing.scopes.push(claim.scope);
206
207
  if (!existing.last_active || claim.created_at > existing.last_active) {
207
208
  existing.last_active = claim.created_at;