npm - brainclaw - Versions diffs - 1.7.1 → 1.7.2 - Mend

brainclaw 1.7.1 → 1.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

package/README.md +8 -0
package/dist/brainclaw-vscode.vsix +0 -0
package/dist/cli.js +12 -2
package/dist/commands/dispatch.js +2 -0
package/dist/commands/doctor.js +17 -0
package/dist/commands/mcp.js +31 -7
package/dist/core/agent-capability.js +67 -0
package/dist/core/agentrun-reconciler.js +126 -52
package/dist/core/coordination.js +10 -9
package/dist/core/dispatcher.js +99 -29
package/dist/core/entity-operations.js +54 -1
package/dist/core/execution-adapters.js +32 -51
package/dist/core/execution.js +14 -8
package/dist/core/instruction-templates.js +4 -3
package/dist/core/runtime-signals.js +102 -0
package/dist/core/spawn-check.js +125 -0
package/dist/facts.js +3 -3
package/dist/facts.json +2 -2
package/docs/cli.md +8 -4
package/docs/integrations/mcp.md +48 -15
package/docs/mcp-schema-changelog.md +16 -5
package/docs/playbooks/team/index.md +7 -5
package/package.json +1 -1

package/README.md CHANGED Viewed

@@ -345,6 +345,14 @@ npm run test:coverage      # with coverage report
 For older releases (v0.x and the early v1.0 launch series), `git log` on `master` is the source of truth — every release commit follows the `chore(release): bump version to <semver>` convention, and the matching feature/fix commits reference their plan id (e.g. `feat(mcp): self-heal ... (pln#478)`).
+### v1.7.2
+- **Sequence MCP tools are agent-first by default** — sequence creation,
+  listing, update, and deletion tools are now in the default MCP catalog, with
+  explicit lane item schemas (`planId`, optional `stepId`, `rank`,
+  dependencies, lane metadata) and matching canonical CRUD validation for
+  `entity="sequence"`.
 ### v1.7.1
 - **MCP project context isolation fix** — `bclaw_switch` now keeps MCP switches

package/dist/brainclaw-vscode.vsix CHANGED Viewed

Binary file

package/dist/cli.js CHANGED Viewed

@@ -30,7 +30,7 @@ import { runInstruction } from './commands/instruction.js';
 import { runListAgents } from './commands/list-agents.js';
 import { runSurfaceTaskResource } from './commands/surface-task-resource.js';
 import { runListInstructions } from './commands/list-instructions.js';
-import { runDoctor } from './commands/doctor.js';
+import { runDoctor, runDoctorSpawnCheck } from './commands/doctor.js';
 import { runRepair } from './commands/repair.js';
 import { runStale } from './commands/stale.js';
 import { runRebuild } from './commands/rebuild.js';
@@ -681,7 +681,13 @@ program
     .option('--repair', 'Rebuild dist/ when the MCP runtime is missing or stale')
     .option('--after-migration', 'Run the v1.0 post-migration health check only (exits non-zero on any failure)')
     .option('--dispatch', 'Run dispatch-health diagnostic only: reconcile open agent_runs and report stuck/unverified/silent failures (pln#496 step stp_8c072d75)')
-    .action((options) => {
+    .option('--spawn-check', 'Real spawn round-trip per installed agent before dispatch (pln#520 step 2): validates delivery + handshake on this host, exits non-zero on any installed-agent failure')
+    .option('--spawn-check-timeout <ms>', 'Per-agent timeout for --spawn-check (default 15000)', parseInt)
+    .action(async (options) => {
+    if (options.spawnCheck) {
+        await runDoctorSpawnCheck({ cwd: options.cwd, json: options.json, timeoutMs: options.spawnCheckTimeout });
+        return;
+    }
     runDoctor({ ...options, afterMigration: options.afterMigration, dispatch: options.dispatch });
 });
 // --- repair (Phase 4 Sprint 2 Lane C / pln#397) ---
@@ -1441,6 +1447,8 @@ dispatchCmd
     .option('--agents <names>', 'Comma-separated list of agents to dispatch to')
     .option('--lanes <names>', 'Comma-separated list of lanes to dispatch')
     .option('--max <n>', 'Maximum assignments', parseInt)
+    .option('--max-concurrency <n>', 'Opt-in cap on concurrent instances per host-binary (default: unlimited)', parseInt)
+    .option('--model <name>', 'Model to run, decoupled from agent identity (e.g. --model sonnet)')
     .option('--dry', 'Preview assignments without sending messages')
     .option('--spawn', 'Autonomously launch CLI agents with invoke templates')
     .option('--agent <name>', 'Dispatcher agent name')
@@ -1450,6 +1458,8 @@ dispatchCmd
         agents: options.agents,
         lanes: options.lanes,
         max: options.max,
+        maxConcurrency: options.maxConcurrency,
+        model: options.model,
         dry: options.dry,
         spawn: options.spawn,
         agent: options.agent,

package/dist/commands/dispatch.js CHANGED Viewed

@@ -87,6 +87,8 @@ export async function runDispatch(options) {
         dryRun: options.dry,
         dispatcherAgent,
         autoExecute: options.spawn,
+        maxConcurrency: options.maxConcurrency,
+        model: options.model,
     }, effectiveCwd);
     if (!result) {
         console.log('No active sequence found.');

package/dist/commands/doctor.js CHANGED Viewed

@@ -3,6 +3,7 @@ import fs from 'node:fs';
 import path from 'node:path';
 import * as childProcess from 'node:child_process';
 import { reconcileAllOpenRuns } from '../core/agentrun-reconciler.js';
+import { runSpawnCheck, renderSpawnCheckReport } from '../core/spawn-check.js';
 import { loadAgentRun } from '../core/agentruns.js';
 import { listAgentIdentities, resolveCurrentAgentIdentity } from '../core/agent-registry.js';
 import { listCapabilities as listRegistryCapabilities, listTools as listRegistryTools } from '../core/registries.js';
@@ -565,6 +566,22 @@ function renderDispatchHealthHumanReport(report) {
     }
     return lines.join('\n');
 }
+/**
+ * pln#520 step 2 — `brainclaw doctor --spawn-check`. Real spawn round-trip per
+ * installed agent on the current host. Exits non-zero if any installed agent
+ * fails (so it gates CI / a pre-dispatch pre-flight).
+ */
+export async function runDoctorSpawnCheck(options = {}) {
+    const report = await runSpawnCheck(options);
+    if (options.json) {
+        console.log(JSON.stringify(report, null, 2));
+    }
+    else {
+        console.log(renderSpawnCheckReport(report));
+    }
+    if (report.exit_code !== 0)
+        process.exit(report.exit_code);
+}
 export function runDoctor(options = {}) {
     if (options.dispatch) {
         const report = runDispatchHealthCheck(options);

package/dist/commands/mcp.js CHANGED Viewed

@@ -57,6 +57,30 @@ export const SCHEMA_VERSION = '1.0.0';
 export const MCP_PROTOCOL_VERSIONS = ['2025-11-25', '2024-11-05'];
 export const MCP_SERVER_NOT_INITIALIZED = -32002;
 const MCP_RUNTIME_REPAIR_COMMAND = 'brainclaw doctor --repair';
+const SEQUENCE_ITEM_INPUT_SCHEMA = {
+    type: 'object',
+    description: 'Sequence lane item. planId is required; stepId optionally narrows dispatch/readiness to a specific plan step.',
+    properties: {
+        planId: { type: 'string', minLength: 1, description: 'Plan item ID referenced by this sequence item.' },
+        stepId: { type: 'string', minLength: 1, description: 'Optional plan step ID inside planId for step-level dispatch/readiness.' },
+        rank: { type: 'number', minimum: 1, description: 'Positive integer ordering key. Ranks must be unique within a sequence.' },
+        hard_after: {
+            type: 'array',
+            items: { type: 'string' },
+            description: 'Sequence item planId values that must complete before this item becomes ready.',
+        },
+        soft_after: {
+            type: 'array',
+            items: { type: 'string' },
+            description: 'Advisory predecessor planId values; they inform ordering but do not block readiness.',
+        },
+        lane: { type: 'string', description: 'Optional lane label used for parallel dispatch grouping and filtering.' },
+        scope_hint: { type: 'string', description: 'Optional file/path scope hint for claim and brief generation.' },
+        rationale: { type: 'string', description: 'Optional explanation for this item or dependency placement.' },
+    },
+    required: ['planId', 'rank'],
+    additionalProperties: false,
+};
 const { $defs: loopPhaseDefs, ...loopPhaseItemSchema } = generatedSchemas.LoopPhase;
 const loopSlotInputItemSchema = generatedSchemas.LoopSlotInput;
 export const MCP_READ_TOOLS = [
@@ -160,7 +184,7 @@ export const MCP_READ_TOOLS = [
     {
         name: 'bclaw_list_sequences',
         description: 'List coordination sequences with optional filters on status and id.',
-        annotations: { tier: 'advanced', category: 'coordination', headlessApproval: 'auto' },
+        annotations: { tier: 'standard', category: 'coordination', headlessApproval: 'auto' },
         inputSchema: {
             type: 'object',
             properties: {
@@ -624,7 +648,7 @@ const MCP_WRITE_TOOLS = [
     {
         name: 'bclaw_create_sequence',
         description: 'Create a coordination sequence shared by agents.',
-        annotations: { tier: 'advanced', category: 'coordination', headlessApproval: 'prompt' },
+        annotations: { tier: 'standard', category: 'coordination', headlessApproval: 'prompt' },
         inputSchema: {
             type: 'object',
             properties: {
@@ -632,7 +656,7 @@ const MCP_WRITE_TOOLS = [
                 description: { type: 'string', description: 'Optional sequence description.' },
                 status: { type: 'string', description: 'Status: draft, active, archived.' },
                 owner: { type: 'string', description: 'Optional sequence owner.' },
-                items: { type: 'array', description: 'Sequence items in rank order.', items: { type: 'object' } },
+                items: { type: 'array', description: 'Sequence items in rank order.', items: SEQUENCE_ITEM_INPUT_SCHEMA },
                 tags: { type: 'array', items: { type: 'string' }, description: 'Optional tags.' },
                 agent: { type: 'string', description: 'Agent name.' },
                 agentId: { type: 'string', description: 'Registered agent id.' },
@@ -643,7 +667,7 @@ const MCP_WRITE_TOOLS = [
     {
         name: 'bclaw_update_sequence',
         description: 'Update a coordination sequence status, metadata, or items.',
-        annotations: { tier: 'advanced', category: 'coordination', headlessApproval: 'prompt' },
+        annotations: { tier: 'standard', category: 'coordination', headlessApproval: 'prompt' },
         inputSchema: {
             type: 'object',
             properties: {
@@ -652,7 +676,7 @@ const MCP_WRITE_TOOLS = [
                 description: { type: 'string', description: 'Optional new description.' },
                 status: { type: 'string', description: 'Status: draft, active, archived.' },
                 owner: { type: 'string', description: 'Optional sequence owner.' },
-                items: { type: 'array', description: 'Optional replacement items array.', items: { type: 'object' } },
+                items: { type: 'array', description: 'Optional replacement items array.', items: SEQUENCE_ITEM_INPUT_SCHEMA },
                 tags: { type: 'array', items: { type: 'string' }, description: 'Optional replacement tags.' },
                 agent: { type: 'string', description: 'Agent name.' },
                 agentId: { type: 'string', description: 'Registered agent id.' },
@@ -754,7 +778,7 @@ const MCP_WRITE_TOOLS = [
     {
         name: 'bclaw_delete_sequence',
         description: 'Delete a sequence by ID. Requires trusted or curator trust level.',
-        annotations: { tier: 'advanced', category: 'coordination', headlessApproval: 'prompt' },
+        annotations: { tier: 'standard', category: 'coordination', headlessApproval: 'prompt' },
         inputSchema: {
             type: 'object',
             properties: {
@@ -1092,7 +1116,7 @@ const MCP_WRITE_TOOLS = [
         inputSchema: {
             type: 'object',
             properties: {
-                entity: { type: 'string', description: 'Entity name: plan | decision | constraint | trap | handoff | runtime_note | candidate | claim | action | assignment | agent_run | cross_project_link. Others not yet wired.' },
+                entity: { type: 'string', description: 'Entity name: plan | decision | constraint | trap | handoff | runtime_note | candidate | sequence | claim | action | assignment | agent_run | cross_project_link. Others not yet wired.' },
                 filter: { type: 'object', description: 'Filter keys: status, tag (single tag), tags (array, any-match), author, plan_id, source, auto_generated, limit, offset, includeLegacy (bool, default false), minAutoReflectConfidence (0-1, default 0.6). entity=agent_run also accepts assignment_id, claim_id, message_id.' },
                 project: { type: 'string', description: 'Optional: name (or path/basename) of a linked project to query. Defaults to the current project. Only cross_project_links (config.yaml) and workspace store-chain children are accepted — list with `brainclaw link list`.' },
             },

package/dist/core/agent-capability.js CHANGED Viewed

@@ -46,6 +46,10 @@ const PROFILES = {
         invoke_binary: 'claude',
         invoke_review_template: 'claude -p --allowedTools "Read,Glob,Grep" {prompt}',
         invoke_consult_template: 'claude -p --allowedTools "Read,Glob,Grep" {prompt}',
+        // pln#520 step 3: model is selectable via `--model` — no need for a
+        // per-model pseudo-identity. `claude-sonnet` below is now redundant
+        // (run `claude-code --model sonnet`) and kept only for back-compat.
+        model_flag: '--model',
     },
     'claude-sonnet': {
         name: 'claude-sonnet', category: 'code-agent', workflowModel: 'interactive',
@@ -323,6 +327,63 @@ export function getCapabilityProfile(name) {
     const resolved = resolveAgentAlias(name);
     return _customProfiles.get(resolved) ?? PROFILES[resolved];
 }
+/**
+ * pln#520 step 3 — concurrency is a resolvable execution-config value, NOT a
+ * structural constant baked into agent identity.
+ *
+ * The host resource a concurrency cap actually protects is the binary on the
+ * machine (its API quota / its RAM/CPU footprint), not the agent label.
+ * `resolveResourceKey` returns that shared key so callers count usage across
+ * every identity that drives one binary. This kills the can_dc4e4a11 bug:
+ * `claude-code` and `claude-sonnet` are the SAME `claude` binary on the SAME
+ * host but were counted separately (3 + 6 → up to 9 concurrent `claude`
+ * processes, oversubscribing the machine + API).
+ */
+export function resolveResourceKey(name) {
+    const profile = getCapabilityProfile(name);
+    return profile?.invoke_binary ?? resolveAgentAlias(name);
+}
+/**
+ * Resolve the concurrency limit for an agent. `Infinity` = unlimited.
+ *
+ * Resolution chain (highest priority first), decoupled from agent identity:
+ *   1. explicit `override` (e.g. `brainclaw dispatch --max-concurrency N`)
+ *   2. host opt-in cap via `BRAINCLAW_MAX_CONCURRENCY` (protect one machine / quota)
+ *   3. structural floor — agents that cannot run headless in parallel
+ *      (IDE / desktop agents, i.e. not CLI-spawnable) stay hard-capped at their
+ *      profile `max_concurrent_tasks` (you can't spawn N IDE windows headlessly)
+ *   4. default for parallelizable CLI agents: UNLIMITED. There is no arbitrary
+ *      per-identity throttle — the operator opts into a cap when they want one.
+ *
+ * When a finite cap applies it is enforced per host-binary resource
+ * (see `resolveResourceKey`), so all variants of one binary share the pool.
+ */
+export function resolveConcurrencyLimit(name, opts = {}) {
+    if (opts.override !== undefined && opts.override > 0)
+        return opts.override;
+    const envCap = Number(process.env.BRAINCLAW_MAX_CONCURRENCY);
+    if (Number.isFinite(envCap) && envCap > 0)
+        return envCap;
+    const profile = getCapabilityProfile(name);
+    if (!profile?.runtime?.canBeSpawnedCli)
+        return profile?.max_concurrent_tasks ?? 1;
+    return Infinity;
+}
+/** JSON-safe rendering of a concurrency limit: `Infinity` → `null` (= unlimited). */
+export function serializeConcurrencyLimit(limit) {
+    return Number.isFinite(limit) ? limit : null;
+}
+/**
+ * pln#520 step 3 — resolve the model for a dispatch, decoupled from agent
+ * identity. Chain (highest priority first): explicit override (e.g.
+ * `dispatch --model`) → lane model → identity model → profile default.
+ * Returns `undefined` when nothing in the chain specifies one (the agent's
+ * template default applies).
+ */
+export function resolveModel(name, opts = {}) {
+    const profile = getCapabilityProfile(name);
+    return opts.override ?? opts.lane ?? opts.identity ?? profile?.default_model;
+}
 /**
  * Escape a string for safe use as a double-quoted shell argument.
  * Escapes characters that have special meaning inside double-quotes
@@ -490,6 +551,12 @@ export function buildInvokeCommand(name, prompt, options = {}) {
     const rawTokens = parseTemplateString(templateStr);
     if (rawTokens.length === 0)
         return undefined;
+    // pln#520 step 3: inject the resolved model right after the binary so model
+    // choice is decoupled from agent identity. Only when the profile declares a
+    // `model_flag` and the template doesn't already pin a model (don't double it).
+    if (options.model && profile.model_flag && !rawTokens.includes(profile.model_flag)) {
+        rawTokens.splice(1, 0, profile.model_flag, options.model);
+    }
     const executable = rawTokens[0];
     const interpolatedTokens = rawTokens.slice(1).map((tok) => tok === '{prompt}' ? embeddedPrompt : tok);
     // ── 5. Build the args array ───────────────────────────────────────────────

package/dist/core/agentrun-reconciler.js CHANGED Viewed

@@ -38,6 +38,7 @@ import { loadClaim } from './claims.js';
 import { loadAssignment } from './assignments.js';
 import { createRuntimeEvent } from './events.js';
 import { nowISO } from './ids.js';
+import { readHeartbeat, readLogTail, signalExists } from './runtime-signals.js';
 // ── Constants ──────────────────────────────────────────────────────────────
 /**
  * Minimum age before a run is eligible for reconciliation. Below this, the
@@ -52,6 +53,11 @@ export const DEFAULT_HEALTH_CHECK_GRACE_MS = 60_000;
 export const DEFAULT_STALE_AFTER_MS = 30 * 60_000;
 export const DEFAULT_DEAD_PID_READ_SWEEP_AGE_MS = 5 * 60_000;
 export const DEFAULT_DEAD_PID_READ_SWEEP_LIMIT = 50;
+/**
+ * pln#520 step 1 — a heartbeat older than this (with no completion signal) means
+ * the worker reached its loop then went silent: `stalled`. Default 10 min.
+ */
+export const DEFAULT_HEARTBEAT_STALE_MS = 10 * 60_000;
 const TERMINAL_STATUSES = new Set([
     'completed', 'failed', 'cancelled', 'timed_out', 'interrupted',
 ]);
@@ -152,15 +158,51 @@ export function collectEvidence(run, cwd, options) {
     }
     catch { /* defensive */ }
     const process_alive = isProcessAlive(run.pid);
-    return { age_ms, has_post_start_commit, claim_released, assignment_completed, process_alive };
+    // pln#520 step 1 — sentinel evidence. Signals live under the project
+    // coordination dir (the dispatcher's ackRoot), which is `cwd` for the
+    // reconciler. Keyed by assignment_id.
+    const signalRoot = cwd ?? process.cwd();
+    let completed_signal = false;
+    let failed_signal = false;
+    let heartbeat_exists = false;
+    let heartbeat_age_ms;
+    try {
+        completed_signal = signalExists(signalRoot, run.assignment_id, 'completed');
+        failed_signal = signalExists(signalRoot, run.assignment_id, 'failed');
+        const hb = readHeartbeat(signalRoot, run.assignment_id);
+        heartbeat_exists = hb.exists;
+        if (hb.exists && hb.mtimeMs !== undefined)
+            heartbeat_age_ms = now - hb.mtimeMs;
+    }
+    catch { /* defensive */ }
+    return {
+        age_ms, has_post_start_commit, claim_released, assignment_completed, process_alive,
+        completed_signal, failed_signal, heartbeat_exists, heartbeat_age_ms,
+    };
 }
 function anyCompletionEvidence(evidence) {
-    return evidence.has_post_start_commit
+    return evidence.completed_signal
+        || evidence.has_post_start_commit
         || evidence.claim_released
         || evidence.assignment_completed;
 }
+/**
+ * pln#520 step 1 — a short tail of the captured stderr (or stdout) for
+ * failed_silent / stalled diagnostics, so the verdict carries the worker's
+ * last words instead of just a status code.
+ */
+function logTailSuffix(run, cwd) {
+    const root = cwd ?? process.cwd();
+    const tail = (readLogTail(root, run.assignment_id, 'stderr', 500).trim()
+        || readLogTail(root, run.assignment_id, 'stdout', 500).trim());
+    if (!tail)
+        return '';
+    return ` | log tail: ${tail.replace(/\s+/g, ' ').slice(0, 300)}`;
+}
 function describeEvidence(evidence) {
     const reasons = [];
+    if (evidence.completed_signal)
+        reasons.push('wrapper wrote completed sentinel');
     if (evidence.has_post_start_commit)
         reasons.push('post-start commit on worktree branch');
     if (evidence.claim_released)
@@ -231,6 +273,7 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
         const evidence = {
             age_ms: 0, has_post_start_commit: false, claim_released: false,
             assignment_completed: false, process_alive: undefined,
+            completed_signal: false, failed_signal: false, heartbeat_exists: false,
         };
         return {
             run_id: runId, action: 'no_op', reason: 'run not found', evidence,
@@ -280,18 +323,12 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
             };
         }
     }
-    // Failure inference: stale + dead process + no evidence.
-    if (evidence.age_ms >= stale && evidence.process_alive === false) {
+    // pln#520 step 1 — sentinel-based failure (fast + trustworthy, pid-independent).
+    const heartbeatStale = options.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
+    const failHere = (reason) => {
         try {
-            transitionAgentRun(runId, 'failed', {
-                actor,
-                status_reason: 'silent_termination_no_evidence',
-            }, cwd);
-            return {
-                run_id: runId, action: 'inferred_failed',
-                reason: 'silent_termination_no_evidence',
-                evidence, previous_status, current_status: 'failed',
-            };
+            transitionAgentRun(runId, 'failed', { actor, status_reason: reason }, cwd);
+            return { run_id: runId, action: 'inferred_failed', reason, evidence, previous_status, current_status: 'failed' };
         }
         catch (err) {
             return {
@@ -300,6 +337,26 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
                 evidence, previous_status, current_status: run.status,
             };
         }
+    };
+    // `failed` sentinel — the wrapper saw a non-zero agent exit.
+    if (evidence.failed_signal) {
+        return failHere(`failed_silent: wrapper reported non-zero exit${logTailSuffix(run, cwd)}`);
+    }
+    // Heartbeat present but stale → reached the loop then went silent.
+    if (evidence.heartbeat_exists && evidence.heartbeat_age_ms !== undefined && evidence.heartbeat_age_ms >= heartbeatStale) {
+        return failHere(`stalled: heartbeat last seen ${Math.round(evidence.heartbeat_age_ms / 1000)}s ago${logTailSuffix(run, cwd)}`);
+    }
+    // Fresh heartbeat → alive; trust it over the untrustworthy wrapper pid.
+    if (evidence.heartbeat_exists) {
+        return {
+            run_id: runId, action: 'no_op',
+            reason: `heartbeat fresh (${Math.round((evidence.heartbeat_age_ms ?? 0) / 1000)}s) — worker alive, pid untrusted`,
+            evidence, previous_status, current_status: run.status,
+        };
+    }
+    // Failure inference: stale + dead process + no evidence.
+    if (evidence.age_ms >= stale && evidence.process_alive === false) {
+        return failHere('silent_termination_no_evidence');
     }
     // Health-check window: past grace, not yet stale, no evidence either way.
     // Emit a non-mutating event so callers see the uncertainty without
@@ -339,6 +396,7 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
         const evidence = {
             age_ms: 0, has_post_start_commit: false, claim_released: false,
             assignment_completed: false, process_alive: undefined,
+            completed_signal: false, failed_signal: false, heartbeat_exists: false,
         };
         return {
             run_id: runId, action: 'no_op', reason: 'run not found', evidence,
@@ -352,19 +410,25 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
             evidence, previous_status: run.status, current_status: run.status,
         };
     }
-    if (evidence.process_alive !== false) {
-        return {
-            run_id: run.id, action: 'no_op',
-            reason: evidence.process_alive === true ? 'process alive' : 'pid liveness unknown',
-            evidence, previous_status: run.status, current_status: run.status,
-        };
-    }
-    // pid reads dead — but the tracked pid is NOT trustworthy (see doc above),
-    // so a bare dead pid NEVER cancels. Evidence of real work wins; otherwise
-    // surface the uncertainty non-destructively and leave the run `running` for
-    // reconcileAgentRun's stale-threshold path to fail it only after a fair,
-    // evidence-based delay.
     const actor = options.actor ?? 'reconciler';
+    const stale = options.staleAfterMs ?? DEFAULT_STALE_AFTER_MS;
+    const heartbeatStale = options.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
+    const failRun = (reason) => {
+        try {
+            transitionAgentRun(run.id, 'failed', { actor, status_reason: reason }, cwd);
+            return { run_id: run.id, action: 'inferred_failed', reason, evidence, previous_status: run.status, current_status: 'failed' };
+        }
+        catch (err) {
+            return {
+                run_id: run.id, action: 'no_op',
+                reason: `failure transition rejected: ${err instanceof Error ? err.message : String(err)}`,
+                evidence, previous_status: run.status, current_status: run.status,
+            };
+        }
+    };
+    // ── pln#520 step 1: SENTINELS are authoritative, independent of the
+    // untrustworthy wrapper pid. Check them first. ──────────────────────────
+    // 1. Completion evidence (mechanical `completed` sentinel or work evidence).
     if (anyCompletionEvidence(evidence)) {
         try {
             transitionAgentRun(run.id, 'completed', {
@@ -385,33 +449,43 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
             };
         }
     }
-    // Stale + provably dead + still no evidence -> genuine silent failure. This
-    // MUST converge HERE: the canonical read path (entity-operations.ts) and the
-    // MCP pre-read sweep route `running` runs through this function, never
-    // through reconcileAgentRun, so deferring would leave a crashed run `running`
-    // forever (trp#292). The 30-min stale window — vs the immediate cancel before
-    // pln#520 — gives a worker behind an untrusted pid ample time to leave
-    // evidence first. Reported as `failed` (it died), not `cancelled`.
-    const stale = options.staleAfterMs ?? DEFAULT_STALE_AFTER_MS;
+    // 2. `failed` sentinel — the wrapper saw a non-zero agent exit. This is the
+    // FAST, TRUSTWORTHY failed_silent detector (vs the pid heuristic that caused
+    // can_f792cacd false negatives). Carries the captured log tail.
+    if (evidence.failed_signal) {
+        return failRun(`failed_silent: wrapper reported non-zero exit${logTailSuffix(run, cwd)}`);
+    }
+    // 3. Heartbeat present but STALE → the worker reached its loop then went
+    // silent (e.g. hung). pid-independent: a hung worker keeps the wrapper alive.
+    if (evidence.heartbeat_exists && evidence.heartbeat_age_ms !== undefined && evidence.heartbeat_age_ms >= heartbeatStale) {
+        return failRun(`stalled: heartbeat last seen ${Math.round(evidence.heartbeat_age_ms / 1000)}s ago${logTailSuffix(run, cwd)}`);
+    }
+    // 4. Fresh heartbeat → the worker is alive and working; trust it OVER the
+    // (untrustworthy) wrapper pid. This is the can_f792cacd fix: never fail a
+    // live, heartbeating worker just because its wrapper pid reads dead.
+    if (evidence.heartbeat_exists) {
+        return {
+            run_id: run.id, action: 'no_op',
+            reason: `heartbeat fresh (${Math.round((evidence.heartbeat_age_ms ?? 0) / 1000)}s) — worker alive, pid untrusted`,
+            evidence, previous_status: run.status, current_status: run.status,
+        };
+    }
+    // ── No sentinel, no heartbeat: fall back to the pid-conservative path. The
+    // wrapper writes completed/failed on any normal exit, so reaching here means
+    // the worker has not exited and never heartbeat. Do NOT fast-fail on a dead
+    // pid (it's the wrapper's, not the worker's). ──────────────────────────────
+    if (evidence.process_alive !== false) {
+        return {
+            run_id: run.id, action: 'no_op',
+            reason: evidence.process_alive === true ? 'process alive' : 'pid liveness unknown',
+            evidence, previous_status: run.status, current_status: run.status,
+        };
+    }
+    // pid dead + no sentinel + no heartbeat: only converge after the long stale
+    // window (trp#292 — must converge HERE since the read path never routes
+    // through reconcileAgentRun), giving an untrusted-pid worker ample time.
     if (evidence.age_ms >= stale) {
-        try {
-            transitionAgentRun(run.id, 'failed', {
-                actor,
-                status_reason: 'silent_termination_no_evidence',
-            }, cwd);
-            return {
-                run_id: run.id, action: 'inferred_failed',
-                reason: 'silent_termination_no_evidence',
-                evidence, previous_status: run.status, current_status: 'failed',
-            };
-        }
-        catch (err) {
-            return {
-                run_id: run.id, action: 'no_op',
-                reason: `failure transition rejected: ${err instanceof Error ? err.message : String(err)}`,
-                evidence, previous_status: run.status, current_status: run.status,
-            };
-        }
+        return failRun('silent_termination_no_evidence');
     }
     emitUnverifiedEvent(run, evidence, actor, cwd);
     return {
@@ -457,7 +531,7 @@ export function reconcileAllOpenRuns(cwd, filter = {}, options = {}) {
             catch {
                 results.push({
                     run_id: run.id, action: 'no_op', reason: 'reconcile threw — skipped',
-                    evidence: { age_ms: 0, has_post_start_commit: false, claim_released: false, assignment_completed: false, process_alive: undefined },
+                    evidence: { age_ms: 0, has_post_start_commit: false, claim_released: false, assignment_completed: false, process_alive: undefined, completed_signal: false, failed_signal: false, heartbeat_exists: false },
                     previous_status: run.status, current_status: run.status,
                 });
             }

package/dist/core/coordination.js CHANGED Viewed

@@ -11,7 +11,7 @@ import { inferProjectFromTarget, loadInstructions, resolveInstructions } from '.
 import { buildReputationSummary, findAgentReputationSummary } from './reputation.js';
 import { listRuntimeNotes } from './runtime.js';
 import { loadState, persistState } from './state.js';
-import { getCapabilityProfile } from './agent-capability.js';
+import { resolveConcurrencyLimit, serializeConcurrencyLimit } from './agent-capability.js';
 import { loadAllSessions } from './identity.js';
 import { countActionable } from './messaging.js';
 import { listCandidates } from './candidates.js';
@@ -176,8 +176,7 @@ function buildOtherAgentsSummary(claims, notes, currentAgent, cwd) {
     for (const identity of listAgentIdentities(cwd)) {
         if (identity.agent_name === currentAgent)
             continue;
-        const profile = getCapabilityProfile(identity.agent_name);
-        const maxTasks = profile?.max_concurrent_tasks ?? 1;
+        const limit = serializeConcurrencyLimit(resolveConcurrencyLimit(identity.agent_name));
         agentMap.set(identity.agent_name, {
             name: identity.agent_name,
             trust_level: identity.trust_level ?? 'contributor',
@@ -185,23 +184,25 @@ function buildOtherAgentsSummary(claims, notes, currentAgent, cwd) {
             scopes: [],
             has_open_session: false,
             instance_count: sessionCounts.get(identity.agent_name) ?? 0,
-            max_tasks: maxTasks,
-            slots_remaining: maxTasks, // will be reduced when claims are counted
+            max_tasks: limit,
+            slots_remaining: limit, // will be reduced when claims are counted (null stays unlimited)
         });
     }
     // Enrich with active claims
     for (const claim of claims) {
         if (claim.agent === currentAgent)
             continue;
-        const profile = getCapabilityProfile(claim.agent);
-        const maxTasks = profile?.max_concurrent_tasks ?? 1;
+        const limit = serializeConcurrencyLimit(resolveConcurrencyLimit(claim.agent));
         const existing = agentMap.get(claim.agent) ?? {
             name: claim.agent, trust_level: 'contributor', claim_count: 0, scopes: [],
             has_open_session: false, instance_count: sessionCounts.get(claim.agent) ?? 0,
-            max_tasks: maxTasks, slots_remaining: maxTasks,
+            max_tasks: limit, slots_remaining: limit,
         };
         existing.claim_count++;
-        existing.slots_remaining = Math.max(0, existing.max_tasks - existing.claim_count);
+        // null max_tasks = unlimited → slots stay unlimited.
+        existing.slots_remaining = existing.max_tasks === null
+            ? null
+            : Math.max(0, existing.max_tasks - existing.claim_count);
         existing.scopes.push(claim.scope);
         if (!existing.last_active || claim.created_at > existing.last_active) {
             existing.last_active = claim.created_at;