brainclaw 1.7.1 → 1.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -0
- package/dist/brainclaw-vscode.vsix +0 -0
- package/dist/cli.js +12 -2
- package/dist/commands/dispatch.js +2 -0
- package/dist/commands/doctor.js +17 -0
- package/dist/commands/mcp.js +31 -7
- package/dist/core/agent-capability.js +67 -0
- package/dist/core/agentrun-reconciler.js +126 -52
- package/dist/core/coordination.js +10 -9
- package/dist/core/dispatcher.js +99 -29
- package/dist/core/entity-operations.js +54 -1
- package/dist/core/execution-adapters.js +32 -51
- package/dist/core/execution.js +14 -8
- package/dist/core/instruction-templates.js +4 -3
- package/dist/core/runtime-signals.js +102 -0
- package/dist/core/spawn-check.js +125 -0
- package/dist/facts.js +3 -3
- package/dist/facts.json +2 -2
- package/docs/cli.md +8 -4
- package/docs/integrations/mcp.md +48 -15
- package/docs/mcp-schema-changelog.md +16 -5
- package/docs/playbooks/team/index.md +7 -5
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -345,6 +345,14 @@ npm run test:coverage # with coverage report
|
|
|
345
345
|
|
|
346
346
|
For older releases (v0.x and the early v1.0 launch series), `git log` on `master` is the source of truth — every release commit follows the `chore(release): bump version to <semver>` convention, and the matching feature/fix commits reference their plan id (e.g. `feat(mcp): self-heal ... (pln#478)`).
|
|
347
347
|
|
|
348
|
+
### v1.7.2
|
|
349
|
+
|
|
350
|
+
- **Sequence MCP tools are agent-first by default** — sequence creation,
|
|
351
|
+
listing, update, and deletion tools are now in the default MCP catalog, with
|
|
352
|
+
explicit lane item schemas (`planId`, optional `stepId`, `rank`,
|
|
353
|
+
dependencies, lane metadata) and matching canonical CRUD validation for
|
|
354
|
+
`entity="sequence"`.
|
|
355
|
+
|
|
348
356
|
### v1.7.1
|
|
349
357
|
|
|
350
358
|
- **MCP project context isolation fix** — `bclaw_switch` now keeps MCP switches
|
|
Binary file
|
package/dist/cli.js
CHANGED
|
@@ -30,7 +30,7 @@ import { runInstruction } from './commands/instruction.js';
|
|
|
30
30
|
import { runListAgents } from './commands/list-agents.js';
|
|
31
31
|
import { runSurfaceTaskResource } from './commands/surface-task-resource.js';
|
|
32
32
|
import { runListInstructions } from './commands/list-instructions.js';
|
|
33
|
-
import { runDoctor } from './commands/doctor.js';
|
|
33
|
+
import { runDoctor, runDoctorSpawnCheck } from './commands/doctor.js';
|
|
34
34
|
import { runRepair } from './commands/repair.js';
|
|
35
35
|
import { runStale } from './commands/stale.js';
|
|
36
36
|
import { runRebuild } from './commands/rebuild.js';
|
|
@@ -681,7 +681,13 @@ program
|
|
|
681
681
|
.option('--repair', 'Rebuild dist/ when the MCP runtime is missing or stale')
|
|
682
682
|
.option('--after-migration', 'Run the v1.0 post-migration health check only (exits non-zero on any failure)')
|
|
683
683
|
.option('--dispatch', 'Run dispatch-health diagnostic only: reconcile open agent_runs and report stuck/unverified/silent failures (pln#496 step stp_8c072d75)')
|
|
684
|
-
.
|
|
684
|
+
.option('--spawn-check', 'Real spawn round-trip per installed agent before dispatch (pln#520 step 2): validates delivery + handshake on this host, exits non-zero on any installed-agent failure')
|
|
685
|
+
.option('--spawn-check-timeout <ms>', 'Per-agent timeout for --spawn-check (default 15000)', parseInt)
|
|
686
|
+
.action(async (options) => {
|
|
687
|
+
if (options.spawnCheck) {
|
|
688
|
+
await runDoctorSpawnCheck({ cwd: options.cwd, json: options.json, timeoutMs: options.spawnCheckTimeout });
|
|
689
|
+
return;
|
|
690
|
+
}
|
|
685
691
|
runDoctor({ ...options, afterMigration: options.afterMigration, dispatch: options.dispatch });
|
|
686
692
|
});
|
|
687
693
|
// --- repair (Phase 4 Sprint 2 Lane C / pln#397) ---
|
|
@@ -1441,6 +1447,8 @@ dispatchCmd
|
|
|
1441
1447
|
.option('--agents <names>', 'Comma-separated list of agents to dispatch to')
|
|
1442
1448
|
.option('--lanes <names>', 'Comma-separated list of lanes to dispatch')
|
|
1443
1449
|
.option('--max <n>', 'Maximum assignments', parseInt)
|
|
1450
|
+
.option('--max-concurrency <n>', 'Opt-in cap on concurrent instances per host-binary (default: unlimited)', parseInt)
|
|
1451
|
+
.option('--model <name>', 'Model to run, decoupled from agent identity (e.g. --model sonnet)')
|
|
1444
1452
|
.option('--dry', 'Preview assignments without sending messages')
|
|
1445
1453
|
.option('--spawn', 'Autonomously launch CLI agents with invoke templates')
|
|
1446
1454
|
.option('--agent <name>', 'Dispatcher agent name')
|
|
@@ -1450,6 +1458,8 @@ dispatchCmd
|
|
|
1450
1458
|
agents: options.agents,
|
|
1451
1459
|
lanes: options.lanes,
|
|
1452
1460
|
max: options.max,
|
|
1461
|
+
maxConcurrency: options.maxConcurrency,
|
|
1462
|
+
model: options.model,
|
|
1453
1463
|
dry: options.dry,
|
|
1454
1464
|
spawn: options.spawn,
|
|
1455
1465
|
agent: options.agent,
|
|
@@ -87,6 +87,8 @@ export async function runDispatch(options) {
|
|
|
87
87
|
dryRun: options.dry,
|
|
88
88
|
dispatcherAgent,
|
|
89
89
|
autoExecute: options.spawn,
|
|
90
|
+
maxConcurrency: options.maxConcurrency,
|
|
91
|
+
model: options.model,
|
|
90
92
|
}, effectiveCwd);
|
|
91
93
|
if (!result) {
|
|
92
94
|
console.log('No active sequence found.');
|
package/dist/commands/doctor.js
CHANGED
|
@@ -3,6 +3,7 @@ import fs from 'node:fs';
|
|
|
3
3
|
import path from 'node:path';
|
|
4
4
|
import * as childProcess from 'node:child_process';
|
|
5
5
|
import { reconcileAllOpenRuns } from '../core/agentrun-reconciler.js';
|
|
6
|
+
import { runSpawnCheck, renderSpawnCheckReport } from '../core/spawn-check.js';
|
|
6
7
|
import { loadAgentRun } from '../core/agentruns.js';
|
|
7
8
|
import { listAgentIdentities, resolveCurrentAgentIdentity } from '../core/agent-registry.js';
|
|
8
9
|
import { listCapabilities as listRegistryCapabilities, listTools as listRegistryTools } from '../core/registries.js';
|
|
@@ -565,6 +566,22 @@ function renderDispatchHealthHumanReport(report) {
|
|
|
565
566
|
}
|
|
566
567
|
return lines.join('\n');
|
|
567
568
|
}
|
|
569
|
+
/**
 * Entry point for `brainclaw doctor --spawn-check` (pln#520 step 2).
 *
 * Awaits `runSpawnCheck(options)` and prints the report it returns:
 * pretty-printed JSON when `options.json` is truthy, otherwise the text
 * produced by `renderSpawnCheckReport`. When the report's `exit_code` is
 * anything other than 0 the process is terminated with that code, which is
 * what lets the command gate CI or a pre-dispatch pre-flight.
 *
 * @param {object} [options] Forwarded unchanged to `runSpawnCheck`; only
 *   `json` is read directly here.
 * @returns {Promise<void>}
 */
export async function runDoctorSpawnCheck(options = {}) {
    const report = await runSpawnCheck(options);
    const rendered = options.json
        ? JSON.stringify(report, null, 2)
        : renderSpawnCheckReport(report);
    console.log(rendered);
    const { exit_code: exitCode } = report;
    if (exitCode === 0) {
        return;
    }
    process.exit(exitCode);
}
|
|
568
585
|
export function runDoctor(options = {}) {
|
|
569
586
|
if (options.dispatch) {
|
|
570
587
|
const report = runDispatchHealthCheck(options);
|
package/dist/commands/mcp.js
CHANGED
|
@@ -57,6 +57,30 @@ export const SCHEMA_VERSION = '1.0.0';
|
|
|
57
57
|
export const MCP_PROTOCOL_VERSIONS = ['2025-11-25', '2024-11-05'];
|
|
58
58
|
export const MCP_SERVER_NOT_INITIALIZED = -32002;
|
|
59
59
|
const MCP_RUNTIME_REPAIR_COMMAND = 'brainclaw doctor --repair';
|
|
60
|
+
/**
 * JSON Schema for a single sequence lane item, shared by the `items` arrays
 * of `bclaw_create_sequence` and `bclaw_update_sequence`.
 *
 * `planId` and `rank` are required; unknown keys are rejected
 * (`additionalProperties: false`).
 */
const SEQUENCE_ITEM_INPUT_SCHEMA = {
    type: 'object',
    description: 'Sequence lane item. planId is required; stepId optionally narrows dispatch/readiness to a specific plan step.',
    properties: {
        planId: { type: 'string', minLength: 1, description: 'Plan item ID referenced by this sequence item.' },
        stepId: { type: 'string', minLength: 1, description: 'Optional plan step ID inside planId for step-level dispatch/readiness.' },
        // `integer`, not `number`: the description promises a positive integer,
        // and `number` would let fractional ranks such as 1.5 through the schema.
        rank: { type: 'integer', minimum: 1, description: 'Positive integer ordering key. Ranks must be unique within a sequence.' },
        hard_after: {
            type: 'array',
            items: { type: 'string' },
            description: 'Sequence item planId values that must complete before this item becomes ready.',
        },
        soft_after: {
            type: 'array',
            items: { type: 'string' },
            description: 'Advisory predecessor planId values; they inform ordering but do not block readiness.',
        },
        lane: { type: 'string', description: 'Optional lane label used for parallel dispatch grouping and filtering.' },
        scope_hint: { type: 'string', description: 'Optional file/path scope hint for claim and brief generation.' },
        rationale: { type: 'string', description: 'Optional explanation for this item or dependency placement.' },
    },
    required: ['planId', 'rank'],
    additionalProperties: false,
};
|
|
60
84
|
const { $defs: loopPhaseDefs, ...loopPhaseItemSchema } = generatedSchemas.LoopPhase;
|
|
61
85
|
const loopSlotInputItemSchema = generatedSchemas.LoopSlotInput;
|
|
62
86
|
export const MCP_READ_TOOLS = [
|
|
@@ -160,7 +184,7 @@ export const MCP_READ_TOOLS = [
|
|
|
160
184
|
{
|
|
161
185
|
name: 'bclaw_list_sequences',
|
|
162
186
|
description: 'List coordination sequences with optional filters on status and id.',
|
|
163
|
-
annotations: { tier: '
|
|
187
|
+
annotations: { tier: 'standard', category: 'coordination', headlessApproval: 'auto' },
|
|
164
188
|
inputSchema: {
|
|
165
189
|
type: 'object',
|
|
166
190
|
properties: {
|
|
@@ -624,7 +648,7 @@ const MCP_WRITE_TOOLS = [
|
|
|
624
648
|
{
|
|
625
649
|
name: 'bclaw_create_sequence',
|
|
626
650
|
description: 'Create a coordination sequence shared by agents.',
|
|
627
|
-
annotations: { tier: '
|
|
651
|
+
annotations: { tier: 'standard', category: 'coordination', headlessApproval: 'prompt' },
|
|
628
652
|
inputSchema: {
|
|
629
653
|
type: 'object',
|
|
630
654
|
properties: {
|
|
@@ -632,7 +656,7 @@ const MCP_WRITE_TOOLS = [
|
|
|
632
656
|
description: { type: 'string', description: 'Optional sequence description.' },
|
|
633
657
|
status: { type: 'string', description: 'Status: draft, active, archived.' },
|
|
634
658
|
owner: { type: 'string', description: 'Optional sequence owner.' },
|
|
635
|
-
items: { type: 'array', description: 'Sequence items in rank order.', items:
|
|
659
|
+
items: { type: 'array', description: 'Sequence items in rank order.', items: SEQUENCE_ITEM_INPUT_SCHEMA },
|
|
636
660
|
tags: { type: 'array', items: { type: 'string' }, description: 'Optional tags.' },
|
|
637
661
|
agent: { type: 'string', description: 'Agent name.' },
|
|
638
662
|
agentId: { type: 'string', description: 'Registered agent id.' },
|
|
@@ -643,7 +667,7 @@ const MCP_WRITE_TOOLS = [
|
|
|
643
667
|
{
|
|
644
668
|
name: 'bclaw_update_sequence',
|
|
645
669
|
description: 'Update a coordination sequence status, metadata, or items.',
|
|
646
|
-
annotations: { tier: '
|
|
670
|
+
annotations: { tier: 'standard', category: 'coordination', headlessApproval: 'prompt' },
|
|
647
671
|
inputSchema: {
|
|
648
672
|
type: 'object',
|
|
649
673
|
properties: {
|
|
@@ -652,7 +676,7 @@ const MCP_WRITE_TOOLS = [
|
|
|
652
676
|
description: { type: 'string', description: 'Optional new description.' },
|
|
653
677
|
status: { type: 'string', description: 'Status: draft, active, archived.' },
|
|
654
678
|
owner: { type: 'string', description: 'Optional sequence owner.' },
|
|
655
|
-
items: { type: 'array', description: 'Optional replacement items array.', items:
|
|
679
|
+
items: { type: 'array', description: 'Optional replacement items array.', items: SEQUENCE_ITEM_INPUT_SCHEMA },
|
|
656
680
|
tags: { type: 'array', items: { type: 'string' }, description: 'Optional replacement tags.' },
|
|
657
681
|
agent: { type: 'string', description: 'Agent name.' },
|
|
658
682
|
agentId: { type: 'string', description: 'Registered agent id.' },
|
|
@@ -754,7 +778,7 @@ const MCP_WRITE_TOOLS = [
|
|
|
754
778
|
{
|
|
755
779
|
name: 'bclaw_delete_sequence',
|
|
756
780
|
description: 'Delete a sequence by ID. Requires trusted or curator trust level.',
|
|
757
|
-
annotations: { tier: '
|
|
781
|
+
annotations: { tier: 'standard', category: 'coordination', headlessApproval: 'prompt' },
|
|
758
782
|
inputSchema: {
|
|
759
783
|
type: 'object',
|
|
760
784
|
properties: {
|
|
@@ -1092,7 +1116,7 @@ const MCP_WRITE_TOOLS = [
|
|
|
1092
1116
|
inputSchema: {
|
|
1093
1117
|
type: 'object',
|
|
1094
1118
|
properties: {
|
|
1095
|
-
entity: { type: 'string', description: 'Entity name: plan | decision | constraint | trap | handoff | runtime_note | candidate | claim | action | assignment | agent_run | cross_project_link. Others not yet wired.' },
|
|
1119
|
+
entity: { type: 'string', description: 'Entity name: plan | decision | constraint | trap | handoff | runtime_note | candidate | sequence | claim | action | assignment | agent_run | cross_project_link. Others not yet wired.' },
|
|
1096
1120
|
filter: { type: 'object', description: 'Filter keys: status, tag (single tag), tags (array, any-match), author, plan_id, source, auto_generated, limit, offset, includeLegacy (bool, default false), minAutoReflectConfidence (0-1, default 0.6). entity=agent_run also accepts assignment_id, claim_id, message_id.' },
|
|
1097
1121
|
project: { type: 'string', description: 'Optional: name (or path/basename) of a linked project to query. Defaults to the current project. Only cross_project_links (config.yaml) and workspace store-chain children are accepted — list with `brainclaw link list`.' },
|
|
1098
1122
|
},
|
|
@@ -46,6 +46,10 @@ const PROFILES = {
|
|
|
46
46
|
invoke_binary: 'claude',
|
|
47
47
|
invoke_review_template: 'claude -p --allowedTools "Read,Glob,Grep" {prompt}',
|
|
48
48
|
invoke_consult_template: 'claude -p --allowedTools "Read,Glob,Grep" {prompt}',
|
|
49
|
+
// pln#520 step 3: model is selectable via `--model` — no need for a
|
|
50
|
+
// per-model pseudo-identity. `claude-sonnet` below is now redundant
|
|
51
|
+
// (run `claude-code --model sonnet`) and kept only for back-compat.
|
|
52
|
+
model_flag: '--model',
|
|
49
53
|
},
|
|
50
54
|
'claude-sonnet': {
|
|
51
55
|
name: 'claude-sonnet', category: 'code-agent', workflowModel: 'interactive',
|
|
@@ -323,6 +327,63 @@ export function getCapabilityProfile(name) {
|
|
|
323
327
|
const resolved = resolveAgentAlias(name);
|
|
324
328
|
return _customProfiles.get(resolved) ?? PROFILES[resolved];
|
|
325
329
|
}
|
|
330
|
+
/**
 * pln#520 step 3 — map an agent name to the key concurrency is counted under.
 *
 * A concurrency cap protects the binary on the host (its API quota and its
 * RAM/CPU footprint), not the agent label. The key is therefore the
 * profile's `invoke_binary` when the profile declares one, and the
 * alias-resolved agent name otherwise. Identities that drive the same binary
 * (e.g. `claude-code` and `claude-sonnet`, both `claude`) thus share one
 * pool instead of being counted separately (the can_dc4e4a11 bug).
 *
 * @param {string} name Agent name or alias.
 * @returns {string} The shared resource key.
 */
export function resolveResourceKey(name) {
    const binary = getCapabilityProfile(name)?.invoke_binary;
    if (binary !== undefined && binary !== null) {
        return binary;
    }
    return resolveAgentAlias(name);
}
|
|
346
|
+
/**
 * Resolve the concurrency limit for an agent. `Infinity` = unlimited.
 *
 * Resolution chain (highest priority first), decoupled from agent identity:
 *   1. explicit `opts.override` (e.g. `brainclaw dispatch --max-concurrency N`)
 *   2. host opt-in cap via `BRAINCLAW_MAX_CONCURRENCY` (protect one machine / quota)
 *   3. structural floor — agents whose profile is not CLI-spawnable
 *      (`runtime.canBeSpawnedCli` falsy, or no profile at all) stay capped at
 *      the profile's `max_concurrent_tasks`, defaulting to 1
 *   4. default for CLI-spawnable agents: unlimited (`Infinity`)
 *
 * When a finite cap applies, callers are expected to enforce it per
 * host-binary resource (see `resolveResourceKey`) so every variant of one
 * binary shares the pool.
 *
 * @param {string} name Agent name or alias.
 * @param {{ override?: number | string }} [opts] `override` wins when it
 *   coerces to a number greater than 0; anything else (missing, 0, negative,
 *   non-numeric) is ignored and the chain continues.
 * @returns {number} A positive number, or `Infinity` for unlimited.
 */
export function resolveConcurrencyLimit(name, opts = {}) {
    // Coerce before comparing. A numeric string such as '4' used to pass the
    // `> 0` test through implicit coercion and was then returned as a string,
    // which `Number.isFinite`-based consumers read as "unlimited".
    // `opts?.` also tolerates an explicit `null` options bag.
    const override = Number(opts?.override);
    if (override > 0) {
        return override;
    }
    // Number(undefined) is NaN and Number('') is 0, so an unset or empty
    // variable falls through to the next rule.
    const envCap = Number(process.env.BRAINCLAW_MAX_CONCURRENCY);
    if (Number.isFinite(envCap) && envCap > 0) {
        return envCap;
    }
    const profile = getCapabilityProfile(name);
    if (!profile?.runtime?.canBeSpawnedCli) {
        return profile?.max_concurrent_tasks ?? 1;
    }
    return Infinity;
}
|
|
372
|
+
/**
 * Render a concurrency limit in a JSON-safe form.
 *
 * JSON has no representation for `Infinity`, so any value that is not a
 * finite number is emitted as `null`, which consumers read as "unlimited".
 *
 * @param {number} limit A resolved concurrency limit.
 * @returns {number | null} `limit` itself when finite, otherwise `null`.
 */
export function serializeConcurrencyLimit(limit) {
    if (!Number.isFinite(limit)) {
        return null;
    }
    return limit;
}
|
|
376
|
+
/**
 * pln#520 step 3 — choose the model for a dispatch independently of the
 * agent identity.
 *
 * The first candidate that is neither `null` nor `undefined` wins, in this
 * order: `opts.override` (e.g. `dispatch --model`), `opts.lane`,
 * `opts.identity`, then the profile's `default_model`.
 *
 * @param {string} name Agent name or alias used to look up the profile.
 * @param {{ override?: string, lane?: string, identity?: string }} [opts]
 * @returns {string | undefined} The selected model, or `undefined` when
 *   nothing in the chain specifies one (the agent's template default then
 *   applies).
 */
export function resolveModel(name, opts = {}) {
    // Looked up first, as before, even when an earlier candidate wins.
    const profile = getCapabilityProfile(name);
    const { override } = opts;
    if (override != null) {
        return override;
    }
    const { lane } = opts;
    if (lane != null) {
        return lane;
    }
    const { identity } = opts;
    if (identity != null) {
        return identity;
    }
    return profile?.default_model;
}
|
|
326
387
|
/**
|
|
327
388
|
* Escape a string for safe use as a double-quoted shell argument.
|
|
328
389
|
* Escapes characters that have special meaning inside double-quotes
|
|
@@ -490,6 +551,12 @@ export function buildInvokeCommand(name, prompt, options = {}) {
|
|
|
490
551
|
const rawTokens = parseTemplateString(templateStr);
|
|
491
552
|
if (rawTokens.length === 0)
|
|
492
553
|
return undefined;
|
|
554
|
+
// pln#520 step 3: inject the resolved model right after the binary so model
|
|
555
|
+
// choice is decoupled from agent identity. Only when the profile declares a
|
|
556
|
+
// `model_flag` and the template doesn't already pin a model (don't double it).
|
|
557
|
+
if (options.model && profile.model_flag && !rawTokens.includes(profile.model_flag)) {
|
|
558
|
+
rawTokens.splice(1, 0, profile.model_flag, options.model);
|
|
559
|
+
}
|
|
493
560
|
const executable = rawTokens[0];
|
|
494
561
|
const interpolatedTokens = rawTokens.slice(1).map((tok) => tok === '{prompt}' ? embeddedPrompt : tok);
|
|
495
562
|
// ── 5. Build the args array ───────────────────────────────────────────────
|
|
@@ -38,6 +38,7 @@ import { loadClaim } from './claims.js';
|
|
|
38
38
|
import { loadAssignment } from './assignments.js';
|
|
39
39
|
import { createRuntimeEvent } from './events.js';
|
|
40
40
|
import { nowISO } from './ids.js';
|
|
41
|
+
import { readHeartbeat, readLogTail, signalExists } from './runtime-signals.js';
|
|
41
42
|
// ── Constants ──────────────────────────────────────────────────────────────
|
|
42
43
|
/**
|
|
43
44
|
* Minimum age before a run is eligible for reconciliation. Below this, the
|
|
@@ -52,6 +53,11 @@ export const DEFAULT_HEALTH_CHECK_GRACE_MS = 60_000;
|
|
|
52
53
|
export const DEFAULT_STALE_AFTER_MS = 30 * 60_000;
|
|
53
54
|
export const DEFAULT_DEAD_PID_READ_SWEEP_AGE_MS = 5 * 60_000;
|
|
54
55
|
export const DEFAULT_DEAD_PID_READ_SWEEP_LIMIT = 50;
|
|
56
|
+
/**
|
|
57
|
+
* pln#520 step 1 — a heartbeat older than this (with no completion signal) means
|
|
58
|
+
* the worker reached its loop then went silent: `stalled`. Default 10 min.
|
|
59
|
+
*/
|
|
60
|
+
export const DEFAULT_HEARTBEAT_STALE_MS = 10 * 60_000;
|
|
55
61
|
const TERMINAL_STATUSES = new Set([
|
|
56
62
|
'completed', 'failed', 'cancelled', 'timed_out', 'interrupted',
|
|
57
63
|
]);
|
|
@@ -152,15 +158,51 @@ export function collectEvidence(run, cwd, options) {
|
|
|
152
158
|
}
|
|
153
159
|
catch { /* defensive */ }
|
|
154
160
|
const process_alive = isProcessAlive(run.pid);
|
|
155
|
-
|
|
161
|
+
// pln#520 step 1 — sentinel evidence. Signals live under the project
|
|
162
|
+
// coordination dir (the dispatcher's ackRoot), which is `cwd` for the
|
|
163
|
+
// reconciler. Keyed by assignment_id.
|
|
164
|
+
const signalRoot = cwd ?? process.cwd();
|
|
165
|
+
let completed_signal = false;
|
|
166
|
+
let failed_signal = false;
|
|
167
|
+
let heartbeat_exists = false;
|
|
168
|
+
let heartbeat_age_ms;
|
|
169
|
+
try {
|
|
170
|
+
completed_signal = signalExists(signalRoot, run.assignment_id, 'completed');
|
|
171
|
+
failed_signal = signalExists(signalRoot, run.assignment_id, 'failed');
|
|
172
|
+
const hb = readHeartbeat(signalRoot, run.assignment_id);
|
|
173
|
+
heartbeat_exists = hb.exists;
|
|
174
|
+
if (hb.exists && hb.mtimeMs !== undefined)
|
|
175
|
+
heartbeat_age_ms = now - hb.mtimeMs;
|
|
176
|
+
}
|
|
177
|
+
catch { /* defensive */ }
|
|
178
|
+
return {
|
|
179
|
+
age_ms, has_post_start_commit, claim_released, assignment_completed, process_alive,
|
|
180
|
+
completed_signal, failed_signal, heartbeat_exists, heartbeat_age_ms,
|
|
181
|
+
};
|
|
156
182
|
}
|
|
157
183
|
/**
 * Report whether any completion indicator is set on an evidence record.
 *
 * Indicators are checked in priority order: the wrapper's `completed`
 * sentinel, a post-start commit, a released claim, a completed assignment.
 * The first truthy indicator's value is returned as-is; when none is truthy
 * the value of `assignment_completed` is returned.
 *
 * @param {object} evidence Evidence record as built by `collectEvidence`.
 * @returns {*} A truthy value when completion evidence exists, otherwise a
 *   falsy one.
 */
function anyCompletionEvidence(evidence) {
    const { completed_signal: sentinel } = evidence;
    if (sentinel) {
        return sentinel;
    }
    const { has_post_start_commit: commit } = evidence;
    if (commit) {
        return commit;
    }
    const { claim_released: released } = evidence;
    if (released) {
        return released;
    }
    return evidence.assignment_completed;
}
|
|
189
|
+
/**
 * pln#520 step 1 — build a diagnostic suffix carrying the worker's last
 * captured output, for failed_silent / stalled verdicts.
 *
 * Reads the tail of the stderr log for the run's assignment (requesting
 * 500 from `readLogTail`) and falls back to the stdout log when the trimmed
 * stderr tail is empty. Whitespace runs are collapsed to single spaces and
 * the result is cut to at most 300 characters.
 *
 * @param {object} run Agent run; only `assignment_id` is read.
 * @param {string} [cwd] Root the logs are looked up under; defaults to
 *   `process.cwd()`.
 * @returns {string} ` | log tail: …`, or an empty string when neither log
 *   has content.
 */
function logTailSuffix(run, cwd) {
    const root = cwd ?? process.cwd();
    let tail = readLogTail(root, run.assignment_id, 'stderr', 500).trim();
    if (!tail) {
        tail = readLogTail(root, run.assignment_id, 'stdout', 500).trim();
    }
    if (!tail) {
        return '';
    }
    const flattened = tail.replace(/\s+/g, ' ').slice(0, 300);
    return ` | log tail: ${flattened}`;
}
|
|
162
202
|
function describeEvidence(evidence) {
|
|
163
203
|
const reasons = [];
|
|
204
|
+
if (evidence.completed_signal)
|
|
205
|
+
reasons.push('wrapper wrote completed sentinel');
|
|
164
206
|
if (evidence.has_post_start_commit)
|
|
165
207
|
reasons.push('post-start commit on worktree branch');
|
|
166
208
|
if (evidence.claim_released)
|
|
@@ -231,6 +273,7 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
|
|
|
231
273
|
const evidence = {
|
|
232
274
|
age_ms: 0, has_post_start_commit: false, claim_released: false,
|
|
233
275
|
assignment_completed: false, process_alive: undefined,
|
|
276
|
+
completed_signal: false, failed_signal: false, heartbeat_exists: false,
|
|
234
277
|
};
|
|
235
278
|
return {
|
|
236
279
|
run_id: runId, action: 'no_op', reason: 'run not found', evidence,
|
|
@@ -280,18 +323,12 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
|
|
|
280
323
|
};
|
|
281
324
|
}
|
|
282
325
|
}
|
|
283
|
-
//
|
|
284
|
-
|
|
326
|
+
// pln#520 step 1 — sentinel-based failure (fast + trustworthy, pid-independent).
|
|
327
|
+
const heartbeatStale = options.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
|
|
328
|
+
const failHere = (reason) => {
|
|
285
329
|
try {
|
|
286
|
-
transitionAgentRun(runId, 'failed', {
|
|
287
|
-
|
|
288
|
-
status_reason: 'silent_termination_no_evidence',
|
|
289
|
-
}, cwd);
|
|
290
|
-
return {
|
|
291
|
-
run_id: runId, action: 'inferred_failed',
|
|
292
|
-
reason: 'silent_termination_no_evidence',
|
|
293
|
-
evidence, previous_status, current_status: 'failed',
|
|
294
|
-
};
|
|
330
|
+
transitionAgentRun(runId, 'failed', { actor, status_reason: reason }, cwd);
|
|
331
|
+
return { run_id: runId, action: 'inferred_failed', reason, evidence, previous_status, current_status: 'failed' };
|
|
295
332
|
}
|
|
296
333
|
catch (err) {
|
|
297
334
|
return {
|
|
@@ -300,6 +337,26 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
|
|
|
300
337
|
evidence, previous_status, current_status: run.status,
|
|
301
338
|
};
|
|
302
339
|
}
|
|
340
|
+
};
|
|
341
|
+
// `failed` sentinel — the wrapper saw a non-zero agent exit.
|
|
342
|
+
if (evidence.failed_signal) {
|
|
343
|
+
return failHere(`failed_silent: wrapper reported non-zero exit${logTailSuffix(run, cwd)}`);
|
|
344
|
+
}
|
|
345
|
+
// Heartbeat present but stale → reached the loop then went silent.
|
|
346
|
+
if (evidence.heartbeat_exists && evidence.heartbeat_age_ms !== undefined && evidence.heartbeat_age_ms >= heartbeatStale) {
|
|
347
|
+
return failHere(`stalled: heartbeat last seen ${Math.round(evidence.heartbeat_age_ms / 1000)}s ago${logTailSuffix(run, cwd)}`);
|
|
348
|
+
}
|
|
349
|
+
// Fresh heartbeat → alive; trust it over the untrustworthy wrapper pid.
|
|
350
|
+
if (evidence.heartbeat_exists) {
|
|
351
|
+
return {
|
|
352
|
+
run_id: runId, action: 'no_op',
|
|
353
|
+
reason: `heartbeat fresh (${Math.round((evidence.heartbeat_age_ms ?? 0) / 1000)}s) — worker alive, pid untrusted`,
|
|
354
|
+
evidence, previous_status, current_status: run.status,
|
|
355
|
+
};
|
|
356
|
+
}
|
|
357
|
+
// Failure inference: stale + dead process + no evidence.
|
|
358
|
+
if (evidence.age_ms >= stale && evidence.process_alive === false) {
|
|
359
|
+
return failHere('silent_termination_no_evidence');
|
|
303
360
|
}
|
|
304
361
|
// Health-check window: past grace, not yet stale, no evidence either way.
|
|
305
362
|
// Emit a non-mutating event so callers see the uncertainty without
|
|
@@ -339,6 +396,7 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
|
|
|
339
396
|
const evidence = {
|
|
340
397
|
age_ms: 0, has_post_start_commit: false, claim_released: false,
|
|
341
398
|
assignment_completed: false, process_alive: undefined,
|
|
399
|
+
completed_signal: false, failed_signal: false, heartbeat_exists: false,
|
|
342
400
|
};
|
|
343
401
|
return {
|
|
344
402
|
run_id: runId, action: 'no_op', reason: 'run not found', evidence,
|
|
@@ -352,19 +410,25 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
|
|
|
352
410
|
evidence, previous_status: run.status, current_status: run.status,
|
|
353
411
|
};
|
|
354
412
|
}
|
|
355
|
-
if (evidence.process_alive !== false) {
|
|
356
|
-
return {
|
|
357
|
-
run_id: run.id, action: 'no_op',
|
|
358
|
-
reason: evidence.process_alive === true ? 'process alive' : 'pid liveness unknown',
|
|
359
|
-
evidence, previous_status: run.status, current_status: run.status,
|
|
360
|
-
};
|
|
361
|
-
}
|
|
362
|
-
// pid reads dead — but the tracked pid is NOT trustworthy (see doc above),
|
|
363
|
-
// so a bare dead pid NEVER cancels. Evidence of real work wins; otherwise
|
|
364
|
-
// surface the uncertainty non-destructively and leave the run `running` for
|
|
365
|
-
// reconcileAgentRun's stale-threshold path to fail it only after a fair,
|
|
366
|
-
// evidence-based delay.
|
|
367
413
|
const actor = options.actor ?? 'reconciler';
|
|
414
|
+
const stale = options.staleAfterMs ?? DEFAULT_STALE_AFTER_MS;
|
|
415
|
+
const heartbeatStale = options.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
|
|
416
|
+
const failRun = (reason) => {
|
|
417
|
+
try {
|
|
418
|
+
transitionAgentRun(run.id, 'failed', { actor, status_reason: reason }, cwd);
|
|
419
|
+
return { run_id: run.id, action: 'inferred_failed', reason, evidence, previous_status: run.status, current_status: 'failed' };
|
|
420
|
+
}
|
|
421
|
+
catch (err) {
|
|
422
|
+
return {
|
|
423
|
+
run_id: run.id, action: 'no_op',
|
|
424
|
+
reason: `failure transition rejected: ${err instanceof Error ? err.message : String(err)}`,
|
|
425
|
+
evidence, previous_status: run.status, current_status: run.status,
|
|
426
|
+
};
|
|
427
|
+
}
|
|
428
|
+
};
|
|
429
|
+
// ── pln#520 step 1: SENTINELS are authoritative, independent of the
|
|
430
|
+
// untrustworthy wrapper pid. Check them first. ──────────────────────────
|
|
431
|
+
// 1. Completion evidence (mechanical `completed` sentinel or work evidence).
|
|
368
432
|
if (anyCompletionEvidence(evidence)) {
|
|
369
433
|
try {
|
|
370
434
|
transitionAgentRun(run.id, 'completed', {
|
|
@@ -385,33 +449,43 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
|
|
|
385
449
|
};
|
|
386
450
|
}
|
|
387
451
|
}
|
|
388
|
-
//
|
|
389
|
-
//
|
|
390
|
-
//
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
//
|
|
395
|
-
|
|
452
|
+
// 2. `failed` sentinel — the wrapper saw a non-zero agent exit. This is the
|
|
453
|
+
// FAST, TRUSTWORTHY failed_silent detector (vs the pid heuristic that caused
|
|
454
|
+
// can_f792cacd false negatives). Carries the captured log tail.
|
|
455
|
+
if (evidence.failed_signal) {
|
|
456
|
+
return failRun(`failed_silent: wrapper reported non-zero exit${logTailSuffix(run, cwd)}`);
|
|
457
|
+
}
|
|
458
|
+
// 3. Heartbeat present but STALE → the worker reached its loop then went
|
|
459
|
+
// silent (e.g. hung). pid-independent: a hung worker keeps the wrapper alive.
|
|
460
|
+
if (evidence.heartbeat_exists && evidence.heartbeat_age_ms !== undefined && evidence.heartbeat_age_ms >= heartbeatStale) {
|
|
461
|
+
return failRun(`stalled: heartbeat last seen ${Math.round(evidence.heartbeat_age_ms / 1000)}s ago${logTailSuffix(run, cwd)}`);
|
|
462
|
+
}
|
|
463
|
+
// 4. Fresh heartbeat → the worker is alive and working; trust it OVER the
|
|
464
|
+
// (untrustworthy) wrapper pid. This is the can_f792cacd fix: never fail a
|
|
465
|
+
// live, heartbeating worker just because its wrapper pid reads dead.
|
|
466
|
+
if (evidence.heartbeat_exists) {
|
|
467
|
+
return {
|
|
468
|
+
run_id: run.id, action: 'no_op',
|
|
469
|
+
reason: `heartbeat fresh (${Math.round((evidence.heartbeat_age_ms ?? 0) / 1000)}s) — worker alive, pid untrusted`,
|
|
470
|
+
evidence, previous_status: run.status, current_status: run.status,
|
|
471
|
+
};
|
|
472
|
+
}
|
|
473
|
+
// ── No sentinel, no heartbeat: fall back to the pid-conservative path. The
|
|
474
|
+
// wrapper writes completed/failed on any normal exit, so reaching here means
|
|
475
|
+
// the worker has not exited and never heartbeat. Do NOT fast-fail on a dead
|
|
476
|
+
// pid (it's the wrapper's, not the worker's). ──────────────────────────────
|
|
477
|
+
if (evidence.process_alive !== false) {
|
|
478
|
+
return {
|
|
479
|
+
run_id: run.id, action: 'no_op',
|
|
480
|
+
reason: evidence.process_alive === true ? 'process alive' : 'pid liveness unknown',
|
|
481
|
+
evidence, previous_status: run.status, current_status: run.status,
|
|
482
|
+
};
|
|
483
|
+
}
|
|
484
|
+
// pid dead + no sentinel + no heartbeat: only converge after the long stale
|
|
485
|
+
// window (trp#292 — must converge HERE since the read path never routes
|
|
486
|
+
// through reconcileAgentRun), giving an untrusted-pid worker ample time.
|
|
396
487
|
if (evidence.age_ms >= stale) {
|
|
397
|
-
|
|
398
|
-
transitionAgentRun(run.id, 'failed', {
|
|
399
|
-
actor,
|
|
400
|
-
status_reason: 'silent_termination_no_evidence',
|
|
401
|
-
}, cwd);
|
|
402
|
-
return {
|
|
403
|
-
run_id: run.id, action: 'inferred_failed',
|
|
404
|
-
reason: 'silent_termination_no_evidence',
|
|
405
|
-
evidence, previous_status: run.status, current_status: 'failed',
|
|
406
|
-
};
|
|
407
|
-
}
|
|
408
|
-
catch (err) {
|
|
409
|
-
return {
|
|
410
|
-
run_id: run.id, action: 'no_op',
|
|
411
|
-
reason: `failure transition rejected: ${err instanceof Error ? err.message : String(err)}`,
|
|
412
|
-
evidence, previous_status: run.status, current_status: run.status,
|
|
413
|
-
};
|
|
414
|
-
}
|
|
488
|
+
return failRun('silent_termination_no_evidence');
|
|
415
489
|
}
|
|
416
490
|
emitUnverifiedEvent(run, evidence, actor, cwd);
|
|
417
491
|
return {
|
|
@@ -457,7 +531,7 @@ export function reconcileAllOpenRuns(cwd, filter = {}, options = {}) {
|
|
|
457
531
|
catch {
|
|
458
532
|
results.push({
|
|
459
533
|
run_id: run.id, action: 'no_op', reason: 'reconcile threw — skipped',
|
|
460
|
-
evidence: { age_ms: 0, has_post_start_commit: false, claim_released: false, assignment_completed: false, process_alive: undefined },
|
|
534
|
+
evidence: { age_ms: 0, has_post_start_commit: false, claim_released: false, assignment_completed: false, process_alive: undefined, completed_signal: false, failed_signal: false, heartbeat_exists: false },
|
|
461
535
|
previous_status: run.status, current_status: run.status,
|
|
462
536
|
});
|
|
463
537
|
}
|
|
@@ -11,7 +11,7 @@ import { inferProjectFromTarget, loadInstructions, resolveInstructions } from '.
|
|
|
11
11
|
import { buildReputationSummary, findAgentReputationSummary } from './reputation.js';
|
|
12
12
|
import { listRuntimeNotes } from './runtime.js';
|
|
13
13
|
import { loadState, persistState } from './state.js';
|
|
14
|
-
import {
|
|
14
|
+
import { resolveConcurrencyLimit, serializeConcurrencyLimit } from './agent-capability.js';
|
|
15
15
|
import { loadAllSessions } from './identity.js';
|
|
16
16
|
import { countActionable } from './messaging.js';
|
|
17
17
|
import { listCandidates } from './candidates.js';
|
|
@@ -176,8 +176,7 @@ function buildOtherAgentsSummary(claims, notes, currentAgent, cwd) {
|
|
|
176
176
|
for (const identity of listAgentIdentities(cwd)) {
|
|
177
177
|
if (identity.agent_name === currentAgent)
|
|
178
178
|
continue;
|
|
179
|
-
const
|
|
180
|
-
const maxTasks = profile?.max_concurrent_tasks ?? 1;
|
|
179
|
+
const limit = serializeConcurrencyLimit(resolveConcurrencyLimit(identity.agent_name));
|
|
181
180
|
agentMap.set(identity.agent_name, {
|
|
182
181
|
name: identity.agent_name,
|
|
183
182
|
trust_level: identity.trust_level ?? 'contributor',
|
|
@@ -185,23 +184,25 @@ function buildOtherAgentsSummary(claims, notes, currentAgent, cwd) {
|
|
|
185
184
|
scopes: [],
|
|
186
185
|
has_open_session: false,
|
|
187
186
|
instance_count: sessionCounts.get(identity.agent_name) ?? 0,
|
|
188
|
-
max_tasks:
|
|
189
|
-
slots_remaining:
|
|
187
|
+
max_tasks: limit,
|
|
188
|
+
slots_remaining: limit, // will be reduced when claims are counted (null stays unlimited)
|
|
190
189
|
});
|
|
191
190
|
}
|
|
192
191
|
// Enrich with active claims
|
|
193
192
|
for (const claim of claims) {
|
|
194
193
|
if (claim.agent === currentAgent)
|
|
195
194
|
continue;
|
|
196
|
-
const
|
|
197
|
-
const maxTasks = profile?.max_concurrent_tasks ?? 1;
|
|
195
|
+
const limit = serializeConcurrencyLimit(resolveConcurrencyLimit(claim.agent));
|
|
198
196
|
const existing = agentMap.get(claim.agent) ?? {
|
|
199
197
|
name: claim.agent, trust_level: 'contributor', claim_count: 0, scopes: [],
|
|
200
198
|
has_open_session: false, instance_count: sessionCounts.get(claim.agent) ?? 0,
|
|
201
|
-
max_tasks:
|
|
199
|
+
max_tasks: limit, slots_remaining: limit,
|
|
202
200
|
};
|
|
203
201
|
existing.claim_count++;
|
|
204
|
-
|
|
202
|
+
// null max_tasks = unlimited → slots stay unlimited.
|
|
203
|
+
existing.slots_remaining = existing.max_tasks === null
|
|
204
|
+
? null
|
|
205
|
+
: Math.max(0, existing.max_tasks - existing.claim_count);
|
|
205
206
|
existing.scopes.push(claim.scope);
|
|
206
207
|
if (!existing.last_active || claim.created_at > existing.last_active) {
|
|
207
208
|
existing.last_active = claim.created_at;
|