brainclaw 1.7.1 → 1.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +116 -94
- package/dist/brainclaw-vscode.vsix +0 -0
- package/dist/cli.js +25 -3
- package/dist/commands/dispatch.js +2 -0
- package/dist/commands/doctor.js +17 -0
- package/dist/commands/harvest.js +124 -1
- package/dist/commands/mcp.js +32 -8
- package/dist/core/agent-capability.js +67 -0
- package/dist/core/agent-inventory.js +54 -7
- package/dist/core/agentrun-reconciler.js +126 -52
- package/dist/core/coordination.js +10 -9
- package/dist/core/dirty-scope.js +11 -5
- package/dist/core/dispatcher.js +109 -29
- package/dist/core/entity-operations.js +54 -1
- package/dist/core/execution-adapters.js +32 -51
- package/dist/core/execution.js +14 -8
- package/dist/core/instruction-templates.js +5 -4
- package/dist/core/runtime-signals.js +102 -0
- package/dist/core/schema.js +18 -0
- package/dist/core/spawn-check.js +125 -0
- package/dist/core/worktree.js +146 -7
- package/dist/facts.js +3 -3
- package/dist/facts.json +2 -2
- package/docs/cli.md +8 -4
- package/docs/integrations/mcp.md +48 -15
- package/docs/mcp-schema-changelog.md +16 -5
- package/docs/playbooks/team/index.md +7 -5
- package/package.json +1 -1
|
@@ -46,6 +46,10 @@ const PROFILES = {
|
|
|
46
46
|
invoke_binary: 'claude',
|
|
47
47
|
invoke_review_template: 'claude -p --allowedTools "Read,Glob,Grep" {prompt}',
|
|
48
48
|
invoke_consult_template: 'claude -p --allowedTools "Read,Glob,Grep" {prompt}',
|
|
49
|
+
// pln#520 step 3: model is selectable via `--model` — no need for a
|
|
50
|
+
// per-model pseudo-identity. `claude-sonnet` below is now redundant
|
|
51
|
+
// (run `claude-code --model sonnet`) and kept only for back-compat.
|
|
52
|
+
model_flag: '--model',
|
|
49
53
|
},
|
|
50
54
|
'claude-sonnet': {
|
|
51
55
|
name: 'claude-sonnet', category: 'code-agent', workflowModel: 'interactive',
|
|
@@ -323,6 +327,63 @@ export function getCapabilityProfile(name) {
|
|
|
323
327
|
const resolved = resolveAgentAlias(name);
|
|
324
328
|
return _customProfiles.get(resolved) ?? PROFILES[resolved];
|
|
325
329
|
}
|
|
330
|
+
/**
 * pln#520 step 3 — map an agent name to the host resource that a concurrency
 * cap protects.
 *
 * Concurrency is a resolvable execution-config value, not a structural
 * constant of agent identity. What a cap guards is the binary on this machine
 * (its API quota, its RAM/CPU footprint), so every identity that drives the
 * same binary must be counted against one shared key. This is the fix for
 * can_dc4e4a11: `claude-code` and `claude-sonnet` are the SAME `claude` binary
 * on the SAME host but were counted separately (3 + 6 → up to 9 concurrent
 * `claude` processes, oversubscribing the machine + API).
 *
 * @param {string} name - Agent name (or alias) to resolve.
 * @returns {string} The profile's `invoke_binary` when the profile declares
 *   one; otherwise the alias-resolved agent name.
 */
export function resolveResourceKey(name) {
    const binary = getCapabilityProfile(name)?.invoke_binary;
    if (binary !== undefined && binary !== null) {
        return binary;
    }
    // No profile, or a profile without a binary: key on the resolved identity.
    return resolveAgentAlias(name);
}
|
|
346
|
+
/**
 * Resolve the concurrency limit for an agent. `Infinity` = unlimited.
 *
 * Resolution chain (highest priority first), decoupled from agent identity:
 *   1. explicit `override` (e.g. `brainclaw dispatch --max-concurrency N`)
 *   2. host opt-in cap via `BRAINCLAW_MAX_CONCURRENCY` (protect one machine / quota)
 *   3. structural floor — agents that cannot run headless in parallel
 *      (IDE / desktop agents, i.e. not CLI-spawnable) stay hard-capped at their
 *      profile `max_concurrent_tasks` (you can't spawn N IDE windows headlessly)
 *   4. default for parallelizable CLI agents: UNLIMITED. There is no arbitrary
 *      per-identity throttle — the operator opts into a cap when they want one.
 *
 * When a finite cap applies it is enforced per host-binary resource
 * (see `resolveResourceKey`), so all variants of one binary share the pool.
 *
 * @param {string} name - Agent name (or alias).
 * @param {{ override?: number | string | null }} [opts] - `override` is the
 *   explicit cap; it is ignored unless it coerces to a number greater than 0.
 * @returns {number} A positive number, or `Infinity` for unlimited. Always a
 *   number, never the raw (possibly string) override.
 */
export function resolveConcurrencyLimit(name, opts = {}) {
    // Coerce before validating: a numeric string (e.g. an unparsed flag value)
    // used to be returned as-is, and `Number.isFinite('5')` is false, so
    // `serializeConcurrencyLimit` would then report the cap as unlimited.
    const override = opts.override == null ? Number.NaN : Number(opts.override);
    if (override > 0)
        return override;
    // `Number(undefined)` is NaN and `Number('')` is 0 — both fall through.
    const envCap = Number(process.env.BRAINCLAW_MAX_CONCURRENCY);
    if (Number.isFinite(envCap) && envCap > 0)
        return envCap;
    const profile = getCapabilityProfile(name);
    // Unknown agents and non-CLI-spawnable agents keep the structural floor.
    if (!profile?.runtime?.canBeSpawnedCli)
        return profile?.max_concurrent_tasks ?? 1;
    return Infinity;
}
|
|
372
|
+
/**
 * Render a concurrency limit in a JSON-safe form.
 *
 * @param {*} limit - A resolved limit, typically a number or `Infinity`.
 * @returns {number | null} `limit` unchanged when it is a finite number;
 *   `null` (= unlimited) for `Infinity` or any other non-finite value.
 */
export function serializeConcurrencyLimit(limit) {
    if (!Number.isFinite(limit)) {
        return null;
    }
    return limit;
}
|
|
376
|
+
/**
 * pln#520 step 3 — pick the model for a dispatch independently of agent
 * identity.
 *
 * Candidates are tried in priority order and the first one that is neither
 * `null` nor `undefined` wins:
 *   explicit override (e.g. `dispatch --model`) → lane model → identity model
 *   → the capability profile's `default_model`.
 *
 * @param {string} name - Agent name (or alias) whose profile supplies the default.
 * @param {{ override?: string, lane?: string, identity?: string }} [opts]
 * @returns {string | undefined} The chosen model, or `undefined` when nothing
 *   in the chain specifies one (the agent's template default then applies).
 */
export function resolveModel(name, opts = {}) {
    const profile = getCapabilityProfile(name);
    for (const source of ['override', 'lane', 'identity']) {
        const candidate = opts[source];
        if (candidate !== undefined && candidate !== null) {
            return candidate;
        }
    }
    return profile?.default_model;
}
|
|
326
387
|
/**
|
|
327
388
|
* Escape a string for safe use as a double-quoted shell argument.
|
|
328
389
|
* Escapes characters that have special meaning inside double-quotes
|
|
@@ -490,6 +551,12 @@ export function buildInvokeCommand(name, prompt, options = {}) {
|
|
|
490
551
|
const rawTokens = parseTemplateString(templateStr);
|
|
491
552
|
if (rawTokens.length === 0)
|
|
492
553
|
return undefined;
|
|
554
|
+
// pln#520 step 3: inject the resolved model right after the binary so model
|
|
555
|
+
// choice is decoupled from agent identity. Only when the profile declares a
|
|
556
|
+
// `model_flag` and the template doesn't already pin a model (don't double it).
|
|
557
|
+
if (options.model && profile.model_flag && !rawTokens.includes(profile.model_flag)) {
|
|
558
|
+
rawTokens.splice(1, 0, profile.model_flag, options.model);
|
|
559
|
+
}
|
|
493
560
|
const executable = rawTokens[0];
|
|
494
561
|
const interpolatedTokens = rawTokens.slice(1).map((tok) => tok === '{prompt}' ? embeddedPrompt : tok);
|
|
495
562
|
// ── 5. Build the args array ───────────────────────────────────────────────
|
|
@@ -5,6 +5,14 @@ import { spawnSync } from 'node:child_process';
|
|
|
5
5
|
import yaml from 'yaml';
|
|
6
6
|
import { MEMORY_DIR } from './io.js';
|
|
7
7
|
import { detectHostExecutionProfile, } from './execution-profile.js';
|
|
8
|
+
import { getCapabilityProfile } from './agent-capability.js';
|
|
9
|
+
/**
|
|
10
|
+
* trp#427 — cold-start CLI `--version` probes need headroom; a 3s timeout
|
|
11
|
+
* false-negatived claude-code on first launch. The spawnable check (binary on
|
|
12
|
+
* PATH) is the robust signal, so this only affects version-string capture
|
|
13
|
+
* latency, not the installed/spawnable decision.
|
|
14
|
+
*/
|
|
15
|
+
const VERSION_PROBE_TIMEOUT_MS = 8000;
|
|
8
16
|
function tryCommand(command, args, timeout = 5000) {
|
|
9
17
|
try {
|
|
10
18
|
const r = spawnSync(command, args, { encoding: 'utf-8', timeout, windowsHide: true });
|
|
@@ -14,12 +22,40 @@ function tryCommand(command, args, timeout = 5000) {
|
|
|
14
22
|
return { ok: false, stdout: '' };
|
|
15
23
|
}
|
|
16
24
|
}
|
|
25
|
+
/**
 * trp#427 — quick PATH lookup for a binary. Runs `where` (Windows) or `which`
 * (POSIX) with a 3s timeout instead of launching the binary itself the way a
 * `--version` probe does.
 *
 * @param {string} binary - Executable name to look up.
 * @returns {boolean} `true` only when the lookup exits with status 0 and
 *   prints a non-blank result; `false` for an empty name, a failed lookup, or
 *   any error thrown while spawning.
 */
function isBinaryOnPath(binary) {
    if (!binary) {
        return false;
    }
    try {
        const lookupTool = process.platform === 'win32' ? 'where' : 'which';
        const { status, stdout } = spawnSync(lookupTool, [binary], { encoding: 'utf-8', timeout: 3000, windowsHide: true });
        if (status !== 0) {
            return false;
        }
        return (stdout ?? '').trim().length > 0;
    }
    catch {
        // Best-effort probe: an unspawnable lookup tool just means "not found".
        return false;
    }
}
|
|
41
|
+
/**
 * trp#427 — decide whether an agent can be spawned from this host.
 *
 * An agent is SPAWNABLE when its capability profile is CLI-spawnable, declares
 * an invoke binary, and that binary resolves on PATH. This is deliberately
 * separate from the `--version` health probe so a slow cold-start CLI is never
 * misreported as "not installed" / undispatchable.
 *
 * @param {string} agentName - Agent name (or alias) to check.
 * @returns {boolean} `false` when there is no profile, the profile is not
 *   CLI-spawnable, or it declares no invoke binary; otherwise the result of
 *   the PATH lookup for that binary.
 */
export function detectSpawnable(agentName) {
    const profile = getCapabilityProfile(agentName);
    const cliBinary = profile?.runtime?.canBeSpawnedCli ? profile.invoke_binary : undefined;
    return cliBinary ? isBinaryOnPath(cliBinary) : false;
}
|
|
17
53
|
const AGENT_DEFINITIONS = [
|
|
18
54
|
{
|
|
19
55
|
name: 'claude-code',
|
|
20
56
|
detect: (_home, env) => {
|
|
21
57
|
// Check if claude CLI is available
|
|
22
|
-
const cli = tryCommand('claude', ['--version'],
|
|
58
|
+
const cli = tryCommand('claude', ['--version'], VERSION_PROBE_TIMEOUT_MS);
|
|
23
59
|
if (cli.ok) {
|
|
24
60
|
const ver = cli.stdout.trim().match(/(\d+\.\d+\.\d+)/)?.[1];
|
|
25
61
|
return { installed: true, method: 'claude CLI', version: ver };
|
|
@@ -81,7 +117,7 @@ const AGENT_DEFINITIONS = [
|
|
|
81
117
|
if (fs.existsSync(codexDir)) {
|
|
82
118
|
return { installed: true, method: '~/.codex directory' };
|
|
83
119
|
}
|
|
84
|
-
const cli = tryCommand('codex', ['--version'],
|
|
120
|
+
const cli = tryCommand('codex', ['--version'], VERSION_PROBE_TIMEOUT_MS);
|
|
85
121
|
if (cli.ok) {
|
|
86
122
|
const ver = cli.stdout.trim().match(/(\d+\.\d+\.\d+)/)?.[1];
|
|
87
123
|
return { installed: true, method: 'codex CLI', version: ver };
|
|
@@ -252,7 +288,7 @@ const AGENT_DEFINITIONS = [
|
|
|
252
288
|
if (fs.existsSync(path.join(home, '.gemini', 'antigravity'))) {
|
|
253
289
|
return { installed: true, method: '~/.gemini/antigravity directory' };
|
|
254
290
|
}
|
|
255
|
-
const cli = tryCommand('gemini', ['--version'],
|
|
291
|
+
const cli = tryCommand('gemini', ['--version'], VERSION_PROBE_TIMEOUT_MS);
|
|
256
292
|
if (cli.ok) {
|
|
257
293
|
return { installed: true, method: 'gemini CLI', version: cli.stdout.trim() };
|
|
258
294
|
}
|
|
@@ -309,7 +345,7 @@ const AGENT_DEFINITIONS = [
|
|
|
309
345
|
if (fs.existsSync(path.join(home, '.hermes'))) {
|
|
310
346
|
return { installed: true, method: '~/.hermes directory' };
|
|
311
347
|
}
|
|
312
|
-
const cli = tryCommand('hermes', ['--version'],
|
|
348
|
+
const cli = tryCommand('hermes', ['--version'], VERSION_PROBE_TIMEOUT_MS);
|
|
313
349
|
if (cli.ok) {
|
|
314
350
|
return { installed: true, method: 'hermes CLI', version: cli.stdout.trim() };
|
|
315
351
|
}
|
|
@@ -332,14 +368,23 @@ const AGENT_DEFINITIONS = [
|
|
|
332
368
|
/**
|
|
333
369
|
* Detect ALL installed agents on this machine (not just the running one).
|
|
334
370
|
*/
|
|
335
|
-
export function buildAgentInventory(homeDir = os.homedir(), env = process.env) {
|
|
371
|
+
export function buildAgentInventory(homeDir = os.homedir(), env = process.env, opts = {}) {
|
|
372
|
+
const spawnableResolver = opts.spawnableResolver ?? detectSpawnable;
|
|
336
373
|
const agents = AGENT_DEFINITIONS.map(def => {
|
|
337
374
|
const detection = def.detect(homeDir, env);
|
|
375
|
+
const spawnable = spawnableResolver(def.name);
|
|
376
|
+
// trp#427: an agent brainclaw can spawn (invoke binary on PATH) IS installed,
|
|
377
|
+
// even when the cold-start `--version` probe timed out. This decouples the
|
|
378
|
+
// dispatch decision (getInstalledAgentNames) from probe latency.
|
|
379
|
+
const installed = detection.installed || spawnable;
|
|
338
380
|
return {
|
|
339
381
|
name: def.name,
|
|
340
|
-
installed
|
|
341
|
-
detection_method: detection.
|
|
382
|
+
installed,
|
|
383
|
+
detection_method: detection.installed
|
|
384
|
+
? detection.method
|
|
385
|
+
: (spawnable ? 'spawnable: invoke binary on PATH' : detection.method),
|
|
342
386
|
version: detection.version,
|
|
387
|
+
spawnable,
|
|
343
388
|
models: def.models,
|
|
344
389
|
native_tools: def.native_tools,
|
|
345
390
|
mcp_support: def.mcp_support,
|
|
@@ -415,6 +460,8 @@ export function renderAgentInventorySummary(inventory) {
|
|
|
415
460
|
features.push('Rules');
|
|
416
461
|
if (agent.hooks_support)
|
|
417
462
|
features.push('Hooks');
|
|
463
|
+
if (agent.spawnable)
|
|
464
|
+
features.push('Spawnable');
|
|
418
465
|
lines.push(` Features: ${features.join(', ') || 'none'}`);
|
|
419
466
|
if (agent.instruction_file) {
|
|
420
467
|
lines.push(` Instructions: ${agent.instruction_file}`);
|
|
@@ -38,6 +38,7 @@ import { loadClaim } from './claims.js';
|
|
|
38
38
|
import { loadAssignment } from './assignments.js';
|
|
39
39
|
import { createRuntimeEvent } from './events.js';
|
|
40
40
|
import { nowISO } from './ids.js';
|
|
41
|
+
import { readHeartbeat, readLogTail, signalExists } from './runtime-signals.js';
|
|
41
42
|
// ── Constants ──────────────────────────────────────────────────────────────
|
|
42
43
|
/**
|
|
43
44
|
* Minimum age before a run is eligible for reconciliation. Below this, the
|
|
@@ -52,6 +53,11 @@ export const DEFAULT_HEALTH_CHECK_GRACE_MS = 60_000;
|
|
|
52
53
|
export const DEFAULT_STALE_AFTER_MS = 30 * 60_000;
|
|
53
54
|
export const DEFAULT_DEAD_PID_READ_SWEEP_AGE_MS = 5 * 60_000;
|
|
54
55
|
export const DEFAULT_DEAD_PID_READ_SWEEP_LIMIT = 50;
|
|
56
|
+
/**
|
|
57
|
+
* pln#520 step 1 — a heartbeat older than this (with no completion signal) means
|
|
58
|
+
* the worker reached its loop then went silent: `stalled`. Default 10 min.
|
|
59
|
+
*/
|
|
60
|
+
export const DEFAULT_HEARTBEAT_STALE_MS = 10 * 60_000;
|
|
55
61
|
const TERMINAL_STATUSES = new Set([
|
|
56
62
|
'completed', 'failed', 'cancelled', 'timed_out', 'interrupted',
|
|
57
63
|
]);
|
|
@@ -152,15 +158,51 @@ export function collectEvidence(run, cwd, options) {
|
|
|
152
158
|
}
|
|
153
159
|
catch { /* defensive */ }
|
|
154
160
|
const process_alive = isProcessAlive(run.pid);
|
|
155
|
-
|
|
161
|
+
// pln#520 step 1 — sentinel evidence. Signals live under the project
|
|
162
|
+
// coordination dir (the dispatcher's ackRoot), which is `cwd` for the
|
|
163
|
+
// reconciler. Keyed by assignment_id.
|
|
164
|
+
const signalRoot = cwd ?? process.cwd();
|
|
165
|
+
let completed_signal = false;
|
|
166
|
+
let failed_signal = false;
|
|
167
|
+
let heartbeat_exists = false;
|
|
168
|
+
let heartbeat_age_ms;
|
|
169
|
+
try {
|
|
170
|
+
completed_signal = signalExists(signalRoot, run.assignment_id, 'completed');
|
|
171
|
+
failed_signal = signalExists(signalRoot, run.assignment_id, 'failed');
|
|
172
|
+
const hb = readHeartbeat(signalRoot, run.assignment_id);
|
|
173
|
+
heartbeat_exists = hb.exists;
|
|
174
|
+
if (hb.exists && hb.mtimeMs !== undefined)
|
|
175
|
+
heartbeat_age_ms = now - hb.mtimeMs;
|
|
176
|
+
}
|
|
177
|
+
catch { /* defensive */ }
|
|
178
|
+
return {
|
|
179
|
+
age_ms, has_post_start_commit, claim_released, assignment_completed, process_alive,
|
|
180
|
+
completed_signal, failed_signal, heartbeat_exists, heartbeat_age_ms,
|
|
181
|
+
};
|
|
156
182
|
}
|
|
157
183
|
function anyCompletionEvidence(evidence) {
|
|
158
|
-
return evidence.
|
|
184
|
+
return evidence.completed_signal
|
|
185
|
+
|| evidence.has_post_start_commit
|
|
159
186
|
|| evidence.claim_released
|
|
160
187
|
|| evidence.assignment_completed;
|
|
161
188
|
}
|
|
189
|
+
/**
 * pln#520 step 1 — build a short log-tail suffix for failed_silent / stalled
 * diagnostics, so the verdict carries the worker's last words instead of just
 * a status code.
 *
 * Reads up to 500 units of captured stderr; when that is blank, falls back to
 * stdout. Whitespace runs are collapsed to single spaces and the result is
 * capped at 300 characters.
 *
 * @param {{ assignment_id: string }} run - Run whose captured logs are keyed
 *   by `assignment_id`.
 * @param {string} [cwd] - Root the logs are read from; defaults to
 *   `process.cwd()`.
 * @returns {string} `' | log tail: …'`, or `''` when there is no captured
 *   output or the logs cannot be read.
 */
function logTailSuffix(run, cwd) {
    const root = cwd ?? process.cwd();
    let tail = '';
    try {
        tail = (readLogTail(root, run.assignment_id, 'stderr', 500).trim()
            || readLogTail(root, run.assignment_id, 'stdout', 500).trim());
    }
    catch {
        // Defensive: this suffix only decorates a failure reason, and callers
        // evaluate it outside their own try block — an unreadable log must
        // not prevent the run from being failed.
        return '';
    }
    if (!tail)
        return '';
    // Keep the END of the collapsed text: the most recent output is the part
    // worth reporting, and `slice(0, 300)` used to discard it.
    // NOTE(review): assumes readLogTail returns the end of the log — confirm.
    return ` | log tail: ${tail.replace(/\s+/g, ' ').slice(-300)}`;
}
|
|
162
202
|
function describeEvidence(evidence) {
|
|
163
203
|
const reasons = [];
|
|
204
|
+
if (evidence.completed_signal)
|
|
205
|
+
reasons.push('wrapper wrote completed sentinel');
|
|
164
206
|
if (evidence.has_post_start_commit)
|
|
165
207
|
reasons.push('post-start commit on worktree branch');
|
|
166
208
|
if (evidence.claim_released)
|
|
@@ -231,6 +273,7 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
|
|
|
231
273
|
const evidence = {
|
|
232
274
|
age_ms: 0, has_post_start_commit: false, claim_released: false,
|
|
233
275
|
assignment_completed: false, process_alive: undefined,
|
|
276
|
+
completed_signal: false, failed_signal: false, heartbeat_exists: false,
|
|
234
277
|
};
|
|
235
278
|
return {
|
|
236
279
|
run_id: runId, action: 'no_op', reason: 'run not found', evidence,
|
|
@@ -280,18 +323,12 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
|
|
|
280
323
|
};
|
|
281
324
|
}
|
|
282
325
|
}
|
|
283
|
-
//
|
|
284
|
-
|
|
326
|
+
// pln#520 step 1 — sentinel-based failure (fast + trustworthy, pid-independent).
|
|
327
|
+
const heartbeatStale = options.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
|
|
328
|
+
const failHere = (reason) => {
|
|
285
329
|
try {
|
|
286
|
-
transitionAgentRun(runId, 'failed', {
|
|
287
|
-
|
|
288
|
-
status_reason: 'silent_termination_no_evidence',
|
|
289
|
-
}, cwd);
|
|
290
|
-
return {
|
|
291
|
-
run_id: runId, action: 'inferred_failed',
|
|
292
|
-
reason: 'silent_termination_no_evidence',
|
|
293
|
-
evidence, previous_status, current_status: 'failed',
|
|
294
|
-
};
|
|
330
|
+
transitionAgentRun(runId, 'failed', { actor, status_reason: reason }, cwd);
|
|
331
|
+
return { run_id: runId, action: 'inferred_failed', reason, evidence, previous_status, current_status: 'failed' };
|
|
295
332
|
}
|
|
296
333
|
catch (err) {
|
|
297
334
|
return {
|
|
@@ -300,6 +337,26 @@ export function reconcileAgentRun(runId, cwd, options = {}) {
|
|
|
300
337
|
evidence, previous_status, current_status: run.status,
|
|
301
338
|
};
|
|
302
339
|
}
|
|
340
|
+
};
|
|
341
|
+
// `failed` sentinel — the wrapper saw a non-zero agent exit.
|
|
342
|
+
if (evidence.failed_signal) {
|
|
343
|
+
return failHere(`failed_silent: wrapper reported non-zero exit${logTailSuffix(run, cwd)}`);
|
|
344
|
+
}
|
|
345
|
+
// Heartbeat present but stale → reached the loop then went silent.
|
|
346
|
+
if (evidence.heartbeat_exists && evidence.heartbeat_age_ms !== undefined && evidence.heartbeat_age_ms >= heartbeatStale) {
|
|
347
|
+
return failHere(`stalled: heartbeat last seen ${Math.round(evidence.heartbeat_age_ms / 1000)}s ago${logTailSuffix(run, cwd)}`);
|
|
348
|
+
}
|
|
349
|
+
// Fresh heartbeat → alive; trust it over the untrustworthy wrapper pid.
|
|
350
|
+
if (evidence.heartbeat_exists) {
|
|
351
|
+
return {
|
|
352
|
+
run_id: runId, action: 'no_op',
|
|
353
|
+
reason: `heartbeat fresh (${Math.round((evidence.heartbeat_age_ms ?? 0) / 1000)}s) — worker alive, pid untrusted`,
|
|
354
|
+
evidence, previous_status, current_status: run.status,
|
|
355
|
+
};
|
|
356
|
+
}
|
|
357
|
+
// Failure inference: stale + dead process + no evidence.
|
|
358
|
+
if (evidence.age_ms >= stale && evidence.process_alive === false) {
|
|
359
|
+
return failHere('silent_termination_no_evidence');
|
|
303
360
|
}
|
|
304
361
|
// Health-check window: past grace, not yet stale, no evidence either way.
|
|
305
362
|
// Emit a non-mutating event so callers see the uncertainty without
|
|
@@ -339,6 +396,7 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
|
|
|
339
396
|
const evidence = {
|
|
340
397
|
age_ms: 0, has_post_start_commit: false, claim_released: false,
|
|
341
398
|
assignment_completed: false, process_alive: undefined,
|
|
399
|
+
completed_signal: false, failed_signal: false, heartbeat_exists: false,
|
|
342
400
|
};
|
|
343
401
|
return {
|
|
344
402
|
run_id: runId, action: 'no_op', reason: 'run not found', evidence,
|
|
@@ -352,19 +410,25 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
|
|
|
352
410
|
evidence, previous_status: run.status, current_status: run.status,
|
|
353
411
|
};
|
|
354
412
|
}
|
|
355
|
-
if (evidence.process_alive !== false) {
|
|
356
|
-
return {
|
|
357
|
-
run_id: run.id, action: 'no_op',
|
|
358
|
-
reason: evidence.process_alive === true ? 'process alive' : 'pid liveness unknown',
|
|
359
|
-
evidence, previous_status: run.status, current_status: run.status,
|
|
360
|
-
};
|
|
361
|
-
}
|
|
362
|
-
// pid reads dead — but the tracked pid is NOT trustworthy (see doc above),
|
|
363
|
-
// so a bare dead pid NEVER cancels. Evidence of real work wins; otherwise
|
|
364
|
-
// surface the uncertainty non-destructively and leave the run `running` for
|
|
365
|
-
// reconcileAgentRun's stale-threshold path to fail it only after a fair,
|
|
366
|
-
// evidence-based delay.
|
|
367
413
|
const actor = options.actor ?? 'reconciler';
|
|
414
|
+
const stale = options.staleAfterMs ?? DEFAULT_STALE_AFTER_MS;
|
|
415
|
+
const heartbeatStale = options.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
|
|
416
|
+
const failRun = (reason) => {
|
|
417
|
+
try {
|
|
418
|
+
transitionAgentRun(run.id, 'failed', { actor, status_reason: reason }, cwd);
|
|
419
|
+
return { run_id: run.id, action: 'inferred_failed', reason, evidence, previous_status: run.status, current_status: 'failed' };
|
|
420
|
+
}
|
|
421
|
+
catch (err) {
|
|
422
|
+
return {
|
|
423
|
+
run_id: run.id, action: 'no_op',
|
|
424
|
+
reason: `failure transition rejected: ${err instanceof Error ? err.message : String(err)}`,
|
|
425
|
+
evidence, previous_status: run.status, current_status: run.status,
|
|
426
|
+
};
|
|
427
|
+
}
|
|
428
|
+
};
|
|
429
|
+
// ── pln#520 step 1: SENTINELS are authoritative, independent of the
|
|
430
|
+
// untrustworthy wrapper pid. Check them first. ──────────────────────────
|
|
431
|
+
// 1. Completion evidence (mechanical `completed` sentinel or work evidence).
|
|
368
432
|
if (anyCompletionEvidence(evidence)) {
|
|
369
433
|
try {
|
|
370
434
|
transitionAgentRun(run.id, 'completed', {
|
|
@@ -385,33 +449,43 @@ export function reconcileDeadPidRunningAgentRunAtRead(runId, cwd, options = {})
|
|
|
385
449
|
};
|
|
386
450
|
}
|
|
387
451
|
}
|
|
388
|
-
//
|
|
389
|
-
//
|
|
390
|
-
//
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
//
|
|
395
|
-
|
|
452
|
+
// 2. `failed` sentinel — the wrapper saw a non-zero agent exit. This is the
|
|
453
|
+
// FAST, TRUSTWORTHY failed_silent detector (vs the pid heuristic that caused
|
|
454
|
+
// can_f792cacd false negatives). Carries the captured log tail.
|
|
455
|
+
if (evidence.failed_signal) {
|
|
456
|
+
return failRun(`failed_silent: wrapper reported non-zero exit${logTailSuffix(run, cwd)}`);
|
|
457
|
+
}
|
|
458
|
+
// 3. Heartbeat present but STALE → the worker reached its loop then went
|
|
459
|
+
// silent (e.g. hung). pid-independent: a hung worker keeps the wrapper alive.
|
|
460
|
+
if (evidence.heartbeat_exists && evidence.heartbeat_age_ms !== undefined && evidence.heartbeat_age_ms >= heartbeatStale) {
|
|
461
|
+
return failRun(`stalled: heartbeat last seen ${Math.round(evidence.heartbeat_age_ms / 1000)}s ago${logTailSuffix(run, cwd)}`);
|
|
462
|
+
}
|
|
463
|
+
// 4. Fresh heartbeat → the worker is alive and working; trust it OVER the
|
|
464
|
+
// (untrustworthy) wrapper pid. This is the can_f792cacd fix: never fail a
|
|
465
|
+
// live, heartbeating worker just because its wrapper pid reads dead.
|
|
466
|
+
if (evidence.heartbeat_exists) {
|
|
467
|
+
return {
|
|
468
|
+
run_id: run.id, action: 'no_op',
|
|
469
|
+
reason: `heartbeat fresh (${Math.round((evidence.heartbeat_age_ms ?? 0) / 1000)}s) — worker alive, pid untrusted`,
|
|
470
|
+
evidence, previous_status: run.status, current_status: run.status,
|
|
471
|
+
};
|
|
472
|
+
}
|
|
473
|
+
// ── No sentinel, no heartbeat: fall back to the pid-conservative path. The
|
|
474
|
+
// wrapper writes completed/failed on any normal exit, so reaching here means
|
|
475
|
+
// the worker has not exited and never heartbeat. Do NOT fast-fail on a dead
|
|
476
|
+
// pid (it's the wrapper's, not the worker's). ──────────────────────────────
|
|
477
|
+
if (evidence.process_alive !== false) {
|
|
478
|
+
return {
|
|
479
|
+
run_id: run.id, action: 'no_op',
|
|
480
|
+
reason: evidence.process_alive === true ? 'process alive' : 'pid liveness unknown',
|
|
481
|
+
evidence, previous_status: run.status, current_status: run.status,
|
|
482
|
+
};
|
|
483
|
+
}
|
|
484
|
+
// pid dead + no sentinel + no heartbeat: only converge after the long stale
|
|
485
|
+
// window (trp#292 — must converge HERE since the read path never routes
|
|
486
|
+
// through reconcileAgentRun), giving an untrusted-pid worker ample time.
|
|
396
487
|
if (evidence.age_ms >= stale) {
|
|
397
|
-
|
|
398
|
-
transitionAgentRun(run.id, 'failed', {
|
|
399
|
-
actor,
|
|
400
|
-
status_reason: 'silent_termination_no_evidence',
|
|
401
|
-
}, cwd);
|
|
402
|
-
return {
|
|
403
|
-
run_id: run.id, action: 'inferred_failed',
|
|
404
|
-
reason: 'silent_termination_no_evidence',
|
|
405
|
-
evidence, previous_status: run.status, current_status: 'failed',
|
|
406
|
-
};
|
|
407
|
-
}
|
|
408
|
-
catch (err) {
|
|
409
|
-
return {
|
|
410
|
-
run_id: run.id, action: 'no_op',
|
|
411
|
-
reason: `failure transition rejected: ${err instanceof Error ? err.message : String(err)}`,
|
|
412
|
-
evidence, previous_status: run.status, current_status: run.status,
|
|
413
|
-
};
|
|
414
|
-
}
|
|
488
|
+
return failRun('silent_termination_no_evidence');
|
|
415
489
|
}
|
|
416
490
|
emitUnverifiedEvent(run, evidence, actor, cwd);
|
|
417
491
|
return {
|
|
@@ -457,7 +531,7 @@ export function reconcileAllOpenRuns(cwd, filter = {}, options = {}) {
|
|
|
457
531
|
catch {
|
|
458
532
|
results.push({
|
|
459
533
|
run_id: run.id, action: 'no_op', reason: 'reconcile threw — skipped',
|
|
460
|
-
evidence: { age_ms: 0, has_post_start_commit: false, claim_released: false, assignment_completed: false, process_alive: undefined },
|
|
534
|
+
evidence: { age_ms: 0, has_post_start_commit: false, claim_released: false, assignment_completed: false, process_alive: undefined, completed_signal: false, failed_signal: false, heartbeat_exists: false },
|
|
461
535
|
previous_status: run.status, current_status: run.status,
|
|
462
536
|
});
|
|
463
537
|
}
|
|
@@ -11,7 +11,7 @@ import { inferProjectFromTarget, loadInstructions, resolveInstructions } from '.
|
|
|
11
11
|
import { buildReputationSummary, findAgentReputationSummary } from './reputation.js';
|
|
12
12
|
import { listRuntimeNotes } from './runtime.js';
|
|
13
13
|
import { loadState, persistState } from './state.js';
|
|
14
|
-
import {
|
|
14
|
+
import { resolveConcurrencyLimit, serializeConcurrencyLimit } from './agent-capability.js';
|
|
15
15
|
import { loadAllSessions } from './identity.js';
|
|
16
16
|
import { countActionable } from './messaging.js';
|
|
17
17
|
import { listCandidates } from './candidates.js';
|
|
@@ -176,8 +176,7 @@ function buildOtherAgentsSummary(claims, notes, currentAgent, cwd) {
|
|
|
176
176
|
for (const identity of listAgentIdentities(cwd)) {
|
|
177
177
|
if (identity.agent_name === currentAgent)
|
|
178
178
|
continue;
|
|
179
|
-
const
|
|
180
|
-
const maxTasks = profile?.max_concurrent_tasks ?? 1;
|
|
179
|
+
const limit = serializeConcurrencyLimit(resolveConcurrencyLimit(identity.agent_name));
|
|
181
180
|
agentMap.set(identity.agent_name, {
|
|
182
181
|
name: identity.agent_name,
|
|
183
182
|
trust_level: identity.trust_level ?? 'contributor',
|
|
@@ -185,23 +184,25 @@ function buildOtherAgentsSummary(claims, notes, currentAgent, cwd) {
|
|
|
185
184
|
scopes: [],
|
|
186
185
|
has_open_session: false,
|
|
187
186
|
instance_count: sessionCounts.get(identity.agent_name) ?? 0,
|
|
188
|
-
max_tasks:
|
|
189
|
-
slots_remaining:
|
|
187
|
+
max_tasks: limit,
|
|
188
|
+
slots_remaining: limit, // will be reduced when claims are counted (null stays unlimited)
|
|
190
189
|
});
|
|
191
190
|
}
|
|
192
191
|
// Enrich with active claims
|
|
193
192
|
for (const claim of claims) {
|
|
194
193
|
if (claim.agent === currentAgent)
|
|
195
194
|
continue;
|
|
196
|
-
const
|
|
197
|
-
const maxTasks = profile?.max_concurrent_tasks ?? 1;
|
|
195
|
+
const limit = serializeConcurrencyLimit(resolveConcurrencyLimit(claim.agent));
|
|
198
196
|
const existing = agentMap.get(claim.agent) ?? {
|
|
199
197
|
name: claim.agent, trust_level: 'contributor', claim_count: 0, scopes: [],
|
|
200
198
|
has_open_session: false, instance_count: sessionCounts.get(claim.agent) ?? 0,
|
|
201
|
-
max_tasks:
|
|
199
|
+
max_tasks: limit, slots_remaining: limit,
|
|
202
200
|
};
|
|
203
201
|
existing.claim_count++;
|
|
204
|
-
|
|
202
|
+
// null max_tasks = unlimited → slots stay unlimited.
|
|
203
|
+
existing.slots_remaining = existing.max_tasks === null
|
|
204
|
+
? null
|
|
205
|
+
: Math.max(0, existing.max_tasks - existing.claim_count);
|
|
205
206
|
existing.scopes.push(claim.scope);
|
|
206
207
|
if (!existing.last_active || claim.created_at > existing.last_active) {
|
|
207
208
|
existing.last_active = claim.created_at;
|
package/dist/core/dirty-scope.js
CHANGED
|
@@ -44,13 +44,19 @@ function defaultRunGit(cwd, args) {
|
|
|
44
44
|
return { ok: false, stdout: '' };
|
|
45
45
|
}
|
|
46
46
|
}
|
|
47
|
-
/**
|
|
47
|
+
/**
|
|
48
|
+
* Top-level directories that are dirty as a side effect of coordination /
|
|
49
|
+
* agent tooling, never part of a dispatch's code scope:
|
|
50
|
+
* - `.brainclaw`, `.git` — coordination store + VCS metadata.
|
|
51
|
+
* - `.claude`, `.cursor`, `.codex` — per-agent local config (trp#371). A
|
|
52
|
+
* worker leaving these dirty (Claude Code settings, etc.) must not block an
|
|
53
|
+
* otherwise-safe dispatch of an unrelated code scope.
|
|
54
|
+
*/
|
|
55
|
+
const SYSTEM_DIRTY_DIRS = ['.brainclaw', '.git', '.claude', '.cursor', '.codex'];
|
|
56
|
+
/** True for coordination/store/agent-config paths that are dirty as a side effect of tooling. */
|
|
48
57
|
export function isSystemDirtyPath(p) {
|
|
49
58
|
const norm = p.replace(/\\/g, '/');
|
|
50
|
-
return norm ===
|
|
51
|
-
|| norm.startsWith('.brainclaw/')
|
|
52
|
-
|| norm === '.git'
|
|
53
|
-
|| norm.startsWith('.git/');
|
|
59
|
+
return SYSTEM_DIRTY_DIRS.some((dir) => norm === dir || norm.startsWith(dir + '/'));
|
|
54
60
|
}
|
|
55
61
|
/**
|
|
56
62
|
* Parse `git status --porcelain=v1 -z` output into a flat list of paths.
|