nubos-pilot 1.2.2 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/README.md +16 -0
- package/agents/np-architect.md +2 -0
- package/agents/np-executor.md +1 -1
- package/agents/np-learnings-extractor.md +54 -0
- package/agents/np-planner.md +1 -1
- package/agents/np-security-reviewer.md +9 -0
- package/bin/np-tools/_commands.cjs +4 -0
- package/bin/np-tools/derive-tier.cjs +86 -0
- package/bin/np-tools/derive-tier.test.cjs +83 -0
- package/bin/np-tools/learnings.cjs +109 -0
- package/bin/np-tools/learnings.test.cjs +66 -0
- package/bin/np-tools/loop-run-round.cjs +7 -1
- package/bin/np-tools/security.cjs +3 -0
- package/bin/np-tools/skill-audit.cjs +79 -0
- package/bin/np-tools/skill-audit.test.cjs +86 -0
- package/bin/np-tools/spawn-headless.cjs +35 -1
- package/bin/np-tools/spawn-headless.test.cjs +135 -0
- package/bin/np-tools/verify-reliability.cjs +65 -0
- package/bin/np-tools/verify-reliability.test.cjs +69 -0
- package/lib/agents.test.cjs +1 -0
- package/lib/config-defaults.cjs +13 -0
- package/lib/config-schema.cjs +11 -0
- package/lib/eval-reliability.cjs +63 -0
- package/lib/eval-reliability.test.cjs +56 -0
- package/lib/headless-guard.cjs +127 -0
- package/lib/headless-guard.test.cjs +119 -0
- package/lib/install/claude-hooks-learnings.test.cjs +82 -0
- package/lib/install/claude-hooks.cjs +65 -4
- package/lib/install/claude-hooks.test.cjs +5 -2
- package/lib/learnings/capture-ledger.cjs +80 -0
- package/lib/learnings/capture-ledger.test.cjs +54 -0
- package/lib/learnings/extract.cjs +191 -0
- package/lib/learnings/extract.test.cjs +115 -0
- package/lib/nubosloop-audit.cjs +104 -0
- package/lib/nubosloop-skill-audit.test.cjs +98 -0
- package/lib/nubosloop.cjs +9 -0
- package/lib/tier-classify.cjs +67 -0
- package/lib/tier-classify.test.cjs +67 -0
- package/np-tools.cjs +4 -0
- package/package.json +1 -1
- package/skills/np-access-control/SKILL.md +42 -0
- package/skills/np-accessibility-audit/SKILL.md +41 -0
- package/skills/np-adr/SKILL.md +37 -0
- package/skills/np-api-design/SKILL.md +34 -0
- package/skills/np-caching-strategy/SKILL.md +38 -0
- package/skills/np-data-modeling/SKILL.md +37 -0
- package/skills/np-data-privacy/SKILL.md +39 -0
- package/skills/np-dependency-audit/SKILL.md +47 -0
- package/skills/np-encryption/SKILL.md +47 -0
- package/skills/np-error-handling/SKILL.md +37 -0
- package/skills/np-incident-response/SKILL.md +38 -0
- package/skills/np-llm-app-architecture/SKILL.md +50 -0
- package/skills/np-observability/SKILL.md +39 -0
- package/skills/np-performance/SKILL.md +38 -0
- package/skills/np-queue-design/SKILL.md +32 -0
- package/skills/np-rag-design/SKILL.md +43 -0
- package/skills/np-refactoring/SKILL.md +35 -0
- package/skills/np-resilience-patterns/SKILL.md +39 -0
- package/skills/np-secure-code-review/SKILL.md +46 -0
- package/skills/np-secure-design/SKILL.md +44 -0
- package/skills/np-service-boundary/SKILL.md +35 -0
- package/skills/np-system-design/SKILL.md +40 -0
- package/skills/np-test-strategy/SKILL.md +46 -0
- package/skills/np-threat-model/SKILL.md +42 -0
- package/templates/claude/payload/hooks/np-learnings-hook.cjs +56 -0
- package/templates/claude/payload/hooks/np-security-hook.cjs +1 -0
- package/workflows/architect-phase.md +21 -1
- package/workflows/execute-phase.md +66 -4
- package/workflows/verify-work.md +17 -4
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { test } = require('node:test');
|
|
4
|
+
const assert = require('node:assert');
|
|
5
|
+
const fs = require('node:fs');
|
|
6
|
+
const os = require('node:os');
|
|
7
|
+
const path = require('node:path');
|
|
8
|
+
const { run } = require('./skill-audit.cjs');
|
|
9
|
+
const checkpoint = require('../../lib/checkpoint.cjs');
|
|
10
|
+
|
|
11
|
+
/**
 * Builds a throwaway project root under the OS temp directory.
 *
 * The root contains `.nubos-pilot/checkpoints/` and a `.nubos-pilot/STATE.md`
 * whose front matter has schema_version 2 and null phase/plan/task.
 *
 * @returns {string} path of the created directory; the caller removes it.
 */
function _mkRoot() {
  const sandbox = fs.mkdtempSync(path.join(os.tmpdir(), 'np-skill-cli-'));
  const pilotDir = path.join(sandbox, '.nubos-pilot');
  const statePath = path.join(pilotDir, 'STATE.md');
  const stateBody = '---\nschema_version: 2\ncurrent_phase: null\ncurrent_plan: null\ncurrent_task: null\n---\n';

  // recursive:true also creates the intermediate `.nubos-pilot` directory.
  fs.mkdirSync(path.join(pilotDir, 'checkpoints'), { recursive: true });
  fs.writeFileSync(statePath, stateBody, 'utf-8');
  return sandbox;
}
|
|
21
|
+
/**
 * Builds a CLI context whose stdout/stderr accumulate writes in memory.
 *
 * @param {string} cwd working directory handed through to the command under test
 * @returns {{cwd: string, stdout: {write: Function}, stderr: {write: Function}, out: {text: string}, err: {text: string}}}
 *   `out.text` / `err.text` hold everything written to the matching stream.
 */
function _cap(cwd) {
  // Each sink pairs a text buffer with a write() that appends to it and reports success.
  const makeSink = () => {
    const buffer = { text: '' };
    const stream = {
      write: (chunk) => {
        buffer.text += chunk;
        return true;
      },
    };
    return { buffer, stream };
  };

  const captured = makeSink();
  const errors = makeSink();
  return {
    cwd,
    stdout: captured.stream,
    stderr: errors.stream,
    out: captured.buffer,
    err: errors.buffer,
  };
}
|
|
25
|
+
// Task id used by every test that needs a started checkpoint.
const TID = 'M001-S001-T0001';

// Expecting two skills and acknowledging none must surface one finding that lists both.
test('SC-1: expect then findings reports the unacked skill', () => {
  const root = _mkRoot();
  try {
    checkpoint.startTask({ id: TID }, root);
    assert.equal(run(['expect', '--task', TID, '--skills', 'np-api-design,np-encryption'], _cap(root)), 0);
    const io = _cap(root);
    assert.equal(run(['findings', '--task', TID], io), 0);
    const { findings } = JSON.parse(io.out.text);
    assert.equal(findings.length, 1);
    assert.deepEqual(findings[0].raw.missing_skills.sort(), ['np-api-design', 'np-encryption']);
  } finally {
    fs.rmSync(root, { recursive: true, force: true });
  }
});

// Every step's exit code is asserted: without that, a failing `expect` would also
// produce zero findings and this test would pass without exercising `ack` at all.
test('SC-2: ack clears the finding for that skill', () => {
  const root = _mkRoot();
  try {
    checkpoint.startTask({ id: TID }, root);
    assert.equal(run(['expect', '--task', TID, '--skills', 'np-api-design'], _cap(root)), 0);
    assert.equal(run(['ack', '--task', TID, '--skill', 'np-api-design'], _cap(root)), 0);
    const io = _cap(root);
    assert.equal(run(['findings', '--task', TID], io), 0);
    assert.equal(JSON.parse(io.out.text).findings.length, 0);
  } finally {
    fs.rmSync(root, { recursive: true, force: true });
  }
});

// A malformed task id is rejected with the invalid-task-id envelope on stderr.
test('SC-3: invalid task id → error envelope exit 1', () => {
  const io = _cap(process.cwd());
  assert.equal(run(['ack', '--task', 'bogus', '--skill', 'x'], io), 1);
  assert.match(io.err.text, /skill-audit-invalid-task-id/);
});

// `ack` needs --skill; omitting it yields the missing-skill envelope.
test('SC-4: ack without --skill → error envelope exit 1', () => {
  const root = _mkRoot();
  try {
    checkpoint.startTask({ id: TID }, root);
    const io = _cap(root);
    assert.equal(run(['ack', '--task', TID], io), 1);
    assert.match(io.err.text, /skill-audit-missing-skill/);
  } finally {
    fs.rmSync(root, { recursive: true, force: true });
  }
});

// Unknown verbs fail on stderr; --help prints usage on stdout and succeeds.
test('SC-5: unknown verb → exit 1; --help → exit 0', () => {
  const unknownIo = _cap(process.cwd());
  assert.equal(run(['bogus'], unknownIo), 1);
  assert.match(unknownIo.err.text, /skill-audit-unknown-verb/);

  const helpIo = _cap(process.cwd());
  assert.equal(run(['--help'], helpIo), 0);
  assert.match(helpIo.out.text, /skill-audit/);
});

// An empty --skills list is accepted and leaves nothing to report.
test('SC-6: expect with empty skills is a no-op (no findings)', () => {
  const root = _mkRoot();
  try {
    checkpoint.startTask({ id: TID }, root);
    assert.equal(run(['expect', '--task', TID, '--skills', ''], _cap(root)), 0);
    const io = _cap(root);
    assert.equal(run(['findings', '--task', TID], io), 0);
    assert.equal(JSON.parse(io.out.text).findings.length, 0);
  } finally {
    fs.rmSync(root, { recursive: true, force: true });
  }
});
|
|
@@ -8,6 +8,7 @@ const child_process = require('node:child_process');
|
|
|
8
8
|
const { NubosPilotError, atomicWriteFileSync, appendJsonl, findProjectRoot } = require('../../lib/core.cjs');
|
|
9
9
|
const runContext = require('../../lib/run-context.cjs');
|
|
10
10
|
const safePath = require('../../lib/safe-path.cjs');
|
|
11
|
+
const headlessGuard = require('../../lib/headless-guard.cjs');
|
|
11
12
|
const args = require('./_args.cjs');
|
|
12
13
|
|
|
13
14
|
function _sha256(s) {
|
|
@@ -171,6 +172,22 @@ function run(argv, ctx) {
|
|
|
171
172
|
const stdout = context.stdout || process.stdout;
|
|
172
173
|
const list = Array.isArray(argv) ? argv : [];
|
|
173
174
|
|
|
175
|
+
if (headlessGuard.isHeadless(process.env)) {
|
|
176
|
+
throw new NubosPilotError(
|
|
177
|
+
'spawn-headless-reentrant',
|
|
178
|
+
'refusing to spawn a nested headless `claude` (NUBOS_PILOT_HEADLESS is set) — recursion guard',
|
|
179
|
+
{ depth: headlessGuard.currentDepth(process.env) },
|
|
180
|
+
);
|
|
181
|
+
}
|
|
182
|
+
if (headlessGuard.depthExceeded(process.env)) {
|
|
183
|
+
throw new NubosPilotError(
|
|
184
|
+
'spawn-headless-depth-exceeded',
|
|
185
|
+
'refusing to spawn headless `claude`: hook depth ' + headlessGuard.currentDepth(process.env)
|
|
186
|
+
+ ' has reached the cap ' + headlessGuard.maxDepth(process.env) + ' (recursion guard)',
|
|
187
|
+
{ depth: headlessGuard.currentDepth(process.env), max: headlessGuard.maxDepth(process.env) },
|
|
188
|
+
);
|
|
189
|
+
}
|
|
190
|
+
|
|
174
191
|
const agent = args.getFlag(list, '--agent');
|
|
175
192
|
if (!agent) {
|
|
176
193
|
throw new NubosPilotError(
|
|
@@ -213,6 +230,21 @@ function run(argv, ctx) {
|
|
|
213
230
|
|
|
214
231
|
const runId = runContext.getRunId();
|
|
215
232
|
|
|
233
|
+
let lockRoot;
|
|
234
|
+
try { lockRoot = findProjectRoot(cwd); }
|
|
235
|
+
catch { lockRoot = cwd; }
|
|
236
|
+
const lock = headlessGuard.tryAcquireSpawnLock(lockRoot, agent);
|
|
237
|
+
if (!lock.acquired) {
|
|
238
|
+
throw new NubosPilotError(
|
|
239
|
+
'spawn-headless-locked',
|
|
240
|
+
'another headless run for agent `' + agent + '` is already active in this project (concurrency guard)',
|
|
241
|
+
{ agent, holder: lock.holder || null },
|
|
242
|
+
);
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
const childEnv = _filterSpawnEnv(process.env);
|
|
246
|
+
Object.assign(childEnv, headlessGuard.childSpawnEnv(process.env));
|
|
247
|
+
|
|
216
248
|
const bin = _claudeBinary();
|
|
217
249
|
const claudeArgs = ['-p', '--output-format', 'json'];
|
|
218
250
|
const startedAt = new Date().toISOString();
|
|
@@ -224,7 +256,7 @@ function run(argv, ctx) {
|
|
|
224
256
|
timeout: timeoutMs,
|
|
225
257
|
maxBuffer: 64 * 1024 * 1024,
|
|
226
258
|
encoding: 'utf-8',
|
|
227
|
-
env:
|
|
259
|
+
env: childEnv,
|
|
228
260
|
killSignal: 'SIGKILL',
|
|
229
261
|
});
|
|
230
262
|
} catch (err) {
|
|
@@ -233,6 +265,8 @@ function run(argv, ctx) {
|
|
|
233
265
|
'failed to spawn `' + bin + '`: ' + (err && err.message),
|
|
234
266
|
{ bin, cause: err && err.code },
|
|
235
267
|
);
|
|
268
|
+
} finally {
|
|
269
|
+
lock.release();
|
|
236
270
|
}
|
|
237
271
|
if (result.error && result.error.code === 'ENOENT') {
|
|
238
272
|
throw new NubosPilotError(
|
|
@@ -8,6 +8,14 @@ const assert = require('node:assert/strict');
|
|
|
8
8
|
|
|
9
9
|
const spawnHeadless = require('./spawn-headless.cjs');
|
|
10
10
|
const runContext = require('../../lib/run-context.cjs');
|
|
11
|
+
const headlessGuard = require('../../lib/headless-guard.cjs');
|
|
12
|
+
|
|
13
|
+
/**
 * Writes an executable stand-in for the `claude` binary.
 *
 * @param {string} r directory to create the script in
 * @param {string} name file name of the script
 * @param {string} body script source, written as UTF-8
 * @returns {string} path of the script (mode set to 0o755)
 */
function _mockClaude(r, name, body) {
  const scriptPath = path.join(r, name);
  fs.writeFileSync(scriptPath, body, { encoding: 'utf-8' });
  // Explicit chmod so the resulting mode does not depend on the process umask.
  fs.chmodSync(scriptPath, 0o755);
  return scriptPath;
}
|
|
11
19
|
|
|
12
20
|
const _sandboxes = [];
|
|
13
21
|
const _envBackup = {};
|
|
@@ -488,6 +496,133 @@ test('SH-TRAIL-2 two sequential spawns append two parseable trail lines (jsonl i
|
|
|
488
496
|
for (const l of lines) JSON.parse(l);
|
|
489
497
|
});
|
|
490
498
|
|
|
499
|
+
// With NUBOS_PILOT_HEADLESS=1 in the environment run() must throw and produce no output file.
test('SH-GUARD-1 refuses to spawn when NUBOS_PILOT_HEADLESS=1 (reentrancy guard)', () => {
  const root = _mkRoot();
  fs.writeFileSync(path.join(root, 'p.md'), 'audit', 'utf-8');
  const fakeClaude = _mockClaude(root, 'mock.sh', '#!/bin/sh\ncat > /dev/null\necho "{}"\n');
  _setEnv('NUBOS_PILOT_CLAUDE_BIN', fakeClaude);
  _setEnv('NUBOS_PILOT_HEADLESS', '1');
  const capture = _cap();
  const argv = ['--agent', 'np-test-critic', '--prompt-path', 'p.md', '--output-path', 'out.json'];
  const attempt = () => spawnHeadless.run(argv, { cwd: root, stdout: capture.stub });

  assert.throws(attempt, (err) => err && err.code === 'spawn-headless-reentrant');
  assert.equal(fs.existsSync(path.join(root, 'out.json')), false, 'no claude must be spawned inside a headless run');
});

// With NUBOS_PILOT_HOOK_DEPTH=1 run() must throw the depth-exceeded error.
test('SH-GUARD-2 refuses to spawn when hook depth has reached the cap (depth guard)', () => {
  const root = _mkRoot();
  fs.writeFileSync(path.join(root, 'p.md'), 'audit', 'utf-8');
  const fakeClaude = _mockClaude(root, 'mock.sh', '#!/bin/sh\ncat > /dev/null\necho "{}"\n');
  _setEnv('NUBOS_PILOT_CLAUDE_BIN', fakeClaude);
  _setEnv('NUBOS_PILOT_HOOK_DEPTH', '1');
  const capture = _cap();
  const argv = ['--agent', 'np-test-critic', '--prompt-path', 'p.md', '--output-path', 'out.json'];
  const attempt = () => spawnHeadless.run(argv, { cwd: root, stdout: capture.stub });

  assert.throws(attempt, (err) => err && err.code === 'spawn-headless-depth-exceeded');
});

// The mock binary echoes the guard variables it sees; both must read '1'.
test('SH-GUARD-3 child env carries NUBOS_PILOT_HEADLESS=1 and depth=1 (one level deep only)', () => {
  const root = _mkRoot();
  fs.writeFileSync(path.join(root, 'p.md'), 'audit', 'utf-8');
  const envEchoScript =
    '#!/bin/sh\ncat > /dev/null\nprintf \'{"hl":"\'$NUBOS_PILOT_HEADLESS\'","depth":"\'$NUBOS_PILOT_HOOK_DEPTH\'"}\\n\'\n';
  const fakeClaude = _mockClaude(root, 'mock.sh', envEchoScript);
  _setEnv('NUBOS_PILOT_CLAUDE_BIN', fakeClaude);
  const capture = _cap();
  const argv = ['--agent', 'np-test-critic', '--prompt-path', 'p.md', '--output-path', 'out.json'];

  const exitCode = spawnHeadless.run(argv, { cwd: root, stdout: capture.stub });
  assert.equal(exitCode, 0);

  const seenByChild = JSON.parse(fs.readFileSync(path.join(root, 'out.json'), 'utf-8'));
  assert.equal(seenByChild.hl, '1', 'child claude must run with NUBOS_PILOT_HEADLESS=1');
  assert.equal(seenByChild.depth, '1', 'child claude must run at hook depth 1');
});

// While the test itself holds the np-test-critic lock, a spawn for that agent must be refused.
test('SH-GUARD-4 refuses to spawn while a live lock for the same agent is held (concurrency guard)', () => {
  const root = _mkRoot();
  fs.writeFileSync(path.join(root, 'p.md'), 'audit', 'utf-8');
  const fakeClaude = _mockClaude(root, 'mock.sh', '#!/bin/sh\ncat > /dev/null\necho "{}"\n');
  _setEnv('NUBOS_PILOT_CLAUDE_BIN', fakeClaude);
  const heldLock = headlessGuard.tryAcquireSpawnLock(root, 'np-test-critic');
  assert.equal(heldLock.acquired, true);
  const capture = _cap();
  const argv = ['--agent', 'np-test-critic', '--prompt-path', 'p.md', '--output-path', 'out.json'];
  try {
    assert.throws(
      () => spawnHeadless.run(argv, { cwd: root, stdout: capture.stub }),
      (err) => err && err.code === 'spawn-headless-locked',
    );
  } finally {
    heldLock.release();
  }
});

// Two back-to-back spawns must both succeed and leave no lock file behind.
test('SH-GUARD-5 lock is released after a successful spawn (re-spawnable)', () => {
  const root = _mkRoot();
  fs.writeFileSync(path.join(root, 'p.md'), 'audit', 'utf-8');
  const fakeClaude = _mockClaude(root, 'mock.sh', '#!/bin/sh\ncat > /dev/null\necho "{}"\n');
  _setEnv('NUBOS_PILOT_CLAUDE_BIN', fakeClaude);
  const capture = _cap();
  for (const attempt of [0, 1]) {
    const argv = ['--agent', 'np-test-critic', '--prompt-path', 'p.md', '--output-path', 'out' + attempt + '.json'];
    const exitCode = spawnHeadless.run(argv, { cwd: root, stdout: capture.stub });
    assert.equal(exitCode, 0, 'sequential spawns must each acquire and release the lock');
  }
  const lockFile = headlessGuard._lockPath(root, 'np-test-critic');
  assert.equal(fs.existsSync(lockFile), false, 'no lock residue after spawns');
});

// The lock is keyed per agent: holding np-test-critic must not stop np-other-critic.
test('SH-GUARD-6 a held lock for one agent does NOT block a different agent (per-agent scope)', () => {
  const root = _mkRoot();
  const otherAgentFile = path.join(root, '.nubos-pilot', 'agents', 'np-other-critic.md');
  fs.writeFileSync(otherAgentFile, '---\nname: np-other-critic\n---\n\n# Role\n', 'utf-8');
  fs.writeFileSync(path.join(root, 'p.md'), 'audit', 'utf-8');
  const fakeClaude = _mockClaude(root, 'mock.sh', '#!/bin/sh\ncat > /dev/null\necho "{}"\n');
  _setEnv('NUBOS_PILOT_CLAUDE_BIN', fakeClaude);
  const heldLock = headlessGuard.tryAcquireSpawnLock(root, 'np-test-critic');
  assert.equal(heldLock.acquired, true);
  const capture = _cap();
  const argv = ['--agent', 'np-other-critic', '--prompt-path', 'p.md', '--output-path', 'out.json'];
  try {
    const exitCode = spawnHeadless.run(argv, { cwd: root, stdout: capture.stub });
    assert.equal(exitCode, 0, 'a different agent must spawn while np-test-critic is locked');
  } finally {
    heldLock.release();
  }
});

// Pointing at a nonexistent binary makes the spawn fail; the lock file must still be gone afterwards.
test('SH-GUARD-7 lock is released even when the spawn errors (claude-not-found)', () => {
  const root = _mkRoot();
  fs.writeFileSync(path.join(root, 'p.md'), 'audit', 'utf-8');
  _setEnv('NUBOS_PILOT_CLAUDE_BIN', path.join(root, 'no-such-binary'));
  const capture = _cap();
  const argv = ['--agent', 'np-test-critic', '--prompt-path', 'p.md', '--output-path', 'out.json'];
  const attempt = () => spawnHeadless.run(argv, { cwd: root, stdout: capture.stub });

  assert.throws(attempt, (err) => err && err.code === 'spawn-headless-claude-not-found');

  const lockFile = headlessGuard._lockPath(root, 'np-test-critic');
  assert.equal(fs.existsSync(lockFile), false, 'the per-agent lock must not leak when the spawn fails');
});
|
|
625
|
+
|
|
491
626
|
test('SH-ENV-4 NUBOS_PILOT_/CLAUDE_/ANTHROPIC_ prefixed vars pass through (whitelisted prefix)', () => {
|
|
492
627
|
const parent = {
|
|
493
628
|
PATH: '/usr/bin',
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { summarize, describe } = require('../../lib/eval-reliability.cjs');
|
|
4
|
+
const { emitErrorEnvelope } = require('./_args.cjs');
|
|
5
|
+
|
|
6
|
+
/**
 * Help text for the `verify-reliability` subcommand.
 *
 * @returns {string} synopsis, a blank line, then the explanation; no trailing newline.
 */
function _usage() {
  const synopsis = [
    'Usage:',
    ' np-tools.cjs verify-reliability --codes <c1,c2,...>',
  ];
  const explanation = [
    'pass@k reliability: the orchestrator runs a task\'s <verify> command k times',
    'and passes the collected exit codes (0 = pass). Emits a JSON summary whose',
    '`aggregate_exit_code` is 0 only when every run passed (pass^k) — feed it to',
    '`loop-run-round --phase post-executor --verify-exit-code`. A flaky task',
    'aggregates to red and flows through the normal build-fixer path.',
  ];
  return [...synopsis, '', ...explanation].join('\n');
}
|
|
18
|
+
|
|
19
|
+
/**
 * CLI entry point for `verify-reliability`.
 *
 * Accepts `--codes <c1,c2,...>` or `--codes=<c1,c2,...>`, summarizes the exit
 * codes and writes one JSON line (the summary plus a `description`) to stdout.
 * `-h` / `--help` prints the usage text instead.
 *
 * The return value reports whether the command itself worked, not whether the
 * verify runs passed: a flaky or failing set of codes still returns 0, and the
 * verdict is carried in the JSON (`aggregate_exit_code`).
 *
 * @param {string[]} argv arguments after the subcommand name
 * @param {{stdout?: {write: Function}, stderr?: {write: Function}}} [ctx]
 *   stream overrides; defaults to the process streams
 * @returns {number} 0 on success or help, 1 after writing an error envelope to stderr
 */
function run(argv, ctx) {
  const context = ctx || {};
  const stdout = context.stdout || process.stdout;
  const stderr = context.stderr || process.stderr;
  const args = Array.isArray(argv) ? argv.slice() : [];

  let codesRaw = null;
  for (let i = 0; i < args.length; i++) {
    const a = args[i];
    if (a === '-h' || a === '--help') {
      stdout.write(_usage() + '\n');
      return 0;
    }
    if (a === '--codes') {
      // A trailing `--codes` with no value becomes '' and is reported as "no runs" below.
      codesRaw = args[++i] || '';
    } else if (typeof a === 'string' && a.startsWith('--codes=')) {
      codesRaw = a.slice('--codes='.length);
    } else {
      // Non-string argv entries land here too instead of crashing on `.startsWith`.
      stderr.write(JSON.stringify({
        code: 'verify-reliability-unknown-arg',
        message: 'Unknown argument: ' + a,
        details: { arg: a },
      }) + '\n');
      return 1;
    }
  }

  if (codesRaw == null) {
    stderr.write(JSON.stringify({
      code: 'verify-reliability-missing-codes',
      message: '--codes <c1,c2,...> is required',
      details: {},
    }) + '\n');
    return 1;
  }

  try {
    // Only plain decimal integers count as exit codes. Bare Number() would turn
    // tokens such as '0x0', '0e1' or '0.0' into 0 — a pass. Anything else is
    // mapped to NaN so summarize() rejects it through its integer check.
    const decimalInteger = /^[+-]?\d+$/;
    const codes = String(codesRaw)
      .split(',')
      .map((token) => token.trim())
      // Empty entries are dropped rather than coerced (Number('') is 0).
      .filter((token) => token !== '')
      .map((token) => (decimalInteger.test(token) ? Number(token) : NaN));
    const summary = summarize(codes);
    stdout.write(JSON.stringify(Object.assign({}, summary, { description: describe(summary) })) + '\n');
    return 0;
  } catch (err) {
    emitErrorEnvelope(err, stderr, 'verify-reliability-internal-error');
    return 1;
  }
}
|
|
60
|
+
|
|
61
|
+
module.exports = { run };
|
|
62
|
+
|
|
63
|
+
if (require.main === module) {
|
|
64
|
+
process.exit(run(process.argv.slice(2)));
|
|
65
|
+
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { test } = require('node:test');
|
|
4
|
+
const assert = require('node:assert');
|
|
5
|
+
const { run } = require('./verify-reliability.cjs');
|
|
6
|
+
|
|
7
|
+
/**
 * Builds in-memory stdout/stderr stand-ins for driving `run` in tests.
 *
 * @returns {{stdout: {write: Function}, stderr: {write: Function}, out: {text: string}, err: {text: string}}}
 *   `out.text` / `err.text` accumulate everything written to the matching stream.
 */
function _capture() {
  // A sink is a text buffer plus a write() that appends to it and reports success.
  const makeSink = () => {
    const buffer = { text: '' };
    const stream = {
      write: (chunk) => {
        buffer.text += chunk;
        return true;
      },
    };
    return { buffer, stream };
  };

  const stdoutSink = makeSink();
  const stderrSink = makeSink();
  return {
    stdout: stdoutSink.stream,
    stderr: stderrSink.stream,
    out: stdoutSink.buffer,
    err: stderrSink.buffer,
  };
}
|
|
16
|
+
|
|
17
|
+
// Three passing runs: the command succeeds and the summary is green.
test('VR-1: all-pass codes → aggregate 0, reliable-pass', () => {
  const io = _capture();
  assert.strictEqual(run(['--codes', '0,0,0'], io), 0);
  const { aggregate_exit_code, verdict } = JSON.parse(io.out.text);
  assert.strictEqual(aggregate_exit_code, 0);
  assert.strictEqual(verdict, 'reliable-pass');
});

// Mixed results: the command still exits 0, the redness is carried in the JSON.
test('VR-2: flaky codes → aggregate 1, flaky verdict + loud description', () => {
  const io = _capture();
  assert.strictEqual(run(['--codes', '0,1,0'], io), 0);
  const { aggregate_exit_code, flaky, description } = JSON.parse(io.out.text);
  assert.strictEqual(aggregate_exit_code, 1);
  assert.strictEqual(flaky, true);
  assert.match(description, /FLAKY/);
});

// The `--codes=<list>` spelling is parsed the same way as `--codes <list>`.
test('VR-3: --codes= form supported', () => {
  const io = _capture();
  assert.strictEqual(run(['--codes=1,1'], io), 0);
  const { verdict } = JSON.parse(io.out.text);
  assert.strictEqual(verdict, 'reliable-fail');
});

// No arguments at all: --codes is mandatory.
test('VR-4: missing --codes → error envelope, exit 1', () => {
  const io = _capture();
  assert.strictEqual(run([], io), 1);
  assert.match(io.err.text, /verify-reliability-missing-codes/);
});

// Anything other than help or --codes is rejected.
test('VR-5: unknown arg → error envelope, exit 1', () => {
  const io = _capture();
  assert.strictEqual(run(['--bogus'], io), 1);
  assert.match(io.err.text, /verify-reliability-unknown-arg/);
});

// An empty list reaches summarize(), whose no-runs error is reported on stderr.
test('VR-6: empty codes → internal error envelope, exit 1', () => {
  const io = _capture();
  assert.strictEqual(run(['--codes', ''], io), 1);
  assert.match(io.err.text, /eval-reliability-no-runs/);
});

// Help goes to stdout and is a success.
test('VR-7: --help → usage exit 0', () => {
  const io = _capture();
  assert.strictEqual(run(['--help'], io), 0);
  assert.match(io.out.text, /verify-reliability/);
});
|
package/lib/agents.test.cjs
CHANGED
|
@@ -247,6 +247,7 @@ const NP_AGENTS = [
|
|
|
247
247
|
{ file: 'np-nyquist-auditor', expected_tier: 'haiku' },
|
|
248
248
|
{ file: 'np-sc-extractor', expected_tier: 'haiku' },
|
|
249
249
|
{ file: 'np-critic', expected_tier: 'sonnet' },
|
|
250
|
+
{ file: 'np-learnings-extractor', expected_tier: 'haiku' },
|
|
250
251
|
];
|
|
251
252
|
|
|
252
253
|
// Audit-surface modules — files in agents/ that carry agent-shaped frontmatter
|
package/lib/config-defaults.cjs
CHANGED
|
@@ -9,6 +9,7 @@ const DEFAULT_WORKFLOW = Object.freeze({
|
|
|
9
9
|
commit_docs: true,
|
|
10
10
|
commit_artifacts: true,
|
|
11
11
|
worktree_isolation: false,
|
|
12
|
+
tier_routing: false,
|
|
12
13
|
research_tools: DEFAULT_RESEARCH_TOOLS,
|
|
13
14
|
});
|
|
14
15
|
|
|
@@ -21,6 +22,7 @@ const DEFAULT_AGENTS = Object.freeze({
|
|
|
21
22
|
|
|
22
23
|
const DEFAULT_LOOP = Object.freeze({
|
|
23
24
|
maxRounds: 3,
|
|
25
|
+
verify_runs: 1,
|
|
24
26
|
});
|
|
25
27
|
|
|
26
28
|
const DEFAULT_SWARM_RESEARCH = Object.freeze({
|
|
@@ -58,6 +60,14 @@ const DEFAULT_CONFORMANCE = Object.freeze({
|
|
|
58
60
|
inject_criteria: true,
|
|
59
61
|
});
|
|
60
62
|
|
|
63
|
+
const DEFAULT_LEARNINGS = Object.freeze({
|
|
64
|
+
auto_capture: true,
|
|
65
|
+
max_captures_per_hour: 10,
|
|
66
|
+
max_in_a_row: 3,
|
|
67
|
+
timeout_ms: 120000,
|
|
68
|
+
max_files: 30,
|
|
69
|
+
});
|
|
70
|
+
|
|
61
71
|
const DEFAULT_AUTO_LOG_LEARNING = true;
|
|
62
72
|
|
|
63
73
|
const DEFAULT_SPAWN_HEADLESS = Object.freeze({
|
|
@@ -86,6 +96,7 @@ const DEFAULT_CONFIG_TREE = Object.freeze({
|
|
|
86
96
|
spawn: DEFAULT_SPAWN,
|
|
87
97
|
security: DEFAULT_SECURITY,
|
|
88
98
|
conformance: DEFAULT_CONFORMANCE,
|
|
99
|
+
learnings: DEFAULT_LEARNINGS,
|
|
89
100
|
auto_log_learning: DEFAULT_AUTO_LOG_LEARNING,
|
|
90
101
|
});
|
|
91
102
|
|
|
@@ -119,6 +130,7 @@ function buildInstallConfig(answers) {
|
|
|
119
130
|
},
|
|
120
131
|
security: { ...DEFAULT_SECURITY },
|
|
121
132
|
conformance: { ...DEFAULT_CONFORMANCE },
|
|
133
|
+
learnings: { ...DEFAULT_LEARNINGS },
|
|
122
134
|
auto_log_learning: DEFAULT_AUTO_LOG_LEARNING,
|
|
123
135
|
};
|
|
124
136
|
}
|
|
@@ -135,6 +147,7 @@ module.exports = {
|
|
|
135
147
|
DEFAULT_SPAWN_HEADLESS,
|
|
136
148
|
DEFAULT_SECURITY,
|
|
137
149
|
DEFAULT_CONFORMANCE,
|
|
150
|
+
DEFAULT_LEARNINGS,
|
|
138
151
|
DEFAULT_AUTO_LOG_LEARNING,
|
|
139
152
|
DEFAULT_MODEL_PROFILE,
|
|
140
153
|
DEFAULT_SCOPE,
|
package/lib/config-schema.cjs
CHANGED
|
@@ -21,6 +21,7 @@ const SCHEMA = Object.freeze({
|
|
|
21
21
|
worktree_isolation: { type: 'boolean', optional: true },
|
|
22
22
|
research_tools: { type: 'object', shape: 'any', optional: true },
|
|
23
23
|
text_mode: { type: 'boolean', optional: true },
|
|
24
|
+
tier_routing: { type: 'boolean', optional: true },
|
|
24
25
|
},
|
|
25
26
|
},
|
|
26
27
|
agents: {
|
|
@@ -34,6 +35,7 @@ const SCHEMA = Object.freeze({
|
|
|
34
35
|
loop: {
|
|
35
36
|
type: 'object', optional: true, shape: {
|
|
36
37
|
maxRounds: { type: 'number', optional: true },
|
|
38
|
+
verify_runs: { type: 'number', optional: true },
|
|
37
39
|
},
|
|
38
40
|
},
|
|
39
41
|
swarm: {
|
|
@@ -86,6 +88,15 @@ const SCHEMA = Object.freeze({
|
|
|
86
88
|
inject_criteria: { type: 'boolean', optional: true },
|
|
87
89
|
},
|
|
88
90
|
},
|
|
91
|
+
learnings: {
|
|
92
|
+
type: 'object', optional: true, shape: {
|
|
93
|
+
auto_capture: { type: 'boolean', optional: true },
|
|
94
|
+
max_captures_per_hour: { type: 'number', optional: true },
|
|
95
|
+
max_in_a_row: { type: 'number', optional: true },
|
|
96
|
+
timeout_ms: { type: 'number', optional: true },
|
|
97
|
+
max_files: { type: 'number', optional: true },
|
|
98
|
+
},
|
|
99
|
+
},
|
|
89
100
|
});
|
|
90
101
|
|
|
91
102
|
function _typeOf(v) {
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// pass@k reliability: the orchestrator runs a task's <verify> command k times
|
|
4
|
+
// and feeds the collected exit codes here. A task that passes only sometimes is
|
|
5
|
+
// FLAKY — not green. summarize() folds k runs into a single aggregate exit code
|
|
6
|
+
// (0 only when every run passed — pass^k semantics) so flakiness flows through
|
|
7
|
+
// the EXISTING verify-red → build-fixer path. No new critic category is
|
|
8
|
+
// introduced (that would risk the unknown-category spurious-stuck trap).
|
|
9
|
+
|
|
10
|
+
const { NubosPilotError } = require('./core.cjs');
|
|
11
|
+
|
|
12
|
+
/**
 * Folds the exit codes of k verify runs into one reliability summary.
 *
 * A code is accepted when it is an integer number or a string holding a plain
 * decimal integer (surrounding whitespace allowed). Everything else — `null`,
 * `''`, booleans, arrays, non-decimal strings — is rejected. Bare `Number()`
 * coercion would turn several of those into 0, i.e. a passing run, so a run
 * with no exit status could otherwise be counted as green.
 *
 * @param {Array<number|string>} exitCodes one exit code per verify run (0 = pass)
 * @returns {{runs:number, passes:number, fails:number, pass_at_1:boolean, pass_at_k:boolean, flaky:boolean, verdict:string, aggregate_exit_code:number}}
 *   `verdict` is 'reliable-pass', 'reliable-fail' or 'flaky';
 *   `aggregate_exit_code` is 0 only when every run passed, otherwise 1.
 * @throws {NubosPilotError} code 'eval-reliability-no-runs' when `exitCodes` is not a non-empty array
 * @throws {NubosPilotError} code 'eval-reliability-bad-code' when any entry is not an integer exit code
 */
function summarize(exitCodes) {
  if (!Array.isArray(exitCodes) || exitCodes.length === 0) {
    throw new NubosPilotError(
      'eval-reliability-no-runs',
      'summarize requires a non-empty array of exit codes',
      { got: exitCodes },
    );
  }

  const decimalInteger = /^[+-]?\d+$/;
  const codes = exitCodes.map((c) => {
    if (typeof c === 'number') return c;
    if (typeof c === 'string' && decimalInteger.test(c.trim())) return Number(c);
    // Not a usable exit code; NaN fails the integer check below.
    return NaN;
  });
  if (codes.some((c) => !Number.isInteger(c))) {
    throw new NubosPilotError(
      'eval-reliability-bad-code',
      'every exit code must be an integer',
      { codes: exitCodes },
    );
  }

  const runs = codes.length;
  const passes = codes.filter((c) => c === 0).length;
  const fails = runs - passes;
  const passAt1 = codes[0] === 0;
  const passAtK = passes === runs;
  const flaky = passes > 0 && fails > 0;

  let verdict;
  if (passAtK) verdict = 'reliable-pass';
  else if (passes === 0) verdict = 'reliable-fail';
  else verdict = 'flaky';

  // pass^k: green only if every run passed. Flaky and all-fail both aggregate
  // to non-zero so the loop treats them as verify-red.
  const aggregate_exit_code = passAtK ? 0 : 1;

  return { runs, passes, fails, pass_at_1: passAt1, pass_at_k: passAtK, flaky, verdict, aggregate_exit_code };
}
|
|
51
|
+
|
|
52
|
+
/**
 * Renders a summary object as a single human-readable line.
 *
 * Reads `runs`, `passes`, `verdict` and `pass_at_k` from the summary. A single
 * run is reported as plainly passed or failed; several runs are reported as
 * reliably passed, reliably failed, or — for any other verdict — FLAKY.
 *
 * @param {{runs:number, passes:number, verdict:string, pass_at_k:boolean}} s summary object
 * @returns {string} one-line description
 */
function describe(s) {
  const { runs, passes, verdict } = s;

  if (runs === 1) {
    if (s.pass_at_k) return 'verify passed (1 run)';
    return 'verify failed (1 run)';
  }

  switch (verdict) {
    case 'reliable-pass':
      return `verify reliably passed (${passes}/${runs} runs)`;
    case 'reliable-fail':
      return `verify reliably failed (0/${runs} runs passed)`;
    default:
      return `FLAKY: verify passed only ${passes}/${runs} runs — non-deterministic, treated as red. `
        + 'Make the verified behaviour deterministic (no sleeps/real clock/network/ordering) before this task can go green.';
  }
}
|
|
62
|
+
|
|
63
|
+
module.exports = { summarize, describe };
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { test } = require('node:test');
|
|
4
|
+
const assert = require('node:assert');
|
|
5
|
+
const { summarize, describe } = require('./eval-reliability.cjs');
|
|
6
|
+
|
|
7
|
+
// Every run green: nothing flaky, aggregate is 0.
test('ER-1: all pass → reliable-pass, aggregate 0', () => {
  const { verdict, pass_at_k, pass_at_1, flaky, aggregate_exit_code } = summarize([0, 0, 0]);
  assert.strictEqual(verdict, 'reliable-pass');
  assert.strictEqual(pass_at_k, true);
  assert.strictEqual(pass_at_1, true);
  assert.strictEqual(flaky, false);
  assert.strictEqual(aggregate_exit_code, 0);
});

// Every run red: consistently failing is not flaky, but it is still non-zero.
test('ER-2: all fail → reliable-fail, aggregate non-zero', () => {
  const { verdict, pass_at_k, flaky, aggregate_exit_code } = summarize([1, 1, 1]);
  assert.strictEqual(verdict, 'reliable-fail');
  assert.strictEqual(pass_at_k, false);
  assert.strictEqual(flaky, false);
  assert.strictEqual(aggregate_exit_code, 1);
});

// One red run among green ones makes the whole set flaky and red.
test('ER-3: mixed → flaky, aggregate non-zero (pass^k)', () => {
  const { verdict, flaky, pass_at_1, pass_at_k, aggregate_exit_code } = summarize([0, 1, 0]);
  assert.strictEqual(verdict, 'flaky');
  assert.strictEqual(flaky, true);
  assert.strictEqual(pass_at_1, true);
  assert.strictEqual(pass_at_k, false);
  assert.strictEqual(aggregate_exit_code, 1);
});

// Recovering on later runs does not rescue a failed first run.
test('ER-4: first-run-fail-then-pass is still flaky and red', () => {
  const { flaky, pass_at_1, aggregate_exit_code } = summarize([1, 0, 0]);
  assert.strictEqual(flaky, true);
  assert.strictEqual(pass_at_1, false);
  assert.strictEqual(aggregate_exit_code, 1);
});

// With k = 1 the aggregate is 0 for exit code 0 and 1 for any other code.
test('ER-5: single run preserves classic behaviour', () => {
  const singlePass = summarize([0]);
  const singleFail = summarize([2]);
  assert.strictEqual(singlePass.aggregate_exit_code, 0);
  assert.strictEqual(singleFail.aggregate_exit_code, 1);
  assert.strictEqual(summarize([0]).verdict, 'reliable-pass');
});

// Empty or non-array input is "no runs"; a non-integer entry is a "bad code".
test('ER-6: empty/invalid input throws', () => {
  const failsWith = (code) => (e) => e.code === code;
  assert.throws(() => summarize([]), failsWith('eval-reliability-no-runs'));
  assert.throws(() => summarize('nope'), failsWith('eval-reliability-no-runs'));
  assert.throws(() => summarize([0, 1.5]), failsWith('eval-reliability-bad-code'));
});

// The description distinguishes single-run, reliable and flaky outcomes.
test('ER-7: describe is human-readable and flags flaky loudly', () => {
  const singleRun = describe(summarize([0]));
  const reliable = describe(summarize([0, 0, 0]));
  const unstable = describe(summarize([0, 1, 0]));
  assert.match(singleRun, /passed \(1 run\)/);
  assert.match(reliable, /reliably passed/);
  assert.match(unstable, /FLAKY/);
});
|