dual-brain 3.7.1 → 3.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/hooks/test-orchestrator.mjs +312 -0
- package/package.json +1 -1
|
@@ -10,8 +10,10 @@
|
|
|
10
10
|
|
|
11
11
|
import { execSync, spawnSync } from 'child_process';
|
|
12
12
|
import {
|
|
13
|
+
appendFileSync,
|
|
13
14
|
existsSync,
|
|
14
15
|
readFileSync,
|
|
16
|
+
unlinkSync,
|
|
15
17
|
writeFileSync,
|
|
16
18
|
} from 'fs';
|
|
17
19
|
import { dirname, resolve } from 'path';
|
|
@@ -337,6 +339,316 @@ test('failure-detector: ignores followed=false', () => {
|
|
|
337
339
|
return true;
|
|
338
340
|
});
|
|
339
341
|
|
|
342
|
+
// ─── Test 17: enforce-tier: malformed stdin ─────────────────────────────────
|
|
343
|
+
test('enforce-tier: malformed stdin', () => {
  // Feed the enforce-tier hook input that is not JSON. The hook is expected
  // to exit with status 0 and still produce output that `run` could parse
  // (`parsed` is presumably the hook's stdout decoded as JSON — confirm in `run`).
  const outcome = run(ENFORCE_TIER, 'this is not json at all {{{');
  if (outcome.status !== 0) {
    return `non-zero exit: ${outcome.status}`;
  }
  return outcome.parsed ? true : 'no valid JSON output';
});
|
|
349
|
+
|
|
350
|
+
// ─── Test 18: enforce-tier: missing tool_input ──────────────────────────────
|
|
351
|
+
test('enforce-tier: missing tool_input', () => {
  // An Agent invocation that carries no `tool_input` key at all. The hook
  // must tolerate the gap: exit 0 and emit parseable output.
  const stdin = JSON.stringify({ tool_name: 'Agent' });
  const outcome = run(ENFORCE_TIER, stdin);
  if (outcome.status !== 0) {
    return `non-zero exit: ${outcome.status}`;
  }
  return outcome.parsed ? true : 'no valid JSON output';
});
|
|
358
|
+
|
|
359
|
+
// ─── Test 19: enforce-tier: non-Agent tool passthrough ──────────────────────
|
|
360
|
+
test('enforce-tier: non-Agent tool passthrough', () => {
  // A tool other than Agent ("Read") should pass through untouched: the hook
  // must exit 0 and answer with an empty JSON object.
  const stdin = JSON.stringify({ tool_name: 'Read', tool_input: { file_path: '/foo' } });
  const outcome = run(ENFORCE_TIER, stdin);
  if (outcome.status !== 0) {
    return `non-zero exit: ${outcome.status}`;
  }
  if (!outcome.parsed) {
    return 'no valid JSON output';
  }
  const keyCount = Object.keys(outcome.parsed).length;
  return keyCount === 0 ? true : `expected {}, got: ${JSON.stringify(outcome.parsed)}`;
});
|
|
369
|
+
|
|
370
|
+
// ─── Test 20: cost-logger: malformed stdin ──────────────────────────────────
|
|
371
|
+
test('cost-logger: malformed stdin', () => {
  // The cost-logger hook receives non-JSON input; it is expected to exit 0
  // and still emit output that `runStream` could parse.
  const outcome = runStream(COST_LOGGER, 'not json garbage >>>');
  if (outcome.status !== 0) {
    return `non-zero exit: ${outcome.status}`;
  }
  return outcome.parsed ? true : 'no valid JSON output';
});
|
|
377
|
+
|
|
378
|
+
// ─── Test 21: cost-logger: missing fields ───────────────────────────────────
|
|
379
|
+
test('cost-logger: missing fields', () => {
  // Verifies that an empty payload ("{}") still makes the cost-logger hook
  // exit 0, emit parseable output, and append a line to the usage log.
  //
  // The usage log is snapshotted up front and restored in `finally`, so the
  // line written by this test is removed on EVERY exit path — including the
  // failure returns below, which previously skipped cleanup and left the
  // test entry behind in the log.
  const snapshot = existsSync(USAGE_JSONL) ? readFileSync(USAGE_JSONL, 'utf8') : null;
  const linesBefore = snapshot === null ? 0 : snapshot.split('\n').filter(Boolean).length;

  try {
    const { parsed, status } = runStream(COST_LOGGER, '{}');
    if (status !== 0) return `non-zero exit: ${status}`;
    if (!parsed) return 'no valid JSON output';

    if (!existsSync(USAGE_JSONL)) return 'daily usage log was not created';
    const lines = readFileSync(USAGE_JSONL, 'utf8').split('\n').filter(Boolean);
    if (lines.length <= linesBefore) return 'no new line was appended to daily usage log';

    return true;
  } finally {
    // Best-effort restore: a cleanup problem must not mask the test verdict.
    try {
      if (snapshot !== null) {
        writeFileSync(USAGE_JSONL, snapshot, 'utf8');
      } else if (existsSync(USAGE_JSONL)) {
        // The log did not exist before the test, so remove what the hook created.
        unlinkSync(USAGE_JSONL);
      }
    } catch {}
  }
});
|
|
401
|
+
|
|
402
|
+
// ─── Test 22: cost-logger: error status recorded ────────────────────────────
|
|
403
|
+
test('cost-logger: error status recorded', () => {
  // Sends an Agent payload carrying an `error` field and checks that the
  // newest line of the usage log is a JSON record with status "error".
  //
  // The usage log is snapshotted up front and restored in `finally`, so the
  // entry written by this test is removed on EVERY exit path — previously a
  // failing assertion returned before the cleanup and left the entry behind.
  const snapshot = existsSync(USAGE_JSONL) ? readFileSync(USAGE_JSONL, 'utf8') : null;
  const linesBefore = snapshot === null ? 0 : snapshot.split('\n').filter(Boolean).length;

  try {
    const payload = JSON.stringify({
      tool_name: 'Agent',
      tool_input: { prompt: 'test' },
      error: 'something failed',
    });
    const { parsed, status } = runStream(COST_LOGGER, payload);
    if (status !== 0) return `non-zero exit: ${status}`;
    if (!parsed) return 'no valid JSON output';

    if (!existsSync(USAGE_JSONL)) return 'daily usage log was not created';
    const lines = readFileSync(USAGE_JSONL, 'utf8').split('\n').filter(Boolean);
    if (lines.length <= linesBefore) return 'no new line was appended to daily usage log';

    // The record under test is whatever the hook appended last.
    const lastLine = lines[lines.length - 1];
    let entry;
    try {
      entry = JSON.parse(lastLine);
    } catch {
      return `last line not valid JSON: ${lastLine}`;
    }
    if (entry.status !== 'error') return `expected status "error", got: "${entry.status}"`;

    return true;
  } finally {
    // Best-effort restore: a cleanup problem must not mask the test verdict.
    try {
      if (snapshot !== null) {
        writeFileSync(USAGE_JSONL, snapshot, 'utf8');
      } else if (existsSync(USAGE_JSONL)) {
        // The log did not exist before the test, so remove what the hook created.
        unlinkSync(USAGE_JSONL);
      }
    } catch {}
  }
});
|
|
435
|
+
|
|
436
|
+
// ─── Test 23: enforce-tier: cost-saver demotes think ────────────────────────
|
|
437
|
+
test('enforce-tier: cost-saver demotes think', () => {
  // Temporarily switches the active profile to "cost-saver", runs the
  // enforce-tier hook on an execute-style prompt with model "opus", and puts
  // the profile file back (or deletes it if it did not exist) afterwards.
  //
  // Per the original author's note, cost-saver's demote_think=true demotes
  // think→execute when the text lacks think words, so the tier stays execute.
  // NOTE(review): only the exit status and the presence of parseable output
  // are asserted here — the demotion itself is not verified.
  const profilePath = resolve(__dirname, '..', 'dual-brain.profile.json');

  let savedProfile = null;
  try {
    savedProfile = readFileSync(profilePath, 'utf8');
  } catch {
    savedProfile = null; // no profile file to preserve
  }

  try {
    writeFileSync(profilePath, JSON.stringify({ active: 'cost-saver' }));

    // "edit the README file" — execute-like text, no think words
    const stdin = JSON.stringify({
      tool_name: 'Agent',
      tool_input: { prompt: 'edit the README file', model: 'opus' },
    });
    const outcome = run(ENFORCE_TIER, stdin);
    if (outcome.status !== 0) {
      return `non-zero exit: ${outcome.status}`;
    }
    return outcome.parsed ? true : 'no valid JSON output';
  } finally {
    if (savedProfile === null) {
      try {
        unlinkSync(profilePath);
      } catch {}
    } else {
      writeFileSync(profilePath, savedProfile);
    }
  }
});
|
|
459
|
+
|
|
460
|
+
// ─── Test 24: enforce-tier: quality-first promotes execute ──────────────────
|
|
461
|
+
test('enforce-tier: quality-first promotes execute', () => {
  // Temporarily switches the active profile to "quality-first", runs the
  // enforce-tier hook on a think-style prompt with model "sonnet", and expects
  // a `systemMessage` that mentions "think" (case-insensitive). The profile
  // file is restored, or removed if it did not exist, when the test ends.
  //
  // Per the original author's note, quality-first's promote_execute=true
  // promotes to think when the text matches think words.
  const profilePath = resolve(__dirname, '..', 'dual-brain.profile.json');

  let savedProfile = null;
  try {
    savedProfile = readFileSync(profilePath, 'utf8');
  } catch {
    savedProfile = null; // no profile file to preserve
  }

  try {
    writeFileSync(profilePath, JSON.stringify({ active: 'quality-first' }));

    const stdin = JSON.stringify({
      tool_name: 'Agent',
      tool_input: { prompt: 'review architecture and plan the migration', model: 'sonnet' },
    });
    const outcome = run(ENFORCE_TIER, stdin);
    const reply = outcome.parsed;

    if (outcome.status !== 0) {
      return `non-zero exit: ${outcome.status}`;
    }
    if (!reply) {
      return 'no valid JSON output';
    }
    if (!reply.systemMessage) {
      return `expected systemMessage, got: ${JSON.stringify(reply)}`;
    }
    const mentionsThink = reply.systemMessage.toLowerCase().includes('think');
    return mentionsThink ? true : `expected "think" in systemMessage, got: ${reply.systemMessage}`;
  } finally {
    if (savedProfile === null) {
      try {
        unlinkSync(profilePath);
      } catch {}
    } else {
      writeFileSync(profilePath, savedProfile);
    }
  }
});
|
|
485
|
+
|
|
486
|
+
// ─── Test 25: enforce-tier: auto profile with high-risk file ────────────────
|
|
487
|
+
test('enforce-tier: auto profile with high-risk file', () => {
  // Runs the enforce-tier hook under the "auto" profile with a description
  // that names an auth/credentials path, and expects a `systemMessage`
  // mentioning "think" or "dual-brain" (case-insensitive).
  //
  // The profile file that existed before the test is captured and put back
  // in `finally` (or the file is removed if there was none), matching the
  // other profile-switching tests. Previously the captured content was never
  // used and the file was unconditionally overwritten with {active:'auto'},
  // which clobbered whatever profile the user had configured.
  const profileFile = resolve(__dirname, '..', 'dual-brain.profile.json');
  let originalProfile;
  try {
    originalProfile = readFileSync(profileFile, 'utf8');
  } catch {
    originalProfile = null; // no profile file to preserve
  }

  try {
    writeFileSync(profileFile, JSON.stringify({ active: 'auto' }));
    // Description with auth/credentials path → per the original author's note,
    // the risk classifier detects critical risk and promotes to think.
    const payload = JSON.stringify({
      tool_name: 'Agent',
      tool_input: { description: 'update src/auth/credentials.mjs', prompt: 'change the token logic', model: 'sonnet' },
    });
    const { parsed, status } = run(ENFORCE_TIER, payload);
    if (status !== 0) return `non-zero exit: ${status}`;
    if (!parsed) return 'no valid JSON output';
    if (!parsed.systemMessage) return `expected systemMessage, got: ${JSON.stringify(parsed)}`;
    const msg = parsed.systemMessage.toLowerCase();
    if (!msg.includes('think') && !msg.includes('dual-brain'))
      return `expected "think" or "dual-brain" in systemMessage, got: ${parsed.systemMessage}`;
    return true;
  } finally {
    // Leave the profile exactly as it was found.
    if (originalProfile !== null) {
      writeFileSync(profileFile, originalProfile);
    } else {
      try {
        unlinkSync(profileFile);
      } catch {}
    }
  }
});
|
|
511
|
+
|
|
512
|
+
// ─── Test 26: adaptive: recordFailure writes to ledger ─────────────────────
|
|
513
|
+
test('adaptive: recordFailure writes to ledger', () => {
  // Calls `recordFailure` from failure-detector.mjs in a child Node process
  // (cwd = HOOKS) and checks that the last line of decision-ledger.jsonl is a
  // JSON record with the expected prompt_hash and success=false.
  // The ledger's previous content is put back afterwards; if there was no
  // ledger, it is left as an empty file.
  const ledgerPath = resolve(HOOKS, 'decision-ledger.jsonl');
  const savedLedger = existsSync(ledgerPath) ? readFileSync(ledgerPath, 'utf8') : null;

  try {
    const script = `
import { recordFailure } from './failure-detector.mjs';
recordFailure('testhash123', 'execute', 'test_error');
`;
    const nodeArgs = ['--input-type=module', '-e', script];
    const child = spawnSync(process.execPath, nodeArgs, {
      encoding: 'utf8',
      timeout: 5000,
      cwd: HOOKS,
    });

    if (child.status !== 0) {
      return `recordFailure script failed: ${child.stderr}`;
    }
    if (!existsSync(ledgerPath)) {
      return 'ledger file not created';
    }

    const records = readFileSync(ledgerPath, 'utf8').split('\n').filter(Boolean);
    const tail = records[records.length - 1];
    let record;
    try {
      record = JSON.parse(tail);
    } catch {
      return `last line not valid JSON: ${tail}`;
    }
    if (record.prompt_hash !== 'testhash123') {
      return `expected prompt_hash=testhash123, got: ${record.prompt_hash}`;
    }
    if (record.success !== false) {
      return `expected success=false, got: ${record.success}`;
    }
    return true;
  } finally {
    if (savedLedger === null) {
      try {
        writeFileSync(ledgerPath, '', 'utf8');
      } catch {}
    } else {
      writeFileSync(ledgerPath, savedLedger, 'utf8');
    }
  }
});
|
|
542
|
+
|
|
543
|
+
// ─── Test 27: adaptive: checkFailureLoop detects 2+ failures ───────────────
|
|
544
|
+
test('adaptive: checkFailureLoop detects 2+ failures', () => {
  // Seeds the decision ledger with two fresh failure records for a unique
  // prompt hash, then runs `checkFailureLoop` from failure-detector.mjs in a
  // child Node process (cwd = HOOKS) and expects it to report a loop with
  // count >= 2 and one of the two known suggestions.
  // The ledger's previous content is put back afterwards; if there was no
  // ledger, it is left as an empty file.
  const LEDGER = resolve(HOOKS, 'decision-ledger.jsonl');
  const backup = existsSync(LEDGER) ? readFileSync(LEDGER, 'utf8') : null;

  try {
    const hash = 'looptest_' + Date.now();
    const now = new Date().toISOString();
    const failEntry = JSON.stringify({
      type: 'failure', timestamp: now, prompt_hash: hash,
      tier: 'execute', reason: 'test', success: false,
    });

    // Existing ledger content may not end with a newline. Without a separator
    // the first seeded record would be glued onto the ledger's last line,
    // corrupting both records and leaving only one countable failure.
    const existing = backup || '';
    const separator = existing === '' || existing.endsWith('\n') ? '' : '\n';
    const content = existing + separator + failEntry + '\n' + failEntry + '\n';
    writeFileSync(LEDGER, content, 'utf8');

    const script = `
import { checkFailureLoop } from './failure-detector.mjs';
const result = checkFailureLoop('${hash}');
process.stdout.write(JSON.stringify(result));
`;
    const proc = spawnSync(process.execPath, [
      '--input-type=module',
      '-e', script,
    ], { encoding: 'utf8', timeout: 5000, cwd: HOOKS });

    if (proc.status !== 0) return `checkFailureLoop script failed: ${proc.stderr}`;
    let result;
    try {
      result = JSON.parse(proc.stdout.trim());
    } catch {
      return `output not JSON: ${proc.stdout}`;
    }
    if (!result.isLoop) return `expected isLoop=true, got: ${JSON.stringify(result)}`;
    if (result.count < 2) return `expected count>=2, got: ${result.count}`;
    if (result.suggestion !== 'promote_tier' && result.suggestion !== 'escalate_to_dual_brain')
      return `unexpected suggestion: ${result.suggestion}`;
    return true;
  } finally {
    if (backup !== null) writeFileSync(LEDGER, backup, 'utf8');
    else try { writeFileSync(LEDGER, '', 'utf8'); } catch {}
  }
});
|
|
581
|
+
|
|
582
|
+
// ─── Test 28: adaptive: checkFailureLoop ignores old failures ──────────────
|
|
583
|
+
test('adaptive: checkFailureLoop ignores old failures', () => {
  // Replaces the decision ledger with two failure records timestamped three
  // hours in the past, then runs `checkFailureLoop` in a child Node process
  // (cwd = HOOKS) and expects it NOT to report a loop.
  // The ledger's previous content is put back afterwards; if there was no
  // ledger, it is left as an empty file.
  const ledgerPath = resolve(HOOKS, 'decision-ledger.jsonl');
  const savedLedger = existsSync(ledgerPath) ? readFileSync(ledgerPath, 'utf8') : null;

  try {
    const hash = 'oldtest_' + Date.now();
    const threeHoursMs = 3 * 60 * 60 * 1000;
    const staleTimestamp = new Date(Date.now() - threeHoursMs).toISOString();
    const staleRecord = JSON.stringify({
      type: 'failure',
      timestamp: staleTimestamp,
      prompt_hash: hash,
      tier: 'execute',
      reason: 'old_test',
      success: false,
    });
    writeFileSync(ledgerPath, staleRecord + '\n' + staleRecord + '\n', 'utf8');

    const script = `
import { checkFailureLoop } from './failure-detector.mjs';
const result = checkFailureLoop('${hash}');
process.stdout.write(JSON.stringify(result));
`;
    const nodeArgs = ['--input-type=module', '-e', script];
    const child = spawnSync(process.execPath, nodeArgs, {
      encoding: 'utf8',
      timeout: 5000,
      cwd: HOOKS,
    });

    if (child.status !== 0) {
      return `checkFailureLoop script failed: ${child.stderr}`;
    }
    let verdict;
    try {
      verdict = JSON.parse(child.stdout.trim());
    } catch {
      return `output not JSON: ${child.stdout}`;
    }
    return verdict.isLoop
      ? `expected isLoop=false for old failures, got: ${JSON.stringify(verdict)}`
      : true;
  } finally {
    if (savedLedger === null) {
      try {
        writeFileSync(ledgerPath, '', 'utf8');
      } catch {}
    } else {
      writeFileSync(ledgerPath, savedLedger, 'utf8');
    }
  }
});
|
|
616
|
+
|
|
617
|
+
// ─── Test 29: adaptive: cost-logger records Agent errors ───────────────────
|
|
618
|
+
test('adaptive: cost-logger records Agent errors', () => {
  // Sends the cost-logger hook an Agent payload carrying an `error` field and
  // checks that the decision ledger gained a line whose newest record has
  // success=false and type "failure".
  // The ledger's previous content is put back afterwards; if there was no
  // ledger, it is left as an empty file.
  const ledgerPath = resolve(HOOKS, 'decision-ledger.jsonl');
  const savedLedger = existsSync(ledgerPath) ? readFileSync(ledgerPath, 'utf8') : null;

  try {
    // Count the non-empty ledger lines present before the hook runs.
    const countBefore = existsSync(ledgerPath)
      ? readFileSync(ledgerPath, 'utf8').split('\n').filter(Boolean).length
      : 0;

    const stdin = JSON.stringify({
      tool_name: 'Agent',
      tool_input: { prompt: 'failing task hash test' },
      error: 'test failure',
    });
    const outcome = runStream(COST_LOGGER, stdin);
    if (outcome.status !== 0) {
      return `non-zero exit: ${outcome.status}`;
    }

    if (!existsSync(ledgerPath)) {
      return 'ledger file not created';
    }
    const records = readFileSync(ledgerPath, 'utf8').split('\n').filter(Boolean);
    if (records.length <= countBefore) {
      return 'no new failure entry appended to ledger';
    }

    const tail = records[records.length - 1];
    let record;
    try {
      record = JSON.parse(tail);
    } catch {
      return `last line not valid JSON: ${tail}`;
    }
    if (record.success !== false) {
      return `expected success=false, got: ${record.success}`;
    }
    if (record.type !== 'failure') {
      return `expected type=failure, got: ${record.type}`;
    }
    return true;
  } finally {
    if (savedLedger === null) {
      try {
        writeFileSync(ledgerPath, '', 'utf8');
      } catch {}
    } else {
      writeFileSync(ledgerPath, savedLedger, 'utf8');
    }
  }
});
|
|
651
|
+
|
|
340
652
|
// ─── Summary ─────────────────────────────────────────────────────────────────
|
|
341
653
|
const total = passed + failed;
|
|
342
654
|
console.log(`\n${passed}/${total} tests passed`);
|
package/package.json
CHANGED