@maintainabilityai/research-runner 0.1.29 → 0.1.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/runner/skills.js +146 -7
- package/package.json +1 -1
package/dist/runner/skills.js
CHANGED
|
@@ -576,6 +576,116 @@ const handleContextQuality = async (input) => {
|
|
|
576
576
|
return { ok: true, scope: parsed.data, bars };
|
|
577
577
|
};
|
|
578
578
|
// ─────────────────────────────────────────────────────────────────────
|
|
579
|
+
// Self-review provenance skills (B29) — pure-data attempt-tracking for
|
|
580
|
+
// prd-agent's persona-switch self-critique loop.
|
|
581
|
+
//
|
|
582
|
+
// Why these exist (PR #112 forensic):
|
|
583
|
+
// The persona-switch self-critique is a prompt-level reasoning step;
|
|
584
|
+
// pre-B29 it emitted ZERO skill_call events. So the audit chain had
|
|
585
|
+
// no proof that the agent entered round N of Architect or Security
|
|
586
|
+
// review. On PR #112 the prd-agent hallucinated `tier=restricted` and
|
|
587
|
+
// skipped the loop entirely, claiming `SKIPPED_RESTRICTED_TIER` in
|
|
588
|
+
// the PRD frontmatter — when the OKR action's actual governanceTier
|
|
589
|
+
// was `supervised`. The chain showed nothing wrong because nothing
|
|
590
|
+
// in the chain referenced self-critique at all.
|
|
591
|
+
//
|
|
592
|
+
// These skills don't "do" the review (the LLM still does that). They
|
|
593
|
+
// hand the agent the AUTHORITATIVE inputs: the OKR action's frozen
|
|
594
|
+
// tier, the resulting max_auto_rounds, a should_proceed gate, and
|
|
595
|
+
// the contents of `.caterpillar/prompts/prd/<persona>-review.md`.
|
|
596
|
+
// Because every runSkill() auto-emits, the chain proves: "agent
|
|
597
|
+
// entered persona X, round N, was told tier=Y, max_rounds=Z,
|
|
598
|
+
// should_proceed=W." If a subsequent `### Self-review — <persona>
|
|
599
|
+
// (round N)` block doesn't appear in the PR body, that's a clear
|
|
600
|
+
// contract violation visible in the audit comment.
|
|
601
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
602
|
+
const SelfReviewInput = zod_1.z.object({ // input schema validated by every self-review skill handler
|
|
603
|
+
okrId: zod_1.z.string().min(1), // non-empty; names the directory under <mesh>/okrs that holds okr.yaml
|
|
604
|
+
runId: zod_1.z.string().min(1), // non-empty; matched against actions[].runId in that OKR yaml
|
|
605
|
+
round: zod_1.z.number().int().positive(), // positive integer; compared against the tier's max auto rounds
|
|
606
|
+
});
|
|
607
|
+
/**
 * Tier → MAX_AUTO_ROUNDS mapping per design §6.2. Restricted=0 means the
 * loop is skipped entirely (mandatory human gate). The agent SHOULD NOT
 * be inferring tier from any other source; this is the single source of
 * truth for the OKR run that's been frozen at dispatch time.
 *
 * Matching is case-insensitive. Anything that is not a recognised tier
 * name — including a missing or non-string value — fails closed to 0
 * rounds instead of throwing.
 *
 * @param {unknown} tier - Governance tier name, e.g. 'autonomous' or 'supervised'.
 * @returns {number} 3 for autonomous, 2 for supervised, 0 for everything else.
 */
function tierMaxRounds(tier) {
    // A missing or malformed tier must never raise; treat it like "unknown".
    if (typeof tier !== 'string') {
        return 0;
    }
    switch (tier.toLowerCase()) {
        case 'autonomous':
            return 3;
        case 'supervised':
            return 2;
        default:
            return 0; // restricted / unknown
    }
}
|
|
623
|
+
/**
 * Factory: builds a self-review skill handler for one persona. Pure
 * data — reads OKR yaml + prompt pack file, computes tier-driven gating,
 * returns the bundle. No LLM, no synthesis.
 *
 * @param {string} persona - 'architect' selects `architecture-review.md`;
 *   any other value selects `security-review.md`.
 * @returns {(input: unknown) => Promise<object>} Async handler. It resolves to
 *   `{ ok: false, reason }` when the input is invalid, the OKR file is
 *   missing, or no action matches `runId`; otherwise to
 *   `{ ok: true, persona, tier, maxAutoRounds, round, shouldProceed,
 *   promptPack, promptPackPath, promptPackFound, auditMetadata }`.
 */
function makeSelfReviewHandler(persona) {
    // Prompt-pack filename note: the persona is "architect" but the
    // pack file is "architecture-review.md" (full word). Map explicitly
    // so we don't accidentally look for "architect-review.md".
    const promptFilename = persona === 'architect' ? 'architecture-review.md' : 'security-review.md';
    return async (input) => {
        const parsed = SelfReviewInput.safeParse(input);
        if (!parsed.success) {
            return { ok: false, reason: `bad-input: ${parsed.error.message}` };
        }
        const { okrId, runId, round } = parsed.data;
        const mesh = meshPath();
        // okrId is supplied by the agent: refuse ids whose path segments
        // would resolve outside <mesh>/okrs (e.g. "../../etc").
        const okrsRoot = path.resolve(mesh, 'okrs');
        const relative = path.relative(okrsRoot, path.resolve(okrsRoot, okrId));
        if (relative === '..' || relative.startsWith(`..${path.sep}`) || path.isAbsolute(relative)) {
            return { ok: false, reason: 'bad-input: okrId resolves outside the okrs directory' };
        }
        const okrPath = path.join(mesh, 'okrs', okrId, 'okr.yaml');
        if (!fs.existsSync(okrPath)) {
            return { ok: false, reason: 'okr-not-found' };
        }
        const card = readYaml(okrPath);
        // Tolerate a card whose `actions` is absent, not a list, or holds
        // null entries — all of those simply mean "no matching action".
        const actions = Array.isArray(card?.actions) ? card.actions : [];
        const action = actions.find((a) => a?.runId === runId);
        if (!action) {
            return { ok: false, reason: `action-not-found: no actions[] entry with runId=${runId}` };
        }
        // String() keeps a non-string YAML scalar from throwing; such a
        // value is not a known tier and therefore maps to 0 rounds.
        const tier = String(action.governanceTier ?? '').toLowerCase();
        const maxAutoRounds = tierMaxRounds(tier);
        const shouldProceed = tier !== 'restricted' && round <= maxAutoRounds;
        const promptPath = path.join(mesh, '.caterpillar', 'prompts', 'prd', promptFilename);
        let promptPack = '';
        let promptPackFound = false;
        try {
            promptPack = fs.readFileSync(promptPath, 'utf8');
            promptPackFound = true;
        }
        catch {
            // Missing or unreadable pack: leave it empty. The caller and the
            // audit event see this through promptPackFound === false.
        }
        // The chain only needs the small fields, not the whole prompt-pack
        // body — auditMetadata controls what lands in the skill_call event.
        const auditMetadata = {
            persona,
            tier,
            max_auto_rounds: maxAutoRounds,
            round,
            should_proceed: shouldProceed,
            prompt_pack_path: promptPath,
            prompt_pack_found: promptPackFound,
        };
        return {
            ok: true,
            persona,
            tier,
            maxAutoRounds,
            round,
            shouldProceed,
            promptPack,
            promptPackPath: promptPath,
            promptPackFound,
            auditMetadata,
        };
    };
}
|
|
686
|
+
const handleSelfReviewArchitect = makeSelfReviewHandler('architect'); // registered in SKILLS as 'self-review-architect'
|
|
687
|
+
const handleSelfReviewSecurity = makeSelfReviewHandler('security'); // registered in SKILLS as 'self-review-security'
|
|
688
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
579
689
|
// Search skills — thin wrappers over the existing search nodes
|
|
580
690
|
// ─────────────────────────────────────────────────────────────────────
|
|
581
691
|
const SearchQueriesInput = zod_1.z.object({
|
|
@@ -817,8 +927,21 @@ const AuditEmitInput = zod_1.z.object({
|
|
|
817
927
|
phase: zod_1.z.enum(['why', 'how', 'what']),
|
|
818
928
|
intentThreadUuid: zod_1.z.string().min(1),
|
|
819
929
|
});
|
|
820
|
-
|
|
821
|
-
|
|
930
|
+
/**
|
|
931
|
+
* Audit-JSONL file-lock retry budget. Sized for parallel auto-emission:
|
|
932
|
+
* the agent often fires 4 search skills concurrently, each completing in
|
|
933
|
+
* ~500ms–3s. When their handlers return at similar times, all 4 try to
|
|
934
|
+
* grab the JSONL lock simultaneously. Pre-B28a.v1.1 the budget was
|
|
935
|
+
* `3 × 50ms linear = 300ms max` which silently dropped 3 of 4 events on
|
|
936
|
+
* PR #108. New budget: 20 retries with exponential 2^n backoff capped at
|
|
937
|
+
* 500ms each (min(100 × 2^attempt, 500) ms per retry) — at most 20 × 500ms = 10s total
|
|
938
|
+
* wait — comfortably tolerates 4–8 parallel skill invocations while
|
|
939
|
+
* staying well under the runner's overall step timeout. Total emission
|
|
940
|
+
* latency stays unchanged in the happy-path single-writer case.
|
|
941
|
+
*/
|
|
942
|
+
const LOCK_RETRY_LIMIT = 20; // retry count of the lock budget; presumably bounds the lock-acquire loop (loop not visible here — confirm)
|
|
943
|
+
const LOCK_RETRY_BASE_MS = 100; // base of the exponential wait: BASE * 2^attempt
|
|
944
|
+
const LOCK_RETRY_MAX_MS = 500; // upper bound applied to each individual wait
|
|
822
945
|
/** Recursive key-sorted JSON stringify so the event hash is canonical. */
|
|
823
946
|
function canonicalStringify(value) {
|
|
824
947
|
if (value === null || typeof value !== 'object') {
|
|
@@ -948,7 +1071,12 @@ const handleAuditEmitEvent = async (input) => {
|
|
|
948
1071
|
}
|
|
949
1072
|
catch (err) {
|
|
950
1073
|
if (err.code === 'EEXIST') {
|
|
951
|
-
|
|
1074
|
+
// Exponential backoff capped at LOCK_RETRY_MAX_MS. With 20
|
|
1075
|
+
// attempts the wait sequence is 100, 200, 400, 500, 500, … ≈
|
|
1076
|
+
// 9.6s total — enough headroom for 4–8 parallel auto-emissions
|
|
1077
|
+
// from skills firing concurrently (B28a.v1.1).
|
|
1078
|
+
const wait = Math.min(LOCK_RETRY_BASE_MS * (2 ** attempt), LOCK_RETRY_MAX_MS);
|
|
1079
|
+
await sleep(wait);
|
|
952
1080
|
continue;
|
|
953
1081
|
}
|
|
954
1082
|
return { ok: false, reason: `audit-lock-failed: ${err.message}` };
|
|
@@ -1114,6 +1242,8 @@ exports.SKILLS = {
|
|
|
1114
1242
|
'context-architecture': handleContextArchitecture,
|
|
1115
1243
|
'context-security': handleContextSecurity,
|
|
1116
1244
|
'context-quality': handleContextQuality,
|
|
1245
|
+
'self-review-architect': handleSelfReviewArchitect,
|
|
1246
|
+
'self-review-security': handleSelfReviewSecurity,
|
|
1117
1247
|
'tavily-search': handleTavilySearch,
|
|
1118
1248
|
'arxiv-search': handleArxivSearch,
|
|
1119
1249
|
'uspto-search': handleUsptoSearch,
|
|
@@ -1160,10 +1290,14 @@ async function runSkill(name, input) {
|
|
|
1160
1290
|
if (!result.ok) {
|
|
1161
1291
|
payload.reason = result.reason;
|
|
1162
1292
|
}
|
|
1163
|
-
// Best-effort: an audit-write failure must not shadow the real
|
|
1164
|
-
// result.
|
|
1293
|
+
// Best-effort: an audit-write failure must not shadow the real
|
|
1294
|
+
// skill result. But we MUST surface the failure to stderr — pre-
|
|
1295
|
+
// B28a.v1.1 these were silently swallowed and PR #108 dropped 3
|
|
1296
|
+
// of 4 parallel-search events with no warning. The chain-verify
|
|
1297
|
+
// CI gate still catches gaps post-hoc; this stderr line catches
|
|
1298
|
+
// them at write time.
|
|
1165
1299
|
try {
|
|
1166
|
-
await handleAuditEmitEvent({
|
|
1300
|
+
const emit = await handleAuditEmitEvent({
|
|
1167
1301
|
okrId: ctx.okrId,
|
|
1168
1302
|
runId: ctx.runId,
|
|
1169
1303
|
phase: ctx.phase,
|
|
@@ -1171,8 +1305,13 @@ async function runSkill(name, input) {
|
|
|
1171
1305
|
eventKind: 'skill_call',
|
|
1172
1306
|
payload,
|
|
1173
1307
|
});
|
|
1308
|
+
if (!emit.ok) {
|
|
1309
|
+
process.stderr.write(`::warning::audit auto-emit failed for skill ${name}: ${emit.reason}\n`);
|
|
1310
|
+
}
|
|
1311
|
+
}
|
|
1312
|
+
catch (err) {
|
|
1313
|
+
process.stderr.write(`::warning::audit auto-emit threw for skill ${name}: ${err.message}\n`);
|
|
1174
1314
|
}
|
|
1175
|
-
catch { /* swallow — chain-verify catches gaps */ }
|
|
1176
1315
|
}
|
|
1177
1316
|
}
|
|
1178
1317
|
return result;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@maintainabilityai/research-runner",
|
|
3
|
-
"version": "0.1.29",
|
|
3
|
+
"version": "0.1.33",
|
|
4
4
|
"description": "Research + PRD agent runner — orchestrates the Archeologist and PRD pipelines for the MaintainabilityAI governance mesh",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "MaintainabilityAI",
|