@maintainabilityai/research-runner 0.1.29 → 0.1.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -576,6 +576,116 @@ const handleContextQuality = async (input) => {
576
576
  return { ok: true, scope: parsed.data, bars };
577
577
  };
578
578
  // ─────────────────────────────────────────────────────────────────────
579
+ // Self-review provenance skills (B29) — pure-data attempt-tracking for
580
+ // prd-agent's persona-switch self-critique loop.
581
+ //
582
+ // Why these exist (PR #112 forensic):
583
+ // The persona-switch self-critique is a prompt-level reasoning step;
584
+ // pre-B29 it emitted ZERO skill_call events. So the audit chain had
585
+ // no proof that the agent entered round N of Architect or Security
586
+ // review. On PR #112 the prd-agent hallucinated `tier=restricted` and
587
+ // skipped the loop entirely, claiming `SKIPPED_RESTRICTED_TIER` in
588
+ // the PRD frontmatter — when the OKR action's actual governanceTier
589
+ // was `supervised`. The chain showed nothing wrong because nothing
590
+ // in the chain referenced self-critique at all.
591
+ //
592
+ // These skills don't "do" the review (the LLM still does that). They
593
+ // hand the agent the AUTHORITATIVE inputs: the OKR action's frozen
594
+ // tier, the resulting max_auto_rounds, a should_proceed gate, and
595
+ // the contents of `.caterpillar/prompts/prd/<persona>-review.md`.
596
+ // Because every runSkill() auto-emits, the chain proves: "agent
597
+ // entered persona X, round N, was told tier=Y, max_rounds=Z,
598
+ // should_proceed=W." If a subsequent `### Self-review — <persona>
599
+ // (round N)` block doesn't appear in the PR body, that's a clear
600
+ // contract violation visible in the audit comment.
601
+ // ─────────────────────────────────────────────────────────────────────
602
/**
 * Input schema shared by the self-review skills: the OKR to look up, the
 * run whose `actions[]` entry is wanted, and the review round the agent is
 * about to enter (a positive integer).
 */
const SelfReviewInput = (() => {
    const { z } = zod_1;
    // Both identifiers follow the same rule: any non-empty string.
    const requiredId = () => z.string().min(1);
    return z.object({
        okrId: requiredId(),
        runId: requiredId(),
        round: z.number().int().positive(),
    });
})();
607
/**
 * Tier → MAX_AUTO_ROUNDS mapping per design §6.2. Restricted=0 means the
 * loop is skipped entirely (mandatory human gate). The agent SHOULD NOT
 * be inferring tier from any other source; this is the single source of
 * truth for the OKR run that's been frozen at dispatch time.
 *
 * Matching is case-insensitive. Anything that is not a recognised tier
 * name — including a missing or non-string value — fails closed to 0
 * rounds rather than throwing.
 *
 * @param {unknown} tier - Governance tier name, e.g. "autonomous".
 * @returns {number} 3 for autonomous, 2 for supervised, 0 otherwise.
 */
function tierMaxRounds(tier) {
    // Guard first: calling .toLowerCase() on null/undefined/number would
    // throw, and an unknown tier must map to the restrictive default.
    if (typeof tier !== 'string') {
        return 0;
    }
    switch (tier.toLowerCase()) {
        case 'autonomous':
            return 3;
        case 'supervised':
            return 2;
        default:
            return 0; // restricted / unknown
    }
}
623
/**
 * Factory: builds a self-review skill handler for one persona. Pure
 * data — reads the OKR yaml and the persona's prompt-pack file, computes
 * the tier-driven gating, and returns the bundle. No LLM, no synthesis.
 *
 * The returned handler resolves to either
 *   `{ ok: false, reason }` — bad input, OKR file missing, or no
 *   `actions[]` entry with the requested runId; or
 *   `{ ok: true, persona, tier, maxAutoRounds, round, shouldProceed,
 *      promptPack, promptPackPath, promptPackFound, auditMetadata }`.
 *
 * @param {string} persona - "architect" selects `architecture-review.md`;
 *   any other value selects `security-review.md`.
 * @returns {(input: unknown) => Promise<object>} The skill handler.
 */
function makeSelfReviewHandler(persona) {
    // Prompt-pack filename note: the persona is "architect" but the pack
    // file is "architecture-review.md" (full word). Map explicitly so we
    // don't accidentally look for "architect-review.md". Resolved once
    // per handler because it depends only on the persona.
    const promptFilename = persona === 'architect' ? 'architecture-review.md' : 'security-review.md';
    return async (input) => {
        const parsed = SelfReviewInput.safeParse(input);
        if (!parsed.success) {
            return { ok: false, reason: `bad-input: ${parsed.error.message}` };
        }
        const { okrId, runId, round } = parsed.data;
        const mesh = meshPath();
        // NOTE(review): okrId is joined into the path as-is; confirm that
        // callers cannot supply path separators or ".." segments.
        const okrPath = path.join(mesh, 'okrs', okrId, 'okr.yaml');
        if (!fs.existsSync(okrPath)) {
            return { ok: false, reason: 'okr-not-found' };
        }
        const card = readYaml(okrPath);
        // A card whose `actions` is missing or not a list, or that holds
        // null entries, is reported as "action-not-found" instead of
        // throwing inside find().
        const actions = Array.isArray(card?.actions) ? card.actions : [];
        const action = actions.find((candidate) => candidate?.runId === runId);
        if (!action) {
            return { ok: false, reason: `action-not-found: no actions[] entry with runId=${runId}` };
        }
        // A missing or non-string tier becomes '' which maps to 0 rounds,
        // so malformed YAML fails closed rather than crashing the skill.
        const rawTier = action.governanceTier;
        const tier = typeof rawTier === 'string' ? rawTier.toLowerCase() : '';
        const maxAutoRounds = tierMaxRounds(tier);
        const shouldProceed = tier !== 'restricted' && round <= maxAutoRounds;
        const promptPath = path.join(mesh, '.caterpillar', 'prompts', 'prd', promptFilename);
        let promptPack = '';
        let promptPackFound = false;
        try {
            promptPack = fs.readFileSync(promptPath, 'utf8');
            promptPackFound = true;
        }
        catch {
            // Missing or unreadable pack: leave the body empty and report
            // it through promptPackFound=false.
        }
        // The chain only needs the small fields, not the whole prompt-pack
        // body — auditMetadata controls what lands in the skill_call event.
        const auditMetadata = {
            persona,
            tier,
            max_auto_rounds: maxAutoRounds,
            round,
            should_proceed: shouldProceed,
            prompt_pack_path: promptPath,
            prompt_pack_found: promptPackFound,
        };
        return {
            ok: true,
            persona,
            tier,
            maxAutoRounds,
            round,
            shouldProceed,
            promptPack,
            promptPackPath: promptPath,
            promptPackFound,
            auditMetadata,
        };
    };
}
const handleSelfReviewArchitect = makeSelfReviewHandler('architect');
const handleSelfReviewSecurity = makeSelfReviewHandler('security');
688
+ // ─────────────────────────────────────────────────────────────────────
579
689
  // Search skills — thin wrappers over the existing search nodes
580
690
  // ─────────────────────────────────────────────────────────────────────
581
691
  const SearchQueriesInput = zod_1.z.object({
@@ -817,8 +927,21 @@ const AuditEmitInput = zod_1.z.object({
817
927
  phase: zod_1.z.enum(['why', 'how', 'what']),
818
928
  intentThreadUuid: zod_1.z.string().min(1),
819
929
  });
820
- const LOCK_RETRY_LIMIT = 3;
821
- const LOCK_RETRY_BASE_MS = 50;
930
/**
 * Audit-JSONL file-lock retry budget. Sized for parallel auto-emission:
 * the agent often fires 4 search skills concurrently, each completing in
 * ~500ms–3s. When their handlers return at similar times, all 4 try to
 * grab the JSONL lock simultaneously. Pre-B28a.v1.1 the budget was
 * `3 × 50ms linear = 300ms max` which silently dropped 3 of 4 events on
 * PR #108.
 *
 * New budget: 20 retries with exponential 2^n backoff capped at 500ms
 * each (sequence: 100, 200, 400, 500, 500, 500, …). Assuming one sleep
 * per retry with the attempt counter starting at 0, that is
 * 100 + 200 + 400 + 17 × 500 = 9200ms ≈ 9.2s total wait — comfortably
 * tolerates 4–8 parallel skill invocations while staying well under the
 * runner's overall step timeout. Total emission latency stays unchanged
 * in the happy-path single-writer case.
 */
const LOCK_RETRY_LIMIT = 20;
const LOCK_RETRY_BASE_MS = 100;
const LOCK_RETRY_MAX_MS = 500;
822
945
  /** Recursive key-sorted JSON stringify so the event hash is canonical. */
823
946
  function canonicalStringify(value) {
824
947
  if (value === null || typeof value !== 'object') {
@@ -948,7 +1071,12 @@ const handleAuditEmitEvent = async (input) => {
948
1071
  }
949
1072
  catch (err) {
950
1073
  if (err.code === 'EEXIST') {
951
- await sleep(LOCK_RETRY_BASE_MS * (attempt + 1));
1074
+ // Exponential backoff capped at LOCK_RETRY_MAX_MS. With 20
1075
+ // attempts the wait sequence is 100, 200, 400, 500, 500, … ≈
1076
+ // 9.2s total — enough headroom for 4–8 parallel auto-emissions
1077
+ // from skills firing concurrently (B28a.v1.1).
1078
+ const wait = Math.min(LOCK_RETRY_BASE_MS * (2 ** attempt), LOCK_RETRY_MAX_MS);
1079
+ await sleep(wait);
952
1080
  continue;
953
1081
  }
954
1082
  return { ok: false, reason: `audit-lock-failed: ${err.message}` };
@@ -1114,6 +1242,8 @@ exports.SKILLS = {
1114
1242
  'context-architecture': handleContextArchitecture,
1115
1243
  'context-security': handleContextSecurity,
1116
1244
  'context-quality': handleContextQuality,
1245
+ 'self-review-architect': handleSelfReviewArchitect,
1246
+ 'self-review-security': handleSelfReviewSecurity,
1117
1247
  'tavily-search': handleTavilySearch,
1118
1248
  'arxiv-search': handleArxivSearch,
1119
1249
  'uspto-search': handleUsptoSearch,
@@ -1160,10 +1290,14 @@ async function runSkill(name, input) {
1160
1290
  if (!result.ok) {
1161
1291
  payload.reason = result.reason;
1162
1292
  }
1163
- // Best-effort: an audit-write failure must not shadow the real skill
1164
- // result. The chain-verify CI gate is the catch-net for missed events.
1293
+ // Best-effort: an audit-write failure must not shadow the real
1294
+ // skill result. But we MUST surface the failure to stderr — pre-
1295
+ // B28a.v1.1 these were silently swallowed and PR #108 dropped 3
1296
+ // of 4 parallel-search events with no warning. The chain-verify
1297
+ // CI gate still catches gaps post-hoc; this stderr line catches
1298
+ // them at write time.
1165
1299
  try {
1166
- await handleAuditEmitEvent({
1300
+ const emit = await handleAuditEmitEvent({
1167
1301
  okrId: ctx.okrId,
1168
1302
  runId: ctx.runId,
1169
1303
  phase: ctx.phase,
@@ -1171,8 +1305,13 @@ async function runSkill(name, input) {
1171
1305
  eventKind: 'skill_call',
1172
1306
  payload,
1173
1307
  });
1308
+ if (!emit.ok) {
1309
+ process.stderr.write(`::warning::audit auto-emit failed for skill ${name}: ${emit.reason}\n`);
1310
+ }
1311
+ }
1312
+ catch (err) {
1313
+ process.stderr.write(`::warning::audit auto-emit threw for skill ${name}: ${err.message}\n`);
1174
1314
  }
1175
- catch { /* swallow — chain-verify catches gaps */ }
1176
1315
  }
1177
1316
  }
1178
1317
  return result;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@maintainabilityai/research-runner",
3
- "version": "0.1.29",
3
+ "version": "0.1.33",
4
4
  "description": "Research + PRD agent runner — orchestrates the Archeologist and PRD pipelines for the MaintainabilityAI governance mesh",
5
5
  "license": "MIT",
6
6
  "author": "MaintainabilityAI",