npm - @maintainabilityai/research-runner - Versions diffs - 0.1.29 → 0.1.33 - Mend

@maintainabilityai/research-runner 0.1.29 → 0.1.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/dist/runner/skills.js +146 -7
package/package.json +1 -1

package/dist/runner/skills.js CHANGED Viewed

@@ -576,6 +576,116 @@ const handleContextQuality = async (input) => {
     return { ok: true, scope: parsed.data, bars };
 };
 // ─────────────────────────────────────────────────────────────────────
+// Self-review provenance skills (B29) — pure-data attempt-tracking for
+// prd-agent's persona-switch self-critique loop.
+//
+// Why these exist (PR #112 forensic):
+// The persona-switch self-critique is a prompt-level reasoning step;
+// pre-B29 it emitted ZERO skill_call events. So the audit chain had
+// no proof that the agent entered round N of Architect or Security
+// review. On PR #112 the prd-agent hallucinated `tier=restricted` and
+// skipped the loop entirely, claiming `SKIPPED_RESTRICTED_TIER` in
+// the PRD frontmatter — when the OKR action's actual governanceTier
+// was `supervised`. The chain showed nothing wrong because nothing
+// in the chain referenced self-critique at all.
+//
+// These skills don't "do" the review (the LLM still does that). They
+// hand the agent the AUTHORITATIVE inputs: the OKR action's frozen
+// tier, the resulting max_auto_rounds, a should_proceed gate, and
+// the contents of `.caterpillar/prompts/prd/<persona>-review.md`.
+// Because every runSkill() auto-emits, the chain proves: "agent
+// entered persona X, round N, was told tier=Y, max_rounds=Z,
+// should_proceed=W." If a subsequent `### Self-review — <persona>
+// (round N)` block doesn't appear in the PR body, that's a clear
+// contract violation visible in the audit comment.
+// ─────────────────────────────────────────────────────────────────────
+const SelfReviewInput = zod_1.z.object({ // input contract validated by every handler built by makeSelfReviewHandler()
+    okrId: zod_1.z.string().min(1), // non-empty; names the directory under <mesh>/okrs/ that holds okr.yaml
+    runId: zod_1.z.string().min(1), // non-empty; matched against actions[].runId inside that okr.yaml
+    round: zod_1.z.number().int().positive(), // integer >= 1; compared against the tier's max auto rounds
+});
+/**
+ * Tier → MAX_AUTO_ROUNDS mapping per design §6.2. Restricted=0 means the
+ * loop is skipped entirely (mandatory human gate). The agent SHOULD NOT
+ * be inferring tier from any other source; this is the single source of
+ * truth for the OKR run that's been frozen at dispatch time.
+ *
+ * Fails closed: anything that is not a recognised tier string — including
+ * `undefined`, `null` or a non-string YAML scalar — yields 0 rounds rather
+ * than throwing.
+ *
+ * @param {unknown} tier - Governance tier name; matched case-insensitively.
+ * @returns {number} 3 for `autonomous`, 2 for `supervised`, 0 otherwise.
+ */
+function tierMaxRounds(tier) {
+    // A missing or mistyped tier must gate the loop off, not crash the skill.
+    if (typeof tier !== 'string') {
+        return 0;
+    }
+    switch (tier.toLowerCase()) {
+        case 'autonomous':
+            return 3;
+        case 'supervised':
+            return 2;
+        default:
+            return 0; // restricted / unknown
+    }
+}
+/**
+ * Factory: builds a self-review skill handler for one persona. Pure
+ * data — reads OKR yaml + prompt pack file, computes tier-driven gating,
+ * returns the bundle. No LLM, no synthesis.
+ *
+ * @param {'architect'|'security'} persona - Reviewer persona the handler serves.
+ * @returns {(input: unknown) => Promise<object>} Skill handler. Resolves to
+ *   `{ ok: false, reason }` when the input fails validation, the OKR file
+ *   is missing, or no action matches `runId`; otherwise to `{ ok: true, … }`
+ *   carrying the gating fields, the prompt-pack text and `auditMetadata`.
+ * @throws {Error} At factory time when `persona` has no known prompt pack.
+ */
+function makeSelfReviewHandler(persona) {
+    // Prompt-pack filename note: the persona is "architect" but the
+    // pack file is "architecture-review.md" (full word). Map explicitly
+    // so we don't accidentally look for "architect-review.md" — and so an
+    // unrecognised persona fails loudly here instead of silently being
+    // handed the security pack while the audit record names another persona.
+    const promptFilenames = new Map([
+        ['architect', 'architecture-review.md'],
+        ['security', 'security-review.md'],
+    ]);
+    const promptFilename = promptFilenames.get(persona);
+    if (promptFilename === undefined) {
+        throw new Error(`makeSelfReviewHandler: unknown persona "${persona}"`);
+    }
+    return async (input) => {
+        const parsed = SelfReviewInput.safeParse(input);
+        if (!parsed.success) {
+            return { ok: false, reason: `bad-input: ${parsed.error.message}` };
+        }
+        const { okrId, runId, round } = parsed.data;
+        const mesh = meshPath();
+        const okrPath = path.join(mesh, 'okrs', okrId, 'okr.yaml');
+        if (!fs.existsSync(okrPath)) {
+            return { ok: false, reason: 'okr-not-found' };
+        }
+        const card = readYaml(okrPath);
+        // A malformed card (non-array `actions`, null entries) is reported as
+        // action-not-found rather than surfacing as a TypeError.
+        const actions = Array.isArray(card?.actions) ? card.actions : [];
+        const action = actions.find(a => a?.runId === runId);
+        if (!action) {
+            return { ok: false, reason: `action-not-found: no actions[] entry with runId=${runId}` };
+        }
+        // Fail closed: a missing or non-string governanceTier becomes '',
+        // which tierMaxRounds() maps to 0 rounds.
+        const tier = typeof action.governanceTier === 'string'
+            ? action.governanceTier.toLowerCase()
+            : '';
+        const maxAutoRounds = tierMaxRounds(tier);
+        const shouldProceed = tier !== 'restricted' && round <= maxAutoRounds;
+        const promptPath = path.join(mesh, '.caterpillar', 'prompts', 'prd', promptFilename);
+        let promptPack = '';
+        let promptPackFound = false;
+        if (fs.existsSync(promptPath)) {
+            try {
+                promptPack = fs.readFileSync(promptPath, 'utf8');
+                promptPackFound = true;
+            }
+            catch (err) {
+                // Still best-effort (the skill result reports
+                // promptPackFound=false), but say why on stderr instead of
+                // swallowing the read error.
+                process.stderr.write(`::warning::self-review prompt pack unreadable for persona ${persona}: ${err.message}\n`);
+            }
+        }
+        // The chain only needs the small fields, not the whole prompt-pack
+        // body — auditMetadata controls what lands in the skill_call event.
+        const auditMetadata = {
+            persona,
+            tier,
+            max_auto_rounds: maxAutoRounds,
+            round,
+            should_proceed: shouldProceed,
+            prompt_pack_path: promptPath,
+            prompt_pack_found: promptPackFound,
+        };
+        return {
+            ok: true,
+            persona,
+            tier,
+            maxAutoRounds,
+            round,
+            shouldProceed,
+            promptPack,
+            promptPackPath: promptPath,
+            promptPackFound,
+            auditMetadata,
+        };
+    };
+}
+const handleSelfReviewArchitect = makeSelfReviewHandler('architect'); // registered in SKILLS as 'self-review-architect'
+const handleSelfReviewSecurity = makeSelfReviewHandler('security'); // registered in SKILLS as 'self-review-security'
+// ─────────────────────────────────────────────────────────────────────
 // Search skills — thin wrappers over the existing search nodes
 // ─────────────────────────────────────────────────────────────────────
 const SearchQueriesInput = zod_1.z.object({
@@ -817,8 +927,21 @@ const AuditEmitInput = zod_1.z.object({
     phase: zod_1.z.enum(['why', 'how', 'what']),
     intentThreadUuid: zod_1.z.string().min(1),
 });
-const LOCK_RETRY_LIMIT = 3;
-const LOCK_RETRY_BASE_MS = 50;
+/**
+ * Audit-JSONL file-lock retry budget. Sized for parallel auto-emission:
+ * the agent often fires 4 search skills concurrently, each completing in
+ * ~500ms–3s. When their handlers return at similar times, all 4 try to
+ * grab the JSONL lock simultaneously. Pre-B28a.v1.1 the budget was
+ * `3 × 50ms linear = 300ms max` which silently dropped 3 of 4 events on
+ * PR #108. New budget: 20 retries with exponential 2^n backoff capped at
+ * 500ms each (sequence: 100, 200, 400, 500, 500, 500, …), i.e.
+ * 100 + 200 + 400 + 17 × 500 = 9200ms ≈ 9.2s total wait — comfortably
+ * tolerates 4–8 parallel skill invocations while staying well under the
+ * runner's overall step timeout. Total emission latency stays unchanged
+ * in the happy-path single-writer case.
+ *
+ * LOCK_RETRY_BASE_MS is the first wait and doubles with each attempt;
+ * LOCK_RETRY_MAX_MS is the ceiling applied to every individual wait.
+ */
+const LOCK_RETRY_LIMIT = 20;
+const LOCK_RETRY_BASE_MS = 100;
+const LOCK_RETRY_MAX_MS = 500;
 /** Recursive key-sorted JSON stringify so the event hash is canonical. */
 function canonicalStringify(value) {
     if (value === null || typeof value !== 'object') {
@@ -948,7 +1071,12 @@ const handleAuditEmitEvent = async (input) => {
         }
         catch (err) {
             if (err.code === 'EEXIST') {
-                await sleep(LOCK_RETRY_BASE_MS * (attempt + 1));
+                // Exponential backoff capped at LOCK_RETRY_MAX_MS. With 20
+                // attempts the wait sequence is 100, 200, 400, 500, 500, … ≈
+                // 9.2s total — enough headroom for 4–8 parallel auto-emissions
+                // from skills firing concurrently (B28a.v1.1).
+                const wait = Math.min(LOCK_RETRY_BASE_MS * (2 ** attempt), LOCK_RETRY_MAX_MS);
+                await sleep(wait);
                 continue;
             }
             return { ok: false, reason: `audit-lock-failed: ${err.message}` };
@@ -1114,6 +1242,8 @@ exports.SKILLS = {
     'context-architecture': handleContextArchitecture,
     'context-security': handleContextSecurity,
     'context-quality': handleContextQuality,
+    'self-review-architect': handleSelfReviewArchitect,
+    'self-review-security': handleSelfReviewSecurity,
     'tavily-search': handleTavilySearch,
     'arxiv-search': handleArxivSearch,
     'uspto-search': handleUsptoSearch,
@@ -1160,10 +1290,14 @@ async function runSkill(name, input) {
             if (!result.ok) {
                 payload.reason = result.reason;
             }
-            // Best-effort: an audit-write failure must not shadow the real skill
-            // result. The chain-verify CI gate is the catch-net for missed events.
+            // Best-effort: an audit-write failure must not shadow the real
+            // skill result. But we MUST surface the failure to stderr — pre-
+            // B28a.v1.1 these were silently swallowed and PR #108 dropped 3
+            // of 4 parallel-search events with no warning. The chain-verify
+            // CI gate still catches gaps post-hoc; this stderr line catches
+            // them at write time.
             try {
-                await handleAuditEmitEvent({
+                const emit = await handleAuditEmitEvent({
                     okrId: ctx.okrId,
                     runId: ctx.runId,
                     phase: ctx.phase,
@@ -1171,8 +1305,13 @@ async function runSkill(name, input) {
                     eventKind: 'skill_call',
                     payload,
                 });
+                if (!emit.ok) {
+                    process.stderr.write(`::warning::audit auto-emit failed for skill ${name}: ${emit.reason}\n`);
+                }
+            }
+            catch (err) {
+                process.stderr.write(`::warning::audit auto-emit threw for skill ${name}: ${err.message}\n`);
             }
-            catch { /* swallow — chain-verify catches gaps */ }
         }
     }
     return result;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@maintainabilityai/research-runner",
-  "version": "0.1.29",
+  "version": "0.1.33",
   "description": "Research + PRD agent runner — orchestrates the Archeologist and PRD pipelines for the MaintainabilityAI governance mesh",
   "license": "MIT",
   "author": "MaintainabilityAI",