npm - @maintainabilityai/research-runner - Versions diffs - 0.1.44 → 0.1.46 - Mend

@maintainabilityai/research-runner 0.1.44 → 0.1.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/README.md CHANGED Viewed

@@ -77,6 +77,43 @@ LLM provider/model, token count, cost, grounding score, and audit chain hash.
 Auditors verify the artifact by re-running the chain against the recorded
 mesh sha.
+## Versioning + workflow-template pin scheme
+The mesh-deployed workflow templates pin this package with a **tilde range**:
+```
+npx -y @maintainabilityai/research-runner@~0.1.42 skill-<name>
+```
+`~0.1.42` allows patch releases (`0.1.43`, `0.1.44`, …) but not minor
+or major bumps. The reasons:
+1. **Auto-publish bumps patch on every merge.** The
+   `npm-publish-research-runner.yml` workflow runs `npm version patch`
+   when anything under `packages/research-runner/**` changes. A new
+   patch is published within minutes of merge.
+2. **Templates pinned exactly would force a follow-up edit on every
+   patch.** With `@0.1.42` (exact), every patch bump would leave the
+   templates stale until someone edited them. With `~0.1.42`, the
+   templates carry on transparently.
+3. **A minor bump is a deliberate review event.** When the runner ships
+   a contract change (new event field, new skill API shape, removed
+   field), bump `version` from `0.1.x` to `0.2.0` and update the
+   templates in the same PR. A `phaseSpec.test.ts` parity test fails
+   loudly when the templates' major.minor doesn't match `package.json`.
+**When you change anything under `packages/research-runner/**`:** you
+do NOT need to edit workflow templates. The auto-publish handles it.
+**When you ship a contract-breaking change:** bump the minor version
+in `packages/research-runner/package.json` AND update every
+`@maintainabilityai/research-runner@~0.X.Y` reference in
+`vscode-extension/code-templates/**` to match. Tests enforce this.
+The tilde range also eliminates an off-by-one risk: a developer trying
+to mentally compute "what patch will the auto-publish produce" and
+pinning to the wrong value. With tilde, the patch resolves at run-time
+from npm, and the mental math goes away.
 ## License
 MIT

package/dist/runner/skills.js CHANGED Viewed

@@ -1338,6 +1338,15 @@ const handleKnowledgeCode = async (input) => {
         // Workflow gate consumes this to validate cited paths.
         inventory_paths: inventoryPaths,
     };
+    // Bug-R / R6 (Codex round-3) — persist inventory to the clone
+    // cache so knowledge-code-read can strict-mode validate requested
+    // paths against the same list that lands in the audit chain.
+    // Without this, the agent could ask knowledge-code-read for
+    // arbitrary paths inside the clone that the chain never advertised.
+    try {
+        fs.writeFileSync(path.join(cloneTarget, '.knowledge-code-inventory.json'), JSON.stringify({ inventory_paths: inventoryPaths, sha, cachedAt: new Date().toISOString() }), 'utf8');
+    }
+    catch { /* inventory persist failure is non-fatal — read skill will fall back to cache-only check */ }
     return {
         ok: true,
         mode: 'brownfield',
@@ -1407,9 +1416,26 @@ const handleKnowledgeCodeRead = async (input) => {
     if (normalized.startsWith('..') || normalized === '..' || normalized.includes(`${path.sep}..${path.sep}`)) {
         return { ok: false, reason: `path-rejected: path-traversal segments forbidden (${filePath} -> ${normalized})` };
     }
-    // Reuse the cached clone from knowledge-code; clone fresh if missing
-    // (e.g. agent called knowledge-code-read without calling knowledge-
-    // code first — supported but slower).
+    // Bug-R / R6 (Codex round-3) — auth tightening. A prior knowledge-
+    // code call for this (runId, owner, name) MUST have populated the
+    // cache before knowledge-code-read can return content. Closes two
+    // gaps Codex flagged: (1) skill could read any public GitHub repo
+    // by URL alone, (2) audit chain didn't prove the standard
+    // brownfield-grounding pipeline ran before the file read. Test
+    // mode (KNOWLEDGE_CODE_READ_ALLOW_UNCACHED=1) bypasses this for
+    // unit tests that drive the skill directly.
+    const cacheDir = knowledgeCodeCacheDir(runId, gh.owner, gh.name);
+    const metaPath = path.join(cacheDir, '.cache-meta.json');
+    const allowUncached = process.env.KNOWLEDGE_CODE_READ_ALLOW_UNCACHED === '1';
+    if (!allowUncached && !fs.existsSync(metaPath)) {
+        return {
+            ok: false,
+            reason: `no-prior-knowledge-code: knowledge-code-read requires a prior knowledge-code call for ${gh.owner}/${gh.name} in run ${runId}. Call knowledge-code first to clone + classify the repo, then knowledge-code-read can return file contents from the cached clone.`,
+            remediation: "Call `knowledge-code` with the same repoUrl + runId before invoking knowledge-code-read. The audit chain then proves the agent went through brownfield grounding before reading files.",
+        };
+    }
+    // Reuse the cached clone from knowledge-code; clone fresh only in
+    // test mode (allowUncached).
     const cloneResult = ensureClone(runId, repoUrl, ref ?? 'HEAD', gh.owner, gh.name);
     if (!cloneResult.ok) {
         return {
@@ -1419,6 +1445,29 @@ const handleKnowledgeCodeRead = async (input) => {
             remediation: `Could not access clone for ${repoUrl}. Underlying error: ${cloneResult.error ?? 'unknown'}`,
         };
     }
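+    // Bug-R / R6 (strict mode part 2) — validate the requested path
+    // against the inventory persisted by knowledge-code. When that
+    // inventory exists, parses, and is non-empty, only paths that
+    // knowledge-code already advertised in `inventory_paths` are
+    // readable — closes the gap where the agent could ask for any
+    // file inside the clone, including files not visible in the
+    // bounded walk. NOTE: the check fails open — a missing, malformed,
+    // or empty inventory skips validation, leaving only the cache-meta
+    // gate above. Test mode bypasses (see allowUncached).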
+    if (!allowUncached) {
+        const inventoryPath = path.join(cloneResult.path, '.knowledge-code-inventory.json');
+        if (fs.existsSync(inventoryPath)) {
+            try {
+                const inv = JSON.parse(fs.readFileSync(inventoryPath, 'utf8'));
+                const allowed = new Set(inv.inventory_paths ?? []);
+                if (allowed.size > 0 && !allowed.has(normalized)) {
+                    return {
+                        ok: false,
+                        reason: `path-not-in-inventory: ${normalized} is not in the knowledge-code inventory_paths for ${gh.owner}/${gh.name}. The agent can only read files knowledge-code advertised in the chain.`,
+                        remediation: "If the file is real but missed by the bounded walk (default maxFiles=200), call knowledge-code with a higher maxFiles before retrying.",
+                    };
+                }
+            }
+            catch { /* malformed inventory; fall through (cache-only check still applied) */ }
+        }
+    }
     const absPath = path.join(cloneResult.path, normalized);
     // Final paranoia check — resolve the real path and verify it's still
     // a child of the clone root. Defends against symlink-shaped escapes
@@ -1525,6 +1574,52 @@ function detectAllQueriesFailed(envelopes, skill) {
     // pattern matching of firewall-block vs query-quality failures.
     return `all-queries-failed: ${skill} — ${firstError}`;
 }
+/**
+ * Bug-Q phase 3 (Codex audit follow-up / oracle evidence) — search
+ * audit metadata now carries a bounded preview of WHICH results came
+ * back, not just HOW MANY. Without this, a reviewer who wants to
+ * verify "S-3 cites a real arXiv paper, not a hallucinated one"
+ * has nothing in the chain to verify against — they'd have to trust
+ * the agent's research-doc citations and re-run the search.
+ *
+ * Preview shape per hit: { provider, query, title, url, snippet?,
+ *   score?, publishedDate? } where:
+ *   - snippet is truncated to ~200 chars (the ProviderResult.content
+ *     field already caps at ~500; we shorten further for chain size)
+ *   - score is rounded to 2 decimals
+ *
+ * Total preview cap: 25 hits per skill_call. Search runs typically
+ * return 10-30 results per provider before dedupe; the cap keeps the
+ * audit JSONL compact while still proving "real evidence behind every
+ * citation."
+ *
+ * buildSearchAuditMetadata(queries, results):
+ *   - queries: passed through unchanged into the returned metadata.
+ *   - results: array of hits; each hit is read for `content`,
+ *     `provider`, `fromQuery`, `title`, `url`, `score`, and
+ *     `publishedDate`. The array itself is not modified.
+ *   - returns `{ queries, result_count, results_preview }`, where
+ *     `result_count` is the full `results.length` (NOT capped) and
+ *     `results_preview` holds at most SEARCH_RESULTS_PREVIEW_CAP
+ *     entries taken from the front of `results`. Hits past the cap
+ *     are counted but not previewed.
+ */
+const SEARCH_RESULTS_PREVIEW_CAP = 25;
+const SEARCH_SNIPPET_CAP = 200;
+function buildSearchAuditMetadata(queries, results) {
+    const preview = results.slice(0, SEARCH_RESULTS_PREVIEW_CAP).map((r) => { // first N hits only, in the order given
+        const snippet = (r.content || '').replace(/\s+/g, ' ').trim(); // falsy content becomes ''; whitespace runs collapse to one space
+        const truncated = snippet.length > SEARCH_SNIPPET_CAP
+            ? snippet.slice(0, SEARCH_SNIPPET_CAP) + '…' // ellipsis marks a cut, so a truncated snippet is cap + 1 characters long
+            : snippet;
+        const entry = {
+            provider: r.provider,
+            query: r.fromQuery,
+            title: r.title,
+            url: r.url,
+        };
+        if (truncated) { // omit the key entirely when there is no snippet text
+            entry.snippet = truncated;
+        }
+        if (typeof r.score === 'number' && isFinite(r.score)) { // skips non-numeric scores as well as NaN / ±Infinity
+            entry.score = Math.round(r.score * 100) / 100; // round to 2 decimals
+        }
+        if (r.publishedDate) { // only recorded when the hit carries a truthy publishedDate
+            entry.publishedDate = r.publishedDate;
+        }
+        return entry;
+    });
+    return { queries, result_count: results.length, results_preview: preview }; // result_count is the uncapped total
+}
 const handleTavilySearch = async (input) => {
     const parsed = SearchQueriesInput.safeParse(input);
     if (!parsed.success) {
@@ -1540,7 +1635,7 @@ const handleTavilySearch = async (input) => {
             queries: parsed.data.queries,
             maxResultsPerQuery: parsed.data.maxResults,
         });
-        const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
+        const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
         const failure = detectAllQueriesFailed(res.envelopes, 'tavily-search');
         if (failure) {
             return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
@@ -1561,7 +1656,7 @@ const handleArxivSearch = async (input) => {
             queries: parsed.data.queries,
             maxResultsPerQuery: parsed.data.maxResults,
         });
-        const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
+        const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
         const failure = detectAllQueriesFailed(res.envelopes, 'arxiv-search');
         if (failure) {
             return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
@@ -1587,7 +1682,7 @@ const handleUsptoSearch = async (input) => {
             queries: parsed.data.queries,
             maxResultsPerQuery: parsed.data.maxResults,
         });
-        const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
+        const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
         const failure = detectAllQueriesFailed(res.envelopes, 'uspto-search');
         if (failure) {
             return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
@@ -1608,7 +1703,7 @@ const handleHackerNewsSearch = async (input) => {
             queries: parsed.data.queries,
             hitsPerQuery: parsed.data.maxResults,
         });
-        const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
+        const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
         const failure = detectAllQueriesFailed(res.envelopes, 'hackernews-search');
         if (failure) {
             return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@maintainabilityai/research-runner",
-  "version": "0.1.44",
+  "version": "0.1.46",
   "description": "Research + PRD agent runner — orchestrates the Archeologist and PRD pipelines for the MaintainabilityAI governance mesh",
   "license": "MIT",
   "author": "MaintainabilityAI",