@maintainabilityai/research-runner 0.1.44 → 0.1.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -77,6 +77,43 @@ LLM provider/model, token count, cost, grounding score, and audit chain hash.
77
77
  Auditors verify the artifact by re-running the chain against the recorded
78
78
  mesh sha.
79
79
 
80
+ ## Versioning + workflow-template pin scheme
81
+
82
+ The mesh-deployed workflow templates pin this package with a **tilde range**:
83
+
84
+ ```
85
+ npx -y @maintainabilityai/research-runner@~0.1.42 skill-<name>
86
+ ```
87
+
88
+ `~0.1.42` allows patch releases (`0.1.43`, `0.1.44`, …) but not minor
89
+ or major bumps. The reasons:
90
+
91
+ 1. **Auto-publish bumps patch on every merge.** The
92
+ `npm-publish-research-runner.yml` workflow runs `npm version patch`
93
+ when anything under `packages/research-runner/**` changes. A new
94
+ patch is published within minutes of merge.
95
+ 2. **Templates pinned exactly would force a follow-up edit on every
96
+ patch.** With `@0.1.42` (exact), every patch bump would leave the
97
+ templates stale until someone edited them. With `~0.1.42`, the
98
+ templates carry on transparently.
99
+ 3. **A minor bump is a deliberate review event.** When the runner ships
100
+ a contract change (new event field, new skill API shape, removed
101
+ field), bump `version` from `0.1.x` to `0.2.0` and update the
102
+ templates in the same PR. A `phaseSpec.test.ts` parity test fails
103
+ loudly when the templates' major.minor doesn't match `package.json`.
104
+
105
+ **When you change anything under `packages/research-runner/**`:** you
106
+ do NOT need to edit workflow templates. Auto-publish ships the new patch and the tilde range picks it up.
107
+ **When you ship a contract-breaking change:** bump the minor version
108
+ in `packages/research-runner/package.json` AND update every
109
+ `@maintainabilityai/research-runner@~0.X.Y` reference in
110
+ `vscode-extension/code-templates/**` to match. Tests enforce this.
111
+
112
+ The tilde range eliminates an off-by-one risk: a developer trying to
113
+ mentally compute "what patch will the auto-publish produce" and pinning
114
+ to the wrong value. With tilde, the patch resolves at run-time from
115
+ npm, and the mental math goes away.
116
+
80
117
  ## License
81
118
 
82
119
  MIT
@@ -1338,6 +1338,15 @@ const handleKnowledgeCode = async (input) => {
1338
1338
  // Workflow gate consumes this to validate cited paths.
1339
1339
  inventory_paths: inventoryPaths,
1340
1340
  };
1341
+ // Bug-R / R6 (Codex round-3) — persist inventory to the clone
1342
+ // cache so knowledge-code-read can strict-mode validate requested
1343
+ // paths against the same list that lands in the audit chain.
1344
+ // Without this, the agent could ask knowledge-code-read for
1345
+ // arbitrary paths inside the clone that the chain never advertised.
1346
+ try {
1347
+ fs.writeFileSync(path.join(cloneTarget, '.knowledge-code-inventory.json'), JSON.stringify({ inventory_paths: inventoryPaths, sha, cachedAt: new Date().toISOString() }), 'utf8');
1348
+ }
1349
+ catch { /* inventory persist failure is non-fatal — read skill will fall back to cache-only check */ }
1341
1350
  return {
1342
1351
  ok: true,
1343
1352
  mode: 'brownfield',
@@ -1407,9 +1416,26 @@ const handleKnowledgeCodeRead = async (input) => {
1407
1416
  if (normalized.startsWith('..') || normalized === '..' || normalized.includes(`${path.sep}..${path.sep}`)) {
1408
1417
  return { ok: false, reason: `path-rejected: path-traversal segments forbidden (${filePath} -> ${normalized})` };
1409
1418
  }
1410
- // Reuse the cached clone from knowledge-code; clone fresh if missing
1411
- // (e.g. agent called knowledge-code-read without calling knowledge-
1412
- // code first supported but slower).
1419
+ // Bug-R / R6 (Codex round-3) auth tightening. A prior knowledge-
1420
+ // code call for this (runId, owner, name) MUST have populated the
1421
+ // cache before knowledge-code-read can return content. Closes two
1422
+ // gaps Codex flagged: (1) skill could read any public GitHub repo
1423
+ // by URL alone, (2) audit chain didn't prove the standard
1424
+ // brownfield-grounding pipeline ran before the file read. Test
1425
+ // mode (KNOWLEDGE_CODE_READ_ALLOW_UNCACHED=1) bypasses this for
1426
+ // unit tests that drive the skill directly.
1427
+ const cacheDir = knowledgeCodeCacheDir(runId, gh.owner, gh.name);
1428
+ const metaPath = path.join(cacheDir, '.cache-meta.json');
1429
+ const allowUncached = process.env.KNOWLEDGE_CODE_READ_ALLOW_UNCACHED === '1';
1430
+ if (!allowUncached && !fs.existsSync(metaPath)) {
1431
+ return {
1432
+ ok: false,
1433
+ reason: `no-prior-knowledge-code: knowledge-code-read requires a prior knowledge-code call for ${gh.owner}/${gh.name} in run ${runId}. Call knowledge-code first to clone + classify the repo, then knowledge-code-read can return file contents from the cached clone.`,
1434
+ remediation: "Call `knowledge-code` with the same repoUrl + runId before invoking knowledge-code-read. The audit chain then proves the agent went through brownfield grounding before reading files.",
1435
+ };
1436
+ }
1437
+ // Reuse the cached clone from knowledge-code; clone fresh only in
1438
+ // test mode (allowUncached).
1413
1439
  const cloneResult = ensureClone(runId, repoUrl, ref ?? 'HEAD', gh.owner, gh.name);
1414
1440
  if (!cloneResult.ok) {
1415
1441
  return {
@@ -1419,6 +1445,29 @@ const handleKnowledgeCodeRead = async (input) => {
1419
1445
  remediation: `Could not access clone for ${repoUrl}. Underlying error: ${cloneResult.error ?? 'unknown'}`,
1420
1446
  };
1421
1447
  }
1448
+ // Bug-R / R6 (strict mode part 2) — validate the requested path
1449
+ // against the inventory persisted by knowledge-code. Only paths
1450
+ // that knowledge-code already advertised in `inventory_paths` are
1451
+ // readable — closes the gap where the agent could ask for any
1452
+ // file inside the clone, including files not visible in the
1453
+ // bounded walk. Test mode bypasses (see allowUncached).
1454
+ if (!allowUncached) {
1455
+ const inventoryPath = path.join(cloneResult.path, '.knowledge-code-inventory.json');
1456
+ if (fs.existsSync(inventoryPath)) {
1457
+ try {
1458
+ const inv = JSON.parse(fs.readFileSync(inventoryPath, 'utf8'));
1459
+ const allowed = new Set(inv.inventory_paths ?? []);
1460
+ if (allowed.size > 0 && !allowed.has(normalized)) {
1461
+ return {
1462
+ ok: false,
1463
+ reason: `path-not-in-inventory: ${normalized} is not in the knowledge-code inventory_paths for ${gh.owner}/${gh.name}. The agent can only read files knowledge-code advertised in the chain.`,
1464
+ remediation: "If the file is real but missed by the bounded walk (default maxFiles=200), call knowledge-code with a higher maxFiles before retrying.",
1465
+ };
1466
+ }
1467
+ }
1468
+ catch { /* malformed inventory; fall through (cache-only check still applied) */ }
1469
+ }
1470
+ }
1422
1471
  const absPath = path.join(cloneResult.path, normalized);
1423
1472
  // Final paranoia check — resolve the real path and verify it's still
1424
1473
  // a child of the clone root. Defends against symlink-shaped escapes
@@ -1525,6 +1574,52 @@ function detectAllQueriesFailed(envelopes, skill) {
1525
1574
  // pattern matching of firewall-block vs query-quality failures.
1526
1575
  return `all-queries-failed: ${skill} — ${firstError}`;
1527
1576
  }
/**
 * Bug-Q phase 3 (Codex audit follow-up / oracle evidence) — search
 * audit metadata carries a bounded preview of WHICH results came back,
 * not just HOW MANY. Without this, a reviewer who wants to verify
 * "S-3 cites a real arXiv paper, not a hallucinated one" has nothing in
 * the chain to verify against — they'd have to trust the agent's
 * research-doc citations and re-run the search.
 *
 * Preview shape per hit: { provider, query, title, url, snippet?,
 * score?, publishedDate? } where:
 *   - snippet is whitespace-collapsed and truncated to
 *     SEARCH_SNIPPET_CAP UTF-16 units plus an ellipsis (per the original
 *     note, ProviderResult.content is already capped upstream at ~500;
 *     we shorten further for chain size). Omitted when empty.
 *   - score is rounded to 2 decimals; omitted unless a finite number.
 *   - publishedDate is copied through when truthy.
 *
 * Total preview cap: SEARCH_RESULTS_PREVIEW_CAP hits per skill_call, so
 * the audit JSONL stays compact while still proving "real evidence
 * behind every citation."
 */
const SEARCH_RESULTS_PREVIEW_CAP = 25;
const SEARCH_SNIPPET_CAP = 200;

/**
 * Collapse whitespace in a result's content and cap its length.
 *
 * @param {unknown} content - Raw provider content; anything that is not
 *   a string is treated as "no snippet" instead of throwing.
 * @returns {string} The normalized snippet, or '' when there is none.
 */
function truncateSearchSnippet(content) {
    if (typeof content !== 'string') {
        return '';
    }
    const collapsed = content.replace(/\s+/g, ' ').trim();
    if (collapsed.length <= SEARCH_SNIPPET_CAP) {
        return collapsed;
    }
    let cut = SEARCH_SNIPPET_CAP;
    const lastUnit = collapsed.charCodeAt(cut - 1);
    // Never end on a high surrogate: slicing through an astral character
    // (emoji, some CJK) would leave a lone surrogate in the audit record.
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        cut -= 1;
    }
    return `${collapsed.slice(0, cut)}…`;
}

/**
 * Build the audit metadata attached to a search skill call.
 *
 * @param {string[]} queries - Queries that were issued; passed through as-is.
 * @param {Array<object>} results - Provider results; each is read for
 *   provider, fromQuery, title, url, content, score and publishedDate.
 * @returns {{queries: string[], result_count: number, results_preview: object[]}}
 *   result_count is the full result total; results_preview holds at most
 *   SEARCH_RESULTS_PREVIEW_CAP entries, in the original order.
 */
function buildSearchAuditMetadata(queries, results) {
    const preview = results.slice(0, SEARCH_RESULTS_PREVIEW_CAP).map((r) => {
        const entry = {
            provider: r.provider,
            query: r.fromQuery,
            title: r.title,
            url: r.url,
        };
        const snippet = truncateSearchSnippet(r.content);
        if (snippet) {
            entry.snippet = snippet;
        }
        // Number.isFinite does not coerce, so only real finite numbers pass.
        if (Number.isFinite(r.score)) {
            entry.score = Math.round(r.score * 100) / 100;
        }
        if (r.publishedDate) {
            entry.publishedDate = r.publishedDate;
        }
        return entry;
    });
    return { queries, result_count: results.length, results_preview: preview };
}
1528
1623
  const handleTavilySearch = async (input) => {
1529
1624
  const parsed = SearchQueriesInput.safeParse(input);
1530
1625
  if (!parsed.success) {
@@ -1540,7 +1635,7 @@ const handleTavilySearch = async (input) => {
1540
1635
  queries: parsed.data.queries,
1541
1636
  maxResultsPerQuery: parsed.data.maxResults,
1542
1637
  });
1543
- const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
1638
+ const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
1544
1639
  const failure = detectAllQueriesFailed(res.envelopes, 'tavily-search');
1545
1640
  if (failure) {
1546
1641
  return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
@@ -1561,7 +1656,7 @@ const handleArxivSearch = async (input) => {
1561
1656
  queries: parsed.data.queries,
1562
1657
  maxResultsPerQuery: parsed.data.maxResults,
1563
1658
  });
1564
- const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
1659
+ const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
1565
1660
  const failure = detectAllQueriesFailed(res.envelopes, 'arxiv-search');
1566
1661
  if (failure) {
1567
1662
  return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
@@ -1587,7 +1682,7 @@ const handleUsptoSearch = async (input) => {
1587
1682
  queries: parsed.data.queries,
1588
1683
  maxResultsPerQuery: parsed.data.maxResults,
1589
1684
  });
1590
- const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
1685
+ const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
1591
1686
  const failure = detectAllQueriesFailed(res.envelopes, 'uspto-search');
1592
1687
  if (failure) {
1593
1688
  return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
@@ -1608,7 +1703,7 @@ const handleHackerNewsSearch = async (input) => {
1608
1703
  queries: parsed.data.queries,
1609
1704
  hitsPerQuery: parsed.data.maxResults,
1610
1705
  });
1611
- const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
1706
+ const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
1612
1707
  const failure = detectAllQueriesFailed(res.envelopes, 'hackernews-search');
1613
1708
  if (failure) {
1614
1709
  return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@maintainabilityai/research-runner",
3
- "version": "0.1.44",
3
+ "version": "0.1.46",
4
4
  "description": "Research + PRD agent runner — orchestrates the Archeologist and PRD pipelines for the MaintainabilityAI governance mesh",
5
5
  "license": "MIT",
6
6
  "author": "MaintainabilityAI",