@maintainabilityai/research-runner 0.1.44 → 0.1.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -77,6 +77,43 @@ LLM provider/model, token count, cost, grounding score, and audit chain hash.
77
77
  Auditors verify the artifact by re-running the chain against the recorded
78
78
  mesh sha.
79
79
 
80
+ ## Versioning + workflow-template pin scheme
81
+
82
+ The mesh-deployed workflow templates pin this package with a **tilde range**:
83
+
84
+ ```
85
+ npx -y @maintainabilityai/research-runner@~0.1.42 skill-<name>
86
+ ```
87
+
88
+ `~0.1.42` allows patch releases (`0.1.43`, `0.1.44`, …) but not minor
89
+ or major bumps. The reasons:
90
+
91
+ 1. **Auto-publish bumps patch on every merge.** The
92
+ `npm-publish-research-runner.yml` workflow runs `npm version patch`
93
+ when anything under `packages/research-runner/**` changes. A new
94
+ patch is published within minutes of merge.
95
+ 2. **Templates pinned exactly would force a follow-up edit on every
96
+ patch.** With `@0.1.42` (exact), every patch bump would leave the
97
+ templates stale until someone edited them. With `~0.1.42`, the
98
+ templates carry on transparently.
99
+ 3. **A minor bump is a deliberate review event.** When the runner ships
100
+ a contract change (new event field, new skill API shape, removed
101
+ field), bump `version` from `0.1.x` to `0.2.0` and update the
102
+ templates in the same PR. A `phaseSpec.test.ts` parity test fails
103
+ loudly when the templates' major.minor doesn't match `package.json`.
104
+
105
+ **When you change anything under `packages/research-runner/**`:** you
106
+ do NOT need to edit workflow templates. The auto-publish handles it.
107
+ **When you ship a contract-breaking change:** bump the minor version
108
+ in `packages/research-runner/package.json` AND update every
109
+ `@maintainabilityai/research-runner@~0.X.Y` reference in
110
+ `vscode-extension/code-templates/**` to match. Tests enforce this.
111
+
112
+ The tilde range eliminates an off-by-one risk: a developer trying to
113
+ mentally compute "what patch will the auto-publish produce" and pinning
114
+ to the wrong value. With tilde, the patch resolves at run-time from
115
+ npm, and the mental math goes away.
116
+
80
117
  ## License
81
118
 
82
119
  MIT
@@ -1525,6 +1525,52 @@ function detectAllQueriesFailed(envelopes, skill) {
1525
1525
  // pattern matching of firewall-block vs query-quality failures.
1526
1526
  return `all-queries-failed: ${skill} — ${firstError}`;
1527
1527
  }
1528
+ /**
1529
+ * Bug-Q phase 3 (Codex audit follow-up / oracle evidence) — search
1530
+ * audit metadata now carries a bounded preview of WHICH results came
1531
+ * back, not just HOW MANY. Without this, a reviewer who wants to
1532
+ * verify "S-3 cites a real arXiv paper, not a hallucinated one"
1533
+ * has nothing in the chain to verify against — they'd have to trust
1534
+ * the agent's research-doc citations and re-run the search.
1535
+ *
1536
+ * Preview shape per hit: { provider, query, title, url, snippet?,
1537
+ * score?, publishedDate? } where:
1538
+ * - snippet is truncated to ~200 chars (the ProviderResult.content
1539
+ * field already caps at ~500; we shorten further for chain size)
1540
+ * - score is rounded to 2 decimals
1541
+ *
1542
+ * Total preview cap: 25 hits per skill_call. Search runs typically
1543
+ * return 10-30 results per provider before dedupe; the cap keeps the
1544
+ * audit JSONL compact while still proving "real evidence behind every
1545
+ * citation."
1546
+ */
1547
/** Maximum number of hits copied into `results_preview` per skill call. */
const SEARCH_RESULTS_PREVIEW_CAP = 25;
/** Maximum snippet length, in UTF-16 code units, before the ellipsis is appended. */
const SEARCH_SNIPPET_CAP = 200;

/**
 * Collapse whitespace in a hit's content and shorten it to at most
 * SEARCH_SNIPPET_CAP code units (plus a trailing ellipsis when shortened).
 *
 * @param {unknown} content - Raw content of a search hit; non-strings yield ''.
 * @returns {string} The normalized, possibly truncated snippet.
 */
function truncateSearchSnippet(content) {
    // Only strings can be normalized; anything else is treated as "no snippet"
    // so a malformed provider payload cannot fail the whole audit record.
    const normalized = typeof content === 'string'
        ? content.replace(/\s+/g, ' ').trim()
        : '';
    if (normalized.length <= SEARCH_SNIPPET_CAP) {
        return normalized;
    }
    let end = SEARCH_SNIPPET_CAP;
    const lastUnit = normalized.charCodeAt(end - 1);
    // If the cut lands right after a high surrogate, back off one unit so the
    // preview never ends in half of a surrogate pair (emoji, astral symbols).
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        end -= 1;
    }
    return `${normalized.slice(0, end)}…`;
}

/**
 * Build the audit metadata recorded for a search skill call: the queries,
 * the total number of results, and a bounded preview of the results.
 *
 * Each preview entry carries `provider`, `query` (taken from the hit's
 * `fromQuery`), `title` and `url`, plus optional `snippet`, `score`
 * (rounded to 2 decimals) and `publishedDate` when the hit provides them.
 *
 * @param {unknown} queries - The queries that were run; stored as-is.
 * @param {Array<object>} results - All hits returned by the search.
 * @returns {{queries: unknown, result_count: number, results_preview: Array<object>}}
 *   `result_count` is the full result count, not the preview length.
 */
function buildSearchAuditMetadata(queries, results) {
    const preview = results.slice(0, SEARCH_RESULTS_PREVIEW_CAP).map((r) => {
        const entry = {
            provider: r.provider,
            query: r.fromQuery,
            title: r.title,
            url: r.url,
        };
        const snippet = truncateSearchSnippet(r.content);
        // Empty snippets are omitted rather than stored as ''.
        if (snippet) {
            entry.snippet = snippet;
        }
        if (typeof r.score === 'number' && Number.isFinite(r.score)) {
            entry.score = Math.round(r.score * 100) / 100;
        }
        if (r.publishedDate) {
            entry.publishedDate = r.publishedDate;
        }
        return entry;
    });
    return { queries, result_count: results.length, results_preview: preview };
}
1528
1574
  const handleTavilySearch = async (input) => {
1529
1575
  const parsed = SearchQueriesInput.safeParse(input);
1530
1576
  if (!parsed.success) {
@@ -1540,7 +1586,7 @@ const handleTavilySearch = async (input) => {
1540
1586
  queries: parsed.data.queries,
1541
1587
  maxResultsPerQuery: parsed.data.maxResults,
1542
1588
  });
1543
- const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
1589
+ const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
1544
1590
  const failure = detectAllQueriesFailed(res.envelopes, 'tavily-search');
1545
1591
  if (failure) {
1546
1592
  return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
@@ -1561,7 +1607,7 @@ const handleArxivSearch = async (input) => {
1561
1607
  queries: parsed.data.queries,
1562
1608
  maxResultsPerQuery: parsed.data.maxResults,
1563
1609
  });
1564
- const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
1610
+ const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
1565
1611
  const failure = detectAllQueriesFailed(res.envelopes, 'arxiv-search');
1566
1612
  if (failure) {
1567
1613
  return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
@@ -1587,7 +1633,7 @@ const handleUsptoSearch = async (input) => {
1587
1633
  queries: parsed.data.queries,
1588
1634
  maxResultsPerQuery: parsed.data.maxResults,
1589
1635
  });
1590
- const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
1636
+ const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
1591
1637
  const failure = detectAllQueriesFailed(res.envelopes, 'uspto-search');
1592
1638
  if (failure) {
1593
1639
  return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
@@ -1608,7 +1654,7 @@ const handleHackerNewsSearch = async (input) => {
1608
1654
  queries: parsed.data.queries,
1609
1655
  hitsPerQuery: parsed.data.maxResults,
1610
1656
  });
1611
- const auditMetadata = { queries: parsed.data.queries, result_count: res.results.length };
1657
+ const auditMetadata = buildSearchAuditMetadata(parsed.data.queries, res.results);
1612
1658
  const failure = detectAllQueriesFailed(res.envelopes, 'hackernews-search');
1613
1659
  if (failure) {
1614
1660
  return { ok: false, reason: failure, envelopes: res.envelopes, auditMetadata };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@maintainabilityai/research-runner",
3
- "version": "0.1.44",
3
+ "version": "0.1.45",
4
4
  "description": "Research + PRD agent runner — orchestrates the Archeologist and PRD pipelines for the MaintainabilityAI governance mesh",
5
5
  "license": "MIT",
6
6
  "author": "MaintainabilityAI",