@maintainabilityai/research-runner 0.1.11 → 0.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/llm/llm-router.js +7 -1
- package/dist/runner/archeologist.js +1 -1
- package/dist/runner/nodes/arxiv-search.js +1 -1
- package/dist/runner/nodes/dedupe-and-rank.js +53 -6
- package/dist/runner/nodes/format-for-human.js +14 -8
- package/dist/runner/nodes/uspto-search.js +1 -1
- package/dist/schemas/ranked-source.d.ts +8 -2
- package/dist/schemas/ranked-source.js +5 -2
- package/package.json +1 -1
package/dist/llm/llm-router.js
CHANGED
|
@@ -5,7 +5,13 @@ const anthropic_client_1 = require("./anthropic-client");
|
|
|
5
5
|
const github_models_client_1 = require("./github-models-client");
|
|
6
6
|
/** Per-tier per-provider model id lookup. */
|
|
7
7
|
const MODEL_BY_TIER = {
|
|
8
|
-
|
|
8
|
+
// gpt-4.1-mini outperforms gpt-4o-mini on the per-provider query-plan
|
|
9
|
+
// task. Verified empirically against the celeb-api brief: 4.1-mini
|
|
10
|
+
// produces more on-topic arxiv phrases ("celebrity identity disambig-
|
|
11
|
+
// uation" vs 4o-mini's generic "API integration challenges"), tighter
|
|
12
|
+
// patent AND-clauses, and stays inside the spec's word counts more
|
|
13
|
+
// reliably. Same "low" rate-limit tier as 4o-mini, so no infra change.
|
|
14
|
+
plan: { anthropic: 'claude-haiku-4-5', githubModels: 'openai/gpt-4.1-mini' },
|
|
9
15
|
// gpt-5-chat is in the "custom" GH-Models tier (200K input / 100K
|
|
10
16
|
// output) and is NON-reasoning — verified end-to-end with a live API
|
|
11
17
|
// call (reasoning_tokens=0, finish_reason=stop). Picked over gpt-5
|
|
@@ -515,7 +515,7 @@ async function runArcheologist(opts) {
|
|
|
515
515
|
provider: brief.llm_provider,
|
|
516
516
|
// plan_queries is the only LLM hop we run now (synth handed off
|
|
517
517
|
// to the assigned agent). Surface that model in the Hatter's Tag.
|
|
518
|
-
model: 'openai/gpt-4o-mini',
|
|
518
|
+
model: 'openai/gpt-4.1-mini',
|
|
519
519
|
input_tokens: totalInputTokens,
|
|
520
520
|
output_tokens: totalOutputTokens,
|
|
521
521
|
cost_usd: roundUsd(totalCostUsd),
|
|
@@ -35,7 +35,7 @@ async function runArxivSearch(opts) {
|
|
|
35
35
|
fromQuery: query,
|
|
36
36
|
title: r.title,
|
|
37
37
|
url: r.abstractUrl,
|
|
38
|
-
content: r.summary.slice(0,
|
|
38
|
+
content: r.summary.slice(0, 2000),
|
|
39
39
|
// Position-derived score: arXiv returns by relevance, decay 0.9 → 0.5.
|
|
40
40
|
score: Math.max(0.5, 0.9 - j * 0.1),
|
|
41
41
|
publishedDate: r.published || undefined,
|
|
@@ -30,6 +30,19 @@ function canonicalizeUrl(rawUrl) {
|
|
|
30
30
|
return rawUrl.trim().toLowerCase();
|
|
31
31
|
}
|
|
32
32
|
}
|
|
33
|
+
/**
|
|
34
|
+
* Per-provider quota for the top-N output. Without these floors, Tavily
|
|
35
|
+
* (normalized scores 0.9–1.0) crushes every other provider in pure
|
|
36
|
+
* global ranking — synth would see zero HN signal and zero patent
|
|
37
|
+
* coverage. Quotas sum to topN's default (20). Any unused slack
|
|
38
|
+
* spills over to the highest-scoring non-quota entries across providers.
|
|
39
|
+
*/
|
|
40
|
+
const PROVIDER_QUOTA = {
|
|
41
|
+
tavily: 8,
|
|
42
|
+
arxiv: 5,
|
|
43
|
+
uspto: 4,
|
|
44
|
+
hackernews: 3,
|
|
45
|
+
};
|
|
33
46
|
function dedupeAndRank(opts) {
|
|
34
47
|
const topN = opts.topN ?? 20;
|
|
35
48
|
const retrievedAt = opts.retrievedAt ?? new Date().toISOString();
|
|
@@ -48,7 +61,7 @@ function dedupeAndRank(opts) {
|
|
|
48
61
|
if (r.score > existing.scoreSum / existing.occurrences) {
|
|
49
62
|
existing.title = r.title || existing.title;
|
|
50
63
|
if (r.content) {
|
|
51
|
-
existing.excerpt = r.content.slice(0,
|
|
64
|
+
existing.excerpt = r.content.slice(0, 2000);
|
|
52
65
|
}
|
|
53
66
|
}
|
|
54
67
|
if (!existing.publishedAt && r.publishedDate) {
|
|
@@ -63,7 +76,7 @@ function dedupeAndRank(opts) {
|
|
|
63
76
|
canonicalUrl: canonical,
|
|
64
77
|
provider: r.provider,
|
|
65
78
|
title: r.title || canonical,
|
|
66
|
-
excerpt: (r.content || '').slice(0,
|
|
79
|
+
excerpt: (r.content || '').slice(0, 2000),
|
|
67
80
|
publishedAt: r.publishedDate,
|
|
68
81
|
authors: r.authors,
|
|
69
82
|
scoreSum: r.score,
|
|
@@ -72,12 +85,46 @@ function dedupeAndRank(opts) {
|
|
|
72
85
|
});
|
|
73
86
|
}
|
|
74
87
|
}
|
|
75
|
-
const
|
|
76
|
-
.map(a => {
|
|
88
|
+
const allEntries = [...bucket.values()].map(a => {
|
|
77
89
|
const recall = 1 + 0.15 * (a.queries.size - 1);
|
|
78
90
|
const composite = Math.min(1, a.scoreSum * recall / Math.max(1, a.occurrences));
|
|
79
91
|
return { aggregated: a, composite };
|
|
80
|
-
})
|
|
92
|
+
});
|
|
93
|
+
// Phase 1 — per-provider quota: take each provider's top-K (K from PROVIDER_QUOTA).
|
|
94
|
+
// Phase 2 — spillover: fill the remaining budget with the next-highest entries
|
|
95
|
+
// from anywhere, including providers that have already filled their quota.
|
|
96
|
+
// Phase 3 — re-sort the combined set by composite score for stable display order.
|
|
97
|
+
const used = new Set();
|
|
98
|
+
const picks = [];
|
|
99
|
+
for (const provider of Object.keys(PROVIDER_QUOTA)) {
|
|
100
|
+
const k = PROVIDER_QUOTA[provider];
|
|
101
|
+
if (k === 0) {
|
|
102
|
+
continue;
|
|
103
|
+
}
|
|
104
|
+
const fromProvider = allEntries
|
|
105
|
+
.filter(e => e.aggregated.provider === provider)
|
|
106
|
+
.sort((a, b) => b.composite - a.composite)
|
|
107
|
+
.slice(0, k);
|
|
108
|
+
for (const e of fromProvider) {
|
|
109
|
+
if (used.has(e.aggregated.canonicalUrl)) {
|
|
110
|
+
continue;
|
|
111
|
+
}
|
|
112
|
+
picks.push(e);
|
|
113
|
+
used.add(e.aggregated.canonicalUrl);
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
const remainingBudget = Math.max(0, topN - picks.length);
|
|
117
|
+
if (remainingBudget > 0) {
|
|
118
|
+
const spillover = allEntries
|
|
119
|
+
.filter(e => !used.has(e.aggregated.canonicalUrl))
|
|
120
|
+
.sort((a, b) => b.composite - a.composite)
|
|
121
|
+
.slice(0, remainingBudget);
|
|
122
|
+
for (const e of spillover) {
|
|
123
|
+
picks.push(e);
|
|
124
|
+
used.add(e.aggregated.canonicalUrl);
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
const ranked = picks
|
|
81
128
|
.sort((a, b) => b.composite - a.composite)
|
|
82
129
|
.slice(0, topN);
|
|
83
130
|
return ranked.map((entry, i) => ({
|
|
@@ -87,7 +134,7 @@ function dedupeAndRank(opts) {
|
|
|
87
134
|
url: entry.aggregated.canonicalUrl,
|
|
88
135
|
retrieved_at: retrievedAt,
|
|
89
136
|
salience_score: roundTo(entry.composite, 4),
|
|
90
|
-
excerpt: entry.aggregated.excerpt.slice(0, 500),
|
|
137
|
+
excerpt: entry.aggregated.excerpt.slice(0, 2000),
|
|
91
138
|
...(entry.aggregated.publishedAt ? { published_at: entry.aggregated.publishedAt } : {}),
|
|
92
139
|
...(entry.aggregated.authors && entry.aggregated.authors.length > 0 ? { authors: entry.aggregated.authors } : {}),
|
|
93
140
|
}));
|
|
@@ -2,12 +2,14 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.formatForHuman = formatForHuman;
|
|
4
4
|
/**
|
|
5
|
-
*
|
|
6
|
-
*
|
|
5
|
+
* Normalise the excerpt for blockquote display — collapse whitespace
|
|
6
|
+
* runs so newlines in arXiv abstracts don't break the markdown quote.
|
|
7
|
+
* No length cap here: the schema's 2000-char limit is what the agent
|
|
8
|
+
* downstream sees, and a downstream synthesis or PRD agent needs the
|
|
9
|
+
* full excerpt to write faithful citations.
|
|
7
10
|
*/
|
|
8
|
-
function
|
|
9
|
-
|
|
10
|
-
return cleaned.length <= n ? cleaned : cleaned.slice(0, n - 1) + '…';
|
|
11
|
+
function normaliseExcerpt(s) {
|
|
12
|
+
return s.replace(/\s+/g, ' ').trim();
|
|
11
13
|
}
|
|
12
14
|
function meshSummary(meshContext) {
|
|
13
15
|
if (meshContext.bar) {
|
|
@@ -46,8 +48,12 @@ function providerSection(label, emoji, provider, sources, totalCount) {
|
|
|
46
48
|
for (const s of sources) {
|
|
47
49
|
const authors = s.authors && s.authors.length > 0 ? ` — _${s.authors.slice(0, 3).join(', ')}${s.authors.length > 3 ? ' et al.' : ''}_` : '';
|
|
48
50
|
const date = s.published_at ? ` _(${s.published_at.slice(0, 10)})_` : '';
|
|
49
|
-
|
|
50
|
-
|
|
51
|
+
// Render the citation id as standalone inline code so the synth
|
|
52
|
+
// agent (and a downstream PRD agent) can grep `\bS\d+\b` cleanly.
|
|
53
|
+
// The earlier form **[`S1`] [Title](url)** broke GitHub's markdown
|
|
54
|
+
// parser (it tried to interpret the brackets as a reference link).
|
|
55
|
+
lines.push(`- \`${s.id}\` **[${s.title}](${s.url})** — score ${s.salience_score.toFixed(2)}${date}${authors}`);
|
|
56
|
+
lines.push(` > ${normaliseExcerpt(s.excerpt)}`);
|
|
51
57
|
}
|
|
52
58
|
lines.push('');
|
|
53
59
|
return lines;
|
|
@@ -159,6 +165,6 @@ function formatForHuman(opts) {
|
|
|
159
165
|
lines.push('');
|
|
160
166
|
lines.push('---');
|
|
161
167
|
lines.push('');
|
|
162
|
-
lines.push(`🤖 Generated by \`research-runner archeologist
|
|
168
|
+
lines.push(`🤖 Generated by \`research-runner archeologist\`. Run id: \`${runId}\` (see the Hatter's Tag for agent version + audit chain).`);
|
|
163
169
|
return { body: lines.join('\n') };
|
|
164
170
|
}
|
|
@@ -46,7 +46,7 @@ async function runUsptoSearch(opts) {
|
|
|
46
46
|
fromQuery: query,
|
|
47
47
|
title: r.title,
|
|
48
48
|
url: r.url,
|
|
49
|
-
content: r.abstract.slice(0,
|
|
49
|
+
content: r.abstract.slice(0, 2000),
|
|
50
50
|
score: Math.max(0.4, 0.85 - j * 0.1),
|
|
51
51
|
publishedDate: r.grantedAt || undefined,
|
|
52
52
|
authors: r.inventors,
|
|
@@ -15,7 +15,10 @@ export declare const RankedSource: z.ZodObject<{
|
|
|
15
15
|
retrieved_at: z.ZodEffects<z.ZodString, string, string>;
|
|
16
16
|
/** 0.0 - 1.0, higher = more relevant. Computed by dedupe_and_rank. */
|
|
17
17
|
salience_score: z.ZodNumber;
|
|
18
|
-
/** ≤
|
|
18
|
+
/** ≤2000-char excerpt the synthesis agent quotes from. Sized for the
|
|
19
|
+
* current flow where the agent (Copilot Coding Agent / @claude) reads
|
|
20
|
+
* the issue comment — they have plenty of context budget, and 500
|
|
21
|
+
* routinely truncated arXiv abstracts and patent summaries mid-thought. */
|
|
19
22
|
excerpt: z.ZodString;
|
|
20
23
|
/** Optional: pub date if the source has one (papers, news, patents). */
|
|
21
24
|
published_at: z.ZodOptional<z.ZodEffects<z.ZodString, string, string>>;
|
|
@@ -52,7 +55,10 @@ export declare const RankedSourceList: z.ZodArray<z.ZodObject<{
|
|
|
52
55
|
retrieved_at: z.ZodEffects<z.ZodString, string, string>;
|
|
53
56
|
/** 0.0 - 1.0, higher = more relevant. Computed by dedupe_and_rank. */
|
|
54
57
|
salience_score: z.ZodNumber;
|
|
55
|
-
/** ≤
|
|
58
|
+
/** ≤2000-char excerpt the synthesis agent quotes from. Sized for the
|
|
59
|
+
* current flow where the agent (Copilot Coding Agent / @claude) reads
|
|
60
|
+
* the issue comment — they have plenty of context budget, and 500
|
|
61
|
+
* routinely truncated arXiv abstracts and patent summaries mid-thought. */
|
|
56
62
|
excerpt: z.ZodString;
|
|
57
63
|
/** Optional: pub date if the source has one (papers, news, patents). */
|
|
58
64
|
published_at: z.ZodOptional<z.ZodEffects<z.ZodString, string, string>>;
|
|
@@ -19,8 +19,11 @@ exports.RankedSource = zod_1.z.object({
|
|
|
19
19
|
retrieved_at: primitives_1.IsoTimestamp,
|
|
20
20
|
/** 0.0 - 1.0, higher = more relevant. Computed by dedupe_and_rank. */
|
|
21
21
|
salience_score: zod_1.z.number().min(0).max(1),
|
|
22
|
-
/** ≤
|
|
23
|
-
|
|
22
|
+
/** ≤2000-char excerpt the synthesis agent quotes from. Sized for the
|
|
23
|
+
* current flow where the agent (Copilot Coding Agent / @claude) reads
|
|
24
|
+
* the issue comment — they have plenty of context budget, and 500
|
|
25
|
+
* routinely truncated arXiv abstracts and patent summaries mid-thought. */
|
|
26
|
+
excerpt: zod_1.z.string().max(2000),
|
|
24
27
|
/** Optional: pub date if the source has one (papers, news, patents). */
|
|
25
28
|
published_at: primitives_1.IsoTimestamp.optional(),
|
|
26
29
|
/** Optional: authors (arxiv / news). */
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@maintainabilityai/research-runner",
|
|
3
|
-
"version": "0.1.11",
|
|
3
|
+
"version": "0.1.13",
|
|
4
4
|
"description": "Research + PRD agent runner — orchestrates the Archeologist and PRD pipelines for the MaintainabilityAI governance mesh",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "MaintainabilityAI",
|