@steel-dev/atlas 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +219 -0
- package/dist/agent.d.ts +34 -0
- package/dist/agent.js +133 -0
- package/dist/async.d.ts +19 -0
- package/dist/async.js +172 -0
- package/dist/atlas.d.ts +19 -0
- package/dist/atlas.js +69 -0
- package/dist/budget.d.ts +64 -0
- package/dist/budget.js +336 -0
- package/dist/checklist.d.ts +115 -0
- package/dist/checklist.js +297 -0
- package/dist/cli.js +38700 -0
- package/dist/config.d.ts +80 -0
- package/dist/config.js +109 -0
- package/dist/context.d.ts +26 -0
- package/dist/context.js +250 -0
- package/dist/custom-tools.d.ts +26 -0
- package/dist/custom-tools.js +33 -0
- package/dist/defaults.d.ts +10 -0
- package/dist/defaults.js +37 -0
- package/dist/economy.d.ts +12 -0
- package/dist/economy.js +6 -0
- package/dist/env.d.ts +1 -0
- package/dist/env.js +8 -0
- package/dist/errors.d.ts +6 -0
- package/dist/errors.js +11 -0
- package/dist/event-hub.d.ts +11 -0
- package/dist/event-hub.js +83 -0
- package/dist/events.d.ts +105 -0
- package/dist/events.js +1 -0
- package/dist/html-extract.d.ts +21 -0
- package/dist/html-extract.js +459 -0
- package/dist/index.d.ts +59 -0
- package/dist/index.js +26 -0
- package/dist/memory.d.ts +2 -0
- package/dist/memory.js +38 -0
- package/dist/model.d.ts +49 -0
- package/dist/model.js +630 -0
- package/dist/orchestrate.d.ts +5 -0
- package/dist/orchestrate.js +277 -0
- package/dist/pdf-extract.d.ts +5 -0
- package/dist/pdf-extract.js +20 -0
- package/dist/prompts.d.ts +2 -0
- package/dist/prompts.js +6 -0
- package/dist/providers/domain/arxiv.d.ts +6 -0
- package/dist/providers/domain/arxiv.js +83 -0
- package/dist/providers/domain/clinicaltrials.d.ts +6 -0
- package/dist/providers/domain/clinicaltrials.js +104 -0
- package/dist/providers/domain/edgar.d.ts +10 -0
- package/dist/providers/domain/edgar.js +92 -0
- package/dist/providers/domain/index.d.ts +14 -0
- package/dist/providers/domain/index.js +7 -0
- package/dist/providers/domain/openalex.d.ts +7 -0
- package/dist/providers/domain/openalex.js +128 -0
- package/dist/providers/domain/pubmed.d.ts +8 -0
- package/dist/providers/domain/pubmed.js +123 -0
- package/dist/providers/domain/semantic-scholar.d.ts +6 -0
- package/dist/providers/domain/semantic-scholar.js +112 -0
- package/dist/providers/domain/shared.d.ts +12 -0
- package/dist/providers/domain/shared.js +39 -0
- package/dist/providers/domain/wikipedia.d.ts +6 -0
- package/dist/providers/domain/wikipedia.js +71 -0
- package/dist/providers/exa-agent.d.ts +9 -0
- package/dist/providers/exa-agent.js +67 -0
- package/dist/providers/fetch.d.ts +66 -0
- package/dist/providers/fetch.js +675 -0
- package/dist/providers/parallel-agent.d.ts +11 -0
- package/dist/providers/parallel-agent.js +100 -0
- package/dist/providers/perplexity-agent.d.ts +17 -0
- package/dist/providers/perplexity-agent.js +86 -0
- package/dist/providers/search.d.ts +65 -0
- package/dist/providers/search.js +433 -0
- package/dist/providers/store.d.ts +48 -0
- package/dist/providers/store.js +217 -0
- package/dist/researcher.d.ts +20 -0
- package/dist/researcher.js +3 -0
- package/dist/robots.d.ts +16 -0
- package/dist/robots.js +146 -0
- package/dist/roles.d.ts +6 -0
- package/dist/roles.js +4 -0
- package/dist/run.d.ts +65 -0
- package/dist/run.js +371 -0
- package/dist/safe-dispatcher.d.ts +16 -0
- package/dist/safe-dispatcher.js +32 -0
- package/dist/safety.d.ts +23 -0
- package/dist/safety.js +206 -0
- package/dist/sandbox.d.ts +22 -0
- package/dist/sandbox.js +228 -0
- package/dist/search-normalize.d.ts +2 -0
- package/dist/search-normalize.js +13 -0
- package/dist/source-documents.d.ts +77 -0
- package/dist/source-documents.js +421 -0
- package/dist/sources.d.ts +57 -0
- package/dist/sources.js +1 -0
- package/dist/spine.d.ts +19 -0
- package/dist/spine.js +722 -0
- package/dist/state.d.ts +90 -0
- package/dist/state.js +27 -0
- package/dist/structured.d.ts +7 -0
- package/dist/structured.js +18 -0
- package/dist/tools.d.ts +33 -0
- package/dist/tools.js +1187 -0
- package/dist/trace-digest.d.ts +11 -0
- package/dist/trace-digest.js +309 -0
- package/dist/trace.d.ts +225 -0
- package/dist/trace.js +278 -0
- package/dist/trail.d.ts +15 -0
- package/dist/trail.js +74 -0
- package/dist/url.d.ts +1 -0
- package/dist/url.js +25 -0
- package/package.json +107 -0
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
import { generateObject, generateText } from "ai";
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
import { minViableSubtaskUSD, resolvePricing } from "./budget.js";
|
|
4
|
+
import { deriveChildCtx } from "./context.js";
|
|
5
|
+
import { errorMessage } from "./errors.js";
|
|
6
|
+
import { MODEL_CALL_MAX_RETRIES } from "./model.js";
|
|
7
|
+
import { runSpine } from "./spine.js";
|
|
8
|
+
import { withTraceFrame } from "./trace.js";
|
|
9
|
+
// Roster key of the built-in researcher (the in-process spine).
export const ATLAS_KEY = "atlas";
// Roster description shown to the decomposition model for the built-in researcher.
const DEFAULT_ATLAS_DESCRIPTION = "Atlas's own deep-research spine: plans, searches, fetches, and synthesizes a grounded, citation-backed report. Strong on academic, finance, and multi-source synthesis. Default for any sub-task without a more specialized fit.";
// System prompt for the decomposition ("lead") call.
const DECOMPOSE_SYSTEM = "You are a research orchestrator. Decompose the question into independent sub-tasks, each routed to the best-fit researcher from the roster. " +
    "Cut along independent seams so the sub-reports compose without cross-referencing each other's internals. " +
    "For a simple question, return a SINGLE sub-task — never split more than the question needs. " +
    "Each subtask.researcher MUST be one of the roster keys. Structured output only.";
// System prompt for the merge ("write") call.
const SYNTH_SYSTEM = "You synthesize ONE cited research report from several independent sub-reports. " +
    "Merge overlapping findings, surface and resolve contradictions, and write an integrated answer to the question — not a list of the sub-reports. " +
    "Preserve every concrete specific (figures, names, dates). Cite sources inline as [N] using ONLY the numbers in the provided source roster; never invent a number and never write your own Sources list — one is appended for you. Open with a brief bottom-line answer.";
// Hard ceiling on sub-tasks. Declared before the schema so the schema bound and
// the runtime cap in runOrchestrated come from the same value.
const MAX_SUBTASKS = 8;
// Structured output expected from the decomposition call.
const decomposeSchema = z.object({
    strategy: z.string(),
    subtasks: z
        .array(z.object({
        query: z.string(),
        researcher: z.string(),
        rationale: z.string().optional(),
    }))
        .min(1)
        .max(MAX_SUBTASKS),
});
// `fraction` / `minUSD` passed to meter.grant() for the decomposition call.
const DECOMPOSE_FRACTION = 0.05;
const DECOMPOSE_MIN_USD = 0.02;
// `fraction` / `minUSD` passed to meter.grant() for the final merge call.
const SYNTH_MERGE_FRACTION = 0.3;
const SYNTH_MERGE_MIN_USD = 0.05;
|
|
34
|
+
// Answer rctx.question by decomposing it into sub-tasks, running each one on a
// researcher from the roster, and merging the sub-reports into a single report.
//
// Flow:
//   1. Take a grant for the final merge call from the meter.
//   2. Ask the "lead" model for a decomposition. If that call fails and the run
//      was not aborted, continue with one sub-task on the built-in researcher.
//   3. Run every sub-task concurrently, each capped at an equal share of what
//      the meter reports as remaining after step 2.
//   4. A lone built-in result is returned as-is; otherwise the sub-reports are
//      merged by the "write" model and bound to a shared source list.
//
// rctx:        run context; reads meter, config, pricing, question, recorder,
//              signal, bindModel and emit.
// researchers: object mapping roster key -> { description, research(query, ctx) }.
// Returns { report, note, citations, unboundCitations, sources, warnings }.
// Re-throws the underlying error whenever rctx.signal is aborted.
export async function runOrchestrated(rctx, researchers) {
    const meter = rctx.meter;
    const researchModelId = rctx.config.models.research.modelId ?? "";
    const minViable = minViableSubtaskUSD("broad", rctx.config.envelope.maxReportTokens, resolvePricing(researchModelId, rctx.pricing).pricing);
    // `?? meter` falls back to the shared meter when grant() returns nothing.
    const synthGrant = meter.grant({
        fraction: SYNTH_MERGE_FRACTION,
        minUSD: SYNTH_MERGE_MIN_USD,
    }) ?? meter;
    // try/finally so the merge grant is released on every exit path, including
    // the abort re-throws below (previously those paths never released it).
    try {
        const roster = [ATLAS_KEY, ...Object.keys(researchers)]
            .map((key) => `- ${key}: ${key === ATLAS_KEY ? DEFAULT_ATLAS_DESCRIPTION : researchers[key].description}`)
            .join("\n");
        const acquisitionBefore = meter.remainingUSD();
        // Cap the sub-task count by how many minimum-viable sub-tasks fit in 95%
        // of the remaining budget; always allow at least one.
        const maxSubtasks = minViable > 0
            ? Math.max(1, Math.min(MAX_SUBTASKS, Math.floor((acquisitionBefore * 0.95) / minViable)))
            : MAX_SUBTASKS;
        let strategy = "";
        // Default plan, used unchanged if decomposition fails.
        let subtasks = [{ query: rctx.question, researcher: ATLAS_KEY }];
        const decomposeGrant = meter.grant({ fraction: DECOMPOSE_FRACTION, minUSD: DECOMPOSE_MIN_USD }) ??
            meter;
        try {
            const decomposed = await withTraceFrame(rctx.recorder, { site: "decompose" }, () => generateObject({
                model: rctx.bindModel("lead", decomposeGrant),
                system: DECOMPOSE_SYSTEM,
                prompt: `Question:\n${rctx.question}\n\nResearcher roster:\n${roster}\n\n` +
                    `Return the decomposition: a one-line strategy and at most ${maxSubtasks} sub-task(s) — fewer when the question is simple.`,
                schema: decomposeSchema,
                maxOutputTokens: 1200,
                maxRetries: MODEL_CALL_MAX_RETRIES,
                abortSignal: rctx.signal,
            }));
            strategy = decomposed.object.strategy;
            subtasks = decomposed.object.subtasks.slice(0, maxSubtasks).map((st) => ({
                query: st.query,
                // Own-property check: a model-supplied name such as "constructor"
                // must not resolve through Object.prototype. Unknown or empty
                // entries are routed to the built-in researcher.
                researcher: st.researcher === ATLAS_KEY ||
                    (Object.hasOwn(researchers, st.researcher) && researchers[st.researcher])
                    ? st.researcher
                    : ATLAS_KEY,
            }));
        }
        catch (err) {
            if (rctx.signal?.aborted)
                throw err;
            // Deliberate best-effort: keep the single-sub-task default plan.
        }
        finally {
            if (decomposeGrant !== meter)
                decomposeGrant.release();
        }
        if (strategy.trim())
            rctx.emit({ type: "plan.updated", rationale: strategy });
        const acquisition = meter.remainingUSD();
        const perTask = subtasks.length > 0 ? acquisition / subtasks.length : acquisition;
        const dispatched = await Promise.all(subtasks.map(async (subtask) => {
            const grant = meter.grant({ maxUSD: perTask }) ?? meter;
            try {
                if (subtask.researcher === ATLAS_KEY) {
                    const child = deriveChildCtx(rctx, subtask.query);
                    const out = await withTraceFrame(rctx.recorder, { site: `researcher:${ATLAS_KEY}` }, () => runSpine(child, { meter: grant }));
                    const sources = child.sources.fetchedSources.map((s) => {
                        const doc = s.sourceId
                            ? child.sources.byId.get(s.sourceId)
                            : undefined;
                        return {
                            url: s.url,
                            title: s.title,
                            via: doc?.metadata.method ?? "unknown",
                            chars: doc?.storedChars ?? 0,
                        };
                    });
                    return { subtask, report: out.report, sources, ok: true, spine: out };
                }
                const researcher = researchers[subtask.researcher];
                const ctx = {
                    budget: { maxUSD: grant.remainingUSD() },
                    log: () => { },
                    ...(rctx.signal ? { signal: rctx.signal } : {}),
                };
                const report = await researcher.research(subtask.query, ctx);
                grant.charge(report.cost ?? 0);
                return {
                    subtask,
                    // A researcher that returns no text is reported as a warning
                    // further down instead of crashing the `.trim()` checks.
                    report: typeof report.report === "string" ? report.report : "",
                    sources: report.sources.map((s) => ({
                        url: s.url,
                        title: s.title ?? s.url,
                        via: subtask.researcher,
                    })),
                    ok: true,
                };
            }
            catch (err) {
                if (rctx.signal?.aborted)
                    throw err;
                return {
                    subtask,
                    report: "",
                    sources: [],
                    ok: false,
                    error: errorMessage(err),
                };
            }
            finally {
                if (grant !== meter)
                    grant.release();
            }
        }));
        const ok = dispatched.filter((d) => d.ok && d.report.trim());
        const failed = dispatched.filter((d) => !d.ok || !d.report.trim());
        const warnings = failed.map((d) => `Researcher "${d.subtask.researcher}" returned no report for "${d.subtask.query}"` +
            (d.error ? `: ${d.error}` : "."));
        if (ok.length === 0) {
            return {
                report: "No researcher returned a usable report for this question.",
                note: strategy,
                citations: [],
                unboundCitations: [],
                sources: [],
                warnings,
            };
        }
        // A single complete child report can pass through without a merge pass.
        const soleAtlas = ok.length === 1 ? ok[0].spine : undefined;
        if (soleAtlas) {
            return {
                report: soleAtlas.report,
                note: soleAtlas.note,
                citations: soleAtlas.citations,
                unboundCitations: soleAtlas.unboundCitations,
                sources: ok[0].sources,
                warnings,
            };
        }
        const sources = dedupeSources(ok);
        let merged;
        if (ok.length === 1) {
            merged = ok[0].report;
        }
        else {
            const rosterList = sources
                .map((s, i) => `[${i + 1}] ${s.title} — ${s.url}`)
                .join("\n");
            const blocks = ok
                .map((d, i) => `## Sub-report ${i + 1} — via "${d.subtask.researcher}"\n` +
                `Query: ${d.subtask.query}\n\n${stripSourcesSection(d.report)}`)
                .join("\n\n---\n\n");
            let text = "";
            try {
                const synth = await withTraceFrame(rctx.recorder, { site: "synthesize" }, () => generateText({
                    model: rctx.bindModel("write", synthGrant),
                    system: SYNTH_SYSTEM,
                    prompt: `Question:\n${rctx.question}\n\n` +
                        `Source roster (cite inline as [N], using only these numbers):\n${rosterList || "(none)"}\n\n` +
                        `Sub-reports to merge:\n${blocks}\n\n` +
                        "Write the integrated final report.",
                    maxOutputTokens: rctx.config.envelope.maxReportTokens,
                    maxRetries: MODEL_CALL_MAX_RETRIES,
                    abortSignal: rctx.signal,
                }));
                text = synth.text;
            }
            catch (err) {
                if (rctx.signal?.aborted)
                    throw err;
                // Deliberate best-effort: fall through to the concatenation below.
            }
            // Fallback when the merge produced nothing: concatenate the
            // sub-reports. Each one is stripped of its own source list first;
            // otherwise the stripSourcesSection(merged) call below would cut the
            // text at the first sub-report's "Sources" heading and silently drop
            // every later sub-report.
            merged =
                text.trim() ||
                    ok.map((d) => stripSourcesSection(d.report)).join("\n\n---\n\n");
        }
        const bound = bindRosterCitations(stripSourcesSection(merged), sources);
        return {
            report: bound.report,
            note: strategy,
            citations: bound.citations,
            unboundCitations: bound.unboundCitations,
            sources,
            warnings,
        };
    }
    finally {
        if (synthGrant !== meter)
            synthGrant.release();
    }
}
|
|
215
|
+
// Flatten the sources of every dispatched result into one list, keeping only
// the first source seen for each URL and skipping sources without a URL.
// Order of first appearance is preserved.
function dedupeSources(ok) {
    const firstByUrl = new Map();
    for (const { sources } of ok) {
        for (const source of sources) {
            const { url } = source;
            if (!url || firstByUrl.has(url)) {
                continue;
            }
            firstByUrl.set(url, source);
        }
    }
    return [...firstByUrl.values()];
}
|
|
228
|
+
// Backslash-escape every regular-expression metacharacter in `value` so the
// result can be embedded in a RegExp source and match `value` literally.
function escapeRegExp(value) {
    return value.replace(/[.*+?^${}()|[\]\\]/g, (metachar) => `\\${metachar}`);
}
|
|
231
|
+
// Remove local source lists before renumbering against the merged source roster.
// Everything from the first markdown heading (1-6 "#") that starts with
// "sources" or "references" (case-insensitive) to the end of the text is cut,
// then trailing whitespace is removed.
function stripSourcesSection(text) {
    const trailingSourceList = /\n#{1,6}\s*(?:sources|references)\b[\s\S]*$/i;
    const withoutList = text.replace(trailingSourceList, "");
    return withoutList.trimEnd();
}
|
|
237
|
+
// Bind the merged report to a compact, report-wide source list.
//
//   1. Markdown links whose target is exactly a roster URL become "label [N]",
//      where N is the 1-based roster position.
//   2. Bare "[N]" markers (not followed by "(") are renumbered in order of first
//      appearance. Markers outside 1..roster.length are dropped and reported as
//      "source_N" in unboundCitations.
//   3. A "## Sources" list of the cited roster entries is appended, when any.
//
// text:   report body, already stripped of its own source list.
// roster: array of { url, title? } in roster order.
// Returns { report, citations: [{ sourceId, marker }], unboundCitations }.
function bindRosterCitations(text, roster) {
    let bound = text;
    roster.forEach((source, i) => {
        const marker = i + 1;
        bound = bound.replace(new RegExp(`\\[([^\\]]+)\\]\\(${escapeRegExp(source.url)}\\)`, "g"), `$1 [${marker}]`);
    });
    const order = [];
    const display = new Map();
    const unbound = new Set();
    // The leading "( *)" captures the spaces in front of a marker. When a marker
    // is dropped its leading spaces go with it, so no double space or space
    // before punctuation is left behind. Whitespace elsewhere in the report
    // (list indentation, code blocks, tables) is left exactly as written instead
    // of being normalised across the whole document.
    const renumbered = bound
        .replace(/( *)\[(\d+)\](?!\()/g, (_match, gap, digits) => {
        const n = Number(digits);
        if (n < 1 || n > roster.length) {
            unbound.add(`source_${n}`);
            return "";
        }
        if (!display.has(n)) {
            order.push(n);
            display.set(n, order.length);
        }
        return `${gap}[${display.get(n)}]`;
    })
        .trim();
    const citations = order.map((rosterN, idx) => ({
        sourceId: `source_${rosterN}`,
        marker: idx + 1,
    }));
    const references = order
        .map((rosterN, idx) => {
        const source = roster[rosterN - 1];
        // Fall back to the URL so a source without a title never renders as
        // "[undefined](…)".
        const label = source.title || source.url;
        return `${idx + 1}. [${label}](${source.url})`;
    })
        .join("\n");
    const report = references
        ? `${renumbered}\n\n## Sources\n\n${references}`
        : renumbered;
    return { report, citations, unboundCitations: [...unbound] };
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { PDFParse } from "pdf-parse";
|
|
2
|
+
// Matches page separators of the form "-- 3 of 12 --" in the extracted text.
const PAGE_MARKER = /--\s*\d+\s+of\s+\d+\s*--/g;
// Placeholder returned in place of the text when a multi-page PDF yields
// almost no letters or digits.
const NO_TEXT_LAYER = "[This PDF has no extractable text layer — it is likely a scanned or image-only document, so its text could not be read. Do not treat this as 'the fact is not in the document'; try an alternate source, an HTML version, or a different URL for the same content.]";
/**
 * Extract the text of a PDF.
 *
 * Page markers are counted to estimate the page count. When there are at least
 * two pages and fewer than max(200, 20 × pages) letters/digits outside the
 * markers, the placeholder text is returned instead of the extracted text.
 *
 * @param data PDF bytes, handed to `PDFParse` unchanged.
 * @returns {Promise<{ text: string }>}
 */
export async function extractPdfText(data) {
    const parser = new PDFParse({ data });
    try {
        const { text } = await parser.getText();
        const markers = text.match(PAGE_MARKER);
        const pageCount = markers === null ? 0 : markers.length;
        const readable = text.replace(PAGE_MARKER, " ").match(/[\p{L}\p{N}]/gu);
        const readableCount = readable === null ? 0 : readable.length;
        const threshold = Math.max(200, pageCount * 20);
        const looksScanned = pageCount >= 2 && readableCount < threshold;
        return { text: looksScanned ? NO_TEXT_LAYER : text };
    }
    finally {
        // Always free the parser, even when extraction throws.
        await parser.destroy();
    }
}
|
package/dist/prompts.js
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
 * Format an epoch timestamp as the UTC calendar date `YYYY-MM-DD`.
 *
 * @param {number} epochMs Milliseconds since the Unix epoch.
 * @returns {string} The first ten characters of the ISO-8601 timestamp.
 * @throws {RangeError} When `epochMs` is not a valid time value.
 */
export function isoDate(epochMs) {
    const stamp = new Date(epochMs).toISOString();
    return stamp.substring(0, 10);
}
|
|
4
|
+
/**
 * Build the prompt line that tells the model today's date and how to read
 * relative time words.
 *
 * @param {string} todayISO Date text inserted verbatim after "Today's date is".
 * @returns {string} The two-sentence instruction line.
 */
export function todayLine(todayISO) {
    const sentences = [
        `Today's date is ${todayISO}.`,
        'Interpret "current", "recent", and "latest" relative to this date, not your training data; for any time-bound question, seek the most recent figures available.',
    ];
    return sentences.join(" ");
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import * as cheerio from "cheerio";
|
|
2
|
+
import { errorMessage } from "../../errors.js";
|
|
3
|
+
import { safeDomain, } from "../search.js";
|
|
4
|
+
import { buildContent, clampLimit, collapse, fetchText } from "./shared.js";
|
|
5
|
+
const ENDPOINT = "https://export.arxiv.org/api/query";
// Maps the public `sort` option to the value sent as the `sortBy` query parameter.
const SORT = {
    relevance: "relevance",
    lastUpdated: "lastUpdatedDate",
    submitted: "submittedDate",
};
/**
 * Create the arXiv search provider.
 *
 * @param opts Optional settings: `defaultLimit` (result count used when a search
 *   call passes no `maxResults`, default 5) and `sort` (a key of `SORT`;
 *   anything else falls back to "relevance").
 * @returns A provider `{ id: "arxiv", search({ query, maxResults, signal }) }`.
 *   `search` resolves to `[]` for a blank query and rejects with an `Error`
 *   (original failure attached as `cause`) when the request fails.
 */
export function arxiv(opts = {}) {
    const defaultLimit = clampLimit(opts.defaultLimit ?? 5);
    const requestedSort = opts.sort ?? "relevance";
    // Own-property lookup: an inherited name such as "constructor" must not be
    // picked up from Object.prototype and sent to the API as `sortBy`.
    const sortBy = Object.hasOwn(SORT, requestedSort)
        ? SORT[requestedSort]
        : "relevance";
    return {
        id: "arxiv",
        async search({ query, maxResults, signal }) {
            const q = query.trim();
            if (!q)
                return [];
            const params = new URLSearchParams({
                search_query: `all:${q}`,
                start: "0",
                max_results: String(clampLimit(maxResults ?? defaultLimit)),
                sortBy,
                sortOrder: "descending",
            });
            let xml;
            try {
                xml = await fetchText(`${ENDPOINT}?${params.toString()}`, signal, "application/atom+xml");
            }
            catch (err) {
                // Keep the original error reachable for callers and debuggers.
                throw new Error(`arxiv: request failed: ${errorMessage(err)}`, { cause: err });
            }
            return toResults(xml);
        },
    };
}
|
|
38
|
+
/**
 * Convert an arXiv Atom feed into search results.
 *
 * Each `<entry>` with a non-empty title and id becomes one result; the id is
 * used as the URL with a leading "http://" upgraded to "https://". Entries
 * missing either field are skipped, and `position` counts kept entries only.
 *
 * @param {string} xml Atom XML returned by the arXiv API.
 * @returns {Array} Results with position, title, url, snippet, domain and meta.
 */
function toResults(xml) {
    const $ = cheerio.load(xml, { xml: true });
    // Text of the first direct child element named `tag`.
    const firstChildText = (entry, tag) => entry.children(tag).first().text();
    const results = [];
    for (const node of $("entry").toArray()) {
        const entry = $(node);
        const title = collapse(firstChildText(entry, "title"));
        const url = firstChildText(entry, "id")
            .trim()
            .replace(/^http:\/\//, "https://");
        if (!title || !url) {
            continue;
        }
        const abstract = collapse(firstChildText(entry, "summary"));
        const authors = entry
            .find("author > name")
            .map((_, nameNode) => collapse($(nameNode).text()))
            .get()
            .filter(Boolean);
        // Keep only the date part (first ten characters) of the timestamp.
        const published = firstChildText(entry, "published").trim().slice(0, 10);
        const meta = [];
        if (published) {
            meta.push(`Published: ${published}`);
        }
        const fallbackText = buildContent({ title, authors, meta, abstract });
        results.push({
            position: results.length + 1,
            title,
            url,
            snippet: abstract,
            domain: safeDomain(url),
            meta: {
                openUrls: pdfUrls(url),
                fallbackText,
            },
        });
    }
    return results;
}
|
|
79
|
+
/**
 * List the URLs to try opening for an arXiv result.
 *
 * @param {string} absUrl The entry URL.
 * @returns {string[]} For a URL containing "/abs/": the same URL with the first
 *   "/abs/" replaced by "/pdf/", followed by the original. Otherwise just the
 *   original URL.
 */
function pdfUrls(absUrl) {
    if (!absUrl.includes("/abs/")) {
        return [absUrl];
    }
    const pdfUrl = absUrl.replace("/abs/", "/pdf/");
    return [pdfUrl, absUrl];
}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import { jsonSchema } from "ai";
|
|
2
|
+
import { researchTool, } from "../../custom-tools.js";
|
|
3
|
+
import { errorMessage } from "../../errors.js";
|
|
4
|
+
import { buildContent, clampLimit, collapse, fetchJson, manifest, } from "./shared.js";
|
|
5
|
+
const ENDPOINT = "https://clinicaltrials.gov/api/v2/studies";
/**
 * Create the ClinicalTrials.gov research tool.
 *
 * @param opts Optional settings: `defaultLimit` (sent as `pageSize`, default 5)
 *   and `status` (strings that are trimmed, upper-cased and sent comma-joined
 *   as `filter.overallStatus`).
 * @returns The value of `researchTool(...)`. Its `execute(input, ctx)` returns a
 *   string: an error message for an empty query or a failed request, otherwise
 *   the manifest built from the ingested study titles.
 */
export function clinicaltrials(opts = {}) {
    const pageSize = String(clampLimit(opts.defaultLimit ?? 5));
    const statusFilter = (opts.status ?? [])
        .map((s) => s.trim().toUpperCase())
        .filter(Boolean)
        .join(",");
    const inputSchema = jsonSchema({
        type: "object",
        properties: {
            query: {
                type: "string",
                description: "Search query (condition, intervention, sponsor, or free text)",
            },
        },
        required: ["query"],
        additionalProperties: false,
    });
    const execute = async (input, ctx) => {
        const query = String(input.query ?? "").trim();
        if (query === "") {
            return "clinicaltrials: empty query";
        }
        const params = new URLSearchParams([
            ["query.term", query],
            ["pageSize", pageSize],
            ["format", "json"],
        ]);
        if (statusFilter) {
            params.set("filter.overallStatus", statusFilter);
        }
        let data;
        try {
            data = await fetchJson(`${ENDPOINT}?${params}`, ctx.signal);
        }
        catch (err) {
            // Failures are reported to the model as text rather than thrown.
            return `clinicaltrials: request failed: ${errorMessage(err)}`;
        }
        const titles = ingest(data, ctx);
        return manifest("clinicaltrials", query, titles);
    };
    return researchTool({
        description: "Search ClinicalTrials.gov, the registry of clinical studies conducted around the world. Returns trial summaries (status, conditions, interventions, sponsor) as cited sources — including ongoing and unpublished trials not yet in the literature.",
        inputSchema,
        execute,
    });
}
|
|
46
|
+
/**
 * Register every usable study in a ClinicalTrials.gov response as a source.
 *
 * A study is used when it has both an NCT id and a title (brief title, else
 * official title). For each one, `ctx.addSource` receives the study URL, the
 * title and content built from the title, a list of metadata lines and the
 * brief summary.
 *
 * @param data Parsed API response; anything without a `studies` array yields [].
 * @param ctx Tool context providing `addSource`.
 * @returns {string[]} Titles of the studies that were added, in response order.
 */
function ingest(data, ctx) {
    const studies = data && typeof data === "object"
        ? data.studies
        : undefined;
    if (!Array.isArray(studies)) {
        return [];
    }
    // Missing values become "", everything else is stringified and collapsed.
    const text = (value) => collapse(String(value ?? ""));
    // API enum values use underscores ("NOT_YET_RECRUITING").
    const spaced = (value) => value.replace(/_/g, " ");
    const titles = [];
    for (const study of studies) {
        const protocol = (study ?? {}).protocolSection ?? {};
        const ident = protocol.identificationModule ?? {};
        const nctId = text(ident.nctId);
        const title = text(ident.briefTitle ?? ident.officialTitle);
        if (!nctId || !title) {
            continue;
        }
        const design = protocol.designModule;
        const status = spaced(text(protocol.statusModule?.overallStatus));
        const sponsor = text(protocol.sponsorCollaboratorsModule?.leadSponsor?.name);
        const conditions = list(protocol.conditionsModule?.conditions);
        const rawInterventions = protocol.armsInterventionsModule?.interventions;
        const interventions = Array.isArray(rawInterventions)
            ? rawInterventions
                .map((item) => text(item?.name))
                .filter(Boolean)
                .slice(0, 8)
            : [];
        const studyType = text(design?.studyType);
        const phases = list(design?.phases).map((phase) => spaced(phase));
        const enrollment = design?.enrollmentInfo?.count;
        const abstract = text(protocol.descriptionModule?.briefSummary);
        const typeLine = [studyType, phases.join("/")].filter(Boolean).join(" · ");
        // Candidate lines in display order; absent ones evaluate to ""/false
        // and are filtered out.
        const meta = [
            status && `Status: ${status}`,
            typeLine,
            conditions.length > 0 && `Conditions: ${conditions.join(", ")}`,
            interventions.length > 0 && `Interventions: ${interventions.join(", ")}`,
            sponsor && `Sponsor: ${sponsor}`,
            typeof enrollment === "number" && `Enrollment: ${enrollment}`,
        ].filter(Boolean);
        ctx.addSource({
            url: `https://clinicaltrials.gov/study/${nctId}`,
            title,
            content: buildContent({ title, meta, abstract }),
        });
        titles.push(title);
    }
    return titles;
}
|
|
97
|
+
/**
 * Normalise an API value that should be a list of strings.
 *
 * @param value Anything; non-arrays yield [].
 * @returns {string[]} Each element stringified and collapsed, empty results
 *   dropped, capped at the first eight remaining entries.
 */
function list(value) {
    if (!Array.isArray(value)) {
        return [];
    }
    const cleaned = value.flatMap((item) => {
        const label = collapse(String(item));
        return label ? [label] : [];
    });
    return cleaned.slice(0, 8);
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { type SearchProvider } from "../search.js";
|
|
2
|
+
/** Options accepted by `edgar()`; every field is optional. */
export interface EdgarOptions {
    /**
     * Result count used when a search call passes no `maxResults`.
     * The implementation defaults it to 10 and passes it through
     * `clampLimit(value, 10)` (presumably an upper bound of 10 — confirm in shared.js).
     */
    defaultLimit?: number;
    /** Form types; entries are trimmed, empty ones dropped, and the rest sent comma-joined as the `forms` query parameter. */
    forms?: string[];
    /** Sent unchanged as the `startdt` query parameter (expected date format not shown here — confirm against the SEC API). */
    from?: string;
    /** Sent unchanged as the `enddt` query parameter (expected date format not shown here — confirm against the SEC API). */
    to?: string;
    /** Full `User-Agent` header value. When omitted, `ATLAS_SEC_USER_AGENT` is read, then a value is derived from `email`. */
    userAgent?: string;
    /** Contact email used to build the default User-Agent `atlas-research/0.1 (<email>)`; falls back to `ATLAS_SEC_EMAIL`. */
    email?: string;
}
/**
 * Create the SEC EDGAR search provider (`id: "edgar"`).
 * Searching throws when no User-Agent can be determined from the options or environment.
 */
export declare function edgar(opts?: EdgarOptions): SearchProvider;
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { readEnv } from "../../env.js";
|
|
2
|
+
import { errorMessage } from "../../errors.js";
|
|
3
|
+
import { safeDomain, } from "../search.js";
|
|
4
|
+
import { clampLimit, collapse } from "./shared.js";
|
|
5
|
+
// Full-text search endpoint queried by search().
const ENDPOINT = "https://efts.sec.gov/LATEST/search-index";
// Base URL under which filing documents are addressed (used by parse()).
const ARCHIVES = "https://www.sec.gov/Archives/edgar/data";
/**
 * Create the SEC EDGAR search provider.
 *
 * The User-Agent is resolved once, in this order: `opts.userAgent`, the
 * `ATLAS_SEC_USER_AGENT` environment value, then `atlas-research/0.1 (<email>)`
 * built from `opts.email` or `ATLAS_SEC_EMAIL`.
 *
 * @param opts Optional settings: `defaultLimit`, `forms`, `from`, `to`,
 *   `userAgent`, `email`.
 * @returns A provider `{ id: "edgar", search({ query, maxResults, signal }) }`.
 *   `search` resolves to `[]` for a blank query, rejects when no User-Agent
 *   could be resolved, and rejects with an `Error` (original failure attached
 *   as `cause`) when the request fails.
 */
export function edgar(opts = {}) {
    const defaultLimit = clampLimit(opts.defaultLimit ?? 10, 10);
    const forms = (opts.forms ?? []).map((f) => f.trim()).filter(Boolean);
    const email = opts.email ?? readEnv("ATLAS_SEC_EMAIL");
    const userAgent = opts.userAgent ??
        readEnv("ATLAS_SEC_USER_AGENT") ??
        (email ? `atlas-research/0.1 (${email})` : undefined);
    return {
        id: "edgar",
        async search({ query, maxResults, signal }) {
            const q = query.trim();
            if (!q)
                return [];
            if (!userAgent) {
                throw new Error("edgar: SEC requires a contact email in the User-Agent; set ATLAS_SEC_EMAIL (or pass { email } / { userAgent }).");
            }
            const params = new URLSearchParams({ q });
            if (forms.length)
                params.set("forms", forms.join(","));
            if (opts.from)
                params.set("startdt", opts.from);
            if (opts.to)
                params.set("enddt", opts.to);
            let data;
            try {
                data = await fetchEdgar(`${ENDPOINT}?${params.toString()}`, userAgent, signal);
            }
            catch (err) {
                // Keep the original error reachable for callers and debuggers.
                throw new Error(`edgar: request failed: ${errorMessage(err)}`, { cause: err });
            }
            return parse(data, clampLimit(maxResults ?? defaultLimit, 10));
        },
    };
}
|
|
41
|
+
/**
 * GET a JSON document from EDGAR.
 *
 * @param {string} url Fully built request URL.
 * @param {string} userAgent Value sent in the `user-agent` header.
 * @param signal Optional abort signal forwarded to `fetch`.
 * @returns {Promise<unknown>} The parsed JSON body.
 * @throws {Error} `HTTP <status> <statusText>` (trimmed) when the response is not ok.
 */
async function fetchEdgar(url, userAgent, signal) {
    const headers = { "user-agent": userAgent, accept: "application/json" };
    const resp = await fetch(url, { signal, headers });
    if (resp.ok) {
        return resp.json();
    }
    const statusLine = `HTTP ${resp.status} ${resp.statusText}`;
    throw new Error(statusLine.trim());
}
|
|
50
|
+
/**
 * Convert an EDGAR full-text-search response into search results.
 *
 * A hit is usable when its `_id` has the form `<accession>:<filename>` and its
 * `_source.ciks` provides a CIK. Unusable hits are skipped and do not count
 * against `limit`, so up to `limit` results are returned whenever that many
 * usable hits exist. (Truncating to `limit` before validation made every
 * skipped hit shrink the result.)
 *
 * @param data Parsed response; anything without a `hits.hits` array yields [].
 * @param {number} limit Maximum number of results to return.
 * @returns {Array} Results with position, title, url, snippet and domain.
 */
function parse(data, limit) {
    const hits = data && typeof data === "object"
        ? data.hits?.hits
        : undefined;
    if (!Array.isArray(hits))
        return [];
    const out = [];
    for (const hit of hits) {
        if (out.length >= limit)
            break;
        const h = (hit ?? {});
        const src = (h._source ?? {});
        const id = String(h._id ?? "");
        const colon = id.indexOf(":");
        const accession = colon >= 0 ? id.slice(0, colon) : String(src.adsh ?? "");
        const filename = colon >= 0 ? id.slice(colon + 1) : "";
        // Leading zeros are removed from the CIK for the archive path.
        const cik = String(src.ciks?.[0] ?? "").replace(/^0+/, "");
        if (!cik || !accession || !filename)
            continue;
        // The accession number is used without its dashes as the folder name.
        const url = `${ARCHIVES}/${cik}/${accession.replace(/-/g, "")}/${filename}`;
        const display = collapse(String(src.display_names?.[0] ?? ""));
        // Display names look like "Company (TICKER) (CIK …)"; keep the part
        // before the first parenthesis.
        const company = display.split("(")[0].trim() || display || `CIK ${cik}`;
        const form = collapse(String(src.form ?? src.file_type ?? ""));
        const filed = collapse(String(src.file_date ?? ""));
        const period = collapse(String(src.period_ending ?? ""));
        const details = [
            form,
            filed && `filed ${filed}`,
            period && `period ${period}`,
        ].filter(Boolean);
        out.push({
            position: out.length + 1,
            title: [company, ...details].join(" · "),
            url,
            snippet: details.join(" · "),
            domain: safeDomain(url),
        });
    }
    return out;
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
export type { ArxivOptions } from "./arxiv.js";
|
|
2
|
+
export { arxiv } from "./arxiv.js";
|
|
3
|
+
export type { ClinicalTrialsOptions } from "./clinicaltrials.js";
|
|
4
|
+
export { clinicaltrials } from "./clinicaltrials.js";
|
|
5
|
+
export type { EdgarOptions } from "./edgar.js";
|
|
6
|
+
export { edgar } from "./edgar.js";
|
|
7
|
+
export type { OpenAlexOptions } from "./openalex.js";
|
|
8
|
+
export { openalex } from "./openalex.js";
|
|
9
|
+
export type { PubmedOptions } from "./pubmed.js";
|
|
10
|
+
export { pubmed } from "./pubmed.js";
|
|
11
|
+
export type { SemanticScholarOptions } from "./semantic-scholar.js";
|
|
12
|
+
export { semanticScholar } from "./semantic-scholar.js";
|
|
13
|
+
export type { WikipediaOptions } from "./wikipedia.js";
|
|
14
|
+
export { wikipedia } from "./wikipedia.js";
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
export { arxiv } from "./arxiv.js";
|
|
2
|
+
export { clinicaltrials } from "./clinicaltrials.js";
|
|
3
|
+
export { edgar } from "./edgar.js";
|
|
4
|
+
export { openalex } from "./openalex.js";
|
|
5
|
+
export { pubmed } from "./pubmed.js";
|
|
6
|
+
export { semanticScholar } from "./semantic-scholar.js";
|
|
7
|
+
export { wikipedia } from "./wikipedia.js";
|