@steel-dev/atlas 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +219 -0
- package/dist/agent.d.ts +34 -0
- package/dist/agent.js +133 -0
- package/dist/async.d.ts +19 -0
- package/dist/async.js +172 -0
- package/dist/atlas.d.ts +19 -0
- package/dist/atlas.js +69 -0
- package/dist/budget.d.ts +64 -0
- package/dist/budget.js +336 -0
- package/dist/checklist.d.ts +115 -0
- package/dist/checklist.js +297 -0
- package/dist/cli.js +38700 -0
- package/dist/config.d.ts +80 -0
- package/dist/config.js +109 -0
- package/dist/context.d.ts +26 -0
- package/dist/context.js +250 -0
- package/dist/custom-tools.d.ts +26 -0
- package/dist/custom-tools.js +33 -0
- package/dist/defaults.d.ts +10 -0
- package/dist/defaults.js +37 -0
- package/dist/economy.d.ts +12 -0
- package/dist/economy.js +6 -0
- package/dist/env.d.ts +1 -0
- package/dist/env.js +8 -0
- package/dist/errors.d.ts +6 -0
- package/dist/errors.js +11 -0
- package/dist/event-hub.d.ts +11 -0
- package/dist/event-hub.js +83 -0
- package/dist/events.d.ts +105 -0
- package/dist/events.js +1 -0
- package/dist/html-extract.d.ts +21 -0
- package/dist/html-extract.js +459 -0
- package/dist/index.d.ts +59 -0
- package/dist/index.js +26 -0
- package/dist/memory.d.ts +2 -0
- package/dist/memory.js +38 -0
- package/dist/model.d.ts +49 -0
- package/dist/model.js +630 -0
- package/dist/orchestrate.d.ts +5 -0
- package/dist/orchestrate.js +277 -0
- package/dist/pdf-extract.d.ts +5 -0
- package/dist/pdf-extract.js +20 -0
- package/dist/prompts.d.ts +2 -0
- package/dist/prompts.js +6 -0
- package/dist/providers/domain/arxiv.d.ts +6 -0
- package/dist/providers/domain/arxiv.js +83 -0
- package/dist/providers/domain/clinicaltrials.d.ts +6 -0
- package/dist/providers/domain/clinicaltrials.js +104 -0
- package/dist/providers/domain/edgar.d.ts +10 -0
- package/dist/providers/domain/edgar.js +92 -0
- package/dist/providers/domain/index.d.ts +14 -0
- package/dist/providers/domain/index.js +7 -0
- package/dist/providers/domain/openalex.d.ts +7 -0
- package/dist/providers/domain/openalex.js +128 -0
- package/dist/providers/domain/pubmed.d.ts +8 -0
- package/dist/providers/domain/pubmed.js +123 -0
- package/dist/providers/domain/semantic-scholar.d.ts +6 -0
- package/dist/providers/domain/semantic-scholar.js +112 -0
- package/dist/providers/domain/shared.d.ts +12 -0
- package/dist/providers/domain/shared.js +39 -0
- package/dist/providers/domain/wikipedia.d.ts +6 -0
- package/dist/providers/domain/wikipedia.js +71 -0
- package/dist/providers/exa-agent.d.ts +9 -0
- package/dist/providers/exa-agent.js +67 -0
- package/dist/providers/fetch.d.ts +66 -0
- package/dist/providers/fetch.js +675 -0
- package/dist/providers/parallel-agent.d.ts +11 -0
- package/dist/providers/parallel-agent.js +100 -0
- package/dist/providers/perplexity-agent.d.ts +17 -0
- package/dist/providers/perplexity-agent.js +86 -0
- package/dist/providers/search.d.ts +65 -0
- package/dist/providers/search.js +433 -0
- package/dist/providers/store.d.ts +48 -0
- package/dist/providers/store.js +217 -0
- package/dist/researcher.d.ts +20 -0
- package/dist/researcher.js +3 -0
- package/dist/robots.d.ts +16 -0
- package/dist/robots.js +146 -0
- package/dist/roles.d.ts +6 -0
- package/dist/roles.js +4 -0
- package/dist/run.d.ts +65 -0
- package/dist/run.js +371 -0
- package/dist/safe-dispatcher.d.ts +16 -0
- package/dist/safe-dispatcher.js +32 -0
- package/dist/safety.d.ts +23 -0
- package/dist/safety.js +206 -0
- package/dist/sandbox.d.ts +22 -0
- package/dist/sandbox.js +228 -0
- package/dist/search-normalize.d.ts +2 -0
- package/dist/search-normalize.js +13 -0
- package/dist/source-documents.d.ts +77 -0
- package/dist/source-documents.js +421 -0
- package/dist/sources.d.ts +57 -0
- package/dist/sources.js +1 -0
- package/dist/spine.d.ts +19 -0
- package/dist/spine.js +722 -0
- package/dist/state.d.ts +90 -0
- package/dist/state.js +27 -0
- package/dist/structured.d.ts +7 -0
- package/dist/structured.js +18 -0
- package/dist/tools.d.ts +33 -0
- package/dist/tools.js +1187 -0
- package/dist/trace-digest.d.ts +11 -0
- package/dist/trace-digest.js +309 -0
- package/dist/trace.d.ts +225 -0
- package/dist/trace.js +278 -0
- package/dist/trail.d.ts +15 -0
- package/dist/trail.js +74 -0
- package/dist/url.d.ts +1 -0
- package/dist/url.js +25 -0
- package/package.json +107 -0
|
@@ -0,0 +1,675 @@
|
|
|
1
|
+
import { sleep, withTimeout } from "../async.js";
|
|
2
|
+
import { readEnv } from "../env.js";
|
|
3
|
+
import { errorMessage } from "../errors.js";
|
|
4
|
+
import { htmlToMarkdown } from "../html-extract.js";
|
|
5
|
+
import { extractPdfText } from "../pdf-extract.js";
|
|
6
|
+
import { createRobotsCache } from "../robots.js";
|
|
7
|
+
import { guardRedirect as guardRedirectUrl } from "../safety.js";
|
|
8
|
+
import { extractionMetadataFromExa, extractionMetadataFromHtml, extractionMetadataFromPdf, extractionMetadataFromScrape, extractionMetadataFromText, } from "../source-documents.js";
|
|
9
|
+
/** Byte cap for direct responses treated as PDF (by URL suffix or content type): 25 MiB. */
const DIRECT_PDF_MAX_BYTES = 25 * 1024 ** 2;
/** Byte cap for every other direct response: 5 MiB. */
const DIRECT_HTML_MAX_BYTES = 5 * 1024 ** 2;
/** Per-hop timeout for a direct HTTP request, in milliseconds. */
const DIRECT_FETCH_TIMEOUT_MS = 15000;
/** Timeout handed to the Steel scrape call, in milliseconds. */
const SCRAPE_TIMEOUT_MS = 30000;
/** Extracted text shorter than this many characters is reported as thin content. */
const DIRECT_HTML_MIN_CHARS = 100;
/** Leading bytes that identify a PDF payload. */
const PDF_MAGIC = "%PDF";
/** User-Agent header sent with direct requests and given to the robots cache. */
const FETCH_USER_AGENT = "Mozilla/5.0 (compatible; AtlasResearchBot/0.2; +https://github.com/steel-experiments/atlas)";
/** Agent token given to the robots cache. */
const ROBOTS_AGENT_TOKEN = "atlasresearchbot";
/** Upper bound on Steel scrape attempts, counting the first one. */
const STEEL_RETRY_MAX_ATTEMPTS = 5;
|
|
18
|
+
/** Lower-case phrases that typically appear on anti-bot / challenge pages. */
const ANTI_BOT_MARKERS = [
    "just a moment",
    "verifying you are human",
    "checking your browser",
    "enable javascript and cookies",
    "access denied",
    "captcha",
    "pardon our interruption",
    "unusual traffic from your computer network",
];
/**
 * Reports whether the start of `text` contains any anti-bot marker phrase.
 *
 * Only the first 4000 characters are inspected, case-insensitively. The
 * window is cut BEFORE lower-casing so that a multi-megabyte document is not
 * lower-cased in full just to look at its head.
 *
 * @param text Text to inspect; empty, null or undefined yields `false`.
 * @returns `true` when a marker occurs inside the inspected window.
 */
export function looksBlocked(text) {
    if (!text)
        return false;
    const scanChars = 4000;
    const head = text.slice(0, scanChars).toLowerCase();
    return ANTI_BOT_MARKERS.some((marker) => head.includes(marker));
}
|
|
34
|
+
const BLOCK_MARKER_EXEMPT_CHARS = 2_000;
|
|
35
|
+
export function looksBlockedPage(markdown, raw) {
|
|
36
|
+
if (markdown.length >= BLOCK_MARKER_EXEMPT_CHARS)
|
|
37
|
+
return false;
|
|
38
|
+
return looksBlocked(markdown) || looksBlocked(raw);
|
|
39
|
+
}
|
|
40
|
+
/**
 * Builds the result object for an unsuccessful fetch attempt.
 *
 * @param method Identifier recorded on the attempt.
 * @param note Human-readable reason recorded on the attempt.
 * @param escalate Stored on the result; `fetchThroughChain` stops after the
 *   first provider when it fails with `escalate` false. Defaults to `true`.
 * @returns `{ ok: false, attempt, escalate }`.
 */
function failed(method, note, escalate = true) {
    const attempt = { method, ok: false, note };
    return { ok: false, attempt, escalate };
}
|
|
43
|
+
/**
 * Guesses from the URL alone whether it points at a PDF.
 *
 * A parseable URL is judged by its pathname ending in `.pdf`
 * (case-insensitive). A string that is not a valid absolute URL is judged by
 * a looser pattern: `.pdf` at the end or directly before `?` / `#`.
 *
 * @param url URL or URL-like string.
 * @returns `true` when the URL looks like a PDF link.
 */
export function isLikelyPdfUrl(url) {
    let pathname;
    try {
        pathname = new URL(url).pathname;
    }
    catch {
        return /\.pdf(?:$|[?#])/i.test(url);
    }
    return /\.pdf$/i.test(pathname);
}
|
|
52
|
+
const PDF_CONTENT_TYPE_RE = /\bapplication\/pdf\b/i;
const HTML_CONTENT_TYPE_RE = /\b(?:text\/html|application\/xhtml\+xml)\b/i;
const JSON_CONTENT_TYPE_RE = /\b(?:application|text)\/(?:[\w.+-]+\+)?json\b/i;
const XML_CONTENT_TYPE_RE = /\b(?:application|text)\/(?:[\w.+-]+\+)?xml\b/i;
const PLAIN_TEXT_CONTENT_TYPE_RE = /\btext\/(?:plain|csv|tab-separated-values|markdown)\b/i;
/** True when the Content-Type value names `application/pdf`. */
function isPdfContentType(contentType) {
    return PDF_CONTENT_TYPE_RE.test(contentType ?? "");
}
/**
 * True when the Content-Type value names HTML or XHTML.
 * A missing or empty value is treated as HTML.
 */
function isHtmlContentType(contentType) {
    return !contentType || HTML_CONTENT_TYPE_RE.test(contentType);
}
/** True for `application/json`, `text/json` and `+json` structured suffixes. */
function isJsonContentType(contentType) {
    return JSON_CONTENT_TYPE_RE.test(contentType ?? "");
}
/**
 * True for `application/xml`, `text/xml` and `+xml` structured suffixes.
 * Note that `application/xhtml+xml` also matches this pattern.
 */
function isXmlContentType(contentType) {
    return XML_CONTENT_TYPE_RE.test(contentType ?? "");
}
/** True for text/plain, text/csv, text/tab-separated-values and text/markdown. */
function isPlainTextContentType(contentType) {
    return PLAIN_TEXT_CONTENT_TYPE_RE.test(contentType ?? "");
}
/** True when the Content-Type is JSON, XML or one of the plain-text types. */
function isDirectTextContentType(contentType) {
    const predicates = [isJsonContentType, isXmlContentType, isPlainTextContentType];
    return predicates.some((matches) => matches(contentType));
}
|
|
74
|
+
function looksLikeDirectText(data) {
|
|
75
|
+
const prefix = new TextDecoder("utf-8", { fatal: false })
|
|
76
|
+
.decode(data.slice(0, 512))
|
|
77
|
+
.trimStart();
|
|
78
|
+
return (prefix.startsWith("{") ||
|
|
79
|
+
prefix.startsWith("[") ||
|
|
80
|
+
prefix.startsWith("<?xml") ||
|
|
81
|
+
/^PMID-|\b[A-Z]{2,}-\s/.test(prefix));
|
|
82
|
+
}
|
|
83
|
+
/**
 * Checks whether a payload begins with the PDF magic bytes.
 *
 * @param data Byte array exposing `byteLength` and `slice`.
 * @returns `true` when the leading bytes decode to `PDF_MAGIC`.
 */
function isPdfBytes(data) {
    const magicLength = PDF_MAGIC.length;
    if (data.byteLength < magicLength)
        return false;
    const head = new TextDecoder("ascii").decode(data.slice(0, magicLength));
    return head === PDF_MAGIC;
}
|
|
89
|
+
function readContentLength(headers) {
|
|
90
|
+
const raw = headers.get("content-length");
|
|
91
|
+
if (!raw)
|
|
92
|
+
return undefined;
|
|
93
|
+
const n = Number(raw);
|
|
94
|
+
return Number.isFinite(n) && n >= 0 ? n : undefined;
|
|
95
|
+
}
|
|
96
|
+
/**
 * Derives a fallback title from a URL.
 *
 * Preference order: the percent-decoded last non-empty path segment, then
 * the hostname, then the URL itself. If the URL cannot be parsed, or the
 * segment cannot be decoded, the input string is returned unchanged.
 *
 * @param url URL string.
 * @returns A non-structured title string.
 */
function titleFromUrl(url) {
    try {
        const { pathname, hostname } = new URL(url);
        const segments = pathname.split("/").filter(Boolean);
        const lastSegment = segments.length > 0 ? segments[segments.length - 1] : "";
        return decodeURIComponent(lastSegment) || hostname || url;
    }
    catch {
        return url;
    }
}
|
|
106
|
+
function normalizeDirectText(text, contentType) {
|
|
107
|
+
const trimmed = text.trim();
|
|
108
|
+
if (isJsonContentType(contentType) || /^[[{]/.test(trimmed)) {
|
|
109
|
+
try {
|
|
110
|
+
return {
|
|
111
|
+
markdown: JSON.stringify(JSON.parse(trimmed), null, 2),
|
|
112
|
+
method: "json_direct",
|
|
113
|
+
};
|
|
114
|
+
}
|
|
115
|
+
catch {
|
|
116
|
+
return { markdown: trimmed, method: "text_direct" };
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
if (isXmlContentType(contentType) || /^<\??xml\b/i.test(trimmed)) {
|
|
120
|
+
return { markdown: trimmed, method: "xml_direct" };
|
|
121
|
+
}
|
|
122
|
+
return { markdown: trimmed, method: "text_direct" };
|
|
123
|
+
}
|
|
124
|
+
/**
 * Creates the "basic" fetch provider, which performs direct HTTP fetches.
 *
 * The provider owns one robots cache and a per-host queue: requests to the
 * same host (lower-cased `URL.host`) run one after another in call order,
 * while different hosts proceed independently.
 *
 * @returns A provider object `{ id: "basic", fetch(req) }`. `fetch` resolves
 *   to a failure result (no escalation) when `req.url` is not a valid URL.
 */
export function basicFetch() {
    const robots = createRobotsCache({
        agentToken: ROBOTS_AGENT_TOKEN,
        userAgent: FETCH_USER_AGENT,
    });
    // host -> promise that settles once the last queued task for that host finished
    const domainTails = new Map();
    const ignore = () => { };
    function serialize(host, task) {
        const previous = domainTails.get(host) ?? Promise.resolve();
        // Run the task regardless of how the previous one ended.
        const result = previous.then(task, task);
        // The stored tail never rejects, so a failure cannot poison the queue.
        const tail = result.then(ignore, ignore);
        domainTails.set(host, tail);
        void tail.then(() => {
            // Drop the entry only if nothing newer was queued behind it.
            if (domainTails.get(host) === tail) {
                domainTails.delete(host);
            }
        });
        return result;
    }
    return {
        id: "basic",
        fetch(req) {
            let host;
            try {
                host = new URL(req.url).host.toLowerCase();
            }
            catch {
                return Promise.resolve(failed("direct_http", `bad_url: not a valid URL: ${req.url}`, false));
            }
            return serialize(host, () => directFetch(req, robots));
        },
    };
}
|
|
155
|
+
class ResponseTooLargeError extends Error {
|
|
156
|
+
}
|
|
157
|
+
async function readCappedBytes(response, maxBytes) {
|
|
158
|
+
const body = response.body;
|
|
159
|
+
if (!body) {
|
|
160
|
+
const buf = new Uint8Array(await response.arrayBuffer());
|
|
161
|
+
if (buf.byteLength > maxBytes)
|
|
162
|
+
throw new ResponseTooLargeError();
|
|
163
|
+
return buf;
|
|
164
|
+
}
|
|
165
|
+
const reader = body.getReader();
|
|
166
|
+
const chunks = [];
|
|
167
|
+
let total = 0;
|
|
168
|
+
try {
|
|
169
|
+
for (;;) {
|
|
170
|
+
const { done, value } = await reader.read();
|
|
171
|
+
if (done)
|
|
172
|
+
break;
|
|
173
|
+
if (!value)
|
|
174
|
+
continue;
|
|
175
|
+
total += value.byteLength;
|
|
176
|
+
if (total > maxBytes)
|
|
177
|
+
throw new ResponseTooLargeError();
|
|
178
|
+
chunks.push(value);
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
finally {
|
|
182
|
+
await reader.cancel().catch(() => { });
|
|
183
|
+
}
|
|
184
|
+
const out = new Uint8Array(total);
|
|
185
|
+
let offset = 0;
|
|
186
|
+
for (const chunk of chunks) {
|
|
187
|
+
out.set(chunk, offset);
|
|
188
|
+
offset += chunk.byteLength;
|
|
189
|
+
}
|
|
190
|
+
return out;
|
|
191
|
+
}
|
|
192
|
+
/** Number of redirects followed before giving up. */
const MAX_REDIRECT_HOPS = 5;
/**
 * Best-effort release of a response whose body will not be read, so the
 * underlying connection is not left waiting on an unconsumed stream.
 */
async function discardBody(response) {
    await response.body?.cancel().catch(() => { });
}
/**
 * Fetches a URL directly over HTTP and extracts its content.
 *
 * Flow:
 * 1. When the caller supplies no `guardRedirect`, the initial URL is checked
 *    with the module's default guard.
 * 2. Redirects are followed manually (at most `MAX_REDIRECT_HOPS`); every hop
 *    is checked against robots rules and every redirect target against the
 *    guard before it is requested.
 * 3. The body is read under a size cap (PDF cap for PDF-looking responses,
 *    HTML cap otherwise) and dispatched to the PDF, text or HTML extractor.
 *
 * Every early return releases the unread response body first.
 *
 * @param req `{ url, signal, guardRedirect, dispatcher }`. `signal` aborts the
 *   operation, `guardRedirect(url)` overrides the default redirect guard,
 *   `dispatcher` is forwarded to `fetch` and to the robots check.
 * @param robots Robots cache exposing `allows(url, signal, dispatcher)`.
 * @returns A success result carrying the page, or a failure result.
 * @throws Rethrows the underlying error when `signal` has been aborted.
 */
async function directFetch({ url, signal, guardRedirect, dispatcher }, robots) {
    if (!guardRedirect) {
        const initial = await guardRedirectUrl(url, {});
        if (!initial.ok) {
            return failed("direct_http", `blocked_url: fetch of ${url} blocked: ${initial.reason}`, false);
        }
    }
    let currentUrl = url;
    let response;
    for (let hop = 0;; hop++) {
        if (!(await robots.allows(currentUrl, signal, dispatcher))) {
            return failed("direct_http", "robots_disallowed: robots.txt disallows direct fetching of this URL");
        }
        try {
            // Each hop gets its own timeout, combined with the caller's signal.
            const timeout = AbortSignal.timeout(DIRECT_FETCH_TIMEOUT_MS);
            response = await fetch(currentUrl, {
                redirect: "manual",
                signal: signal ? AbortSignal.any([signal, timeout]) : timeout,
                headers: {
                    accept: "application/pdf,*/*;q=0.8",
                    "user-agent": FETCH_USER_AGENT,
                },
                ...(dispatcher ? { dispatcher } : {}),
            });
        }
        catch (err) {
            if (signal?.aborted)
                throw err;
            return failed("direct_http", `network_error: direct fetch failed: ${errorMessage(err)}`);
        }
        const location = response.headers.get("location");
        if (response.status >= 300 && response.status < 400 && location) {
            await discardBody(response);
            if (hop >= MAX_REDIRECT_HOPS) {
                return failed("direct_http", `too_many_redirects: gave up after ${MAX_REDIRECT_HOPS} redirects`);
            }
            let next;
            try {
                next = new URL(location, currentUrl).toString();
            }
            catch {
                return failed("direct_http", `bad_redirect: invalid redirect location: ${location}`, false);
            }
            const verdict = guardRedirect
                ? await guardRedirect(next)
                : await guardRedirectUrl(next, {});
            if (!verdict.ok) {
                return failed("direct_http", `blocked_redirect: redirect to ${next} blocked: ${verdict.reason}`, false);
            }
            currentUrl = next;
            continue;
        }
        break;
    }
    if (!response.ok) {
        // The error body is never read; release it like the redirect path does.
        await discardBody(response);
        return failed("direct_http", `http_error: direct fetch returned HTTP ${response.status}`);
    }
    const contentType = response.headers.get("content-type") ?? undefined;
    const contentLength = readContentLength(response.headers);
    const maxBytes = isLikelyPdfUrl(currentUrl) || isPdfContentType(contentType)
        ? DIRECT_PDF_MAX_BYTES
        : DIRECT_HTML_MAX_BYTES;
    if (contentLength !== undefined && contentLength > maxBytes) {
        // Rejected on the declared size alone; do not leave the body pending.
        await discardBody(response);
        return failed("direct_http", `too_large: direct response is too large (${contentLength} bytes)`, false);
    }
    let data;
    try {
        data = await readCappedBytes(response, maxBytes);
    }
    catch (err) {
        if (signal?.aborted)
            throw err;
        if (err instanceof ResponseTooLargeError) {
            return failed("direct_http", `too_large: direct response exceeded ${maxBytes} bytes`, false);
        }
        return failed("direct_http", `network_error: reading response failed: ${errorMessage(err)}`);
    }
    const finalUrl = response.url || currentUrl;
    if (isPdfBytes(data) || isPdfContentType(contentType)) {
        return extractPdf(data, contentType, finalUrl, signal);
    }
    // With no content type, sniff for JSON/XML/tagged text before the
    // HTML branch, which accepts a missing content type.
    if (!contentType && looksLikeDirectText(data)) {
        return extractText(data, contentType, finalUrl);
    }
    if (isHtmlContentType(contentType)) {
        return extractHtml(data, contentType, finalUrl);
    }
    if (isDirectTextContentType(contentType) || looksLikeDirectText(data)) {
        return extractText(data, contentType, finalUrl);
    }
    return failed("direct_http", contentType
        ? `unsupported_content_type: direct response was ${contentType}`
        : "unsupported_content_type: direct response was not HTML, PDF, JSON, XML, or text", false);
}
|
|
287
|
+
/**
 * Converts a directly fetched HTML payload into a page result.
 *
 * The bytes are decoded leniently as UTF-8 and converted to markdown. The
 * attempt fails when the page looks like an anti-bot interstitial or when
 * the extracted markdown is shorter than `DIRECT_HTML_MIN_CHARS`.
 *
 * @param data Response body bytes.
 * @param contentType Content-Type header value, if any.
 * @param finalUrl URL the content was ultimately served from.
 * @returns A success result with the page, or a failure result.
 */
function extractHtml(data, contentType, finalUrl) {
    const html = new TextDecoder("utf-8", { fatal: false }).decode(data);
    const extracted = htmlToMarkdown(html, finalUrl);
    const chars = extracted.markdown.length;
    if (looksBlockedPage(extracted.markdown, html)) {
        return failed("html_direct", "blocked_or_challenge: direct HTML looked blocked");
    }
    if (chars < DIRECT_HTML_MIN_CHARS) {
        return failed("html_direct", `thin_content: direct HTML extracted ${chars} chars`);
    }
    const attempt = {
        method: "html_direct",
        ok: true,
        note: `html_direct: extracted ${chars} text chars`,
    };
    const metadata = extractionMetadataFromHtml({
        markdownChars: chars,
        ...(contentType ? { contentType } : {}),
        finalUrl,
        attempts: [attempt],
        discoveredLinks: extracted.links,
        pageMetadata: extracted.metadata,
    });
    const page = {
        finalUrl,
        title: extracted.title,
        markdown: extracted.markdown,
        renderedWith: "html_direct",
        metadata,
    };
    return { ok: true, attempt, page };
}
|
|
320
|
+
/** Time allowed for PDF text extraction, in milliseconds. */
const PDF_PARSE_TIMEOUT_MS = 30000;
/**
 * Extracts text from a directly fetched PDF and wraps it as a page result.
 *
 * The page title is derived from the URL. The attempt fails when extraction
 * yields no text or throws.
 *
 * @param data PDF bytes.
 * @param contentType Content-Type header value, if any.
 * @param finalUrl URL the PDF was ultimately served from.
 * @param signal Optional abort signal forwarded to the timeout wrapper.
 * @returns A success result with the page, or a failure result.
 * @throws Rethrows the underlying error when `signal` has been aborted, in
 *   line with the other abort-aware catch blocks in this module.
 */
async function extractPdf(data, contentType, finalUrl, signal) {
    try {
        const extracted = await withTimeout(PDF_PARSE_TIMEOUT_MS, signal, "pdf_parse", () => extractPdfText(data));
        const markdown = extracted.text.trim();
        if (!markdown) {
            return failed("pdf_direct", "pdf_no_text: PDF extraction produced no text");
        }
        const attempt = {
            method: "pdf_direct",
            ok: true,
            note: `pdf_direct: extracted ${markdown.length} text chars`,
        };
        return {
            ok: true,
            attempt,
            page: {
                finalUrl,
                title: titleFromUrl(finalUrl),
                markdown,
                renderedWith: "pdf_direct",
                metadata: extractionMetadataFromPdf({
                    markdownChars: markdown.length,
                    ...(contentType ? { contentType } : {}),
                    finalUrl,
                    attempts: [attempt],
                }),
            },
        };
    }
    catch (err) {
        // A caller abort is not a parse failure; surface it instead of
        // reporting a misleading pdf_parse_error.
        if (signal?.aborted)
            throw err;
        return failed("pdf_direct", `pdf_parse_error: PDF extraction failed: ${errorMessage(err)}`);
    }
}
|
|
354
|
+
/**
 * Wraps a directly fetched JSON / XML / plain-text payload as a page result.
 *
 * The bytes are decoded leniently as UTF-8 and normalized; the resulting
 * method label (`json_direct`, `xml_direct` or `text_direct`) is recorded on
 * the attempt and the page. The attempt fails when the content looks like an
 * anti-bot page or is shorter than `DIRECT_HTML_MIN_CHARS`. The page title is
 * derived from the URL.
 *
 * @param data Response body bytes.
 * @param contentType Content-Type header value, if any.
 * @param finalUrl URL the content was ultimately served from.
 * @returns A success result with the page, or a failure result.
 */
function extractText(data, contentType, finalUrl) {
    const decoded = new TextDecoder("utf-8", { fatal: false }).decode(data);
    const { markdown: normalized, method } = normalizeDirectText(decoded, contentType);
    const markdown = normalized.trim();
    const chars = markdown.length;
    if (looksBlockedPage(markdown, decoded)) {
        return failed("text_direct", "blocked_or_challenge: direct text looked blocked");
    }
    if (chars < DIRECT_HTML_MIN_CHARS) {
        return failed(method, `thin_content: direct text extracted ${chars} chars`);
    }
    const attempt = {
        method,
        ok: true,
        note: `${method}: extracted ${chars} text chars`,
    };
    const metadata = extractionMetadataFromText({
        markdownChars: chars,
        method,
        ...(contentType ? { contentType } : {}),
        finalUrl,
        attempts: [attempt],
    });
    const page = {
        finalUrl,
        title: titleFromUrl(finalUrl),
        markdown,
        renderedWith: method,
        metadata,
    };
    return { ok: true, attempt, page };
}
|
|
387
|
+
function parseRetryAfterSeconds(err) {
|
|
388
|
+
const status = err?.status;
|
|
389
|
+
const message = errorMessage(err);
|
|
390
|
+
if (status !== 429 &&
|
|
391
|
+
!/(rate limit exceeded|too many requests)/i.test(message)) {
|
|
392
|
+
return null;
|
|
393
|
+
}
|
|
394
|
+
const headers = err?.headers;
|
|
395
|
+
let headerValue;
|
|
396
|
+
if (headers && typeof headers.get === "function") {
|
|
397
|
+
headerValue =
|
|
398
|
+
headers.get("retry-after") ??
|
|
399
|
+
undefined;
|
|
400
|
+
}
|
|
401
|
+
if (headerValue) {
|
|
402
|
+
const numeric = Number(headerValue);
|
|
403
|
+
if (Number.isFinite(numeric) && numeric > 0)
|
|
404
|
+
return Math.ceil(numeric);
|
|
405
|
+
}
|
|
406
|
+
const match = /try again in\s+(\d+(?:\.\d+)?)\s*seconds?/i.exec(message);
|
|
407
|
+
if (match)
|
|
408
|
+
return Math.ceil(Number(match[1]));
|
|
409
|
+
return 15;
|
|
410
|
+
}
|
|
411
|
+
/**
 * Creates the "steel" fetch provider, which scrapes pages through Steel.
 *
 * The API key comes from `opts.apiKey` or the `ATLAS_STEEL_API_KEY` /
 * `STEEL_API_KEY` environment variables; the base URL from `opts.baseUrl` or
 * `ATLAS_STEEL_BASE_URL` / `STEEL_BASE_URL`. The client is created lazily on
 * first use and shared by later calls. PDF-looking URLs request both HTML
 * and markdown; other URLs request HTML only. `opts.proxy` defaults to `true`.
 *
 * @param opts Optional `{ apiKey, baseUrl, proxy }`.
 * @returns A provider object `{ id: "steel", fetch(req) }`. Scrape failures
 *   become non-escalating failure results, unless the request signal was
 *   aborted, in which case the error is rethrown.
 * @throws {Error} When no API key can be found.
 */
export function steel(opts = {}) {
    const apiKey = opts.apiKey ?? readEnv("ATLAS_STEEL_API_KEY", "STEEL_API_KEY");
    if (!apiKey) {
        throw new Error("steel() requires an apiKey (or set ATLAS_STEEL_API_KEY / STEEL_API_KEY)");
    }
    const baseUrl = opts.baseUrl ?? readEnv("ATLAS_STEEL_BASE_URL", "STEEL_BASE_URL");
    let clientPromise = null;
    function client() {
        if (clientPromise === null || clientPromise === undefined) {
            clientPromise = createSteelClient(apiKey, baseUrl);
        }
        return clientPromise;
    }
    return {
        id: "steel",
        async fetch({ url, signal, onRateLimit }) {
            const scrapeOnce = async () => {
                const steelClient = await client();
                const request = {
                    url,
                    format: isLikelyPdfUrl(url) ? ["html", "markdown"] : ["html"],
                    useProxy: opts.proxy ?? true,
                };
                return steelClient.scrape(request, { signal, timeout: SCRAPE_TIMEOUT_MS });
            };
            let response;
            try {
                response = await withSteelRetry(scrapeOnce, signal, onRateLimit);
            }
            catch (err) {
                if (signal?.aborted)
                    throw err;
                return failed("steel_scrape", `network_error: steel scrape failed: ${errorMessage(err)}`, false);
            }
            return steelAttemptFromResponse(response, url);
        },
    };
}
|
|
442
|
+
/**
 * Turns a Steel scrape response into a fetch result.
 *
 * A reported status code of 400 or above fails immediately. Otherwise the
 * HTML (converted to markdown) is preferred; if it is missing, looks blocked
 * or is too thin, the response's own markdown is tried under the same
 * checks. The final URL is the response's canonical URL when present, else
 * the requested URL. All failures are non-escalating.
 *
 * @param response Scrape response (`metadata`, `content.html`, `content.markdown`).
 * @param url The URL that was requested.
 * @returns A success result with the page, or a failure result.
 */
export function steelAttemptFromResponse(response, url) {
    const status = response.metadata?.statusCode;
    if (status !== undefined && status >= 400) {
        return failed("steel_scrape", `http_error: steel scrape returned HTTP ${status}`, false);
    }
    const finalUrl = response.metadata?.canonical || url;
    // Shared success shape; `describe` receives the attempt so it can be
    // embedded in the metadata input.
    const succeed = (title, text, describe) => {
        const attempt = {
            method: "steel_scrape",
            ok: true,
            note: `steel_scrape: extracted ${text.length} text chars`,
        };
        return {
            ok: true,
            attempt,
            page: {
                finalUrl,
                title,
                markdown: text,
                renderedWith: "steel_scrape",
                metadata: extractionMetadataFromScrape(describe(attempt)),
            },
        };
    };
    const html = response.content?.html;
    if (html) {
        const extracted = htmlToMarkdown(html, finalUrl);
        const usable = !looksBlockedPage(extracted.markdown, html) &&
            extracted.markdown.length >= DIRECT_HTML_MIN_CHARS;
        if (usable) {
            return succeed(extracted.title, extracted.markdown, (attempt) => ({
                markdownChars: extracted.markdown.length,
                contentType: "text/html",
                finalUrl,
                attempts: [attempt],
                discoveredLinks: extracted.links,
                pageMetadata: extracted.metadata,
            }));
        }
    }
    const markdown = response.content?.markdown?.trim();
    const markdownUsable = markdown &&
        !looksBlockedPage(markdown) &&
        markdown.length >= DIRECT_HTML_MIN_CHARS;
    if (markdownUsable) {
        const title = response.metadata?.title?.trim() || titleFromUrl(finalUrl);
        return succeed(title, markdown, (attempt) => ({
            markdownChars: markdown.length,
            finalUrl,
            attempts: [attempt],
        }));
    }
    return failed("steel_scrape", "empty_content: steel scrape returned no usable content", false);
}
|
|
505
|
+
/**
 * Loads `steel-sdk` on demand and constructs a client from its default export.
 *
 * The client's own retries are disabled (`maxRetries: 0`).
 *
 * @param apiKey Steel API key.
 * @param baseUrl Optional API base URL.
 * @returns The constructed client.
 */
async function createSteelClient(apiKey, baseUrl) {
    const sdk = await import("steel-sdk");
    const Steel = sdk.default;
    const settings = {
        steelAPIKey: apiKey,
        baseURL: baseUrl,
        maxRetries: 0,
    };
    return new Steel(settings);
}
|
|
513
|
+
async function withSteelRetry(request, signal, onRateLimit) {
|
|
514
|
+
for (let attempt = 1;; attempt++) {
|
|
515
|
+
try {
|
|
516
|
+
return await request();
|
|
517
|
+
}
|
|
518
|
+
catch (err) {
|
|
519
|
+
const retryAfterSeconds = parseRetryAfterSeconds(err);
|
|
520
|
+
if (!retryAfterSeconds || attempt >= STEEL_RETRY_MAX_ATTEMPTS) {
|
|
521
|
+
throw err;
|
|
522
|
+
}
|
|
523
|
+
onRateLimit?.(retryAfterSeconds);
|
|
524
|
+
await sleep((retryAfterSeconds + 1) * 1000, signal);
|
|
525
|
+
}
|
|
526
|
+
}
|
|
527
|
+
}
|
|
528
|
+
/** Exa text shorter than this many characters is reported as thin content. */
const EXA_CONTENTS_MIN_CHARS = 200;
/**
 * Creates the "exa_contents" fetch provider, which asks Exa's `/contents`
 * endpoint for the text of a URL.
 *
 * The API key comes from `opts.apiKey` or the `ATLAS_EXA_API_KEY` /
 * `EXA_API_KEY` environment variables; the base URL defaults to
 * `https://api.exa.ai` (trailing slashes are stripped). A missing key is not
 * an error at construction time: each fetch then returns a failure result.
 *
 * @param opts Optional `{ apiKey, baseUrl }`.
 * @returns A provider object `{ id: "exa_contents", fetch(req) }`. Transport,
 *   HTTP and decoding problems become escalating failure results; when
 *   `req.signal` has been aborted the underlying error is rethrown instead.
 */
export function exaContents(opts = {}) {
    const apiKey = opts.apiKey ?? readEnv("ATLAS_EXA_API_KEY", "EXA_API_KEY");
    const endpoint = `${(opts.baseUrl ?? "https://api.exa.ai").replace(/\/+$/, "")}/contents`;
    return {
        id: "exa_contents",
        async fetch(req) {
            if (!apiKey) {
                return failed("exa_contents", "exa_contents: no Exa API key", true);
            }
            let resp;
            try {
                resp = await fetch(endpoint, {
                    method: "POST",
                    signal: req.signal ?? null,
                    headers: { "content-type": "application/json", "x-api-key": apiKey },
                    body: JSON.stringify({
                        urls: [req.url],
                        text: true,
                        livecrawl: "fallback",
                    }),
                    ...(req.dispatcher ? { dispatcher: req.dispatcher } : {}),
                });
            }
            catch (err) {
                if (req.signal?.aborted)
                    throw err;
                return failed("exa_contents", `exa_contents: ${errorMessage(err)}`, true);
            }
            if (!resp.ok) {
                // The error body is not used; release it so the connection is freed.
                await resp.body?.cancel().catch(() => { });
                return failed("exa_contents", `exa_contents: HTTP ${resp.status}`, true);
            }
            let data;
            try {
                data = (await resp.json());
            }
            catch (err) {
                // An abort while the body is still streaming is not "bad JSON".
                if (req.signal?.aborted)
                    throw err;
                return failed("exa_contents", `exa_contents: bad JSON: ${errorMessage(err)}`, true);
            }
            // Tolerate unexpected payload shapes (e.g. JSON `null`, non-string
            // text) by treating them as empty content rather than throwing.
            const row = data?.results?.[0];
            const markdown = (typeof row?.text === "string" ? row.text : "").trim();
            if (looksBlockedPage(markdown)) {
                return failed("exa_contents", "blocked_or_challenge: exa contents looked blocked");
            }
            if (markdown.length < EXA_CONTENTS_MIN_CHARS) {
                return failed("exa_contents", `thin_content: exa contents returned ${markdown.length} chars`);
            }
            const finalUrl = row?.url ?? req.url;
            const attempt = {
                method: "exa_contents",
                ok: true,
                note: `exa_contents: extracted ${markdown.length} text chars`,
            };
            return {
                ok: true,
                attempt,
                page: {
                    finalUrl,
                    title: row?.title ?? null,
                    markdown,
                    renderedWith: "exa_contents",
                    metadata: extractionMetadataFromExa({
                        markdownChars: markdown.length,
                        finalUrl,
                        attempts: [attempt],
                    }),
                },
            };
        },
    };
}
|
|
599
|
+
/**
 * Builds the default provider chain from the environment.
 *
 * The basic direct-HTTP provider always comes first. The Exa provider is
 * appended when `ATLAS_EXA_API_KEY` / `EXA_API_KEY` is set, and the Steel
 * provider after it when `ATLAS_STEEL_API_KEY` / `STEEL_API_KEY` is set.
 *
 * @returns The ordered list of providers.
 */
export function defaultFetchProviders() {
    const keyed = [
        { primary: "ATLAS_EXA_API_KEY", fallback: "EXA_API_KEY", create: () => exaContents() },
        { primary: "ATLAS_STEEL_API_KEY", fallback: "STEEL_API_KEY", create: () => steel() },
    ];
    const providers = [basicFetch()];
    for (const { primary, fallback, create } of keyed) {
        if (readEnv(primary, fallback)) {
            providers.push(create());
        }
    }
    return providers;
}
|
|
609
|
+
const FETCH_PROVIDER_TIMEOUT_MS = 120_000;
|
|
610
|
+
const FETCH_CHAIN_TIMEOUT_MS = 180_000;
|
|
611
|
+
const CHAIN_GOOD_CHARS = 600;
|
|
612
|
+
function pageScore(page) {
|
|
613
|
+
const len = page.markdown.trim().length;
|
|
614
|
+
return looksBlockedPage(page.markdown) ? len : len + 1_000_000;
|
|
615
|
+
}
|
|
616
|
+
function pageIsGood(page) {
|
|
617
|
+
return (!looksBlockedPage(page.markdown) &&
|
|
618
|
+
page.markdown.trim().length >= CHAIN_GOOD_CHARS);
|
|
619
|
+
}
|
|
620
|
+
/**
 * Fetches a URL through an ordered chain of providers and returns the best page.
 *
 * The first provider runs alone. If it yields a good page, that page is
 * returned. If it fails without asking to escalate, the chain stops with no
 * page. Otherwise all remaining providers run concurrently and the
 * highest-scoring successful page (including a mediocre first page) wins.
 *
 * The whole chain shares one deadline (`FETCH_CHAIN_TIMEOUT_MS`); each
 * provider gets at most `FETCH_PROVIDER_TIMEOUT_MS` of what is left. A
 * provider that throws is recorded as a `timeout:` failure, unless
 * `req.signal` has been aborted, in which case the error propagates.
 *
 * @param chain Ordered provider list (`{ id, fetch(req) }`).
 * @param req Request forwarded to each provider, with `signal` replaced by
 *   the per-provider signal.
 * @returns `{ page, attempts }`; `page` is `null` when nothing succeeded. On
 *   success the attempts list holds the recorded failures followed by the
 *   chosen page's own attempts, and is mirrored into the page metadata.
 */
export async function fetchThroughChain(chain, req) {
    const attempts = [];
    const deadlineAt = Date.now() + FETCH_CHAIN_TIMEOUT_MS;
    async function runProvider(provider) {
        const remainingMs = deadlineAt - Date.now();
        if (remainingMs <= 0) {
            return failed(provider.id, "timeout: fetch chain deadline exhausted before this provider ran", false);
        }
        const budgetMs = Math.min(FETCH_PROVIDER_TIMEOUT_MS, remainingMs);
        try {
            return await withTimeout(budgetMs, req.signal, provider.id, (signal) => provider.fetch({ ...req, signal }));
        }
        catch (err) {
            if (req.signal?.aborted)
                throw err;
            return failed(provider.id, `timeout: ${errorMessage(err)}`);
        }
    }
    function finalize(page) {
        const failures = attempts.filter((a) => !a.ok);
        const merged = [...failures, ...(page.metadata.attempts ?? [])];
        const metadata = { ...page.metadata, attempts: merged };
        return { page: { ...page, metadata }, attempts: merged };
    }
    if (chain.length === 0) {
        return { page: null, attempts };
    }
    const first = await runProvider(chain[0]);
    attempts.push(first.attempt);
    const firstPage = first.ok ? first.page : null;
    if (firstPage && pageIsGood(firstPage)) {
        return finalize(firstPage);
    }
    if (!first.ok && !first.escalate) {
        return { page: null, attempts };
    }
    const rest = chain.slice(1);
    const candidates = firstPage ? [firstPage] : [];
    if (rest.length > 0) {
        const settled = await Promise.allSettled(rest.map((p) => runProvider(p)));
        for (const outcome of settled) {
            if (outcome.status !== "fulfilled") {
                // runProvider only rejects when the caller aborted.
                if (req.signal?.aborted)
                    throw outcome.reason;
                continue;
            }
            const result = outcome.value;
            attempts.push(result.attempt);
            if (result.ok) {
                candidates.push(result.page);
            }
        }
    }
    if (candidates.length === 0) {
        return { page: null, attempts };
    }
    let best = candidates[0];
    for (const candidate of candidates.slice(1)) {
        if (pageScore(candidate) > pageScore(best)) {
            best = candidate;
        }
    }
    return finalize(best);
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { Researcher } from "../researcher.js";
|
|
2
|
+
/**
 * Options accepted by `parallelAgent`. Every field is optional.
 *
 * NOTE(review): only the declarations are visible here; the per-field notes
 * below are inferred from the names and must be confirmed against the
 * implementation in parallel-agent.js.
 */
export interface ParallelAgentOptions {
    /** API key — presumably for the Parallel service; TODO confirm source/fallback. */
    apiKey?: string;
    /** Base URL override — presumably for the service endpoint; TODO confirm default. */
    baseUrl?: string;
    /** Processor identifier — valid values are not visible here; TODO confirm. */
    processor?: string;
    /** Free-text description — presumably surfaced on the resulting researcher; TODO confirm. */
    description?: string;
    /** Timeout, by its name in milliseconds; TODO confirm what it bounds. */
    timeoutMs?: number;
    /** Polling interval, by its name in milliseconds; TODO confirm. */
    pollIntervalMs?: number;
    /** Cost per run, by its name in US dollars; TODO confirm how it is used. */
    costPerRunUSD?: number;
}
/** Declared to return a `Researcher` built from the optional settings. */
export declare function parallelAgent(opts?: ParallelAgentOptions): Researcher;
|