@steel-dev/atlas 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +219 -0
- package/dist/agent.d.ts +34 -0
- package/dist/agent.js +133 -0
- package/dist/async.d.ts +19 -0
- package/dist/async.js +172 -0
- package/dist/atlas.d.ts +19 -0
- package/dist/atlas.js +69 -0
- package/dist/budget.d.ts +64 -0
- package/dist/budget.js +336 -0
- package/dist/checklist.d.ts +115 -0
- package/dist/checklist.js +297 -0
- package/dist/cli.js +38700 -0
- package/dist/config.d.ts +80 -0
- package/dist/config.js +109 -0
- package/dist/context.d.ts +26 -0
- package/dist/context.js +250 -0
- package/dist/custom-tools.d.ts +26 -0
- package/dist/custom-tools.js +33 -0
- package/dist/defaults.d.ts +10 -0
- package/dist/defaults.js +37 -0
- package/dist/economy.d.ts +12 -0
- package/dist/economy.js +6 -0
- package/dist/env.d.ts +1 -0
- package/dist/env.js +8 -0
- package/dist/errors.d.ts +6 -0
- package/dist/errors.js +11 -0
- package/dist/event-hub.d.ts +11 -0
- package/dist/event-hub.js +83 -0
- package/dist/events.d.ts +105 -0
- package/dist/events.js +1 -0
- package/dist/html-extract.d.ts +21 -0
- package/dist/html-extract.js +459 -0
- package/dist/index.d.ts +59 -0
- package/dist/index.js +26 -0
- package/dist/memory.d.ts +2 -0
- package/dist/memory.js +38 -0
- package/dist/model.d.ts +49 -0
- package/dist/model.js +630 -0
- package/dist/orchestrate.d.ts +5 -0
- package/dist/orchestrate.js +277 -0
- package/dist/pdf-extract.d.ts +5 -0
- package/dist/pdf-extract.js +20 -0
- package/dist/prompts.d.ts +2 -0
- package/dist/prompts.js +6 -0
- package/dist/providers/domain/arxiv.d.ts +6 -0
- package/dist/providers/domain/arxiv.js +83 -0
- package/dist/providers/domain/clinicaltrials.d.ts +6 -0
- package/dist/providers/domain/clinicaltrials.js +104 -0
- package/dist/providers/domain/edgar.d.ts +10 -0
- package/dist/providers/domain/edgar.js +92 -0
- package/dist/providers/domain/index.d.ts +14 -0
- package/dist/providers/domain/index.js +7 -0
- package/dist/providers/domain/openalex.d.ts +7 -0
- package/dist/providers/domain/openalex.js +128 -0
- package/dist/providers/domain/pubmed.d.ts +8 -0
- package/dist/providers/domain/pubmed.js +123 -0
- package/dist/providers/domain/semantic-scholar.d.ts +6 -0
- package/dist/providers/domain/semantic-scholar.js +112 -0
- package/dist/providers/domain/shared.d.ts +12 -0
- package/dist/providers/domain/shared.js +39 -0
- package/dist/providers/domain/wikipedia.d.ts +6 -0
- package/dist/providers/domain/wikipedia.js +71 -0
- package/dist/providers/exa-agent.d.ts +9 -0
- package/dist/providers/exa-agent.js +67 -0
- package/dist/providers/fetch.d.ts +66 -0
- package/dist/providers/fetch.js +675 -0
- package/dist/providers/parallel-agent.d.ts +11 -0
- package/dist/providers/parallel-agent.js +100 -0
- package/dist/providers/perplexity-agent.d.ts +17 -0
- package/dist/providers/perplexity-agent.js +86 -0
- package/dist/providers/search.d.ts +65 -0
- package/dist/providers/search.js +433 -0
- package/dist/providers/store.d.ts +48 -0
- package/dist/providers/store.js +217 -0
- package/dist/researcher.d.ts +20 -0
- package/dist/researcher.js +3 -0
- package/dist/robots.d.ts +16 -0
- package/dist/robots.js +146 -0
- package/dist/roles.d.ts +6 -0
- package/dist/roles.js +4 -0
- package/dist/run.d.ts +65 -0
- package/dist/run.js +371 -0
- package/dist/safe-dispatcher.d.ts +16 -0
- package/dist/safe-dispatcher.js +32 -0
- package/dist/safety.d.ts +23 -0
- package/dist/safety.js +206 -0
- package/dist/sandbox.d.ts +22 -0
- package/dist/sandbox.js +228 -0
- package/dist/search-normalize.d.ts +2 -0
- package/dist/search-normalize.js +13 -0
- package/dist/source-documents.d.ts +77 -0
- package/dist/source-documents.js +421 -0
- package/dist/sources.d.ts +57 -0
- package/dist/sources.js +1 -0
- package/dist/spine.d.ts +19 -0
- package/dist/spine.js +722 -0
- package/dist/state.d.ts +90 -0
- package/dist/state.js +27 -0
- package/dist/structured.d.ts +7 -0
- package/dist/structured.js +18 -0
- package/dist/tools.d.ts +33 -0
- package/dist/tools.js +1187 -0
- package/dist/trace-digest.d.ts +11 -0
- package/dist/trace-digest.js +309 -0
- package/dist/trace.d.ts +225 -0
- package/dist/trace.js +278 -0
- package/dist/trail.d.ts +15 -0
- package/dist/trail.js +74 -0
- package/dist/url.d.ts +1 -0
- package/dist/url.js +25 -0
- package/package.json +107 -0
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
import { normalizeUrlForSource } from "./url.js";
|
|
2
|
+
/** Upper bound on the number of markdown characters kept by `storeMarkdown`. */
const STORED_MARKDOWN_CAP = 500_000;
/** Width, in characters, of each chunk produced by `createChunks`. */
const SOURCE_CHUNK_CHARS = 12_000;
/** Maximum number of discovered links listed on a discovery-page source card. */
const DISCOVERY_LINK_LIMIT = 20;
/** Default length, in characters, of the head-of-document preview on a source card. */
const SOURCE_CARD_PREVIEW_CHARS = 700;
/** A document must be longer than this for goal-ranked passages to be added to its card. */
const GOAL_PASSAGE_MIN_DOC_CHARS = 3_000;
/** Number of goal-ranked passages requested for a source card. */
const GOAL_PASSAGE_COUNT = 2;
/** Cap on the head preview length when goal-ranked passages are shown as well. */
const GOAL_HEAD_PREVIEW_CHARS = 300;
|
|
9
|
+
/**
 * Splits `markdown` into consecutive, non-overlapping ranges of at most
 * SOURCE_CHUNK_CHARS characters.
 *
 * @param markdown Text to partition.
 * @returns Array of `{ index, start, end }` ranges (`end` exclusive). An empty
 *   input still yields a single zero-length chunk so callers always have a
 *   chunk 0.
 */
function createChunks(markdown) {
    const total = markdown.length;
    if (total === 0) {
        return [{ index: 0, start: 0, end: 0 }];
    }
    const count = Math.ceil(total / SOURCE_CHUNK_CHARS);
    return Array.from({ length: count }, (_, index) => {
        const start = index * SOURCE_CHUNK_CHARS;
        return {
            index,
            start,
            end: Math.min(total, start + SOURCE_CHUNK_CHARS),
        };
    });
}
|
|
23
|
+
/**
 * Assembles a source document record from already-extracted markdown.
 *
 * @param url URL the content was requested from.
 * @param title Document title.
 * @param markdown Markdown text to store (already capped by the caller, if at all).
 * @param metadata Extraction metadata attached unchanged.
 * @param originalChars Character count before any capping; compared with
 *   `markdown.length` to derive the `truncated` flag.
 * @param sourceId Identifier assigned to this source.
 * @param canonicalUrl Canonical form of the URL; defaults to
 *   `normalizeUrlForSource(url)`.
 * @returns The document, including its chunk table.
 */
export function createSourceDocument(url, title, markdown, metadata, originalChars, sourceId, canonicalUrl = normalizeUrlForSource(url)) {
    const storedChars = markdown.length;
    const truncated = originalChars > storedChars;
    const chunks = createChunks(markdown);
    return {
        sourceId,
        url,
        canonicalUrl,
        title,
        markdown,
        originalChars,
        storedChars,
        truncated,
        metadata,
        chunks,
    };
}
|
|
37
|
+
/**
 * Builds the extraction metadata record shared by all `extractionMetadataFrom*`
 * helpers.
 *
 * `markdownChars`, `method` and `extractionNotes` are always present (the
 * first two may be `undefined`). Every other field is copied only when it
 * carries a value: truthy for scalars, non-empty for lists, and
 * `!== undefined` for `jsonLd`.
 *
 * @param opts Options bag; page-level fields are read from `opts.pageMetadata`.
 * @returns A plain object whose keys appear in a fixed order.
 */
function buildExtractionMetadata(opts) {
    const page = opts.pageMetadata ?? {};
    // Collected as ordered [key, value] pairs so the key order of the result is explicit.
    const entries = [
        ["markdownChars", opts.markdownChars],
        ["method", opts.method],
    ];
    for (const key of ["contentType", "finalUrl"]) {
        if (opts[key]) {
            entries.push([key, opts[key]]);
        }
    }
    for (const key of ["attempts", "qualityWarnings", "discoveredLinks"]) {
        const list = opts[key];
        if (list && list.length > 0) {
            entries.push([key, list]);
        }
    }
    const pageFields = [
        "canonical",
        "author",
        "articleAuthor",
        "publishedTime",
        "modifiedTime",
        "description",
        "language",
    ];
    for (const key of pageFields) {
        if (page[key]) {
            entries.push([key, page[key]]);
        }
    }
    // jsonLd is kept even when falsy (e.g. null); only `undefined` means "absent".
    if (page.jsonLd !== undefined) {
        entries.push(["jsonLd", page.jsonLd]);
    }
    entries.push(["extractionNotes", [opts.leadNote, ...(opts.notes ?? [])]]);
    return Object.fromEntries(entries);
}
|
|
64
|
+
/**
 * Extraction metadata for content obtained by direct PDF text extraction.
 * Any `method` or `leadNote` present in `opts` is overridden.
 */
export function extractionMetadataFromPdf(opts) {
    const method = "pdf_direct";
    const leadNote = "Fetched with direct PDF text extraction.";
    return buildExtractionMetadata({ ...opts, method, leadNote });
}
|
|
71
|
+
/**
 * Extraction metadata for content obtained by direct text extraction.
 * Only `leadNote` is overridden; `method` is whatever `opts` carries, if anything.
 */
export function extractionMetadataFromText(opts) {
    const leadNote = "Fetched with direct text extraction.";
    return buildExtractionMetadata({ ...opts, leadNote });
}
|
|
77
|
+
/**
 * Extraction metadata for content added by a custom tool.
 * Only `opts.markdownChars` and `opts.toolName` are read; every other option
 * is ignored.
 */
export function extractionMetadataFromCustomTool(opts) {
    const { markdownChars, toolName } = opts;
    return buildExtractionMetadata({
        markdownChars,
        method: "custom_tool",
        leadNote: `Added by the "${toolName}" custom tool.`,
    });
}
|
|
84
|
+
/**
 * Extraction metadata for content fetched with the "scrape_proxy" method.
 * Any `method` or `leadNote` present in `opts` is overridden.
 */
export function extractionMetadataFromScrape(opts) {
    const method = "scrape_proxy";
    const leadNote = "Fetched server-side with Steel scrape through the residential proxy.";
    return buildExtractionMetadata({ ...opts, method, leadNote });
}
|
|
91
|
+
/**
 * Extraction metadata for content obtained by direct HTML text extraction.
 * Any `method` or `leadNote` present in `opts` is overridden.
 */
export function extractionMetadataFromHtml(opts) {
    const method = "html_direct";
    const leadNote = "Fetched with direct HTML text extraction.";
    return buildExtractionMetadata({ ...opts, method, leadNote });
}
|
|
98
|
+
/**
 * Extraction metadata for content fetched with the "exa_contents" method.
 * Any `method` or `leadNote` present in `opts` is overridden.
 */
export function extractionMetadataFromExa(opts) {
    const method = "exa_contents";
    const leadNote = "Fetched via the Exa /contents API.";
    return buildExtractionMetadata({ ...opts, method, leadNote });
}
|
|
105
|
+
/**
 * Caps markdown at STORED_MARKDOWN_CAP characters for storage.
 *
 * @param markdown Full extracted markdown.
 * @returns `markdown` (cut to the cap when longer), the pre-cap length as
 *   `originalChars`, and whether a cut happened as `truncated`.
 */
export function storeMarkdown(markdown) {
    const originalChars = markdown.length;
    const truncated = originalChars > STORED_MARKDOWN_CAP;
    return {
        markdown: truncated ? markdown.slice(0, STORED_MARKDOWN_CAP) : markdown,
        originalChars,
        truncated,
    };
}
|
|
119
|
+
/**
 * Renders a document's source card as pretty-printed JSON (2-space indent).
 * No goal is forwarded to `sourceCardData`, so the card never includes
 * goal-ranked passages.
 *
 * @param document Source document to summarise.
 * @param previewChars Preview length; defaults to SOURCE_CARD_PREVIEW_CHARS.
 */
export function formatSourceCard(document, previewChars = SOURCE_CARD_PREVIEW_CHARS) {
    const card = sourceCardData(document, previewChars);
    return JSON.stringify(card, null, 2);
}
|
|
122
|
+
/**
 * Builds the plain-object "source card" summarising a stored document.
 *
 * Optional sections are emitted only when they have content:
 * - `source_quality` when the document has quality warnings;
 * - `extraction` when `metadata.method` is set;
 * - `discovery` when any warning starts with "search_listing_page";
 * - `preview` when the computed preview length is positive;
 * - `relevant_passages` when a non-blank `goal` is given, the document is
 *   longer than GOAL_PASSAGE_MIN_DOC_CHARS, and ranking finds matches. In that
 *   case the head preview is also capped at GOAL_HEAD_PREVIEW_CHARS.
 *
 * @param document Source document to summarise.
 * @param previewChars Requested preview length; defaults to SOURCE_CARD_PREVIEW_CHARS.
 * @param goal Optional free-text goal used to rank passages within this document.
 * @returns The card object; key order is fixed because callers serialise it.
 */
export function sourceCardData(document, previewChars = SOURCE_CARD_PREVIEW_CHARS, goal) {
    // Spread helper: contributes the produced fields only when `present` is truthy.
    const when = (present, produce) => (present ? produce() : {});
    const { metadata } = document;
    const qualityWarnings = metadata.qualityWarnings ?? [];
    const hasWarnings = qualityWarnings.length > 0;
    const isDiscoveryPage = qualityWarnings.some((warning) => warning.startsWith("search_listing_page"));
    const wantsPassages = Boolean(goal?.trim()) &&
        document.markdown.length > GOAL_PASSAGE_MIN_DOC_CHARS;
    const passages = wantsPassages
        ? rankSourcePassages([document], goal, GOAL_PASSAGE_COUNT)
        : [];
    const hasPassages = passages.length > 0;
    const headChars = hasPassages
        ? Math.min(previewChars, GOAL_HEAD_PREVIEW_CHARS)
        : previewChars;
    const previewEnd = Math.min(document.markdown.length, Math.max(0, Math.floor(headChars)));
    const buildExtraction = () => ({
        extraction: {
            method: metadata.method,
            ...when(metadata.contentType, () => ({ content_type: metadata.contentType })),
            ...when(metadata.finalUrl, () => ({ final_url: metadata.finalUrl })),
            ...when(metadata.attempts && metadata.attempts.length > 0, () => ({
                attempts: metadata.attempts,
            })),
            ...when(hasWarnings, () => ({ quality_warnings: qualityWarnings })),
            notes: metadata.extractionNotes,
        },
    });
    const buildDiscovery = () => ({
        discovery: {
            source_kind: "discovery_page",
            links: (metadata.discoveredLinks ?? []).slice(0, DISCOVERY_LINK_LIMIT),
        },
    });
    const buildPassages = () => ({
        relevant_passages: passages.map(({ chunkIndex, start, end, snippet }) => ({
            chunk_index: chunkIndex,
            start,
            end,
            snippet,
        })),
    });
    return {
        source_id: document.sourceId,
        title: document.title,
        url: document.url,
        canonical_url: document.canonicalUrl,
        ...when(hasWarnings, () => ({ source_quality: { warnings: qualityWarnings } })),
        ...when(metadata.method, buildExtraction),
        ...when(isDiscoveryPage, buildDiscovery),
        source_length_chars: document.markdown.length,
        stored_chars: document.storedChars,
        original_chars: document.originalChars,
        truncated: document.truncated,
        chunk_count: document.chunks.length,
        chunk_chars: SOURCE_CHUNK_CHARS,
        ...when(previewEnd > 0, () => ({ preview: document.markdown.slice(0, previewEnd) })),
        ...when(hasPassages, buildPassages),
        raw_access: "Stored as a source document. Use search_sources to find relevant passages across stored sources, and read_source to read a chunk or quote an exact span.",
    };
}
|
|
192
|
+
/**
 * Finds the chunk that contains character offset `start`.
 *
 * A chunk matches when `start` lies in `[chunk.start, chunk.end)`, or when
 * `start` is at or past the start of a chunk that ends exactly at the end of
 * the document (so offsets at or beyond the end map to that final chunk).
 * When nothing matches, the last chunk is used; when there are no chunks at
 * all, a synthetic zero-length chunk at `start` is returned.
 *
 * @param document Document with `markdown` and `chunks`.
 * @param start Character offset to locate.
 */
function chunkForRange(document, start) {
    for (const candidate of document.chunks) {
        if (start < candidate.start) {
            continue;
        }
        if (start < candidate.end || candidate.end === document.markdown.length) {
            return candidate;
        }
    }
    const last = document.chunks[document.chunks.length - 1];
    if (!last) {
        return { index: 0, start, end: start };
    }
    return last;
}
|
|
199
|
+
/**
 * Renders one chunk of a document as pretty-printed JSON (2-space indent).
 *
 * @param document Source document.
 * @param chunkIndex Position of the chunk in `document.chunks`.
 * @returns JSON with source identity, the chunk's range, the indexes of the
 *   neighbouring chunks (`null` at either edge) and the chunk text; or an
 *   "Error: ..." string when no chunk exists at `chunkIndex`.
 */
export function formatSourceChunk(document, chunkIndex) {
    const chunk = document.chunks[chunkIndex];
    if (!chunk) {
        return `Error: source ${document.sourceId} has no chunk ${chunkIndex}.`;
    }
    const { index, start, end } = chunk;
    const hasPrevious = index > 0;
    const hasNext = index + 1 < document.chunks.length;
    const payload = {
        source_id: document.sourceId,
        title: document.title,
        url: document.url,
        canonical_url: document.canonicalUrl,
        chunk: {
            index,
            start,
            end,
            previous_chunk: hasPrevious ? index - 1 : null,
            next_chunk: hasNext ? index + 1 : null,
        },
        content: document.markdown.slice(start, end),
    };
    return JSON.stringify(payload, null, 2);
}
|
|
220
|
+
/** Width, in characters, of each sliding search window. */
const SEARCH_WINDOW_CHARS = 900;
/** Number of characters consecutive search windows share. */
const SEARCH_WINDOW_OVERLAP = 200;
/** BM25 term-frequency saturation parameter (k1). */
const BM25_K1 = 1.5;
/** BM25 length-normalisation parameter (b). */
const BM25_B = 0.75;
/** Per-document cache of computed search windows, keyed by the document object itself. */
const searchWindowCache = new WeakMap();
|
|
225
|
+
/**
 * Returns the overlapping search windows of a document, computing them on
 * first use and caching them per document object.
 *
 * Windows are SEARCH_WINDOW_CHARS wide and advance by
 * SEARCH_WINDOW_CHARS - SEARCH_WINDOW_OVERLAP. Each carries its range, the
 * lower-cased text, and `len`: the number of letter/digit runs it contains.
 * An empty document yields one empty window.
 *
 * Because the cache is keyed by object identity, changes to
 * `document.markdown` after the first call are not picked up.
 */
function windowsForDocument(document) {
    const hit = searchWindowCache.get(document);
    if (hit) {
        return hit;
    }
    const text = document.markdown;
    const stride = SEARCH_WINDOW_CHARS - SEARCH_WINDOW_OVERLAP;
    const windows = [];
    let start = 0;
    while (start < text.length) {
        const end = Math.min(start + SEARCH_WINDOW_CHARS, text.length);
        const slice = text.slice(start, end);
        const tokens = slice.match(/[\p{L}\p{N}]+/gu);
        windows.push({
            start,
            end,
            lower: slice.toLowerCase(),
            len: tokens ? tokens.length : 0,
        });
        // Stop once a window reaches the end so no trailing overlap-only window is emitted.
        if (end >= text.length) {
            break;
        }
        start += stride;
    }
    if (windows.length === 0) {
        windows.push({ start: 0, end: 0, lower: "", len: 0 });
    }
    searchWindowCache.set(document, windows);
    return windows;
}
|
|
246
|
+
/**
 * Ranks search windows across `documents` against `query` with a BM25-style
 * score and returns the best passages.
 *
 * Terms are matched as lower-cased substrings of each window. Document
 * frequency and average length are computed over the pooled non-empty windows
 * of all given documents. Ties are broken by source id, then by window start.
 *
 * @param documents Iterable of source documents to search.
 * @param query Free-text query; yields no results when it has no usable terms.
 * @param maxResults Upper bound on returned passages (passed to `slice`).
 * @returns Passage records with source identity, chunk index, range, score
 *   (rounded to 3 decimals) and trimmed snippet, best first.
 */
export function rankSourcePassages(documents, query, maxResults) {
    const terms = searchTerms(query);
    if (terms.length === 0) {
        return [];
    }
    const pool = [];
    for (const document of documents) {
        const usable = windowsForDocument(document).filter((window) => window.lower.length > 0);
        for (const window of usable) {
            pool.push({ document, window });
        }
    }
    const total = pool.length;
    if (total === 0) {
        return [];
    }
    let lengthSum = 0;
    for (const entry of pool) {
        lengthSum += entry.window.len;
    }
    const avgdl = lengthSum / total || 1;
    // Term frequencies are counted once per (window, term) and reused for both
    // the document-frequency tally and the score.
    const frequencies = pool.map(({ window }) => terms.map((term) => countOccurrences(window.lower, term)));
    const idf = terms.map((_, t) => {
        let n = 0;
        for (const row of frequencies) {
            if (row[t] > 0) {
                n++;
            }
        }
        return Math.log(1 + (total - n + 0.5) / (n + 0.5));
    });
    const scored = [];
    for (let row = 0; row < pool.length; row++) {
        const { document, window } = pool[row];
        const dl = window.len || 1;
        let score = 0;
        for (let t = 0; t < terms.length; t++) {
            const f = frequencies[row][t];
            if (f === 0) {
                continue;
            }
            score +=
                (idf[t] * (f * (BM25_K1 + 1))) /
                    (f + BM25_K1 * (1 - BM25_B + (BM25_B * dl) / avgdl));
        }
        if (score > 0) {
            scored.push({ document, window, score });
        }
    }
    scored.sort((a, b) => b.score - a.score ||
        a.document.sourceId.localeCompare(b.document.sourceId) ||
        a.window.start - b.window.start);
    return scored.slice(0, maxResults).map(({ document, window, score }) => {
        const { start, end } = window;
        return {
            sourceId: document.sourceId,
            title: document.title,
            url: document.url,
            canonicalUrl: document.canonicalUrl,
            chunkIndex: chunkForRange(document, start).index,
            start,
            end,
            score: Number(score.toFixed(3)),
            snippet: document.markdown.slice(start, end).trim(),
        };
    });
}
|
|
301
|
+
/**
 * Searches stored documents and renders the ranked matches as pretty-printed
 * JSON (2-space indent).
 *
 * @param documents Source documents to search.
 * @param query Free-text query.
 * @param maxResults Upper bound on the number of matches.
 * @returns JSON `{ query, result_count, matches }`, or an "Error: ..." string
 *   when the query contains no usable search terms.
 */
export function searchSourceDocuments(documents, query, maxResults) {
    const hasTerms = searchTerms(query).length > 0;
    if (!hasTerms) {
        return "Error: search_sources requires a non-empty `query`.";
    }
    // Converts a ranked passage to the snake_case shape used in tool output.
    const toMatch = ({ sourceId, title, url, canonicalUrl, chunkIndex, start, end, score, snippet }) => ({
        source_id: sourceId,
        title,
        url,
        canonical_url: canonicalUrl,
        chunk_index: chunkIndex,
        start,
        end,
        score,
        snippet,
    });
    const matches = rankSourcePassages(documents, query, maxResults).map(toMatch);
    return JSON.stringify({ query, result_count: matches.length, matches }, null, 2);
}
|
|
318
|
+
/** Separator inserted between non-adjacent chunks in an extraction window. */
const EXTRACTION_GAP_MARKER = "\n\n[…]\n\n";
|
|
319
|
+
/**
 * Chooses at most `maxChars` characters of a document to hand to extraction.
 *
 * Documents that already fit are returned whole. Otherwise chunk 0 is always
 * taken, then chunks are added in descending order of query-term score
 * (occurrences weighted by term length) while they fit the budget. If the
 * query had terms but nothing beyond one chunk was selected, chunks are added
 * in document order instead. Selected chunks are emitted in document order
 * with EXTRACTION_GAP_MARKER between non-adjacent ones, and the result is cut
 * to `maxChars`.
 *
 * @param document Source document with `markdown` and `chunks`.
 * @param query Free-text query used to score chunks.
 * @param maxChars Character budget for the returned text.
 * @returns `{ text, truncated }`; `truncated` is true whenever the document
 *   did not fit whole.
 */
export function selectExtractionWindow(document, query, maxChars) {
    const markdown = document.markdown;
    if (markdown.length <= maxChars) {
        return { text: markdown, truncated: false };
    }
    const chunks = document.chunks;
    const terms = searchTerms(query);
    const scoreOf = (chunk) => {
        const lowered = markdown.slice(chunk.start, chunk.end).toLowerCase();
        return terms.reduce((sum, term) => sum + countOccurrences(lowered, term) * Math.max(1, term.length), 0);
    };
    const ranked = chunks
        .map((chunk) => ({ chunk, score: scoreOf(chunk) }))
        .sort((a, b) => b.score - a.score);
    const picked = new Set();
    let used = 0;
    // Adds a chunk unless it is already picked, missing, or would overflow the
    // budget. The very first chunk is accepted even when it alone exceeds it.
    const tryTake = (index) => {
        const chunk = chunks[index];
        if (picked.has(index) || !chunk) {
            return;
        }
        const size = chunk.end - chunk.start;
        const overBudget = picked.size > 0 && used + size > maxChars;
        if (overBudget) {
            return;
        }
        picked.add(index);
        used += size;
    };
    tryTake(0);
    for (const { chunk, score } of ranked) {
        // Ranked list is descending, so the first zero score ends the useful part.
        if (used >= maxChars || score === 0) {
            break;
        }
        tryTake(chunk.index);
    }
    if (picked.size <= 1 && terms.length > 0) {
        for (const chunk of chunks) {
            if (used >= maxChars) {
                break;
            }
            tryTake(chunk.index);
        }
    }
    const pieces = [];
    let previous = -1;
    for (const index of [...picked].sort((a, b) => a - b)) {
        const chunk = chunks[index];
        if (!chunk) {
            continue;
        }
        if (previous !== -1 && index !== previous + 1) {
            pieces.push(EXTRACTION_GAP_MARKER);
        }
        pieces.push(markdown.slice(chunk.start, chunk.end));
        previous = index;
    }
    const joined = pieces.join("");
    return {
        text: joined.length > maxChars ? joined.slice(0, maxChars) : joined,
        truncated: true,
    };
}
|
|
379
|
+
/**
 * Extracts de-duplicated, lower-cased search terms from a free-text query.
 *
 * Double-quoted phrases are kept whole (trimmed). The remaining text is split
 * on anything that is not a letter, digit, `_` or `-`, and pieces shorter than
 * two characters are dropped. Quoted phrases come first in the result.
 *
 * @param query Free-text query. A non-string value (for example a missing
 *   argument) produces no terms instead of throwing, so callers can report an
 *   empty query uniformly.
 * @returns Array of unique terms; empty when nothing usable was found.
 */
function searchTerms(query) {
    if (typeof query !== "string") {
        return [];
    }
    const quoted = [];
    for (const match of query.matchAll(/"([^"]+)"/g)) {
        const phrase = match[1]?.trim().toLowerCase();
        if (phrase) {
            quoted.push(phrase);
        }
    }
    const unquoted = query
        .replace(/"[^"]+"/g, " ")
        .split(/[^\p{L}\p{N}_-]+/u)
        .map((term) => term.trim().toLowerCase())
        .filter((term) => term.length >= 2);
    return [...new Set([...quoted, ...unquoted])];
}
|
|
390
|
+
/**
 * Counts non-overlapping occurrences of `needle` in `haystack`, scanning left
 * to right.
 *
 * @param haystack Text to search.
 * @param needle Substring to count.
 * @returns Number of occurrences; 0 for an empty needle.
 */
function countOccurrences(haystack, needle) {
    // An empty needle matches at every index without advancing the cursor,
    // which would otherwise loop forever.
    if (needle.length === 0) {
        return 0;
    }
    let count = 0;
    for (let at = haystack.indexOf(needle); at !== -1; at = haystack.indexOf(needle, at + needle.length)) {
        count++;
    }
    return count;
}
|
|
401
|
+
/**
 * Quotes the exact span `[start, end)` of a document as pretty-printed JSON
 * (2-space indent).
 *
 * Offsets are floored, then `start` is raised to 0 and `end` lowered to the
 * document length. The finiteness check runs on these clamped values, so a
 * NaN offset is rejected while an `end` of Infinity is clamped to the
 * document end.
 *
 * @param document Source document.
 * @param start Start offset (inclusive).
 * @param end End offset (exclusive).
 * @returns JSON with source identity, the clamped range, the index of the
 *   chunk containing the start, and the quoted text; or an "Error: ..." string
 *   when the offsets are not finite or the clamped range is empty.
 */
export function quoteSource(document, start, end) {
    const safeStart = Math.max(0, Math.floor(start));
    const safeEnd = Math.min(document.markdown.length, Math.floor(end));
    const bothFinite = Number.isFinite(safeStart) && Number.isFinite(safeEnd);
    if (!bothFinite) {
        return "Error: read_source start/end must be finite numbers.";
    }
    if (safeEnd <= safeStart) {
        return "Error: read_source end must be greater than start.";
    }
    const { index: chunkIndex } = chunkForRange(document, safeStart);
    const payload = {
        source_id: document.sourceId,
        title: document.title,
        url: document.url,
        canonical_url: document.canonicalUrl,
        start: safeStart,
        end: safeEnd,
        chunk_index: chunkIndex,
        quote: document.markdown.slice(safeStart, safeEnd),
    };
    return JSON.stringify(payload, null, 2);
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
 * Lightweight reference to a source by URL and title.
 * NOTE(review): presumably describes a source that was fetched during a run;
 * no usage is visible in this declaration file.
 */
export interface FetchedSource {
    /** URL of the source. */
    url: string;
    /** Title of the source. */
    title: string;
    /** Identifier of the stored source document, when one exists. */
    sourceId?: string;
    /** Canonical form of `url`, when known. */
    canonicalUrl?: string;
}
|
|
7
|
+
/**
 * Pattern for quality-warning codes.
 * NOTE(review): the name suggests a match marks a source as unusable for
 * evidence; confirm against the callers.
 */
export declare const NON_EVIDENCE_WARNINGS: RegExp;
|
|
8
|
+
/**
 * Lightweight reference to a source by URL and title; structurally identical
 * to `FetchedSource`.
 * NOTE(review): presumably describes a source cited in a report; no usage is
 * visible in this declaration file.
 */
export interface CitedSource {
    /** URL of the source. */
    url: string;
    /** Title of the source. */
    title: string;
    /** Identifier of the stored source document, when one exists. */
    sourceId?: string;
    /** Canonical form of `url`, when known. */
    canonicalUrl?: string;
}
|
|
14
|
+
/**
 * Metadata describing how a source document's markdown was obtained, plus
 * optional page-level details. Only `markdownChars` and `extractionNotes` are
 * required.
 */
export interface SourceExtractionMetadata {
    /** Character count of the extracted markdown. */
    markdownChars: number;
    /** Human-readable notes about the extraction. */
    extractionNotes: string[];
    /** Identifier of the extraction method (e.g. "html_direct", "pdf_direct"). */
    method?: string;
    /** Content type reported for the fetched resource. */
    contentType?: string;
    /** Final URL of the fetch, when recorded. */
    finalUrl?: string;
    /** Individual extraction attempts. */
    attempts?: SourceExtractionAttempt[];
    /** Warning codes about the quality of the extracted content. */
    qualityWarnings?: string[];
    /** Links discovered on the page. */
    discoveredLinks?: SourceDiscoveredLink[];
    /** Canonical URL declared by the page. */
    canonical?: string;
    /** Page author. */
    author?: string;
    /** Article author. */
    articleAuthor?: string;
    /** Published timestamp, as a string. */
    publishedTime?: string;
    /** Last-modified timestamp, as a string. */
    modifiedTime?: string;
    /** Page description. */
    description?: string;
    /** Page language. */
    language?: string;
    /** JSON-LD data from the page; shape is not constrained. */
    jsonLd?: unknown;
}
|
|
32
|
+
/** Record of one attempt to extract a source. */
export interface SourceExtractionAttempt {
    /** Identifier of the extraction method that was tried. */
    method: string;
    /** Whether the attempt succeeded. */
    ok: boolean;
    /** Free-text note about the attempt. */
    note: string;
}
|
|
37
|
+
/** A link found on a fetched page. */
export interface SourceDiscoveredLink {
    /** Target URL of the link. */
    url: string;
    /** Title for the link, when available. */
    title?: string;
}
|
|
41
|
+
/** A stored source: its identity, markdown text, size bookkeeping and chunk table. */
export interface SourceDocument {
    /** Identifier assigned to this source. */
    sourceId: string;
    /** URL the content was requested from. */
    url: string;
    /** Canonical form of the URL. */
    canonicalUrl: string;
    /** Document title. */
    title: string;
    /** Stored markdown text. */
    markdown: string;
    /** Character count of the content before any capping. */
    originalChars: number;
    /** Character count of the stored `markdown`. */
    storedChars: number;
    /** True when the stored markdown is shorter than the original content. */
    truncated: boolean;
    /** How the content was extracted, plus page-level details. */
    metadata: SourceExtractionMetadata;
    /** Consecutive ranges partitioning `markdown`. */
    chunks: SourceChunk[];
}
|
|
53
|
+
/** One contiguous range of a source document's markdown. */
export interface SourceChunk {
    /** Position of the chunk within the document's chunk list. */
    index: number;
    /** Start offset into the markdown (inclusive). */
    start: number;
    /** End offset into the markdown (exclusive). */
    end: number;
}
|
package/dist/sources.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Case-insensitive pattern matching any of the warning codes
 * `blocked_or_challenge`, `thin_content`, `error_page` or
 * `search_listing_page` as a whole word.
 * NOTE(review): the name suggests a match marks a source as unusable for
 * evidence; confirm against the callers.
 */
export const NON_EVIDENCE_WARNINGS = /\b(?:blocked_or_challenge|thin_content|error_page|search_listing_page)\b/i;
|
package/dist/spine.d.ts
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { type BudgetGrant } from "./budget.js";
|
|
2
|
+
import type { Citation } from "./events.js";
|
|
3
|
+
import type { RunCtx } from "./state.js";
|
|
4
|
+
/**
 * Result produced by `runSpine`.
 * NOTE(review): field descriptions below are inferred from names and types
 * only; the implementation is not visible here.
 */
export interface SpineOutput {
    /** Report text. */
    report: string;
    /** Accompanying note text. */
    note: string;
    /** Citations attached to the report. */
    citations: Citation[];
    /** Citation strings that were not bound (presumably to a known source; confirm). */
    unboundCitations: string[];
    /** Optional list of sources, each with its URL, title and a `via` label. */
    sources?: {
        url: string;
        title: string;
        via: string;
        /** Character count for the source, when recorded. */
        chars?: number;
    }[];
    /** Optional warning messages. */
    warnings?: string[];
}
|
|
17
|
+
/**
 * Runs the spine for a run context and resolves with its output.
 * NOTE(review): behaviour is not visible from this declaration.
 *
 * @param rctx Run context.
 * @param opts Options; `meter` is the budget grant for this call.
 * @returns Promise resolving to the `SpineOutput`.
 */
export declare function runSpine(rctx: RunCtx, opts: {
    meter: BudgetGrant;
}): Promise<SpineOutput>;
|