@steel-dev/atlas 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +219 -0
- package/dist/agent.d.ts +34 -0
- package/dist/agent.js +133 -0
- package/dist/async.d.ts +19 -0
- package/dist/async.js +172 -0
- package/dist/atlas.d.ts +19 -0
- package/dist/atlas.js +69 -0
- package/dist/budget.d.ts +64 -0
- package/dist/budget.js +336 -0
- package/dist/checklist.d.ts +115 -0
- package/dist/checklist.js +297 -0
- package/dist/cli.js +38700 -0
- package/dist/config.d.ts +80 -0
- package/dist/config.js +109 -0
- package/dist/context.d.ts +26 -0
- package/dist/context.js +250 -0
- package/dist/custom-tools.d.ts +26 -0
- package/dist/custom-tools.js +33 -0
- package/dist/defaults.d.ts +10 -0
- package/dist/defaults.js +37 -0
- package/dist/economy.d.ts +12 -0
- package/dist/economy.js +6 -0
- package/dist/env.d.ts +1 -0
- package/dist/env.js +8 -0
- package/dist/errors.d.ts +6 -0
- package/dist/errors.js +11 -0
- package/dist/event-hub.d.ts +11 -0
- package/dist/event-hub.js +83 -0
- package/dist/events.d.ts +105 -0
- package/dist/events.js +1 -0
- package/dist/html-extract.d.ts +21 -0
- package/dist/html-extract.js +459 -0
- package/dist/index.d.ts +59 -0
- package/dist/index.js +26 -0
- package/dist/memory.d.ts +2 -0
- package/dist/memory.js +38 -0
- package/dist/model.d.ts +49 -0
- package/dist/model.js +630 -0
- package/dist/orchestrate.d.ts +5 -0
- package/dist/orchestrate.js +277 -0
- package/dist/pdf-extract.d.ts +5 -0
- package/dist/pdf-extract.js +20 -0
- package/dist/prompts.d.ts +2 -0
- package/dist/prompts.js +6 -0
- package/dist/providers/domain/arxiv.d.ts +6 -0
- package/dist/providers/domain/arxiv.js +83 -0
- package/dist/providers/domain/clinicaltrials.d.ts +6 -0
- package/dist/providers/domain/clinicaltrials.js +104 -0
- package/dist/providers/domain/edgar.d.ts +10 -0
- package/dist/providers/domain/edgar.js +92 -0
- package/dist/providers/domain/index.d.ts +14 -0
- package/dist/providers/domain/index.js +7 -0
- package/dist/providers/domain/openalex.d.ts +7 -0
- package/dist/providers/domain/openalex.js +128 -0
- package/dist/providers/domain/pubmed.d.ts +8 -0
- package/dist/providers/domain/pubmed.js +123 -0
- package/dist/providers/domain/semantic-scholar.d.ts +6 -0
- package/dist/providers/domain/semantic-scholar.js +112 -0
- package/dist/providers/domain/shared.d.ts +12 -0
- package/dist/providers/domain/shared.js +39 -0
- package/dist/providers/domain/wikipedia.d.ts +6 -0
- package/dist/providers/domain/wikipedia.js +71 -0
- package/dist/providers/exa-agent.d.ts +9 -0
- package/dist/providers/exa-agent.js +67 -0
- package/dist/providers/fetch.d.ts +66 -0
- package/dist/providers/fetch.js +675 -0
- package/dist/providers/parallel-agent.d.ts +11 -0
- package/dist/providers/parallel-agent.js +100 -0
- package/dist/providers/perplexity-agent.d.ts +17 -0
- package/dist/providers/perplexity-agent.js +86 -0
- package/dist/providers/search.d.ts +65 -0
- package/dist/providers/search.js +433 -0
- package/dist/providers/store.d.ts +48 -0
- package/dist/providers/store.js +217 -0
- package/dist/researcher.d.ts +20 -0
- package/dist/researcher.js +3 -0
- package/dist/robots.d.ts +16 -0
- package/dist/robots.js +146 -0
- package/dist/roles.d.ts +6 -0
- package/dist/roles.js +4 -0
- package/dist/run.d.ts +65 -0
- package/dist/run.js +371 -0
- package/dist/safe-dispatcher.d.ts +16 -0
- package/dist/safe-dispatcher.js +32 -0
- package/dist/safety.d.ts +23 -0
- package/dist/safety.js +206 -0
- package/dist/sandbox.d.ts +22 -0
- package/dist/sandbox.js +228 -0
- package/dist/search-normalize.d.ts +2 -0
- package/dist/search-normalize.js +13 -0
- package/dist/source-documents.d.ts +77 -0
- package/dist/source-documents.js +421 -0
- package/dist/sources.d.ts +57 -0
- package/dist/sources.js +1 -0
- package/dist/spine.d.ts +19 -0
- package/dist/spine.js +722 -0
- package/dist/state.d.ts +90 -0
- package/dist/state.js +27 -0
- package/dist/structured.d.ts +7 -0
- package/dist/structured.js +18 -0
- package/dist/tools.d.ts +33 -0
- package/dist/tools.js +1187 -0
- package/dist/trace-digest.d.ts +11 -0
- package/dist/trace-digest.js +309 -0
- package/dist/trace.d.ts +225 -0
- package/dist/trace.js +278 -0
- package/dist/trail.d.ts +15 -0
- package/dist/trail.js +74 -0
- package/dist/url.d.ts +1 -0
- package/dist/url.js +25 -0
- package/package.json +107 -0
package/dist/safety.d.ts
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import type { ResearchEvent } from "./events.js";
|
|
2
|
+
/**
 * Opt-in switches that relax the URL safety checks in this module.
 */
export interface SafetyPolicy {
    /** When true, `guardUrl` does not block high-entropy query strings on first-seen domains. */
    allowFlaggedUrls?: boolean;
    /** When true, `guardRedirect` skips the private/reserved address check entirely. */
    allowPrivateNetworks?: boolean;
}
|
|
6
|
+
/**
 * Wraps `text` between `<<<untrusted-source …>>>` and
 * `<<<end-untrusted-source>>>` marker lines, replacing any `<<<` inside the
 * text so the content cannot imitate a marker.
 *
 * @param text - Untrusted content to wrap.
 * @param source - Optional identifiers placed on the opening marker line.
 * @returns The wrapped text.
 */
export declare function quarantine(
    text: string,
    source: {
        sourceId?: string;
        url?: string;
    },
): string;
|
|
10
|
+
/**
 * Outcome of a URL guard check: either accepted (`ok: true`) or rejected with
 * a human-readable `reason` and the `kind` of check that failed.
 */
export type UrlGuardResult =
    | {
          ok: true;
      }
    | {
          ok: false;
          reason: string;
          kind: "scheme" | "ssrf" | "url-entropy";
      };
|
|
17
|
+
/**
 * Reports whether an IPv4/IPv6 literal falls in a private, loopback,
 * link-local or otherwise blocked range. Strings that are not IP literals
 * yield `false`.
 */
export declare function isPrivateAddress(ip: string): boolean;
|
|
18
|
+
/**
 * Validates a URL (scheme, embedded credentials, length and, unless
 * `policy.allowPrivateNetworks` is set, private-address resolution) and
 * resolves to the verdict. It does not apply the entropy check.
 */
export declare function guardRedirect(rawUrl: string, policy: SafetyPolicy): Promise<UrlGuardResult>;
|
|
19
|
+
/**
 * Runs the `guardRedirect` checks and then, for hosts not yet present in
 * `seenDomains`, rejects URLs whose query/fragment looks like high-entropy
 * data unless `policy.allowFlaggedUrls` is set. Accepted hosts are added to
 * `seenDomains`; `emit` (when given) receives a `safety.flag` event on an
 * entropy rejection.
 */
export declare function guardUrl(
    rawUrl: string,
    opts: {
        policy: SafetyPolicy;
        seenDomains: Set<string>;
        emit?: (event: ResearchEvent) => void;
    },
): Promise<UrlGuardResult>;
|
package/dist/safety.js
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
import { lookup } from "node:dns/promises";
|
|
2
|
+
import { isIP } from "node:net";
|
|
3
|
+
/**
 * Wraps untrusted text between `<<<untrusted-source …>>>` and
 * `<<<end-untrusted-source>>>` marker lines so downstream consumers can tell
 * it apart from trusted content.
 *
 * Both the text and the marker tag are neutralized: the tag is built from
 * `source.sourceId` / `source.url`, which may be attacker-influenced, so
 * `<<<`, `>>>` and line breaks inside it must not be able to close the opening
 * marker or forge an end marker.
 *
 * @param text - Untrusted content to wrap.
 * @param source - Optional `sourceId` and `url` shown on the opening marker.
 * @returns The wrapped, neutralized text.
 */
export function quarantine(text, source) {
    // Keep the tag on a single line and free of marker delimiters.
    const sanitizeTag = (value) => String(value)
        .replace(/[\r\n]+/g, " ")
        .replace(/<<</g, "‹‹‹")
        .replace(/>>>/g, "›››");
    const tag = [source.sourceId, source.url]
        .filter(Boolean)
        .map(sanitizeTag)
        .join(" ");
    // Any "<<<" in the body would otherwise be able to start a fake marker.
    const neutralized = text.replace(/<<</g, "‹‹‹");
    return `<<<untrusted-source ${tag}>>>\n${neutralized}\n<<<end-untrusted-source>>>`;
}
|
|
8
|
+
// Longest raw URL (in characters) that guardRedirect accepts.
const MAX_URL_LENGTH = 2048;
|
|
9
|
+
// Query+fragment strings shorter than this are never entropy-checked by guardUrl.
const ENTROPY_MIN_QUERY_LENGTH = 64;
|
|
10
|
+
// Shannon entropy (bits per character) at or above which guardUrl flags a query string.
const ENTROPY_THRESHOLD = 4.6;
|
|
11
|
+
/**
 * Computes the Shannon entropy of `text` in bits per symbol, where a symbol is
 * one Unicode code point.
 *
 * The symbol total is counted during iteration rather than taken from
 * `text.length`: `length` counts UTF-16 code units, so characters outside the
 * BMP would otherwise be weighted against a total that is too large.
 *
 * @param text - The string to measure.
 * @returns Entropy in bits per code point; `0` for the empty string.
 */
function shannonEntropy(text) {
    const counts = new Map();
    let total = 0;
    for (const char of text) {
        counts.set(char, (counts.get(char) ?? 0) + 1);
        total += 1;
    }
    if (total === 0)
        return 0;
    let entropy = 0;
    for (const count of counts.values()) {
        const p = count / total;
        entropy -= p * Math.log2(p);
    }
    return entropy;
}
|
|
25
|
+
/**
 * Decides from the first two octets of an IPv4 address whether it belongs to
 * a range that must not be contacted.
 *
 * Covered ranges: 0.0.0.0/8, 10.0.0.0/8, 100.64.0.0/10 (CGNAT), 127.0.0.0/8,
 * 169.254.0.0/16, 172.16.0.0/12, 192.168.0.0/16, 198.18.0.0/15 (benchmarking),
 * 224.0.0.0/4 (multicast) and 240.0.0.0/4 (reserved, incl. broadcast).
 *
 * @param a - First octet.
 * @param b - Second octet.
 * @returns `true` when the address is private or reserved.
 */
function isPrivateIPv4Octets(a, b) {
    return (a === 0 ||
        a === 10 ||
        a === 127 ||
        (a === 100 && b >= 64 && b <= 127) ||
        (a === 169 && b === 254) ||
        (a === 172 && b >= 16 && b <= 31) ||
        (a === 192 && b === 168) ||
        // 198.18.0.0/15 is reserved for network benchmarking.
        (a === 198 && (b === 18 || b === 19)) ||
        // Multicast (224/4) and reserved/broadcast (240/4) are never valid fetch targets.
        a >= 224);
}
|
|
34
|
+
/**
 * Classifies a dotted-quad IPv4 string by its first two octets.
 *
 * @param ip - Dotted-quad text; anything without exactly four parts yields `false`.
 * @returns Whether the address is in a blocked range.
 */
function isPrivateIPv4(ip) {
    const octets = ip.split(".").map((part) => Number(part));
    return octets.length === 4
        ? isPrivateIPv4Octets(octets[0], octets[1])
        : false;
}
|
|
40
|
+
/**
 * Expands an IPv6 literal into its eight 16-bit groups.
 *
 * Handles a trailing zone id (`%eth0`), a dotted IPv4 tail (`::ffff:1.2.3.4`)
 * and a single `::` compression. Every group must be 1-4 hex digits; anything
 * else makes the whole address invalid. `Number.parseInt` alone would accept
 * trailing junk such as `"8z"`, so groups are validated before conversion.
 *
 * @param ip - IPv6 address text.
 * @returns An array of eight numbers, or `undefined` if the text is not a
 *          well-formed address.
 */
function parseIPv6Groups(ip) {
    let text = ip;
    // Drop a zone identifier such as "%eth0".
    const zone = text.indexOf("%");
    if (zone !== -1)
        text = text.slice(0, zone);
    // Rewrite a dotted IPv4 tail as two hex groups.
    const v4Tail = /^(.*:)(\d+)\.(\d+)\.(\d+)\.(\d+)$/.exec(text);
    if (v4Tail) {
        const octets = v4Tail.slice(2, 6).map(Number);
        if (octets.some((octet) => octet > 255))
            return undefined;
        const hi = ((octets[0] << 8) | octets[1]).toString(16);
        const lo = ((octets[2] << 8) | octets[3]).toString(16);
        text = `${v4Tail[1]}${hi}:${lo}`;
    }
    const halves = text.split("::");
    if (halves.length > 2)
        return undefined;
    // Strict group syntax: NaN marks a malformed group and is rejected below.
    const parseGroup = (group) => /^[0-9a-f]{1,4}$/i.test(group)
        ? Number.parseInt(group, 16)
        : Number.NaN;
    const parseHalf = (half) => half === ""
        ? []
        : half.split(":").map((group) => parseGroup(group));
    const head = parseHalf(halves[0]);
    const tail = halves.length === 2 ? parseHalf(halves[1]) : [];
    const missing = 8 - head.length - tail.length;
    if (halves.length === 1 && head.length !== 8)
        return undefined;
    if (halves.length === 2 && missing < 0)
        return undefined;
    const groups = [
        ...head,
        ...Array.from({ length: halves.length === 2 ? missing : 0 }, () => 0),
        ...tail,
    ];
    if (groups.length !== 8)
        return undefined;
    if (groups.some((group) => Number.isNaN(group) || group < 0 || group > 0xffff)) {
        return undefined;
    }
    return groups;
}
|
|
79
|
+
/**
 * Classifies an IPv6 literal as blocked or not.
 *
 * Blocked: unparseable input (fail closed), `::` and `::1`, unique-local
 * fc00::/7 and link-local fe80::/10. For IPv4-mapped (`::ffff:0:0/96`), NAT64
 * (`64:ff9b::/96`) and 6to4 (`2002::/16`) addresses the verdict comes from the
 * first two octets of the embedded IPv4 address.
 *
 * @param ip - IPv6 address text.
 * @returns Whether the address is in a blocked range.
 */
function isPrivateIPv6(ip) {
    const parsed = parseIPv6Groups(ip);
    if (!parsed)
        return true;
    const allZero = (from, to) => parsed.slice(from, to).every((group) => group === 0);
    // The high 16-bit word of an embedded IPv4 address carries its first two octets.
    const hasPrivateV4Prefix = (word) => isPrivateIPv4Octets(word >> 8, word & 0xff);
    const first = parsed[0];
    const zeroPrefix = allZero(0, 5);
    // Unspecified (::) and loopback (::1).
    if (zeroPrefix && allZero(5, 7) && (parsed[7] === 0 || parsed[7] === 1)) {
        return true;
    }
    // Unique-local fc00::/7.
    if ((first & 0xfe00) === 0xfc00)
        return true;
    // Link-local fe80::/10.
    if ((first & 0xffc0) === 0xfe80)
        return true;
    // IPv4-mapped ::ffff:a.b.c.d.
    if (zeroPrefix && parsed[5] === 0xffff)
        return hasPrivateV4Prefix(parsed[6]);
    // NAT64 64:ff9b::a.b.c.d.
    if (first === 0x64 && parsed[1] === 0xff9b && allZero(2, 6)) {
        return hasPrivateV4Prefix(parsed[6]);
    }
    // 6to4 2002:aabb:ccdd::/48.
    if (first === 0x2002)
        return hasPrivateV4Prefix(parsed[1]);
    return false;
}
|
|
107
|
+
/**
 * Reports whether an IP literal lies in a blocked range, dispatching on the
 * address family detected by `isIP`.
 *
 * @param ip - Candidate IPv4 or IPv6 literal.
 * @returns The family-specific verdict; `false` when `ip` is not an IP literal.
 */
export function isPrivateAddress(ip) {
    switch (isIP(ip)) {
        case 4:
            return isPrivateIPv4(ip);
        case 6:
            return isPrivateIPv6(ip);
        default:
            return false;
    }
}
|
|
115
|
+
/**
 * Checks whether a URL may be fetched.
 *
 * Checks run in this order: parseable URL, http/https scheme, no embedded
 * credentials, length within MAX_URL_LENGTH, and, unless
 * `policy.allowPrivateNetworks` is set, that the host (an IP literal or every
 * address it resolves to) is not private or reserved. A hostname that fails
 * to resolve is rejected.
 *
 * @param rawUrl - URL text to validate.
 * @param policy - Safety policy; only `allowPrivateNetworks` is read here.
 * @returns `{ ok: true }` or a rejection with `kind` and `reason`.
 */
export async function guardRedirect(rawUrl, policy) {
    const deny = (kind, reason) => ({ ok: false, kind, reason });
    let parsed;
    try {
        parsed = new URL(rawUrl);
    }
    catch {
        return deny("scheme", "not a valid URL");
    }
    const { protocol } = parsed;
    if (!(protocol === "http:" || protocol === "https:")) {
        return deny("scheme", `scheme ${protocol} is not allowed`);
    }
    if (parsed.username || parsed.password) {
        return deny("ssrf", "URLs with embedded credentials are not allowed");
    }
    if (rawUrl.length > MAX_URL_LENGTH) {
        return deny("ssrf", `URL exceeds ${MAX_URL_LENGTH} characters`);
    }
    if (policy.allowPrivateNetworks) {
        return { ok: true };
    }
    // URL.hostname keeps the brackets around IPv6 literals; strip them.
    const host = parsed.hostname.replace(/^\[|\]$/g, "");
    if (isIP(host)) {
        return isPrivateAddress(host)
            ? deny("ssrf", `address ${host} is private or reserved`)
            : { ok: true };
    }
    try {
        // Every resolved address must be public, not just the first one.
        const records = await lookup(host, { all: true });
        for (const { address } of records) {
            if (isPrivateAddress(address)) {
                return deny("ssrf", `${host} resolves to private address ${address}`);
            }
        }
    }
    catch {
        return deny("ssrf", `${host} did not resolve`);
    }
    return { ok: true };
}
|
|
179
|
+
/**
 * Full URL guard: applies `guardRedirect`, then an exfiltration heuristic.
 *
 * For a host not yet in `opts.seenDomains`, the query string plus fragment is
 * rejected when it is at least ENTROPY_MIN_QUERY_LENGTH characters long and
 * its Shannon entropy reaches ENTROPY_THRESHOLD, unless
 * `opts.policy.allowFlaggedUrls` is set. A rejection also emits a
 * `safety.flag` event through `opts.emit` when provided. Hosts that pass are
 * recorded in `opts.seenDomains`.
 *
 * @param rawUrl - URL text to validate.
 * @param opts - `policy`, the mutable `seenDomains` set and an optional `emit` callback.
 * @returns `{ ok: true }` or a rejection with `kind` and `reason`.
 */
export async function guardUrl(rawUrl, opts) {
    const verdict = await guardRedirect(rawUrl, opts.policy);
    if (!verdict.ok)
        return verdict;
    const parsed = new URL(rawUrl);
    const domain = parsed.hostname.toLowerCase();
    if (!opts.seenDomains.has(domain)) {
        const payload = (parsed.search + parsed.hash).replace(/^[?#]/, "");
        const looksLikeExfiltration = payload.length >= ENTROPY_MIN_QUERY_LENGTH &&
            shannonEntropy(payload) >= ENTROPY_THRESHOLD;
        if (looksLikeExfiltration && !opts.policy.allowFlaggedUrls) {
            opts.emit?.({
                type: "safety.flag",
                kind: "url-entropy",
                detail: `high-entropy query string on first-seen domain ${domain}`,
                url: rawUrl,
            });
            return {
                ok: false,
                kind: "url-entropy",
                reason: "high-entropy query string on a never-seen domain (possible data exfiltration); set safety.allowFlaggedUrls to permit",
            };
        }
    }
    opts.seenDomains.add(domain);
    return { ok: true };
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
 * One source document made available to sandboxed code.
 */
export interface SandboxSource {
    /** Identifier used to select the document (e.g. via `grep`'s `source_ids`). */
    source_id: string;
    /** URL associated with the document. */
    url: string;
    /** Document title. */
    title: string;
    /** Full document text that sandboxed code can search. */
    text: string;
}
|
|
7
|
+
/**
 * Input to `runCodeSandboxed`.
 */
export interface SandboxRequest {
    /** JavaScript source evaluated inside the isolate. */
    code: string;
    /** Documents exposed to the code as the global `sources`. */
    sources: SandboxSource[];
    /** Execution timeout in milliseconds, passed to the isolate's script run. */
    timeoutMs: number;
}
|
|
12
|
+
/**
 * Result of a sandbox run.
 */
export interface SandboxOutput {
    /** Number of source documents that were passed to the run. */
    sources_in_scope: number;
    /** Text collected from `print` / `console.log` calls, newline-joined. */
    stdout: string;
    /** JSON-round-tripped value of the evaluated code, when available. */
    result?: unknown;
    /** Error description when the code threw or the sandbox failed. */
    error?: string;
    /** Set when output was cut short. */
    truncated?: boolean;
}
|
|
19
|
+
/** Resolves to `true` when the optional `isolated-vm` module could be loaded. */
export declare function isRunCodeAvailable(): Promise<boolean>;
|
|
20
|
+
/**
 * Converts an arbitrary value to a timeout in milliseconds: non-numeric or
 * non-positive input yields the default, larger values are capped at the maximum.
 */
export declare function clampSandboxTimeout(raw: unknown): number;
|
|
21
|
+
/**
 * Evaluates `req.code` inside an `isolated-vm` isolate with `req.sources`
 * exposed to it. Failures are reported through the `error` field of the
 * resolved output rather than by rejecting.
 */
export declare function runCodeSandboxed(req: SandboxRequest): Promise<SandboxOutput>;
|
|
22
|
+
/**
 * Serializes a sandbox result as pretty-printed JSON, dropping `result` and
 * then trimming `stdout` when the text would exceed the output cap.
 */
export declare function shapeSandboxOutput(payload: SandboxOutput): string;
|
package/dist/sandbox.js
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
// Timeout returned by clampSandboxTimeout when the requested value is unusable.
const DEFAULT_TIMEOUT_MS = 5_000;
|
|
2
|
+
// Upper bound that clampSandboxTimeout applies to requested timeouts.
const MAX_TIMEOUT_MS = 10_000;
|
|
3
|
+
// Largest UTF-8 size of the JSON-serialized sources that runCodeSandboxed accepts (8 MiB).
const MAX_PAYLOAD_BYTES = 8 * 1024 * 1024;
|
|
4
|
+
// Passed as `memoryLimit` when the isolate is created.
const MEMORY_LIMIT_MB = 128;
|
|
5
|
+
// Target maximum length (characters) of the JSON text produced by shapeSandboxOutput.
const TOTAL_OUTPUT_CAP = 8_000;
|
|
6
|
+
// Appended to stdout by shapeSandboxOutput when stdout had to be shortened.
const STDOUT_TRUNCATE_MARKER = "... [output truncated]";
|
|
7
|
+
// Cached result of the dynamic `isolated-vm` import; filled on first use by loadIsolatedVm.
let isolatedVmPromise = null;
|
|
8
|
+
/**
 * Lazily imports the optional `isolated-vm` module, caching the promise so the
 * import is attempted only once.
 *
 * @returns A promise for the module (its default export when present), or
 *          `null` when the import fails.
 */
function loadIsolatedVm() {
    if (isolatedVmPromise === null || isolatedVmPromise === undefined) {
        const onLoaded = (mod) => mod.default ?? mod;
        // A failed import is remembered as "unavailable" instead of rejecting.
        const onFailed = () => null;
        isolatedVmPromise = import("isolated-vm").then(onLoaded, onFailed);
    }
    return isolatedVmPromise;
}
|
|
13
|
+
/**
 * Tells whether the sandbox backend can be used.
 *
 * @returns `true` when `isolated-vm` loaded, `false` otherwise.
 */
export async function isRunCodeAvailable() {
    const ivm = await loadIsolatedVm();
    return ivm !== null;
}
|
|
16
|
+
const RUNNER_SCRIPT = String.raw `
|
|
17
|
+
const documents = Array.isArray(globalThis.sources) ? globalThis.sources : [];
|
|
18
|
+
const __lines = [];
|
|
19
|
+
let __collected = 0;
|
|
20
|
+
let __truncated = false;
|
|
21
|
+
const printable = (value) => {
|
|
22
|
+
if (typeof value === "string") return value;
|
|
23
|
+
try {
|
|
24
|
+
const json = JSON.stringify(value);
|
|
25
|
+
return json === undefined ? String(value) : json;
|
|
26
|
+
} catch {
|
|
27
|
+
return String(value);
|
|
28
|
+
}
|
|
29
|
+
};
|
|
30
|
+
const print = (...parts) => {
|
|
31
|
+
if (__truncated) return;
|
|
32
|
+
const line = parts.map(printable).join(" ");
|
|
33
|
+
__collected += line.length + 1;
|
|
34
|
+
if (__collected > 32000) {
|
|
35
|
+
__truncated = true;
|
|
36
|
+
return;
|
|
37
|
+
}
|
|
38
|
+
__lines.push(line);
|
|
39
|
+
};
|
|
40
|
+
const grep = (pattern, opts) => {
|
|
41
|
+
const options = opts && typeof opts === "object" ? opts : {};
|
|
42
|
+
const isRegExp = Object.prototype.toString.call(pattern) === "[object RegExp]";
|
|
43
|
+
const source = isRegExp ? pattern.source : String(pattern ?? "");
|
|
44
|
+
if (!source) throw new Error("grep: pattern must be a non-empty string or RegExp");
|
|
45
|
+
const baseFlags = isRegExp ? pattern.flags : "";
|
|
46
|
+
const flags = [...new Set([...baseFlags, "g", ...(options.ignore_case === true ? ["i"] : [])])].join("");
|
|
47
|
+
const regex = new RegExp(source, flags);
|
|
48
|
+
const wanted = Array.isArray(options.source_ids) && options.source_ids.length > 0
|
|
49
|
+
? new Set(options.source_ids.map((id) => String(id ?? "").trim()))
|
|
50
|
+
: null;
|
|
51
|
+
const scope = wanted ? documents.filter((d) => wanted.has(d.source_id)) : documents;
|
|
52
|
+
const maxRaw = Math.floor(Number(options.max));
|
|
53
|
+
const max = Number.isFinite(maxRaw) && maxRaw > 0 ? Math.min(maxRaw, 200) : 50;
|
|
54
|
+
const ctxRaw = options.context === true ? 80 : Math.floor(Number(options.context));
|
|
55
|
+
const contextChars = Number.isFinite(ctxRaw) && ctxRaw > 0 ? Math.min(ctxRaw, 500) : 0;
|
|
56
|
+
const matches = [];
|
|
57
|
+
for (const doc of scope) {
|
|
58
|
+
regex.lastIndex = 0;
|
|
59
|
+
let found;
|
|
60
|
+
while (matches.length < max && (found = regex.exec(doc.text)) !== null) {
|
|
61
|
+
const text = found[0];
|
|
62
|
+
const ctxStr = contextChars > 0
|
|
63
|
+
? doc.text.slice(Math.max(0, found.index - contextChars), Math.min(doc.text.length, found.index + text.length + contextChars))
|
|
64
|
+
: "";
|
|
65
|
+
matches.push({
|
|
66
|
+
source_id: doc.source_id,
|
|
67
|
+
url: doc.url,
|
|
68
|
+
offset: found.index,
|
|
69
|
+
match: text,
|
|
70
|
+
text: ctxStr || text,
|
|
71
|
+
context: ctxStr,
|
|
72
|
+
});
|
|
73
|
+
if (text === "") regex.lastIndex++;
|
|
74
|
+
}
|
|
75
|
+
if (matches.length >= max) break;
|
|
76
|
+
}
|
|
77
|
+
return matches;
|
|
78
|
+
};
|
|
79
|
+
globalThis.print = print;
|
|
80
|
+
globalThis.grep = grep;
|
|
81
|
+
globalThis.console = { log: print };
|
|
82
|
+
let __result;
|
|
83
|
+
let __error;
|
|
84
|
+
try {
|
|
85
|
+
__result = (0, eval)(String(globalThis.__code || ""));
|
|
86
|
+
} catch (err) {
|
|
87
|
+
const message = err && typeof err.message === "string" ? err.message : String(err);
|
|
88
|
+
__error = "code threw: " + message;
|
|
89
|
+
}
|
|
90
|
+
let __resultJSON;
|
|
91
|
+
if (__error === undefined && __result !== undefined && typeof __result !== "function" && typeof __result !== "symbol") {
|
|
92
|
+
try {
|
|
93
|
+
const json = JSON.stringify(__result);
|
|
94
|
+
if (json !== undefined && json.length <= 4000) __resultJSON = json;
|
|
95
|
+
} catch {}
|
|
96
|
+
}
|
|
97
|
+
JSON.stringify({
|
|
98
|
+
stdout: __lines.join("\n"),
|
|
99
|
+
...(__resultJSON !== undefined ? { resultJSON: __resultJSON } : {}),
|
|
100
|
+
...(__error !== undefined ? { error: __error } : {}),
|
|
101
|
+
...(__truncated ? { truncated: true } : {}),
|
|
102
|
+
});
|
|
103
|
+
`;
|
|
104
|
+
/**
 * Normalizes a caller-supplied timeout.
 *
 * @param raw - Any value; it is coerced with `Number` and floored.
 * @returns DEFAULT_TIMEOUT_MS when the value is not a positive finite number,
 *          otherwise the value capped at MAX_TIMEOUT_MS.
 */
export function clampSandboxTimeout(raw) {
    const requested = Math.floor(Number(raw));
    const usable = Number.isFinite(requested) && requested > 0;
    return usable ? Math.min(requested, MAX_TIMEOUT_MS) : DEFAULT_TIMEOUT_MS;
}
|
|
110
|
+
/**
 * Turns an exception raised on the host side of a sandbox run into a
 * `SandboxOutput`-shaped object with empty stdout.
 *
 * The error message is matched against known phrases to report a timeout or a
 * memory-limit failure; anything else is passed through as a generic sandbox
 * error.
 *
 * @param err - The caught value (an `Error` or anything else).
 * @param req - The originating request; `sources.length` and `timeoutMs` are read.
 * @returns `{ sources_in_scope, stdout: "", error }`.
 */
function hostErrorOutput(err, req) {
    const message = err instanceof Error ? err.message : String(err);
    let error;
    if (/timed out|execution timed out|timeout/i.test(message)) {
        error = `code timed out after ${req.timeoutMs}ms`;
    }
    else if (/memory limit|isolate was disposed|array buffer allocation/i.test(message)) {
        error = "code exceeded the sandbox memory limit";
    }
    else {
        error = `sandbox error: ${message}`;
    }
    return {
        sources_in_scope: req.sources.length,
        stdout: "",
        error,
    };
}
|
|
132
|
+
/**
 * Runs `req.code` inside a fresh `isolated-vm` isolate.
 *
 * Steps: load the optional dependency, coerce every source field to a string,
 * refuse payloads whose JSON exceeds MAX_PAYLOAD_BYTES, then execute
 * RUNNER_SCRIPT with `sources` and `__code` set as globals and parse its JSON
 * summary. Host-side exceptions are mapped through `hostErrorOutput`; the
 * isolate is always disposed.
 *
 * @param req - Code, sources and timeout for the run.
 * @returns The run's output; failures are reported in its `error` field.
 */
export async function runCodeSandboxed(req) {
    const ivm = await loadIsolatedVm();
    if (!ivm) {
        return {
            sources_in_scope: req.sources.length,
            stdout: "",
            error: 'run_code is unavailable: the optional "isolated-vm" dependency is not installed or failed to build. Run `npm install isolated-vm` to enable the sandbox.',
        };
    }
    const documents = req.sources.map((entry) => ({
        source_id: String(entry.source_id ?? ""),
        url: String(entry.url ?? ""),
        title: String(entry.title ?? ""),
        text: String(entry.text ?? ""),
    }));
    const failure = (error) => ({
        sources_in_scope: documents.length,
        stdout: "",
        error,
    });
    let payloadBytes;
    try {
        payloadBytes = Buffer.byteLength(JSON.stringify(documents), "utf8");
    }
    catch {
        // If the size cannot be measured, the size check is skipped.
        payloadBytes = 0;
    }
    if (payloadBytes > MAX_PAYLOAD_BYTES) {
        return failure(`sources too large for the sandbox (limit ${MAX_PAYLOAD_BYTES} bytes); restrict source_ids`);
    }
    const isolate = new ivm.Isolate({ memoryLimit: MEMORY_LIMIT_MB });
    try {
        const context = await isolate.createContext();
        const sharedSources = new ivm.ExternalCopy(documents).copyInto({ release: true });
        await context.global.set("sources", sharedSources);
        await context.global.set("__code", req.code);
        const script = await isolate.compileScript(RUNNER_SCRIPT);
        const raw = await script.run(context, {
            timeout: req.timeoutMs,
            copy: true,
        });
        let summary;
        try {
            summary = JSON.parse(String(raw));
        }
        catch {
            return failure("sandbox produced no parseable output");
        }
        const output = {
            sources_in_scope: documents.length,
            stdout: summary.stdout ?? "",
        };
        if (summary.resultJSON !== undefined) {
            try {
                output.result = JSON.parse(summary.resultJSON);
            }
            catch {
                // An unparseable result is simply omitted.
            }
        }
        if (summary.error !== undefined) {
            output.error = summary.error;
        }
        if (summary.truncated) {
            output.truncated = true;
        }
        return output;
    }
    catch (err) {
        return hostErrorOutput(err, req);
    }
    finally {
        try {
            isolate.dispose();
        }
        catch {
            // Disposal errors are ignored.
        }
    }
}
|
|
208
|
+
/**
 * Serializes a sandbox result as pretty-printed JSON that fits within
 * TOTAL_OUTPUT_CAP characters where possible.
 *
 * Reduction happens in stages: the payload as-is; then without `result`
 * (marking `truncated`); then with `stdout` shortened and suffixed with
 * STDOUT_TRUNCATE_MARKER. Because JSON escaping (newlines, quotes, …) makes
 * the encoded stdout longer than the raw slice, the slice is shrunk until the
 * encoded text actually fits. If even an empty stdout slice does not fit (for
 * example a very long `error`), the smallest achievable text is returned.
 *
 * @param payload - Sandbox output to serialize.
 * @returns The JSON text.
 */
export function shapeSandboxOutput(payload) {
    let body = payload;
    let json = JSON.stringify(body, null, 2);
    if (json.length <= TOTAL_OUTPUT_CAP)
        return json;
    if ("result" in body) {
        const { result: _dropped, ...rest } = body;
        body = { ...rest, truncated: true };
        json = JSON.stringify(body, null, 2);
        if (json.length <= TOTAL_OUTPUT_CAP)
            return json;
    }
    const overhead = JSON.stringify({ ...body, stdout: "", truncated: true }, null, 2).length;
    // First estimate assumes no escaping; the loop corrects for it.
    let room = Math.max(0, TOTAL_OUTPUT_CAP - overhead - STDOUT_TRUNCATE_MARKER.length - 4);
    for (;;) {
        json = JSON.stringify({
            ...body,
            stdout: `${body.stdout.slice(0, room)}\n${STDOUT_TRUNCATE_MARKER}`,
            truncated: true,
        }, null, 2);
        const excess = json.length - TOTAL_OUTPUT_CAP;
        if (excess <= 0 || room === 0)
            return json;
        // Removing a raw character removes at least one encoded character.
        room = Math.max(0, room - excess);
    }
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
// Separator for tokenize: any run of characters that are neither Unicode letters nor numbers.
const TOKEN_SPLIT = /[^\p{L}\p{N}]+/u;
|
|
2
|
+
// Matches a token that is exactly a four-digit year from 1900 to 2099.
const BARE_YEAR = /^(?:19|20)\d{2}$/;
|
|
3
|
+
/**
 * Lower-cases a query and splits it into its non-empty letter/number tokens.
 *
 * @param query - Raw query text.
 * @returns Tokens in their original order.
 */
function tokenize(query) {
    const pieces = query.toLowerCase().split(TOKEN_SPLIT);
    return pieces.filter((piece) => piece.length > 0);
}
|
|
6
|
+
/**
 * Produces an order- and duplicate-insensitive form of a query: its distinct
 * tokens, sorted and joined with single spaces.
 *
 * @param query - Raw query text.
 * @returns The canonical form (empty string when the query has no tokens).
 */
export function canonicalQuery(query) {
    const unique = Array.from(new Set(tokenize(query)));
    unique.sort();
    return unique.join(" ");
}
|
|
9
|
+
/**
 * Like `canonicalQuery`, but ignores tokens that are bare years so queries
 * differing only by year share a key. When nothing is left, the trimmed,
 * lower-cased query itself is used.
 *
 * @param query - Raw query text.
 * @returns The key string.
 */
export function trailKey(query) {
    const unique = new Set();
    for (const token of tokenize(query)) {
        if (!BARE_YEAR.test(token)) {
            unique.add(token);
        }
    }
    const canonical = Array.from(unique).sort().join(" ");
    return canonical !== "" ? canonical : query.trim().toLowerCase();
}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import type { HtmlPageMetadata } from "./html-extract.js";
|
|
2
|
+
import type { SourceDiscoveredLink, SourceDocument, SourceExtractionAttempt, SourceExtractionMetadata } from "./sources.js";
|
|
3
|
+
/**
 * Creates a `SourceDocument` from the given URL, title, markdown and
 * extraction metadata.
 * NOTE(review): the implementation is not visible here; how `originalChars`
 * and `canonicalUrl` are used should be confirmed against it.
 */
export declare function createSourceDocument(url: string, title: string, markdown: string, metadata: SourceExtractionMetadata, originalChars: number, sourceId: string, canonicalUrl?: string): SourceDocument;
|
|
4
|
+
/**
 * Builds extraction metadata for content obtained from a PDF.
 * NOTE(review): described from the name and signature only; confirm against
 * the implementation.
 */
export declare function extractionMetadataFromPdf(opts: {
    markdownChars: number;
    contentType?: string;
    finalUrl?: string;
    notes?: string[];
    attempts?: SourceExtractionAttempt[];
    qualityWarnings?: string[];
    discoveredLinks?: SourceDiscoveredLink[];
}): SourceExtractionMetadata;
|
|
13
|
+
/**
 * Builds extraction metadata for content taken directly from a JSON, plain
 * text or XML response, as indicated by `method`.
 * NOTE(review): described from the name and signature only; confirm against
 * the implementation.
 */
export declare function extractionMetadataFromText(opts: {
    markdownChars: number;
    method: "json_direct" | "text_direct" | "xml_direct";
    contentType?: string;
    finalUrl?: string;
    notes?: string[];
    attempts?: SourceExtractionAttempt[];
    qualityWarnings?: string[];
}): SourceExtractionMetadata;
|
|
22
|
+
/**
 * Builds extraction metadata for content produced by the custom tool named
 * `toolName`.
 * NOTE(review): described from the name and signature only; confirm against
 * the implementation.
 */
export declare function extractionMetadataFromCustomTool(opts: {
    markdownChars: number;
    toolName: string;
}): SourceExtractionMetadata;
|
|
26
|
+
/**
 * Builds extraction metadata for content obtained by scraping a page.
 * NOTE(review): described from the name and signature only; confirm against
 * the implementation.
 */
export declare function extractionMetadataFromScrape(opts: {
    markdownChars: number;
    contentType?: string;
    finalUrl?: string;
    notes?: string[];
    attempts?: SourceExtractionAttempt[];
    qualityWarnings?: string[];
    discoveredLinks?: SourceDiscoveredLink[];
    pageMetadata?: HtmlPageMetadata;
}): SourceExtractionMetadata;
|
|
36
|
+
/**
 * Builds extraction metadata for content extracted from an HTML document.
 * NOTE(review): described from the name and signature only; confirm against
 * the implementation.
 */
export declare function extractionMetadataFromHtml(opts: {
    markdownChars: number;
    contentType?: string;
    finalUrl?: string;
    notes?: string[];
    attempts?: SourceExtractionAttempt[];
    qualityWarnings?: string[];
    discoveredLinks?: SourceDiscoveredLink[];
    pageMetadata?: HtmlPageMetadata;
}): SourceExtractionMetadata;
|
|
46
|
+
/**
 * Builds extraction metadata for content supplied by the Exa provider.
 * NOTE(review): described from the name and signature only; confirm against
 * the implementation.
 */
export declare function extractionMetadataFromExa(opts: {
    markdownChars: number;
    finalUrl?: string;
    attempts?: SourceExtractionAttempt[];
    qualityWarnings?: string[];
}): SourceExtractionMetadata;
|
|
52
|
+
/**
 * Prepares markdown for storage and reports its original length and whether
 * it was truncated.
 * NOTE(review): the truncation limit is not visible here; confirm against the
 * implementation.
 */
export declare function storeMarkdown(markdown: string): {
    markdown: string;
    originalChars: number;
    truncated: boolean;
};
|
|
57
|
+
/**
 * Renders a text summary ("card") of a source document.
 * NOTE(review): presumably `previewChars` limits the preview length; confirm.
 */
export declare function formatSourceCard(document: SourceDocument, previewChars?: number): string;
|
|
58
|
+
/**
 * Returns the source card as a plain key/value record.
 * NOTE(review): the effect of `goal` is not visible here; confirm against the implementation.
 */
export declare function sourceCardData(document: SourceDocument, previewChars?: number, goal?: string): Record<string, unknown>;
|
|
59
|
+
/**
 * Renders the chunk at `chunkIndex` of a source document as text.
 * NOTE(review): behavior for an out-of-range index is not visible here.
 */
export declare function formatSourceChunk(document: SourceDocument, chunkIndex: number): string;
|
|
60
|
+
/**
 * A scored passage located inside a source document.
 * NOTE(review): field meanings below are inferred from names; confirm the
 * units of `start`/`end` and the scale of `score` against the implementation.
 */
export interface SourcePassage {
    /** Identifier of the containing source document. */
    sourceId: string;
    /** Title of the containing document. */
    title: string;
    /** URL of the containing document. */
    url: string;
    /** Canonical URL of the containing document. */
    canonicalUrl: string;
    /** Index of the chunk the passage belongs to. */
    chunkIndex: number;
    /** Start position of the passage. */
    start: number;
    /** End position of the passage. */
    end: number;
    /** Relevance score of the passage. */
    score: number;
    /** Text excerpt of the passage. */
    snippet: string;
}
|
|
71
|
+
/**
 * Finds passages in `documents` relevant to `query`.
 * NOTE(review): presumably returns at most `maxResults` passages ordered by score; confirm.
 */
export declare function rankSourcePassages(documents: SourceDocument[], query: string, maxResults: number): SourcePassage[];
|
|
72
|
+
/**
 * Searches `documents` for `query` and returns the outcome as formatted text.
 * NOTE(review): the output format is not visible here; confirm against the implementation.
 */
export declare function searchSourceDocuments(documents: SourceDocument[], query: string, maxResults: number): string;
|
|
73
|
+
/**
 * Selects a portion of a document's text for extraction and reports whether
 * the text was truncated.
 * NOTE(review): presumably the window is chosen by relevance to `query` and
 * limited to `maxChars`; confirm against the implementation.
 */
export declare function selectExtractionWindow(
    document: SourceDocument,
    query: string,
    maxChars: number,
): {
    text: string;
    truncated: boolean;
};
|
|
77
|
+
/**
 * Returns a quotation of the document's text between `start` and `end`.
 * NOTE(review): offset units and bounds handling are not visible here; confirm.
 */
export declare function quoteSource(document: SourceDocument, start: number, end: number): string;
|