@steel-dev/atlas 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/dist/cli.js +304 -15
- package/dist/providers/fetch.js +48 -1
- package/dist/providers/search.js +28 -14
- package/dist/source-documents.d.ts +9 -0
- package/dist/source-documents.js +15 -0
- package/dist/youtube.d.ts +44 -0
- package/dist/youtube.js +257 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -14,6 +14,14 @@ const { report } = await atlas.research(
|
|
|
14
14
|
);
|
|
15
15
|
```
|
|
16
16
|
|
|
17
|
+
## Try it
|
|
18
|
+
|
|
19
|
+
One-off query, no install:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
ANTHROPIC_API_KEY=sk-ant-... npx @steel-dev/atlas "How do reasoning models differ from standard LLMs?"
|
|
23
|
+
```
|
|
24
|
+
|
|
17
25
|
## Install
|
|
18
26
|
|
|
19
27
|
```bash
|
|
@@ -195,14 +203,6 @@ The real backstops are **price-independent** — each defaults to the effort row
|
|
|
195
203
|
|
|
196
204
|
`result.stats.stopReason` folds those into one value — `"completed"`, `"finished"` (`run.finish()`), or a binding cap (`"budget"`, `"tokens"`, `"timeout"`). When several apply, the most proximate wins.
|
|
197
205
|
|
|
198
|
-
## Safety
|
|
199
|
-
|
|
200
|
-
Untrusted web content is quarantined (data, not instructions). Fetches pass SSRF guards hop-by-hop; `run_code` runs in a memory-capped V8 isolate with no network, filesystem, or host access. Direct fetch honors robots.txt.
|
|
201
|
-
|
|
202
|
-
The isolate needs the optional `isolated-vm` dependency; without it, `run_code` is dropped from the toolset and the run proceeds without it — Atlas never falls back to an unsandboxed evaluator.
|
|
203
|
-
|
|
204
|
-
The SSRF guard validates DNS at check time but can't pin the connection, so an attacker controlling DNS can defeat it via rebinding. Treat it as defense-in-depth — for hostile targets, run behind network-level egress controls that block private ranges.
|
|
205
|
-
|
|
206
206
|
## Dev
|
|
207
207
|
|
|
208
208
|
```bash
|
package/dist/cli.js
CHANGED
|
@@ -22867,6 +22867,21 @@ function extractionMetadataFromHtml(opts) {
|
|
|
22867
22867
|
leadNote: "Fetched with direct HTML text extraction."
|
|
22868
22868
|
});
|
|
22869
22869
|
}
|
|
22870
|
+
/**
 * Builds extraction metadata for a page whose text came from a YouTube
 * caption track.
 *
 * `finalUrl`, `attempts` and `notes` are forwarded to
 * `buildExtractionMetadata` only when truthy. `pageMetadata` is always
 * forwarded and carries `author`, `language` and `description` only when
 * each is truthy, so it may be an empty object.
 *
 * @param {object} opts
 * @param {number} opts.markdownChars
 * @param {string} [opts.finalUrl]
 * @param {Array} [opts.attempts]
 * @param {string[]} [opts.notes]
 * @param {string} [opts.author]
 * @param {string} [opts.language]
 * @param {string} [opts.description]
 * @returns whatever `buildExtractionMetadata` returns for this input.
 */
function extractionMetadataFromYoutube(opts) {
  const pageMetadata = {};
  if (opts.author) pageMetadata.author = opts.author;
  if (opts.language) pageMetadata.language = opts.language;
  if (opts.description) pageMetadata.description = opts.description;

  const input = {
    markdownChars: opts.markdownChars,
    method: "youtube_transcript",
    leadNote: "Fetched the YouTube caption track (timed text) for this video."
  };
  if (opts.finalUrl) input.finalUrl = opts.finalUrl;
  if (opts.attempts) input.attempts = opts.attempts;
  if (opts.notes) input.notes = opts.notes;
  input.pageMetadata = pageMetadata;

  return buildExtractionMetadata(input);
}
|
|
22870
22885
|
function extractionMetadataFromExa(opts) {
|
|
22871
22886
|
return buildExtractionMetadata({
|
|
22872
22887
|
...opts,
|
|
@@ -23105,6 +23120,232 @@ function quoteSource(document, start, end) {
|
|
|
23105
23120
|
);
|
|
23106
23121
|
}
|
|
23107
23122
|
|
|
23123
|
+
// src/youtube.ts
|
|
23124
|
+
// A bare YouTube video id: exactly 11 characters from [A-Za-z0-9_-].
var VIDEO_ID_RE = /^[A-Za-z0-9_-]{11}$/;
// Path-style ids (/shorts/<id>, /embed/<id>, /v/<id>, /live/<id>).
// The trailing lookahead requires the id to end at a path boundary; without
// it a longer token such as /embed/<12+ id chars> was silently truncated to
// its first 11 characters and accepted as a (wrong) video id.
var PATH_ID_RE = /^\/(?:shorts|embed|v|live)\/([A-Za-z0-9_-]{11})(?=\/|$)/;
// Endpoint that fetchPlayerResponse POSTs to.
var INNERTUBE_PLAYER_URL = "https://www.youtube.com/youtubei/v1/player?prettyPrint=false";
// Client context sent as `context.client` in the player request body.
var INNERTUBE_CLIENT = {
  clientName: "IOS",
  clientVersion: "20.10.4",
  deviceModel: "iPhone16,2",
  osName: "iPhone",
  osVersion: "18.3.2.22D82",
  hl: "en",
  gl: "US"
};
// User-agent header sent with both the player and the timed-text requests.
var INNERTUBE_USER_AGENT = "com.google.ios.youtube/20.10.4 (iPhone16,2; U; CPU iOS 18_3_2 like Mac OS X)";
// Per-request timeouts, in milliseconds.
var PLAYER_TIMEOUT_MS = 15e3;
var TRANSCRIPT_TIMEOUT_MS = 15e3;
// Maximum number of UTF-16 code units kept from a video description.
var DESCRIPTION_CAP = 2e3;

/**
 * Extracts the 11-character video id from a YouTube URL.
 *
 * Recognised forms:
 *   - `youtu.be/<id>` (first path segment)
 *   - `youtube.com/watch?v=<id>` (any `*.youtube.com` host)
 *   - `youtube.com/{shorts,embed,v,live}/<id>`
 *
 * @param {string} url Candidate URL.
 * @returns {string|null} The video id, or `null` when `url` is not a
 *   parseable http(s) YouTube URL carrying a well-formed id.
 */
function youtubeVideoId(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }
  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") return null;
  const host = parsed.hostname.toLowerCase().replace(/^www\./, "");
  if (host === "youtu.be") {
    const id = parsed.pathname.split("/").filter(Boolean)[0] ?? "";
    return VIDEO_ID_RE.test(id) ? id : null;
  }
  if (host === "youtube.com" || host.endsWith(".youtube.com")) {
    if (parsed.pathname === "/watch") {
      const v = parsed.searchParams.get("v") ?? "";
      return VIDEO_ID_RE.test(v) ? v : null;
    }
    const match = PATH_ID_RE.exec(parsed.pathname);
    if (match) return match[1] ?? null;
  }
  return null;
}
|
|
23163
|
+
/**
 * Fetches the caption-track transcript and basic video details for a
 * YouTube URL.
 *
 * Resolves to `null` when the URL carries no video id, the player response
 * is missing, no caption track with a `baseUrl` is found, or the transcript
 * text is empty. Errors thrown by the player / timed-text requests propagate.
 *
 * @param {string} url
 * @param {{preferLang?: string, fetchImpl?: Function, signal?: AbortSignal}} [options]
 *   `fetchImpl` defaults to `globalThis.fetch`, `preferLang` to `"en"`.
 * @returns {Promise<object|null>}
 */
async function fetchYoutubeTranscript(url, options = {}) {
  const videoId = youtubeVideoId(url);
  if (!videoId) return null;

  const { signal } = options;
  const doFetch = options.fetchImpl ?? globalThis.fetch;

  const player = await fetchPlayerResponse(videoId, doFetch, signal);
  if (!player) return null;

  const track = pickCaptionTrack(player, options.preferLang ?? "en");
  const captionUrl = track?.baseUrl;
  if (!captionUrl) return null;

  const transcript = (await fetchTranscriptText(captionUrl, doFetch, signal)).trim();
  if (!transcript) return null;

  const details = player.videoDetails ?? {};
  const seconds = Number(details.lengthSeconds);
  const hasLength = Number.isFinite(seconds) && seconds > 0;
  // One transcript segment per non-empty line.
  const segments = transcript.split("\n").filter((line) => line.length > 0);

  return {
    videoId,
    title: stringOr(details.title, `YouTube video ${videoId}`),
    author: nonEmpty(details.author),
    languageCode: track.languageCode ?? "und",
    kind: track.kind === "asr" ? "asr" : "manual",
    text: transcript,
    segmentCount: segments.length,
    lengthSeconds: hasLength ? seconds : null,
    description: capDescription(nonEmpty(details.shortDescription))
  };
}
|
|
23192
|
+
/**
 * Renders a transcript object as a markdown document: a header block
 * (title, optional channel, transcript language, optional length, source),
 * a "Transcript" section, and a "Description" section when a description
 * is present.
 *
 * @param {object} t Transcript as returned by `fetchYoutubeTranscript`.
 * @returns {string}
 */
function youtubeTranscriptToMarkdown(t) {
  // Auto-generated (ASR) tracks are labelled as such next to the language.
  const languageLabel = t.kind === "asr" ? `${t.languageCode} (auto-generated)` : t.languageCode;
  const headerLines = [
    `# ${t.title}`,
    ...(t.author ? [`**Channel:** ${t.author}`] : []),
    `**Transcript language:** ${languageLabel}`,
    ...(t.lengthSeconds ? [`**Length:** ${formatDuration(t.lengthSeconds)}`] : []),
    "**Source:** YouTube caption track"
  ];
  const sections = [headerLines.join("\n"), "", "## Transcript", "", t.text];
  if (t.description) sections.push("", "## Description", "", t.description);
  return sections.join("\n");
}
|
|
23206
|
+
/**
 * POSTs a player request for `videoId` and returns the parsed JSON body.
 *
 * @param {string} videoId
 * @param {Function} fetchImpl fetch-compatible function.
 * @param {AbortSignal} [signal] Combined with a PLAYER_TIMEOUT_MS timeout.
 * @returns {Promise<any|null>} Parsed response, or `null` when the body
 *   cannot be read or is not valid JSON.
 * @throws {Error} When the HTTP status is not ok.
 */
async function fetchPlayerResponse(videoId, fetchImpl, signal) {
  const requestInit = {
    method: "POST",
    signal: withTimeout2(signal, PLAYER_TIMEOUT_MS),
    headers: {
      "content-type": "application/json",
      "user-agent": INNERTUBE_USER_AGENT,
      "accept-language": "en-US,en"
    },
    body: JSON.stringify({
      context: { client: INNERTUBE_CLIENT },
      videoId,
      contentCheckOk: true,
      racyCheckOk: true
    })
  };
  const resp = await fetchImpl(INNERTUBE_PLAYER_URL, requestInit);
  if (!resp.ok) {
    throw new Error(`youtube player HTTP ${resp.status}`);
  }
  try {
    const body = await resp.text();
    return JSON.parse(body);
  } catch {
    // Unreadable or non-JSON body: treated as "no player response".
    return null;
  }
}
|
|
23229
|
+
/**
 * Downloads a caption track and returns its text, one segment per line.
 *
 * The track is first requested at `baseUrl` as-is; when that body parses to
 * an empty transcript, it is requested again with `fmt=json3`.
 *
 * @param {string} baseUrl Caption track URL.
 * @param {Function} fetchImpl fetch-compatible function.
 * @param {AbortSignal} [signal]
 * @returns {Promise<string>} Transcript text (possibly empty).
 */
async function fetchTranscriptText(baseUrl, fetchImpl, signal) {
  const primary = parseTranscriptBody(await requestTimedText(baseUrl, fetchImpl, signal));
  if (primary) return primary;

  const jsonUrl = appendQuery(baseUrl, "fmt", "json3");
  return parseTranscriptBody(await requestTimedText(jsonUrl, fetchImpl, signal));
}
|
|
23240
|
+
/**
 * GETs a timed-text URL and resolves to the raw response body.
 *
 * @param {string} url
 * @param {Function} fetchImpl fetch-compatible function.
 * @param {AbortSignal} [signal] Combined with a TRANSCRIPT_TIMEOUT_MS timeout.
 * @returns {Promise<string>}
 * @throws {Error} When the HTTP status is not ok.
 */
async function requestTimedText(url, fetchImpl, signal) {
  const requestInit = {
    signal: withTimeout2(signal, TRANSCRIPT_TIMEOUT_MS),
    headers: {
      "user-agent": INNERTUBE_USER_AGENT,
      "accept-language": "en-US,en"
    }
  };
  const resp = await fetchImpl(url, requestInit);
  if (resp.ok) return resp.text();
  throw new Error(`youtube timedtext HTTP ${resp.status}`);
}
|
|
23251
|
+
/**
 * Chooses the best caption track from a player response.
 *
 * Preference order:
 *   1. a manual (non-"asr") track in the preferred language,
 *   2. any track in the preferred language,
 *   3. the first manual track in any language,
 *   4. the first track.
 *
 * A track is "in the preferred language" when its `languageCode` equals
 * `preferLang` or extends it with a subtag (`en` matches `en`, `en-US`,
 * `en_GB`). A plain prefix test is not used because it makes unrelated codes
 * collide (`fi` would match `fil`). Comparison is case-insensitive. An empty
 * `preferLang` matches every track.
 *
 * @param {object} player Parsed player response.
 * @param {string} [preferLang="en"]
 * @returns {object|null} The chosen track, or `null` when the response has
 *   no usable caption tracks.
 */
function pickCaptionTrack(player, preferLang = "en") {
  const listed = player?.captions?.playerCaptionsTracklistRenderer?.captionTracks;
  if (!Array.isArray(listed)) return null;
  // Ignore malformed (null / non-object) entries instead of throwing on them.
  const tracks = listed.filter((t) => t !== null && typeof t === "object");
  if (tracks.length === 0) return null;

  const wanted = preferLang.toLowerCase();
  const isInLang = (t) => {
    if (wanted === "") return true;
    const code = String(t.languageCode ?? "").toLowerCase();
    return code === wanted || code.startsWith(`${wanted}-`) || code.startsWith(`${wanted}_`);
  };
  const isManual = (t) => t.kind !== "asr";

  const inLang = tracks.filter(isInLang);
  return inLang.find(isManual) ?? inLang[0] ?? tracks.find(isManual) ?? tracks[0] ?? null;
}
|
|
23261
|
+
/**
 * Converts a raw timed-text response body into plain transcript text.
 *
 * A body starting with "{" is parsed as JSON; anything else is scanned for
 * `<text>` elements and, if none yield text, for `<p>` elements.
 *
 * @param {string} raw
 * @returns {string} Transcript text, or "" when nothing could be extracted.
 */
function parseTranscriptBody(raw) {
  const body = raw.trim();
  if (body.length === 0) return "";
  if (body[0] === "{") return parseTimedTextJson(body);
  return parseTagLines(body, "text") || parseTagLines(body, "p");
}
|
|
23269
|
+
/**
 * Collects the text content of every `<tag ...>...</tag>` element in `xml`.
 *
 * For each element, nested markup is stripped, entities are decoded and
 * whitespace runs are collapsed to single spaces; elements that end up
 * empty are skipped.
 *
 * @param {string} xml
 * @param {string} tag Element name to extract.
 * @returns {string} One line per non-empty element, joined with "\n".
 */
function parseTagLines(xml, tag) {
  const pattern = new RegExp(`<${tag}\\b[^>]*>([\\s\\S]*?)</${tag}>`, "g");
  const collected = [];
  for (const [, inner = ""] of xml.matchAll(pattern)) {
    const withoutMarkup = inner.replace(/<[^>]+>/g, "");
    const text = decodeEntities(withoutMarkup).replace(/\s+/g, " ").trim();
    if (text) collected.push(text);
  }
  return collected.join("\n");
}
|
|
23280
|
+
/**
 * Extracts transcript text from a JSON timed-text body.
 *
 * Each entry of `events` contributes one line: the `utf8` values of its
 * `segs` concatenated, with whitespace runs collapsed. Events that yield no
 * text are skipped.
 *
 * Malformed input is tolerated rather than thrown on: invalid JSON, a
 * missing or non-array `events`, and null / seg-less events all contribute
 * nothing, so one bad event cannot discard the whole transcript.
 *
 * @param {string} raw JSON text.
 * @returns {string} Lines joined with "\n", or "" when there is no text.
 */
function parseTimedTextJson(raw) {
  let data;
  try {
    data = JSON.parse(raw);
  } catch {
    return "";
  }
  const events = Array.isArray(data?.events) ? data.events : [];
  const lines = [];
  for (const event of events) {
    const segs = Array.isArray(event?.segs) ? event.segs : [];
    const line = segs
      .map((seg) => seg?.utf8 ?? "")
      .join("")
      .replace(/\s+/g, " ")
      .trim();
    if (line) lines.push(line);
  }
  return lines.join("\n");
}
|
|
23294
|
+
/**
 * Decodes HTML/XML character entities in `s`.
 *
 * A second decoding pass runs when the first pass changed the text and an
 * "&" is still present, which unwraps double-encoded input such as
 * `&amp;#39;`.
 *
 * @param {string} s
 * @returns {string}
 */
function decodeEntities(s) {
  const once = decodeEntitiesOnce(s);
  if (once === s || !once.includes("&")) return once;
  return decodeEntitiesOnce(once);
}

/**
 * Single decoding pass: hexadecimal and decimal numeric references first,
 * then the named entities quot, apos, lt, gt, nbsp and amp.
 *
 * The named entities are matched by name through one pattern. The previous
 * chain of replacements matched the already-decoded characters (for example
 * replacing `"` with `"` and `&` with `&`), which left every named entity
 * in the text untouched.
 *
 * @param {string} s
 * @returns {string}
 */
function decodeEntitiesOnce(s) {
  const named = { quot: '"', apos: "'", lt: "<", gt: ">", nbsp: " ", amp: "&" };
  return s
    .replace(/&#x([0-9a-fA-F]+);/g, (_, hex) => fromCodePoint(Number.parseInt(hex, 16)))
    .replace(/&#(\d+);/g, (_, dec) => fromCodePoint(Number.parseInt(dec, 10)))
    .replace(/&(quot|apos|lt|gt|nbsp|amp);/g, (_, name) => named[name]);
}

/**
 * Converts a code point to a string.
 *
 * @param {number} cp
 * @returns {string} The character, or "" when `cp` is not a finite value in
 *   the range 0..0x10FFFF or cannot be converted.
 */
function fromCodePoint(cp) {
  if (!Number.isFinite(cp) || cp < 0 || cp > 1114111) return "";
  try {
    return String.fromCodePoint(cp);
  } catch {
    return "";
  }
}
|
|
23313
|
+
/**
 * Returns `url` with the query parameter `key` set to `value`.
 *
 * Absolute URLs are edited through `URL.searchParams`, which replaces an
 * existing parameter of the same name. Strings that `new URL` cannot parse
 * (for example relative URLs) are edited textually: the pair is
 * percent-encoded and inserted before any `#fragment`, so the result is
 * still a well-formed URL reference.
 *
 * @param {string} url
 * @param {string} key
 * @param {string} value
 * @returns {string}
 */
function appendQuery(url, key, value) {
  try {
    const parsed = new URL(url);
    parsed.searchParams.set(key, value);
    return parsed.toString();
  } catch {
    const hashAt = url.indexOf("#");
    const base = hashAt === -1 ? url : url.slice(0, hashAt);
    const fragment = hashAt === -1 ? "" : url.slice(hashAt);
    const sep = base.includes("?") ? "&" : "?";
    return `${base}${sep}${encodeURIComponent(key)}=${encodeURIComponent(value)}${fragment}`;
  }
}
|
|
23323
|
+
/**
 * Returns a signal that aborts after `ms` milliseconds or, when `signal` is
 * given, as soon as either `signal` or the timeout aborts.
 *
 * @param {AbortSignal} [signal] Caller-supplied signal.
 * @param {number} ms Timeout in milliseconds.
 * @returns {AbortSignal}
 */
function withTimeout2(signal, ms) {
  const deadline = AbortSignal.timeout(ms);
  if (!signal) return deadline;
  return AbortSignal.any([signal, deadline]);
}
|
|
23327
|
+
/**
 * Formats a duration in seconds as `m:ss`, or `h:mm:ss` once it reaches an
 * hour. Fractional seconds are floored.
 *
 * @param {number} seconds
 * @returns {string}
 */
function formatDuration(seconds) {
  const whole = Math.floor(seconds);
  const hours = Math.floor(whole / 3600);
  const minutes = Math.floor((whole % 3600) / 60);
  const secondsPart = String(whole % 60).padStart(2, "0");
  if (hours > 0) {
    // Minutes are zero-padded only when an hours field precedes them.
    return `${hours}:${String(minutes).padStart(2, "0")}:${secondsPart}`;
  }
  return `${String(minutes)}:${secondsPart}`;
}
|
|
23336
|
+
/**
 * Trims a description and caps it at DESCRIPTION_CAP UTF-16 code units,
 * appending an ellipsis character when it was shortened.
 *
 * The cut never lands between the two halves of a surrogate pair: when the
 * last kept unit would be a high surrogate, it is dropped too, so the result
 * never ends in a lone (ill-formed) surrogate before the ellipsis.
 *
 * @param {string|null|undefined} value
 * @returns {string|null} The capped text, or `null` for empty / blank input.
 */
function capDescription(value) {
  if (!value) return null;
  const trimmed = value.trim();
  if (!trimmed) return null;
  if (trimmed.length <= DESCRIPTION_CAP) return trimmed;

  let cut = DESCRIPTION_CAP;
  const lastKept = trimmed.charCodeAt(cut - 1);
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) cut -= 1;
  return `${trimmed.slice(0, cut)}\u2026`;
}
|
|
23342
|
+
/**
 * Returns `value` trimmed when it is a string with non-whitespace content,
 * otherwise `fallback`.
 *
 * @param {*} value
 * @param {*} fallback
 * @returns {*}
 */
function stringOr(value, fallback) {
  if (typeof value !== "string") return fallback;
  const text = value.trim();
  return text ? text : fallback;
}
|
|
23345
|
+
/**
 * Returns `value` trimmed when it is a string with non-whitespace content,
 * otherwise `null`.
 *
 * @param {*} value
 * @returns {string|null}
 */
function nonEmpty(value) {
  if (typeof value !== "string") return null;
  const text = value.trim();
  return text ? text : null;
}
|
|
23348
|
+
|
|
23108
23349
|
// src/providers/fetch.ts
|
|
23109
23350
|
var DIRECT_PDF_MAX_BYTES = 25 * 1024 * 1024;
|
|
23110
23351
|
var DIRECT_HTML_MAX_BYTES = 5 * 1024 * 1024;
|
|
@@ -23294,6 +23535,10 @@ async function directFetch({ url, signal, guardRedirect: guardRedirect2, dispatc
|
|
|
23294
23535
|
);
|
|
23295
23536
|
}
|
|
23296
23537
|
}
|
|
23538
|
+
if (youtubeVideoId(url)) {
|
|
23539
|
+
const transcript = await tryYoutubeTranscript(url, signal);
|
|
23540
|
+
if (transcript) return transcript;
|
|
23541
|
+
}
|
|
23297
23542
|
let currentUrl = url;
|
|
23298
23543
|
let response;
|
|
23299
23544
|
for (let hop = 0; ; hop++) {
|
|
@@ -23445,6 +23690,42 @@ function extractHtml(data, contentType, finalUrl) {
|
|
|
23445
23690
|
}
|
|
23446
23691
|
};
|
|
23447
23692
|
}
|
|
23693
|
+
// Rendered transcripts shorter than this many characters are rejected.
var YOUTUBE_TRANSCRIPT_MIN_CHARS = 40;

/**
 * Best-effort attempt to serve a YouTube URL from its caption track.
 *
 * Resolves to `null` when the transcript fetch throws, yields nothing, or
 * renders to fewer than YOUTUBE_TRANSCRIPT_MIN_CHARS characters. The call
 * site in `directFetch` continues with its regular fetch path in that case.
 *
 * @param {string} url
 * @param {AbortSignal} [signal]
 * @returns {Promise<{ok: true, attempt: object, page: object}|null>}
 */
async function tryYoutubeTranscript(url, signal) {
  let transcript = null;
  try {
    transcript = await fetchYoutubeTranscript(url, { signal });
  } catch {
    // Deliberate best-effort: any failure means "no transcript".
    return null;
  }
  if (!transcript) return null;

  const markdown = youtubeTranscriptToMarkdown(transcript).trim();
  if (markdown.length < YOUTUBE_TRANSCRIPT_MIN_CHARS) return null;

  const { videoId, title, author, description, languageCode, kind, segmentCount } = transcript;
  // Canonical watch URL, whatever form the input URL had.
  const finalUrl = `https://www.youtube.com/watch?v=${videoId}`;
  const attempt = {
    method: "youtube_transcript",
    ok: true,
    note: `youtube_transcript: extracted ${markdown.length} text chars (${kind} ${languageCode}, ${segmentCount} segments)`
  };

  const metadataInput = { markdownChars: markdown.length, finalUrl, attempts: [attempt] };
  if (author) metadataInput.author = author;
  metadataInput.language = languageCode;
  if (description) metadataInput.description = description;

  return {
    ok: true,
    attempt,
    page: {
      finalUrl,
      title,
      markdown,
      renderedWith: "youtube_transcript",
      metadata: extractionMetadataFromYoutube(metadataInput)
    }
  };
}
|
|
23448
23729
|
var PDF_PARSE_TIMEOUT_MS = 3e4;
|
|
23449
23730
|
async function extractPdf(data, contentType, finalUrl, signal) {
|
|
23450
23731
|
try {
|
|
@@ -32208,36 +32489,44 @@ Run one web search and list the ${limit} most relevant distinct result pages. Do
|
|
|
32208
32489
|
maxOutputTokens: 1500,
|
|
32209
32490
|
abortSignal: signal
|
|
32210
32491
|
});
|
|
32211
|
-
const
|
|
32492
|
+
const lines = parseResultLines(result.text);
|
|
32493
|
+
const summaryByKey = /* @__PURE__ */ new Map();
|
|
32494
|
+
for (const line of lines) {
|
|
32495
|
+
summaryByKey.set(normalizeUrlForSource(line.url), line.summary);
|
|
32496
|
+
}
|
|
32212
32497
|
const seen = /* @__PURE__ */ new Set();
|
|
32213
32498
|
const results = [];
|
|
32499
|
+
const add = (url, title, snippet) => {
|
|
32500
|
+
if (results.length >= limit || typeof url !== "string") return;
|
|
32501
|
+
const key = normalizeUrlForSource(url);
|
|
32502
|
+
if (seen.has(key)) return;
|
|
32503
|
+
const parsed = toResult(results.length, url, title, snippet);
|
|
32504
|
+
if (!parsed) return;
|
|
32505
|
+
seen.add(key);
|
|
32506
|
+
results.push(parsed);
|
|
32507
|
+
};
|
|
32214
32508
|
for (const source of result.sources) {
|
|
32215
32509
|
if (source.sourceType !== "url") continue;
|
|
32216
32510
|
const key = normalizeUrlForSource(source.url);
|
|
32217
|
-
|
|
32218
|
-
seen.add(key);
|
|
32219
|
-
const parsed = toResult(
|
|
32220
|
-
results.length,
|
|
32221
|
-
source.url,
|
|
32222
|
-
source.title,
|
|
32223
|
-
snippets.get(key) ?? ""
|
|
32224
|
-
);
|
|
32225
|
-
if (parsed) results.push(parsed);
|
|
32226
|
-
if (results.length >= limit) break;
|
|
32511
|
+
add(source.url, source.title, summaryByKey.get(key) ?? "");
|
|
32227
32512
|
}
|
|
32513
|
+
for (const line of lines) add(line.url, void 0, line.summary);
|
|
32228
32514
|
return results;
|
|
32229
32515
|
}
|
|
32230
32516
|
};
|
|
32231
32517
|
}
|
|
32232
|
-
function
|
|
32233
|
-
const
|
|
32518
|
+
/**
 * Parses `<url> :: <summary>` lines out of free-form model output.
 *
 * Lines without that shape are ignored. URLs are de-duplicated by their
 * `normalizeUrlForSource` key, keeping the first occurrence; summaries are
 * trimmed and cut to 500 characters.
 *
 * @param {string} text2
 * @returns {Array<{url: string, summary: string}>} Entries in input order.
 */
function parseResultLines(text2) {
  const linePattern = /(https?:\/\/\S+?)[)\]>.,]*\s*::\s*(\S.*)/;
  const byKey = /* @__PURE__ */ new Map();
  for (const rawLine of text2.split("\n")) {
    const hit = linePattern.exec(rawLine);
    if (hit === null) continue;
    const [, url, summary] = hit;
    const key = normalizeUrlForSource(url);
    if (byKey.has(key)) continue;
    byKey.set(key, { url, summary: summary.trim().slice(0, 500) });
  }
  return [...byKey.values()];
}
|
|
32242
32531
|
async function importProvider(pkg, load4) {
|
|
32243
32532
|
try {
|
package/dist/providers/fetch.js
CHANGED
|
@@ -5,7 +5,8 @@ import { htmlToMarkdown } from "../html-extract.js";
|
|
|
5
5
|
import { extractPdfText } from "../pdf-extract.js";
|
|
6
6
|
import { createRobotsCache } from "../robots.js";
|
|
7
7
|
import { guardRedirect as guardRedirectUrl } from "../safety.js";
|
|
8
|
-
import { extractionMetadataFromExa, extractionMetadataFromHtml, extractionMetadataFromPdf, extractionMetadataFromScrape, extractionMetadataFromText, } from "../source-documents.js";
|
|
8
|
+
import { extractionMetadataFromExa, extractionMetadataFromHtml, extractionMetadataFromPdf, extractionMetadataFromScrape, extractionMetadataFromText, extractionMetadataFromYoutube, } from "../source-documents.js";
|
|
9
|
+
import { fetchYoutubeTranscript, youtubeTranscriptToMarkdown, youtubeVideoId, } from "../youtube.js";
|
|
9
10
|
const DIRECT_PDF_MAX_BYTES = 25 * 1024 * 1024;
|
|
10
11
|
const DIRECT_HTML_MAX_BYTES = 5 * 1024 * 1024;
|
|
11
12
|
const DIRECT_FETCH_TIMEOUT_MS = 15_000;
|
|
@@ -197,6 +198,11 @@ async function directFetch({ url, signal, guardRedirect, dispatcher }, robots) {
|
|
|
197
198
|
return failed("direct_http", `blocked_url: fetch of ${url} blocked: ${initial.reason}`, false);
|
|
198
199
|
}
|
|
199
200
|
}
|
|
201
|
+
if (youtubeVideoId(url)) {
|
|
202
|
+
const transcript = await tryYoutubeTranscript(url, signal);
|
|
203
|
+
if (transcript)
|
|
204
|
+
return transcript;
|
|
205
|
+
}
|
|
200
206
|
let currentUrl = url;
|
|
201
207
|
let response;
|
|
202
208
|
for (let hop = 0;; hop++) {
|
|
@@ -317,6 +323,47 @@ function extractHtml(data, contentType, finalUrl) {
|
|
|
317
323
|
},
|
|
318
324
|
};
|
|
319
325
|
}
|
|
326
|
+
// Rendered transcripts shorter than this many characters are rejected.
const YOUTUBE_TRANSCRIPT_MIN_CHARS = 40;

/**
 * Best-effort attempt to serve a YouTube URL from its caption track.
 *
 * Resolves to `null` when the transcript fetch throws, yields nothing, or
 * renders to fewer than YOUTUBE_TRANSCRIPT_MIN_CHARS characters. The call
 * site in `directFetch` continues with its regular fetch path in that case.
 *
 * @param {string} url
 * @param {AbortSignal} [signal]
 * @returns {Promise<{ok: true, attempt: object, page: object}|null>}
 */
async function tryYoutubeTranscript(url, signal) {
    let transcript = null;
    try {
        transcript = await fetchYoutubeTranscript(url, { signal });
    }
    catch {
        // Deliberate best-effort: any failure means "no transcript".
        return null;
    }
    if (!transcript) {
        return null;
    }

    const markdown = youtubeTranscriptToMarkdown(transcript).trim();
    if (markdown.length < YOUTUBE_TRANSCRIPT_MIN_CHARS) {
        return null;
    }

    const { videoId, title, author, description, languageCode, kind, segmentCount } = transcript;
    // Canonical watch URL, whatever form the input URL had.
    const finalUrl = `https://www.youtube.com/watch?v=${videoId}`;
    const attempt = {
        method: "youtube_transcript",
        ok: true,
        note: `youtube_transcript: extracted ${markdown.length} text chars (${kind} ${languageCode}, ${segmentCount} segments)`,
    };

    const metadataInput = { markdownChars: markdown.length, finalUrl, attempts: [attempt] };
    if (author) {
        metadataInput.author = author;
    }
    metadataInput.language = languageCode;
    if (description) {
        metadataInput.description = description;
    }

    return {
        ok: true,
        attempt,
        page: {
            finalUrl,
            title,
            markdown,
            renderedWith: "youtube_transcript",
            metadata: extractionMetadataFromYoutube(metadataInput),
        },
    };
}
|
|
320
367
|
const PDF_PARSE_TIMEOUT_MS = 30_000;
|
|
321
368
|
async function extractPdf(data, contentType, finalUrl, signal) {
|
|
322
369
|
try {
|
package/dist/providers/search.js
CHANGED
|
@@ -264,37 +264,51 @@ export function nativeModelSearch(opts) {
|
|
|
264
264
|
maxOutputTokens: 1_500,
|
|
265
265
|
abortSignal: signal,
|
|
266
266
|
});
|
|
267
|
-
const
|
|
267
|
+
const lines = parseResultLines(result.text);
|
|
268
|
+
const summaryByKey = new Map();
|
|
269
|
+
for (const line of lines) {
|
|
270
|
+
summaryByKey.set(normalizeUrlForSource(line.url), line.summary);
|
|
271
|
+
}
|
|
268
272
|
const seen = new Set();
|
|
269
273
|
const results = [];
|
|
274
|
+
const add = (url, title, snippet) => {
|
|
275
|
+
if (results.length >= limit || typeof url !== "string")
|
|
276
|
+
return;
|
|
277
|
+
const key = normalizeUrlForSource(url);
|
|
278
|
+
if (seen.has(key))
|
|
279
|
+
return;
|
|
280
|
+
const parsed = toResult(results.length, url, title, snippet);
|
|
281
|
+
if (!parsed)
|
|
282
|
+
return;
|
|
283
|
+
seen.add(key);
|
|
284
|
+
results.push(parsed);
|
|
285
|
+
};
|
|
270
286
|
for (const source of result.sources) {
|
|
271
287
|
if (source.sourceType !== "url")
|
|
272
288
|
continue;
|
|
273
289
|
const key = normalizeUrlForSource(source.url);
|
|
274
|
-
|
|
275
|
-
continue;
|
|
276
|
-
seen.add(key);
|
|
277
|
-
const parsed = toResult(results.length, source.url, source.title, snippets.get(key) ?? "");
|
|
278
|
-
if (parsed)
|
|
279
|
-
results.push(parsed);
|
|
280
|
-
if (results.length >= limit)
|
|
281
|
-
break;
|
|
290
|
+
add(source.url, source.title, summaryByKey.get(key) ?? "");
|
|
282
291
|
}
|
|
292
|
+
for (const line of lines)
|
|
293
|
+
add(line.url, undefined, line.summary);
|
|
283
294
|
return results;
|
|
284
295
|
},
|
|
285
296
|
};
|
|
286
297
|
}
|
|
287
|
-
function
|
|
288
|
-
const
|
|
298
|
+
/**
 * Parses `<url> :: <summary>` lines out of free-form model output.
 *
 * Lines without that shape are ignored. URLs are de-duplicated by their
 * `normalizeUrlForSource` key, keeping the first occurrence; summaries are
 * trimmed and cut to 500 characters.
 *
 * @param {string} text
 * @returns {Array<{url: string, summary: string}>} Entries in input order.
 */
function parseResultLines(text) {
    const linePattern = /(https?:\/\/\S+?)[)\]>.,]*\s*::\s*(\S.*)/;
    const byKey = new Map();
    for (const rawLine of text.split("\n")) {
        const hit = linePattern.exec(rawLine);
        if (hit === null) {
            continue;
        }
        const [, url, summary] = hit;
        const key = normalizeUrlForSource(url);
        if (byKey.has(key)) {
            continue;
        }
        byKey.set(key, { url, summary: summary.trim().slice(0, 500) });
    }
    return [...byKey.values()];
}
|
|
299
313
|
async function importProvider(pkg, load) {
|
|
300
314
|
try {
|
|
@@ -43,6 +43,15 @@ export declare function extractionMetadataFromHtml(opts: {
|
|
|
43
43
|
discoveredLinks?: SourceDiscoveredLink[];
|
|
44
44
|
pageMetadata?: HtmlPageMetadata;
|
|
45
45
|
}): SourceExtractionMetadata;
|
|
46
|
+
/**
 * Builds extraction metadata for content taken from a YouTube caption track.
 *
 * Only `markdownChars` is required; every other field is optional.
 */
export declare function extractionMetadataFromYoutube(opts: {
    /** Character count of the extracted markdown. */
    markdownChars: number;
    /** URL recorded as the final location of the content. */
    finalUrl?: string;
    /** Extraction attempts to record. */
    attempts?: SourceExtractionAttempt[];
    /** Recorded as page metadata `author`. */
    author?: string;
    /** Recorded as page metadata `language`. */
    language?: string;
    /** Recorded as page metadata `description`. */
    description?: string;
    /** Additional notes to record. */
    notes?: string[];
}): SourceExtractionMetadata;
|
|
46
55
|
export declare function extractionMetadataFromExa(opts: {
|
|
47
56
|
markdownChars: number;
|
|
48
57
|
finalUrl?: string;
|
package/dist/source-documents.js
CHANGED
|
@@ -95,6 +95,21 @@ export function extractionMetadataFromHtml(opts) {
|
|
|
95
95
|
leadNote: "Fetched with direct HTML text extraction.",
|
|
96
96
|
});
|
|
97
97
|
}
|
|
98
|
+
/**
 * Builds extraction metadata for a page whose text came from a YouTube
 * caption track.
 *
 * `finalUrl`, `attempts` and `notes` are forwarded to
 * `buildExtractionMetadata` only when truthy. `pageMetadata` is always
 * forwarded and carries `author`, `language` and `description` only when
 * each is truthy, so it may be an empty object.
 *
 * @param {object} opts See the matching declaration in source-documents.d.ts.
 * @returns whatever `buildExtractionMetadata` returns for this input.
 */
export function extractionMetadataFromYoutube(opts) {
    const pageMetadata = {};
    if (opts.author) {
        pageMetadata.author = opts.author;
    }
    if (opts.language) {
        pageMetadata.language = opts.language;
    }
    if (opts.description) {
        pageMetadata.description = opts.description;
    }

    const input = {
        markdownChars: opts.markdownChars,
        method: "youtube_transcript",
        leadNote: "Fetched the YouTube caption track (timed text) for this video.",
    };
    if (opts.finalUrl) {
        input.finalUrl = opts.finalUrl;
    }
    if (opts.attempts) {
        input.attempts = opts.attempts;
    }
    if (opts.notes) {
        input.notes = opts.notes;
    }
    input.pageMetadata = pageMetadata;

    return buildExtractionMetadata(input);
}
|
|
98
113
|
export function extractionMetadataFromExa(opts) {
|
|
99
114
|
return buildExtractionMetadata({
|
|
100
115
|
...opts,
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/** Minimal fetch-compatible function shape used for the YouTube requests. */
type FetchImpl = (input: string, init?: {
    method?: string;
    signal?: AbortSignal;
    headers?: Record<string, string>;
    body?: string;
}) => Promise<{
    ok: boolean;
    status: number;
    text(): Promise<string>;
}>;
/** Options accepted by {@link fetchYoutubeTranscript}. */
export interface YoutubeTranscriptOptions {
    /** Preferred caption language code. */
    preferLang?: string;
    /** Replacement for the global `fetch`. */
    fetchImpl?: FetchImpl;
    /** Signal used to cancel the requests. */
    signal?: AbortSignal | undefined;
}
/** A fetched caption-track transcript plus basic video details. */
export interface YoutubeTranscript {
    videoId: string;
    title: string;
    author: string | null;
    languageCode: string;
    /** `"asr"` for auto-generated tracks, `"manual"` otherwise. */
    kind: "asr" | "manual";
    /** Transcript text, one segment per line. */
    text: string;
    segmentCount: number;
    lengthSeconds: number | null;
    description: string | null;
}
/** One entry of the caption track list in a player response. */
interface CaptionTrack {
    baseUrl?: string;
    languageCode?: string;
    kind?: string;
    name?: {
        simpleText?: string;
        runs?: {
            text?: string;
        }[];
    };
}
/** Returns the video id carried by `url`, or `null`. */
export declare function youtubeVideoId(url: string): string | null;
/** True when {@link youtubeVideoId} finds a video id in `url`. */
export declare function isYoutubeWatchUrl(url: string): boolean;
/** Fetches the transcript for `url`; resolves to `null` when none is available. */
export declare function fetchYoutubeTranscript(url: string, options?: YoutubeTranscriptOptions): Promise<YoutubeTranscript | null>;
/** Renders a transcript as markdown. */
export declare function youtubeTranscriptToMarkdown(t: YoutubeTranscript): string;
/** Chooses a caption track from a player response, or `null` when there is none. */
export declare function pickCaptionTrack(player: Record<string, any>, preferLang?: string): CaptionTrack | null;
/** Converts a raw timed-text response body to plain transcript text. */
export declare function parseTranscriptBody(raw: string): string;
export {};
|
package/dist/youtube.js
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
// A bare YouTube video id: exactly 11 characters from [A-Za-z0-9_-].
const VIDEO_ID_RE = /^[A-Za-z0-9_-]{11}$/;
// Path-style ids (/shorts/<id>, /embed/<id>, /v/<id>, /live/<id>).
// The trailing lookahead requires the id to end at a path boundary; without
// it a longer token such as /embed/<12+ id chars> was silently truncated to
// its first 11 characters and accepted as a (wrong) video id.
const PATH_ID_RE = /^\/(?:shorts|embed|v|live)\/([A-Za-z0-9_-]{11})(?=\/|$)/;
// Endpoint that fetchPlayerResponse POSTs to.
const INNERTUBE_PLAYER_URL = "https://www.youtube.com/youtubei/v1/player?prettyPrint=false";
// Client context sent as `context.client` in the player request body.
const INNERTUBE_CLIENT = {
    clientName: "IOS",
    clientVersion: "20.10.4",
    deviceModel: "iPhone16,2",
    osName: "iPhone",
    osVersion: "18.3.2.22D82",
    hl: "en",
    gl: "US",
};
// User-agent header sent with both the player and the timed-text requests.
const INNERTUBE_USER_AGENT = "com.google.ios.youtube/20.10.4 (iPhone16,2; U; CPU iOS 18_3_2 like Mac OS X)";
// Per-request timeouts, in milliseconds.
const PLAYER_TIMEOUT_MS = 15_000;
const TRANSCRIPT_TIMEOUT_MS = 15_000;
// Maximum number of UTF-16 code units kept from a video description.
const DESCRIPTION_CAP = 2_000;

/**
 * Extracts the 11-character video id from a YouTube URL.
 *
 * Recognised forms:
 *   - `youtu.be/<id>` (first path segment)
 *   - `youtube.com/watch?v=<id>` (any `*.youtube.com` host)
 *   - `youtube.com/{shorts,embed,v,live}/<id>`
 *
 * @param {string} url Candidate URL.
 * @returns {string|null} The video id, or `null` when `url` is not a
 *   parseable http(s) YouTube URL carrying a well-formed id.
 */
export function youtubeVideoId(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return null;
    }
    if (parsed.protocol !== "http:" && parsed.protocol !== "https:")
        return null;
    const host = parsed.hostname.toLowerCase().replace(/^www\./, "");
    if (host === "youtu.be") {
        const id = parsed.pathname.split("/").filter(Boolean)[0] ?? "";
        return VIDEO_ID_RE.test(id) ? id : null;
    }
    if (host === "youtube.com" || host.endsWith(".youtube.com")) {
        if (parsed.pathname === "/watch") {
            const v = parsed.searchParams.get("v") ?? "";
            return VIDEO_ID_RE.test(v) ? v : null;
        }
        const match = PATH_ID_RE.exec(parsed.pathname);
        if (match)
            return match[1] ?? null;
    }
    return null;
}

/**
 * Tells whether `url` is a YouTube URL from which a video id can be
 * extracted (any form accepted by `youtubeVideoId`, not only `/watch`).
 *
 * @param {string} url
 * @returns {boolean}
 */
export function isYoutubeWatchUrl(url) {
    return youtubeVideoId(url) !== null;
}
|
|
46
|
+
/**
 * Fetches the caption-track transcript and basic video details for a
 * YouTube URL.
 *
 * Resolves to `null` when the URL carries no video id, the player response
 * is missing, no caption track with a `baseUrl` is found, or the transcript
 * text is empty. Errors thrown by the player / timed-text requests propagate.
 *
 * @param {string} url
 * @param {{preferLang?: string, fetchImpl?: Function, signal?: AbortSignal}} [options]
 *   `fetchImpl` defaults to `globalThis.fetch`, `preferLang` to `"en"`.
 * @returns {Promise<object|null>}
 */
export async function fetchYoutubeTranscript(url, options = {}) {
    const videoId = youtubeVideoId(url);
    if (!videoId) {
        return null;
    }

    const { signal } = options;
    const doFetch = options.fetchImpl ?? globalThis.fetch;

    const player = await fetchPlayerResponse(videoId, doFetch, signal);
    if (!player) {
        return null;
    }

    const track = pickCaptionTrack(player, options.preferLang ?? "en");
    const captionUrl = track?.baseUrl;
    if (!captionUrl) {
        return null;
    }

    const transcript = (await fetchTranscriptText(captionUrl, doFetch, signal)).trim();
    if (!transcript) {
        return null;
    }

    const details = (player.videoDetails ?? {});
    const seconds = Number(details.lengthSeconds);
    const hasLength = Number.isFinite(seconds) && seconds > 0;
    // One transcript segment per non-empty line.
    const segments = transcript.split("\n").filter((line) => line.length > 0);

    return {
        videoId,
        title: stringOr(details.title, `YouTube video ${videoId}`),
        author: nonEmpty(details.author),
        languageCode: track.languageCode ?? "und",
        kind: track.kind === "asr" ? "asr" : "manual",
        text: transcript,
        segmentCount: segments.length,
        lengthSeconds: hasLength ? seconds : null,
        description: capDescription(nonEmpty(details.shortDescription)),
    };
}
|
|
75
|
+
/**
 * Renders a transcript object as a markdown document: a header block
 * (title, optional channel, transcript language, optional length, source),
 * a "Transcript" section, and a "Description" section when a description
 * is present.
 *
 * @param {object} t Transcript as returned by `fetchYoutubeTranscript`.
 * @returns {string}
 */
export function youtubeTranscriptToMarkdown(t) {
    // Auto-generated (ASR) tracks are labelled as such next to the language.
    const languageLabel = t.kind === "asr" ? `${t.languageCode} (auto-generated)` : t.languageCode;
    const headerLines = [
        `# ${t.title}`,
        ...(t.author ? [`**Channel:** ${t.author}`] : []),
        `**Transcript language:** ${languageLabel}`,
        ...(t.lengthSeconds ? [`**Length:** ${formatDuration(t.lengthSeconds)}`] : []),
        "**Source:** YouTube caption track",
    ];
    const sections = [headerLines.join("\n"), "", "## Transcript", "", t.text];
    if (t.description) {
        sections.push("", "## Description", "", t.description);
    }
    return sections.join("\n");
}
|
|
90
|
+
/**
 * POSTs a player request for `videoId` to `INNERTUBE_PLAYER_URL` and returns
 * the parsed JSON body.
 *
 * @param {string} videoId - Video id placed in the request body.
 * @param {Function} fetchImpl - Fetch implementation used for the request.
 * @param {AbortSignal} [signal] - Optional caller signal, combined with a
 *   `PLAYER_TIMEOUT_MS` timeout through `withTimeout`.
 * @returns {Promise<*>} Parsed response body, or `null` when reading or
 *   parsing the body fails.
 * @throws {Error} When the response is not `ok` (message includes the HTTP status).
 */
async function fetchPlayerResponse(videoId, fetchImpl, signal) {
  const payload = {
    context: { client: INNERTUBE_CLIENT },
    videoId,
    contentCheckOk: true,
    racyCheckOk: true,
  };
  const requestInit = {
    method: "POST",
    signal: withTimeout(signal, PLAYER_TIMEOUT_MS),
    headers: {
      "content-type": "application/json",
      "user-agent": INNERTUBE_USER_AGENT,
      "accept-language": "en-US,en",
    },
    body: JSON.stringify(payload),
  };

  const resp = await fetchImpl(INNERTUBE_PLAYER_URL, requestInit);
  if (!resp.ok) {
    throw new Error(`youtube player HTTP ${resp.status}`);
  }

  // A body that cannot be read or parsed is reported as "no player response".
  try {
    const bodyText = await resp.text();
    return JSON.parse(bodyText);
  } catch {
    return null;
  }
}
|
|
115
|
+
/**
 * Downloads a caption track and returns its text, one segment per line.
 *
 * The track is first requested exactly as given by `baseUrl`. If that body
 * parses to an empty string, the request is repeated with `fmt=json3` and
 * the result of parsing that second body is returned (possibly empty).
 *
 * @param {string} baseUrl - Caption track URL.
 * @param {Function} fetchImpl - Fetch implementation used for the requests.
 * @param {AbortSignal} [signal] - Optional caller signal forwarded to each request.
 * @returns {Promise<string>} Parsed transcript text.
 */
async function fetchTranscriptText(baseUrl, fetchImpl, signal) {
  const defaultBody = await requestTimedText(baseUrl, fetchImpl, signal);
  const fromDefault = parseTranscriptBody(defaultBody);
  if (fromDefault) {
    return fromDefault;
  }

  const json3Url = appendQuery(baseUrl, "fmt", "json3");
  const json3Body = await requestTimedText(json3Url, fetchImpl, signal);
  return parseTranscriptBody(json3Body);
}
|
|
123
|
+
/**
 * Performs a request for a timed-text URL and returns the response body as text.
 *
 * @param {string} url - URL to request.
 * @param {Function} fetchImpl - Fetch implementation used for the request.
 * @param {AbortSignal} [signal] - Optional caller signal, combined with a
 *   `TRANSCRIPT_TIMEOUT_MS` timeout through `withTimeout`.
 * @returns {Promise<string>} Response body text.
 * @throws {Error} When the response is not `ok` (message includes the HTTP status).
 */
async function requestTimedText(url, fetchImpl, signal) {
  const requestHeaders = {
    "user-agent": INNERTUBE_USER_AGENT,
    "accept-language": "en-US,en",
  };
  const resp = await fetchImpl(url, {
    signal: withTimeout(signal, TRANSCRIPT_TIMEOUT_MS),
    headers: requestHeaders,
  });
  if (resp.ok) {
    return resp.text();
  }
  throw new Error(`youtube timedtext HTTP ${resp.status}`);
}
|
|
135
|
+
/**
 * Chooses one caption track from a player response.
 *
 * Preference order:
 *   1. a non-"asr" track whose language code starts with `preferLang`,
 *   2. any track whose language code starts with `preferLang`,
 *   3. the first non-"asr" track in any language,
 *   4. the first track.
 * The language comparison is a case-insensitive prefix match.
 *
 * @param {object} player - Player response; tracks are read from
 *   `captions.playerCaptionsTracklistRenderer.captionTracks`.
 * @param {string} [preferLang="en"] - Preferred language code prefix.
 * @returns {object|null} The selected track, or `null` when the track list
 *   is missing, not an array, or empty.
 */
export function pickCaptionTrack(player, preferLang = "en") {
  const renderer = player?.captions?.playerCaptionsTracklistRenderer;
  const captionTracks = renderer?.captionTracks;
  if (!Array.isArray(captionTracks) || captionTracks.length === 0) {
    return null;
  }

  const wantedPrefix = preferLang.toLowerCase();
  const isManual = (track) => track.kind !== "asr";
  const firstManual = (list) => list.filter(isManual)[0];

  const matchingLanguage = captionTracks.filter((track) => {
    const code = track.languageCode ?? "";
    return code.toLowerCase().startsWith(wantedPrefix);
  });

  return (
    firstManual(matchingLanguage) ??
    matchingLanguage[0] ??
    firstManual(captionTracks) ??
    captionTracks[0] ??
    null
  );
}
|
|
148
|
+
/**
 * Converts a raw caption response body into plain transcript text.
 *
 * A body that starts with `{` is handed to `parseTimedTextJson`. Any other
 * body is scanned for `<text>` elements and, if those yield nothing, for
 * `<p>` elements.
 *
 * @param {string} raw - Response body.
 * @returns {string} Transcript text; `""` when the body is blank or yields nothing.
 */
export function parseTranscriptBody(raw) {
  const body = raw.trim();
  if (body.length === 0) {
    return "";
  }
  if (body.startsWith("{")) {
    return parseTimedTextJson(body);
  }
  // Try <text> elements first, then fall back to <p> elements.
  return parseTagLines(body, "text") || parseTagLines(body, "p");
}
|
|
159
|
+
/**
 * Extracts the text content of every `<tag>…</tag>` element in `xml`.
 *
 * For each element, nested markup is stripped, entities are decoded with
 * `decodeEntities`, whitespace runs are collapsed to single spaces, and the
 * result is trimmed. Elements that end up empty are dropped.
 *
 * @param {string} xml - Markup to scan.
 * @param {string} tag - Element name, interpolated into the regular expression as-is.
 * @returns {string} Extracted lines joined with `"\n"`; `""` when none are found.
 */
function parseTagLines(xml, tag) {
  const elementPattern = new RegExp(`<${tag}\\b[^>]*>([\\s\\S]*?)</${tag}>`, "g");
  const collected = [];
  for (const found of xml.matchAll(elementPattern)) {
    const withoutMarkup = (found[1] ?? "").replace(/<[^>]+>/g, "");
    const line = decodeEntities(withoutMarkup).replace(/\s+/g, " ").trim();
    if (line) {
      collected.push(line);
    }
  }
  return collected.join("\n");
}
|
|
171
|
+
/**
 * Parses a JSON caption payload into plain text, one line per event.
 *
 * Each event's `segs[].utf8` pieces are concatenated, whitespace runs are
 * collapsed to single spaces and the result is trimmed; events that end up
 * empty are dropped.
 *
 * The payload is treated as untrusted: invalid JSON, a non-array `events`,
 * `null` events or segments, and non-array `segs` all contribute no text
 * instead of throwing.
 *
 * @param {string} raw - JSON text.
 * @returns {string} Event lines joined with `"\n"`; `""` when nothing usable is found.
 */
function parseTimedTextJson(raw) {
  let data;
  try {
    data = JSON.parse(raw);
  } catch {
    return "";
  }

  // Valid JSON can still have the wrong shape; fall back to "no events".
  const events = Array.isArray(data?.events) ? data.events : [];
  const lines = [];
  for (const event of events) {
    const segs = Array.isArray(event?.segs) ? event.segs : [];
    const text = segs
      .map((seg) => seg?.utf8 ?? "")
      .join("")
      .replace(/\s+/g, " ")
      .trim();
    if (text) {
      lines.push(text);
    }
  }
  return lines.join("\n");
}
|
|
191
|
+
/**
 * Decodes character entities in `s`, unwrapping at most two layers of encoding.
 *
 * A second decoding pass runs only when the first pass changed the string
 * and the result still contains an `&`.
 *
 * @param {string} s - Text that may contain character entities.
 * @returns {string} Decoded text.
 */
function decodeEntities(s) {
  const firstPass = decodeEntitiesOnce(s);
  const needsSecondPass = firstPass !== s && firstPass.includes("&");
  return needsSecondPass ? decodeEntitiesOnce(firstPass) : firstPass;
}
|
|
197
|
+
/**
 * Decodes one layer of character entities in `s`.
 *
 * Handles hexadecimal (`&#x…;`) and decimal (`&#…;`) numeric references via
 * `fromCodePoint`, plus the named entities `&quot;`, `&apos;`, `&lt;`,
 * `&gt;`, `&nbsp;` (decoded to a plain space) and `&amp;`.
 *
 * `&amp;` is replaced last so that a doubly-encoded sequence such as
 * `&amp;lt;` becomes `&lt;` in this pass rather than `<`.
 *
 * @param {string} s - Text that may contain character entities.
 * @returns {string} Text with one layer of entities decoded.
 */
function decodeEntitiesOnce(s) {
  return s
    .replace(/&#x([0-9a-fA-F]+);/g, (_, hex) => fromCodePoint(Number.parseInt(hex, 16)))
    .replace(/&#(\d+);/g, (_, dec) => fromCodePoint(Number.parseInt(dec, 10)))
    .replace(/&quot;/g, '"')
    .replace(/&apos;/g, "'")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&nbsp;/g, " ")
    .replace(/&amp;/g, "&");
}
|
|
208
|
+
/**
 * Converts a numeric code point to its string form.
 *
 * @param {number} cp - Candidate code point.
 * @returns {string} The corresponding character, or `""` when `cp` is not
 *   finite, lies outside 0–0x10FFFF, or is rejected by `String.fromCodePoint`.
 */
function fromCodePoint(cp) {
  const withinRange = Number.isFinite(cp) && cp >= 0 && cp <= 0x10ffff;
  if (!withinRange) {
    return "";
  }
  try {
    return String.fromCodePoint(cp);
  } catch {
    // String.fromCodePoint throws for values it does not accept (e.g. non-integers).
    return "";
  }
}
|
|
218
|
+
/**
 * Returns `url` with the query parameter `key` set to `value`.
 *
 * When `url` parses as an absolute URL the parameter is set through
 * `searchParams` (replacing an existing value for the same key). Otherwise
 * `key=value` is appended textually, using `&` if the string already
 * contains a `?` and `?` if it does not.
 *
 * @param {string} url - URL to extend.
 * @param {string} key - Query parameter name.
 * @param {string} value - Query parameter value.
 * @returns {string} The resulting URL string.
 */
function appendQuery(url, key, value) {
  try {
    const target = new URL(url);
    target.searchParams.set(key, value);
    return target.href;
  } catch {
    // Not an absolute URL: fall back to plain string concatenation.
    const alreadyHasQuery = url.includes("?");
    return alreadyHasQuery ? `${url}&${key}=${value}` : `${url}?${key}=${value}`;
  }
}
|
|
229
|
+
/**
 * Builds an abort signal that fires after `ms` milliseconds, or earlier if
 * the optional caller `signal` aborts.
 *
 * @param {AbortSignal} [signal] - Caller-supplied signal; may be omitted.
 * @param {number} ms - Timeout in milliseconds.
 * @returns {AbortSignal} The timeout signal alone, or a signal combining it with `signal`.
 */
function withTimeout(signal, ms) {
  const deadline = AbortSignal.timeout(ms);
  if (!signal) {
    return deadline;
  }
  return AbortSignal.any([signal, deadline]);
}
|
|
233
|
+
/**
 * Formats a duration in seconds as `M:SS`, or `H:MM:SS` once it reaches an hour.
 *
 * Fractional seconds are floored. Minutes are zero-padded to two digits only
 * when an hour component is shown; seconds are always two digits.
 *
 * @param {number} seconds - Duration in seconds.
 * @returns {string} Formatted duration.
 */
function formatDuration(seconds) {
  const wholeSeconds = Math.floor(seconds);
  const hours = Math.floor(wholeSeconds / 3600);
  const minutes = Math.floor((wholeSeconds % 3600) / 60);
  const secondsField = String(wholeSeconds % 60).padStart(2, "0");

  if (hours > 0) {
    const minutesField = String(minutes).padStart(2, "0");
    return `${hours}:${minutesField}:${secondsField}`;
  }
  return `${minutes}:${secondsField}`;
}
|
|
242
|
+
/**
 * Trims a description and caps its length at `DESCRIPTION_CAP` UTF-16 code
 * units, appending `…` when text was cut.
 *
 * The cut never lands between the two halves of a surrogate pair: if the
 * last kept code unit would be a high surrogate, it is dropped too, so the
 * result does not end in a lone surrogate.
 *
 * @param {string|null|undefined} value - Description text.
 * @returns {string|null} Capped description, or `null` when `value` is
 *   falsy or blank.
 */
function capDescription(value) {
  if (!value) {
    return null;
  }
  const trimmed = value.trim();
  if (!trimmed) {
    return null;
  }
  if (trimmed.length <= DESCRIPTION_CAP) {
    return trimmed;
  }

  let end = DESCRIPTION_CAP;
  const lastKept = trimmed.charCodeAt(end - 1);
  // A high surrogate at the cut means its low half would be sliced off.
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
    end -= 1;
  }
  return `${trimmed.slice(0, end)}…`;
}
|
|
252
|
+
/**
 * Returns the trimmed form of `value` when it is a non-blank string,
 * otherwise `fallback`.
 *
 * @param {*} value - Candidate value.
 * @param {*} fallback - Returned when `value` is not a string or is blank.
 * @returns {*} Trimmed string or `fallback`.
 */
function stringOr(value, fallback) {
  if (typeof value !== "string") {
    return fallback;
  }
  const cleaned = value.trim();
  return cleaned ? cleaned : fallback;
}
|
|
255
|
+
/**
 * Returns the trimmed form of `value` when it is a non-blank string,
 * otherwise `null`.
 *
 * @param {*} value - Candidate value.
 * @returns {string|null} Trimmed string, or `null` for non-strings and blank strings.
 */
function nonEmpty(value) {
  if (typeof value !== "string") {
    return null;
  }
  const cleaned = value.trim();
  return cleaned ? cleaned : null;
}
|