@steel-dev/atlas 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. package/LICENSE +21 -0
  2. package/README.md +219 -0
  3. package/dist/agent.d.ts +34 -0
  4. package/dist/agent.js +133 -0
  5. package/dist/async.d.ts +19 -0
  6. package/dist/async.js +172 -0
  7. package/dist/atlas.d.ts +19 -0
  8. package/dist/atlas.js +69 -0
  9. package/dist/budget.d.ts +64 -0
  10. package/dist/budget.js +336 -0
  11. package/dist/checklist.d.ts +115 -0
  12. package/dist/checklist.js +297 -0
  13. package/dist/cli.js +38700 -0
  14. package/dist/config.d.ts +80 -0
  15. package/dist/config.js +109 -0
  16. package/dist/context.d.ts +26 -0
  17. package/dist/context.js +250 -0
  18. package/dist/custom-tools.d.ts +26 -0
  19. package/dist/custom-tools.js +33 -0
  20. package/dist/defaults.d.ts +10 -0
  21. package/dist/defaults.js +37 -0
  22. package/dist/economy.d.ts +12 -0
  23. package/dist/economy.js +6 -0
  24. package/dist/env.d.ts +1 -0
  25. package/dist/env.js +8 -0
  26. package/dist/errors.d.ts +6 -0
  27. package/dist/errors.js +11 -0
  28. package/dist/event-hub.d.ts +11 -0
  29. package/dist/event-hub.js +83 -0
  30. package/dist/events.d.ts +105 -0
  31. package/dist/events.js +1 -0
  32. package/dist/html-extract.d.ts +21 -0
  33. package/dist/html-extract.js +459 -0
  34. package/dist/index.d.ts +59 -0
  35. package/dist/index.js +26 -0
  36. package/dist/memory.d.ts +2 -0
  37. package/dist/memory.js +38 -0
  38. package/dist/model.d.ts +49 -0
  39. package/dist/model.js +630 -0
  40. package/dist/orchestrate.d.ts +5 -0
  41. package/dist/orchestrate.js +277 -0
  42. package/dist/pdf-extract.d.ts +5 -0
  43. package/dist/pdf-extract.js +20 -0
  44. package/dist/prompts.d.ts +2 -0
  45. package/dist/prompts.js +6 -0
  46. package/dist/providers/domain/arxiv.d.ts +6 -0
  47. package/dist/providers/domain/arxiv.js +83 -0
  48. package/dist/providers/domain/clinicaltrials.d.ts +6 -0
  49. package/dist/providers/domain/clinicaltrials.js +104 -0
  50. package/dist/providers/domain/edgar.d.ts +10 -0
  51. package/dist/providers/domain/edgar.js +92 -0
  52. package/dist/providers/domain/index.d.ts +14 -0
  53. package/dist/providers/domain/index.js +7 -0
  54. package/dist/providers/domain/openalex.d.ts +7 -0
  55. package/dist/providers/domain/openalex.js +128 -0
  56. package/dist/providers/domain/pubmed.d.ts +8 -0
  57. package/dist/providers/domain/pubmed.js +123 -0
  58. package/dist/providers/domain/semantic-scholar.d.ts +6 -0
  59. package/dist/providers/domain/semantic-scholar.js +112 -0
  60. package/dist/providers/domain/shared.d.ts +12 -0
  61. package/dist/providers/domain/shared.js +39 -0
  62. package/dist/providers/domain/wikipedia.d.ts +6 -0
  63. package/dist/providers/domain/wikipedia.js +71 -0
  64. package/dist/providers/exa-agent.d.ts +9 -0
  65. package/dist/providers/exa-agent.js +67 -0
  66. package/dist/providers/fetch.d.ts +66 -0
  67. package/dist/providers/fetch.js +675 -0
  68. package/dist/providers/parallel-agent.d.ts +11 -0
  69. package/dist/providers/parallel-agent.js +100 -0
  70. package/dist/providers/perplexity-agent.d.ts +17 -0
  71. package/dist/providers/perplexity-agent.js +86 -0
  72. package/dist/providers/search.d.ts +65 -0
  73. package/dist/providers/search.js +433 -0
  74. package/dist/providers/store.d.ts +48 -0
  75. package/dist/providers/store.js +217 -0
  76. package/dist/researcher.d.ts +20 -0
  77. package/dist/researcher.js +3 -0
  78. package/dist/robots.d.ts +16 -0
  79. package/dist/robots.js +146 -0
  80. package/dist/roles.d.ts +6 -0
  81. package/dist/roles.js +4 -0
  82. package/dist/run.d.ts +65 -0
  83. package/dist/run.js +371 -0
  84. package/dist/safe-dispatcher.d.ts +16 -0
  85. package/dist/safe-dispatcher.js +32 -0
  86. package/dist/safety.d.ts +23 -0
  87. package/dist/safety.js +206 -0
  88. package/dist/sandbox.d.ts +22 -0
  89. package/dist/sandbox.js +228 -0
  90. package/dist/search-normalize.d.ts +2 -0
  91. package/dist/search-normalize.js +13 -0
  92. package/dist/source-documents.d.ts +77 -0
  93. package/dist/source-documents.js +421 -0
  94. package/dist/sources.d.ts +57 -0
  95. package/dist/sources.js +1 -0
  96. package/dist/spine.d.ts +19 -0
  97. package/dist/spine.js +722 -0
  98. package/dist/state.d.ts +90 -0
  99. package/dist/state.js +27 -0
  100. package/dist/structured.d.ts +7 -0
  101. package/dist/structured.js +18 -0
  102. package/dist/tools.d.ts +33 -0
  103. package/dist/tools.js +1187 -0
  104. package/dist/trace-digest.d.ts +11 -0
  105. package/dist/trace-digest.js +309 -0
  106. package/dist/trace.d.ts +225 -0
  107. package/dist/trace.js +278 -0
  108. package/dist/trail.d.ts +15 -0
  109. package/dist/trail.js +74 -0
  110. package/dist/url.d.ts +1 -0
  111. package/dist/url.js +25 -0
  112. package/package.json +107 -0
@@ -0,0 +1,83 @@
1
/**
 * Multi-consumer event stream with replay.
 *
 * Every emitted event is appended to `history`. Each async iterator obtained
 * from `iterable()` starts with a copy of that history, so a late subscriber
 * sees the whole stream from the first event. The stream ends either cleanly
 * (`close()`) or with an error (`fail()`); after that, `emit()` is a no-op.
 */
export class EventHub {
  /** Active subscriber records: `{ queue, resolveNext, rejectNext, returned }`. */
  subscribers = new Set();
  /** Every event emitted so far, in emission order. */
  history = [];
  /** True once `close()` or `fail()` has run. */
  closed = false;
  /** The error handed to `fail()`, or null. */
  failure = null;

  /**
   * Records `event` and delivers it to every subscriber: directly to a parked
   * `next()` call when there is one, otherwise onto that subscriber's queue.
   * Ignored after the hub has been closed or failed.
   */
  emit(event) {
    if (this.closed) {
      return;
    }
    this.history.push(event);
    for (const sub of this.subscribers) {
      if (sub.resolveNext) {
        const resolve = sub.resolveNext;
        sub.resolveNext = null;
        sub.rejectNext = null;
        resolve({ value: event, done: false });
      } else {
        sub.queue.push(event);
      }
    }
  }

  /**
   * Ends the stream cleanly. Parked `next()` calls resolve with `done: true`;
   * subscribers that still have queued events drain them first.
   */
  close() {
    if (this.closed) {
      return;
    }
    this.closed = true;
    for (const sub of this.subscribers) {
      sub.resolveNext?.({ value: undefined, done: true });
      sub.resolveNext = null;
      sub.rejectNext = null;
    }
    // Nothing is delivered after shutdown, and iterators read their own queue
    // plus the hub flags, so holding on to the records would only leak them.
    this.subscribers.clear();
  }

  /**
   * Ends the stream with `error`. Parked `next()` calls reject with it;
   * subscribers that still have queued events drain them and then reject.
   */
  fail(error) {
    if (this.closed) {
      return;
    }
    this.failure = error;
    this.closed = true;
    for (const sub of this.subscribers) {
      sub.rejectNext?.(error);
      sub.resolveNext = null;
      sub.rejectNext = null;
    }
    this.subscribers.clear();
  }

  /**
   * Returns an async iterable over the stream. Each `[Symbol.asyncIterator]()`
   * call creates an independent subscriber seeded with the current history.
   */
  iterable() {
    const hub = this;
    const subscribers = this.subscribers;
    const finished = () => Promise.resolve({ value: undefined, done: true });
    return {
      [Symbol.asyncIterator]: () => {
        const sub = {
          queue: [...hub.history],
          resolveNext: null,
          rejectNext: null,
          returned: false,
        };
        // After shutdown the replay queue is all this iterator will ever
        // see, so it does not need to be registered for delivery.
        if (!hub.closed) {
          subscribers.add(sub);
        }
        return {
          next() {
            if (sub.returned) {
              return finished();
            }
            if (sub.queue.length > 0) {
              return Promise.resolve({
                value: sub.queue.shift(),
                done: false,
              });
            }
            if (hub.failure) {
              subscribers.delete(sub);
              return Promise.reject(hub.failure);
            }
            if (hub.closed) {
              subscribers.delete(sub);
              return finished();
            }
            return new Promise((resolve, reject) => {
              sub.resolveNext = resolve;
              sub.rejectNext = reject;
            });
          },
          return() {
            subscribers.delete(sub);
            sub.returned = true;
            sub.queue.length = 0;
            // Settle a parked next() so its awaiter is not left hanging.
            const settle = sub.resolveNext;
            sub.resolveNext = null;
            sub.rejectNext = null;
            settle?.({ value: undefined, done: true });
            return finished();
          },
        };
      },
    };
  }
}
@@ -0,0 +1,105 @@
1
+ import type { Effort } from "./config.js";
2
+ import type { ModelRole } from "./model.js";
3
/** The two agent roles this module names. */
export type AgentRole =
    | "research"
    | "write";

/** Pairs a source identifier with a numeric citation marker. */
export interface Citation {
    /** Identifier of the source being cited. */
    sourceId: string;
    /** Numeric marker associated with the citation. */
    marker: number;
}

/** Value reported in `RunStats.stopReason`. */
export type StopReason =
    | "completed"
    | "finished"
    | "budget"
    | "tokens"
    | "timeout";
9
/**
 * Summary counters carried by the `run.completed` event.
 *
 * NOTE(review): the per-field meanings are not visible in this declaration
 * file; confirm them against the code that populates the stats.
 */
export interface RunStats {
    effort: Effort;

    // Search and cache counters.
    searches: number;
    searchCacheHits: number;
    modelCacheHits: number;
    modelGatePeakWidth: number;

    // Source and citation counters.
    sourcesFetched: number;
    sourcesFailed: number;
    citationsBound: number;
    citationsUnsupported: number;

    /** Input/output token counts keyed by model role; a role may be absent. */
    tokens: Partial<Record<ModelRole, { input: number; output: number }>>;

    // Cost, duration and termination.
    costUSD: number;
    durationMs: number;
    budgetExhausted: boolean;
    tokensExhausted: boolean;
    stopReason: StopReason;
}
29
/**
 * Discriminated union of every event shape declared by this module.
 * The string-literal `type` property is the discriminant.
 */
export type ResearchEvent =
    // Run lifecycle
    | {
        type: "run.started";
        runId: string;
        question: string;
        effort: Effort;
        budgetUSD: number;
    }
    | { type: "plan.updated"; rationale: string }
    // Search
    | { type: "search.completed"; query: string; provider: string; results: number }
    | { type: "search.failed"; query: string; error: string }
    // Sources
    | {
        type: "source.fetched";
        sourceId: string;
        url: string;
        title: string;
        via: string;
        chars: number;
        warnings?: string[];
    }
    | { type: "source.failed"; url: string; reason: string }
    // Report
    | { type: "report.drafting" }
    | { type: "report.delta"; text: string }
    | { type: "report.reset" }
    | { type: "report.completed"; report: string }
    // Budget, safety and availability notices
    | { type: "budget.warning"; spentUSD: number; limitUSD: number; fraction: number }
    | {
        type: "safety.flag";
        kind: "ssrf" | "url-entropy" | "injection" | "scheme";
        detail: string;
        url?: string;
    }
    | { type: "pricing.missing"; modelId: string; detail: string }
    | { type: "run_code.unavailable"; detail: string }
    | { type: "rate.limited"; retryAfterSeconds: number }
    // Tool passthrough
    | { type: "tool.event"; tool: string; data: Record<string, unknown> }
    // Run termination
    | { type: "run.completed"; stats: RunStats }
    | { type: "run.error"; message: string; recoverable: boolean };
101
/** Union of every `type` discriminant that appears in `ResearchEvent`. */
export type ResearchEventType = ResearchEvent["type"];

/** Maps each `type` discriminant to the `ResearchEvent` member that carries it. */
export type ResearchEventMap = {
    [E in ResearchEvent as E["type"]]: E;
};

/** Event schema version string exported by the module. */
export declare const EVENT_SCHEMA_VERSION = "1.0";
package/dist/events.js ADDED
@@ -0,0 +1 @@
1
/**
 * Event schema version string.
 * The accompanying declaration file types this constant as the literal "1.0".
 */
export const EVENT_SCHEMA_VERSION = "1.0";
@@ -0,0 +1,21 @@
1
+ import type { SourceDiscoveredLink } from "./sources.js";
2
/**
 * Optional page-level metadata returned alongside the Markdown conversion.
 * Every field is optional.
 */
export interface HtmlPageMetadata {
    // Identity
    canonical?: string;
    language?: string;
    description?: string;

    // Authorship
    author?: string;
    articleAuthor?: string;

    // Timestamps, typed as plain strings here
    publishedTime?: string;
    modifiedTime?: string;

    /** Parsed JSON-LD payload; the shape is not constrained by this type. */
    jsonLd?: unknown;
}
12
/** Result shape of `htmlToMarkdown`; not exported from this module. */
interface HtmlMarkdownExtraction {
    title: string;
    markdown: string;
    links: SourceDiscoveredLink[];
    metadata: HtmlPageMetadata;
}

/**
 * Converts an HTML document to Markdown and collects its title, links and metadata.
 *
 * @param html HTML source text.
 * @param url  URL of the page.
 * @param opts Optional settings; `linkLimit` bounds the number of links returned.
 */
export declare function htmlToMarkdown(
    html: string,
    url: string,
    opts?: { linkLimit?: number },
): HtmlMarkdownExtraction;

export {};
@@ -0,0 +1,459 @@
1
+ import * as cheerio from "cheerio";
2
+ import { normalizeUrlForSource } from "./url.js";
3
/** Link cap used by `htmlToMarkdown` when the caller passes no `linkLimit`. */
const DEFAULT_LINK_LIMIT = 200;

/**
 * Lower-case tag names treated as block-level: `childBlocks` starts a new
 * block for them and `inlineText` pads their text with spaces.
 */
const BLOCK_TAGS = new Set(
  [
    "address article aside blockquote body caption dd details div dl dt",
    "fieldset figcaption figure footer form h1 h2 h3 h4 h5 h6 header hr html",
    "li main nav ol p pre section summary table tbody td tfoot th thead tr ul",
  ].flatMap((group) => group.split(" ")),
);
47
/**
 * Converts an HTML document into Markdown plus title, links and page metadata.
 *
 * The title is the first non-empty of: `og:title`, `<title>`, the first `<h1>`,
 * and finally `url`. Content is taken from the first `<main>`, else the first
 * `<article>`, else `<body>`.
 *
 * @param {string} html HTML source.
 * @param {string} url  Page URL; used as the base for relative links and as the title fallback.
 * @param {{ linkLimit?: number }} [opts] `linkLimit` caps the number of links (default `DEFAULT_LINK_LIMIT`).
 * @returns {{ title: string, markdown: string, links: object[], metadata: object }}
 */
export function htmlToMarkdown(html, url, opts = {}) {
  const $ = cheerio.load(html);

  // Read metadata first: it inspects JSON-LD <script> elements, which the
  // cleanup on the next line removes.
  const metadata = extractHtmlMetadata($, url);
  $("script, style, noscript, svg, canvas, template").remove();

  let title = $('meta[property="og:title"]').attr("content")?.trim();
  if (!title) {
    title = $("title").first().text().trim();
  }
  if (!title) {
    title = $("h1").first().text().trim();
  }
  if (!title) {
    title = url;
  }

  let root = $("main").first();
  if (!root.length) {
    root = $("article").first();
  }
  if (!root.length) {
    root = $("body").first();
  }

  const rootElement = root.get(0);
  let blocks = [];
  if (rootElement && isElement(rootElement)) {
    blocks = childBlocks(rootElement.children, url);
  }

  // Fall back to the root's plain text when no blocks were produced.
  const unpolished = blocks.length > 0 ? blocks.join("\n\n") : root.text();
  const markdown = unpolished
    .replace(/[ \t]+\n/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();

  const links = extractLinks($, url, opts.linkLimit ?? DEFAULT_LINK_LIMIT);
  return { title, markdown, links, metadata };
}
75
/**
 * Reports whether a DOM node is an element, i.e. its `type` is
 * "tag", "script" or "style".
 */
function isElement(node) {
  switch (node.type) {
    case "tag":
    case "script":
    case "style":
      return true;
    default:
      return false;
  }
}
78
/** Reports whether a DOM node is a text node (`type === "text"`). */
function isText(node) {
  const { type } = node;
  return type === "text";
}
81
/**
 * Collapses every whitespace run in `text` to a single space and strips
 * leading and trailing whitespace.
 */
function collapse(text) {
  return text
    .split(/\s+/)
    .filter(Boolean)
    .join(" ");
}
84
/**
 * Converts a list of sibling nodes into Markdown blocks.
 *
 * Text nodes and non-block elements accumulate into a run of inline text;
 * each block-level element (per `BLOCK_TAGS`) ends the current run and adds
 * its own blocks. Nodes that are neither text nor elements are ignored.
 *
 * @returns {string[]} Markdown blocks in document order.
 */
function childBlocks(children, baseUrl) {
  const out = [];
  let pendingInline = "";

  const flushInline = () => {
    const text = collapse(pendingInline);
    pendingInline = "";
    if (text) {
      out.push(text);
    }
  };

  for (const node of children) {
    if (isText(node)) {
      pendingInline += node.data;
      continue;
    }
    if (!isElement(node)) {
      continue;
    }
    const tag = node.name.toLowerCase();
    if (!BLOCK_TAGS.has(tag)) {
      pendingInline += inlineText(node, baseUrl);
      continue;
    }
    flushInline();
    out.push(...elementBlocks(node, tag, baseUrl));
  }

  flushInline();
  return out;
}
112
/**
 * Renders one block-level element as zero or more Markdown blocks.
 *
 * @param el      The element.
 * @param tag     Its lower-cased tag name.
 * @param baseUrl Base URL for resolving links.
 * @returns {string[]}
 */
function elementBlocks(el, tag, baseUrl) {
  const inlineContent = () => collapse(inlineChildren(el.children, baseUrl));

  // Headings: h1..h6 become "#".."######".
  if (/^h[1-6]$/.test(tag)) {
    const text = inlineContent();
    if (!text) {
      return [];
    }
    const level = Number(tag[1]);
    return [`${"#".repeat(level)} ${text}`];
  }

  // Containers rendered as a single paragraph of inline text.
  if (
    tag === "p" ||
    tag === "figcaption" ||
    tag === "caption" ||
    tag === "summary" ||
    tag === "address" ||
    tag === "dt"
  ) {
    const text = inlineContent();
    return text ? [text] : [];
  }

  if (tag === "blockquote") {
    const quoted = childBlocks(el.children, baseUrl);
    if (quoted.length === 0) {
      return [];
    }
    const prefixed = quoted
      .join("\n\n")
      .split("\n")
      .map((line) => (line ? `> ${line}` : ">"));
    return [prefixed.join("\n")];
  }

  if (tag === "pre") {
    const fence = codeFence(el);
    return fence ? [fence] : [];
  }

  if (tag === "ul" || tag === "ol") {
    const list = renderList(el, tag === "ol", baseUrl);
    return list ? [list] : [];
  }

  if (tag === "table") {
    return renderTable(el, baseUrl);
  }

  if (tag === "dl") {
    return renderDefinitionList(el, baseUrl);
  }

  // Horizontal rules produce no output.
  if (tag === "hr") {
    return [];
  }

  // Any other block element is transparent: render its children.
  return childBlocks(el.children, baseUrl);
}
158
/**
 * Concatenates the inline rendering of a list of sibling nodes: text nodes
 * contribute their data, elements contribute `inlineText`, all else is skipped.
 */
function inlineChildren(children, baseUrl) {
  let text = "";
  for (const node of children) {
    if (isText(node)) {
      text += node.data;
      continue;
    }
    if (isElement(node)) {
      text += inlineText(node, baseUrl);
    }
  }
  return text;
}
168
/**
 * Renders a single element as inline Markdown text.
 *
 * `br` becomes a newline, `img`/`wbr` are dropped, `a` becomes a Markdown
 * link and `code`/`kbd`/`samp` become an inline code span. Any other element
 * renders its children, padded with a space on each side when the tag is
 * block-level.
 */
function inlineText(el, baseUrl) {
  const tag = el.name.toLowerCase();
  switch (tag) {
    case "br":
      return "\n";
    case "img":
    case "wbr":
      return "";
    case "a":
      return markdownLink(el, baseUrl);
    case "code":
    case "kbd":
    case "samp":
      return inlineCode(collapse(rawText(el)));
    default: {
      const inner = inlineChildren(el.children, baseUrl);
      return BLOCK_TAGS.has(tag) ? ` ${inner} ` : inner;
    }
  }
}
182
/**
 * Wraps `text` in a Markdown inline code span.
 *
 * Text without backticks uses a single-backtick delimiter. Otherwise the
 * delimiter is one backtick longer than the longest backtick run in the text
 * (and at least two), padded with a space on each side, so the content can
 * never close the span early.
 *
 * @param {string} text Already-collapsed code text.
 * @returns {string} The code span, or "" for empty text.
 */
function inlineCode(text) {
  if (!text) {
    return "";
  }
  if (!text.includes("`")) {
    return `\`${text}\``;
  }
  let longestRun = 0;
  for (const run of text.match(/`+/g)) {
    longestRun = Math.max(longestRun, run.length);
  }
  const fence = "`".repeat(Math.max(2, longestRun + 1));
  return `${fence} ${text} ${fence}`;
}
187
/**
 * Renders an `<a>` element as Markdown.
 *
 * - No usable http(s) target: the link text alone.
 * - Usable target but empty text: nothing.
 * - Text identical to the target: the bare URL.
 * - Otherwise `[text](target)`, with the target wrapped in `<…>` when it
 *   contains parentheses or whitespace.
 */
function markdownLink(el, baseUrl) {
  const label = collapse(inlineChildren(el.children, baseUrl));
  const href = resolveHttpUrl(el.attribs?.href, baseUrl);
  if (!href) {
    return label;
  }
  if (!label) {
    return "";
  }
  if (label === href) {
    return href;
  }
  const needsAngleBrackets = /[()\s]/.test(href);
  const destination = needsAngleBrackets ? `<${href}>` : href;
  return `[${label}](${destination})`;
}
199
/**
 * Resolves `rawHref` against `baseUrl` and returns the absolute URL, but only
 * when the result is http(s).
 *
 * @returns {string | null} null for empty or fragment-only hrefs, for
 *   javascript:/mailto:/tel:/data: hrefs, for unparsable input and for any
 *   non-http(s) result.
 */
function resolveHttpUrl(rawHref, baseUrl) {
  const candidate = (rawHref ?? "").trim();
  const unusable =
    candidate === "" ||
    candidate.startsWith("#") ||
    /^(?:javascript|mailto|tel|data):/i.test(candidate);
  if (unusable) {
    return null;
  }

  let resolved;
  try {
    resolved = new URL(candidate, baseUrl).toString();
  } catch {
    return null;
  }
  return /^https?:\/\//i.test(resolved) ? resolved : null;
}
213
/**
 * Renders a `<ul>`/`<ol>` element as a Markdown list.
 *
 * Only `<li>` children with content become items; ordered items are numbered
 * from 1, counting rendered items only. Continuation lines of an item are
 * indented by the marker width. A list nested directly under the list
 * (not inside an `<li>`) is rendered recursively with each line prefixed by
 * one space.
 * NOTE(review): one space may be too shallow for Markdown renderers to treat
 * those lines as nested — confirm the intended indent against the upstream source.
 *
 * @param el      The list element.
 * @param ordered True for numbered markers, false for "- ".
 * @param baseUrl Base URL for resolving links.
 * @returns {string} The list text, or "" when nothing was rendered.
 */
function renderList(el, ordered, baseUrl) {
  const out = [];
  let counter = 0;

  for (const node of el.children) {
    if (!isElement(node)) {
      continue;
    }
    const name = node.name.toLowerCase();

    if (name === "ul" || name === "ol") {
      const nested = renderList(node, name === "ol", baseUrl);
      if (nested) {
        for (const line of nested.split("\n")) {
          out.push(` ${line}`);
        }
      }
      continue;
    }
    if (name !== "li") {
      continue;
    }

    const itemBlocks = childBlocks(node.children, baseUrl);
    if (itemBlocks.length === 0) {
      continue;
    }

    counter += 1;
    const marker = ordered ? `${counter}. ` : "- ";
    const indent = " ".repeat(marker.length);
    const [firstLine, ...restLines] = itemBlocks.join("\n").split("\n");
    out.push(marker + firstLine);
    for (const line of restLines) {
      out.push(line ? indent + line : "");
    }
  }

  return out.join("\n");
}
243
/**
 * Renders a `<table>` element as Markdown blocks: any non-empty `<caption>`
 * text, followed by one pipe table.
 *
 * Rows are collected from `<tr>` elements directly under the table or under
 * `<thead>`/`<tbody>`/`<tfoot>`. The first collected row is used as the
 * header row, and shorter rows are padded with empty cells to the widest row.
 *
 * @param el      The table element.
 * @param baseUrl Base URL for resolving links inside cells.
 * @returns {string[]} Caption blocks and, when any row has cells, the table.
 */
function renderTable(el, baseUrl) {
  const blocks = [];
  const rows = [];

  const visitRows = (children) => {
    for (const child of children) {
      if (!isElement(child)) {
        continue;
      }
      const tag = child.name.toLowerCase();
      if (tag === "tr") {
        const cells = [];
        for (const cell of child.children) {
          if (!isElement(cell)) {
            continue;
          }
          const cellTag = cell.name.toLowerCase();
          if (cellTag === "td" || cellTag === "th") {
            cells.push(cellText(cell, baseUrl));
          }
        }
        if (cells.length > 0) {
          rows.push(cells);
        }
      } else if (tag === "thead" || tag === "tbody" || tag === "tfoot") {
        visitRows(child.children);
      } else if (tag === "caption") {
        const caption = collapse(inlineChildren(child.children, baseUrl));
        if (caption) {
          blocks.push(caption);
        }
      }
    }
  };

  visitRows(el.children);
  if (rows.length === 0) {
    return blocks;
  }

  // Fold rather than spread into Math.max: spreading passes one argument per
  // row, which can exceed the engine's argument limit on very large tables.
  const width = rows.reduce((widest, row) => Math.max(widest, row.length), 0);

  const formatRow = (row) => {
    const cells =
      row.length < width
        ? [...row, ...Array(width - row.length).fill("")]
        : row;
    return `| ${cells.join(" | ")} |`;
  };

  const lines = [
    formatRow(rows[0]),
    `| ${Array(width).fill("---").join(" | ")} |`,
  ];
  for (let i = 1; i < rows.length; i += 1) {
    lines.push(formatRow(rows[i]));
  }

  blocks.push(lines.join("\n"));
  return blocks;
}
291
/**
 * Renders a table cell's inline content on one line, escaping every `|` as
 * `\|` so it cannot be read as a column separator.
 */
function cellText(el, baseUrl) {
  const text = collapse(inlineChildren(el.children, baseUrl));
  return text.split("|").join("\\|");
}
294
/**
 * Renders a `<dl>` element as a single bullet-list block.
 *
 * Consecutive `<dt>` terms are joined with ", " and their `<dd>` definitions
 * with "; ", giving "- term: definition" (or just the side that is present).
 * A `<dt>` that follows at least one `<dd>` starts a new entry. `<div>`
 * wrappers are descended into; other children are ignored.
 *
 * @returns {string[]} One block holding all entries, or [] when there are none.
 */
function renderDefinitionList(el, baseUrl) {
  const entries = [];
  let pendingTerms = [];
  let pendingDefs = [];

  const emitEntry = () => {
    const term = pendingTerms.filter(Boolean).join(", ");
    const def = pendingDefs.filter(Boolean).join("; ");
    pendingTerms = [];
    pendingDefs = [];
    if (term && def) {
      entries.push(`- ${term}: ${def}`);
    } else if (term || def) {
      entries.push(`- ${term || def}`);
    }
  };

  const walk = (nodes) => {
    for (const node of nodes) {
      if (!isElement(node)) {
        continue;
      }
      switch (node.name.toLowerCase()) {
        case "dt":
          if (pendingDefs.length > 0) {
            emitEntry();
          }
          pendingTerms.push(collapse(inlineChildren(node.children, baseUrl)));
          break;
        case "dd":
          pendingDefs.push(collapse(inlineChildren(node.children, baseUrl)));
          break;
        case "div":
          walk(node.children);
          break;
        default:
          break;
      }
    }
  };

  walk(el.children);
  emitEntry();
  return entries.length > 0 ? [entries.join("\n")] : [];
}
329
/**
 * Renders a `<pre>` element as a fenced code block.
 *
 * Leading newlines and trailing whitespace are stripped from the text. The
 * fence starts at three backticks and grows until it no longer occurs in the
 * text. The language tag from `codeLanguage` is appended to the opening fence.
 *
 * @returns {string} The fenced block, or "" when the text is only whitespace.
 */
function codeFence(el) {
  const body = rawText(el).replace(/^\n+/, "").replace(/\s+$/, "");
  if (!collapse(body)) {
    return "";
  }

  let tickCount = 3;
  while (body.includes("`".repeat(tickCount))) {
    tickCount += 1;
  }
  const fence = "`".repeat(tickCount);

  return `${fence}${codeLanguage(el)}\n${body}\n${fence}`;
}
339
/**
 * Extracts a language tag from the `class` attribute of the element or of its
 * direct `<code>` children, matching `language-xxx` or `lang-xxx`.
 *
 * @returns {string} The lower-cased language name, or "" when none matches.
 */
function codeLanguage(el) {
  let classNames = el.attribs?.class ?? "";
  for (const node of el.children) {
    if (isElement(node) && node.name.toLowerCase() === "code") {
      classNames += ` ${node.attribs?.class ?? ""}`;
    }
  }
  const found = /(?:language|lang)-([\w+-]+)/i.exec(classNames);
  return found ? found[1].toLowerCase() : "";
}
349
/**
 * Returns the text content of an element without collapsing whitespace;
 * `<br>` elements contribute a newline and other elements are recursed into.
 */
function rawText(el) {
  let text = "";
  for (const node of el.children) {
    if (isText(node)) {
      text += node.data;
      continue;
    }
    if (!isElement(node)) {
      continue;
    }
    text += node.name.toLowerCase() === "br" ? "\n" : rawText(node);
  }
  return text;
}
361
/**
 * Collects up to `limit` distinct http(s) links from the document's
 * `<a href>` elements, in document order.
 *
 * Empty, fragment-only, `javascript:` and unparsable hrefs are skipped. Links
 * are de-duplicated on `normalizeUrlForSource(url)`, but the un-normalized
 * absolute URL is what gets returned. Anchor text, when present, becomes
 * `title`, truncated to 200 characters.
 *
 * @param $       Loaded cheerio document.
 * @param baseUrl Base URL for resolving relative hrefs.
 * @param limit   Maximum number of links to return.
 * @returns {{ url: string, title?: string }[]}
 */
function extractLinks($, baseUrl, limit) {
  const collected = [];
  const seenKeys = new Set();

  for (const anchor of $("a[href]").toArray()) {
    if (collected.length >= limit) {
      break;
    }

    const rawHref = ($(anchor).attr("href") ?? "").trim();
    if (rawHref === "" || rawHref.startsWith("#") || /^javascript:/i.test(rawHref)) {
      continue;
    }

    let resolved;
    try {
      resolved = new URL(rawHref, baseUrl).toString();
    } catch {
      continue;
    }
    if (!/^https?:\/\//i.test(resolved)) {
      continue;
    }

    const key = normalizeUrlForSource(resolved);
    if (seenKeys.has(key)) {
      continue;
    }
    seenKeys.add(key);

    const label = $(anchor).text().replace(/\s+/g, " ").trim();
    const link = { url: resolved };
    if (label) {
      link.title = label.slice(0, 200);
    }
    collected.push(link);
  }

  return collected;
}
391
/**
 * Reads page-level metadata from `<link>`, `<meta>`, `<html lang>` and
 * JSON-LD `<script>` elements. Only fields with a non-empty value are set.
 *
 * Fallback order: `publishedTime` ← article:published_time, date, pubdate;
 * `modifiedTime` ← article:modified_time, last-modified;
 * `description` ← description, og:description.
 * `jsonLd` is the single parsed document when exactly one parses, or an array
 * when several do.
 *
 * @param $       Loaded cheerio document.
 * @param baseUrl Base URL for resolving the canonical link.
 */
function extractHtmlMetadata($, baseUrl) {
  const content = (selector) => $(selector).attr("content")?.trim();

  const canonical = normalizeOptionalUrl(
    $('link[rel="canonical"]').first().attr("href"),
    baseUrl,
  );

  const jsonLdDocs = [];
  for (const el of $("script[type='application/ld+json']").toArray()) {
    const parsed = parseJsonLd($(el).text());
    if (parsed !== undefined) {
      jsonLdDocs.push(parsed);
    }
  }

  const author = content('meta[name="author"]');
  const articleAuthor = content('meta[property="article:author"]');
  const publishedTime =
    content('meta[property="article:published_time"]') ||
    content('meta[name="date"]') ||
    content('meta[name="pubdate"]');
  const modifiedTime =
    content('meta[property="article:modified_time"]') ||
    content('meta[name="last-modified"]');
  const description =
    content('meta[name="description"]') ||
    content('meta[property="og:description"]');
  const language = $("html").attr("lang")?.trim();

  // Assign in a fixed order so the key order of the result stays stable.
  const metadata = {};
  if (canonical) {
    metadata.canonical = canonical;
  }
  if (author) {
    metadata.author = author;
  }
  if (articleAuthor) {
    metadata.articleAuthor = articleAuthor;
  }
  if (publishedTime) {
    metadata.publishedTime = publishedTime;
  }
  if (modifiedTime) {
    metadata.modifiedTime = modifiedTime;
  }
  if (description) {
    metadata.description = description;
  }
  if (language) {
    metadata.language = language;
  }
  if (jsonLdDocs.length === 1) {
    metadata.jsonLd = jsonLdDocs[0];
  } else if (jsonLdDocs.length > 1) {
    metadata.jsonLd = jsonLdDocs;
  }
  return metadata;
}
439
/**
 * Resolves `rawUrl` against `baseUrl` and returns the absolute URL string.
 *
 * @returns {string | undefined} undefined when `rawUrl` is empty/missing or
 *   the URL cannot be parsed.
 */
function normalizeOptionalUrl(rawUrl, baseUrl) {
  if (rawUrl) {
    try {
      return new URL(rawUrl, baseUrl).toString();
    } catch {
      // Unparsable URL: fall through to the undefined result.
    }
  }
  return undefined;
}
449
/**
 * Parses the text of a JSON-LD script element.
 *
 * @returns The parsed JSON value, or undefined when the text is blank or is
 *   not valid JSON.
 */
function parseJsonLd(text) {
  const source = text.trim();
  if (source.length === 0) {
    return undefined;
  }
  try {
    return JSON.parse(source);
  } catch {
    return undefined;
  }
}