edgecrawl 0.3.0

package/src/html2md.mjs ADDED
@@ -0,0 +1,268 @@
+ // src/html2md.mjs
+ // HTML -> clean Markdown conversion
+ // CSS selector-based HTML cleaning + node-html-markdown
+
+ import { JSDOM } from "jsdom";
+ import { NodeHtmlMarkdown } from "node-html-markdown";
+
+ // ---- CSS selector-based HTML cleaning ----
+
+ // Elements to remove: nav, footer, sidebar, ads, modals, social, etc.
+ const REMOVE_SELECTORS = [
+   // Nav, header, footer
+   "header", "footer", "nav",
+   ".header", ".top", ".navbar", "#header",
+   ".footer", ".bottom", "#footer",
+   // Sidebar
+   "aside",
+   ".sidebar", ".side", ".aside", "#sidebar",
+   // Ads
+   ".ad", ".ads", ".advert", "#ad",
+   "[class*='ad-']", "[class*='ads-']", "[id*='ad-']",
+   // Modals, popups
+   ".modal", ".popup", "#modal", ".overlay",
+   // Social, share
+   ".social", ".social-media", ".social-links", "#social",
+   ".share", "#share",
+   // Nav, menu, breadcrumbs
+   ".menu", ".navigation", "#nav",
+   ".breadcrumbs", "#breadcrumbs", ".breadcrumb",
+   // Widgets, cookie banners
+   ".widget", "#widget",
+   ".cookie", "#cookie", ".cookie-banner",
+   // Language selectors
+   ".lang-selector", ".language", "#language-selector",
+ ];
+
+ // Protected elements: do not remove if they contain or are contained by these
+ const PROTECT_SELECTORS = [
+   "#main", "#content", "main", "article",
+   "[role='main']", "[role='article']",
+ ];
+
+ /**
+  * Remove unwanted noise elements from HTML using CSS selectors
+  * @param {Document} document - JSDOM document
+  */
+ function removeUnwantedElements(document) {
+   // 0. Remove <head>; title and meta description are extracted beforehand
+   const head = document.querySelector("head");
+   if (head) head.remove();
+
+   // 1. Remove script, style, noscript, iframe, svg, canvas, template
+   for (const tag of ["script", "style", "noscript", "iframe", "svg", "canvas", "template"]) {
+     for (const el of document.querySelectorAll(tag)) {
+       el.remove();
+     }
+   }
+
+   // 2. Strip style attributes, event handlers, and data-* attributes
+   for (const el of document.querySelectorAll("*")) {
+     const attrs = Array.from(el.attributes || []);
+     for (const attr of attrs) {
+       if (
+         attr.name === "style" ||
+         attr.name.startsWith("on") ||
+         attr.name.startsWith("data-")
+       ) {
+         el.removeAttribute(attr.name);
+       }
+     }
+   }
+
+   // 3. Remove hidden elements
+   for (const el of document.querySelectorAll("[hidden], [aria-hidden='true']")) {
+     el.remove();
+   }
+
+   // 4. Collect protected elements
+   const protectedElements = new Set();
+   for (const sel of PROTECT_SELECTORS) {
+     try {
+       for (const el of document.querySelectorAll(sel)) {
+         protectedElements.add(el);
+       }
+     } catch { /* invalid selector */ }
+   }
+
+   function isProtected(el) {
+     for (const p of protectedElements) {
+       if (p.contains(el) || el.contains(p)) return true;
+     }
+     return false;
+   }
+
+   // 5. Remove noise elements by CSS selector
+   for (const sel of REMOVE_SELECTORS) {
+     try {
+       for (const el of document.querySelectorAll(sel)) {
+         if (!isProtected(el)) el.remove();
+       }
+     } catch { /* invalid selector */ }
+   }
+
+   // 6. Remove empty elements
+   for (const tag of ["div", "span", "p", "section"]) {
+     for (const el of document.querySelectorAll(tag)) {
+       if (!el.textContent.trim() && !el.querySelector("img, video")) {
+         el.remove();
+       }
+     }
+   }
+ }
+
+ /**
+  * Convert relative URLs to absolute URLs
+  * @param {Document} document
+  * @param {string} baseUrl
+  */
+ function absolutifyURLs(document, baseUrl) {
+   try {
+     const base = new URL(baseUrl);
+
+     for (const img of document.querySelectorAll("img[src]")) {
+       try {
+         img.setAttribute("src", new URL(img.getAttribute("src"), base).href);
+       } catch { /* invalid URL */ }
+     }
+
+     for (const a of document.querySelectorAll("a[href]")) {
+       try {
+         const href = a.getAttribute("href");
+         if (href && !href.startsWith("#") && !href.startsWith("javascript:")) {
+           a.setAttribute("href", new URL(href, base).href);
+         }
+       } catch { /* invalid URL */ }
+     }
+   } catch { /* invalid baseUrl */ }
+ }
+
+ /**
+  * Resolve img srcset to the largest image URL
+  * @param {Document} document
+  * @param {string} baseUrl
+  */
+ function resolveSrcset(document, baseUrl) {
+   for (const img of document.querySelectorAll("img[srcset]")) {
+     try {
+       const srcset = img.getAttribute("srcset");
+       const candidates = srcset.split(",").map((s) => {
+         const parts = s.trim().split(/\s+/);
+         const url = parts[0];
+         const descriptor = parts[1] || "1x";
+         const size = parseFloat(descriptor) || 1;
+         return { url, size };
+       });
+       candidates.sort((a, b) => b.size - a.size);
+       if (candidates.length > 0) {
+         const resolved = new URL(candidates[0].url, baseUrl).href;
+         img.setAttribute("src", resolved);
+       }
+       img.removeAttribute("srcset");
+     } catch { /* parse error */ }
+   }
+ }
+
+ /**
+  * Main HTML-to-Markdown conversion
+  * CSS selector-based HTML cleaning -> node-html-markdown
+  * @param {string} html - Raw HTML
+  * @param {string} url - Source URL
+  * @param {object} options
+  * @param {string} options.selector - CSS selector to narrow target content
+  * @returns {{ markdown: string, title: string, excerpt: string }}
+  */
+ export function htmlToMarkdown(html, url = "https://example.com", options = {}) {
+   const { selector } = options;
+   const dom = new JSDOM(html, { url });
+   const document = dom.window.document;
+
+   // Extract title and meta description first
+   const title = document.title || "";
+   const metaDesc = document.querySelector('meta[name="description"]');
+   const excerpt = metaDesc ? metaDesc.getAttribute("content") || "" : "";
+
+   // If a selector is specified, replace the body with the target element
+   if (selector) {
+     const target = document.querySelector(selector);
+     if (target) {
+       document.body.innerHTML = target.outerHTML;
+     }
+   }
+
+   // HTML cleaning
+   removeUnwantedElements(document);
+   absolutifyURLs(document, url);
+   resolveSrcset(document, url);
+
+   // Convert cleaned HTML to Markdown via node-html-markdown
+   const bodyHtml = document.body?.innerHTML || "";
+   if (!bodyHtml.trim()) {
+     return { markdown: "", title, excerpt };
+   }
+
+   const markdown = NodeHtmlMarkdown.translate(bodyHtml, {
+     maxConsecutiveNewlines: 2,
+   }).trim();
+
+   return { markdown, title, excerpt };
+ }
+
+ /**
+  * Markdown post-processing: generic noise reduction for LLM input
+  * Algorithm-based, no site-specific rules
+  * @param {string} md - Raw Markdown
+  * @param {object} options
+  * @param {boolean} options.removeImages - Remove image references (default: false)
+  * @returns {string} Cleaned Markdown
+  */
+ export function cleanMarkdown(md, options = {}) {
+   const { removeImages = false } = options;
+
+   let result = md;
+
+   // 1. Remove lines containing javascript: links
+   result = result.replace(/^.*\[.*\]\(javascript:.*\).*$/gm, "");
+
+   // 2. Remove empty links and alt-less images
+   result = result
+     .replace(/\[\s*\]\([^)]*\)/g, "") // [](url) or [ ](url)
+     .replace(/!\[\]\([^)]*\)/g, ""); // ![](url) alt-less image
+
+   // 3. Hash-only links -> keep text only
+   result = result.replace(/\[([^\]]+)\]\(#[^)]*\)/g, "$1");
+
+   // 4. Overly long URL links -> keep text only
+   result = result.replace(/\[([^\]]+)\]\([^)]{150,}\)/g, "$1");
+
+   // 5. Normalize extra whitespace in link text
+   result = result.replace(/\[([^\]]*)\]/g, (_, t) => `[${t.replace(/\s+/g, " ").trim()}]`);
+
+   // 6. Remove lines containing ${var}, and lines consisting only of a %var% token
+   result = result.replace(/^.*\$\{[^}]+\}.*$/gm, "");
+   result = result.replace(/^\s*%[a-zA-Z]\w*%\s*$/gm, "");
+
+   // 7. Remove JSON-like lines (including escaped: {" or {\" patterns)
+   result = result.replace(/^\s*\{\\?".*[\w_]+\\?"\s*:.*\}\s*$/gm, "");
+
+   // 8. Remove images (optional)
+   if (removeImages) {
+     result = result.replace(/!\[[^\]]*\]\([^)]*\)/g, "");
+   }
+
+   // 9. Collapse consecutive blank lines
+   return result
+     .replace(/\n{3,}/g, "\n\n")
+     .trim();
+ }
+
+ /**
+  * Truncate Markdown to fit within LLM token limits
+  * Rough estimate: 1 token ~ 3 characters
+  */
+ export function truncateForLLM(markdown, maxTokens = 2048) {
+   const approxMaxChars = maxTokens * 3;
+   if (markdown.length <= approxMaxChars) return markdown;
+
+   return markdown.slice(0, approxMaxChars) + "\n\n[... truncated]";
+ }
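
A minimal usage sketch for html2md.mjs (the sample HTML and the relative import path are illustrative assumptions, not taken from the package):

// Convert raw HTML to LLM-ready Markdown, then cap its length.
// With maxTokens = 2048 the heuristic keeps at most ~6144 characters (3 chars/token).
import { htmlToMarkdown, cleanMarkdown, truncateForLLM } from "./src/html2md.mjs";

const sampleHtml = `<html><head><title>Hello</title></head>
<body><nav>menu</nav><main><h1>Hello</h1><p>World</p></main></body></html>`;

const { markdown, title } = htmlToMarkdown(sampleHtml, "https://example.com/post");
const llmInput = truncateForLLM(cleanMarkdown(markdown), 2048);
console.log(title);    // "Hello"
console.log(llmInput); // Markdown with the <nav> noise stripped
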
package/src/llm.mjs ADDED
@@ -0,0 +1,281 @@
+ // src/llm.mjs
+ // ONNX LLM wrapper using Transformers.js v4
+ // Runs on Node.js with WebGPU or WASM
+
+ import { pipeline, env } from "@huggingface/transformers";
+
+ const MODEL_PRESETS = {
+   granite: {
+     model: "onnx-community/granite-4.0-350m-ONNX-web",
+     dtype: "fp16",
+     maxNewTokens: 32768,
+     type: "granite",
+   },
+   // NuExtract: 500M - specialized for structured extraction
+   nuextract: {
+     model: "onnx-community/NuExtract-1.5-tiny-ONNX",
+     dtype: "q4",
+     maxNewTokens: 32768,
+     type: "nuextract",
+   },
+   qwen25: {
+     model: "onnx-community/Qwen2.5-0.5B-Instruct-ONNX-MHA",
+     dtype: "q4f16",
+     maxNewTokens: 32768,
+     type: "qwen25",
+   },
+   // Light: 0.6B - Qwen3
+   light: {
+     model: "onnx-community/Qwen3-0.6B-ONNX",
+     dtype: "q4f16",
+     maxNewTokens: 32768,
+     type: "qwen3",
+   },
+   // Balanced: 1.7B - Qwen3
+   balanced: {
+     model: "onnx-community/Qwen3-1.7B-ONNX",
+     dtype: "q4f16",
+     maxNewTokens: 32768,
+     type: "qwen3",
+   },
+   // Quality: 4B - Qwen3
+   quality: {
+     model: "onnx-community/Qwen3-4B-ONNX",
+     dtype: "q4f16",
+     maxNewTokens: 32768,
+     type: "qwen3",
+   },
+ };
+
+ let generator = null;
+ let currentPreset = null;
+
+ /**
+  * Initialize the LLM
+  */
+ export async function initLLM(preset = "balanced", device = "webgpu") {
+   if (generator && currentPreset === preset) return;
+
+   const config = MODEL_PRESETS[preset];
+   if (!config) throw new Error(`Unknown preset: ${preset}. Available: ${Object.keys(MODEL_PRESETS).join(", ")}`);
+
+   console.log(`Loading model: ${config.model} (${config.dtype}, ${device})...`);
+
+   env.cacheDir = "./.model-cache";
+
+   generator = await pipeline("text-generation", config.model, {
+     dtype: config.dtype,
+     device,
+   });
+   currentPreset = preset;
+
+   console.log("Model loaded.");
+ }
+
+ /**
+  * Generate a template object from a JSON Schema
+  * NuExtract expects an empty output template, not a JSON Schema
+  */
+ function schemaToTemplate(schema) {
+   if (!schema.properties) return schema;
+
+   const template = {};
+   for (const [key, def] of Object.entries(schema.properties)) {
+     if (def.type === "array") {
+       if (def.items?.properties) {
+         const item = {};
+         for (const k of Object.keys(def.items.properties)) {
+           item[k] = "";
+         }
+         template[key] = [item];
+       } else {
+         template[key] = [];
+       }
+     } else if (def.type === "object" && def.properties) {
+       template[key] = schemaToTemplate(def);
+     } else {
+       template[key] = "";
+     }
+   }
+   return template;
+ }
+
103
+ /**
104
+ * Extract structured data (JSON) from Markdown content
105
+ */
106
+ export async function extractStructured(markdown, schema) {
107
+ if (!generator) throw new Error("LLM not initialized. Call initLLM() first.");
108
+
109
+ const config = MODEL_PRESETS[currentPreset];
110
+
111
+ if (config.type === "nuextract") {
112
+ return extractWithNuExtract(markdown, schema, config);
113
+ } else if (config.type === "qwen25" || config.type === "granite") {
114
+ return extractWithQwen25(markdown, schema, config);
115
+ } else {
116
+ return extractWithQwen3(markdown, schema, config);
117
+ }
118
+ }
119
+
120
+ /**
121
+ * NuExtract-specific extraction
122
+ */
123
+ async function extractWithNuExtract(markdown, schema, config) {
124
+ const template = schemaToTemplate(schema);
125
+ const templateStr = JSON.stringify(template, null, 2);
126
+
127
+ const prompt = `<|input|>\n### Template:\n${templateStr}\n### Text:\n${markdown}\n\n<|output|>\n`;
128
+
129
+ const output = await generator(prompt, {
130
+ max_new_tokens: config.maxNewTokens,
131
+ do_sample: false,
132
+ });
133
+
134
+ const raw = output[0].generated_text.slice(prompt.length).trim();
135
+ return parseJSONSafe(raw);
136
+ }
137
+
138
+ /**
139
+ * Qwen2.5-specific extraction (no thinking mode)
140
+ */
141
+ async function extractWithQwen25(markdown, schema, config) {
142
+ const schemaStr = JSON.stringify(schema, null, 2);
143
+
144
+ const messages = [
145
+ {
146
+ role: "system",
147
+ content: `You are a data extraction assistant. Extract structured information from the given content and return ONLY valid JSON matching the provided JSON Schema. No explanation, no markdown fences, just raw JSON.`,
148
+ },
149
+ {
150
+ role: "user",
151
+ content: `Extract data from the content below according to this JSON Schema:\n\n${schemaStr}\n\n---\nContent:\n${markdown}\n---\n\nReturn ONLY a valid JSON object matching the schema above.`,
152
+ },
153
+ ];
154
+
155
+ const output = await generator(messages, {
156
+ max_new_tokens: config.maxNewTokens,
157
+ temperature: 0.7,
158
+ top_p: 0.8,
159
+ top_k: 20,
160
+ do_sample: true,
161
+ repetition_penalty: 1.2,
162
+ });
163
+
164
+ const raw = output[0].generated_text.at(-1).content;
165
+ return normalizeExtracted(parseJSONSafe(raw));
166
+ }
167
+
168
+ /**
169
+ * Qwen3-specific extraction
170
+ */
171
+ async function extractWithQwen3(markdown, schema, config) {
172
+ const schemaStr = JSON.stringify(schema, null, 2);
173
+
174
+ const messages = [
175
+ {
176
+ role: "system",
177
+ content: `You are a data extraction assistant. Extract structured information from the given content and return ONLY valid JSON matching the provided JSON Schema. No explanation, no markdown fences, no thinking, just raw JSON.`,
178
+ },
179
+ {
180
+ role: "user",
181
+ content: `Extract data from the content below according to this JSON Schema:\n\n${schemaStr}\n\n---\nContent:\n${markdown}\n---\n\nReturn ONLY a valid JSON object matching the schema above.`,
182
+ },
183
+ ];
184
+
185
+ const output = await generator(messages, {
186
+ max_new_tokens: config.maxNewTokens,
187
+ temperature: 0,
188
+ do_sample: false,
189
+ repetition_penalty: 1.05,
190
+ enable_thinking: false,
191
+ });
192
+
193
+ const raw = output[0].generated_text.at(-1).content;
194
+ return normalizeExtracted(parseJSONSafe(raw));
195
+ }
196
+
197
+ /**
198
+ * Call LLM with a custom prompt
199
+ */
200
+ export async function queryLLM(systemPrompt, userPrompt, options = {}) {
201
+ if (!generator) throw new Error("LLM not initialized. Call initLLM() first.");
202
+
203
+ const config = MODEL_PRESETS[currentPreset];
204
+ const { noThink = true } = options;
205
+
206
+ const finalSystemPrompt = noThink
207
+ ? `${systemPrompt} /no_think`
208
+ : systemPrompt;
209
+
210
+ const messages = [
211
+ { role: "system", content: finalSystemPrompt },
212
+ { role: "user", content: userPrompt },
213
+ ];
214
+
215
+ const output = await generator(messages, {
216
+ max_new_tokens: config.maxNewTokens,
217
+ do_sample: false,
218
+ });
219
+
220
+ return output[0].generated_text.at(-1).content;
221
+ }
222
+
223
+ /**
224
+ * Normalize LLM output structure
225
+ * - If schema definition leaked into output -> extract properties content
226
+ * - If wrapped in array -> extract first element
227
+ */
228
+ function normalizeExtracted(parsed) {
229
+ if (!parsed || parsed._error) return parsed;
230
+
231
+ // Schema leak: { type: "object", properties: { title: "...", ... } }
232
+ if (parsed.type && parsed.properties && typeof parsed.properties === "object") {
233
+ const inner = parsed.properties;
234
+ // If property values are actual data (not schema defs like { type: "string" })
235
+ const firstVal = Object.values(inner)[0];
236
+ if (firstVal && typeof firstVal !== "object") {
237
+ return inner;
238
+ }
239
+ }
240
+
241
+ // Array wrap: [{ title: "...", ... }] -> { title: "...", ... }
242
+ if (Array.isArray(parsed) && parsed.length === 1) {
243
+ return parsed[0];
244
+ }
245
+
246
+ return parsed;
247
+ }
248
+
249
+ /**
250
+ * Safely parse a JSON string from LLM output
251
+ */
252
+ function parseJSONSafe(raw) {
253
+ // Remove think tags
254
+ let cleaned = raw.replace(/<think>[\s\S]*?<\/think>/g, "");
255
+ cleaned = cleaned.replace(/<think>[\s\S]*/g, "");
256
+ // Remove code fences
257
+ cleaned = cleaned.replace(/```(?:json)?\s*/gi, "").replace(/```/g, "");
258
+ cleaned = cleaned.trim();
259
+
260
+ try {
261
+ return JSON.parse(cleaned);
262
+ } catch {
263
+ // Find the last valid JSON object
264
+ const matches = [...cleaned.matchAll(/\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}/g)];
265
+ for (let i = matches.length - 1; i >= 0; i--) {
266
+ try {
267
+ return JSON.parse(matches[i][0]);
268
+ } catch { continue; }
269
+ }
270
+ // Try broadest match
271
+ const broad = cleaned.match(/\{[\s\S]*\}/);
272
+ if (broad) {
273
+ try {
274
+ return JSON.parse(broad[0]);
275
+ } catch {}
276
+ }
277
+ return { _raw: raw, _error: "Failed to parse JSON" };
278
+ }
279
+ }
280
+
281
+ export { MODEL_PRESETS };
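
A usage sketch for llm.mjs (the schema and Markdown are illustrative; the first initLLM call downloads the ONNX weights into ./.model-cache):

// Structured extraction with the "light" (Qwen3-0.6B) preset.
import { initLLM, extractStructured } from "./src/llm.mjs";

const schema = {
  type: "object",
  properties: {
    title: { type: "string" },
    tags: { type: "array", items: { type: "string" } },
  },
};

await initLLM("light", "wasm"); // use "wasm" where WebGPU is unavailable
const data = await extractStructured(
  "# Edge computing\n\nLow-latency compute at the network edge.",
  schema
);
console.log(data); // e.g. { title: "Edge computing", tags: [...] }
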
package/src/pipeline.mjs ADDED
@@ -0,0 +1,165 @@
+ // src/pipeline.mjs
+ // Scraping -> Markdown conversion -> LLM structured extraction pipeline
+
+ import {
+   fetchPage,
+   fetchPages,
+   launchBrowser,
+   closeBrowser,
+ } from "./scraper.mjs";
+ import { htmlToMarkdown, cleanMarkdown, truncateForLLM } from "./html2md.mjs";
+ import { initLLM, extractStructured, queryLLM } from "./llm.mjs";
+ import { tryStructuredExtract } from "./structured-extract.mjs";
+
+ /**
+  * Default extraction schema: a flat field -> description map, passed to the LLM prompt as-is
+  */
+ const DEFAULT_SCHEMA = {
+   title: "Page title (string)",
+   description: "Page summary in 1-2 sentences (string)",
+   main_topics: "List of main topics (array of strings)",
+   key_facts: "Important facts and data points (array of strings)",
+   entities: "People, organizations, and product names mentioned (array of strings)",
+   language: "Content language (string, e.g. 'ja', 'en')",
+ };
+
+ /**
+  * Scrape a single URL and extract structured data
+  * @param {string} url
+  * @param {object} options
+  */
+ export async function scrapeAndExtract(url, options = {}) {
+   const {
+     schema = DEFAULT_SCHEMA,
+     preset = "balanced",
+     device = "webgpu",
+     maxTokens = 2048,
+     selector,
+     browserOptions = {},
+     scrapeOptions = {},
+   } = options;
+
+   await launchBrowser(browserOptions);
+
+   const { html, url: finalUrl, status } = await fetchPage(url, scrapeOptions);
+   if (!html) return { url, error: "Failed to fetch page", status };
+
+   // Try structured data (JSON-LD / OG) first
+   const structResult = tryStructuredExtract(html, schema);
+   if (structResult) {
+     return {
+       url: finalUrl, status,
+       extraction_source: structResult.source,
+       coverage: structResult.coverage,
+       extracted: structResult.extracted,
+     };
+   }
+
+   // Fallback: Markdown conversion -> cleanMarkdown -> truncate -> LLM
+   const { markdown, title, excerpt } = htmlToMarkdown(html, finalUrl, { selector });
+   const llmInput = truncateForLLM(cleanMarkdown(markdown), maxTokens);
+   const metadata = {
+     url: finalUrl, status, title, excerpt,
+     markdown_length: markdown.length,
+     cleaned_length: llmInput.length,
+     extraction_source: "llm",
+   };
+
+   // LLM structured extraction
+   await initLLM(preset, device);
+   const extracted = await extractStructured(llmInput, schema);
+
+   return { ...metadata, extracted };
+ }
+
+ /**
+  * Batch process multiple URLs
+  */
+ export async function batchScrapeAndExtract(urls, options = {}) {
+   const {
+     schema = DEFAULT_SCHEMA,
+     preset = "balanced",
+     device = "webgpu",
+     maxTokens = 2048,
+     concurrency = 3,
+     browserOptions = {},
+     scrapeOptions = {},
+   } = options;
+
+   await initLLM(preset, device);
+   await launchBrowser(browserOptions);
+
+   const results = [];
+   const pages = await fetchPages(urls, scrapeOptions, concurrency);
+
+   for (const page of pages) {
+     if (page.error || !page.html) {
+       results.push({ url: page.url, error: page.error || "Empty response", extracted: null });
+       continue;
+     }
+
+     // Try structured data first
+     const structResult = tryStructuredExtract(page.html, schema);
+     if (structResult) {
+       results.push({
+         url: page.url, status: page.status,
+         extraction_source: structResult.source,
+         coverage: structResult.coverage,
+         extracted: structResult.extracted,
+       });
+       continue;
+     }
+
+     // Fallback: Markdown -> LLM
+     const { markdown, title } = htmlToMarkdown(page.html, page.url);
+     const truncated = truncateForLLM(cleanMarkdown(markdown), maxTokens);
+
+     try {
+       const extracted = await extractStructured(truncated, schema);
+       results.push({ url: page.url, status: page.status, title, extraction_source: "llm", extracted });
+     } catch (error) {
+       results.push({ url: page.url, error: error.message, extracted: null });
+     }
+   }
+
+   return results;
+ }
+
+ /**
+  * Custom prompt query against page content
+  */
+ export async function scrapeAndQuery(url, prompt, options = {}) {
+   const {
+     preset = "balanced",
+     device = "webgpu",
+     maxTokens = 2048,
+     browserOptions = {},
+     scrapeOptions = {},
+   } = options;
+
+   await launchBrowser(browserOptions);
+
+   const { html, url: finalUrl } = await fetchPage(url, scrapeOptions);
+   if (!html) return { url, error: "Failed to fetch" };
+
+   const { markdown } = htmlToMarkdown(html, finalUrl);
+   const content = truncateForLLM(cleanMarkdown(markdown), maxTokens);
+
+   await initLLM(preset, device);
+
+   const answer = await queryLLM(
+     "You are a helpful assistant analyzing web content. Answer based only on the provided content.",
+     `${prompt}\n\n---\nContent:\n${content}`
+   );
+
+   return { url, answer };
+ }
+
+ /**
+  * Cleanup (close browser)
+  */
+ export async function cleanup() {
+   await closeBrowser();
+ }
+
+ export { DEFAULT_SCHEMA };
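
An end-to-end sketch for pipeline.mjs (the URL and option values are illustrative; scraper.mjs and structured-extract.mjs are not shown in this diff):

// Scrape one page, extract structured data, then shut the headless browser down.
import { scrapeAndExtract, cleanup, DEFAULT_SCHEMA } from "./src/pipeline.mjs";

const result = await scrapeAndExtract("https://example.com/article", {
  schema: DEFAULT_SCHEMA,
  preset: "light",
  device: "wasm",
  maxTokens: 1024,
});
console.log(result.extraction_source); // "llm" unless JSON-LD/OG data already covered the schema
console.log(result.extracted);
await cleanup();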