edgecrawl 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +260 -0
- package/index.mjs +22 -0
- package/package.json +63 -0
- package/schemas/article.json +15 -0
- package/schemas/news.json +19 -0
- package/schemas/product.json +20 -0
- package/src/cli.mjs +226 -0
- package/src/html2md.mjs +268 -0
- package/src/llm.mjs +281 -0
- package/src/pipeline.mjs +165 -0
- package/src/scraper.mjs +218 -0
- package/src/structured-extract.mjs +226 -0
package/src/html2md.mjs
ADDED
@@ -0,0 +1,268 @@
// src/html2md.mjs
// HTML -> clean Markdown conversion
// CSS selector-based HTML cleaning + node-html-markdown

import { JSDOM } from "jsdom";
import { NodeHtmlMarkdown } from "node-html-markdown";

// ---- CSS selector-based HTML cleaning ----

// Elements to remove: nav, footer, sidebar, ads, modals, social, etc.
const REMOVE_SELECTORS = [
  // Nav, header, footer
  "header", "footer", "nav",
  ".header", ".top", ".navbar", "#header",
  ".footer", ".bottom", "#footer",
  // Sidebar
  "aside",
  ".sidebar", ".side", ".aside", "#sidebar",
  // Ads
  ".ad", ".ads", ".advert", "#ad",
  "[class*='ad-']", "[class*='ads-']", "[id*='ad-']",
  // Modals, popups
  ".modal", ".popup", "#modal", ".overlay",
  // Social, share
  ".social", ".social-media", ".social-links", "#social",
  ".share", "#share",
  // Nav, menu, breadcrumbs
  ".menu", ".navigation", "#nav",
  ".breadcrumbs", "#breadcrumbs", ".breadcrumb",
  // Widgets, cookie banners
  ".widget", "#widget",
  ".cookie", "#cookie", ".cookie-banner",
  // Language selectors
  ".lang-selector", ".language", "#language-selector",
];

// Protected elements: do not remove if they contain or are contained by these
const PROTECT_SELECTORS = [
  "#main", "#content", "main", "article",
  "[role='main']", "[role='article']",
];

/**
 * Remove unwanted noise elements from HTML using CSS selectors
 * @param {Document} document - JSDOM document
 */
function removeUnwantedElements(document) {
  // 0. Remove <head> (title and meta description are extracted beforehand)
  const head = document.querySelector("head");
  if (head) head.remove();

  // 1. Remove script, style, noscript, iframe, svg, canvas, template
  for (const tag of ["script", "style", "noscript", "iframe", "svg", "canvas", "template"]) {
    for (const el of document.querySelectorAll(tag)) {
      el.remove();
    }
  }

  // 2. Strip style attributes, event handlers, and data-* attributes
  for (const el of document.querySelectorAll("*")) {
    const attrs = Array.from(el.attributes || []);
    for (const attr of attrs) {
      if (
        attr.name === "style" ||
        attr.name.startsWith("on") ||
        attr.name.startsWith("data-")
      ) {
        el.removeAttribute(attr.name);
      }
    }
  }

  // 3. Remove hidden elements
  for (const el of document.querySelectorAll("[hidden], [aria-hidden='true']")) {
    el.remove();
  }

  // 4. Collect protected elements
  const protectedElements = new Set();
  for (const sel of PROTECT_SELECTORS) {
    try {
      for (const el of document.querySelectorAll(sel)) {
        protectedElements.add(el);
      }
    } catch { /* invalid selector */ }
  }

  function isProtected(el) {
    for (const p of protectedElements) {
      if (p.contains(el) || el.contains(p)) return true;
    }
    return false;
  }

  // 5. Remove noise elements by CSS selector
  for (const sel of REMOVE_SELECTORS) {
    try {
      for (const el of document.querySelectorAll(sel)) {
        if (!isProtected(el)) el.remove();
      }
    } catch { /* invalid selector */ }
  }

  // 6. Remove empty elements
  for (const tag of ["div", "span", "p", "section"]) {
    for (const el of document.querySelectorAll(tag)) {
      if (!el.textContent.trim() && !el.querySelector("img, video")) {
        el.remove();
      }
    }
  }
}

/**
 * Convert relative URLs to absolute URLs
 * @param {Document} document
 * @param {string} baseUrl
 */
function absolutifyURLs(document, baseUrl) {
  try {
    const base = new URL(baseUrl);

    for (const img of document.querySelectorAll("img[src]")) {
      try {
        img.setAttribute("src", new URL(img.getAttribute("src"), base).href);
      } catch { /* invalid URL */ }
    }

    for (const a of document.querySelectorAll("a[href]")) {
      try {
        const href = a.getAttribute("href");
        if (href && !href.startsWith("#") && !href.startsWith("javascript:")) {
          a.setAttribute("href", new URL(href, base).href);
        }
      } catch { /* invalid URL */ }
    }
  } catch { /* invalid baseUrl */ }
}

/**
 * Resolve img srcset to the largest image URL
 * @param {Document} document
 * @param {string} baseUrl
 */
function resolveSrcset(document, baseUrl) {
  for (const img of document.querySelectorAll("img[srcset]")) {
    try {
      const srcset = img.getAttribute("srcset");
      const candidates = srcset.split(",").map((s) => {
        const parts = s.trim().split(/\s+/);
        const url = parts[0];
        const descriptor = parts[1] || "1x";
        const size = parseFloat(descriptor) || 1;
        return { url, size };
      });
      candidates.sort((a, b) => b.size - a.size);
      if (candidates.length > 0) {
        const resolved = new URL(candidates[0].url, baseUrl).href;
        img.setAttribute("src", resolved);
      }
      img.removeAttribute("srcset");
    } catch { /* parse error */ }
  }
}

/**
 * Main HTML-to-Markdown conversion
 * CSS selector-based HTML cleaning -> node-html-markdown
 * @param {string} html - Raw HTML
 * @param {string} url - Source URL
 * @param {object} options
 * @param {string} options.selector - CSS selector to narrow target content
 * @returns {{ markdown: string, title: string, excerpt: string }}
 */
export function htmlToMarkdown(html, url = "https://example.com", options = {}) {
  const { selector } = options;
  const dom = new JSDOM(html, { url });
  const document = dom.window.document;

  // Extract title and meta description first
  const title = document.title || "";
  const metaDesc = document.querySelector('meta[name="description"]');
  const excerpt = metaDesc ? metaDesc.getAttribute("content") || "" : "";

  // If selector specified, replace body with target element
  if (selector) {
    const target = document.querySelector(selector);
    if (target) {
      document.body.innerHTML = target.outerHTML;
    }
  }

  // HTML cleaning
  removeUnwantedElements(document);
  absolutifyURLs(document, url);
  resolveSrcset(document, url);

  // Convert cleaned HTML to Markdown via node-html-markdown
  const bodyHtml = document.body?.innerHTML || "";
  if (!bodyHtml.trim()) {
    return { markdown: "", title, excerpt };
  }

  const markdown = NodeHtmlMarkdown.translate(bodyHtml, {
    maxConsecutiveNewlines: 2,
  }).trim();

  return { markdown, title, excerpt };
}

/**
 * Markdown post-processing: generic noise reduction for LLM input
 * Algorithm-based, no site-specific rules
 * @param {string} md - Raw Markdown
 * @param {object} options
 * @param {boolean} options.removeImages - Remove image references (default: false)
 * @returns {string} Cleaned Markdown
 */
export function cleanMarkdown(md, options = {}) {
  const { removeImages = false } = options;

  let result = md;

  // 1. Remove lines containing javascript: links
  result = result.replace(/^.*\[.*\]\(javascript:.*\).*$/gm, "");

  // 2. Remove empty links and alt-less images
  result = result
    .replace(/\[\s*\]\([^)]*\)/g, "") // [](url) or [ ](url)
    .replace(/!\[\]\([^)]*\)/g, ""); // ![](url) alt-less image

  // 3. Hash-only links -> keep text only
  result = result.replace(/\[([^\]]+)\]\(#[^)]*\)/g, "$1");

  // 4. Overly long URL links -> keep text only
  result = result.replace(/\[([^\]]+)\]\([^)]{150,}\)/g, "$1");

  // 5. Normalize extra whitespace in link text
  result = result.replace(/\[([^\]]*)\]/g, (_, t) => `[${t.replace(/\s+/g, " ").trim()}]`);

  // 6. Remove lines containing template variables (${var}, %var%)
  result = result.replace(/^.*\$\{[^}]+\}.*$/gm, "");
  result = result.replace(/^\s*%[a-zA-Z]\w*%\s*$/gm, "");

  // 7. Remove JSON-like lines (escaped included: {" or {\" patterns)
  result = result.replace(/^\s*\{\\?".*[\w_]+\\?"\s*:.*\}\s*$/gm, "");

  // 8. Remove images (optional)
  if (removeImages) {
    result = result.replace(/!\[[^\]]*\]\([^)]*\)/g, "");
  }

  // 9. Collapse consecutive blank lines
  return result
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}

/**
 * Truncate Markdown to fit within LLM token limits
 * Rough estimate: 1 token ~ 3 characters
 */
export function truncateForLLM(markdown, maxTokens = 2048) {
  const approxMaxChars = maxTokens * 3;
  if (markdown.length <= approxMaxChars) return markdown;

  return markdown.slice(0, approxMaxChars) + "\n\n[... truncated]";
}
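For reference, a minimal usage sketch of this module (not part of the package; the relative import path, sample HTML, and expected output are assumptions for illustration):

// example.mjs - run from a checkout of the package root (hypothetical)
import { htmlToMarkdown, cleanMarkdown, truncateForLLM } from "./src/html2md.mjs";

const html = `<html><head><title>Demo</title></head>
<body><nav>skip me</nav><main><h1>Hello</h1><p>Body text.</p></main></body></html>`;

// Convert, post-process for LLM input, then cap the length
const { markdown, title } = htmlToMarkdown(html, "https://example.com/demo");
const llmInput = truncateForLLM(cleanMarkdown(markdown), 2048);

console.log(title);    // "Demo"
console.log(llmInput); // roughly "# Hello\n\nBody text." (the <nav> is stripped)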
package/src/llm.mjs
ADDED
@@ -0,0 +1,281 @@
// src/llm.mjs
// ONNX LLM wrapper using Transformers.js v4
// Runs on Node.js with WebGPU or WASM

import { pipeline, env } from "@huggingface/transformers";

const MODEL_PRESETS = {
  // Granite: 350M - IBM Granite 4.0
  granite: {
    model: "onnx-community/granite-4.0-350m-ONNX-web",
    dtype: "fp16",
    maxNewTokens: 32768,
    type: "granite",
  },
  // NuExtract: 500M - specialized for structured extraction
  nuextract: {
    model: "onnx-community/NuExtract-1.5-tiny-ONNX",
    dtype: "q4",
    maxNewTokens: 32768,
    type: "nuextract",
  },
  // Qwen2.5: 0.5B
  qwen25: {
    model: "onnx-community/Qwen2.5-0.5B-Instruct-ONNX-MHA",
    dtype: "q4f16",
    maxNewTokens: 32768,
    type: "qwen25",
  },
  // Light: 0.6B - Qwen3
  light: {
    model: "onnx-community/Qwen3-0.6B-ONNX",
    dtype: "q4f16",
    maxNewTokens: 32768,
    type: "qwen3",
  },
  // Balanced: 1.7B - Qwen3
  balanced: {
    model: "onnx-community/Qwen3-1.7B-ONNX",
    dtype: "q4f16",
    maxNewTokens: 32768,
    type: "qwen3",
  },
  // Quality: 4B - Qwen3
  quality: {
    model: "onnx-community/Qwen3-4B-ONNX",
    dtype: "q4f16",
    maxNewTokens: 32768,
    type: "qwen3",
  },
};

let generator = null;
let currentPreset = null;

/**
 * Initialize the LLM
 */
export async function initLLM(preset = "balanced", device = "webgpu") {
  if (generator && currentPreset === preset) return;

  const config = MODEL_PRESETS[preset];
  if (!config) throw new Error(`Unknown preset: ${preset}. Available: ${Object.keys(MODEL_PRESETS).join(", ")}`);

  console.log(`Loading model: ${config.model} (${config.dtype}, ${device})...`);

  env.cacheDir = "./.model-cache";

  generator = await pipeline("text-generation", config.model, {
    dtype: config.dtype,
    device,
  });
  currentPreset = preset;

  console.log("Model loaded.");
}

/**
 * Generate a template object from a JSON Schema
 * NuExtract expects an empty output template, not a JSON Schema
 */
function schemaToTemplate(schema) {
  if (!schema.properties) return schema;

  const template = {};
  for (const [key, def] of Object.entries(schema.properties)) {
    if (def.type === "array") {
      if (def.items?.properties) {
        const item = {};
        for (const k of Object.keys(def.items.properties)) {
          item[k] = "";
        }
        template[key] = [item];
      } else {
        template[key] = [];
      }
    } else if (def.type === "object" && def.properties) {
      template[key] = schemaToTemplate(def);
    } else {
      template[key] = "";
    }
  }
  return template;
}

/**
 * Extract structured data (JSON) from Markdown content
 */
export async function extractStructured(markdown, schema) {
  if (!generator) throw new Error("LLM not initialized. Call initLLM() first.");

  const config = MODEL_PRESETS[currentPreset];

  if (config.type === "nuextract") {
    return extractWithNuExtract(markdown, schema, config);
  } else if (config.type === "qwen25" || config.type === "granite") {
    return extractWithQwen25(markdown, schema, config);
  } else {
    return extractWithQwen3(markdown, schema, config);
  }
}

/**
 * NuExtract-specific extraction
 */
async function extractWithNuExtract(markdown, schema, config) {
  const template = schemaToTemplate(schema);
  const templateStr = JSON.stringify(template, null, 2);

  const prompt = `<|input|>\n### Template:\n${templateStr}\n### Text:\n${markdown}\n\n<|output|>\n`;

  const output = await generator(prompt, {
    max_new_tokens: config.maxNewTokens,
    do_sample: false,
  });

  const raw = output[0].generated_text.slice(prompt.length).trim();
  return parseJSONSafe(raw);
}

/**
 * Qwen2.5/Granite extraction (no thinking mode)
 */
async function extractWithQwen25(markdown, schema, config) {
  const schemaStr = JSON.stringify(schema, null, 2);

  const messages = [
    {
      role: "system",
      content: `You are a data extraction assistant. Extract structured information from the given content and return ONLY valid JSON matching the provided JSON Schema. No explanation, no markdown fences, just raw JSON.`,
    },
    {
      role: "user",
      content: `Extract data from the content below according to this JSON Schema:\n\n${schemaStr}\n\n---\nContent:\n${markdown}\n---\n\nReturn ONLY a valid JSON object matching the schema above.`,
    },
  ];

  const output = await generator(messages, {
    max_new_tokens: config.maxNewTokens,
    temperature: 0.7,
    top_p: 0.8,
    top_k: 20,
    do_sample: true,
    repetition_penalty: 1.2,
  });

  const raw = output[0].generated_text.at(-1).content;
  return normalizeExtracted(parseJSONSafe(raw));
}

/**
 * Qwen3-specific extraction
 */
async function extractWithQwen3(markdown, schema, config) {
  const schemaStr = JSON.stringify(schema, null, 2);

  const messages = [
    {
      role: "system",
      content: `You are a data extraction assistant. Extract structured information from the given content and return ONLY valid JSON matching the provided JSON Schema. No explanation, no markdown fences, no thinking, just raw JSON.`,
    },
    {
      role: "user",
      content: `Extract data from the content below according to this JSON Schema:\n\n${schemaStr}\n\n---\nContent:\n${markdown}\n---\n\nReturn ONLY a valid JSON object matching the schema above.`,
    },
  ];

  const output = await generator(messages, {
    max_new_tokens: config.maxNewTokens,
    temperature: 0,
    do_sample: false,
    repetition_penalty: 1.05,
    enable_thinking: false,
  });

  const raw = output[0].generated_text.at(-1).content;
  return normalizeExtracted(parseJSONSafe(raw));
}

/**
 * Call LLM with a custom prompt
 */
export async function queryLLM(systemPrompt, userPrompt, options = {}) {
  if (!generator) throw new Error("LLM not initialized. Call initLLM() first.");

  const config = MODEL_PRESETS[currentPreset];
  const { noThink = true } = options;

  const finalSystemPrompt = noThink
    ? `${systemPrompt} /no_think`
    : systemPrompt;

  const messages = [
    { role: "system", content: finalSystemPrompt },
    { role: "user", content: userPrompt },
  ];

  const output = await generator(messages, {
    max_new_tokens: config.maxNewTokens,
    do_sample: false,
  });

  return output[0].generated_text.at(-1).content;
}

/**
 * Normalize LLM output structure
 * - If schema definition leaked into output -> extract properties content
 * - If wrapped in array -> extract first element
 */
function normalizeExtracted(parsed) {
  if (!parsed || parsed._error) return parsed;

  // Schema leak: { type: "object", properties: { title: "...", ... } }
  if (parsed.type && parsed.properties && typeof parsed.properties === "object") {
    const inner = parsed.properties;
    // If property values are actual data (not schema defs like { type: "string" })
    const firstVal = Object.values(inner)[0];
    if (firstVal && typeof firstVal !== "object") {
      return inner;
    }
  }

  // Array wrap: [{ title: "...", ... }] -> { title: "...", ... }
  if (Array.isArray(parsed) && parsed.length === 1) {
    return parsed[0];
  }

  return parsed;
}

/**
 * Safely parse a JSON string from LLM output
 */
function parseJSONSafe(raw) {
  // Remove think tags
  let cleaned = raw.replace(/<think>[\s\S]*?<\/think>/g, "");
  cleaned = cleaned.replace(/<think>[\s\S]*/g, "");
  // Remove code fences
  cleaned = cleaned.replace(/```(?:json)?\s*/gi, "").replace(/```/g, "");
  cleaned = cleaned.trim();

  try {
    return JSON.parse(cleaned);
  } catch {
    // Find the last valid JSON object
    const matches = [...cleaned.matchAll(/\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}/g)];
    for (let i = matches.length - 1; i >= 0; i--) {
      try {
        return JSON.parse(matches[i][0]);
      } catch { continue; }
    }
    // Try broadest match
    const broad = cleaned.match(/\{[\s\S]*\}/);
    if (broad) {
      try {
        return JSON.parse(broad[0]);
      } catch {}
    }
    return { _raw: raw, _error: "Failed to parse JSON" };
  }
}

export { MODEL_PRESETS };
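A minimal sketch of driving this wrapper directly (not part of the package; the relative import path and the model's exact output are assumptions, and the first call downloads model weights into ./.model-cache):

// extract-demo.mjs (hypothetical)
import { initLLM, extractStructured } from "./src/llm.mjs";

// "light" = Qwen3-0.6B; "wasm" avoids the WebGPU requirement at some speed cost
await initLLM("light", "wasm");

const schema = {
  type: "object",
  properties: {
    title: { type: "string" },
    price: { type: "string" },
  },
};

const result = await extractStructured("# Acme Widget\n\nThe Acme Widget costs $19.99.", schema);
console.log(result); // e.g. { title: "Acme Widget", price: "$19.99" } - model-dependent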
package/src/pipeline.mjs
ADDED
@@ -0,0 +1,165 @@
// src/pipeline.mjs
// Scraping -> Markdown conversion -> LLM structured extraction pipeline

import {
  fetchPage,
  fetchPages,
  launchBrowser,
  closeBrowser,
} from "./scraper.mjs";
import { htmlToMarkdown, cleanMarkdown, truncateForLLM } from "./html2md.mjs";
import { initLLM, extractStructured, queryLLM } from "./llm.mjs";
import { tryStructuredExtract } from "./structured-extract.mjs";

/**
 * Default schema definition
 */
const DEFAULT_SCHEMA = {
  title: "Page title (string)",
  description: "Page summary in 1-2 sentences (string)",
  main_topics: "List of main topics (array of strings)",
  key_facts: "Important facts and data points (array of strings)",
  entities: "People, organizations, and product names mentioned (array of strings)",
  language: "Content language (string, e.g. 'ja', 'en')",
};

/**
 * Scrape a single URL and extract structured data
 * @param {string} url
 * @param {object} options
 */
export async function scrapeAndExtract(url, options = {}) {
  const {
    schema = DEFAULT_SCHEMA,
    preset = "balanced",
    device = "webgpu",
    maxTokens = 2048,
    selector,
    browserOptions = {},
    scrapeOptions = {},
  } = options;

  await launchBrowser(browserOptions);

  const { html, url: finalUrl, status } = await fetchPage(url, scrapeOptions);
  if (!html) return { url, error: "Failed to fetch page", status };

  // Try structured data (JSON-LD / OG) first
  const structResult = tryStructuredExtract(html, schema);
  if (structResult) {
    return {
      url: finalUrl, status,
      extraction_source: structResult.source,
      coverage: structResult.coverage,
      extracted: structResult.extracted,
    };
  }

  // Fallback: Markdown conversion -> cleanMarkdown -> truncateForLLM -> LLM
  const { markdown, title, excerpt } = htmlToMarkdown(html, finalUrl, { selector });
  const llmInput = truncateForLLM(cleanMarkdown(markdown), maxTokens);
  const metadata = {
    url: finalUrl, status, title, excerpt,
    markdown_length: markdown.length,
    cleaned_length: llmInput.length,
    extraction_source: "llm",
  };

  // LLM structured extraction
  await initLLM(preset, device);
  const extracted = await extractStructured(llmInput, schema);

  return { ...metadata, extracted };
}

/**
 * Batch process multiple URLs
 */
export async function batchScrapeAndExtract(urls, options = {}) {
  const {
    schema = DEFAULT_SCHEMA,
    preset = "balanced",
    device = "webgpu",
    maxTokens = 2048,
    concurrency = 3,
    browserOptions = {},
    scrapeOptions = {},
  } = options;

  await initLLM(preset, device);
  await launchBrowser(browserOptions);

  const results = [];
  const pages = await fetchPages(urls, scrapeOptions, concurrency);

  for (const page of pages) {
    if (page.error || !page.html) {
      results.push({ url: page.url, error: page.error || "Empty response", extracted: null });
      continue;
    }

    // Try structured data first
    const structResult = tryStructuredExtract(page.html, schema);
    if (structResult) {
      results.push({
        url: page.url, status: page.status,
        extraction_source: structResult.source,
        coverage: structResult.coverage,
        extracted: structResult.extracted,
      });
      continue;
    }

    // Fallback: Markdown -> LLM
    const { markdown, title } = htmlToMarkdown(page.html, page.url);
    const truncated = truncateForLLM(cleanMarkdown(markdown), maxTokens);

    try {
      const extracted = await extractStructured(truncated, schema);
      results.push({ url: page.url, status: page.status, title, extraction_source: "llm", extracted });
    } catch (error) {
      results.push({ url: page.url, error: error.message, extracted: null });
    }
  }

  return results;
}

/**
 * Custom prompt query against page content
 */
export async function scrapeAndQuery(url, prompt, options = {}) {
  const {
    preset = "balanced",
    device = "webgpu",
    maxTokens = 2048,
    browserOptions = {},
    scrapeOptions = {},
  } = options;

  await launchBrowser(browserOptions);

  const { html, url: finalUrl } = await fetchPage(url, scrapeOptions);
  if (!html) return { url, error: "Failed to fetch" };

  const { markdown } = htmlToMarkdown(html, finalUrl);
  const content = truncateForLLM(cleanMarkdown(markdown), maxTokens);

  await initLLM(preset, device);

  const answer = await queryLLM(
    "You are a helpful assistant analyzing web content. Answer based only on the provided content.",
    `${prompt}\n\n---\nContent:\n${content}`
  );

  return { url, answer };
}

/**
 * Cleanup (close browser)
 */
export async function cleanup() {
  await closeBrowser();
}

export { DEFAULT_SCHEMA };
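Putting the pipeline together, a minimal end-to-end sketch (not part of the package; the relative import path and target URL are illustrative):

// run.mjs (hypothetical)
import { scrapeAndExtract, cleanup, DEFAULT_SCHEMA } from "./src/pipeline.mjs";

try {
  const result = await scrapeAndExtract("https://example.com", {
    schema: DEFAULT_SCHEMA, // or any custom field -> description map
    preset: "light",        // smaller model for a quick first run
    device: "wasm",         // portable fallback when WebGPU is unavailable
    maxTokens: 1024,
  });
  console.log(JSON.stringify(result, null, 2));
} finally {
  await cleanup(); // always close the headless browser
}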