edgecrawl 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/index.mjs +0 -3
- package/package.json +2 -3
- package/src/llm.mjs +34 -9
- package/src/pipeline.mjs +0 -26
- package/src/structured-extract.mjs +0 -226
package/README.md
CHANGED
|
@@ -7,7 +7,6 @@ Local AI-powered web scraper. Extract structured JSON from any website using on-
|
|
|
7
7
|
- **100% Local AI** — Runs Qwen3 ONNX models on your machine via Transformers.js v4 (WebGPU/WASM)
|
|
8
8
|
- **Zero API Keys** — No OpenAI, no Anthropic, no cloud bills. Everything runs on-device
|
|
9
9
|
- **Structured JSON Output** — Define a schema, get clean JSON back
|
|
10
|
-
- **Smart Extraction** — Tries JSON-LD/Open Graph first, falls back to LLM only when needed
|
|
11
10
|
- **CLI + Library** — Use from the command line or import into your Node.js app
|
|
12
11
|
|
|
13
12
|
## Architecture
|
package/index.mjs
CHANGED
|
@@ -15,8 +15,5 @@ export { initLLM, extractStructured, queryLLM, MODEL_PRESETS } from "./src/llm.m
|
|
|
15
15
|
// HTML-to-Markdown conversion
|
|
16
16
|
export { htmlToMarkdown, cleanMarkdown, truncateForLLM } from "./src/html2md.mjs";
|
|
17
17
|
|
|
18
|
-
// Structured data extraction (JSON-LD / Open Graph)
|
|
19
|
-
export { tryStructuredExtract } from "./src/structured-extract.mjs";
|
|
20
|
-
|
|
21
18
|
// Browser / scraping primitives
|
|
22
19
|
export { launchBrowser, fetchPage, fetchPages, closeBrowser } from "./src/scraper.mjs";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "edgecrawl",
|
|
3
|
-
"version": "0.3.0",
|
|
3
|
+
"version": "0.3.1",
|
|
4
4
|
"description": "Local AI-powered web scraper. Extract structured JSON from any website using on-device ONNX LLMs. No API keys, no cloud, no Python.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./index.mjs",
|
|
@@ -8,8 +8,7 @@
|
|
|
8
8
|
".": "./index.mjs",
|
|
9
9
|
"./scraper": "./src/scraper.mjs",
|
|
10
10
|
"./html2md": "./src/html2md.mjs",
|
|
11
|
-
"./llm": "./src/llm.mjs"
|
|
12
|
-
"./structured-extract": "./src/structured-extract.mjs"
|
|
11
|
+
"./llm": "./src/llm.mjs"
|
|
13
12
|
},
|
|
14
13
|
"bin": {
|
|
15
14
|
"edgecrawl": "./src/cli.mjs"
|
package/src/llm.mjs
CHANGED
|
@@ -250,7 +250,7 @@ function normalizeExtracted(parsed) {
|
|
|
250
250
|
* Safely parse a JSON string from LLM output
|
|
251
251
|
*/
|
|
252
252
|
function parseJSONSafe(raw) {
|
|
253
|
-
// Remove think tags
|
|
253
|
+
// Remove think tags (closed and unclosed)
|
|
254
254
|
let cleaned = raw.replace(/<think>[\s\S]*?<\/think>/g, "");
|
|
255
255
|
cleaned = cleaned.replace(/<think>[\s\S]*/g, "");
|
|
256
256
|
// Remove code fences
|
|
@@ -260,20 +260,45 @@ function parseJSONSafe(raw) {
|
|
|
260
260
|
try {
|
|
261
261
|
return JSON.parse(cleaned);
|
|
262
262
|
} catch {
|
|
263
|
-
//
|
|
263
|
+
// Try broadest { ... } match first (handles nested objects/arrays)
|
|
264
|
+
const broad = cleaned.match(/\{[\s\S]*\}/);
|
|
265
|
+
if (broad) {
|
|
266
|
+
try {
|
|
267
|
+
return JSON.parse(broad[0]);
|
|
268
|
+
} catch {
|
|
269
|
+
// Try fixing common LLM JSON issues
|
|
270
|
+
try {
|
|
271
|
+
let fixed = broad[0];
|
|
272
|
+
fixed = fixed.replace(/,\s*([}\]])/g, "$1"); // trailing commas
|
|
273
|
+
// Fix bracket mismatches: ] closed with } or vice versa
|
|
274
|
+
const stack = [];
|
|
275
|
+
const chars = fixed.split("");
|
|
276
|
+
for (let i = 0; i < chars.length; i++) {
|
|
277
|
+
if (chars[i] === "{" || chars[i] === "[") {
|
|
278
|
+
stack.push(chars[i]);
|
|
279
|
+
} else if (chars[i] === "}") {
|
|
280
|
+
if (stack.length && stack[stack.length - 1] === "[") {
|
|
281
|
+
chars[i] = "]"; // fix: [ was opened, } should be ]
|
|
282
|
+
}
|
|
283
|
+
stack.pop();
|
|
284
|
+
} else if (chars[i] === "]") {
|
|
285
|
+
if (stack.length && stack[stack.length - 1] === "{") {
|
|
286
|
+
chars[i] = "}"; // fix: { was opened, ] should be }
|
|
287
|
+
}
|
|
288
|
+
stack.pop();
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
return JSON.parse(chars.join(""));
|
|
292
|
+
} catch {}
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
// Find the last valid JSON object (shallow)
|
|
264
296
|
const matches = [...cleaned.matchAll(/\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}/g)];
|
|
265
297
|
for (let i = matches.length - 1; i >= 0; i--) {
|
|
266
298
|
try {
|
|
267
299
|
return JSON.parse(matches[i][0]);
|
|
268
300
|
} catch { continue; }
|
|
269
301
|
}
|
|
270
|
-
// Try broadest match
|
|
271
|
-
const broad = cleaned.match(/\{[\s\S]*\}/);
|
|
272
|
-
if (broad) {
|
|
273
|
-
try {
|
|
274
|
-
return JSON.parse(broad[0]);
|
|
275
|
-
} catch {}
|
|
276
|
-
}
|
|
277
302
|
return { _raw: raw, _error: "Failed to parse JSON" };
|
|
278
303
|
}
|
|
279
304
|
}
|
package/src/pipeline.mjs
CHANGED
|
@@ -9,7 +9,6 @@ import {
|
|
|
9
9
|
} from "./scraper.mjs";
|
|
10
10
|
import { htmlToMarkdown, cleanMarkdown, truncateForLLM } from "./html2md.mjs";
|
|
11
11
|
import { initLLM, extractStructured, queryLLM } from "./llm.mjs";
|
|
12
|
-
import { tryStructuredExtract } from "./structured-extract.mjs";
|
|
13
12
|
|
|
14
13
|
/**
|
|
15
14
|
* Default schema definition
|
|
@@ -44,18 +43,6 @@ export async function scrapeAndExtract(url, options = {}) {
|
|
|
44
43
|
const { html, url: finalUrl, status } = await fetchPage(url, scrapeOptions);
|
|
45
44
|
if (!html) return { url, error: "Failed to fetch page", status };
|
|
46
45
|
|
|
47
|
-
// Try structured data (JSON-LD / OG) first
|
|
48
|
-
const structResult = tryStructuredExtract(html, schema);
|
|
49
|
-
if (structResult) {
|
|
50
|
-
return {
|
|
51
|
-
url: finalUrl, status,
|
|
52
|
-
extraction_source: structResult.source,
|
|
53
|
-
coverage: structResult.coverage,
|
|
54
|
-
extracted: structResult.extracted,
|
|
55
|
-
};
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
// Fallback: Markdown conversion -> cleanMarkdown -> LLM
|
|
59
46
|
const { markdown, title, excerpt } = htmlToMarkdown(html, finalUrl, { selector });
|
|
60
47
|
const llmInput = cleanMarkdown(markdown);
|
|
61
48
|
const metadata = {
|
|
@@ -98,19 +85,6 @@ export async function batchScrapeAndExtract(urls, options = {}) {
|
|
|
98
85
|
continue;
|
|
99
86
|
}
|
|
100
87
|
|
|
101
|
-
// Try structured data first
|
|
102
|
-
const structResult = tryStructuredExtract(page.html, schema);
|
|
103
|
-
if (structResult) {
|
|
104
|
-
results.push({
|
|
105
|
-
url: page.url, status: page.status,
|
|
106
|
-
extraction_source: structResult.source,
|
|
107
|
-
coverage: structResult.coverage,
|
|
108
|
-
extracted: structResult.extracted,
|
|
109
|
-
});
|
|
110
|
-
continue;
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
// Fallback: Markdown -> LLM
|
|
114
88
|
const { markdown, title } = htmlToMarkdown(page.html, page.url);
|
|
115
89
|
const truncated = truncateForLLM(cleanMarkdown(markdown), maxTokens);
|
|
116
90
|
|
|
@@ -1,226 +0,0 @@
|
|
|
1
|
-
// src/structured-extract.mjs
|
|
2
|
-
// Extract structured data (JSON-LD, Open Graph) from HTML
|
|
3
|
-
// If schema match coverage is sufficient, return result without LLM
|
|
4
|
-
|
|
5
|
-
import { JSDOM } from "jsdom";
|
|
6
|
-
|
|
7
|
-
/**
|
|
8
|
-
* Extract JSON-LD data from HTML
|
|
9
|
-
* @param {string} html - Raw HTML
|
|
10
|
-
* @returns {object[]} Array of JSON-LD objects
|
|
11
|
-
*/
|
|
12
|
-
function extractJsonLd(html) {
|
|
13
|
-
const dom = new JSDOM(html);
|
|
14
|
-
const scripts = dom.window.document.querySelectorAll('script[type="application/ld+json"]');
|
|
15
|
-
const results = [];
|
|
16
|
-
|
|
17
|
-
for (const script of scripts) {
|
|
18
|
-
try {
|
|
19
|
-
const data = JSON.parse(script.textContent);
|
|
20
|
-
// Expand @graph if present
|
|
21
|
-
if (data["@graph"] && Array.isArray(data["@graph"])) {
|
|
22
|
-
results.push(...data["@graph"]);
|
|
23
|
-
} else if (Array.isArray(data)) {
|
|
24
|
-
results.push(...data);
|
|
25
|
-
} else {
|
|
26
|
-
results.push(data);
|
|
27
|
-
}
|
|
28
|
-
} catch {
|
|
29
|
-
// Ignore JSON parse failures
|
|
30
|
-
}
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
return results;
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
/**
|
|
37
|
-
* Extract Open Graph meta tags from HTML
|
|
38
|
-
* @param {string} html - Raw HTML
|
|
39
|
-
* @returns {object} OG property dictionary { "og:title": "...", ... }
|
|
40
|
-
*/
|
|
41
|
-
function extractOpenGraph(html) {
|
|
42
|
-
const dom = new JSDOM(html);
|
|
43
|
-
const doc = dom.window.document;
|
|
44
|
-
const og = {};
|
|
45
|
-
|
|
46
|
-
// og:* meta tags
|
|
47
|
-
const ogMetas = doc.querySelectorAll('meta[property^="og:"]');
|
|
48
|
-
for (const meta of ogMetas) {
|
|
49
|
-
const prop = meta.getAttribute("property");
|
|
50
|
-
const content = meta.getAttribute("content");
|
|
51
|
-
if (prop && content) {
|
|
52
|
-
og[prop] = content;
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
// twitter:* meta tags
|
|
57
|
-
const twitterMetas = doc.querySelectorAll('meta[name^="twitter:"], meta[property^="twitter:"]');
|
|
58
|
-
for (const meta of twitterMetas) {
|
|
59
|
-
const prop = meta.getAttribute("name") || meta.getAttribute("property");
|
|
60
|
-
const content = meta.getAttribute("content");
|
|
61
|
-
if (prop && content) {
|
|
62
|
-
og[prop] = content;
|
|
63
|
-
}
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
// Basic meta tags
|
|
67
|
-
const descMeta = doc.querySelector('meta[name="description"]');
|
|
68
|
-
if (descMeta) og["meta:description"] = descMeta.getAttribute("content");
|
|
69
|
-
|
|
70
|
-
const titleEl = doc.querySelector("title");
|
|
71
|
-
if (titleEl) og["meta:title"] = titleEl.textContent.trim();
|
|
72
|
-
|
|
73
|
-
return og;
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
/**
|
|
77
|
-
* Flatten JSON-LD data into key-value pairs
|
|
78
|
-
* Nested objects become "parent.child" format
|
|
79
|
-
*/
|
|
80
|
-
function flattenObject(obj, prefix = "") {
|
|
81
|
-
const result = {};
|
|
82
|
-
|
|
83
|
-
for (const [key, value] of Object.entries(obj)) {
|
|
84
|
-
if (key.startsWith("@")) continue; // Skip @type, @context, etc.
|
|
85
|
-
|
|
86
|
-
const fullKey = prefix ? `${prefix}.${key}` : key;
|
|
87
|
-
|
|
88
|
-
if (value && typeof value === "object" && !Array.isArray(value)) {
|
|
89
|
-
Object.assign(result, flattenObject(value, fullKey));
|
|
90
|
-
} else {
|
|
91
|
-
result[fullKey] = value;
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
return result;
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
/**
|
|
99
|
-
* Match structured data against a schema
|
|
100
|
-
* For each schema key, find a corresponding value in structured data
|
|
101
|
-
* @param {object} structuredData - Flattened structured data
|
|
102
|
-
* @param {object} schema - User-defined schema
|
|
103
|
-
* @returns {{ matched: object, coverage: number }} Match results and coverage ratio
|
|
104
|
-
*/
|
|
105
|
-
function matchToSchema(structuredData, schema) {
|
|
106
|
-
const schemaKeys = Object.keys(schema);
|
|
107
|
-
const matched = {};
|
|
108
|
-
let matchCount = 0;
|
|
109
|
-
|
|
110
|
-
// Key name alias map (common mapping patterns)
|
|
111
|
-
const keyAliases = {
|
|
112
|
-
title: ["name", "headline", "og:title", "twitter:title", "meta:title"],
|
|
113
|
-
description: ["description", "abstract", "og:description", "twitter:description", "meta:description"],
|
|
114
|
-
price: ["price", "offers.price", "offers.lowPrice"],
|
|
115
|
-
currency: ["priceCurrency", "offers.priceCurrency"],
|
|
116
|
-
image: ["image", "thumbnailUrl", "og:image", "twitter:image"],
|
|
117
|
-
url: ["url", "mainEntityOfPage", "og:url"],
|
|
118
|
-
author: ["author.name", "author", "creator"],
|
|
119
|
-
date: ["datePublished", "dateCreated", "dateModified"],
|
|
120
|
-
published: ["datePublished", "dateCreated"],
|
|
121
|
-
brand: ["brand.name", "brand"],
|
|
122
|
-
category: ["category", "articleSection"],
|
|
123
|
-
rating: ["aggregateRating.ratingValue"],
|
|
124
|
-
reviewCount: ["aggregateRating.reviewCount"],
|
|
125
|
-
availability: ["offers.availability"],
|
|
126
|
-
language: ["inLanguage"],
|
|
127
|
-
};
|
|
128
|
-
|
|
129
|
-
for (const schemaKey of schemaKeys) {
|
|
130
|
-
const normalizedKey = schemaKey.toLowerCase().replace(/[_-]/g, "");
|
|
131
|
-
|
|
132
|
-
// 1. Direct key match
|
|
133
|
-
if (structuredData[schemaKey] !== undefined) {
|
|
134
|
-
matched[schemaKey] = structuredData[schemaKey];
|
|
135
|
-
matchCount++;
|
|
136
|
-
continue;
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
// 2. Alias match
|
|
140
|
-
let found = false;
|
|
141
|
-
for (const [aliasGroup, aliases] of Object.entries(keyAliases)) {
|
|
142
|
-
if (normalizedKey.includes(aliasGroup)) {
|
|
143
|
-
for (const alias of aliases) {
|
|
144
|
-
if (structuredData[alias] !== undefined) {
|
|
145
|
-
matched[schemaKey] = structuredData[alias];
|
|
146
|
-
matchCount++;
|
|
147
|
-
found = true;
|
|
148
|
-
break;
|
|
149
|
-
}
|
|
150
|
-
}
|
|
151
|
-
if (found) break;
|
|
152
|
-
}
|
|
153
|
-
}
|
|
154
|
-
if (found) continue;
|
|
155
|
-
|
|
156
|
-
// 3. Partial match (structured data key contains schema key or vice versa)
|
|
157
|
-
for (const [dataKey, dataValue] of Object.entries(structuredData)) {
|
|
158
|
-
const normalizedDataKey = dataKey.toLowerCase().replace(/[_.-]/g, "");
|
|
159
|
-
if (normalizedDataKey.includes(normalizedKey) || normalizedKey.includes(normalizedDataKey)) {
|
|
160
|
-
matched[schemaKey] = dataValue;
|
|
161
|
-
matchCount++;
|
|
162
|
-
break;
|
|
163
|
-
}
|
|
164
|
-
}
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
return {
|
|
168
|
-
matched,
|
|
169
|
-
coverage: schemaKeys.length > 0 ? matchCount / schemaKeys.length : 0,
|
|
170
|
-
};
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
/**
|
|
174
|
-
* Extract structured data from HTML and match against schema
|
|
175
|
-
* Returns result if coverage meets threshold (no LLM needed)
|
|
176
|
-
*
|
|
177
|
-
* @param {string} html - Raw HTML
|
|
178
|
-
* @param {object} schema - User-defined schema
|
|
179
|
-
* @param {object} options
|
|
180
|
-
* @param {number} options.minCoverage - Minimum coverage ratio (0-1, default: 0.5)
|
|
181
|
-
* @returns {{ extracted: object, source: string, coverage: number } | null}
|
|
182
|
-
*/
|
|
183
|
-
export function tryStructuredExtract(html, schema, options = {}) {
|
|
184
|
-
const { minCoverage = 0.5 } = options;
|
|
185
|
-
|
|
186
|
-
// 1. Try extraction from JSON-LD
|
|
187
|
-
const jsonLdItems = extractJsonLd(html);
|
|
188
|
-
if (jsonLdItems.length > 0) {
|
|
189
|
-
// Merge and flatten all JSON-LD items
|
|
190
|
-
const merged = {};
|
|
191
|
-
for (const item of jsonLdItems) {
|
|
192
|
-
Object.assign(merged, flattenObject(item));
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
const { matched, coverage } = matchToSchema(merged, schema);
|
|
196
|
-
if (coverage >= minCoverage) {
|
|
197
|
-
return { extracted: matched, source: "json-ld", coverage };
|
|
198
|
-
}
|
|
199
|
-
}
|
|
200
|
-
|
|
201
|
-
// 2. Try extraction from Open Graph
|
|
202
|
-
const ogData = extractOpenGraph(html);
|
|
203
|
-
if (Object.keys(ogData).length > 0) {
|
|
204
|
-
const { matched, coverage } = matchToSchema(ogData, schema);
|
|
205
|
-
if (coverage >= minCoverage) {
|
|
206
|
-
return { extracted: matched, source: "open-graph", coverage };
|
|
207
|
-
}
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
// 3. Merge JSON-LD + OG and retry
|
|
211
|
-
if (jsonLdItems.length > 0 && Object.keys(ogData).length > 0) {
|
|
212
|
-
const merged = {};
|
|
213
|
-
for (const item of jsonLdItems) {
|
|
214
|
-
Object.assign(merged, flattenObject(item));
|
|
215
|
-
}
|
|
216
|
-
Object.assign(merged, ogData);
|
|
217
|
-
|
|
218
|
-
const { matched, coverage } = matchToSchema(merged, schema);
|
|
219
|
-
if (coverage >= minCoverage) {
|
|
220
|
-
return { extracted: matched, source: "json-ld+open-graph", coverage };
|
|
221
|
-
}
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
// Coverage insufficient -> null (fallback to LLM)
|
|
225
|
-
return null;
|
|
226
|
-
}
|