edgecrawl 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -7,7 +7,6 @@ Local AI-powered web scraper. Extract structured JSON from any website using on-
  - **100% Local AI** — Runs Qwen3 ONNX models on your machine via Transformers.js v4 (WebGPU/WASM)
  - **Zero API Keys** — No OpenAI, no Anthropic, no cloud bills. Everything runs on-device
  - **Structured JSON Output** — Define a schema, get clean JSON back
- - **Smart Extraction** — Tries JSON-LD/Open Graph first, falls back to LLM only when needed
  - **CLI + Library** — Use from the command line or import into your Node.js app

  ## Architecture
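The removed bullet is the only user-facing mention of the JSON-LD/Open Graph fast path; the remaining bullets still describe schema-driven extraction. A minimal sketch of such a schema object, assuming, as the removed structured-extract path does, that only the key names drive matching (how the LLM path uses the values is not shown in this diff); the field names are illustrative:

```js
// Hypothetical schema: the removed fast path only reads Object.keys(schema),
// so the string values here are free-form hints, not a validated type system.
const schema = {
  title: "Page or product title",
  description: "Short summary",
  price: "Price, if the page is a product",
};
```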
package/index.mjs CHANGED
@@ -15,8 +15,5 @@ export { initLLM, extractStructured, queryLLM, MODEL_PRESETS } from "./src/llm.m
  // HTML-to-Markdown conversion
  export { htmlToMarkdown, cleanMarkdown, truncateForLLM } from "./src/html2md.mjs";

- // Structured data extraction (JSON-LD / Open Graph)
- export { tryStructuredExtract } from "./src/structured-extract.mjs";
-
  // Browser / scraping primitives
  export { launchBrowser, fetchPage, fetchPages, closeBrowser } from "./src/scraper.mjs";
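With the re-export gone, the root entry point only exposes the LLM, Markdown, and scraper helpers visible in this hunk. A sketch of the resulting import surface, assuming the bare specifier "edgecrawl" resolves to index.mjs as package.json declares:

```js
// Root exports still visible in this hunk (0.3.1):
import {
  initLLM, extractStructured, queryLLM, MODEL_PRESETS, // from ./src/llm.mjs
  htmlToMarkdown, cleanMarkdown, truncateForLLM,        // from ./src/html2md.mjs
  launchBrowser, fetchPage, fetchPages, closeBrowser,   // from ./src/scraper.mjs
} from "edgecrawl";

// Removed in 0.3.1: this named export no longer exists.
// import { tryStructuredExtract } from "edgecrawl";
```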
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "edgecrawl",
-   "version": "0.3.0",
+   "version": "0.3.1",
    "description": "Local AI-powered web scraper. Extract structured JSON from any website using on-device ONNX LLMs. No API keys, no cloud, no Python.",
    "type": "module",
    "main": "./index.mjs",
@@ -8,8 +8,7 @@
      ".": "./index.mjs",
      "./scraper": "./src/scraper.mjs",
      "./html2md": "./src/html2md.mjs",
-     "./llm": "./src/llm.mjs",
-     "./structured-extract": "./src/structured-extract.mjs"
+     "./llm": "./src/llm.mjs"
    },
    "bin": {
      "edgecrawl": "./src/cli.mjs"
package/src/llm.mjs CHANGED
@@ -250,7 +250,7 @@ function normalizeExtracted(parsed) {
   * Safely parse a JSON string from LLM output
   */
  function parseJSONSafe(raw) {
-   // Remove think tags
+   // Remove think tags (closed and unclosed)
    let cleaned = raw.replace(/<think>[\s\S]*?<\/think>/g, "");
    cleaned = cleaned.replace(/<think>[\s\S]*/g, "");
    // Remove code fences
@@ -260,20 +260,45 @@ function parseJSONSafe(raw) {
    try {
      return JSON.parse(cleaned);
    } catch {
-     // Find the last valid JSON object
+     // Try broadest { ... } match first (handles nested objects/arrays)
+     const broad = cleaned.match(/\{[\s\S]*\}/);
+     if (broad) {
+       try {
+         return JSON.parse(broad[0]);
+       } catch {
+         // Try fixing common LLM JSON issues
+         try {
+           let fixed = broad[0];
+           fixed = fixed.replace(/,\s*([}\]])/g, "$1"); // trailing commas
+           // Fix bracket mismatches: ] closed with } or vice versa
+           const stack = [];
+           const chars = fixed.split("");
+           for (let i = 0; i < chars.length; i++) {
+             if (chars[i] === "{" || chars[i] === "[") {
+               stack.push(chars[i]);
+             } else if (chars[i] === "}") {
+               if (stack.length && stack[stack.length - 1] === "[") {
+                 chars[i] = "]"; // fix: [ was opened, } should be ]
+               }
+               stack.pop();
+             } else if (chars[i] === "]") {
+               if (stack.length && stack[stack.length - 1] === "{") {
+                 chars[i] = "}"; // fix: { was opened, ] should be }
+               }
+               stack.pop();
+             }
+           }
+           return JSON.parse(chars.join(""));
+         } catch {}
+       }
+     }
+     // Find the last valid JSON object (shallow)
      const matches = [...cleaned.matchAll(/\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}/g)];
      for (let i = matches.length - 1; i >= 0; i--) {
        try {
          return JSON.parse(matches[i][0]);
        } catch { continue; }
      }
-     // Try broadest match
-     const broad = cleaned.match(/\{[\s\S]*\}/);
-     if (broad) {
-       try {
-         return JSON.parse(broad[0]);
-       } catch {}
-     }
      return { _raw: raw, _error: "Failed to parse JSON" };
    }
  }
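The new repair pass first strips trailing commas, then walks the candidate JSON once and rewrites a closing brace or bracket that does not match the most recent opener. A traced example of the kind of output it can now recover (parseJSONSafe is module-internal, so this is a behavioral sketch rather than a public API call):

```js
// Raw model output where the LLM closed the "tags" array with "}" instead of "]".
const raw = '<think>pick fields</think>{"title": "Widget", "tags": ["a", "b",}}';

// Following the diff above:
// 1. The <think> block is stripped.
// 2. JSON.parse on the broad {...} match fails.
// 3. Trailing commas before a closer are removed:  ["a", "b",}}  ->  ["a", "b"}}
// 4. At the first "}", the opener stack has "[" on top, so it is rewritten to "]";
//    the second "}" then closes the outer object normally.
// Recovered value: { title: "Widget", tags: ["a", "b"] }
```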
package/src/pipeline.mjs CHANGED
@@ -9,7 +9,6 @@ import {
  } from "./scraper.mjs";
  import { htmlToMarkdown, cleanMarkdown, truncateForLLM } from "./html2md.mjs";
  import { initLLM, extractStructured, queryLLM } from "./llm.mjs";
- import { tryStructuredExtract } from "./structured-extract.mjs";

  /**
   * Default schema definition
@@ -44,18 +43,6 @@ export async function scrapeAndExtract(url, options = {}) {
    const { html, url: finalUrl, status } = await fetchPage(url, scrapeOptions);
    if (!html) return { url, error: "Failed to fetch page", status };

-   // Try structured data (JSON-LD / OG) first
-   const structResult = tryStructuredExtract(html, schema);
-   if (structResult) {
-     return {
-       url: finalUrl, status,
-       extraction_source: structResult.source,
-       coverage: structResult.coverage,
-       extracted: structResult.extracted,
-     };
-   }
-
-   // Fallback: Markdown conversion -> cleanMarkdown -> LLM
    const { markdown, title, excerpt } = htmlToMarkdown(html, finalUrl, { selector });
    const llmInput = cleanMarkdown(markdown);
    const metadata = {
@@ -98,19 +85,6 @@ export async function batchScrapeAndExtract(urls, options = {}) {
        continue;
      }

-     // Try structured data first
-     const structResult = tryStructuredExtract(page.html, schema);
-     if (structResult) {
-       results.push({
-         url: page.url, status: page.status,
-         extraction_source: structResult.source,
-         coverage: structResult.coverage,
-         extracted: structResult.extracted,
-       });
-       continue;
-     }
-
-     // Fallback: Markdown -> LLM
      const { markdown, title } = htmlToMarkdown(page.html, page.url);
      const truncated = truncateForLLM(cleanMarkdown(markdown), maxTokens);
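With both fast-path branches deleted, scrapeAndExtract and batchScrapeAndExtract always go through htmlToMarkdown, cleanMarkdown, and the local model. A hedged usage sketch, assuming scrapeAndExtract is re-exported from the package root (that re-export sits outside the hunks shown here) and using only option names visible above; the URL and schema fields are illustrative:

```js
import { scrapeAndExtract } from "edgecrawl"; // assumed root re-export

const result = await scrapeAndExtract("https://example.com/widgets/42", {
  schema: { title: "", price: "", description: "" }, // illustrative fields
});

// In 0.3.1 there is no json-ld / open-graph short-circuit, so extraction_source
// can no longer be "json-ld" or "open-graph"; every result comes from the
// HTML -> Markdown -> local LLM path.
console.log(result);
```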
package/src/structured-extract.mjs DELETED
@@ -1,226 +0,0 @@
- // src/structured-extract.mjs
- // Extract structured data (JSON-LD, Open Graph) from HTML
- // If schema match coverage is sufficient, return result without LLM
-
- import { JSDOM } from "jsdom";
-
- /**
-  * Extract JSON-LD data from HTML
-  * @param {string} html - Raw HTML
-  * @returns {object[]} Array of JSON-LD objects
-  */
- function extractJsonLd(html) {
-   const dom = new JSDOM(html);
-   const scripts = dom.window.document.querySelectorAll('script[type="application/ld+json"]');
-   const results = [];
-
-   for (const script of scripts) {
-     try {
-       const data = JSON.parse(script.textContent);
-       // Expand @graph if present
-       if (data["@graph"] && Array.isArray(data["@graph"])) {
-         results.push(...data["@graph"]);
-       } else if (Array.isArray(data)) {
-         results.push(...data);
-       } else {
-         results.push(data);
-       }
-     } catch {
-       // Ignore JSON parse failures
-     }
-   }
-
-   return results;
- }
-
- /**
-  * Extract Open Graph meta tags from HTML
-  * @param {string} html - Raw HTML
-  * @returns {object} OG property dictionary { "og:title": "...", ... }
-  */
- function extractOpenGraph(html) {
-   const dom = new JSDOM(html);
-   const doc = dom.window.document;
-   const og = {};
-
-   // og:* meta tags
-   const ogMetas = doc.querySelectorAll('meta[property^="og:"]');
-   for (const meta of ogMetas) {
-     const prop = meta.getAttribute("property");
-     const content = meta.getAttribute("content");
-     if (prop && content) {
-       og[prop] = content;
-     }
-   }
-
-   // twitter:* meta tags
-   const twitterMetas = doc.querySelectorAll('meta[name^="twitter:"], meta[property^="twitter:"]');
-   for (const meta of twitterMetas) {
-     const prop = meta.getAttribute("name") || meta.getAttribute("property");
-     const content = meta.getAttribute("content");
-     if (prop && content) {
-       og[prop] = content;
-     }
-   }
-
-   // Basic meta tags
-   const descMeta = doc.querySelector('meta[name="description"]');
-   if (descMeta) og["meta:description"] = descMeta.getAttribute("content");
-
-   const titleEl = doc.querySelector("title");
-   if (titleEl) og["meta:title"] = titleEl.textContent.trim();
-
-   return og;
- }
-
- /**
-  * Flatten JSON-LD data into key-value pairs
-  * Nested objects become "parent.child" format
-  */
- function flattenObject(obj, prefix = "") {
-   const result = {};
-
-   for (const [key, value] of Object.entries(obj)) {
-     if (key.startsWith("@")) continue; // Skip @type, @context, etc.
-
-     const fullKey = prefix ? `${prefix}.${key}` : key;
-
-     if (value && typeof value === "object" && !Array.isArray(value)) {
-       Object.assign(result, flattenObject(value, fullKey));
-     } else {
-       result[fullKey] = value;
-     }
-   }
-
-   return result;
- }
-
- /**
-  * Match structured data against a schema
-  * For each schema key, find a corresponding value in structured data
-  * @param {object} structuredData - Flattened structured data
-  * @param {object} schema - User-defined schema
-  * @returns {{ matched: object, coverage: number }} Match results and coverage ratio
-  */
- function matchToSchema(structuredData, schema) {
-   const schemaKeys = Object.keys(schema);
-   const matched = {};
-   let matchCount = 0;
-
-   // Key name alias map (common mapping patterns)
-   const keyAliases = {
-     title: ["name", "headline", "og:title", "twitter:title", "meta:title"],
-     description: ["description", "abstract", "og:description", "twitter:description", "meta:description"],
-     price: ["price", "offers.price", "offers.lowPrice"],
-     currency: ["priceCurrency", "offers.priceCurrency"],
-     image: ["image", "thumbnailUrl", "og:image", "twitter:image"],
-     url: ["url", "mainEntityOfPage", "og:url"],
-     author: ["author.name", "author", "creator"],
-     date: ["datePublished", "dateCreated", "dateModified"],
-     published: ["datePublished", "dateCreated"],
-     brand: ["brand.name", "brand"],
-     category: ["category", "articleSection"],
-     rating: ["aggregateRating.ratingValue"],
-     reviewCount: ["aggregateRating.reviewCount"],
-     availability: ["offers.availability"],
-     language: ["inLanguage"],
-   };
-
-   for (const schemaKey of schemaKeys) {
-     const normalizedKey = schemaKey.toLowerCase().replace(/[_-]/g, "");
-
-     // 1. Direct key match
-     if (structuredData[schemaKey] !== undefined) {
-       matched[schemaKey] = structuredData[schemaKey];
-       matchCount++;
-       continue;
-     }
-
-     // 2. Alias match
-     let found = false;
-     for (const [aliasGroup, aliases] of Object.entries(keyAliases)) {
-       if (normalizedKey.includes(aliasGroup)) {
-         for (const alias of aliases) {
-           if (structuredData[alias] !== undefined) {
-             matched[schemaKey] = structuredData[alias];
-             matchCount++;
-             found = true;
-             break;
-           }
-         }
-         if (found) break;
-       }
-     }
-     if (found) continue;
-
-     // 3. Partial match (structured data key contains schema key or vice versa)
-     for (const [dataKey, dataValue] of Object.entries(structuredData)) {
-       const normalizedDataKey = dataKey.toLowerCase().replace(/[_.-]/g, "");
-       if (normalizedDataKey.includes(normalizedKey) || normalizedKey.includes(normalizedDataKey)) {
-         matched[schemaKey] = dataValue;
-         matchCount++;
-         break;
-       }
-     }
-   }
-
-   return {
-     matched,
-     coverage: schemaKeys.length > 0 ? matchCount / schemaKeys.length : 0,
-   };
- }
-
- /**
-  * Extract structured data from HTML and match against schema
-  * Returns result if coverage meets threshold (no LLM needed)
-  *
-  * @param {string} html - Raw HTML
-  * @param {object} schema - User-defined schema
-  * @param {object} options
-  * @param {number} options.minCoverage - Minimum coverage ratio (0-1, default: 0.5)
-  * @returns {{ extracted: object, source: string, coverage: number } | null}
-  */
- export function tryStructuredExtract(html, schema, options = {}) {
-   const { minCoverage = 0.5 } = options;
-
-   // 1. Try extraction from JSON-LD
-   const jsonLdItems = extractJsonLd(html);
-   if (jsonLdItems.length > 0) {
-     // Merge and flatten all JSON-LD items
-     const merged = {};
-     for (const item of jsonLdItems) {
-       Object.assign(merged, flattenObject(item));
-     }
-
-     const { matched, coverage } = matchToSchema(merged, schema);
-     if (coverage >= minCoverage) {
-       return { extracted: matched, source: "json-ld", coverage };
-     }
-   }
-
-   // 2. Try extraction from Open Graph
-   const ogData = extractOpenGraph(html);
-   if (Object.keys(ogData).length > 0) {
-     const { matched, coverage } = matchToSchema(ogData, schema);
-     if (coverage >= minCoverage) {
-       return { extracted: matched, source: "open-graph", coverage };
-     }
-   }
-
-   // 3. Merge JSON-LD + OG and retry
-   if (jsonLdItems.length > 0 && Object.keys(ogData).length > 0) {
-     const merged = {};
-     for (const item of jsonLdItems) {
-       Object.assign(merged, flattenObject(item));
-     }
-     Object.assign(merged, ogData);
-
-     const { matched, coverage } = matchToSchema(merged, schema);
-     if (coverage >= minCoverage) {
-       return { extracted: matched, source: "json-ld+open-graph", coverage };
-     }
-   }
-
-   // Coverage insufficient -> null (fallback to LLM)
-   return null;
- }
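structured-extract.mjs is deleted outright rather than deprecated, taking tryStructuredExtract and its jsdom-based JSON-LD/Open Graph matching with it. Projects that still want the metadata fast path can vendor the removed file above (plus a jsdom dependency). A small sketch of how the deleted helper was called, per its own JSDoc and return statements; the HTML and the vendored path are illustrative:

```js
// Vendored copy of the file removed in this diff (hypothetical local path).
import { tryStructuredExtract } from "./vendored/structured-extract.mjs";

const html = `<html><head>
  <meta property="og:title" content="Widget">
  <meta property="og:description" content="A small widget">
</head><body></body></html>`;

// Schema keys drive the matching; og:title / og:description satisfy the "title"
// and "description" aliases, so coverage is 1.0 and no LLM call would be needed.
const hit = tryStructuredExtract(html, { title: "", description: "" }, { minCoverage: 0.5 });
// -> { extracted: { title: "Widget", description: "A small widget" },
//      source: "open-graph", coverage: 1 }  (or null when coverage < minCoverage)
```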