edgecrawl 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/index.mjs +0 -3
- package/package.json +2 -3
- package/src/llm.mjs +34 -9
- package/src/pipeline.mjs +0 -26
- package/src/structured-extract.mjs +0 -226
package/README.md
CHANGED
|
@@ -7,7 +7,6 @@ Local AI-powered web scraper. Extract structured JSON from any website using on-
|
|
|
7
7
|
- **100% Local AI** — Runs Qwen3 ONNX models on your machine via Transformers.js v4 (WebGPU/WASM)
|
|
8
8
|
- **Zero API Keys** — No OpenAI, no Anthropic, no cloud bills. Everything runs on-device
|
|
9
9
|
- **Structured JSON Output** — Define a schema, get clean JSON back
|
|
10
|
-
- **Smart Extraction** — Tries JSON-LD/Open Graph first, falls back to LLM only when needed
|
|
11
10
|
- **CLI + Library** — Use from the command line or import into your Node.js app
|
|
12
11
|
|
|
13
12
|
## Architecture
|
package/index.mjs
CHANGED
|
@@ -15,8 +15,5 @@ export { initLLM, extractStructured, queryLLM, MODEL_PRESETS } from "./src/llm.m
|
|
|
15
15
|
// HTML-to-Markdown conversion
|
|
16
16
|
export { htmlToMarkdown, cleanMarkdown, truncateForLLM } from "./src/html2md.mjs";
|
|
17
17
|
|
|
18
|
-
// Structured data extraction (JSON-LD / Open Graph)
|
|
19
|
-
export { tryStructuredExtract } from "./src/structured-extract.mjs";
|
|
20
|
-
|
|
21
18
|
// Browser / scraping primitives
|
|
22
19
|
export { launchBrowser, fetchPage, fetchPages, closeBrowser } from "./src/scraper.mjs";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "edgecrawl",
|
|
3
|
-
"version": "0.3.0",
|
|
3
|
+
"version": "0.3.1",
|
|
4
4
|
"description": "Local AI-powered web scraper. Extract structured JSON from any website using on-device ONNX LLMs. No API keys, no cloud, no Python.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./index.mjs",
|
|
@@ -8,8 +8,7 @@
|
|
|
8
8
|
".": "./index.mjs",
|
|
9
9
|
"./scraper": "./src/scraper.mjs",
|
|
10
10
|
"./html2md": "./src/html2md.mjs",
|
|
11
|
-
"./llm": "./src/llm.mjs"
|
|
12
|
-
"./structured-extract": "./src/structured-extract.mjs"
|
|
11
|
+
"./llm": "./src/llm.mjs"
|
|
13
12
|
},
|
|
14
13
|
"bin": {
|
|
15
14
|
"edgecrawl": "./src/cli.mjs"
|
package/src/llm.mjs
CHANGED
|
@@ -250,7 +250,7 @@ function normalizeExtracted(parsed) {
|
|
|
250
250
|
* Safely parse a JSON string from LLM output
|
|
251
251
|
*/
|
|
252
252
|
function parseJSONSafe(raw) {
|
|
253
|
-
// Remove think tags
|
|
253
|
+
// Remove think tags (closed and unclosed)
|
|
254
254
|
let cleaned = raw.replace(/<think>[\s\S]*?<\/think>/g, "");
|
|
255
255
|
cleaned = cleaned.replace(/<think>[\s\S]*/g, "");
|
|
256
256
|
// Remove code fences
|
|
@@ -260,20 +260,45 @@ function parseJSONSafe(raw) {
|
|
|
260
260
|
try {
|
|
261
261
|
return JSON.parse(cleaned);
|
|
262
262
|
} catch {
|
|
263
|
-
//
|
|
263
|
+
// Try broadest { ... } match first (handles nested objects/arrays)
|
|
264
|
+
const broad = cleaned.match(/\{[\s\S]*\}/);
|
|
265
|
+
if (broad) {
|
|
266
|
+
try {
|
|
267
|
+
return JSON.parse(broad[0]);
|
|
268
|
+
} catch {
|
|
269
|
+
// Try fixing common LLM JSON issues
|
|
270
|
+
try {
|
|
271
|
+
let fixed = broad[0];
|
|
272
|
+
fixed = fixed.replace(/,\s*([}\]])/g, "$1"); // trailing commas
|
|
273
|
+
// Fix bracket mismatches: ] closed with } or vice versa
|
|
274
|
+
const stack = [];
|
|
275
|
+
const chars = fixed.split("");
|
|
276
|
+
for (let i = 0; i < chars.length; i++) {
|
|
277
|
+
if (chars[i] === "{" || chars[i] === "[") {
|
|
278
|
+
stack.push(chars[i]);
|
|
279
|
+
} else if (chars[i] === "}") {
|
|
280
|
+
if (stack.length && stack[stack.length - 1] === "[") {
|
|
281
|
+
chars[i] = "]"; // fix: [ was opened, } should be ]
|
|
282
|
+
}
|
|
283
|
+
stack.pop();
|
|
284
|
+
} else if (chars[i] === "]") {
|
|
285
|
+
if (stack.length && stack[stack.length - 1] === "{") {
|
|
286
|
+
chars[i] = "}"; // fix: { was opened, ] should be }
|
|
287
|
+
}
|
|
288
|
+
stack.pop();
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
return JSON.parse(chars.join(""));
|
|
292
|
+
} catch {}
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
// Find the last valid JSON object (shallow)
|
|
264
296
|
const matches = [...cleaned.matchAll(/\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}/g)];
|
|
265
297
|
for (let i = matches.length - 1; i >= 0; i--) {
|
|
266
298
|
try {
|
|
267
299
|
return JSON.parse(matches[i][0]);
|
|
268
300
|
} catch { continue; }
|
|
269
301
|
}
|
|
270
|
-
// Try broadest match
|
|
271
|
-
const broad = cleaned.match(/\{[\s\S]*\}/);
|
|
272
|
-
if (broad) {
|
|
273
|
-
try {
|
|
274
|
-
return JSON.parse(broad[0]);
|
|
275
|
-
} catch {}
|
|
276
|
-
}
|
|
277
302
|
return { _raw: raw, _error: "Failed to parse JSON" };
|
|
278
303
|
}
|
|
279
304
|
}
|
package/src/pipeline.mjs
CHANGED
|
@@ -9,7 +9,6 @@ import {
|
|
|
9
9
|
} from "./scraper.mjs";
|
|
10
10
|
import { htmlToMarkdown, cleanMarkdown, truncateForLLM } from "./html2md.mjs";
|
|
11
11
|
import { initLLM, extractStructured, queryLLM } from "./llm.mjs";
|
|
12
|
-
import { tryStructuredExtract } from "./structured-extract.mjs";
|
|
13
12
|
|
|
14
13
|
/**
|
|
15
14
|
* Default schema definition
|
|
@@ -44,18 +43,6 @@ export async function scrapeAndExtract(url, options = {}) {
|
|
|
44
43
|
const { html, url: finalUrl, status } = await fetchPage(url, scrapeOptions);
|
|
45
44
|
if (!html) return { url, error: "Failed to fetch page", status };
|
|
46
45
|
|
|
47
|
-
// Try structured data (JSON-LD / OG) first
|
|
48
|
-
const structResult = tryStructuredExtract(html, schema);
|
|
49
|
-
if (structResult) {
|
|
50
|
-
return {
|
|
51
|
-
url: finalUrl, status,
|
|
52
|
-
extraction_source: structResult.source,
|
|
53
|
-
coverage: structResult.coverage,
|
|
54
|
-
extracted: structResult.extracted,
|
|
55
|
-
};
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
// Fallback: Markdown conversion -> cleanMarkdown -> LLM
|
|
59
46
|
const { markdown, title, excerpt } = htmlToMarkdown(html, finalUrl, { selector });
|
|
60
47
|
const llmInput = cleanMarkdown(markdown);
|
|
61
48
|
const metadata = {
|
|
@@ -98,19 +85,6 @@ export async function batchScrapeAndExtract(urls, options = {}) {
|
|
|
98
85
|
continue;
|
|
99
86
|
}
|
|
100
87
|
|
|
101
|
-
// Try structured data first
|
|
102
|
-
const structResult = tryStructuredExtract(page.html, schema);
|
|
103
|
-
if (structResult) {
|
|
104
|
-
results.push({
|
|
105
|
-
url: page.url, status: page.status,
|
|
106
|
-
extraction_source: structResult.source,
|
|
107
|
-
coverage: structResult.coverage,
|
|
108
|
-
extracted: structResult.extracted,
|
|
109
|
-
});
|
|
110
|
-
continue;
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
// Fallback: Markdown -> LLM
|
|
114
88
|
const { markdown, title } = htmlToMarkdown(page.html, page.url);
|
|
115
89
|
const truncated = truncateForLLM(cleanMarkdown(markdown), maxTokens);
|
|
116
90
|
|
|
@@ -1,226 +0,0 @@
|
|
|
1
|
-
// src/structured-extract.mjs
|
|
2
|
-
// Extract structured data (JSON-LD, Open Graph) from HTML
|
|
3
|
-
// If schema match coverage is sufficient, return result without LLM
|
|
4
|
-
|
|
5
|
-
import { JSDOM } from "jsdom";
|
|
6
|
-
|
|
7
|
-
/**
|
|
8
|
-
* Extract JSON-LD data from HTML
|
|
9
|
-
* @param {string} html - Raw HTML
|
|
10
|
-
* @returns {object[]} Array of JSON-LD objects
|
|
11
|
-
*/
|
|
12
|
-
function extractJsonLd(html) {
|
|
13
|
-
const dom = new JSDOM(html);
|
|
14
|
-
const scripts = dom.window.document.querySelectorAll('script[type="application/ld+json"]');
|
|
15
|
-
const results = [];
|
|
16
|
-
|
|
17
|
-
for (const script of scripts) {
|
|
18
|
-
try {
|
|
19
|
-
const data = JSON.parse(script.textContent);
|
|
20
|
-
// Expand @graph if present
|
|
21
|
-
if (data["@graph"] && Array.isArray(data["@graph"])) {
|
|
22
|
-
results.push(...data["@graph"]);
|
|
23
|
-
} else if (Array.isArray(data)) {
|
|
24
|
-
results.push(...data);
|
|
25
|
-
} else {
|
|
26
|
-
results.push(data);
|
|
27
|
-
}
|
|
28
|
-
} catch {
|
|
29
|
-
// Ignore JSON parse failures
|
|
30
|
-
}
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
return results;
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
/**
|
|
37
|
-
* Extract Open Graph meta tags from HTML
|
|
38
|
-
* @param {string} html - Raw HTML
|
|
39
|
-
* @returns {object} OG property dictionary { "og:title": "...", ... }
|
|
40
|
-
*/
|
|
41
|
-
function extractOpenGraph(html) {
|
|
42
|
-
const dom = new JSDOM(html);
|
|
43
|
-
const doc = dom.window.document;
|
|
44
|
-
const og = {};
|
|
45
|
-
|
|
46
|
-
// og:* meta tags
|
|
47
|
-
const ogMetas = doc.querySelectorAll('meta[property^="og:"]');
|
|
48
|
-
for (const meta of ogMetas) {
|
|
49
|
-
const prop = meta.getAttribute("property");
|
|
50
|
-
const content = meta.getAttribute("content");
|
|
51
|
-
if (prop && content) {
|
|
52
|
-
og[prop] = content;
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
// twitter:* meta tags
|
|
57
|
-
const twitterMetas = doc.querySelectorAll('meta[name^="twitter:"], meta[property^="twitter:"]');
|
|
58
|
-
for (const meta of twitterMetas) {
|
|
59
|
-
const prop = meta.getAttribute("name") || meta.getAttribute("property");
|
|
60
|
-
const content = meta.getAttribute("content");
|
|
61
|
-
if (prop && content) {
|
|
62
|
-
og[prop] = content;
|
|
63
|
-
}
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
// Basic meta tags
|
|
67
|
-
const descMeta = doc.querySelector('meta[name="description"]');
|
|
68
|
-
if (descMeta) og["meta:description"] = descMeta.getAttribute("content");
|
|
69
|
-
|
|
70
|
-
const titleEl = doc.querySelector("title");
|
|
71
|
-
if (titleEl) og["meta:title"] = titleEl.textContent.trim();
|
|
72
|
-
|
|
73
|
-
return og;
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
/**
|
|
77
|
-
* Flatten JSON-LD data into key-value pairs
|
|
78
|
-
* Nested objects become "parent.child" format
|
|
79
|
-
*/
|
|
80
|
-
function flattenObject(obj, prefix = "") {
|
|
81
|
-
const result = {};
|
|
82
|
-
|
|
83
|
-
for (const [key, value] of Object.entries(obj)) {
|
|
84
|
-
if (key.startsWith("@")) continue; // Skip @type, @context, etc.
|
|
85
|
-
|
|
86
|
-
const fullKey = prefix ? `${prefix}.${key}` : key;
|
|
87
|
-
|
|
88
|
-
if (value && typeof value === "object" && !Array.isArray(value)) {
|
|
89
|
-
Object.assign(result, flattenObject(value, fullKey));
|
|
90
|
-
} else {
|
|
91
|
-
result[fullKey] = value;
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
return result;
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
/**
|
|
99
|
-
* Match structured data against a schema
|
|
100
|
-
* For each schema key, find a corresponding value in structured data
|
|
101
|
-
* @param {object} structuredData - Flattened structured data
|
|
102
|
-
* @param {object} schema - User-defined schema
|
|
103
|
-
* @returns {{ matched: object, coverage: number }} Match results and coverage ratio
|
|
104
|
-
*/
|
|
105
|
-
function matchToSchema(structuredData, schema) {
|
|
106
|
-
const schemaKeys = Object.keys(schema);
|
|
107
|
-
const matched = {};
|
|
108
|
-
let matchCount = 0;
|
|
109
|
-
|
|
110
|
-
// Key name alias map (common mapping patterns)
|
|
111
|
-
const keyAliases = {
|
|
112
|
-
title: ["name", "headline", "og:title", "twitter:title", "meta:title"],
|
|
113
|
-
description: ["description", "abstract", "og:description", "twitter:description", "meta:description"],
|
|
114
|
-
price: ["price", "offers.price", "offers.lowPrice"],
|
|
115
|
-
currency: ["priceCurrency", "offers.priceCurrency"],
|
|
116
|
-
image: ["image", "thumbnailUrl", "og:image", "twitter:image"],
|
|
117
|
-
url: ["url", "mainEntityOfPage", "og:url"],
|
|
118
|
-
author: ["author.name", "author", "creator"],
|
|
119
|
-
date: ["datePublished", "dateCreated", "dateModified"],
|
|
120
|
-
published: ["datePublished", "dateCreated"],
|
|
121
|
-
brand: ["brand.name", "brand"],
|
|
122
|
-
category: ["category", "articleSection"],
|
|
123
|
-
rating: ["aggregateRating.ratingValue"],
|
|
124
|
-
reviewCount: ["aggregateRating.reviewCount"],
|
|
125
|
-
availability: ["offers.availability"],
|
|
126
|
-
language: ["inLanguage"],
|
|
127
|
-
};
|
|
128
|
-
|
|
129
|
-
for (const schemaKey of schemaKeys) {
|
|
130
|
-
const normalizedKey = schemaKey.toLowerCase().replace(/[_-]/g, "");
|
|
131
|
-
|
|
132
|
-
// 1. Direct key match
|
|
133
|
-
if (structuredData[schemaKey] !== undefined) {
|
|
134
|
-
matched[schemaKey] = structuredData[schemaKey];
|
|
135
|
-
matchCount++;
|
|
136
|
-
continue;
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
// 2. Alias match
|
|
140
|
-
let found = false;
|
|
141
|
-
for (const [aliasGroup, aliases] of Object.entries(keyAliases)) {
|
|
142
|
-
if (normalizedKey.includes(aliasGroup)) {
|
|
143
|
-
for (const alias of aliases) {
|
|
144
|
-
if (structuredData[alias] !== undefined) {
|
|
145
|
-
matched[schemaKey] = structuredData[alias];
|
|
146
|
-
matchCount++;
|
|
147
|
-
found = true;
|
|
148
|
-
break;
|
|
149
|
-
}
|
|
150
|
-
}
|
|
151
|
-
if (found) break;
|
|
152
|
-
}
|
|
153
|
-
}
|
|
154
|
-
if (found) continue;
|
|
155
|
-
|
|
156
|
-
// 3. Partial match (structured data key contains schema key or vice versa)
|
|
157
|
-
for (const [dataKey, dataValue] of Object.entries(structuredData)) {
|
|
158
|
-
const normalizedDataKey = dataKey.toLowerCase().replace(/[_.-]/g, "");
|
|
159
|
-
if (normalizedDataKey.includes(normalizedKey) || normalizedKey.includes(normalizedDataKey)) {
|
|
160
|
-
matched[schemaKey] = dataValue;
|
|
161
|
-
matchCount++;
|
|
162
|
-
break;
|
|
163
|
-
}
|
|
164
|
-
}
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
return {
|
|
168
|
-
matched,
|
|
169
|
-
coverage: schemaKeys.length > 0 ? matchCount / schemaKeys.length : 0,
|
|
170
|
-
};
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
/**
|
|
174
|
-
* Extract structured data from HTML and match against schema
|
|
175
|
-
* Returns result if coverage meets threshold (no LLM needed)
|
|
176
|
-
*
|
|
177
|
-
* @param {string} html - Raw HTML
|
|
178
|
-
* @param {object} schema - User-defined schema
|
|
179
|
-
* @param {object} options
|
|
180
|
-
* @param {number} options.minCoverage - Minimum coverage ratio (0-1, default: 0.5)
|
|
181
|
-
* @returns {{ extracted: object, source: string, coverage: number } | null}
|
|
182
|
-
*/
|
|
183
|
-
export function tryStructuredExtract(html, schema, options = {}) {
|
|
184
|
-
const { minCoverage = 0.5 } = options;
|
|
185
|
-
|
|
186
|
-
// 1. Try extraction from JSON-LD
|
|
187
|
-
const jsonLdItems = extractJsonLd(html);
|
|
188
|
-
if (jsonLdItems.length > 0) {
|
|
189
|
-
// Merge and flatten all JSON-LD items
|
|
190
|
-
const merged = {};
|
|
191
|
-
for (const item of jsonLdItems) {
|
|
192
|
-
Object.assign(merged, flattenObject(item));
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
const { matched, coverage } = matchToSchema(merged, schema);
|
|
196
|
-
if (coverage >= minCoverage) {
|
|
197
|
-
return { extracted: matched, source: "json-ld", coverage };
|
|
198
|
-
}
|
|
199
|
-
}
|
|
200
|
-
|
|
201
|
-
// 2. Try extraction from Open Graph
|
|
202
|
-
const ogData = extractOpenGraph(html);
|
|
203
|
-
if (Object.keys(ogData).length > 0) {
|
|
204
|
-
const { matched, coverage } = matchToSchema(ogData, schema);
|
|
205
|
-
if (coverage >= minCoverage) {
|
|
206
|
-
return { extracted: matched, source: "open-graph", coverage };
|
|
207
|
-
}
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
// 3. Merge JSON-LD + OG and retry
|
|
211
|
-
if (jsonLdItems.length > 0 && Object.keys(ogData).length > 0) {
|
|
212
|
-
const merged = {};
|
|
213
|
-
for (const item of jsonLdItems) {
|
|
214
|
-
Object.assign(merged, flattenObject(item));
|
|
215
|
-
}
|
|
216
|
-
Object.assign(merged, ogData);
|
|
217
|
-
|
|
218
|
-
const { matched, coverage } = matchToSchema(merged, schema);
|
|
219
|
-
if (coverage >= minCoverage) {
|
|
220
|
-
return { extracted: matched, source: "json-ld+open-graph", coverage };
|
|
221
|
-
}
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
// Coverage insufficient -> null (fallback to LLM)
|
|
225
|
-
return null;
|
|
226
|
-
}
|