npm - crawler-extraction-engine - Versions diffs - 1.0.0 - Mend

crawler-extraction-engine 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.md ADDED Viewed

@@ -0,0 +1,63 @@
+# @properti-ag/extraction-engine
+Unified HTML and JSON extraction engine with CSS, XPath, and JSONPath support.
+## Installation
+```bash
+npm install @properti-ag/extraction-engine
+```
+## Quick Start
+### HTML Extraction (CSS/XPath)
+```javascript
+const { extractHtml } = require('@properti-ag/extraction-engine');
+const result = extractHtml(
+  { html: '<div class="item">Content</div>' },
+  { kind: 'html', selector: 'div.item' }
+);
+console.log(result.nodes); // [<div class="item">Content</div>]
+```
+### JSON Extraction (JSONPath)
+```javascript
+const { extractJson } = require('@properti-ag/extraction-engine');
+const html = '<script>window.__DATA__ = {"title": "Hello"}</script>';
+const result = extractJson(
+  { html },
+  {
+    kind: 'json',
+    selector: 'script',
+    extractionRegex: '{.*}',
+    elements: [
+      { property: 'title', jsonPath: '$.title' }
+    ]
+  }
+);
+console.log(result.data); // { title: "Hello" }
+```
+## Features
+- ✅ **CSS Selectors**: `div.className`, `#id`, `[attribute]`
+- ✅ **XPath Support**: `//div[@class='item']`, `.//a[@href]`
+- ✅ **JSONPath Queries**: `$.data.items[*].title`
+- ✅ **Auto-detection**: Automatically detects selector type
+- ✅ **Context Nodes**: Relative queries with context
+- ✅ **Error Handling**: Custom error types for debugging
+## API
+See [full documentation](https://github.com/properti-ag/extraction-engine)
+## License
+MIT

package/package.json ADDED Viewed

@@ -0,0 +1,26 @@
+{
+  "name": "crawler-extraction-engine",
+  "version": "1.0.0",
+  "description": "Unified HTML and JSON extraction engine with CSS, XPath, and JSONPath support",
+  "main": "src/index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "keywords": [
+    "extraction",
+    "scraping",
+    "xpath",
+    "jsonpath",
+    "html-parser",
+    "web-scraping"
+  ],
+  "author": "Properti AG",
+  "license": "MIT",
+  "dependencies": {
+    "jsdom": "^24.0.0",
+    "jsonpath-plus": "^7.2.0"
+  },
+  "engines": {
+    "node": ">=16.0.0"
+  }
+}

package/src/context.js ADDED Viewed

@@ -0,0 +1,23 @@
+const { JSDOM } = require("jsdom");
+const { ExtractionError } = require("./errors");
+/**
+ * Resolve Document from extraction context
+ * Supports both Node.js (jsdom) and browser (native Document) contexts
+ * @param {import("./types").ExtractionContext} ctx - Extraction context
+ * @returns {Document} - Document object
+ * @throws {ExtractionError} If neither html nor document is provided
+ */
+function resolveDocument(ctx) {
+    if (ctx.document) return ctx.document;
+    if (typeof document !== "undefined" && !ctx.html) return document;
+    if (!ctx.html) {
+        throw new ExtractionError("ExtractionContext requires html or document", { context: ctx });
+    }
+    const dom = new JSDOM(ctx.html, { url: ctx.baseUrl || "https://example.com" });
+    return dom.window.document;
+}
+module.exports = {
+    resolveDocument // sole export; required by both html.js and json.js
+};

package/src/errors.js ADDED Viewed

@@ -0,0 +1,29 @@
+class InvalidSelectorError extends Error {
+    constructor(selector, reason) {
+        super(`Invalid selector: ${selector} - ${reason}`);
+        this.selector = selector;
+        this.name = 'InvalidSelectorError';
+    }
+}
+class JsonPathError extends Error {
+    constructor(path, reason) {
+        super(`JSONPath error: ${path} - ${reason}`);
+        this.path = path;
+        this.name = 'JsonPathError';
+    }
+}
+class ExtractionError extends Error {
+    constructor(message, context) {
+        super(message);
+        this.context = context;
+        this.name = 'ExtractionError';
+    }
+}
+module.exports = {
+    InvalidSelectorError, // message: "Invalid selector: <selector> - <reason>"
+    JsonPathError, // message: "JSONPath error: <path> - <reason>"
+    ExtractionError // carries a free-form `context` object alongside the message
+};

package/src/html.js ADDED Viewed

@@ -0,0 +1,106 @@
+const { resolveDocument } = require("./context");
+const { ExtractionError, InvalidSelectorError } = require("./errors");
+/**
+ * Check if selector is XPath based on selector string or explicit type
+ * @param {string} selector - Selector string
+ * @param {string} [selectorType] - Explicit selector type
+ * @returns {boolean} - True if XPath, false if CSS
+ */
+function isXPathSelector(selector, selectorType) {
+    if (selectorType) {
+        const v = String(selectorType).toLowerCase();
+        if (v === "xpath" || v === "xpathselector") return true;
+        if (v === "css") return false;
+    }
+    if (!selector) return false;
+    return selector.startsWith("//") || selector.startsWith(".//");
+}
+/**
+ * Evaluate XPath using native document.evaluate (works with jsdom)
+ * @param {Document} doc - Document object
+ * @param {string} xpath - XPath expression
+ * @param {Node} [contextNode] - Context node for relative XPath (defaults to doc)
+ * @returns {Element[]} - Array of matched elements
+ */
+function evaluateXPath(doc, xpath, contextNode) {
+    const context = contextNode || doc;
+    try {
+        const result = doc.evaluate(xpath, context, null, doc.XPathResult ? doc.XPathResult.ORDERED_NODE_SNAPSHOT_TYPE : 7, null);
+        const nodes = [];
+        for (let i = 0; i < result.snapshotLength; i++) {
+            const node = result.snapshotItem(i);
+            if (node && node.nodeType === 1) {
+                nodes.push(node);
+            }
+        }
+        return nodes;
+    } catch (error) {
+        throw new InvalidSelectorError(xpath, error.message || "XPath evaluation failed");
+    }
+}
+/**
+ * Select HTML elements using CSS or XPath selector
+ * @param {Document} doc - Document object
+ * @param {string} selector - CSS or XPath selector
+ * @param {string} [selectorType] - Explicit selector type
+ * @param {Node} [contextNode] - Context node for relative queries
+ * @returns {Element[]} - Array of matched elements
+ * @throws {InvalidSelectorError} If selector is invalid or evaluation fails
+ */
+function selectHtml(doc, selector, selectorType, contextNode) {
+    if (!selector) {
+        throw new InvalidSelectorError(selector, "selector is required");
+    }
+    const useXPath = isXPathSelector(selector, selectorType);
+    try {
+        if (useXPath) {
+            return evaluateXPath(doc, selector, contextNode);
+        }
+        const context = contextNode || doc;
+        if (contextNode) {
+            return Array.from(contextNode.querySelectorAll(selector));
+        }
+        return Array.from(doc.querySelectorAll(selector));
+    } catch (error) {
+        if (error instanceof InvalidSelectorError) {
+            throw error;
+        }
+        throw new InvalidSelectorError(selector, error.message || "selector evaluation failed");
+    }
+}
+/**
+ * Extract HTML elements using CSS or XPath selector
+ * @param {ExtractionContext} ctx - Extraction context
+ * @param {HtmlExtractionConfig} cfg - HTML extraction configuration
+ * @returns {HtmlExtractionResult} - Extraction result with nodes and count
+ * @throws {InvalidSelectorError} If selector is invalid
+ * @throws {ExtractionError} If document resolution fails
+ */
+function extractHtml(ctx, cfg) {
+    if (!cfg || cfg.kind !== "html") {
+        throw new ExtractionError("Invalid HtmlExtractionConfig: kind must be 'html'", { config: cfg });
+    }
+    if (!cfg.selector) {
+        throw new InvalidSelectorError(cfg.selector, "selector is required");
+    }
+    const doc = resolveDocument(ctx);
+    const nodes = selectHtml(doc, cfg.selector, cfg.selectorType);
+    return {
+        kind: "html",
+        nodes,
+        count: nodes.length
+    };
+}
+module.exports = {
+    selectHtml, // query an already-resolved Document (optionally relative to a context node)
+    extractHtml // config-driven entry point; resolves the Document from the context itself
+};

package/src/index.js ADDED Viewed

@@ -0,0 +1,55 @@
+const { resolveDocument } = require("./context");
+const { selectHtml, extractHtml } = require("./html");
+const { extractJson } = require("./json");
+const { InvalidSelectorError, JsonPathError, ExtractionError } = require("./errors");
+/**
+ * Type guard: Check if config is HtmlExtractionConfig
+ * @param {ExtractionConfig} cfg - Extraction configuration
+ * @returns {cfg is HtmlExtractionConfig} - True if HTML config
+ */
+function isHtmlConfig(cfg) {
+    return cfg && cfg.kind === "html";
+}
+/**
+ * Type guard: Check if config is JsonExtractionConfig
+ * @param {ExtractionConfig} cfg - Extraction configuration
+ * @returns {cfg is JsonExtractionConfig} - True if JSON config
+ */
+function isJsonConfig(cfg) {
+    return cfg && cfg.kind === "json";
+}
+/**
+ * Unified extraction entry point
+ * Automatically routes to HTML or JSON extraction based on config kind
+ * @param {ExtractionContext} ctx - Extraction context
+ * @param {ExtractionConfig} cfg - Extraction configuration
+ * @returns {ExtractionResult} - Extraction result
+ * @throws {ExtractionError} If config kind is unknown
+ */
+function extract(ctx, cfg) {
+    if (!ctx) {
+        throw new ExtractionError("ExtractionContext is required", {});
+    }
+    if (!cfg) {
+        throw new ExtractionError("ExtractionConfig is required", {});
+    }
+    if (isHtmlConfig(cfg)) return extractHtml(ctx, cfg);
+    if (isJsonConfig(cfg)) return extractJson(ctx, cfg);
+    throw new ExtractionError(`Unknown ExtractionConfig kind: ${cfg.kind || 'undefined'}`, { config: cfg });
+}
+module.exports = {
+    resolveDocument, // re-exported from ./context
+    selectHtml, // re-exported from ./html
+    extractHtml, // re-exported from ./html
+    extractJson, // re-exported from ./json
+    extract, // unified dispatcher defined in this file
+    InvalidSelectorError, // error classes re-exported from ./errors
+    JsonPathError,
+    ExtractionError
+};

package/src/json.js ADDED Viewed

@@ -0,0 +1,180 @@
+const { JSONPath } = require("jsonpath-plus");
+const { resolveDocument } = require("./context");
+const { JsonPathError, ExtractionError } = require("./errors");
+/**
+ * Normalize selector type string to "xpath" or "css"
+ * @param {string} [selectorType] - Selector type string
+ * @returns {"xpath" | "css"} - Normalized selector type
+ */
+function normalizeSelectorType(selectorType) {
+    if (!selectorType) return "css";
+    const v = String(selectorType).toLowerCase();
+    if (v === "xpath" || v === "xpathselector") return "xpath";
+    return "css";
+}
+/**
+ * Extract text content from script element using CSS or XPath selector
+ * @param {Document} doc - Document object
+ * @param {string} selector - CSS or XPath selector
+ * @param {string} [selectorType] - Selector type
+ * @returns {string} - Extracted text content
+ */
+function extractScriptText(doc, selector, selectorType) {
+    const type = normalizeSelectorType(selectorType);
+    if (type === "css") {
+        const el = doc.querySelector(selector);
+        return el ? el.textContent || "" : "";
+    }
+    // Use native document.evaluate for XPath (works with jsdom)
+    try {
+        const result = doc.evaluate(selector, doc, null, doc.XPathResult ? doc.XPathResult.FIRST_ORDERED_NODE_TYPE : 9, null);
+        const el = result.singleNodeValue;
+        if (!el || el.nodeType !== 1) return "";
+        return el.textContent || "";
+    } catch (error) {
+        return "";
+    }
+}
+/**
+ * Parse JSON payload from raw text with optional regex extraction
+ * @param {string} raw - Raw text content
+ * @param {string} [extractionRegex] - Regex pattern to extract JSON (e.g., "{.*}")
+ * @returns {unknown} - Parsed JSON object or null
+ * @throws {ExtractionError} If JSON parsing fails
+ */
+function parseJsonPayload(raw, extractionRegex) {
+    if (!raw) return null;
+    let jsonStr = raw;
+    if (extractionRegex) {
+        const re = new RegExp(extractionRegex, "s");
+        const m = raw.match(re);
+        if (!m) return null;
+        jsonStr = m[0];
+    }
+    try {
+        return JSON.parse(jsonStr);
+    } catch (error) {
+        throw new ExtractionError(`Failed to parse JSON payload: ${error.message}`, {
+            raw: raw.substring(0, 200),
+            extractionRegex
+        });
+    }
+}
+/**
+ * Evaluate JSONPath expression on JSON root
+ * @param {unknown} root - JSON root object
+ * @param {string} path - JSONPath expression
+ * @returns {unknown[]} - Array of matching values
+ * @throws {JsonPathError} If JSONPath evaluation fails
+ */
+function evalJsonPath(root, path) {
+    if (!path) {
+        throw new JsonPathError(path, "jsonPath is required");
+    }
+    try {
+        return JSONPath({ path, json: root });
+    } catch (error) {
+        throw new JsonPathError(path, error.message || "jsonPath evaluation failed");
+    }
+}
+/**
+ * Extract data from JSON using configuration
+ * @param {unknown} root - JSON root object
+ * @param {JsonExtractionConfig} cfg - JSON extraction configuration
+ * @returns {JsonExtractionResult} - Extraction result
+ */
+function extractFromJsonConfig(root, cfg) {
+    if (!root) return { kind: "json", notFound: true, data: {} };
+    if (cfg.condition) {
+        try {
+            const cond = evalJsonPath(root, cfg.condition);
+            if (!cond || cond.length === 0) return { kind: "json", notFound: true, data: {} };
+        } catch (error) {
+            return { kind: "json", notFound: true, data: {} };
+        }
+    }
+    if (cfg.notFoundSelector) {
+        try {
+            const errVal = evalJsonPath(root, cfg.notFoundSelector);
+            if (errVal && errVal.length > 0) return { kind: "json", notFound: true, data: {} };
+        } catch (error) {
+            return { kind: "json", notFound: true, data: {} };
+        }
+    }
+    const result = {};
+    const elements = Array.isArray(cfg.elements) ? cfg.elements : [];
+    for (const el of elements) {
+        if (!el.property || !el.jsonPath) continue;
+        const paths = el.jsonPath.split(";").map(p => p.trim()).filter(Boolean);
+        let collected = [];
+        for (const p of paths) {
+            try {
+                const vals = evalJsonPath(root, p);
+                if (vals && vals.length > 0) {
+                    collected = collected.concat(vals);
+                    if (!el.mergeAll && collected.length > 0) break;
+                }
+            } catch (error) {
+                continue;
+            }
+        }
+        if (!collected.length) continue;
+        if (el.mergeAll) {
+            result[el.property] = collected;
+        } else {
+            result[el.property] = collected[0];
+        }
+    }
+    return {
+        kind: "json",
+        notFound: false,
+        data: result
+    };
+}
+/**
+ * Extract JSON data from HTML using embedded script
+ * @param {ExtractionContext} ctx - Extraction context
+ * @param {JsonExtractionConfig} cfg - JSON extraction configuration
+ * @returns {JsonExtractionResult} - Extraction result
+ * @throws {ExtractionError} If JSON parsing fails
+ * @throws {JsonPathError} If JSONPath evaluation fails
+ */
+function extractJson(ctx, cfg) {
+    if (!cfg || cfg.kind !== "json") {
+        throw new ExtractionError("Invalid JsonExtractionConfig: kind must be 'json'", { config: cfg });
+    }
+    if (!cfg.selector) {
+        throw new ExtractionError("JsonExtractionConfig requires selector", { config: cfg });
+    }
+    if (!Array.isArray(cfg.elements) || cfg.elements.length === 0) {
+        throw new ExtractionError("JsonExtractionConfig requires elements array", { config: cfg });
+    }
+    const doc = resolveDocument(ctx);
+    const scriptText = extractScriptText(doc, cfg.selector, cfg.selectorType);
+    const root = parseJsonPayload(scriptText, cfg.extractionRegex);
+    if (!root) {
+        return { kind: "json", notFound: true, data: {} };
+    }
+    return extractFromJsonConfig(root, cfg);
+}
+module.exports = {
+    extractJson // only public symbol; the other functions in this file stay module-private
+};

package/src/types.js ADDED Viewed

@@ -0,0 +1,59 @@
+/**
+ * @typedef {"css" | "xpath"} SelectorType
+ */
+/**
+ * @typedef {Object} ExtractionContext
+ * @property {string} [html] - HTML content string
+ * @property {string} [baseUrl] - Base URL for resolving relative URLs
+ * @property {Document} [document] - Browser Document object (for browser context)
+ */
+/**
+ * @typedef {Object} HtmlExtractionConfig
+ * @property {"html"} kind - Extraction type identifier
+ * @property {string} selector - CSS or XPath selector
+ * @property {SelectorType} [selectorType] - Explicit selector type (auto-detected if not provided)
+ */
+/**
+ * @typedef {Object} JsonElementConfig
+ * @property {string} property - Property name in result object
+ * @property {string} jsonPath - JSONPath expression (can be semicolon-separated for multiple paths)
+ * @property {boolean} [mergeAll] - If true, merge all matching values into array
+ */
+/**
+ * @typedef {Object} JsonExtractionConfig
+ * @property {"json"} kind - Extraction type identifier
+ * @property {string} selector - CSS or XPath selector to find script/container element
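+ * @property {"xPath" | "css"} [selectorType] - Selector type for finding script element (compared case-insensitively: "xpath" or "xpathselector" selects XPath, any other value selects CSS)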
+ * @property {string} [extractionRegex] - Regex pattern to extract JSON from script text (e.g., "{.*}")
+ * @property {string} [condition] - JSONPath condition to check if extraction should proceed
+ * @property {string} [notFoundSelector] - JSONPath to detect error/not found state
+ * @property {JsonElementConfig[]} elements - Array of element extraction configurations
+ */
+/**
+ * @typedef {HtmlExtractionConfig | JsonExtractionConfig} ExtractionConfig
+ */
+/**
+ * @typedef {Object} HtmlExtractionResult
+ * @property {"html"} kind - Result type identifier
+ * @property {Element[]} nodes - Array of extracted DOM elements
+ * @property {number} count - Number of elements found
+ */
+/**
+ * @typedef {Object} JsonExtractionResult
+ * @property {"json"} kind - Result type identifier
+ * @property {boolean} notFound - True if extraction failed (condition not met, error detected, etc.)
+ * @property {Record<string, unknown>} data - Extracted data object with property names as keys
+ */
+/**
+ * @typedef {HtmlExtractionResult | JsonExtractionResult} ExtractionResult
+ */
+module.exports = {}; // typedef-only module: nothing is exported at runtime