crawler-extraction-engine 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,63 @@
1
+ # @properti-ag/extraction-engine
2
+
3
+ Unified HTML and JSON extraction engine with CSS, XPath, and JSONPath support.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install @properti-ag/extraction-engine
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ### HTML Extraction (CSS/XPath)
14
+
15
+ ```javascript
16
+ const { extractHtml } = require('@properti-ag/extraction-engine');
17
+
18
+ const result = extractHtml(
19
+ { html: '<div class="item">Content</div>' },
20
+ { kind: 'html', selector: 'div.item' }
21
+ );
22
+
23
+ console.log(result.nodes); // [<div class="item">Content</div>]
24
+ ```
25
+
26
+ ### JSON Extraction (JSONPath)
27
+
28
+ ```javascript
29
+ const { extractJson } = require('@properti-ag/extraction-engine');
30
+
31
+ const html = '<script>window.__DATA__ = {"title": "Hello"}</script>';
32
+
33
+ const result = extractJson(
34
+ { html },
35
+ {
36
+ kind: 'json',
37
+ selector: 'script',
38
+ extractionRegex: '{.*}',
39
+ elements: [
40
+ { property: 'title', jsonPath: '$.title' }
41
+ ]
42
+ }
43
+ );
44
+
45
+ console.log(result.data); // { title: "Hello" }
46
+ ```
47
+
48
+ ## Features
49
+
50
+ - ✅ **CSS Selectors**: `div.className`, `#id`, `[attribute]`
51
+ - ✅ **XPath Support**: `//div[@class='item']`, `.//a[@href]`
52
+ - ✅ **JSONPath Queries**: `$.data.items[*].title`
53
+ - ✅ **Auto-detection**: Automatically detects selector type
54
+ - ✅ **Context Nodes**: `selectHtml` accepts an optional context node for relative queries
55
+ - ✅ **Error Handling**: Custom error types for debugging
56
+
57
+ ## API
58
+
59
+ See [full documentation](https://github.com/properti-ag/extraction-engine)
60
+
61
+ ## License
62
+
63
+ MIT
package/package.json ADDED
@@ -0,0 +1,26 @@
1
+ {
2
+ "name": "crawler-extraction-engine",
3
+ "version": "1.0.0",
4
+ "description": "Unified HTML and JSON extraction engine with CSS, XPath, and JSONPath support",
5
+ "main": "src/index.js",
6
+ "scripts": {
7
+ "test": "echo \"Error: no test specified\" && exit 1"
8
+ },
9
+ "keywords": [
10
+ "extraction",
11
+ "scraping",
12
+ "xpath",
13
+ "jsonpath",
14
+ "html-parser",
15
+ "web-scraping"
16
+ ],
17
+ "author": "Properti AG",
18
+ "license": "MIT",
19
+ "dependencies": {
20
+ "jsdom": "^24.0.0",
21
+ "jsonpath-plus": "^7.2.0"
22
+ },
23
+ "engines": {
24
+ "node": ">=16.0.0"
25
+ }
26
+ }
package/src/context.js ADDED
@@ -0,0 +1,23 @@
1
+ const { JSDOM } = require("jsdom");
2
+ const { ExtractionError } = require("./errors");
3
+
4
/**
 * Pick the Document an extraction should run against.
 *
 * Resolution order: an explicit `ctx.document`, then a JSDOM document parsed
 * from `ctx.html`, then the ambient global `document` when one is defined.
 * @param {import("./types").ExtractionContext} ctx - Extraction context
 * @returns {Document} - Document to query
 * @throws {ExtractionError} If no document, html, or global document is available
 */
function resolveDocument(ctx) {
  if (ctx.document) {
    return ctx.document;
  }

  if (ctx.html) {
    // JSDOM is handed a URL for the parsed page; a placeholder is used
    // when the caller does not supply baseUrl.
    const url = ctx.baseUrl || "https://example.com";
    return new JSDOM(ctx.html, { url }).window.document;
  }

  if (typeof document !== "undefined") {
    return document;
  }

  throw new ExtractionError("ExtractionContext requires html or document", { context: ctx });
}
20
+
21
+ module.exports = {
22
+ resolveDocument
23
+ };
package/src/errors.js ADDED
@@ -0,0 +1,29 @@
1
/**
 * Error describing a selector that was rejected.
 *
 * The message has the form `Invalid selector: <selector> - <reason>`; both
 * parts are also exposed as properties so callers need not parse the text.
 */
class InvalidSelectorError extends Error {
  /**
   * @param {string} selector - Offending selector, stored on `selector`
   * @param {string} reason - Why it was rejected, stored on `reason`
   */
  constructor(selector, reason) {
    super(`Invalid selector: ${selector} - ${reason}`);
    this.selector = selector;
    this.reason = reason;
    this.name = 'InvalidSelectorError';
  }
}
8
+
9
/**
 * Error describing a JSONPath expression that was rejected.
 *
 * The message has the form `JSONPath error: <path> - <reason>`; both parts
 * are also exposed as properties so callers need not parse the text.
 */
class JsonPathError extends Error {
  /**
   * @param {string} path - Offending JSONPath expression, stored on `path`
   * @param {string} reason - Why it was rejected, stored on `reason`
   */
  constructor(path, reason) {
    super(`JSONPath error: ${path} - ${reason}`);
    this.path = path;
    this.reason = reason;
    this.name = 'JsonPathError';
  }
}
16
+
17
/**
 * General extraction failure carrying a free-form context payload.
 */
class ExtractionError extends Error {
  /**
   * @param {string} message - Human-readable description
   * @param {*} context - Diagnostic payload, stored on `context` as given
   */
  constructor(message, context) {
    super(message);
    Object.assign(this, { context, name: 'ExtractionError' });
  }
}
24
+
25
+ module.exports = {
26
+ InvalidSelectorError,
27
+ JsonPathError,
28
+ ExtractionError
29
+ };
package/src/html.js ADDED
@@ -0,0 +1,106 @@
1
const { resolveDocument } = require("./context");
const { InvalidSelectorError, ExtractionError } = require("./errors");
3
+
4
/**
 * Decide whether a selector should be evaluated as XPath rather than CSS.
 *
 * An explicit selectorType of "xpath"/"xpathselector" or "css"
 * (case-insensitive) always wins. Otherwise the selector text is inspected:
 * anything that starts like an XPath location path ("/", "//", "./", ".//",
 * "../", optionally wrapped in an opening parenthesis such as "(//a)[1]")
 * is treated as XPath. No valid CSS selector begins with those sequences.
 * @param {string} selector - Selector string
 * @param {string} [selectorType] - Explicit selector type
 * @returns {boolean} - True if XPath, false if CSS (also false for empty or non-string selectors)
 */
function isXPathSelector(selector, selectorType) {
  if (selectorType) {
    const normalized = String(selectorType).toLowerCase();
    if (normalized === "xpath" || normalized === "xpathselector") return true;
    if (normalized === "css") return false;
    // Unknown explicit types fall through to auto-detection.
  }

  if (typeof selector !== "string") return false;

  // Optional "(" grouping, optional "." or "..", then the first "/" step.
  const xpathStart = /^\(*\s*\.{0,2}\//;
  return xpathStart.test(selector.trim());
}
19
+
20
/**
 * Run an XPath expression through `doc.evaluate` and collect element matches.
 *
 * The result is requested as an ordered node snapshot; matches whose
 * nodeType is not 1 (element) are dropped.
 * @param {Document} doc - Document object providing `evaluate`
 * @param {string} xpath - XPath expression
 * @param {Node} [contextNode] - Node the expression is evaluated against (defaults to doc)
 * @returns {Element[]} - Matched elements in snapshot order
 * @throws {InvalidSelectorError} If evaluation throws for any reason
 */
function evaluateXPath(doc, xpath, contextNode) {
  const ELEMENT_NODE = 1;
  const searchRoot = contextNode || doc;

  try {
    // 7 is the numeric value used when doc exposes no XPathResult constants.
    const snapshotType = doc.XPathResult ? doc.XPathResult.ORDERED_NODE_SNAPSHOT_TYPE : 7;
    const snapshot = doc.evaluate(xpath, searchRoot, null, snapshotType, null);

    return Array.from(
      { length: snapshot.snapshotLength },
      (_, index) => snapshot.snapshotItem(index)
    ).filter((node) => node && node.nodeType === ELEMENT_NODE);
  } catch (error) {
    throw new InvalidSelectorError(xpath, error.message || "XPath evaluation failed");
  }
}
43
+
44
/**
 * Select HTML elements using a CSS or XPath selector.
 *
 * The selector kind comes from `isXPathSelector`; XPath goes through
 * `evaluateXPath`, everything else through `querySelectorAll` on the
 * context node (or on the document when no context node is given).
 * @param {Document} doc - Document object
 * @param {string} selector - CSS or XPath selector
 * @param {string} [selectorType] - Explicit selector type
 * @param {Node} [contextNode] - Context node for relative queries
 * @returns {Element[]} - Array of matched elements
 * @throws {InvalidSelectorError} If selector is missing, invalid, or evaluation fails
 */
function selectHtml(doc, selector, selectorType, contextNode) {
  if (!selector) {
    throw new InvalidSelectorError(selector, "selector is required");
  }

  try {
    // Detection runs inside the try so a malformed (e.g. non-string)
    // selector surfaces as InvalidSelectorError like every other failure.
    if (isXPathSelector(selector, selectorType)) {
      return evaluateXPath(doc, selector, contextNode);
    }
    const scope = contextNode || doc;
    return Array.from(scope.querySelectorAll(selector));
  } catch (error) {
    if (error instanceof InvalidSelectorError) {
      throw error;
    }
    throw new InvalidSelectorError(selector, error.message || "selector evaluation failed");
  }
}
76
+
77
/**
 * Extract HTML elements using a CSS or XPath selector.
 *
 * Validates the configuration, resolves the document from the context and
 * returns every matching element together with the match count.
 * @param {ExtractionContext} ctx - Extraction context
 * @param {HtmlExtractionConfig} cfg - HTML extraction configuration
 * @returns {HtmlExtractionResult} - Extraction result with nodes and count
 * @throws {InvalidSelectorError} If selector is missing or invalid
 * @throws {ExtractionError} If cfg is not an HTML config, ctx is missing, or document resolution fails
 */
function extractHtml(ctx, cfg) {
  if (!cfg || cfg.kind !== "html") {
    throw new ExtractionError("Invalid HtmlExtractionConfig: kind must be 'html'", { config: cfg });
  }
  if (!cfg.selector) {
    throw new InvalidSelectorError(cfg.selector, "selector is required");
  }
  // Direct callers bypass the unified entry point's context check, so a
  // missing context is rejected here instead of crashing on property access.
  if (!ctx) {
    throw new ExtractionError("ExtractionContext is required", {});
  }

  const doc = resolveDocument(ctx);
  const nodes = selectHtml(doc, cfg.selector, cfg.selectorType);

  return {
    kind: "html",
    nodes,
    count: nodes.length
  };
}
102
+
103
+ module.exports = {
104
+ selectHtml,
105
+ extractHtml
106
+ };
package/src/index.js ADDED
@@ -0,0 +1,55 @@
1
+ const { resolveDocument } = require("./context");
2
+ const { selectHtml, extractHtml } = require("./html");
3
+ const { extractJson } = require("./json");
4
+ const { InvalidSelectorError, JsonPathError, ExtractionError } = require("./errors");
5
+
6
/**
 * Type guard: check whether a config selects HTML extraction.
 *
 * Always returns a real boolean, including for null/undefined input.
 * @param {ExtractionConfig} cfg - Extraction configuration
 * @returns {cfg is HtmlExtractionConfig} - True only when cfg.kind is "html"
 */
function isHtmlConfig(cfg) {
  return Boolean(cfg) && cfg.kind === "html";
}
14
+
15
/**
 * Type guard: check whether a config selects JSON extraction.
 *
 * Always returns a real boolean, including for null/undefined input.
 * @param {ExtractionConfig} cfg - Extraction configuration
 * @returns {cfg is JsonExtractionConfig} - True only when cfg.kind is "json"
 */
function isJsonConfig(cfg) {
  return Boolean(cfg) && cfg.kind === "json";
}
23
+
24
/**
 * Unified extraction entry point.
 *
 * Checks that both arguments are present, then dispatches to the HTML or
 * JSON extractor according to the config's kind.
 * @param {ExtractionContext} ctx - Extraction context
 * @param {ExtractionConfig} cfg - Extraction configuration
 * @returns {ExtractionResult} - Result produced by the selected extractor
 * @throws {ExtractionError} If ctx or cfg is missing, or the config kind is unknown
 */
function extract(ctx, cfg) {
  if (!ctx) {
    throw new ExtractionError("ExtractionContext is required", {});
  }
  if (!cfg) {
    throw new ExtractionError("ExtractionConfig is required", {});
  }

  let extractor = null;
  if (isHtmlConfig(cfg)) {
    extractor = extractHtml;
  } else if (isJsonConfig(cfg)) {
    extractor = extractJson;
  }

  if (extractor === null) {
    throw new ExtractionError(`Unknown ExtractionConfig kind: ${cfg.kind || 'undefined'}`, { config: cfg });
  }
  return extractor(ctx, cfg);
}
45
+
46
+ module.exports = {
47
+ resolveDocument,
48
+ selectHtml,
49
+ extractHtml,
50
+ extractJson,
51
+ extract,
52
+ InvalidSelectorError,
53
+ JsonPathError,
54
+ ExtractionError
55
+ };
package/src/json.js ADDED
@@ -0,0 +1,180 @@
1
+ const { JSONPath } = require("jsonpath-plus");
2
+ const { resolveDocument } = require("./context");
3
+ const { JsonPathError, ExtractionError } = require("./errors");
4
+
5
/**
 * Collapse a selector type string to either "xpath" or "css".
 *
 * Matching is case-insensitive; "xpath" and "xpathselector" map to "xpath",
 * while a missing value or anything else maps to "css".
 * @param {string} [selectorType] - Selector type string
 * @returns {"xpath" | "css"} - Normalized selector type
 */
function normalizeSelectorType(selectorType) {
  const xpathAliases = ["xpath", "xpathselector"];
  if (!selectorType) {
    return "css";
  }
  const lowered = String(selectorType).toLowerCase();
  return xpathAliases.includes(lowered) ? "xpath" : "css";
}
16
+
17
/**
 * Read the text content of the first element matched by a selector.
 *
 * CSS selectors go through `querySelector`; XPath selectors go through
 * `doc.evaluate` requesting the first ordered node. Any XPath failure is
 * treated as "nothing found" and yields an empty string.
 * @param {Document} doc - Document object
 * @param {string} selector - CSS or XPath selector
 * @param {string} [selectorType] - Selector type (anything not recognised as XPath is handled as CSS)
 * @returns {string} - Text content, or "" when nothing usable matched
 */
function extractScriptText(doc, selector, selectorType) {
  const ELEMENT_NODE = 1;
  const useCss = normalizeSelectorType(selectorType) === "css";

  if (useCss) {
    const match = doc.querySelector(selector);
    return (match && match.textContent) || "";
  }

  try {
    // 9 is the numeric value used when doc exposes no XPathResult constants.
    const firstNodeType = doc.XPathResult ? doc.XPathResult.FIRST_ORDERED_NODE_TYPE : 9;
    const node = doc.evaluate(selector, doc, null, firstNodeType, null).singleNodeValue;
    const isElement = Boolean(node) && node.nodeType === ELEMENT_NODE;
    return isElement ? node.textContent || "" : "";
  } catch (error) {
    // Best-effort lookup: an unusable XPath simply produces no text.
    return "";
  }
}
40
+
41
/**
 * Parse a JSON payload out of raw text, optionally narrowing it with a regex.
 *
 * When extractionRegex is given, the first match of that pattern is parsed
 * instead of the whole text. The pattern is compiled with the "s" flag so
 * "." also matches newlines.
 * @param {string} raw - Raw text content
 * @param {string} [extractionRegex] - Regex pattern to extract JSON (e.g., "{.*}")
 * @returns {unknown} - Parsed JSON value, or null when raw is empty or the pattern does not match
 * @throws {ExtractionError} If extractionRegex is not a valid pattern or JSON parsing fails
 */
function parseJsonPayload(raw, extractionRegex) {
  if (!raw) return null;

  let jsonStr = raw;
  if (extractionRegex) {
    let pattern;
    try {
      pattern = new RegExp(extractionRegex, "s");
    } catch (error) {
      // A malformed pattern is reported through the module's own error
      // type rather than escaping as a raw SyntaxError.
      throw new ExtractionError(`Invalid extractionRegex: ${error.message}`, { extractionRegex });
    }
    const match = raw.match(pattern);
    if (!match) return null;
    jsonStr = match[0];
  }

  try {
    return JSON.parse(jsonStr);
  } catch (error) {
    throw new ExtractionError(`Failed to parse JSON payload: ${error.message}`, {
      // Only a prefix of the input is kept so the error context stays small.
      raw: raw.substring(0, 200),
      extractionRegex
    });
  }
}
66
+
67
/**
 * Evaluate a JSONPath expression against a JSON root via jsonpath-plus.
 *
 * NOTE(review): depending on the jsonpath-plus version and options, filter
 * expressions inside a path may be evaluated as script; confirm that `path`
 * values only ever come from trusted configuration.
 * @param {unknown} root - JSON root object
 * @param {string} path - JSONPath expression
 * @returns {unknown[]} - Whatever `JSONPath` returns for the expression (array of matches)
 * @throws {JsonPathError} If path is empty or evaluation throws
 */
function evalJsonPath(root, path) {
  if (path) {
    try {
      return JSONPath({ path, json: root });
    } catch (error) {
      throw new JsonPathError(path, error.message || "jsonPath evaluation failed");
    }
  }
  throw new JsonPathError(path, "jsonPath is required");
}
84
+
85
/**
 * Build the extraction result for an already-parsed JSON root.
 *
 * Steps, in order:
 *  1. A falsy root is reported as not found.
 *  2. `cfg.condition`, when set, must yield at least one match; a failing
 *     evaluation counts as not found.
 *  3. `cfg.notFoundSelector`, when set, must yield no match; a failing
 *     evaluation counts as not found.
 *  4. Each element's `jsonPath` is split on ";" and the candidate paths are
 *     tried left to right. Without `mergeAll` the first value of the first
 *     matching path is stored; with `mergeAll` the values of every matching
 *     path are concatenated. Elements with no match are left out.
 * @param {unknown} root - JSON root object
 * @param {JsonExtractionConfig} cfg - JSON extraction configuration
 * @returns {JsonExtractionResult} - Extraction result
 */
function extractFromJsonConfig(root, cfg) {
  const notFound = () => ({ kind: "json", notFound: true, data: {} });

  if (!root) {
    return notFound();
  }

  if (cfg.condition) {
    let matches;
    try {
      matches = evalJsonPath(root, cfg.condition);
    } catch (error) {
      return notFound();
    }
    if (!matches || matches.length === 0) {
      return notFound();
    }
  }

  if (cfg.notFoundSelector) {
    let markers;
    try {
      markers = evalJsonPath(root, cfg.notFoundSelector);
    } catch (error) {
      return notFound();
    }
    if (markers && markers.length > 0) {
      return notFound();
    }
  }

  const data = {};
  const specs = Array.isArray(cfg.elements) ? cfg.elements : [];

  specs.forEach((spec) => {
    if (!spec.property || !spec.jsonPath) {
      return;
    }

    const candidates = spec.jsonPath
      .split(";")
      .map((candidate) => candidate.trim())
      .filter(Boolean);

    let hits = [];
    for (const candidate of candidates) {
      let values;
      try {
        values = evalJsonPath(root, candidate);
      } catch (error) {
        // A broken alternative must not prevent later alternatives from running.
        continue;
      }
      if (!values || values.length === 0) {
        continue;
      }
      hits = hits.concat(values);
      if (!spec.mergeAll) {
        break;
      }
    }

    if (hits.length === 0) {
      return;
    }
    data[spec.property] = spec.mergeAll ? hits : hits[0];
  });

  return {
    kind: "json",
    notFound: false,
    data
  };
}
147
+
148
/**
 * Extract JSON data embedded in an HTML element (typically a script tag).
 *
 * Validates the configuration, resolves the document, reads the text of the
 * element matched by `cfg.selector`, parses it (optionally narrowed by
 * `cfg.extractionRegex`) and maps the configured elements onto the result.
 * @param {ExtractionContext} ctx - Extraction context
 * @param {JsonExtractionConfig} cfg - JSON extraction configuration
 * @returns {JsonExtractionResult} - Extraction result; `notFound` is true when no payload could be read
 * @throws {ExtractionError} If cfg is invalid, ctx is missing, or JSON parsing fails
 */
function extractJson(ctx, cfg) {
  if (!cfg || cfg.kind !== "json") {
    throw new ExtractionError("Invalid JsonExtractionConfig: kind must be 'json'", { config: cfg });
  }
  if (!cfg.selector) {
    throw new ExtractionError("JsonExtractionConfig requires selector", { config: cfg });
  }
  if (!Array.isArray(cfg.elements) || cfg.elements.length === 0) {
    throw new ExtractionError("JsonExtractionConfig requires elements array", { config: cfg });
  }
  // Direct callers bypass the unified entry point's context check, so a
  // missing context is rejected here instead of crashing on property access.
  if (!ctx) {
    throw new ExtractionError("ExtractionContext is required", {});
  }

  const doc = resolveDocument(ctx);
  const scriptText = extractScriptText(doc, cfg.selector, cfg.selectorType);
  const root = parseJsonPayload(scriptText, cfg.extractionRegex);

  if (!root) {
    return { kind: "json", notFound: true, data: {} };
  }

  return extractFromJsonConfig(root, cfg);
}
177
+
178
+ module.exports = {
179
+ extractJson
180
+ };
package/src/types.js ADDED
@@ -0,0 +1,59 @@
1
+ /**
2
+ * @typedef {"css" | "xpath"} SelectorType
3
+ */
4
+
5
+ /**
6
+ * @typedef {Object} ExtractionContext
7
+ * @property {string} [html] - HTML content string
8
+ * @property {string} [baseUrl] - Base URL for resolving relative URLs
9
+ * @property {Document} [document] - Browser Document object (for browser context)
10
+ */
11
+
12
+ /**
13
+ * @typedef {Object} HtmlExtractionConfig
14
+ * @property {"html"} kind - Extraction type identifier
15
+ * @property {string} selector - CSS or XPath selector
16
+ * @property {SelectorType} [selectorType] - Explicit selector type (auto-detected if not provided)
17
+ */
18
+
19
+ /**
20
+ * @typedef {Object} JsonElementConfig
21
+ * @property {string} property - Property name in result object
22
+ * @property {string} jsonPath - JSONPath expression (can be semicolon-separated for multiple paths)
23
+ * @property {boolean} [mergeAll] - If true, merge all matching values into array
24
+ */
25
+
26
+ /**
27
+ * @typedef {Object} JsonExtractionConfig
28
+ * @property {"json"} kind - Extraction type identifier
29
+ * @property {string} selector - CSS or XPath selector to find script/container element
30
+ * @property {SelectorType | "xPath"} [selectorType] - Selector type for finding script element (matched case-insensitively; treated as CSS when omitted)
31
+ * @property {string} [extractionRegex] - Regex pattern to extract JSON from script text (e.g., "{.*}")
32
+ * @property {string} [condition] - JSONPath condition to check if extraction should proceed
33
+ * @property {string} [notFoundSelector] - JSONPath to detect error/not found state
34
+ * @property {JsonElementConfig[]} elements - Array of element extraction configurations
35
+ */
36
+
37
+ /**
38
+ * @typedef {HtmlExtractionConfig | JsonExtractionConfig} ExtractionConfig
39
+ */
40
+
41
+ /**
42
+ * @typedef {Object} HtmlExtractionResult
43
+ * @property {"html"} kind - Result type identifier
44
+ * @property {Element[]} nodes - Array of extracted DOM elements
45
+ * @property {number} count - Number of elements found
46
+ */
47
+
48
+ /**
49
+ * @typedef {Object} JsonExtractionResult
50
+ * @property {"json"} kind - Result type identifier
51
+ * @property {boolean} notFound - True if extraction failed (condition not met, error detected, etc.)
52
+ * @property {Record<string, unknown>} data - Extracted data object with property names as keys
53
+ */
54
+
55
+ /**
56
+ * @typedef {HtmlExtractionResult | JsonExtractionResult} ExtractionResult
57
+ */
58
+
59
+ module.exports = {};