html2md4llm 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,74 @@
1
+ # html2md4llm
2
+
3
+ Convert HTML to clean Markdown or JSON format, optimized for LLM processing.
4
+
5
+ ## Features
6
+
7
+ - Convert HTML to Markdown or JSON
8
+ - Intelligent content extraction (list/article modes)
9
+ - Automatic HTML cleaning (removes scripts, styles, iframes)
10
+ - Preserves metadata (title, description, keywords)
11
+ - Zero dependencies - uses only Node.js built-in modules
12
+
13
+ ## Installation
14
+
15
+ ### NPM Package
16
+
17
+ ```bash
18
+ npm install html2md4llm
19
+ ```
20
+
21
+ ### Standalone Script
22
+
23
+ Download `dist/html2md4llm.min.js` and include it directly:
24
+
25
+ ```html
26
+ <script src="html2md4llm.min.js"></script>
27
+ <script>
28
+ const result = html2md4llm('<h1>Hello</h1>');
29
+ </script>
30
+ ```
31
+
32
+ ### Dify Plugin
33
+
34
+ Install from the plugin marketplace or import from this repository's `plugin/` directory.
35
+
36
+ ## Usage
37
+
38
+ ```javascript
39
+ import { main } from 'html2md4llm';
40
+
41
+ // Basic conversion to Markdown
42
+ const markdown = main('<h1>Hello</h1><p>World</p>');
43
+
44
+ // Convert to JSON
45
+ const json = main('<h1>Hello</h1>', { outputFormat: 'json' });
46
+
47
+ // Extract largest list
48
+ const list = main(html, { strategy: 'list' });
49
+
50
+ // Extract article content
51
+ const article = main(html, { strategy: 'article' });
52
+ ```
53
+
54
+ ## API
55
+
56
+ ### `main(htmlInput, options)`
57
+
58
+ **Parameters:**
59
+ - `htmlInput` (string): HTML text to convert
60
+ - `options` (object, optional):
61
+ - `outputFormat` (string): `'markdown'` (default) or `'json'`
62
+ - `strategy` (string): `'list'`, `'article'`, or undefined
63
+ - `removeAttributes` (string[]): Additional attribute names to strip during parsing; a pattern ending in `-*` (e.g. `aria-*`) matches by prefix
64
+
65
+ **Returns:** String (Markdown or JSON)
66
+
67
+ ## Strategies
68
+
69
+ - **list**: Extracts the largest `<ul>` or `<ol>` element
70
+ - **article**: Filters out empty containers, keeping main content
71
+
72
+ ## License
73
+
74
+ MIT
package/package.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "name": "html2md4llm",
3
+ "version": "1.0.0",
4
+ "description": "Convert HTML to clean Markdown or JSON format, optimized for LLM processing",
5
+ "type": "module",
6
+ "main": "src/main.js",
7
+ "exports": {
8
+ ".": "./src/main.js"
9
+ },
10
+ "scripts": {
11
+ "test": "node tests/run-tests.js",
12
+ "build": "node build.js"
13
+ },
14
+ "keywords": [
15
+ "html",
16
+ "markdown",
17
+ "json",
18
+ "converter",
19
+ "llm",
20
+ "parser",
21
+ "html-to-markdown"
22
+ ],
23
+ "author": "kaiye",
24
+ "license": "MIT",
25
+ "engines": {
26
+ "node": ">=18.0.0"
27
+ },
28
+ "repository": {
29
+ "type": "git",
30
+ "url": "https://github.com/kaiye/html2md4llm.git"
31
+ },
32
+ "files": [
33
+ "src/**/*",
34
+ "dist/**/*",
35
+ "plugin/**/*",
36
+ "README.md",
37
+ "LICENSE"
38
+ ],
39
+ "devDependencies": {
40
+ "esbuild": "^0.27.0"
41
+ }
42
+ }
package/plugin/main.js ADDED
@@ -0,0 +1,41 @@
1
+ import { main } from '../src/main.js';
2
+
3
/**
 * Tool wrapper around `main` that takes its inputs as a single parameters
 * object and offers a non-throwing validation step.
 */
export default class Html2Md4LlmTool {
  /**
   * Convert `parameters.html` and report which format was produced.
   *
   * @param {{ html: string, outputFormat?: string, strategy?: string }} parameters
   * @returns {Promise<{ result: string, format: string }>}
   * @throws {Error} When `html` is missing or empty.
   */
  async invoke(parameters) {
    const { html, outputFormat = 'markdown', strategy } = parameters;

    if (!html) {
      throw new Error('html parameter is required');
    }

    // Only forward a strategy when one was actually supplied.
    const options = strategy ? { outputFormat, strategy } : { outputFormat };

    return {
      result: main(html, options),
      format: outputFormat
    };
  }

  /**
   * Check the parameters without converting anything.
   *
   * @param {{ html?: unknown, outputFormat?: string, strategy?: string }} parameters
   * @returns {Promise<{ valid: boolean, error?: string }>}
   */
  async validate(parameters) {
    const { html, outputFormat, strategy } = parameters;
    const reject = (error) => ({ valid: false, error });

    if (!html || typeof html !== 'string') {
      return reject('html must be a non-empty string');
    }

    const formatIsKnown = outputFormat === 'markdown' || outputFormat === 'json';
    if (outputFormat && !formatIsKnown) {
      return reject('outputFormat must be "markdown" or "json"');
    }

    const strategyIsKnown = strategy === 'list' || strategy === 'article';
    if (strategy && !strategyIsKnown) {
      return reject('strategy must be "list" or "article"');
    }

    return { valid: true };
  }
}
@@ -0,0 +1,16 @@
1
+ {
2
+ "version": "0.0.1",
3
+ "type": "tool",
4
+ "author": "kaiye",
5
+ "name": "html2md4llm",
6
+ "label": {
7
+ "en_US": "HTML to Markdown/JSON",
8
+ "zh_Hans": "HTML 转 Markdown/JSON"
9
+ },
10
+ "description": {
11
+ "en_US": "Convert HTML to clean Markdown or JSON format, optimized for LLM processing",
12
+ "zh_Hans": "将 HTML 转换为干净的 Markdown 或 JSON 格式,针对 LLM 处理优化"
13
+ },
14
+ "icon": "icon.svg",
15
+ "tags": ["html", "markdown", "json", "converter", "parser"]
16
+ }
@@ -0,0 +1,132 @@
1
// Tags that isInline() counts as inline content.
const inlineElements = [
  'span',
  'a',
  'strong',
  'em',
  'code',
  'b',
  'i',
];

// Tags that isBlock() counts as block-level content.
// 'br' is listed here, but isBlock() rejects it explicitly before consulting this list.
const blockElements = [
  'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  'p',
  'ul', 'ol',
  'pre',
  'br',
  'div', 'section',
];
3
+
4
/**
 * Decide whether a node should be treated as inline content.
 *
 * - `<br>` elements are never inline.
 * - Other elements are inline when their tag is in `inlineElements`.
 * - Text nodes are inline only when they carry `flattenedTags` and every
 *   one of those tags is an inline tag.
 *
 * @param {{ type: string, tag?: string, flattenedTags?: string[] }} node
 * @returns {boolean}
 */
function isInline(node) {
  const { type, tag, flattenedTags } = node;

  if (type === 'element') {
    return tag !== 'br' && inlineElements.includes(tag);
  }

  if (type === 'text' && flattenedTags) {
    return flattenedTags.every((flattened) => inlineElements.includes(flattened));
  }

  return false;
}
12
+
13
/**
 * Decide whether a node should be treated as block-level content.
 *
 * - `<br>` elements are never block-level.
 * - A node with a non-empty `flattenedTags` list is judged by that list
 *   alone: it is a block when any of those tags is in `blockElements`.
 * - Otherwise an element is a block when its own tag is in `blockElements`.
 *
 * @param {{ type: string, tag?: string, flattenedTags?: string[] }} node
 * @returns {boolean}
 */
function isBlock(node) {
  const isElement = node.type === 'element';
  if (isElement && node.tag === 'br') {
    return false;
  }

  const flattened = node.flattenedTags;
  if (flattened && flattened.length > 0) {
    return flattened.some((tag) => blockElements.includes(tag));
  }

  return isElement ? blockElements.includes(node.tag) : false;
}
23
+
24
/**
 * Gather the trimmed, non-empty text fragments below `node` in document order.
 *
 * @param {object} node - Virtual-DOM node.
 * @param {string[]} texts - Accumulator the fragments are appended to.
 * @returns {string[]} The same accumulator.
 */
function collectLinkTexts(node, texts) {
  if (node.type === 'text') {
    const trimmed = node.text.trim();
    if (trimmed) texts.push(trimmed);
  } else if (node.children) {
    for (const child of node.children) collectLinkTexts(child, texts);
  }
  return texts;
}

/**
 * Render the children of one `<li>`, putting a nested list on its own line
 * when it follows other item content.
 *
 * @param {object} item - The `<li>` element.
 * @param {number} indent - Indentation (in spaces) for nested lists.
 * @returns {string}
 */
function renderListItem(item, indent) {
  let content = '';
  for (const child of item.children || []) {
    const text = generate(child, indent);
    if (!text) continue;
    const isNestedList = child.type === 'element' && (child.tag === 'ul' || child.tag === 'ol');
    // Without this break the nested list's first marker would be glued to the item text.
    if (isNestedList && content) content += '\n';
    content += text;
  }
  return content;
}

/**
 * Render the `<li>` children of a list; non-`<li>` children are skipped.
 *
 * @param {object[]} children - Children of the `<ul>`/`<ol>` element.
 * @param {boolean} ordered - True for `<ol>` (numbered markers).
 * @param {number} indent - Indentation (in spaces) of this list's markers.
 * @returns {string}
 */
function renderList(children, ordered, indent) {
  const prefix = ' '.repeat(indent);
  let itemNumber = 0;
  let output = '';
  for (const child of children) {
    if (child.type !== 'element' || child.tag !== 'li') continue;
    // Number by <li> position, not raw child index, so stray nodes leave no gaps.
    itemNumber += 1;
    const marker = ordered ? `${itemNumber}.` : '-';
    output += `${prefix}${marker} ${renderListItem(child, indent + 2)}\n`;
  }
  return output.trimEnd();
}

/**
 * Render a virtual-DOM node and its subtree as Markdown.
 *
 * Text nodes yield their text; non-element nodes yield ''. Elements with
 * exactly one child and no dedicated rendering rule are transparent. Sibling
 * output is joined with ' ' (inline + inline), a blank line (block + block),
 * a single newline (block next to anything else) or nothing.
 *
 * Each child is rendered exactly once. The previous implementation rendered
 * every child a second time as look-ahead for the separator decision, which
 * doubled the work at every nesting level.
 *
 * @param {object} node - Node with `type` and, for elements, `tag`,
 *   `attributes` and `children`.
 * @param {number} [indent=0] - Current list indentation in spaces.
 * @returns {string} Markdown text.
 */
export function generate(node, indent = 0) {
  if (node.type === 'text') {
    return node.text;
  }
  if (node.type !== 'element') {
    return '';
  }

  const tag = node.tag;
  const children = node.children || [];
  const attributes = node.attributes || {};

  // If only one child and no special handling for this tag, pass through transparently
  const hasSpecialHandling = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'strong', 'b', 'em', 'i', 'code', 'a'].includes(tag);
  if (children.length === 1 && !hasSpecialHandling) {
    return generate(children[0], indent);
  }

  // Tags whose output does not use the joined child text are handled first,
  // so their children are not rendered needlessly.
  if (tag === 'br') return '\n';
  if (tag === 'img') return `![${attributes.alt || ''}](${attributes.src || ''})`;
  if (tag === 'a') {
    // Link label: all descendant text fragments, trimmed and comma-separated.
    const texts = [];
    for (const child of children) collectLinkTexts(child, texts);
    return `[${texts.join(', ')}](${attributes.href || ''})`;
  }
  if (tag === 'ul' || tag === 'ol') {
    return renderList(children, tag === 'ol', indent);
  }

  // Render every child once, then decide separators between adjacent
  // children that both produced output.
  const rendered = children.map((child) => generate(child, indent));
  const parts = [];
  for (let i = 0; i < rendered.length; i++) {
    if (!rendered[i]) continue;
    parts.push(rendered[i]);
    if (!rendered[i + 1]) continue;

    const child = children[i];
    const nextChild = children[i + 1];
    if (isInline(child) && isInline(nextChild)) {
      // inline + inline → space
      parts.push(' ');
      continue;
    }
    const childIsBlock = isBlock(child);
    const nextIsBlock = isBlock(nextChild);
    if (childIsBlock && nextIsBlock) {
      // block + block → double newline
      parts.push('\n\n');
    } else if (childIsBlock || nextIsBlock) {
      // block + anything or anything + block → single newline
      parts.push('\n');
    }
    // container + container or inline + container → no separator (transparent)
  }
  const childText = parts.join('');

  // Headings: h1..h6 map to 1..6 leading '#'.
  const heading = /^h([1-6])$/.exec(tag);
  if (heading) return `${'#'.repeat(Number(heading[1]))} ${childText}`;

  // Inline formatting
  if (tag === 'strong' || tag === 'b') return `**${childText}**`;
  if (tag === 'em' || tag === 'i') return `*${childText}*`;
  if (tag === 'code') return `\`${childText}\``;

  // Code block
  if (tag === 'pre') return `\`\`\`\n${childText}\n\`\`\``;

  // Paragraphs and every other container: just the children.
  return childText;
}
package/src/main.js ADDED
@@ -0,0 +1,82 @@
1
+ import { parse } from './parser.js';
2
+ import { generate as generateMarkdown } from './generators/markdown.js';
3
+
4
/**
 * Convert an HTML string to Markdown or to a JSON dump of the parsed tree.
 *
 * @param {string} htmlInput - HTML text to convert.
 * @param {object} [options] - Conversion options; `null` is treated like `{}`.
 * @param {string} [options.outputFormat='markdown'] - `'markdown'` or `'json'`.
 * @param {string} [options.strategy] - `'list'` keeps only the largest list,
 *   `'article'` drops children without text; omit for the full tree.
 * @param {*} [options.removeAttributes] - Passed through unchanged to `parse`
 *   as its second argument.
 * @returns {string} Markdown text, or pretty-printed JSON of the tree.
 * @throws {TypeError} When `htmlInput` is not a string.
 * @throws {Error} When `outputFormat` or `strategy` has an unsupported value.
 */
export function main(htmlInput, options = {}) {
  // Validate input
  if (typeof htmlInput !== 'string') {
    throw new TypeError('htmlInput must be a string');
  }

  // The parameter default only covers `undefined`; an explicit `null` would
  // otherwise crash on the first property access below.
  const { outputFormat: requestedFormat, strategy, removeAttributes } = options ?? {};

  // Validate options
  const outputFormat = requestedFormat || 'markdown';
  if (outputFormat !== 'markdown' && outputFormat !== 'json') {
    throw new Error('options.outputFormat must be \'markdown\' or \'json\'');
  }
  if (strategy && strategy !== 'list' && strategy !== 'article') {
    throw new Error('options.strategy must be \'list\', \'article\', or undefined');
  }

  // Parse HTML to virtual DOM
  let tree = parse(htmlInput, removeAttributes);

  // Apply extraction strategy
  if (strategy === 'list') {
    tree = extractLargestList(tree);
  } else if (strategy === 'article') {
    tree = extractArticle(tree);
  }

  // Generate output
  if (outputFormat === 'markdown') {
    return generateMarkdown(tree);
  }

  // JSON output - drop parent back-references, which would make the structure circular
  return JSON.stringify(tree, (key, value) => (key === 'parent' ? undefined : value), 2);
}
39
+
40
/**
 * Find the `<ul>`/`<ol>` with the most direct `<li>` children.
 *
 * Nodes are visited in document (pre-order) sequence and a list only replaces
 * the current best when it has strictly more items, so the earliest list wins
 * a tie. Lists with no `<li>` children are never selected.
 *
 * @param {object} node - Root of the tree to search.
 * @returns {object} The winning list element, or `node` itself when none qualifies.
 */
function extractLargestList(node) {
  let winner = null;
  let winnerItems = 0;

  // Explicit stack; children are pushed in reverse so they pop in document order.
  const pending = [node];
  while (pending.length > 0) {
    const current = pending.pop();
    const isList = current.type === 'element' && (current.tag === 'ul' || current.tag === 'ol');

    if (isList) {
      let items = 0;
      for (const child of current.children) {
        if (child.type === 'element' && child.tag === 'li') items += 1;
      }
      if (items > winnerItems) {
        winnerItems = items;
        winner = current;
      }
    }

    if (current.children) {
      for (let i = current.children.length - 1; i >= 0; i--) {
        pending.push(current.children[i]);
      }
    }
  }

  return winner || node;
}
60
+
61
/**
 * Return a pruned copy of the tree in which element children are kept only
 * when they contain non-whitespace text somewhere below them. Text children
 * are always kept, including whitespace-only ones. Elements are shallow-copied
 * with a fresh `children` array; the input tree is not modified.
 *
 * @param {object} node - Root of the tree to prune.
 * @returns {object} The pruned copy (or `node` itself when it is not an element).
 */
function extractArticle(node) {
  // True when the node is, or contains, a text node with non-whitespace content.
  const containsText = (candidate) => {
    if (candidate.type === 'text' && candidate.text.trim()) return true;
    return candidate.children ? candidate.children.some(containsText) : false;
  };

  const prune = (current) => {
    if (current.type !== 'element') return current;

    const kept = (current.children || [])
      .filter((child) => child.type === 'text' || containsText(child))
      .map((child) => prune(child));

    return { ...current, children: kept };
  };

  return prune(node);
}
package/src/parser.js ADDED
@@ -0,0 +1,208 @@
1
+ import { createElement, createText, decodeEntities } from './utils.js';
2
+
3
/**
 * Remove Unicode bidirectional control characters from a string:
 * U+200E, U+200F and the range U+202A–U+202E.
 *
 * NOTE(review): nothing in the visible parser code calls this helper — confirm
 * whether it is still needed in this module.
 *
 * @param {string} text
 * @returns {string} The text without those characters.
 */
function cleanText(text) {
  const bidiControls = /[\u200E\u200F\u202A-\u202E]/g;
  const cleaned = text.replace(bidiControls, '');
  return cleaned;
}
6
+
7
/**
 * Parse an HTML string into a lightweight virtual DOM.
 *
 * Steps: strip DOCTYPE / script / style / iframe / svg / link / source /
 * input / comments, tokenize tags with a regex, build a tree of element and
 * text nodes, then post-process (collapse pre/code content to one text node,
 * drop hidden or empty nodes, flatten single-child containers).
 *
 * Changes over the previous implementation:
 * - `removeAttributes` that is not an array (e.g. `true` or `null`) no longer
 *   throws; a single string is treated as a one-element list.
 * - A closing tag closes the nearest open element with that name, together
 *   with anything left open inside it, instead of being ignored unless it
 *   matched the innermost open element.
 * - Single-quoted attribute values are recognised.
 * - `javascript:` links are detected case-insensitively.
 *
 * @param {string} html - HTML source text.
 * @param {string[]|string} [removeAttributes=[]] - Extra attribute names to
 *   drop; a pattern ending in `-*` matches by prefix.
 * @returns {object} The single top-level node, or a synthetic `root` element
 *   when there are zero or several top-level nodes.
 */
export function parse(html, removeAttributes = []) {
  // Tags that take no closing tag, so they are never pushed on the open-element stack.
  const voidTags = ['br', 'hr', 'img', 'input', 'meta', 'link', 'source', 'area', 'base', 'col', 'embed', 'param', 'track', 'wbr'];
  // Childless tags that survive the empty-node cleanup.
  const keptChildlessTags = ['br', 'hr', 'img'];
  const flattenableTags = ['div', 'span', 'section', 'p'];

  // Default blacklist (style temporarily preserved for display:none filtering)
  const defaultBlacklist = ['loading', 'decoding', 'fetchpriority'];
  let extraPatterns = [];
  if (Array.isArray(removeAttributes)) {
    extraPatterns = removeAttributes.filter((pattern) => typeof pattern === 'string');
  } else if (typeof removeAttributes === 'string') {
    extraPatterns = [removeAttributes];
  }
  const blacklist = [...defaultBlacklist, ...extraPatterns];

  function shouldRemove(attrName) {
    if (attrName === 'style') return false; // Preserve for filtering
    if (attrName.startsWith('data-')) return true;
    return blacklist.some((pattern) => {
      if (pattern.endsWith('-*')) {
        // 'aria-*' → prefix 'aria-'
        return attrName.startsWith(pattern.slice(0, -1));
      }
      return attrName === pattern;
    });
  }

  // Pre-clean: remove DOCTYPE, script, style, iframe, svg, link, source, input, comments
  const markup = html
    .replace(/<!DOCTYPE[^>]*>/gi, '')
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<iframe[^>]*>[\s\S]*?<\/iframe>/gi, '')
    .replace(/<svg[^>]*>[\s\S]*?<\/svg>/gi, '')
    .replace(/<link[^>]*>/gi, '')
    .replace(/<source[^>]*>/gi, '')
    .replace(/<input[^>]*>/gi, '')
    .replace(/<!--[\s\S]*?-->/g, '');

  const root = createElement('root', {}, []);
  // Open elements, innermost last; index 0 is always the synthetic root.
  const stack = [root];

  // Attach trimmed, entity-decoded text to the innermost open element.
  function appendText(raw) {
    const text = raw.trim();
    if (!text) return;
    const textNode = createText(decodeEntities(text));
    const parent = stack[stack.length - 1];
    textNode.parent = parent;
    parent.children.push(textNode);
  }

  const tagRegex = /<\/?([a-z][a-z0-9]*)[^>]*>/gi;
  let lastIndex = 0;
  let match;

  while ((match = tagRegex.exec(markup)) !== null) {
    // Add text before tag
    if (match.index > lastIndex) {
      appendText(markup.slice(lastIndex, match.index));
    }

    const fullTag = match[0];
    const tagName = match[1].toLowerCase();

    if (fullTag.startsWith('</')) {
      // Closing tag: unwind to the nearest open element with this name so an
      // unclosed inner tag cannot swallow the rest of the document. Index 0
      // (the synthetic root) is never closed; unmatched closers are ignored.
      let openIndex = stack.length - 1;
      while (openIndex > 0 && stack[openIndex].tag !== tagName) {
        openIndex -= 1;
      }
      if (openIndex > 0) {
        stack.length = openIndex;
      }
    } else {
      // Opening tag: collect double- or single-quoted attributes.
      const attrs = {};
      const attrRegex = /([a-z][a-z0-9-]*)\s*=\s*(?:"([^"]*)"|'([^']*)')/gi;
      let attrMatch;
      let dataSrc = null;
      while ((attrMatch = attrRegex.exec(fullTag)) !== null) {
        const attrName = attrMatch[1];
        const attrValue = attrMatch[2] ?? attrMatch[3];
        // Remember data-src before data-* attributes are discarded.
        if (attrName === 'data-src') dataSrc = attrValue;
        if (shouldRemove(attrName)) continue;
        attrs[attrName] = attrValue;
      }

      // For img tags, use data-src as src if src is missing
      if (tagName === 'img' && !attrs.src && dataSrc) {
        attrs.src = dataSrc;
      }

      const node = createElement(tagName, attrs, []);
      const parent = stack[stack.length - 1];
      node.parent = parent;
      parent.children.push(node);

      // Self-closing or void elements never become the current parent.
      const isLeaf = fullTag.endsWith('/>') || voidTags.includes(tagName);
      if (!isLeaf) {
        stack.push(node);
      }
    }

    lastIndex = tagRegex.lastIndex;
  }

  // Add remaining text
  if (lastIndex < markup.length) {
    appendText(markup.slice(lastIndex));
  }

  // Post-processing: flatten pre/code, remove unwanted nodes, flatten containers

  // Replace the content of <pre>/<code> with one text node holding all descendant text.
  function flattenPreCode(node) {
    if (node.type === 'element' && (node.tag === 'pre' || node.tag === 'code')) {
      const texts = [];
      const collectText = (n) => {
        if (n.type === 'text') texts.push(n.text);
        else if (n.children) n.children.forEach(collectText);
      };
      node.children.forEach(collectText);
      node.children = [createText(texts.join(''))];
    } else if (node.children) {
      node.children.forEach(flattenPreCode);
    }
  }

  // Drop hidden, decorative, unusable and (after recursion) empty elements.
  function removeUnwantedNodes(node) {
    if (!node.children) return;

    node.children = node.children.filter((child) => {
      if (child.type === 'text') return true;
      if (child.type === 'element') {
        // Filter elements with display:none
        const style = child.attributes.style;
        if (style && /display\s*:\s*none/i.test(style)) return false;

        // Filter ARIA hidden elements
        if (child.attributes['aria-hidden'] === 'true') return false;
        if (child.attributes.tabindex === '-1') return false;
        if (child.attributes.hidden !== undefined) return false;
        const role = child.attributes.role;
        if (role === 'presentation' || role === 'none') return false;

        // Filter img without src
        if (child.tag === 'img' && !child.attributes.src) return false;

        // Filter a with javascript: href (scheme is case-insensitive and may
        // be preceded by whitespace)
        const href = child.attributes.href;
        if (child.tag === 'a' && href && /^\s*javascript:/i.test(href)) return false;

        // Keep description and keywords meta tags, remove others
        if (child.tag === 'meta') {
          const name = child.attributes.name;
          return name === 'description' || name === 'keywords';
        }
        // Keep void elements
        if (keptChildlessTags.includes(child.tag)) return true;
        // Recursively process children
        removeUnwantedNodes(child);
        // Remove style attribute after filtering
        delete child.attributes.style;
        // Remove empty nodes
        if (child.children && child.children.length === 0) return false;
      }
      return true;
    });
  }

  // Replace chains of single-child div/span/section/p with their innermost
  // node, recording the skipped tags and classes on that node.
  function flattenContainers(node) {
    if (!node.children) return node;

    node.children = node.children.map((child) => {
      // Collect flattened tags and classes
      const tags = [];
      const classes = [];
      let current = child;

      // Walk down single-child flattenable containers
      while (current.type === 'element' &&
             flattenableTags.includes(current.tag) &&
             current.children?.length === 1) {
        tags.push(current.tag);
        if (current.attributes.class) {
          classes.push(current.attributes.class);
        }
        current = current.children[0];
      }

      // If we collected any tags, attach them to the final node
      if (tags.length > 0) {
        current.flattenedTags = tags;
        current.flattenedClasses = classes;
      }

      // Recursively process the final node
      return flattenContainers(current);
    });

    return node;
  }

  flattenPreCode(root);
  removeUnwantedNodes(root);
  flattenContainers(root);

  return root.children.length === 1 ? root.children[0] : root;
}
package/src/utils.js ADDED
@@ -0,0 +1,35 @@
1
// Named (and one numeric) entity references that decodeEntity() resolves by
// direct lookup. Note that '&nbsp;' maps to a regular space.
const htmlEntities = {
  '&nbsp;': ' ',
  '&lt;': '<', '&gt;': '>',
  '&amp;': '&',
  '&quot;': '"',
  '&#39;': "'", '&apos;': "'"
};
10
+
11
/**
 * Decode a single HTML entity reference such as `&amp;`, `&#65;` or `&#x1F600;`.
 *
 * Numeric references are decoded with `String.fromCodePoint`, so characters
 * above U+FFFF (e.g. emoji) come out as the intended character rather than a
 * truncated UTF-16 unit. Malformed or out-of-range numeric references and
 * unknown named entities are returned unchanged.
 *
 * @param {string} entity - The full reference, including `&` and `;`.
 * @returns {string} The decoded character, or `entity` itself when it cannot be decoded.
 */
export function decodeEntity(entity) {
  if (entity.startsWith('&#')) {
    const numeric = /^&#(?:x([0-9a-f]+)|([0-9]+));$/i.exec(entity);
    if (!numeric) return entity; // e.g. "&#;" or "&#xZZ;"

    const codePoint = numeric[1] !== undefined
      ? Number.parseInt(numeric[1], 16)
      : Number.parseInt(numeric[2], 10);

    // String.fromCodePoint throws a RangeError above U+10FFFF.
    return codePoint <= 0x10FFFF ? String.fromCodePoint(codePoint) : entity;
  }
  return htmlEntities[entity] || entity;
}
20
+
21
/**
 * Decode every entity reference in a string.
 *
 * Anything shaped like `&...;` (letters, digits, `_` and `#` between the
 * ampersand and the semicolon) is handed to `decodeEntity`; its return value
 * replaces the match.
 *
 * @param {string} text
 * @returns {string} The text with each reference replaced.
 */
export function decodeEntities(text) {
  const entityPattern = /&[#\w]+;/g;
  return text.replace(entityPattern, (reference) => decodeEntity(reference));
}
24
+
25
/**
 * Build an element node for the virtual DOM.
 *
 * @param {string} tag - Tag name.
 * @param {object} [attributes={}] - Attribute name → value map (stored as given).
 * @param {object[]} [children=[]] - Child nodes (stored as given).
 * @returns {{ type: 'element', tag: string, attributes: object, children: object[], parent: null }}
 *   A node whose `parent` starts out as `null`.
 */
export function createElement(tag, attributes = {}, children = []) {
  const element = {
    type: 'element',
    tag,
    attributes,
    children,
    parent: null,
  };
  return element;
}
28
+
29
/**
 * Remove Unicode bidirectional control characters from a string:
 * U+200E, U+200F and the range U+202A–U+202E.
 *
 * @param {string} text
 * @returns {string} The text without those characters.
 */
function cleanText(text) {
  const bidiControls = /[\u200E\u200F\u202A-\u202E]/g;
  const cleaned = text.replace(bidiControls, '');
  return cleaned;
}
32
+
33
/**
 * Build a text node for the virtual DOM. The text is passed through
 * `cleanText` before it is stored.
 *
 * @param {string} text - Raw text content.
 * @returns {{ type: 'text', text: string, parent: null }}
 *   A node whose `parent` starts out as `null`.
 */
export function createText(text) {
  const textNode = {
    type: 'text',
    text: cleanText(text),
    parent: null,
  };
  return textNode;
}