html2md4llm 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +74 -0
- package/package.json +42 -0
- package/plugin/main.js +41 -0
- package/plugin/manifest.json +16 -0
- package/src/generators/markdown.js +132 -0
- package/src/main.js +82 -0
- package/src/parser.js +208 -0
- package/src/utils.js +35 -0
package/README.md
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# html2md4llm
|
|
2
|
+
|
|
3
|
+
Convert HTML to clean Markdown or JSON format, optimized for LLM processing.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Convert HTML to Markdown or JSON
|
|
8
|
+
- Intelligent content extraction (list/article modes)
|
|
9
|
+
- Automatic HTML cleaning (removes scripts, styles, iframes)
|
|
10
|
+
- Preserves metadata (title, description, keywords)
|
|
11
|
+
- Zero dependencies - uses only Node.js built-in modules
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
### NPM Package
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
npm install html2md4llm
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
### Standalone Script
|
|
22
|
+
|
|
23
|
+
Download `dist/html2md4llm.min.js` and include it directly:
|
|
24
|
+
|
|
25
|
+
```html
|
|
26
|
+
<script src="html2md4llm.min.js"></script>
|
|
27
|
+
<script>
|
|
28
|
+
const result = html2md4llm('<h1>Hello</h1>');
|
|
29
|
+
</script>
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### Dify Plugin
|
|
33
|
+
|
|
34
|
+
Install from the plugin marketplace or import from this repository's `plugin/` directory.
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
```javascript
|
|
39
|
+
import { main } from 'html2md4llm';
|
|
40
|
+
|
|
41
|
+
// Basic conversion to Markdown
|
|
42
|
+
const markdown = main('<h1>Hello</h1><p>World</p>');
|
|
43
|
+
|
|
44
|
+
// Convert to JSON
|
|
45
|
+
const json = main('<h1>Hello</h1>', { outputFormat: 'json' });
|
|
46
|
+
|
|
47
|
+
// Extract largest list
|
|
48
|
+
const list = main(html, { strategy: 'list' });
|
|
49
|
+
|
|
50
|
+
// Extract article content
|
|
51
|
+
const article = main(html, { strategy: 'article' });
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## API
|
|
55
|
+
|
|
56
|
+
### `main(htmlInput, options)`
|
|
57
|
+
|
|
58
|
+
**Parameters:**
|
|
59
|
+
- `htmlInput` (string): HTML text to convert
|
|
60
|
+
- `options` (object, optional):
|
|
61
|
+
- `outputFormat` (string): `'markdown'` (default) or `'json'`
|
|
62
|
+
- `strategy` (string): `'list'`, `'article'`, or undefined
|
|
63
|
+
- `removeAttributes` (string[]): Additional attribute names to strip during parsing; a pattern ending in `-*` (e.g. `'aria-*'`) matches every attribute with that prefix
|
|
64
|
+
|
|
65
|
+
**Returns:** String (Markdown or JSON)
|
|
66
|
+
|
|
67
|
+
## Strategies
|
|
68
|
+
|
|
69
|
+
- **list**: Extracts the largest `<ul>` or `<ol>` element
|
|
70
|
+
- **article**: Filters out empty containers, keeping main content
|
|
71
|
+
|
|
72
|
+
## License
|
|
73
|
+
|
|
74
|
+
MIT
|
package/package.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "html2md4llm",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Convert HTML to clean Markdown or JSON format, optimized for LLM processing",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "src/main.js",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": "./src/main.js"
|
|
9
|
+
},
|
|
10
|
+
"scripts": {
|
|
11
|
+
"test": "node tests/run-tests.js",
|
|
12
|
+
"build": "node build.js"
|
|
13
|
+
},
|
|
14
|
+
"keywords": [
|
|
15
|
+
"html",
|
|
16
|
+
"markdown",
|
|
17
|
+
"json",
|
|
18
|
+
"converter",
|
|
19
|
+
"llm",
|
|
20
|
+
"parser",
|
|
21
|
+
"html-to-markdown"
|
|
22
|
+
],
|
|
23
|
+
"author": "kaiye",
|
|
24
|
+
"license": "MIT",
|
|
25
|
+
"engines": {
|
|
26
|
+
"node": ">=18.0.0"
|
|
27
|
+
},
|
|
28
|
+
"repository": {
|
|
29
|
+
"type": "git",
|
|
30
|
+
"url": "https://github.com/kaiye/html2md4llm.git"
|
|
31
|
+
},
|
|
32
|
+
"files": [
|
|
33
|
+
"src/**/*",
|
|
34
|
+
"dist/**/*",
|
|
35
|
+
"plugin/**/*",
|
|
36
|
+
"README.md",
|
|
37
|
+
"LICENSE"
|
|
38
|
+
],
|
|
39
|
+
"devDependencies": {
|
|
40
|
+
"esbuild": "^0.27.0"
|
|
41
|
+
}
|
|
42
|
+
}
|
package/plugin/main.js
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { main } from '../src/main.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Plugin tool wrapper that exposes the HTML converter through an
 * `invoke` / `validate` pair.
 */
export default class Html2Md4LlmTool {
  /**
   * Convert the supplied HTML and report which format was produced.
   *
   * @param {{ html: string, outputFormat?: string, strategy?: string }} parameters
   * @returns {Promise<{ result: string, format: string }>}
   * @throws {Error} When `html` is missing or falsy.
   */
  async invoke(parameters) {
    const { html, strategy } = parameters;
    const outputFormat =
      parameters.outputFormat === undefined ? 'markdown' : parameters.outputFormat;

    if (!html) {
      throw new Error('html parameter is required');
    }

    // Only forward a strategy when one was actually requested.
    const options = strategy ? { outputFormat, strategy } : { outputFormat };

    return {
      result: main(html, options),
      format: outputFormat
    };
  }

  /**
   * Check the parameters without running a conversion.
   *
   * @param {{ html?: unknown, outputFormat?: string, strategy?: string }} parameters
   * @returns {Promise<{ valid: boolean, error?: string }>}
   */
  async validate(parameters) {
    const allowedFormats = ['markdown', 'json'];
    const allowedStrategies = ['list', 'article'];

    const htmlIsUsable = Boolean(parameters.html) && typeof parameters.html === 'string';
    if (!htmlIsUsable) {
      return { valid: false, error: 'html must be a non-empty string' };
    }

    const { outputFormat, strategy } = parameters;

    if (outputFormat && !allowedFormats.includes(outputFormat)) {
      return { valid: false, error: 'outputFormat must be "markdown" or "json"' };
    }

    if (strategy && !allowedStrategies.includes(strategy)) {
      return { valid: false, error: 'strategy must be "list" or "article"' };
    }

    return { valid: true };
  }
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "0.0.1",
|
|
3
|
+
"type": "tool",
|
|
4
|
+
"author": "kaiye",
|
|
5
|
+
"name": "html2md4llm",
|
|
6
|
+
"label": {
|
|
7
|
+
"en_US": "HTML to Markdown/JSON",
|
|
8
|
+
"zh_Hans": "HTML 转 Markdown/JSON"
|
|
9
|
+
},
|
|
10
|
+
"description": {
|
|
11
|
+
"en_US": "Convert HTML to clean Markdown or JSON format, optimized for LLM processing",
|
|
12
|
+
"zh_Hans": "将 HTML 转换为干净的 Markdown 或 JSON 格式,针对 LLM 处理优化"
|
|
13
|
+
},
|
|
14
|
+
"icon": "icon.svg",
|
|
15
|
+
"tags": ["html", "markdown", "json", "converter", "parser"]
|
|
16
|
+
}
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
// Tags whose content flows inline with neighbouring content.
const inlineElements = ['span', 'a', 'strong', 'em', 'code', 'b', 'i'];
// Tags that start a new block in the Markdown output.
const blockElements = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'div', 'section'];
// Tags with their own rendering rule; everything else is a transparent wrapper.
const specialTags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'strong', 'b', 'em', 'i', 'code', 'a'];

/**
 * Whether a node should be treated as inline when choosing a separator.
 * A text node counts as inline only if every container flattened onto it
 * (`flattenedTags`) was itself an inline tag. `<br>` is never inline.
 */
function isInline(node) {
  if (node.type === 'element' && node.tag === 'br') return false;
  if (node.type === 'element' && inlineElements.includes(node.tag)) return true;
  if (node.type === 'text' && node.flattenedTags) {
    return node.flattenedTags.every(tag => inlineElements.includes(tag));
  }
  return false;
}

/**
 * Whether a node should be treated as a block when choosing a separator.
 * Flattened container tags take precedence over the node's own tag.
 * `<br>` is never a block (it already renders as a newline).
 */
function isBlock(node) {
  if (node.type === 'element' && node.tag === 'br') return false;
  if (node.flattenedTags && node.flattenedTags.length > 0) {
    return node.flattenedTags.some(tag => blockElements.includes(tag));
  }
  if (node.type === 'element') {
    return blockElements.includes(node.tag);
  }
  return false;
}

/**
 * Render a parsed node tree as Markdown.
 *
 * @param {object} node - A `{ type: 'text', text }` or
 *   `{ type: 'element', tag, attributes, children }` node.
 * @param {number} [indent=0] - Number of spaces to indent list items by.
 * @returns {string} Markdown for the node; '' for unknown node types.
 */
export function generate(node, indent = 0) {
  if (node.type === 'text') {
    return node.text;
  }

  if (node.type !== 'element') {
    return '';
  }

  const tag = node.tag;
  const children = node.children || [];
  const attributes = node.attributes || {};

  // If only one child and no special handling for this tag, pass through transparently
  if (children.length === 1 && !specialTags.includes(tag)) {
    return generate(children[0], indent);
  }

  // Tags that do not use the generically joined child text are handled first,
  // so their subtrees are not rendered only to be thrown away.
  if (tag === 'br') return '\n';

  if (tag === 'img') {
    const alt = attributes.alt || '';
    const src = attributes.src || '';
    return `![${alt}](${src})`;
  }

  if (tag === 'a') {
    // Extract all text nodes and join with comma
    const texts = [];
    function collectText(n) {
      if (n.type === 'text') {
        const t = n.text.trim();
        if (t) texts.push(t);
      } else if (n.children) {
        n.children.forEach(collectText);
      }
    }
    children.forEach(collectText);
    return `[${texts.join(', ')}](${attributes.href || ''})`;
  }

  if (tag === 'ul' || tag === 'ol') {
    const lines = [];
    // Number by <li> position, not raw child index, so stray non-<li>
    // children cannot make an ordered list skip numbers.
    let itemNumber = 0;
    for (const child of children) {
      if (child.type !== 'element' || child.tag !== 'li') continue;
      itemNumber += 1;
      const marker = tag === 'ul' ? '-' : `${itemNumber}.`;
      const content = (child.children || []).map(ch => generate(ch, indent + 2)).join('');
      lines.push(`${' '.repeat(indent)}${marker} ${content}\n`);
    }
    return lines.join('').trimEnd();
  }

  // Render every child exactly once. Re-rendering the "next" child to decide
  // on a separator would make the work grow exponentially with nesting depth.
  const rendered = children.map(child => generate(child, indent));

  const parts = [];
  for (let i = 0; i < children.length; i++) {
    const current = rendered[i];
    if (!current) continue;
    parts.push(current);

    // A separator is only emitted between two adjacent non-empty children.
    const next = rendered[i + 1];
    if (!next) continue;

    const child = children[i];
    const nextChild = children[i + 1];
    const childIsBlock = isBlock(child);
    const nextIsBlock = isBlock(nextChild);

    if (isInline(child) && isInline(nextChild)) {
      // inline + inline → space
      parts.push(' ');
    } else if (childIsBlock && nextIsBlock) {
      // block + block → double newline
      parts.push('\n\n');
    } else if (childIsBlock || nextIsBlock) {
      // block + anything or anything + block → single newline
      parts.push('\n');
    }
    // container + container or inline + container → no separator (transparent)
  }
  const childText = parts.join('');

  // Headings: h1..h6 map to one to six leading '#'.
  const heading = /^h([1-6])$/.exec(tag);
  if (heading) return `${'#'.repeat(Number(heading[1]))} ${childText}`;

  // Inline formatting
  if (tag === 'strong' || tag === 'b') return `**${childText}**`;
  if (tag === 'em' || tag === 'i') return `*${childText}*`;
  if (tag === 'code') return `\`${childText}\``;

  // Code block
  if (tag === 'pre') return `\`\`\`\n${childText}\n\`\`\``;

  // Paragraphs and every other container: just the joined children
  return childText;
}
|
package/src/main.js
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import { parse } from './parser.js';
|
|
2
|
+
import { generate as generateMarkdown } from './generators/markdown.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Convert an HTML string to Markdown or to a JSON dump of the parsed tree.
 *
 * @param {string} htmlInput - HTML text to convert.
 * @param {object} [options]
 * @param {string} [options.outputFormat] - 'markdown' (default) or 'json'.
 * @param {string} [options.strategy] - 'list', 'article', or unset.
 * @param {*} [options.removeAttributes] - Forwarded unchanged to `parse`.
 * @returns {string} Markdown text, or pretty-printed JSON.
 * @throws {TypeError} If `htmlInput` is not a string.
 * @throws {Error} If `outputFormat` or `strategy` has an unsupported value.
 */
export function main(htmlInput, options = {}) {
  if (typeof htmlInput !== 'string') {
    throw new TypeError('htmlInput must be a string');
  }

  const outputFormat = options.outputFormat || 'markdown';
  const formatIsKnown = outputFormat === 'markdown' || outputFormat === 'json';
  if (!formatIsKnown) {
    throw new Error('options.outputFormat must be \'markdown\' or \'json\'');
  }

  const strategy = options.strategy;
  const strategyIsKnown = strategy === 'list' || strategy === 'article';
  if (strategy && !strategyIsKnown) {
    throw new Error('options.strategy must be \'list\', \'article\', or undefined');
  }

  // Build the virtual DOM, then narrow it according to the chosen strategy.
  const parsed = parse(htmlInput, options.removeAttributes);
  let tree;
  switch (strategy) {
    case 'list':
      tree = extractLargestList(parsed);
      break;
    case 'article':
      tree = extractArticle(parsed);
      break;
    default:
      tree = parsed;
  }

  if (outputFormat === 'markdown') {
    return generateMarkdown(tree);
  }

  // Nodes point back at their parent; drop those links so the tree can be
  // serialized without a circular-structure error.
  const dropParent = (key, value) => (key === 'parent' ? undefined : value);
  return JSON.stringify(tree, dropParent, 2);
}
|
|
39
|
+
|
|
40
|
+
/**
 * Find the `<ul>`/`<ol>` with the most direct `<li>` children.
 * Ties go to the list encountered first in document (pre-order) order.
 *
 * @param {object} node - Root of the tree to search.
 * @returns {object} The winning list node, or `node` itself when no list
 *   with at least one `<li>` exists.
 */
function extractLargestList(node) {
  let best = null;
  let bestSize = 0;

  // Explicit stack; children are pushed in reverse so nodes are visited in
  // the same pre-order a recursive walk would use.
  const pending = [node];
  while (pending.length > 0) {
    const current = pending.pop();

    const isList = current.type === 'element' && (current.tag === 'ul' || current.tag === 'ol');
    if (isList) {
      let itemCount = 0;
      for (const kid of current.children) {
        if (kid.type === 'element' && kid.tag === 'li') itemCount += 1;
      }
      if (itemCount > bestSize) {
        bestSize = itemCount;
        best = current;
      }
    }

    if (current.children) {
      for (let i = current.children.length - 1; i >= 0; i--) {
        pending.push(current.children[i]);
      }
    }
  }

  return best || node;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Return a copy of the tree with element children that contain no
 * non-whitespace text removed. Text children are always kept.
 *
 * @param {object} node - Root of the tree to filter.
 * @returns {object} A shallow-copied element tree; non-element nodes are
 *   returned as-is.
 */
function extractArticle(node) {
  // True when the node is, or contains, a text node with visible characters.
  const carriesText = (candidate) => {
    if (candidate.type === 'text' && candidate.text.trim()) return true;
    return candidate.children ? candidate.children.some(c => carriesText(c)) : false;
  };

  const prune = (current) => {
    if (current.type !== 'element') return current;

    const survivors = (current.children || [])
      .filter(kid => kid.type === 'text' || carriesText(kid))
      .map(kid => prune(kid));

    return { ...current, children: survivors };
  };

  return prune(node);
}
|
package/src/parser.js
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
import { createElement, createText, decodeEntities } from './utils.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Strip Unicode bidirectional control characters (U+200E, U+200F,
 * U+202A–U+202E) from a string.
 * NOTE(review): not called by `parse` in this module as shown — confirm
 * whether it is still needed here.
 *
 * @param {string} text
 * @returns {string} The text without those control characters.
 */
function cleanText(text) {
  const bidiControls = /[\u200E\u200F\u202A-\u202E]/g;
  return text.replace(bidiControls, '');
}
|
|
6
|
+
|
|
7
|
+
/**
 * Parse an HTML string into a lightweight node tree.
 *
 * Steps: strip non-content markup with regexes, tokenize tags into a tree
 * using an open-element stack, then post-process (flatten pre/code text,
 * drop hidden/empty nodes, collapse single-child containers).
 *
 * @param {string} html - HTML source.
 * @param {string[]|string} [removeAttributes=[]] - Extra attribute names to
 *   drop. A pattern ending in `-*` matches by prefix. Any other value
 *   (e.g. a boolean or null) is ignored instead of throwing.
 * @returns {object} The single top-level node when there is exactly one,
 *   otherwise a synthetic `root` element holding all top-level nodes.
 */
export function parse(html, removeAttributes = []) {
  // Default blacklist (style temporarily preserved for display:none filtering)
  const defaultBlacklist = ['loading', 'decoding', 'fetchpriority'];

  // Callers may pass a non-array here; spreading it blindly would throw, and
  // spreading a string would split it into single characters.
  let extraPatterns = [];
  if (Array.isArray(removeAttributes)) {
    extraPatterns = removeAttributes.filter(pattern => typeof pattern === 'string');
  } else if (typeof removeAttributes === 'string') {
    extraPatterns = [removeAttributes];
  }
  const blacklist = [...defaultBlacklist, ...extraPatterns];

  function shouldRemove(attrName) {
    if (attrName === 'style') return false; // Preserve for filtering
    if (attrName.startsWith('data-')) return true;
    return blacklist.some(pattern => {
      if (pattern.endsWith('-*')) {
        // 'aria-*' → prefix 'aria-'
        return attrName.startsWith(pattern.slice(0, -1));
      }
      return attrName === pattern;
    });
  }

  // Pre-clean: remove DOCTYPE, script, style, iframe, svg, link, source, input, comments
  html = html.replace(/<!DOCTYPE[^>]*>/gi, '');
  html = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
  html = html.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  html = html.replace(/<iframe[^>]*>[\s\S]*?<\/iframe>/gi, '');
  html = html.replace(/<svg[^>]*>[\s\S]*?<\/svg>/gi, '');
  html = html.replace(/<link[^>]*>/gi, '');
  html = html.replace(/<source[^>]*>/gi, '');
  html = html.replace(/<input[^>]*>/gi, '');
  html = html.replace(/<!--[\s\S]*?-->/g, '');

  // Elements that never get pushed on the open-element stack.
  const parserVoidElements = ['br', 'hr', 'img', 'input', 'meta', 'link', 'source', 'area', 'base', 'col', 'embed', 'param', 'track', 'wbr'];

  const stack = [];
  const root = createElement('root', {}, []);
  stack.push(root);

  // Append trimmed, entity-decoded text to the innermost open element.
  function appendText(raw) {
    const text = raw.trim();
    if (!text) return;
    const textNode = createText(decodeEntities(text));
    const parent = stack[stack.length - 1];
    textNode.parent = parent;
    parent.children.push(textNode);
  }

  const tagRegex = /<\/?([a-z][a-z0-9]*)[^>]*>/gi;
  let lastIndex = 0;
  let match;

  while ((match = tagRegex.exec(html)) !== null) {
    // Add text before tag
    if (match.index > lastIndex) {
      appendText(html.slice(lastIndex, match.index));
    }

    const fullTag = match[0];
    const tagName = match[1].toLowerCase();

    if (fullTag.startsWith('</')) {
      // Closing tag: close the nearest matching open element, implicitly
      // closing anything left open inside it. Index 0 is the synthetic root
      // and is never popped; a closing tag with no open match is ignored.
      let openIndex = -1;
      for (let i = stack.length - 1; i >= 1; i--) {
        if (stack[i].tag === tagName) {
          openIndex = i;
          break;
        }
      }
      if (openIndex !== -1) {
        stack.length = openIndex;
      }
    } else {
      // Opening tag
      // NOTE(review): only double-quoted attributes are recognised, so a bare
      // `hidden` attribute never reaches the `hidden` check further down.
      const attrs = {};
      const attrRegex = /([a-z][a-z0-9-]*)="([^"]*)"/gi;
      let attrMatch;
      let dataSrc = null;
      while ((attrMatch = attrRegex.exec(fullTag)) !== null) {
        const attrName = attrMatch[1];
        if (attrName === 'data-src') dataSrc = attrMatch[2];
        if (shouldRemove(attrName)) continue;
        attrs[attrName] = attrMatch[2];
      }

      // For img tags, use data-src as src if src is missing
      if (tagName === 'img' && !attrs.src && dataSrc) {
        attrs.src = dataSrc;
      }

      const node = createElement(tagName, attrs, []);
      const parent = stack[stack.length - 1];
      node.parent = parent;
      parent.children.push(node);

      // Self-closing or void elements take no children, so they are not opened.
      const isVoid = fullTag.endsWith('/>') || parserVoidElements.includes(tagName);
      if (!isVoid) {
        stack.push(node);
      }
    }

    lastIndex = tagRegex.lastIndex;
  }

  // Add remaining text
  if (lastIndex < html.length) {
    appendText(html.slice(lastIndex));
  }

  // Post-processing: flatten pre/code, flatten containers, remove unwanted nodes
  const keptVoidElements = ['br', 'hr', 'img'];
  const flattenableTags = ['div', 'span', 'section', 'p'];

  // Replace the children of every <pre>/<code> with one concatenated text node.
  function flattenPreCode(node) {
    if (node.type === 'element' && (node.tag === 'pre' || node.tag === 'code')) {
      const texts = [];
      function collectText(n) {
        if (n.type === 'text') texts.push(n.text);
        else if (n.children) n.children.forEach(collectText);
      }
      node.children.forEach(collectText);
      node.children = [createText(texts.join(''))];
    } else if (node.children) {
      node.children.forEach(flattenPreCode);
    }
  }

  // Drop hidden, presentational, unusable and (after recursion) empty elements.
  function removeUnwantedNodes(node) {
    if (!node.children) return;

    node.children = node.children.filter(child => {
      if (child.type === 'text') return true;
      if (child.type === 'element') {
        // Filter elements with display:none
        const style = child.attributes.style;
        if (style && /display\s*:\s*none/i.test(style)) return false;

        // Filter ARIA hidden elements
        if (child.attributes['aria-hidden'] === 'true') return false;
        if (child.attributes.tabindex === '-1') return false;
        if (child.attributes.hidden !== undefined) return false;
        const role = child.attributes.role;
        if (role === 'presentation' || role === 'none') return false;

        // Filter img without src
        if (child.tag === 'img' && !child.attributes.src) return false;

        // Filter a with javascript: href
        if (child.tag === 'a' && child.attributes.href?.startsWith('javascript:')) return false;

        // Keep description and keywords meta tags, remove others
        if (child.tag === 'meta') {
          const name = child.attributes.name;
          if (name === 'description' || name === 'keywords') {
            return true;
          }
          return false;
        }
        // Keep void elements
        if (keptVoidElements.includes(child.tag)) return true;
        // Recursively process children
        removeUnwantedNodes(child);
        // Remove style attribute after filtering
        delete child.attributes.style;
        // Remove empty nodes
        if (child.children && child.children.length === 0) return false;
      }
      return true;
    });
  }

  // Collapse chains of single-child div/span/section/p wrappers, recording the
  // removed tags and classes on the node that replaces them.
  function flattenContainers(node) {
    if (!node.children) return node;

    node.children = node.children.map(child => {
      // Collect flattened tags and classes
      const tags = [];
      const classes = [];
      let current = child;

      // Walk down single-child flattenable containers
      while (current.type === 'element' &&
             flattenableTags.includes(current.tag) &&
             current.children?.length === 1) {
        tags.push(current.tag);
        if (current.attributes.class) {
          classes.push(current.attributes.class);
        }
        current = current.children[0];
      }

      // If we collected any tags, attach them to the final node
      if (tags.length > 0) {
        current.flattenedTags = tags;
        current.flattenedClasses = classes;
      }

      // Recursively process the final node
      return flattenContainers(current);
    });

    return node;
  }

  flattenPreCode(root);
  removeUnwantedNodes(root);
  flattenContainers(root);

  return root.children.length === 1 ? root.children[0] : root;
}
|
package/src/utils.js
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
// Named character references understood by the decoder.
const htmlEntities = {
  '&nbsp;': ' ',
  '&lt;': '<',
  '&gt;': '>',
  '&amp;': '&',
  '&quot;': '"',
  '&#39;': "'",
  '&apos;': "'"
};

/**
 * Decode one HTML character reference such as `&amp;`, `&#169;` or `&#x1F600;`.
 *
 * @param {string} entity - The full reference, including `&` and `;`.
 * @returns {string} The decoded character, or `entity` unchanged when it is
 *   not a known name or not a well-formed numeric reference.
 */
export function decodeEntity(entity) {
  if (entity.startsWith('&#')) {
    // The hex marker may be written as 'x' or 'X'.
    const isHex = entity[2] === 'x' || entity[2] === 'X';
    const digits = entity.slice(isHex ? 3 : 2, -1);
    const wellFormed = isHex ? /^[0-9a-f]+$/i.test(digits) : /^[0-9]+$/.test(digits);
    if (!wellFormed) return entity;

    const codePoint = Number.parseInt(digits, isHex ? 16 : 10);
    // Leave out-of-range references as-is; fromCodePoint would throw on them.
    if (codePoint > 0x10ffff) return entity;
    // fromCodePoint (not fromCharCode) so astral characters are not truncated
    // to 16 bits.
    return String.fromCodePoint(codePoint);
  }
  return Object.hasOwn(htmlEntities, entity) ? htmlEntities[entity] : entity;
}

/**
 * Decode every character reference found in a string.
 *
 * @param {string} text
 * @returns {string} The text with recognised references replaced.
 */
export function decodeEntities(text) {
  return text.replace(/&[#\w]+;/g, (entity) => decodeEntity(entity));
}
|
|
24
|
+
|
|
25
|
+
/**
 * Build an element node for the parsed tree.
 *
 * @param {string} tag - Tag name.
 * @param {object} [attributes={}] - Attribute name → value map.
 * @param {Array} [children=[]] - Child nodes.
 * @returns {{ type: 'element', tag: string, attributes: object, children: Array, parent: null }}
 *   A node whose `parent` starts as null; callers link it afterwards.
 */
export function createElement(tag, attributes = {}, children = []) {
  // Properties are added in a fixed order so serialized output is stable.
  const element = { type: 'element' };
  element.tag = tag;
  element.attributes = attributes;
  element.children = children;
  element.parent = null;
  return element;
}
|
|
28
|
+
|
|
29
|
+
/**
 * Strip Unicode bidirectional control characters (U+200E, U+200F,
 * U+202A–U+202E) from a string.
 *
 * @param {string} text
 * @returns {string}
 */
function cleanText(text) {
  const bidiControls = /[\u200E\u200F\u202A-\u202E]/g;
  return text.replace(bidiControls, '');
}

/**
 * Build a text node for the parsed tree, with bidi control characters removed.
 *
 * @param {string} text - Raw text content.
 * @returns {{ type: 'text', text: string, parent: null }}
 *   A node whose `parent` starts as null; callers link it afterwards.
 */
export function createText(text) {
  const cleaned = cleanText(text);
  const textNode = { type: 'text' };
  textNode.text = cleaned;
  textNode.parent = null;
  return textNode;
}
|