html2md4llm 1.0.0 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +56 -43
- package/bin/html2md4llm.js +199 -0
- package/package.json +15 -12
- package/src/generators/markdown.js +81 -1
- package/src/main.js +3 -0
- package/src/parser.js +24 -0
- package/plugin/main.js +0 -41
- package/plugin/manifest.json +0 -16
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 kaiye
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
|
@@ -1,73 +1,86 @@
|
|
|
1
1
|
# html2md4llm
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
把 HTML 转成更干净的 Markdown 或 JSON,便于人和 AI 直接消费。
|
|
4
4
|
|
|
5
|
-
##
|
|
5
|
+
## 安装
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
- Preserves metadata (title, description, keywords)
|
|
11
|
-
- Zero dependencies - uses only Node.js built-in modules
|
|
7
|
+
```bash
|
|
8
|
+
npm install html2md4llm
|
|
9
|
+
```
|
|
12
10
|
|
|
13
|
-
##
|
|
11
|
+
## 最常用方式
|
|
14
12
|
|
|
15
|
-
###
|
|
13
|
+
### 1) 在代码中调用
|
|
16
14
|
|
|
17
|
-
```
|
|
18
|
-
|
|
15
|
+
```js
|
|
16
|
+
import { main } from 'html2md4llm';
|
|
17
|
+
|
|
18
|
+
const html = '<h1>Hello</h1><p>World</p>';
|
|
19
|
+
const md = main(html); // 默认输出 markdown
|
|
20
|
+
console.log(md);
|
|
19
21
|
```
|
|
20
22
|
|
|
21
|
-
|
|
23
|
+
输出 JSON:
|
|
24
|
+
|
|
25
|
+
```js
|
|
26
|
+
const json = main('<h1>Hello</h1>', { outputFormat: 'json' });
|
|
27
|
+
```
|
|
22
28
|
|
|
23
|
-
|
|
29
|
+
### 2) 用 npx 当命令行工具
|
|
24
30
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
</script>
|
|
31
|
+
本地文件转 `.md`:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
npx html2md4llm ./input.html ./output.md
|
|
30
35
|
```
|
|
31
36
|
|
|
32
|
-
|
|
37
|
+
stdin -> stdout(Linux 管道):
|
|
33
38
|
|
|
34
|
-
|
|
39
|
+
```bash
|
|
40
|
+
cat ./input.html | npx html2md4llm > ./output.md
|
|
41
|
+
```
|
|
35
42
|
|
|
36
|
-
##
|
|
43
|
+
## API(给代码/AI 调用)
|
|
37
44
|
|
|
38
|
-
|
|
39
|
-
import { main } from 'html2md4llm';
|
|
45
|
+
`main(htmlInput, options?)`
|
|
40
46
|
|
|
41
|
-
|
|
42
|
-
|
|
47
|
+
- `htmlInput: string`:HTML 字符串
|
|
48
|
+
- `options.outputFormat: 'markdown' | 'json'`:默认 `'markdown'`
|
|
49
|
+
- `options.strategy: 'list' | 'article'`:可选提取策略
|
|
50
|
+
- `options.removeAttributes: string[]`:按规则移除属性,如 `['aria-*', 'role']`
|
|
43
51
|
|
|
44
|
-
|
|
45
|
-
const json = main('<h1>Hello</h1>', { outputFormat: 'json' });
|
|
52
|
+
返回值:`string`(Markdown 文本或 JSON 字符串)
|
|
46
53
|
|
|
47
|
-
|
|
48
|
-
const list = main(html, { strategy: 'list' });
|
|
54
|
+
## CLI 参数
|
|
49
55
|
|
|
50
|
-
|
|
51
|
-
|
|
56
|
+
```bash
|
|
57
|
+
html2md4llm <input.html> [output.md] [options]
|
|
58
|
+
html2md4llm - [output.md] [options]
|
|
52
59
|
```
|
|
53
60
|
|
|
54
|
-
|
|
61
|
+
- `-o, --output <file>`:输出文件路径
|
|
62
|
+
- `-f, --format <markdown|json>`:输出格式
|
|
63
|
+
- `--json`:等价于 `--format json`
|
|
64
|
+
- `--markdown`:等价于 `--format markdown`
|
|
65
|
+
- `-s, --strategy <list|article>`:内容提取策略
|
|
66
|
+
- `-r, --remove-attrs <attrs>`:逗号分隔,如 `aria-*,role`
|
|
67
|
+
- `-h, --help`:查看帮助
|
|
68
|
+
- `-v, --version`:查看版本
|
|
55
69
|
|
|
56
|
-
|
|
70
|
+
## AI 使用建议
|
|
57
71
|
|
|
58
|
-
|
|
59
|
-
- `htmlInput` (string): HTML text to convert
|
|
60
|
-
- `options` (object, optional):
|
|
61
|
-
- `outputFormat` (string): `'markdown'` (default) or `'json'`
|
|
62
|
-
- `strategy` (string): `'list'`, `'article'`, or undefined
|
|
63
|
-
- `removeAttributes` (boolean): Remove HTML attributes during parsing
|
|
72
|
+
如果你在 Agent/工作流里调用这个工具,推荐固定约定:
|
|
64
73
|
|
|
65
|
-
|
|
74
|
+
1. 输入始终传完整 HTML 字符串。
|
|
75
|
+
2. 需要结构化消费时使用 `outputFormat: 'json'`。
|
|
76
|
+
3. 需要最简正文时按场景加 `strategy: 'article'` 或 `strategy: 'list'`。
|
|
77
|
+
4. 需要清理无关属性时传 `removeAttributes`,例如 `['aria-*', 'role', 'data-*']`。
|
|
66
78
|
|
|
67
|
-
##
|
|
79
|
+
## 开发
|
|
68
80
|
|
|
69
|
-
|
|
70
|
-
|
|
81
|
+
```bash
|
|
82
|
+
npm test
|
|
83
|
+
```
|
|
71
84
|
|
|
72
85
|
## License
|
|
73
86
|
|
package/bin/html2md4llm.js
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { readFileSync } from 'node:fs';
|
|
4
|
+
import { readFile, writeFile } from 'node:fs/promises';
|
|
5
|
+
import process from 'node:process';
|
|
6
|
+
import { main } from '../src/main.js';
|
|
7
|
+
|
|
8
|
+
const pkg = JSON.parse(readFileSync(new URL('../package.json', import.meta.url), 'utf-8'));
|
|
9
|
+
|
|
10
|
+
/**
 * Write the CLI usage text to stdout.
 *
 * The first line carries the version read from package.json (`pkg.version`);
 * the whole text is emitted in a single write and ends with one newline.
 */
function printHelp() {
  // NOTE(review): column alignment of the option table appears collapsed in
  // the view this was taken from — confirm spacing against the published file.
  const lines = [
    `html2md4llm ${pkg.version}`,
    '',
    'Usage:',
    'html2md4llm <input.html> [output.md] [options]',
    'html2md4llm - [output.md] [options]',
    'cat input.html | html2md4llm [options] > output.md',
    '',
    'Options:',
    '-o, --output <file> Output file path',
    '-f, --format <markdown|json> Output format (default: markdown)',
    '--json Shortcut for --format json',
    '--markdown Shortcut for --format markdown',
    '-s, --strategy <list|article> Extraction strategy',
    '-r, --remove-attrs <attrs> Comma-separated attrs (e.g. aria-*,role)',
    '-h, --help Show help',
    '-v, --version Show version'
  ];

  process.stdout.write(`${lines.join('\n')}\n`);
}
|
|
30
|
+
|
|
31
|
+
/**
 * Parse the CLI argument list into a settings object.
 *
 * Recognised options: -h/--help, -v/--version, -o/--output <file>,
 * -f/--format <value>, --json, --markdown, -s/--strategy <value> and
 * -r/--remove-attrs (aliases --remove-attr, --remove-attribute) <list>.
 * Everything else is positional: the first positional is the input file
 * (or "-" for stdin), the second is the output file.
 *
 * Format and strategy values are stored as given; they are not validated here.
 *
 * @param {string[]} argv - Arguments without the node/script prefix.
 * @returns {object} Parsed settings: `positional`, `inputFile`, `outputFile`,
 *   `outputFormat` (default 'markdown'), `strategy`, `removeAttributes`, and
 *   `help` / `version` set to true only when the matching flag was given.
 * @throws {Error} On an unknown option, an option missing its value, more than
 *   two positionals, or an output file given both positionally and via --output.
 */
function parseArgs(argv) {
  const parsed = {
    positional: [],
    outputFile: undefined,
    outputFormat: 'markdown',
    strategy: undefined,
    removeAttributes: []
  };

  // Fetch the token after argv[index]; a missing token, or one that looks
  // like another option, means the option was given without a value.
  const takeValue = (name, index) => {
    const value = argv[index + 1];
    if (!value || value.startsWith('-')) {
      throw new Error(`Missing value for ${name}`);
    }
    return value;
  };

  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];

    if (arg === '-h' || arg === '--help') {
      parsed.help = true;
      continue;
    }

    if (arg === '-v' || arg === '--version') {
      parsed.version = true;
      continue;
    }

    if (arg === '-o' || arg === '--output') {
      parsed.outputFile = takeValue(arg, i);
      i++; // skip the consumed value
      continue;
    }

    if (arg === '-f' || arg === '--format') {
      parsed.outputFormat = takeValue(arg, i);
      i++;
      continue;
    }

    if (arg === '--json') {
      parsed.outputFormat = 'json';
      continue;
    }

    if (arg === '--markdown') {
      parsed.outputFormat = 'markdown';
      continue;
    }

    if (arg === '-s' || arg === '--strategy') {
      parsed.strategy = takeValue(arg, i);
      i++;
      continue;
    }

    if (
      arg === '-r' ||
      arg === '--remove-attrs' ||
      arg === '--remove-attr' ||
      arg === '--remove-attribute'
    ) {
      const value = takeValue(arg, i);
      // The option may repeat; each occurrence appends its comma-separated names.
      parsed.removeAttributes.push(
        ...value.split(',').map(item => item.trim()).filter(Boolean)
      );
      i++;
      continue;
    }

    // A lone "-" is the stdin placeholder advertised in the usage text, so it
    // must fall through to the positionals instead of being rejected as an option.
    if (arg !== '-' && arg.startsWith('-')) {
      throw new Error(`Unknown option: ${arg}`);
    }

    parsed.positional.push(arg);
  }

  if (parsed.positional.length > 2) {
    throw new Error('Too many positional arguments');
  }

  if (parsed.outputFile && parsed.positional[1]) {
    throw new Error('Output file specified twice (positional and --output)');
  }

  parsed.inputFile = parsed.positional[0];
  if (!parsed.outputFile) {
    parsed.outputFile = parsed.positional[1];
  }

  return parsed;
}
|
|
125
|
+
|
|
126
|
+
/**
 * Read all of standard input as UTF-8 text.
 *
 * @returns {Promise<string>} Resolves with the concatenated input once the
 *   stream emits `end`; rejects if the stream emits `error`.
 */
function readStdin() {
  return new Promise((resolve, reject) => {
    const { stdin } = process;
    const pieces = [];

    // With an encoding set, every `data` chunk arrives as a string.
    stdin.setEncoding('utf-8');
    stdin.on('data', (piece) => {
      pieces.push(piece);
    });
    stdin.on('end', () => {
      resolve(pieces.join(''));
    });
    stdin.on('error', reject);
  });
}
|
|
137
|
+
|
|
138
|
+
/**
 * CLI entry point: parse arguments, load the HTML, convert it, emit the result.
 *
 * Input comes from the named file, or from stdin when the input argument is
 * "-" or stdin is not a TTY. Output goes to the requested file, otherwise to
 * stdout (with a trailing newline added only when stdout is a TTY and the
 * output does not already end with one).
 *
 * @returns {Promise<void>}
 * @throws {Error} On an invalid format or strategy value, or when no input
 *   source is available; argument-parsing and file errors propagate as well.
 */
async function run() {
  const {
    help,
    version,
    outputFormat,
    strategy,
    inputFile,
    outputFile,
    removeAttributes
  } = parseArgs(process.argv.slice(2));

  if (help) {
    printHelp();
    return;
  }

  if (version) {
    process.stdout.write(`${pkg.version}\n`);
    return;
  }

  if (!['markdown', 'json'].includes(outputFormat)) {
    throw new Error('Invalid --format value. Use "markdown" or "json".');
  }

  if (strategy && !['list', 'article'].includes(strategy)) {
    throw new Error('Invalid --strategy value. Use "list" or "article".');
  }

  const stdinIsPiped = !process.stdin.isTTY;
  const readsFromFile = Boolean(inputFile) && inputFile !== '-';

  if (!readsFromFile && inputFile !== '-' && !stdinIsPiped) {
    throw new Error('No input provided. Pass an input file, "-" for stdin, or pipe HTML to stdin.');
  }

  const htmlInput = readsFromFile
    ? await readFile(inputFile, 'utf-8')
    : await readStdin();

  // Optional settings are forwarded only when the user actually supplied them.
  const options = {
    outputFormat,
    ...(strategy ? { strategy } : {}),
    ...(removeAttributes.length > 0 ? { removeAttributes } : {})
  };

  const output = main(htmlInput, options);

  if (outputFile) {
    await writeFile(outputFile, output, 'utf-8');
    return;
  }

  process.stdout.write(output);
  const needsNewline = process.stdout.isTTY && !output.endsWith('\n');
  if (needsNewline) {
    process.stdout.write('\n');
  }
}
|
|
194
|
+
|
|
195
|
+
// Start the CLI; any failure is reported on stderr together with a usage
// hint, and the process exits with status 1.
run().catch((error) => {
  const report = [
    `html2md4llm: ${error.message}\n`,
    'Run "html2md4llm --help" for usage.\n'
  ];
  for (const line of report) {
    process.stderr.write(line);
  }
  process.exit(1);
});
|
package/package.json
CHANGED
|
@@ -1,15 +1,20 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "html2md4llm",
|
|
3
|
-
"version": "1.0.0",
|
|
4
|
-
"description": "Convert HTML to clean Markdown or JSON
|
|
3
|
+
"version": "1.1.2",
|
|
4
|
+
"description": "Convert HTML to clean Markdown or JSON, optimized for LLM processing",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/main.js",
|
|
7
|
+
"bin": {
|
|
8
|
+
"html2md4llm": "bin/html2md4llm.js"
|
|
9
|
+
},
|
|
7
10
|
"exports": {
|
|
8
|
-
".":
|
|
11
|
+
".": {
|
|
12
|
+
"import": "./src/main.js",
|
|
13
|
+
"default": "./src/main.js"
|
|
14
|
+
}
|
|
9
15
|
},
|
|
10
16
|
"scripts": {
|
|
11
|
-
"test": "node tests/run-tests.js"
|
|
12
|
-
"build": "node build.js"
|
|
17
|
+
"test": "node tests/run-tests.js"
|
|
13
18
|
},
|
|
14
19
|
"keywords": [
|
|
15
20
|
"html",
|
|
@@ -17,6 +22,8 @@
|
|
|
17
22
|
"json",
|
|
18
23
|
"converter",
|
|
19
24
|
"llm",
|
|
25
|
+
"cli",
|
|
26
|
+
"npx",
|
|
20
27
|
"parser",
|
|
21
28
|
"html-to-markdown"
|
|
22
29
|
],
|
|
@@ -27,16 +34,12 @@
|
|
|
27
34
|
},
|
|
28
35
|
"repository": {
|
|
29
36
|
"type": "git",
|
|
30
|
-
"url": "https://github.com/kaiye/html2md4llm.git"
|
|
37
|
+
"url": "git+https://github.com/kaiye/html2md4llm.git"
|
|
31
38
|
},
|
|
32
39
|
"files": [
|
|
33
40
|
"src/**/*",
|
|
34
|
-
"
|
|
35
|
-
"plugin/**/*",
|
|
41
|
+
"bin/**/*",
|
|
36
42
|
"README.md",
|
|
37
43
|
"LICENSE"
|
|
38
|
-
]
|
|
39
|
-
"devDependencies": {
|
|
40
|
-
"esbuild": "^0.27.0"
|
|
41
|
-
}
|
|
44
|
+
]
|
|
42
45
|
}
|
package/src/generators/markdown.js
CHANGED
|
@@ -1,5 +1,80 @@
|
|
|
1
1
|
const inlineElements = ['span', 'a', 'strong', 'em', 'code', 'b', 'i'];
|
|
2
|
-
const blockElements = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'div', 'section'];
|
|
2
|
+
const blockElements = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'div', 'section', 'table'];
|
|
3
|
+
const tableSections = ['thead', 'tbody', 'tfoot'];
|
|
4
|
+
|
|
5
|
+
/**
 * Turn rendered cell content into a single-line Markdown table cell.
 *
 * Steps: normalise CRLF to LF, strip surrounding whitespace, turn each run of
 * inner newlines into one `<br>`, collapse remaining whitespace runs to a
 * single space, and backslash-escape `|` so it cannot end the cell.
 *
 * @param {string} text - Rendered content of one cell.
 * @returns {string} The one-line cell text (empty string for blank input).
 */
function normalizeTableCell(text) {
  return text
    .replace(/\r\n/g, '\n')
    // Trim before converting newlines: otherwise leading/trailing line breaks
    // survive as stray `<br>` markers and a blank cell renders as "<br>".
    .trim()
    .replace(/\n+/g, '<br>')
    .replace(/\s+/g, ' ')
    .replace(/\|/g, '\\|')
    .trim();
}
|
|
13
|
+
|
|
14
|
+
/**
 * Gather the `tr` element nodes of a table in document order.
 *
 * Rows are taken from the table's direct children and from the direct
 * children of any section listed in `tableSections`; other children and
 * non-element nodes are ignored.
 *
 * @param {object} tableNode - Table node; `children` may be missing.
 * @returns {object[]} The row nodes found.
 */
function collectTableRows(tableNode) {
  const isElement = (candidate) => candidate.type === 'element';
  const isRow = (candidate) => isElement(candidate) && candidate.tag === 'tr';

  return (tableNode.children || [])
    .filter(isElement)
    .flatMap((child) => {
      if (child.tag === 'tr') {
        return [child];
      }
      if (!tableSections.includes(child.tag)) {
        return [];
      }
      return (child.children || []).filter(isRow);
    });
}
|
|
34
|
+
|
|
35
|
+
/**
 * Render a table node as a pipe-delimited Markdown table.
 *
 * The header is the first row containing a `th` cell, or the first row when
 * no row has one; all other rows follow in their original order. Short rows
 * are padded with empty cells to the widest row. Rows without `th`/`td`
 * cells are dropped.
 *
 * @param {object} node - The table node.
 * @param {number} indent - Number of spaces prefixed to every output line.
 * @returns {string} The table lines joined by newlines, or '' when the table
 *   has no usable rows.
 */
function renderTable(node, indent) {
  const parsedRows = [];
  for (const rowNode of collectTableRows(node)) {
    const cellNodes = (rowNode.children || []).filter(
      (candidate) => candidate.type === 'element' && ['th', 'td'].includes(candidate.tag)
    );
    if (cellNodes.length === 0) {
      continue;
    }
    parsedRows.push({
      cells: cellNodes.map((cellNode) => {
        const rendered = (cellNode.children || []).map((inner) => generate(inner)).join('');
        return normalizeTableCell(rendered);
      }),
      hasHeaderCell: cellNodes.some((cellNode) => cellNode.tag === 'th')
    });
  }

  if (parsedRows.length === 0) {
    return '';
  }

  const firstWithHeader = parsedRows.findIndex((entry) => entry.hasHeaderCell);
  const headerAt = firstWithHeader < 0 ? 0 : firstWithHeader;

  const header = [...parsedRows[headerAt].cells];
  const body = parsedRows
    .filter((_, position) => position !== headerAt)
    .map((entry) => [...entry.cells]);

  // The widest row decides the column count.
  let width = header.length;
  for (const cells of body) {
    if (cells.length > width) {
      width = cells.length;
    }
  }

  const padToWidth = (cells) => cells.concat(Array(width - cells.length).fill(''));
  const margin = ' '.repeat(indent);
  const toLine = (cells) => `${margin}| ${cells.join(' | ')} |`;

  return [
    toLine(padToWidth(header)),
    toLine(Array(width).fill('---')),
    ...body.map((cells) => toLine(padToWidth(cells)))
  ].join('\n');
}
|
|
3
78
|
|
|
4
79
|
function isInline(node) {
|
|
5
80
|
if (node.type === 'element' && node.tag === 'br') return false;
|
|
@@ -33,6 +108,11 @@ export function generate(node, indent = 0) {
|
|
|
33
108
|
const tag = node.tag;
|
|
34
109
|
const children = node.children || [];
|
|
35
110
|
|
|
111
|
+
// Tables
|
|
112
|
+
if (tag === 'table') {
|
|
113
|
+
return renderTable(node, indent);
|
|
114
|
+
}
|
|
115
|
+
|
|
36
116
|
// If only one child and no special handling for this tag, pass through transparently
|
|
37
117
|
const hasSpecialHandling = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'strong', 'b', 'em', 'i', 'code', 'a'].includes(tag);
|
|
38
118
|
if (children.length === 1 && !hasSpecialHandling) {
|
package/src/main.js
CHANGED
|
@@ -37,6 +37,9 @@ export function main(htmlInput, options = {}) {
|
|
|
37
37
|
return JSON.stringify(tree, (key, value) => key === 'parent' ? undefined : value, 2);
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
+
export const html2md4llm = main;
|
|
41
|
+
export default main;
|
|
42
|
+
|
|
40
43
|
function extractLargestList(node) {
|
|
41
44
|
let largest = null;
|
|
42
45
|
let maxCount = 0;
|
package/src/parser.js
CHANGED
|
@@ -108,6 +108,7 @@ export function parse(html, removeAttributes = []) {
|
|
|
108
108
|
// Post-processing: flatten pre/code, flatten containers, remove unwanted nodes
|
|
109
109
|
const voidElements = ['br', 'hr', 'img'];
|
|
110
110
|
const flattenableTags = ['div', 'span', 'section', 'p'];
|
|
111
|
+
const preserveEmptyElements = ['table', 'thead', 'tbody', 'tfoot', 'tr', 'th', 'td'];
|
|
111
112
|
|
|
112
113
|
function flattenPreCode(node) {
|
|
113
114
|
if (node.type === 'element' && (node.tag === 'pre' || node.tag === 'code')) {
|
|
@@ -123,6 +124,22 @@ export function parse(html, removeAttributes = []) {
|
|
|
123
124
|
}
|
|
124
125
|
}
|
|
125
126
|
|
|
127
|
+
/**
 * Decide whether a node carries content worth keeping.
 *
 * - Text nodes count when they contain non-whitespace characters.
 * - `img` and `a` elements always count; `br` and `hr` never do.
 * - Any other element counts when at least one descendant does.
 * - Every other node type does not count.
 *
 * @param {object} node - Parsed tree node.
 * @returns {boolean} True when the node has substantive content.
 */
function hasSubstantiveContent(node) {
  switch (node.type) {
    case 'text':
      return node.text.trim() !== '';

    case 'element': {
      const { tag, children } = node;
      if (tag === 'img' || tag === 'a') {
        return true;
      }
      if (tag === 'br' || tag === 'hr') {
        return false;
      }
      if (children == null) {
        return false;
      }
      return children.some((child) => hasSubstantiveContent(child));
    }

    default:
      return false;
  }
}
|
|
142
|
+
|
|
126
143
|
function removeUnwantedNodes(node) {
|
|
127
144
|
if (!node.children) return;
|
|
128
145
|
|
|
@@ -160,6 +177,13 @@ export function parse(html, removeAttributes = []) {
|
|
|
160
177
|
removeUnwantedNodes(child);
|
|
161
178
|
// Remove style attribute after filtering
|
|
162
179
|
delete child.attributes.style;
|
|
180
|
+
// Filter empty formatting elements (strong, em, b, i, code) without substantive content
|
|
181
|
+
const formattingElements = ['strong', 'em', 'b', 'i', 'code'];
|
|
182
|
+
if (formattingElements.includes(child.tag) && !hasSubstantiveContent(child)) {
|
|
183
|
+
return false;
|
|
184
|
+
}
|
|
185
|
+
// Keep structural table elements even when cell content is empty
|
|
186
|
+
if (preserveEmptyElements.includes(child.tag)) return true;
|
|
163
187
|
// Remove empty nodes
|
|
164
188
|
if (child.children && child.children.length === 0) return false;
|
|
165
189
|
}
|
package/plugin/main.js
DELETED
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
import { main } from '../src/main.js';
|
|
2
|
-
|
|
3
|
-
/**
 * Tool wrapper exposing the converter through `invoke` / `validate`.
 */
export default class Html2Md4LlmTool {
  /**
   * Convert the supplied HTML.
   *
   * @param {object} parameters - `html` (required), optional `outputFormat`
   *   (defaults to 'markdown') and optional `strategy`.
   * @returns {Promise<{result: string, format: string}>} The converted text
   *   and the format that was used.
   * @throws {Error} When `html` is missing or empty.
   */
  async invoke(parameters) {
    const { html, outputFormat = 'markdown', strategy } = parameters;

    if (!html) {
      throw new Error('html parameter is required');
    }

    // The strategy is forwarded only when one was given.
    const options = strategy ? { outputFormat, strategy } : { outputFormat };

    return {
      result: main(html, options),
      format: outputFormat
    };
  }

  /**
   * Check parameters without converting anything.
   *
   * @param {object} parameters - Same shape as for `invoke`.
   * @returns {Promise<{valid: boolean, error?: string}>} `{ valid: true }`,
   *   or `{ valid: false, error }` describing the first problem found.
   */
  async validate(parameters) {
    const reject = (error) => ({ valid: false, error });

    if (typeof parameters.html !== 'string' || parameters.html === '') {
      return reject('html must be a non-empty string');
    }

    const { outputFormat, strategy } = parameters;

    if (outputFormat && !['markdown', 'json'].includes(outputFormat)) {
      return reject('outputFormat must be "markdown" or "json"');
    }

    if (strategy && !['list', 'article'].includes(strategy)) {
      return reject('strategy must be "list" or "article"');
    }

    return { valid: true };
  }
}
|
package/plugin/manifest.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"version": "0.0.1",
|
|
3
|
-
"type": "tool",
|
|
4
|
-
"author": "kaiye",
|
|
5
|
-
"name": "html2md4llm",
|
|
6
|
-
"label": {
|
|
7
|
-
"en_US": "HTML to Markdown/JSON",
|
|
8
|
-
"zh_Hans": "HTML 转 Markdown/JSON"
|
|
9
|
-
},
|
|
10
|
-
"description": {
|
|
11
|
-
"en_US": "Convert HTML to clean Markdown or JSON format, optimized for LLM processing",
|
|
12
|
-
"zh_Hans": "将 HTML 转换为干净的 Markdown 或 JSON 格式,针对 LLM 处理优化"
|
|
13
|
-
},
|
|
14
|
-
"icon": "icon.svg",
|
|
15
|
-
"tags": ["html", "markdown", "json", "converter", "parser"]
|
|
16
|
-
}
|