html2md4llm 1.1.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/bin/html2md4llm.js +19 -0
- package/package.json +1 -1
- package/src/generators/markdown.js +3 -2
- package/src/main.js +27 -1
package/README.md
CHANGED
|
@@ -48,6 +48,7 @@ cat ./input.html | npx html2md4llm > ./output.md
|
|
|
48
48
|
- `options.outputFormat: 'markdown' | 'json'`:默认 `'markdown'`
|
|
49
49
|
- `options.strategy: 'list' | 'article'`:可选提取策略
|
|
50
50
|
- `options.removeAttributes: string[]`:按规则移除属性,如 `['aria-*', 'role']`
|
|
51
|
+
- `options.unescapeHTML: 'auto' | true | false`:默认 `'auto'`。`auto` 时若输入去首尾空白后以 `&lt;` 开头且 `&gt;` 结尾,会先做一次全量 unescape。
|
|
51
52
|
|
|
52
53
|
返回值:`string`(Markdown 文本或 JSON 字符串)
|
|
53
54
|
|
|
@@ -63,6 +64,7 @@ html2md4llm - [output.md] [options]
|
|
|
63
64
|
- `--json`:等价于 `--format json`
|
|
64
65
|
- `--markdown`:等价于 `--format markdown`
|
|
65
66
|
- `-s, --strategy <list|article>`:内容提取策略
|
|
67
|
+
- `-u, --unescape-html <auto|true|false>`:输入 HTML 反转义策略(默认 `auto`)
|
|
66
68
|
- `-r, --remove-attrs <attrs>`:逗号分隔,如 `aria-*,role`
|
|
67
69
|
- `-h, --help`:查看帮助
|
|
68
70
|
- `-v, --version`:查看版本
|
|
@@ -75,6 +77,7 @@ html2md4llm - [output.md] [options]
|
|
|
75
77
|
2. 需要结构化消费时使用 `outputFormat: 'json'`。
|
|
76
78
|
3. 需要最简正文时按场景加 `strategy: 'article'` 或 `strategy: 'list'`。
|
|
77
79
|
4. 需要清理无关属性时传 `removeAttributes`,例如 `['aria-*', 'role', 'data-*']`。
|
|
80
|
+
5. 输入可能是转义 HTML(如 `&lt;h1&gt;Hello&lt;/h1&gt;`)时,保持默认 `unescapeHTML: 'auto'`;若要强制关闭则传 `false`。
|
|
78
81
|
|
|
79
82
|
## 开发
|
|
80
83
|
|
package/bin/html2md4llm.js
CHANGED
|
@@ -21,6 +21,7 @@ Options:
|
|
|
21
21
|
--json Shortcut for --format json
|
|
22
22
|
--markdown Shortcut for --format markdown
|
|
23
23
|
-s, --strategy <list|article> Extraction strategy
|
|
24
|
+
-u, --unescape-html <mode> Input unescape mode: auto|true|false (default: auto)
|
|
24
25
|
-r, --remove-attrs <attrs> Comma-separated attrs (e.g. aria-*,role)
|
|
25
26
|
-h, --help Show help
|
|
26
27
|
-v, --version Show version`;
|
|
@@ -34,6 +35,7 @@ function parseArgs(argv) {
|
|
|
34
35
|
outputFile: undefined,
|
|
35
36
|
outputFormat: 'markdown',
|
|
36
37
|
strategy: undefined,
|
|
38
|
+
unescapeHTML: 'auto',
|
|
37
39
|
removeAttributes: []
|
|
38
40
|
};
|
|
39
41
|
|
|
@@ -86,6 +88,12 @@ function parseArgs(argv) {
|
|
|
86
88
|
continue;
|
|
87
89
|
}
|
|
88
90
|
|
|
91
|
+
if (arg === '-u' || arg === '--unescape-html') {
|
|
92
|
+
parsed.unescapeHTML = takeValue(arg, i);
|
|
93
|
+
i++;
|
|
94
|
+
continue;
|
|
95
|
+
}
|
|
96
|
+
|
|
89
97
|
if (
|
|
90
98
|
arg === '-r' ||
|
|
91
99
|
arg === '--remove-attrs' ||
|
|
@@ -156,6 +164,11 @@ async function run() {
|
|
|
156
164
|
throw new Error('Invalid --strategy value. Use "list" or "article".');
|
|
157
165
|
}
|
|
158
166
|
|
|
167
|
+
const unescapeMode = String(args.unescapeHTML).toLowerCase();
|
|
168
|
+
if (unescapeMode !== 'auto' && unescapeMode !== 'true' && unescapeMode !== 'false') {
|
|
169
|
+
throw new Error('Invalid --unescape-html value. Use "auto", "true", or "false".');
|
|
170
|
+
}
|
|
171
|
+
|
|
159
172
|
const hasStdin = !process.stdin.isTTY;
|
|
160
173
|
let htmlInput;
|
|
161
174
|
|
|
@@ -175,6 +188,12 @@ async function run() {
|
|
|
175
188
|
options.strategy = args.strategy;
|
|
176
189
|
}
|
|
177
190
|
|
|
191
|
+
if (unescapeMode === 'auto') {
|
|
192
|
+
options.unescapeHTML = 'auto';
|
|
193
|
+
} else {
|
|
194
|
+
options.unescapeHTML = unescapeMode === 'true';
|
|
195
|
+
}
|
|
196
|
+
|
|
178
197
|
if (args.removeAttributes.length > 0) {
|
|
179
198
|
options.removeAttributes = args.removeAttributes;
|
|
180
199
|
}
|
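package/package.json
CHANGED
(hunk not captured in this extract; per the summary above it is a one-line change, +1 -1, matching the 1.1.2 → 1.2.0 version bump)
package/src/generators/markdown.js
CHANGED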
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
const inlineElements = ['span', 'a', 'strong', 'em', 'code', 'b', 'i'];
|
|
2
|
-
const blockElements = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'div', 'section', 'table'];
|
|
2
|
+
const blockElements = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'hr', 'img', 'div', 'section', 'table'];
|
|
3
3
|
const tableSections = ['thead', 'tbody', 'tfoot'];
|
|
4
4
|
|
|
5
5
|
function normalizeTableCell(text) {
|
|
@@ -114,7 +114,7 @@ export function generate(node, indent = 0) {
|
|
|
114
114
|
}
|
|
115
115
|
|
|
116
116
|
// If only one child and no special handling for this tag, pass through transparently
|
|
117
|
-
const hasSpecialHandling = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'strong', 'b', 'em', 'i', 'code', 'a'].includes(tag);
|
|
117
|
+
const hasSpecialHandling = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'hr', 'strong', 'b', 'em', 'i', 'code', 'a'].includes(tag);
|
|
118
118
|
if (children.length === 1 && !hasSpecialHandling) {
|
|
119
119
|
return generate(children[0], indent);
|
|
120
120
|
}
|
|
@@ -206,6 +206,7 @@ export function generate(node, indent = 0) {
|
|
|
206
206
|
|
|
207
207
|
// Line break
|
|
208
208
|
if (tag === 'br') return '\n';
|
|
209
|
+
if (tag === 'hr') return '---';
|
|
209
210
|
|
|
210
211
|
// Default: just return children
|
|
211
212
|
return childText;
|
package/src/main.js
CHANGED
|
@@ -18,8 +18,17 @@ export function main(htmlInput, options = {}) {
|
|
|
18
18
|
throw new Error('options.strategy must be \'list\', \'article\', or undefined');
|
|
19
19
|
}
|
|
20
20
|
|
|
21
|
+
const unescapeHTML = options.unescapeHTML ?? 'auto';
|
|
22
|
+
if (unescapeHTML !== 'auto' && unescapeHTML !== true && unescapeHTML !== false) {
|
|
23
|
+
throw new Error('options.unescapeHTML must be \'auto\', true, or false');
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
const normalizedHtmlInput = shouldUnescapeHtml(htmlInput, unescapeHTML)
|
|
27
|
+
? unescapeHtml(htmlInput)
|
|
28
|
+
: htmlInput;
|
|
29
|
+
|
|
21
30
|
// Parse HTML to virtual DOM
|
|
22
|
-
let tree = parse(htmlInput, options.removeAttributes);
|
|
31
|
+
let tree = parse(normalizedHtmlInput, options.removeAttributes);
|
|
23
32
|
|
|
24
33
|
// Apply extraction strategy
|
|
25
34
|
if (strategy === 'list') {
|
|
@@ -40,6 +49,23 @@ export function main(htmlInput, options = {}) {
|
|
|
40
49
|
export const html2md4llm = main;
|
|
41
50
|
export default main;
|
|
42
51
|
|
|
52
|
+
/**
 * Decide whether the raw input must be HTML-unescaped before it is parsed.
 *
 * @param {string} htmlInput - Raw input text.
 * @param {'auto'|boolean} mode - `true` always unescapes, `false` never does;
 *   any other value falls through to auto-detection.
 * @returns {boolean} `true` when the input should be unescaped first.
 */
function shouldUnescapeHtml(htmlInput, mode) {
  if (mode === true) return true;
  if (mode === false) return false;

  // Auto-detection: treat the input as escaped HTML only when, ignoring
  // surrounding whitespace, it is wrapped in *escaped* angle brackets.
  // Testing for literal `<` / `>` here would match every ordinary HTML
  // document and cause entities that are meant to stay text (for example
  // `&lt;script&gt;` inside a paragraph) to be decoded into real markup.
  const trimmed = htmlInput.trim();
  return trimmed.startsWith('&lt;') && trimmed.endsWith('&gt;');
}
|
|
59
|
+
|
|
60
|
+
/**
 * Remove one level of HTML escaping from a string by decoding the basic
 * entities for `<`, `>`, `"`, `'` and `&`.
 *
 * @param {string} str - Text containing HTML entities.
 * @returns {string} The text with those entities replaced by their characters.
 */
function unescapeHtml(str) {
  return str
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    // Escapers disagree on how to spell the apostrophe; accept the decimal,
    // hexadecimal and named forms.
    .replace(/&(?:#39|#x27|apos);/g, "'")
    // `&amp;` must be decoded last: doing it first would turn `&amp;lt;`
    // into `&lt;` and then into `<`, removing two levels of escaping.
    .replace(/&amp;/g, '&');
}
|
|
68
|
+
|
|
43
69
|
function extractLargestList(node) {
|
|
44
70
|
let largest = null;
|
|
45
71
|
let maxCount = 0;
|