html2md4llm 1.1.3 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/bin/html2md4llm.js +19 -0
- package/package.json +1 -1
- package/src/main.js +27 -1
package/README.md
CHANGED
|
@@ -48,6 +48,7 @@ cat ./input.html | npx html2md4llm > ./output.md
|
|
|
48
48
|
- `options.outputFormat: 'markdown' | 'json'`:默认 `'markdown'`
|
|
49
49
|
- `options.strategy: 'list' | 'article'`:可选提取策略
|
|
50
50
|
- `options.removeAttributes: string[]`:按规则移除属性,如 `['aria-*', 'role']`
|
|
51
|
+
- `options.unescapeHTML: 'auto' | true | false`:默认 `'auto'`。`auto` 时若输入去首尾空白后以 `&lt;` 开头且 `&gt;` 结尾,会先做一次全量 unescape。
|
|
51
52
|
|
|
52
53
|
返回值:`string`(Markdown 文本或 JSON 字符串)
|
|
53
54
|
|
|
@@ -63,6 +64,7 @@ html2md4llm - [output.md] [options]
|
|
|
63
64
|
- `--json`:等价于 `--format json`
|
|
64
65
|
- `--markdown`:等价于 `--format markdown`
|
|
65
66
|
- `-s, --strategy <list|article>`:内容提取策略
|
|
67
|
+
- `-u, --unescape-html <auto|true|false>`:输入 HTML 反转义策略(默认 `auto`)
|
|
66
68
|
- `-r, --remove-attrs <attrs>`:逗号分隔,如 `aria-*,role`
|
|
67
69
|
- `-h, --help`:查看帮助
|
|
68
70
|
- `-v, --version`:查看版本
|
|
@@ -75,6 +77,7 @@ html2md4llm - [output.md] [options]
|
|
|
75
77
|
2. 需要结构化消费时使用 `outputFormat: 'json'`。
|
|
76
78
|
3. 需要最简正文时按场景加 `strategy: 'article'` 或 `strategy: 'list'`。
|
|
77
79
|
4. 需要清理无关属性时传 `removeAttributes`,例如 `['aria-*', 'role', 'data-*']`。
|
|
80
|
+
5. 输入可能是转义 HTML(如 `&lt;h1&gt;Hello&lt;/h1&gt;`)时,保持默认 `unescapeHTML: 'auto'`;若要强制关闭则传 `false`。
|
|
78
81
|
|
|
79
82
|
## 开发
|
|
80
83
|
|
package/bin/html2md4llm.js
CHANGED
|
@@ -21,6 +21,7 @@ Options:
|
|
|
21
21
|
--json Shortcut for --format json
|
|
22
22
|
--markdown Shortcut for --format markdown
|
|
23
23
|
-s, --strategy <list|article> Extraction strategy
|
|
24
|
+
-u, --unescape-html <mode> Input unescape mode: auto|true|false (default: auto)
|
|
24
25
|
-r, --remove-attrs <attrs> Comma-separated attrs (e.g. aria-*,role)
|
|
25
26
|
-h, --help Show help
|
|
26
27
|
-v, --version Show version`;
|
|
@@ -34,6 +35,7 @@ function parseArgs(argv) {
|
|
|
34
35
|
outputFile: undefined,
|
|
35
36
|
outputFormat: 'markdown',
|
|
36
37
|
strategy: undefined,
|
|
38
|
+
unescapeHTML: 'auto',
|
|
37
39
|
removeAttributes: []
|
|
38
40
|
};
|
|
39
41
|
|
|
@@ -86,6 +88,12 @@ function parseArgs(argv) {
|
|
|
86
88
|
continue;
|
|
87
89
|
}
|
|
88
90
|
|
|
91
|
+
if (arg === '-u' || arg === '--unescape-html') {
|
|
92
|
+
parsed.unescapeHTML = takeValue(arg, i);
|
|
93
|
+
i++;
|
|
94
|
+
continue;
|
|
95
|
+
}
|
|
96
|
+
|
|
89
97
|
if (
|
|
90
98
|
arg === '-r' ||
|
|
91
99
|
arg === '--remove-attrs' ||
|
|
@@ -156,6 +164,11 @@ async function run() {
|
|
|
156
164
|
throw new Error('Invalid --strategy value. Use "list" or "article".');
|
|
157
165
|
}
|
|
158
166
|
|
|
167
|
+
const unescapeMode = String(args.unescapeHTML).toLowerCase();
|
|
168
|
+
if (unescapeMode !== 'auto' && unescapeMode !== 'true' && unescapeMode !== 'false') {
|
|
169
|
+
throw new Error('Invalid --unescape-html value. Use "auto", "true", or "false".');
|
|
170
|
+
}
|
|
171
|
+
|
|
159
172
|
const hasStdin = !process.stdin.isTTY;
|
|
160
173
|
let htmlInput;
|
|
161
174
|
|
|
@@ -175,6 +188,12 @@ async function run() {
|
|
|
175
188
|
options.strategy = args.strategy;
|
|
176
189
|
}
|
|
177
190
|
|
|
191
|
+
if (unescapeMode === 'auto') {
|
|
192
|
+
options.unescapeHTML = 'auto';
|
|
193
|
+
} else {
|
|
194
|
+
options.unescapeHTML = unescapeMode === 'true';
|
|
195
|
+
}
|
|
196
|
+
|
|
178
197
|
if (args.removeAttributes.length > 0) {
|
|
179
198
|
options.removeAttributes = args.removeAttributes;
|
|
180
199
|
}
|
package/package.json
CHANGED
package/src/main.js
CHANGED
|
@@ -18,8 +18,17 @@ export function main(htmlInput, options = {}) {
|
|
|
18
18
|
throw new Error('options.strategy must be \'list\', \'article\', or undefined');
|
|
19
19
|
}
|
|
20
20
|
|
|
21
|
+
const unescapeHTML = options.unescapeHTML ?? 'auto';
|
|
22
|
+
if (unescapeHTML !== 'auto' && unescapeHTML !== true && unescapeHTML !== false) {
|
|
23
|
+
throw new Error('options.unescapeHTML must be \'auto\', true, or false');
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
const normalizedHtmlInput = shouldUnescapeHtml(htmlInput, unescapeHTML)
|
|
27
|
+
? unescapeHtml(htmlInput)
|
|
28
|
+
: htmlInput;
|
|
29
|
+
|
|
21
30
|
// Parse HTML to virtual DOM
|
|
22
|
-
let tree = parse(htmlInput, options.removeAttributes);
|
|
31
|
+
let tree = parse(normalizedHtmlInput, options.removeAttributes);
|
|
23
32
|
|
|
24
33
|
// Apply extraction strategy
|
|
25
34
|
if (strategy === 'list') {
|
|
@@ -40,6 +49,23 @@ export function main(htmlInput, options = {}) {
|
|
|
40
49
|
export const html2md4llm = main;
|
|
41
50
|
export default main;
|
|
42
51
|
|
|
52
|
+
function shouldUnescapeHtml(htmlInput, mode) {
|
|
53
|
+
if (mode === true) return true;
|
|
54
|
+
if (mode === false) return false;
|
|
55
|
+
|
|
56
|
+
const trimmed = htmlInput.trim();
|
|
57
|
+
return trimmed.startsWith('&lt;') && trimmed.endsWith('&gt;');
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
function unescapeHtml(str) {
|
|
61
|
+
return str
|
|
62
|
+
.replace(/&lt;/g, '<')
|
|
63
|
+
.replace(/&gt;/g, '>')
|
|
64
|
+
.replace(/&quot;/g, '"')
|
|
65
|
+
.replace(/&#39;/g, "'")
|
|
66
|
+
.replace(/&amp;/g, '&');
|
|
67
|
+
}
|
|
68
|
+
|
|
43
69
|
function extractLargestList(node) {
|
|
44
70
|
let largest = null;
|
|
45
71
|
let maxCount = 0;
|