html2md4llm 1.1.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -48,6 +48,7 @@ cat ./input.html | npx html2md4llm > ./output.md
48
48
  - `options.outputFormat: 'markdown' | 'json'`:默认 `'markdown'`
49
49
  - `options.strategy: 'list' | 'article'`:可选提取策略
50
50
  - `options.removeAttributes: string[]`:按规则移除属性,如 `['aria-*', 'role']`
51
+ - `options.unescapeHTML: 'auto' | true | false`:默认 `'auto'`。`auto` 时若输入去首尾空白后以 `&lt;` 开头且以 `&gt;` 结尾(即整体是被转义过的 HTML),会先做一次全量 unescape;`true` 表示始终 unescape,`false` 表示从不 unescape。
51
52
 
52
53
  返回值:`string`(Markdown 文本或 JSON 字符串)
53
54
 
@@ -63,6 +64,7 @@ html2md4llm - [output.md] [options]
63
64
  - `--json`:等价于 `--format json`
64
65
  - `--markdown`:等价于 `--format markdown`
65
66
  - `-s, --strategy <list|article>`:内容提取策略
67
+ - `-u, --unescape-html <auto|true|false>`:输入 HTML 反转义策略(默认 `auto`)
66
68
  - `-r, --remove-attrs <attrs>`:逗号分隔,如 `aria-*,role`
67
69
  - `-h, --help`:查看帮助
68
70
  - `-v, --version`:查看版本
@@ -75,6 +77,7 @@ html2md4llm - [output.md] [options]
75
77
  2. 需要结构化消费时使用 `outputFormat: 'json'`。
76
78
  3. 需要最简正文时按场景加 `strategy: 'article'` 或 `strategy: 'list'`。
77
79
  4. 需要清理无关属性时传 `removeAttributes`,例如 `['aria-*', 'role', 'data-*']`。
80
+ 5. 输入可能是转义 HTML(如 `&lt;h1&gt;Hello&lt;/h1&gt;`)时,保持默认 `unescapeHTML: 'auto'`;若要强制关闭则传 `false`。
78
81
 
79
82
  ## 开发
80
83
 
@@ -21,6 +21,7 @@ Options:
21
21
  --json Shortcut for --format json
22
22
  --markdown Shortcut for --format markdown
23
23
  -s, --strategy <list|article> Extraction strategy
24
+ -u, --unescape-html <mode> Input unescape mode: auto|true|false (default: auto)
24
25
  -r, --remove-attrs <attrs> Comma-separated attrs (e.g. aria-*,role)
25
26
  -h, --help Show help
26
27
  -v, --version Show version`;
@@ -34,6 +35,7 @@ function parseArgs(argv) {
34
35
  outputFile: undefined,
35
36
  outputFormat: 'markdown',
36
37
  strategy: undefined,
38
+ unescapeHTML: 'auto',
37
39
  removeAttributes: []
38
40
  };
39
41
 
@@ -86,6 +88,12 @@ function parseArgs(argv) {
86
88
  continue;
87
89
  }
88
90
 
91
+ if (arg === '-u' || arg === '--unescape-html') {
92
+ parsed.unescapeHTML = takeValue(arg, i);
93
+ i++;
94
+ continue;
95
+ }
96
+
89
97
  if (
90
98
  arg === '-r' ||
91
99
  arg === '--remove-attrs' ||
@@ -156,6 +164,11 @@ async function run() {
156
164
  throw new Error('Invalid --strategy value. Use "list" or "article".');
157
165
  }
158
166
 
167
+ const unescapeMode = String(args.unescapeHTML).toLowerCase();
168
+ if (unescapeMode !== 'auto' && unescapeMode !== 'true' && unescapeMode !== 'false') {
169
+ throw new Error('Invalid --unescape-html value. Use "auto", "true", or "false".');
170
+ }
171
+
159
172
  const hasStdin = !process.stdin.isTTY;
160
173
  let htmlInput;
161
174
 
@@ -175,6 +188,12 @@ async function run() {
175
188
  options.strategy = args.strategy;
176
189
  }
177
190
 
191
+ if (unescapeMode === 'auto') {
192
+ options.unescapeHTML = 'auto';
193
+ } else {
194
+ options.unescapeHTML = unescapeMode === 'true';
195
+ }
196
+
178
197
  if (args.removeAttributes.length > 0) {
179
198
  options.removeAttributes = args.removeAttributes;
180
199
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "html2md4llm",
3
- "version": "1.1.2",
3
+ "version": "1.2.0",
4
4
  "description": "Convert HTML to clean Markdown or JSON, optimized for LLM processing",
5
5
  "type": "module",
6
6
  "main": "src/main.js",
@@ -1,5 +1,5 @@
1
1
  const inlineElements = ['span', 'a', 'strong', 'em', 'code', 'b', 'i'];
2
- const blockElements = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'div', 'section', 'table'];
2
+ const blockElements = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'hr', 'img', 'div', 'section', 'table'];
3
3
  const tableSections = ['thead', 'tbody', 'tfoot'];
4
4
 
5
5
  function normalizeTableCell(text) {
@@ -114,7 +114,7 @@ export function generate(node, indent = 0) {
114
114
  }
115
115
 
116
116
  // If only one child and no special handling for this tag, pass through transparently
117
- const hasSpecialHandling = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'strong', 'b', 'em', 'i', 'code', 'a'].includes(tag);
117
+ const hasSpecialHandling = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'hr', 'strong', 'b', 'em', 'i', 'code', 'a'].includes(tag);
118
118
  if (children.length === 1 && !hasSpecialHandling) {
119
119
  return generate(children[0], indent);
120
120
  }
@@ -206,6 +206,7 @@ export function generate(node, indent = 0) {
206
206
 
207
207
  // Line break
208
208
  if (tag === 'br') return '\n';
209
+ if (tag === 'hr') return '---';
209
210
 
210
211
  // Default: just return children
211
212
  return childText;
package/src/main.js CHANGED
@@ -18,8 +18,17 @@ export function main(htmlInput, options = {}) {
18
18
  throw new Error('options.strategy must be \'list\', \'article\', or undefined');
19
19
  }
20
20
 
21
+ const unescapeHTML = options.unescapeHTML ?? 'auto';
22
+ if (unescapeHTML !== 'auto' && unescapeHTML !== true && unescapeHTML !== false) {
23
+ throw new Error('options.unescapeHTML must be \'auto\', true, or false');
24
+ }
25
+
26
+ const normalizedHtmlInput = shouldUnescapeHtml(htmlInput, unescapeHTML)
27
+ ? unescapeHtml(htmlInput)
28
+ : htmlInput;
29
+
21
30
  // Parse HTML to virtual DOM
22
- let tree = parse(htmlInput, options.removeAttributes);
31
+ let tree = parse(normalizedHtmlInput, options.removeAttributes);
23
32
 
24
33
  // Apply extraction strategy
25
34
  if (strategy === 'list') {
@@ -40,6 +49,23 @@ export function main(htmlInput, options = {}) {
40
49
  export const html2md4llm = main;
41
50
  export default main;
42
51
 
52
/**
 * Decides whether the raw input should be HTML-unescaped before it is parsed.
 *
 * @param {string} htmlInput - Raw input text.
 * @param {'auto' | boolean} mode - `true` always unescapes, `false` never
 *   does; any other value is treated as auto-detection.
 * @returns {boolean} `true` when the input should be unescaped first.
 */
function shouldUnescapeHtml(htmlInput, mode) {
  if (mode === true) return true;
  if (mode === false) return false;

  // Auto-detection inspects text only. For a non-string input, answer "no"
  // instead of failing here with an opaque "trim is not a function" error,
  // so the value is left untouched for whatever consumes it next.
  if (typeof htmlInput !== 'string') return false;

  // Escaped markup is recognised by an escaped "<" at the very start and an
  // escaped ">" at the very end, ignoring surrounding whitespace.
  const trimmed = htmlInput.trim();
  return trimmed.startsWith('&lt;') && trimmed.endsWith('&gt;');
}
59
+
60
/**
 * Reverses one level of HTML escaping for the five markup-significant
 * characters: `<`, `>`, `"`, `'` and `&`.
 *
 * Each character is recognised in its named form (`&lt;`, `&gt;`, `&quot;`,
 * `&apos;`, `&amp;`) and in its decimal or hexadecimal numeric form
 * (e.g. `&#39;`, `&#039;`, `&#x27;`). Every other character reference
 * (`&nbsp;`, `&#169;`, ...) is left exactly as written.
 *
 * @param {string} str - Escaped text.
 * @returns {string} The text with one level of escaping removed.
 */
function unescapeHtml(str) {
  const namedEntities = { lt: '<', gt: '>', quot: '"', apos: "'", amp: '&' };
  // Code points of the same five characters, used for numeric references.
  const numericEntities = new Map([
    [60, '<'],
    [62, '>'],
    [34, '"'],
    [39, "'"],
    [38, '&'],
  ]);

  // A single pass guarantees exactly one level of unescaping: the "&"
  // produced from "&amp;" is never re-scanned, so "&amp;lt;" becomes "&lt;"
  // rather than "<".
  return str.replace(
    /&(?:(lt|gt|quot|apos|amp)|#(\d+)|#[xX]([0-9a-fA-F]+));/g,
    (match, name, decimal, hex) => {
      if (name !== undefined) return namedEntities[name];

      const codePoint =
        decimal !== undefined
          ? Number.parseInt(decimal, 10)
          : Number.parseInt(hex, 16);

      // Unknown or out-of-range references fall through unchanged.
      return numericEntities.get(codePoint) ?? match;
    },
  );
}
68
+
43
69
  function extractLargestList(node) {
44
70
  let largest = null;
45
71
  let maxCount = 0;