npm - html2md4llm - Versions diffs - 1.1.2 → 1.2.0 - Mend

html2md4llm 1.1.2 → 1.2.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Files changed (5)

package/README.md +3 -0
package/bin/html2md4llm.js +19 -0
package/package.json +1 -1
package/src/generators/markdown.js +3 -2
package/src/main.js +27 -1

package/README.md CHANGED Viewed

@@ -48,6 +48,7 @@ cat ./input.html | npx html2md4llm > ./output.md
 - `options.outputFormat: 'markdown' | 'json'`：默认 `'markdown'`
 - `options.strategy: 'list' | 'article'`：可选提取策略
 - `options.removeAttributes: string[]`：按规则移除属性，如 `['aria-*', 'role']`
+- `options.unescapeHTML: 'auto' | true | false`：默认 `'auto'`。`auto` 时若输入去首尾空白后以 `&lt;` 开头且 `&gt;` 结尾，会先做一次全量 unescape。
 返回值：`string`（Markdown 文本或 JSON 字符串）
@@ -63,6 +64,7 @@ html2md4llm - [output.md] [options]
 - `--json`：等价于 `--format json`
 - `--markdown`：等价于 `--format markdown`
 - `-s, --strategy <list|article>`：内容提取策略
+- `-u, --unescape-html <auto|true|false>`：输入 HTML 反转义策略（默认 `auto`）
 - `-r, --remove-attrs <attrs>`：逗号分隔，如 `aria-*,role`
 - `-h, --help`：查看帮助
 - `-v, --version`：查看版本
@@ -75,6 +77,7 @@ html2md4llm - [output.md] [options]
 2. 需要结构化消费时使用 `outputFormat: 'json'`。
 3. 需要最简正文时按场景加 `strategy: 'article'` 或 `strategy: 'list'`。
 4. 需要清理无关属性时传 `removeAttributes`，例如 `['aria-*', 'role', 'data-*']`。
+5. 输入可能是转义 HTML（如 `&lt;h1&gt;Hello&lt;/h1&gt;`）时，保持默认 `unescapeHTML: 'auto'`；若要强制关闭则传 `false`。
 ## 开发

package/bin/html2md4llm.js CHANGED Viewed

@@ -21,6 +21,7 @@ Options:
   --json                         Shortcut for --format json
   --markdown                     Shortcut for --format markdown
   -s, --strategy <list|article> Extraction strategy
+  -u, --unescape-html <mode>    Input unescape mode: auto|true|false (default: auto)
   -r, --remove-attrs <attrs>    Comma-separated attrs (e.g. aria-*,role)
   -h, --help                    Show help
   -v, --version                 Show version`;
@@ -34,6 +35,7 @@ function parseArgs(argv) {
     outputFile: undefined,
     outputFormat: 'markdown',
     strategy: undefined,
+    unescapeHTML: 'auto',
     removeAttributes: []
   };
@@ -86,6 +88,12 @@ function parseArgs(argv) {
       continue;
     }
+    if (arg === '-u' || arg === '--unescape-html') {
+      parsed.unescapeHTML = takeValue(arg, i);
+      i++;
+      continue;
+    }
     if (
       arg === '-r' ||
       arg === '--remove-attrs' ||
@@ -156,6 +164,11 @@ async function run() {
     throw new Error('Invalid --strategy value. Use "list" or "article".');
   }
+  const unescapeMode = String(args.unescapeHTML).toLowerCase();
+  if (unescapeMode !== 'auto' && unescapeMode !== 'true' && unescapeMode !== 'false') {
+    throw new Error('Invalid --unescape-html value. Use "auto", "true", or "false".');
+  }
   const hasStdin = !process.stdin.isTTY;
   let htmlInput;
@@ -175,6 +188,12 @@ async function run() {
     options.strategy = args.strategy;
   }
+  if (unescapeMode === 'auto') {
+    options.unescapeHTML = 'auto';
+  } else {
+    options.unescapeHTML = unescapeMode === 'true';
+  }
   if (args.removeAttributes.length > 0) {
     options.removeAttributes = args.removeAttributes;
   }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "html2md4llm",
-  "version": "1.1.2",
+  "version": "1.2.0",
   "description": "Convert HTML to clean Markdown or JSON, optimized for LLM processing",
   "type": "module",
   "main": "src/main.js",

package/src/generators/markdown.js CHANGED Viewed

@@ -1,5 +1,5 @@
 const inlineElements = ['span', 'a', 'strong', 'em', 'code', 'b', 'i'];
-const blockElements = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'div', 'section', 'table'];
+const blockElements = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'hr', 'img', 'div', 'section', 'table'];
 const tableSections = ['thead', 'tbody', 'tfoot'];
 function normalizeTableCell(text) {
@@ -114,7 +114,7 @@ export function generate(node, indent = 0) {
   }
   // If only one child and no special handling for this tag, pass through transparently
-  const hasSpecialHandling = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'strong', 'b', 'em', 'i', 'code', 'a'].includes(tag);
+  const hasSpecialHandling = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'hr', 'strong', 'b', 'em', 'i', 'code', 'a'].includes(tag);
   if (children.length === 1 && !hasSpecialHandling) {
     return generate(children[0], indent);
   }
@@ -206,6 +206,7 @@ export function generate(node, indent = 0) {
   // Line break
   if (tag === 'br') return '\n';
+  if (tag === 'hr') return '---';
   // Default: just return children
   return childText;

package/src/main.js CHANGED Viewed

@@ -18,8 +18,17 @@ export function main(htmlInput, options = {}) {
     throw new Error('options.strategy must be \'list\', \'article\', or undefined');
   }
+  const unescapeHTML = options.unescapeHTML ?? 'auto';
+  if (unescapeHTML !== 'auto' && unescapeHTML !== true && unescapeHTML !== false) {
+    throw new Error('options.unescapeHTML must be \'auto\', true, or false');
+  }
+  const normalizedHtmlInput = shouldUnescapeHtml(htmlInput, unescapeHTML)
+    ? unescapeHtml(htmlInput)
+    : htmlInput;
   // Parse HTML to virtual DOM
-  let tree = parse(htmlInput, options.removeAttributes);
+  let tree = parse(normalizedHtmlInput, options.removeAttributes);
   // Apply extraction strategy
   if (strategy === 'list') {
@@ -40,6 +49,23 @@ export function main(htmlInput, options = {}) {
 export const html2md4llm = main;
 export default main;
+function shouldUnescapeHtml(htmlInput, mode) {
+  if (mode === true) return true;
+  if (mode === false) return false;
+  const trimmed = htmlInput.trim();
+  return trimmed.startsWith('&lt;') && trimmed.endsWith('&gt;');
+}
+function unescapeHtml(str) {
+  return str
+    .replace(/&lt;/g, '<')
+    .replace(/&gt;/g, '>')
+    .replace(/&quot;/g, '"')
+    .replace(/&#39;/g, "'")
+    .replace(/&amp;/g, '&');
+}
 function extractLargestList(node) {
   let largest = null;
   let maxCount = 0;