npm - html2md4llm - Versions diffs - 1.0.0 → 1.1.2 - Mend

html2md4llm 1.0.0 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/LICENSE +21 -0
package/README.md +56 -43
package/bin/html2md4llm.js +199 -0
package/package.json +15 -12
package/src/generators/markdown.js +81 -1
package/src/main.js +3 -0
package/src/parser.js +24 -0
package/plugin/main.js +0 -41
package/plugin/manifest.json +0 -16

package/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 kaiye
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

package/README.md CHANGED Viewed

@@ -1,73 +1,86 @@
 # html2md4llm
-Convert HTML to clean Markdown or JSON format, optimized for LLM processing.
+把 HTML 转成更干净的 Markdown 或 JSON，便于人和 AI 直接消费。
-## Features
+## 安装
-- Convert HTML to Markdown or JSON
-- Intelligent content extraction (list/article modes)
-- Automatic HTML cleaning (removes scripts, styles, iframes)
-- Preserves metadata (title, description, keywords)
-- Zero dependencies - uses only Node.js built-in modules
+```bash
+npm install html2md4llm
+```
-## Installation
+## 最常用方式
-### NPM Package
+### 1) 在代码中调用
-```bash
-npm install html2md4llm
+```js
+import { main } from 'html2md4llm';
+const html = '<h1>Hello</h1><p>World</p>';
+const md = main(html); // 默认输出 markdown
+console.log(md);
 ```
-### Standalone Script
+输出 JSON：
+```js
+const json = main('<h1>Hello</h1>', { outputFormat: 'json' });
+```
-Download `dist/html2md4llm.min.js` and include it directly:
+### 2) 用 npx 当命令行工具
-```html
-<script src="html2md4llm.min.js"></script>
-<script>
-  const result = html2md4llm('<h1>Hello</h1>');
-</script>
+本地文件转 `.md`：
+```bash
+npx html2md4llm ./input.html ./output.md
 ```
-### Dify Plugin
+stdin -> stdout（Linux 管道）：
-Install from the plugin marketplace or import from this repository's `plugin/` directory.
+```bash
+cat ./input.html | npx html2md4llm > ./output.md
+```
-## Usage
+## API（给代码/AI 调用）
-```javascript
-import { main } from 'html2md4llm';
+`main(htmlInput, options?)`
-// Basic conversion to Markdown
-const markdown = main('<h1>Hello</h1><p>World</p>');
+- `htmlInput: string`：HTML 字符串
+- `options.outputFormat: 'markdown' | 'json'`：默认 `'markdown'`
+- `options.strategy: 'list' | 'article'`：可选提取策略
+- `options.removeAttributes: string[]`：按规则移除属性，如 `['aria-*', 'role']`
-// Convert to JSON
-const json = main('<h1>Hello</h1>', { outputFormat: 'json' });
+返回值：`string`（Markdown 文本或 JSON 字符串）
-// Extract largest list
-const list = main(html, { strategy: 'list' });
+## CLI 参数
-// Extract article content
-const article = main(html, { strategy: 'article' });
+```bash
+html2md4llm <input.html> [output.md] [options]
+html2md4llm - [output.md] [options]
 ```
-## API
+- `-o, --output <file>`：输出文件路径
+- `-f, --format <markdown|json>`：输出格式
+- `--json`：等价于 `--format json`
+- `--markdown`：等价于 `--format markdown`
+- `-s, --strategy <list|article>`：内容提取策略
+- `-r, --remove-attrs <attrs>`：逗号分隔，如 `aria-*,role`
+- `-h, --help`：查看帮助
+- `-v, --version`：查看版本
-### `main(htmlInput, options)`
+## AI 使用建议
-**Parameters:**
-- `htmlInput` (string): HTML text to convert
-- `options` (object, optional):
-  - `outputFormat` (string): `'markdown'` (default) or `'json'`
-  - `strategy` (string): `'list'`, `'article'`, or undefined
-  - `removeAttributes` (boolean): Remove HTML attributes during parsing
+如果你在 Agent/工作流里调用这个工具，推荐固定约定：
-**Returns:** String (Markdown or JSON)
+1. 输入始终传完整 HTML 字符串。
+2. 需要结构化消费时使用 `outputFormat: 'json'`。
+3. 需要最简正文时按场景加 `strategy: 'article'` 或 `strategy: 'list'`。
+4. 需要清理无关属性时传 `removeAttributes`，例如 `['aria-*', 'role', 'data-*']`。
-## Strategies
+## 开发
-- **list**: Extracts the largest `<ul>` or `<ol>` element
-- **article**: Filters out empty containers, keeping main content
+```bash
+npm test
+```
 ## License

package/bin/html2md4llm.js ADDED Viewed

@@ -0,0 +1,199 @@
+#!/usr/bin/env node
+import { readFileSync } from 'node:fs';
+import { readFile, writeFile } from 'node:fs/promises';
+import process from 'node:process';
+import { main } from '../src/main.js';
+// Load package.json, resolved relative to this file; its version is used by the help and version output.
+const pkg = JSON.parse(readFileSync(new URL('../package.json', import.meta.url), 'utf-8'));
+/**
+ * Write the CLI usage text to stdout.
+ * The first line carries the version read from the module-level `pkg` object.
+ */
+function printHelp() {
+  const help = `html2md4llm ${pkg.version}
+Usage:
+  html2md4llm <input.html> [output.md] [options]
+  html2md4llm - [output.md] [options]
+  cat input.html | html2md4llm [options] > output.md
+Options:
+  -o, --output <file>           Output file path
+  -f, --format <markdown|json>  Output format (default: markdown)
+  --json                         Shortcut for --format json
+  --markdown                     Shortcut for --format markdown
+  -s, --strategy <list|article> Extraction strategy
+  -r, --remove-attrs <attrs>    Comma-separated attrs (e.g. aria-*,role)
+  -h, --help                    Show help
+  -v, --version                 Show version`;
+  // The template literal ends without a newline, so one is appended on write.
+  process.stdout.write(`${help}\n`);
+}
+/**
+ * Parse CLI arguments into a plain options object.
+ *
+ * Only the argument syntax is checked here; the values given to --format and
+ * --strategy are returned as typed.
+ *
+ * @param {string[]} argv - Arguments after the node binary and script path.
+ * @returns {object} Parsed result with `positional`, `inputFile`, `outputFile`,
+ *   `outputFormat` (default 'markdown'), `strategy`, `removeAttributes`, and
+ *   `help` / `version` set to true when the matching flag was seen.
+ * @throws {Error} On a missing option value, an unknown option, more than two
+ *   positional arguments, or an output file given both positionally and via
+ *   --output.
+ */
+function parseArgs(argv) {
+  const parsed = {
+    positional: [],
+    outputFile: undefined,
+    outputFormat: 'markdown',
+    strategy: undefined,
+    removeAttributes: []
+  };
+  // Return the argument following `index`. A missing, empty, or dash-prefixed
+  // successor counts as "no value". The caller advances the loop index itself.
+  const takeValue = (name, index) => {
+    const value = argv[index + 1];
+    if (!value || value.startsWith('-')) {
+      throw new Error(`Missing value for ${name}`);
+    }
+    return value;
+  };
+  for (let i = 0; i < argv.length; i++) {
+    const arg = argv[i];
+    if (arg === '-h' || arg === '--help') {
+      parsed.help = true;
+      continue;
+    }
+    if (arg === '-v' || arg === '--version') {
+      parsed.version = true;
+      continue;
+    }
+    if (arg === '-o' || arg === '--output') {
+      parsed.outputFile = takeValue(arg, i);
+      i++;
+      continue;
+    }
+    if (arg === '-f' || arg === '--format') {
+      parsed.outputFormat = takeValue(arg, i);
+      i++;
+      continue;
+    }
+    if (arg === '--json') {
+      parsed.outputFormat = 'json';
+      continue;
+    }
+    if (arg === '--markdown') {
+      parsed.outputFormat = 'markdown';
+      continue;
+    }
+    if (arg === '-s' || arg === '--strategy') {
+      parsed.strategy = takeValue(arg, i);
+      i++;
+      continue;
+    }
+    if (
+      arg === '-r' ||
+      arg === '--remove-attrs' ||
+      arg === '--remove-attr' ||
+      arg === '--remove-attribute'
+    ) {
+      // Repeated flags accumulate; each value is a comma-separated list whose
+      // entries are trimmed and whose empty entries are dropped.
+      const value = takeValue(arg, i);
+      parsed.removeAttributes.push(
+        ...value.split(',').map(item => item.trim()).filter(Boolean)
+      );
+      i++;
+      continue;
+    }
+    // A bare "-" is the documented stdin placeholder, not an option, so it
+    // must fall through to the positional list instead of being rejected.
+    if (arg !== '-' && arg.startsWith('-')) {
+      throw new Error(`Unknown option: ${arg}`);
+    }
+    parsed.positional.push(arg);
+  }
+  if (parsed.positional.length > 2) {
+    throw new Error('Too many positional arguments');
+  }
+  if (parsed.outputFile && parsed.positional[1]) {
+    throw new Error('Output file specified twice (positional and --output)');
+  }
+  parsed.inputFile = parsed.positional[0];
+  // The second positional is only a fallback when --output was not given.
+  if (!parsed.outputFile) {
+    parsed.outputFile = parsed.positional[1];
+  }
+  return parsed;
+}
+/**
+ * Read all of standard input.
+ *
+ * @returns {Promise<string>} Resolves with the concatenated input once the
+ *   stream ends; rejects if the stream emits an error.
+ */
+function readStdin() {
+  return new Promise((resolve, reject) => {
+    let data = '';
+    // With an encoding set, 'data' events deliver strings rather than Buffers.
+    process.stdin.setEncoding('utf-8');
+    process.stdin.on('data', chunk => {
+      data += chunk;
+    });
+    process.stdin.on('end', () => resolve(data));
+    process.stdin.on('error', reject);
+  });
+}
+/**
+ * CLI driver: parse arguments, read the HTML input, convert it with `main`,
+ * and write the result to a file or to stdout.
+ *
+ * @returns {Promise<void>}
+ * @throws {Error} On an invalid --format or --strategy value, or when no input
+ *   source is available. Errors from argument parsing and file access propagate.
+ */
+async function run() {
+  const args = parseArgs(process.argv.slice(2));
+  // --help and --version short-circuit before any validation or I/O.
+  if (args.help) {
+    printHelp();
+    return;
+  }
+  if (args.version) {
+    process.stdout.write(`${pkg.version}\n`);
+    return;
+  }
+  if (args.outputFormat !== 'markdown' && args.outputFormat !== 'json') {
+    throw new Error('Invalid --format value. Use "markdown" or "json".');
+  }
+  if (args.strategy && args.strategy !== 'list' && args.strategy !== 'article') {
+    throw new Error('Invalid --strategy value. Use "list" or "article".');
+  }
+  // A non-TTY stdin is treated as piped input.
+  const hasStdin = !process.stdin.isTTY;
+  let htmlInput;
+  // Input precedence: an explicit file path, then stdin ("-" or piped), else an error.
+  if (args.inputFile && args.inputFile !== '-') {
+    htmlInput = await readFile(args.inputFile, 'utf-8');
+  } else if (args.inputFile === '-' || hasStdin) {
+    htmlInput = await readStdin();
+  } else {
+    throw new Error('No input provided. Pass an input file, "-" for stdin, or pipe HTML to stdin.');
+  }
+  // Optional settings are only added to the options object when the user supplied them.
+  const options = {
+    outputFormat: args.outputFormat
+  };
+  if (args.strategy) {
+    options.strategy = args.strategy;
+  }
+  if (args.removeAttributes.length > 0) {
+    options.removeAttributes = args.removeAttributes;
+  }
+  const output = main(htmlInput, options);
+  if (args.outputFile) {
+    await writeFile(args.outputFile, output, 'utf-8');
+    return;
+  }
+  process.stdout.write(output);
+  // Only an interactive terminal gets a trailing newline; redirected or piped
+  // output is written exactly as generated.
+  if (process.stdout.isTTY && !output.endsWith('\n')) {
+    process.stdout.write('\n');
+  }
+}
+// Entry point: a rejection from run() is reported on stderr together with a
+// usage hint, and the process exits with status 1.
+run().catch(err => {
+  process.stderr.write(`html2md4llm: ${err.message}\n`);
+  process.stderr.write('Run "html2md4llm --help" for usage.\n');
+  process.exit(1);
+});

package/package.json CHANGED Viewed

@@ -1,15 +1,20 @@
 {
   "name": "html2md4llm",
-  "version": "1.0.0",
-  "description": "Convert HTML to clean Markdown or JSON format, optimized for LLM processing",
+  "version": "1.1.2",
+  "description": "Convert HTML to clean Markdown or JSON, optimized for LLM processing",
   "type": "module",
   "main": "src/main.js",
+  "bin": {
+    "html2md4llm": "bin/html2md4llm.js"
+  },
   "exports": {
-    ".": "./src/main.js"
+    ".": {
+      "import": "./src/main.js",
+      "default": "./src/main.js"
+    }
   },
   "scripts": {
-    "test": "node tests/run-tests.js",
-    "build": "node build.js"
+    "test": "node tests/run-tests.js"
   },
   "keywords": [
     "html",
@@ -17,6 +22,8 @@
     "json",
     "converter",
     "llm",
+    "cli",
+    "npx",
     "parser",
     "html-to-markdown"
   ],
@@ -27,16 +34,12 @@
   },
   "repository": {
     "type": "git",
-    "url": "https://github.com/kaiye/html2md4llm.git"
+    "url": "git+https://github.com/kaiye/html2md4llm.git"
   },
   "files": [
     "src/**/*",
-    "dist/**/*",
-    "plugin/**/*",
+    "bin/**/*",
     "README.md",
     "LICENSE"
-  ],
-  "devDependencies": {
-    "esbuild": "^0.27.0"
-  }
+  ]
 }

package/src/generators/markdown.js CHANGED Viewed

@@ -1,5 +1,80 @@
 const inlineElements = ['span', 'a', 'strong', 'em', 'code', 'b', 'i'];
-const blockElements = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'div', 'section'];
+const blockElements = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'div', 'section', 'table'];
+const tableSections = ['thead', 'tbody', 'tfoot'];
+/**
+ * Flatten rendered cell content onto a single line for a Markdown table cell.
+ *
+ * Interior line breaks become `<br>`, runs of whitespace collapse to one
+ * space, and `|` is escaped so it cannot end the cell.
+ *
+ * @param {string} text - Rendered content of one table cell.
+ * @returns {string} Single-line cell text; empty when the input is only whitespace.
+ */
+function normalizeTableCell(text) {
+  return text
+    .replace(/\r\n/g, '\n')
+    // Trim before converting newlines, so line breaks at the edges of the
+    // cell do not turn into leading or trailing <br> markers.
+    .trim()
+    .replace(/\n+/g, '<br>')
+    .replace(/\s+/g, ' ')
+    .replace(/\|/g, '\\|');
+}
+/**
+ * Gather the `tr` nodes of a table in document order.
+ *
+ * Rows may sit directly under the table node or one level down inside a
+ * section element listed in `tableSections`; anything else is ignored.
+ *
+ * @param {object} tableNode - Parsed table node; `children` may be absent.
+ * @returns {object[]} The row nodes found.
+ */
+function collectTableRows(tableNode) {
+  const isRow = candidate => candidate.type === 'element' && candidate.tag === 'tr';
+  return (tableNode.children || []).flatMap(entry => {
+    if (entry.type !== 'element') return [];
+    if (entry.tag === 'tr') return [entry];
+    if (!tableSections.includes(entry.tag)) return [];
+    // Section wrapper: keep only its direct row children.
+    return (entry.children || []).filter(isRow);
+  });
+}
+/**
+ * Render a parsed table node as a Markdown pipe table.
+ *
+ * @param {object} node - Table node whose rows are found via collectTableRows.
+ * @param {number} indent - Number of spaces placed before every output line.
+ * @returns {string} Table lines joined with '\n', or '' when no row has cells.
+ */
+function renderTable(node, indent) {
+  // Reduce every row to its th/td cells, rendered to normalized text, and
+  // remember whether the row contains at least one th.
+  const rows = collectTableRows(node).map(row => {
+    const cellNodes = (row.children || []).filter(
+      child => child.type === 'element' && (child.tag === 'th' || child.tag === 'td')
+    );
+    const cells = cellNodes.map(cell => {
+      // Cell children are rendered with generate's default indent and concatenated.
+      const text = (cell.children || []).map(ch => generate(ch)).join('');
+      return normalizeTableCell(text);
+    });
+    const hasHeaderCell = cellNodes.some(cell => cell.tag === 'th');
+    return { cells, hasHeaderCell };
+  }).filter(row => row.cells.length > 0);
+  if (rows.length === 0) return '';
+  // The header is the first row containing a th; without one, the first row is used.
+  let headerRowIndex = rows.findIndex(row => row.hasHeaderCell);
+  if (headerRowIndex < 0) headerRowIndex = 0;
+  const headerRow = [...rows[headerRowIndex].cells];
+  // All other rows become body rows in their original order, including any
+  // rows that appeared before the chosen header row.
+  const bodyRows = rows
+    .filter((_, i) => i !== headerRowIndex)
+    .map(row => [...row.cells]);
+  const columnCount = Math.max(
+    headerRow.length,
+    ...bodyRows.map(row => row.length)
+  );
+  // Pad short rows with empty cells so every line has the same column count.
+  while (headerRow.length < columnCount) headerRow.push('');
+  for (const row of bodyRows) {
+    while (row.length < columnCount) row.push('');
+  }
+  const rowToLine = cells => `${' '.repeat(indent)}| ${cells.join(' | ')} |`;
+  const separator = `${' '.repeat(indent)}| ${Array(columnCount).fill('---').join(' | ')} |`;
+  const lines = [rowToLine(headerRow), separator];
+  for (const row of bodyRows) {
+    lines.push(rowToLine(row));
+  }
+  return lines.join('\n');
+}
 function isInline(node) {
   if (node.type === 'element' && node.tag === 'br') return false;
@@ -33,6 +108,11 @@ export function generate(node, indent = 0) {
   const tag = node.tag;
   const children = node.children || [];
+  // Tables
+  if (tag === 'table') {
+    return renderTable(node, indent);
+  }
   // If only one child and no special handling for this tag, pass through transparently
   const hasSpecialHandling = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'strong', 'b', 'em', 'i', 'code', 'a'].includes(tag);
   if (children.length === 1 && !hasSpecialHandling) {

package/src/main.js CHANGED Viewed

@@ -37,6 +37,9 @@ export function main(htmlInput, options = {}) {
   return JSON.stringify(tree, (key, value) => key === 'parent' ? undefined : value, 2);
 }
+export const html2md4llm = main;
+export default main;
 function extractLargestList(node) {
   let largest = null;
   let maxCount = 0;

package/src/parser.js CHANGED Viewed

@@ -108,6 +108,7 @@ export function parse(html, removeAttributes = []) {
   // Post-processing: flatten pre/code, flatten containers, remove unwanted nodes
   const voidElements = ['br', 'hr', 'img'];
   const flattenableTags = ['div', 'span', 'section', 'p'];
+  const preserveEmptyElements = ['table', 'thead', 'tbody', 'tfoot', 'tr', 'th', 'td'];
   function flattenPreCode(node) {
     if (node.type === 'element' && (node.tag === 'pre' || node.tag === 'code')) {
@@ -123,6 +124,22 @@ export function parse(html, removeAttributes = []) {
     }
   }
+  // Report whether a node carries real content: non-blank text, an img or a
+  // element, or any descendant that does. br and hr never count.
+  function hasSubstantiveContent(node) {
+    if (node.type === 'text') return node.text.trim() !== '';
+    if (node.type !== 'element') return false;
+    switch (node.tag) {
+      case 'img':
+      case 'a':
+        return true;
+      case 'br':
+      case 'hr':
+        return false;
+      default: {
+        // Any other element counts only if one of its descendants does.
+        const kids = node.children;
+        return kids == null ? false : kids.some(hasSubstantiveContent);
+      }
+    }
+  }
   function removeUnwantedNodes(node) {
     if (!node.children) return;
@@ -160,6 +177,13 @@ export function parse(html, removeAttributes = []) {
         removeUnwantedNodes(child);
         // Remove style attribute after filtering
         delete child.attributes.style;
+        // Filter empty formatting elements (strong, em, b, i, code) without substantive content
+        const formattingElements = ['strong', 'em', 'b', 'i', 'code'];
+        if (formattingElements.includes(child.tag) && !hasSubstantiveContent(child)) {
+          return false;
+        }
+        // Keep structural table elements even when cell content is empty
+        if (preserveEmptyElements.includes(child.tag)) return true;
         // Remove empty nodes
         if (child.children && child.children.length === 0) return false;
       }

package/plugin/main.js DELETED Viewed

@@ -1,41 +0,0 @@
-import { main } from '../src/main.js';
-export default class Html2Md4LlmTool {
-  async invoke(parameters) {
-    const { html, outputFormat = 'markdown', strategy } = parameters;
-    if (!html) {
-      throw new Error('html parameter is required');
-    }
-    const options = { outputFormat };
-    if (strategy) {
-      options.strategy = strategy;
-    }
-    const result = main(html, options);
-    return {
-      result,
-      format: outputFormat
-    };
-  }
-  async validate(parameters) {
-    if (!parameters.html || typeof parameters.html !== 'string') {
-      return { valid: false, error: 'html must be a non-empty string' };
-    }
-    const { outputFormat, strategy } = parameters;
-    if (outputFormat && !['markdown', 'json'].includes(outputFormat)) {
-      return { valid: false, error: 'outputFormat must be "markdown" or "json"' };
-    }
-    if (strategy && !['list', 'article'].includes(strategy)) {
-      return { valid: false, error: 'strategy must be "list" or "article"' };
-    }
-    return { valid: true };
-  }
-}

package/plugin/manifest.json DELETED Viewed

@@ -1,16 +0,0 @@
-{
-  "version": "0.0.1",
-  "type": "tool",
-  "author": "kaiye",
-  "name": "html2md4llm",
-  "label": {
-    "en_US": "HTML to Markdown/JSON",
-    "zh_Hans": "HTML 转 Markdown/JSON"
-  },
-  "description": {
-    "en_US": "Convert HTML to clean Markdown or JSON format, optimized for LLM processing",
-    "zh_Hans": "将 HTML 转换为干净的 Markdown 或 JSON 格式，针对 LLM 处理优化"
-  },
-  "icon": "icon.svg",
-  "tags": ["html", "markdown", "json", "converter", "parser"]
-}