html2md4llm 1.0.0 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 kaiye
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md CHANGED
@@ -1,73 +1,86 @@
1
1
  # html2md4llm
2
2
 
3
- Convert HTML to clean Markdown or JSON format, optimized for LLM processing.
3
+ 将 HTML 转成更干净的 Markdown 或 JSON,便于人和 AI 直接消费。
4
4
 
5
- ## Features
5
+ ## 安装
6
6
 
7
- - Convert HTML to Markdown or JSON
8
- - Intelligent content extraction (list/article modes)
9
- - Automatic HTML cleaning (removes scripts, styles, iframes)
10
- - Preserves metadata (title, description, keywords)
11
- - Zero dependencies - uses only Node.js built-in modules
7
+ ```bash
8
+ npm install html2md4llm
9
+ ```
12
10
 
13
- ## Installation
11
+ ## 最常用方式
14
12
 
15
- ### NPM Package
13
+ ### 1) 在代码中调用
16
14
 
17
- ```bash
18
- npm install html2md4llm
15
+ ```js
16
+ import { main } from 'html2md4llm';
17
+
18
+ const html = '<h1>Hello</h1><p>World</p>';
19
+ const md = main(html); // 默认输出 markdown
20
+ console.log(md);
19
21
  ```
20
22
 
21
- ### Standalone Script
23
+ 输出 JSON:
24
+
25
+ ```js
26
+ const json = main('<h1>Hello</h1>', { outputFormat: 'json' });
27
+ ```
22
28
 
23
- Download `dist/html2md4llm.min.js` and include it directly:
29
+ ### 2) npx 当命令行工具
24
30
 
25
- ```html
26
- <script src="html2md4llm.min.js"></script>
27
- <script>
28
- const result = html2md4llm('<h1>Hello</h1>');
29
- </script>
31
+ 本地文件转 `.md`:
32
+
33
+ ```bash
34
+ npx html2md4llm ./input.html ./output.md
30
35
  ```
31
36
 
32
- ### Dify Plugin
37
+ stdin -> stdout(Linux 管道):
33
38
 
34
- Install from the plugin marketplace or import from this repository's `plugin/` directory.
39
+ ```bash
40
+ cat ./input.html | npx html2md4llm > ./output.md
41
+ ```
35
42
 
36
- ## Usage
43
+ ## API(给代码/AI 调用)
37
44
 
38
- ```javascript
39
- import { main } from 'html2md4llm';
45
+ `main(htmlInput, options?)`
40
46
 
41
- // Basic conversion to Markdown
42
- const markdown = main('<h1>Hello</h1><p>World</p>');
47
+ - `htmlInput: string`:HTML 字符串
48
+ - `options.outputFormat: 'markdown' | 'json'`:默认 `'markdown'`
49
+ - `options.strategy: 'list' | 'article'`:可选提取策略
50
+ - `options.removeAttributes: string[]`:按规则移除属性,如 `['aria-*', 'role']`
43
51
 
44
- // Convert to JSON
45
- const json = main('<h1>Hello</h1>', { outputFormat: 'json' });
52
+ 返回值:`string`(Markdown 文本或 JSON 字符串)
46
53
 
47
- // Extract largest list
48
- const list = main(html, { strategy: 'list' });
54
+ ## CLI 参数
49
55
 
50
- // Extract article content
51
- const article = main(html, { strategy: 'article' });
56
+ ```bash
57
+ html2md4llm <input.html> [output.md] [options]
58
+ html2md4llm - [output.md] [options]
52
59
  ```
53
60
 
54
- ## API
61
+ - `-o, --output <file>`:输出文件路径
62
+ - `-f, --format <markdown|json>`:输出格式
63
+ - `--json`:等价于 `--format json`
64
+ - `--markdown`:等价于 `--format markdown`
65
+ - `-s, --strategy <list|article>`:内容提取策略
66
+ - `-r, --remove-attrs <attrs>`:逗号分隔,如 `aria-*,role`
67
+ - `-h, --help`:查看帮助
68
+ - `-v, --version`:查看版本
55
69
 
56
- ### `main(htmlInput, options)`
70
+ ## AI 使用建议
57
71
 
58
- **Parameters:**
59
- - `htmlInput` (string): HTML text to convert
60
- - `options` (object, optional):
61
- - `outputFormat` (string): `'markdown'` (default) or `'json'`
62
- - `strategy` (string): `'list'`, `'article'`, or undefined
63
- - `removeAttributes` (boolean): Remove HTML attributes during parsing
72
+ 如果你在 Agent/工作流里调用这个工具,推荐固定约定:
64
73
 
65
- **Returns:** String (Markdown or JSON)
74
+ 1. 输入始终传完整 HTML 字符串。
75
+ 2. 需要结构化消费时使用 `outputFormat: 'json'`。
76
+ 3. 需要最简正文时按场景加 `strategy: 'article'` 或 `strategy: 'list'`。
77
+ 4. 需要清理无关属性时传 `removeAttributes`,例如 `['aria-*', 'role', 'data-*']`。
66
78
 
67
- ## Strategies
79
+ ## 开发
68
80
 
69
- - **list**: Extracts the largest `<ul>` or `<ol>` element
70
- - **article**: Filters out empty containers, keeping main content
81
+ ```bash
82
+ npm test
83
+ ```
71
84
 
72
85
  ## License
73
86
 
@@ -0,0 +1,199 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { readFileSync } from 'node:fs';
4
+ import { readFile, writeFile } from 'node:fs/promises';
5
+ import process from 'node:process';
6
+ import { main } from '../src/main.js';
7
+
8
// Package manifest, read synchronously at startup; pkg.version is printed by the help text and by --version.
+ const pkg = JSON.parse(readFileSync(new URL('../package.json', import.meta.url), 'utf-8'));
9
+
10
/**
 * Write the CLI usage text to stdout.
 *
 * The text starts with the package name and version (from `pkg.version`),
 * followed by the usage forms and the option list, and ends with one newline.
 */
+ function printHelp() {
11
+ const help = `html2md4llm ${pkg.version}
12
+
13
+ Usage:
14
+ html2md4llm <input.html> [output.md] [options]
15
+ html2md4llm - [output.md] [options]
16
+ cat input.html | html2md4llm [options] > output.md
17
+
18
+ Options:
19
+ -o, --output <file> Output file path
20
+ -f, --format <markdown|json> Output format (default: markdown)
21
+ --json Shortcut for --format json
22
+ --markdown Shortcut for --format markdown
23
+ -s, --strategy <list|article> Extraction strategy
24
+ -r, --remove-attrs <attrs> Comma-separated attrs (e.g. aria-*,role)
25
+ -h, --help Show help
26
+ -v, --version Show version`;
27
+
28
+ process.stdout.write(`${help}\n`);
29
+ }
30
+
31
/**
 * Parse the CLI arguments (everything after the script path).
 *
 * Recognised options: -h/--help, -v/--version, -o/--output <file>,
 * -f/--format <value>, --json, --markdown, -s/--strategy <value> and
 * -r/--remove-attrs <a,b,...> (aliases: --remove-attr, --remove-attribute;
 * may be repeated, values are accumulated).
 *
 * Up to two positional arguments are accepted: the input file and the output
 * file. A lone "-" is a valid positional argument and means "read stdin".
 *
 * Format and strategy values are NOT validated here; the caller does that.
 *
 * @param {string[]} argv - Raw argument list.
 * @returns {object} Parsed settings: `positional`, `inputFile`, `outputFile`,
 *   `outputFormat` (default 'markdown'), `strategy`, `removeAttributes`, and
 *   `help` / `version` (set to true only when the flag was given).
 * @throws {Error} On an unknown option, an option missing its value, more
 *   than two positionals, or an output file given both positionally and
 *   via --output.
 */
function parseArgs(argv) {
  const parsed = {
    positional: [],
    outputFile: undefined,
    outputFormat: 'markdown',
    strategy: undefined,
    removeAttributes: []
  };

  let index = 0;

  // Returns the token following the current option and consumes it.
  // A following token that looks like another option counts as "missing".
  const takeValue = (name) => {
    const value = argv[index + 1];
    if (!value || value.startsWith('-')) {
      throw new Error(`Missing value for ${name}`);
    }
    index += 1;
    return value;
  };

  for (; index < argv.length; index += 1) {
    const arg = argv[index];

    switch (arg) {
      case '-h':
      case '--help':
        parsed.help = true;
        break;

      case '-v':
      case '--version':
        parsed.version = true;
        break;

      case '-o':
      case '--output':
        parsed.outputFile = takeValue(arg);
        break;

      case '-f':
      case '--format':
        parsed.outputFormat = takeValue(arg);
        break;

      case '--json':
        parsed.outputFormat = 'json';
        break;

      case '--markdown':
        parsed.outputFormat = 'markdown';
        break;

      case '-s':
      case '--strategy':
        parsed.strategy = takeValue(arg);
        break;

      case '-r':
      case '--remove-attrs':
      case '--remove-attr':
      case '--remove-attribute': {
        const names = takeValue(arg)
          .split(',')
          .map((item) => item.trim())
          .filter(Boolean);
        parsed.removeAttributes.push(...names);
        break;
      }

      default:
        // A bare "-" is the documented stdin marker, not an option; without
        // this exception `html2md4llm - out.md` failed with "Unknown option: -".
        if (arg !== '-' && arg.startsWith('-')) {
          throw new Error(`Unknown option: ${arg}`);
        }
        parsed.positional.push(arg);
    }
  }

  if (parsed.positional.length > 2) {
    throw new Error('Too many positional arguments');
  }

  if (parsed.outputFile && parsed.positional[1]) {
    throw new Error('Output file specified twice (positional and --output)');
  }

  parsed.inputFile = parsed.positional[0];
  if (!parsed.outputFile) {
    parsed.outputFile = parsed.positional[1];
  }

  return parsed;
}
125
+
126
/**
 * Read all of stdin as UTF-8 text.
 *
 * @returns {Promise<string>} Resolves with the concatenated input once the
 *   stream ends; rejects if the stream emits an error.
 */
function readStdin() {
  return new Promise((resolve, reject) => {
    const { stdin } = process;
    const chunks = [];

    stdin.setEncoding('utf-8');
    stdin.on('data', (chunk) => {
      chunks.push(chunk);
    });
    stdin.on('end', () => {
      resolve(chunks.join(''));
    });
    stdin.on('error', reject);
  });
}
137
+
138
/**
 * CLI entry point: parse arguments, read the HTML, convert it and emit the result.
 *
 * Input comes from the given file, or from stdin when the input argument is
 * "-" or stdin is not a TTY. Output goes to the requested file, otherwise to
 * stdout (a newline is appended on a TTY when the output lacks one).
 *
 * @returns {Promise<void>}
 * @throws {Error} On invalid --format / --strategy values or when no input is available.
 */
async function run() {
  const args = parseArgs(process.argv.slice(2));

  if (args.help) {
    printHelp();
    return;
  }

  if (args.version) {
    process.stdout.write(`${pkg.version}\n`);
    return;
  }

  const formatIsValid = args.outputFormat === 'markdown' || args.outputFormat === 'json';
  if (!formatIsValid) {
    throw new Error('Invalid --format value. Use "markdown" or "json".');
  }

  const strategyIsValid = !args.strategy || args.strategy === 'list' || args.strategy === 'article';
  if (!strategyIsValid) {
    throw new Error('Invalid --strategy value. Use "list" or "article".');
  }

  const stdinIsPiped = !process.stdin.isTTY;
  const readsFromFile = Boolean(args.inputFile) && args.inputFile !== '-';

  let htmlInput;
  if (readsFromFile) {
    htmlInput = await readFile(args.inputFile, 'utf-8');
  } else if (args.inputFile === '-' || stdinIsPiped) {
    htmlInput = await readStdin();
  } else {
    throw new Error('No input provided. Pass an input file, "-" for stdin, or pipe HTML to stdin.');
  }

  // Only forward optional settings that were actually supplied.
  const options = { outputFormat: args.outputFormat };
  if (args.strategy) options.strategy = args.strategy;
  if (args.removeAttributes.length > 0) options.removeAttributes = args.removeAttributes;

  const output = main(htmlInput, options);

  if (args.outputFile) {
    await writeFile(args.outputFile, output, 'utf-8');
    return;
  }

  process.stdout.write(output);
  const needsTrailingNewline = process.stdout.isTTY && !output.endsWith('\n');
  if (needsTrailingNewline) {
    process.stdout.write('\n');
  }
}
194
+
195
// Start the CLI. Any rejection from run() is reported on stderr as a
// one-line message plus a usage hint, and the process exits with status 1.
+ run().catch(err => {
196
+ process.stderr.write(`html2md4llm: ${err.message}\n`);
197
+ process.stderr.write('Run "html2md4llm --help" for usage.\n');
198
+ process.exit(1);
199
+ });
package/package.json CHANGED
@@ -1,15 +1,20 @@
1
1
  {
2
2
  "name": "html2md4llm",
3
- "version": "1.0.0",
4
- "description": "Convert HTML to clean Markdown or JSON format, optimized for LLM processing",
3
+ "version": "1.1.2",
4
+ "description": "Convert HTML to clean Markdown or JSON, optimized for LLM processing",
5
5
  "type": "module",
6
6
  "main": "src/main.js",
7
+ "bin": {
8
+ "html2md4llm": "bin/html2md4llm.js"
9
+ },
7
10
  "exports": {
8
- ".": "./src/main.js"
11
+ ".": {
12
+ "import": "./src/main.js",
13
+ "default": "./src/main.js"
14
+ }
9
15
  },
10
16
  "scripts": {
11
- "test": "node tests/run-tests.js",
12
- "build": "node build.js"
17
+ "test": "node tests/run-tests.js"
13
18
  },
14
19
  "keywords": [
15
20
  "html",
@@ -17,6 +22,8 @@
17
22
  "json",
18
23
  "converter",
19
24
  "llm",
25
+ "cli",
26
+ "npx",
20
27
  "parser",
21
28
  "html-to-markdown"
22
29
  ],
@@ -27,16 +34,12 @@
27
34
  },
28
35
  "repository": {
29
36
  "type": "git",
30
- "url": "https://github.com/kaiye/html2md4llm.git"
37
+ "url": "git+https://github.com/kaiye/html2md4llm.git"
31
38
  },
32
39
  "files": [
33
40
  "src/**/*",
34
- "dist/**/*",
35
- "plugin/**/*",
41
+ "bin/**/*",
36
42
  "README.md",
37
43
  "LICENSE"
38
- ],
39
- "devDependencies": {
40
- "esbuild": "^0.27.0"
41
- }
44
+ ]
42
45
  }
@@ -1,5 +1,80 @@
1
1
  const inlineElements = ['span', 'a', 'strong', 'em', 'code', 'b', 'i'];
2
- const blockElements = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'div', 'section'];
2
+ const blockElements = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'div', 'section', 'table'];
3
+ const tableSections = ['thead', 'tbody', 'tfoot'];
4
+
5
/**
 * Flatten rendered cell content into a single Markdown table cell.
 *
 * Each input line is trimmed, blank lines are dropped, and the remaining
 * lines are joined with `<br>`. Whitespace runs inside a line collapse to one
 * space and `|` is escaped so it cannot end the cell.
 *
 * Trimming happens per line BEFORE the `<br>` join: converting newlines first
 * turned leading/trailing newlines (e.g. from block content inside a cell)
 * into stray `<br>` markers that a final trim could not remove.
 *
 * @param {string} text - Rendered cell content, possibly multi-line.
 * @returns {string} Single-line cell text.
 */
function normalizeTableCell(text) {
  return text
    .split(/\r\n|\n/)
    .map((line) => line.trim())
    .filter(Boolean)
    .join('<br>')
    .replace(/\s+/g, ' ')
    .replace(/\|/g, '\\|');
}
13
+
14
/**
 * Gather the `<tr>` nodes of a table in document order.
 *
 * Rows are taken from the table's direct children and from the direct
 * children of any section listed in `tableSections`. Other nodes are ignored.
 *
 * @param {object} tableNode - Parsed `<table>` element node.
 * @returns {object[]} The row element nodes.
 */
function collectTableRows(tableNode) {
  const isRow = (node) => node.type === 'element' && node.tag === 'tr';

  return (tableNode.children || []).flatMap((child) => {
    if (child.type !== 'element') return [];
    if (child.tag === 'tr') return [child];
    if (!tableSections.includes(child.tag)) return [];
    return (child.children || []).filter(isRow);
  });
}
34
+
35
/**
 * Render a parsed `<table>` node as a Markdown pipe table.
 *
 * @param {object} node - The table element node.
 * @param {number} indent - Repeat count for the prefix placed before every line.
 * @returns {string} Header line, `---` separator line and body lines joined
 *   with '\n'; an empty string when the table has no row with th/td cells.
 */
+ function renderTable(node, indent) {
36
// For each row: keep only th/td children, render every cell's children with
// generate(), normalize the text, and note whether the row holds a <th>.
+ const rows = collectTableRows(node).map(row => {
37
+ const cellNodes = (row.children || []).filter(
38
+ child => child.type === 'element' && (child.tag === 'th' || child.tag === 'td')
39
+ );
40
+ const cells = cellNodes.map(cell => {
41
+ const text = (cell.children || []).map(ch => generate(ch)).join('');
42
+ return normalizeTableCell(text);
43
+ });
44
+ const hasHeaderCell = cellNodes.some(cell => cell.tag === 'th');
45
+ return { cells, hasHeaderCell };
46
// Rows without any th/td cell are dropped.
+ }).filter(row => row.cells.length > 0);
47
+
48
+ if (rows.length === 0) return '';
49
+
50
// The header is the first row containing a <th>; if none has one, the first row is used.
+ let headerRowIndex = rows.findIndex(row => row.hasHeaderCell);
51
+ if (headerRowIndex < 0) headerRowIndex = 0;
52
+
53
// Work on copies so the padding below does not touch the collected rows.
+ const headerRow = [...rows[headerRowIndex].cells];
54
// Every other row becomes a body row, keeping its original relative order.
+ const bodyRows = rows
55
+ .filter((_, i) => i !== headerRowIndex)
56
+ .map(row => [...row.cells]);
57
+
58
// The table is as wide as its widest row.
+ const columnCount = Math.max(
59
+ headerRow.length,
60
+ ...bodyRows.map(row => row.length)
61
+ );
62
+
63
// Pad shorter rows with empty cells so every line has the same column count.
+ while (headerRow.length < columnCount) headerRow.push('');
64
+ for (const row of bodyRows) {
65
+ while (row.length < columnCount) row.push('');
66
+ }
67
+
68
+ const rowToLine = cells => `${' '.repeat(indent)}| ${cells.join(' | ')} |`;
69
+ const separator = `${' '.repeat(indent)}| ${Array(columnCount).fill('---').join(' | ')} |`;
70
+
71
+ const lines = [rowToLine(headerRow), separator];
72
+ for (const row of bodyRows) {
73
+ lines.push(rowToLine(row));
74
+ }
75
+
76
+ return lines.join('\n');
77
+ }
3
78
 
4
79
  function isInline(node) {
5
80
  if (node.type === 'element' && node.tag === 'br') return false;
@@ -33,6 +108,11 @@ export function generate(node, indent = 0) {
33
108
  const tag = node.tag;
34
109
  const children = node.children || [];
35
110
 
111
+ // Tables
112
+ if (tag === 'table') {
113
+ return renderTable(node, indent);
114
+ }
115
+
36
116
  // If only one child and no special handling for this tag, pass through transparently
37
117
  const hasSpecialHandling = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'pre', 'br', 'strong', 'b', 'em', 'i', 'code', 'a'].includes(tag);
38
118
  if (children.length === 1 && !hasSpecialHandling) {
package/src/main.js CHANGED
@@ -37,6 +37,9 @@ export function main(htmlInput, options = {}) {
37
37
  return JSON.stringify(tree, (key, value) => key === 'parent' ? undefined : value, 2);
38
38
  }
39
39
 
40
+ export const html2md4llm = main;
41
+ export default main;
42
+
40
43
  function extractLargestList(node) {
41
44
  let largest = null;
42
45
  let maxCount = 0;
package/src/parser.js CHANGED
@@ -108,6 +108,7 @@ export function parse(html, removeAttributes = []) {
108
108
  // Post-processing: flatten pre/code, flatten containers, remove unwanted nodes
109
109
  const voidElements = ['br', 'hr', 'img'];
110
110
  const flattenableTags = ['div', 'span', 'section', 'p'];
111
+ const preserveEmptyElements = ['table', 'thead', 'tbody', 'tfoot', 'tr', 'th', 'td'];
111
112
 
112
113
  function flattenPreCode(node) {
113
114
  if (node.type === 'element' && (node.tag === 'pre' || node.tag === 'code')) {
@@ -123,6 +124,22 @@ export function parse(html, removeAttributes = []) {
123
124
  }
124
125
  }
125
126
 
127
/**
 * Check whether a node carries content worth keeping.
 *
 * - Text nodes count when they contain non-whitespace characters.
 * - `img` and `a` elements always count.
 * - `br` and `hr` elements never count.
 * - Any other element counts when at least one descendant does.
 *
 * Missing nodes, text nodes without a string `text`, and elements without a
 * `children` array are treated as empty instead of throwing a TypeError.
 *
 * @param {object|null|undefined} node - Parsed tree node.
 * @returns {boolean} True when the node has substantive content.
 */
function hasSubstantiveContent(node) {
  if (!node) return false;

  if (node.type === 'text') {
    return typeof node.text === 'string' && node.text.trim().length > 0;
  }

  if (node.type !== 'element') return false;

  // Content elements (img, a) count as substantive
  if (node.tag === 'img' || node.tag === 'a') return true;
  // Non-content elements (br, hr) don't count as substantive
  if (node.tag === 'br' || node.tag === 'hr') return false;

  // Otherwise the element is substantive only if some child is
  return Array.isArray(node.children) && node.children.some((child) => hasSubstantiveContent(child));
}
142
+
126
143
  function removeUnwantedNodes(node) {
127
144
  if (!node.children) return;
128
145
 
@@ -160,6 +177,13 @@ export function parse(html, removeAttributes = []) {
160
177
  removeUnwantedNodes(child);
161
178
  // Remove style attribute after filtering
162
179
  delete child.attributes.style;
180
+ // Filter empty formatting elements (strong, em, b, i, code) without substantive content
181
+ const formattingElements = ['strong', 'em', 'b', 'i', 'code'];
182
+ if (formattingElements.includes(child.tag) && !hasSubstantiveContent(child)) {
183
+ return false;
184
+ }
185
+ // Keep structural table elements even when cell content is empty
186
+ if (preserveEmptyElements.includes(child.tag)) return true;
163
187
  // Remove empty nodes
164
188
  if (child.children && child.children.length === 0) return false;
165
189
  }
package/plugin/main.js DELETED
@@ -1,41 +0,0 @@
1
- import { main } from '../src/main.js';
2
-
3
/**
 * Plugin tool wrapper around `main`.
 *
 * `invoke` converts the given HTML and reports which format was used;
 * `validate` checks a parameter object without converting anything.
 */
export default class Html2Md4LlmTool {
  /**
   * Convert `parameters.html` using the optional `outputFormat` and `strategy`.
   *
   * @param {object} parameters - `{ html, outputFormat = 'markdown', strategy }`.
   * @returns {Promise<{result: string, format: string}>}
   * @throws {Error} When `html` is missing or empty.
   */
  async invoke(parameters) {
    const { html, outputFormat = 'markdown', strategy } = parameters;

    if (!html) {
      throw new Error('html parameter is required');
    }

    // The strategy is forwarded only when one was supplied.
    const options = strategy ? { outputFormat, strategy } : { outputFormat };

    return {
      result: main(html, options),
      format: outputFormat
    };
  }

  /**
   * Check that `html` is a non-empty string and that `outputFormat` and
   * `strategy`, when given, hold a supported value.
   *
   * @param {object} parameters - Candidate parameters for `invoke`.
   * @returns {Promise<{valid: boolean, error?: string}>}
   */
  async validate(parameters) {
    const reject = (error) => ({ valid: false, error });

    if (!parameters.html || typeof parameters.html !== 'string') {
      return reject('html must be a non-empty string');
    }

    const { outputFormat, strategy } = parameters;

    const formatOk = !outputFormat || ['markdown', 'json'].includes(outputFormat);
    if (!formatOk) {
      return reject('outputFormat must be "markdown" or "json"');
    }

    const strategyOk = !strategy || ['list', 'article'].includes(strategy);
    if (!strategyOk) {
      return reject('strategy must be "list" or "article"');
    }

    return { valid: true };
  }
}
@@ -1,16 +0,0 @@
1
- {
2
- "version": "0.0.1",
3
- "type": "tool",
4
- "author": "kaiye",
5
- "name": "html2md4llm",
6
- "label": {
7
- "en_US": "HTML to Markdown/JSON",
8
- "zh_Hans": "HTML 转 Markdown/JSON"
9
- },
10
- "description": {
11
- "en_US": "Convert HTML to clean Markdown or JSON format, optimized for LLM processing",
12
- "zh_Hans": "将 HTML 转换为干净的 Markdown 或 JSON 格式,针对 LLM 处理优化"
13
- },
14
- "icon": "icon.svg",
15
- "tags": ["html", "markdown", "json", "converter", "parser"]
16
- }