@kreuzberg/html-to-markdown 2.19.0-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +350 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +153 -0
- package/dist/index.d.ts +80 -0
- package/dist/index.js +126 -0
- package/package.json +74 -0
package/README.md
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
# html-to-markdown (TypeScript)
|
|
2
|
+
|
|
3
|
+
[crates.io](https://crates.io/crates/html-to-markdown-rs)
|
|
4
|
+
[npm (Node)](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-node)
|
|
5
|
+
[npm (WASM)](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm)
|
|
6
|
+
[PyPI](https://pypi.org/project/html-to-markdown/)
|
|
7
|
+
[Packagist](https://packagist.org/packages/goldziher/html-to-markdown)
|
|
8
|
+
[RubyGems](https://rubygems.org/gems/html-to-markdown)
|
|
9
|
+
[Hex.pm](https://hex.pm/packages/html_to_markdown)
|
|
10
|
+
[NuGet](https://www.nuget.org/packages/Goldziher.HtmlToMarkdown/)
|
|
11
|
+
[Maven Central](https://central.sonatype.com/artifact/io.github.goldziher/html-to-markdown)
|
|
12
|
+
[Go Reference](https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown)
|
|
13
|
+
[License](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE)
|
|
14
|
+
[Discord](https://discord.gg/pXxagNK2zN)
|
|
15
|
+
|
|
16
|
+
High-performance HTML to Markdown converter for Node.js and Bun with full TypeScript support. This package wraps native `@kreuzberg/html-to-markdown-node` bindings and provides a type-safe API.
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
# Native bindings (Node.js/Bun) - Recommended
|
|
22
|
+
npm install @kreuzberg/html-to-markdown
|
|
23
|
+
pnpm add @kreuzberg/html-to-markdown
|
|
24
|
+
yarn add @kreuzberg/html-to-markdown
|
|
25
|
+
bun add @kreuzberg/html-to-markdown
|
|
26
|
+
|
|
27
|
+
# WebAssembly (browser/edge/Node without native toolchain)
|
|
28
|
+
npm install @kreuzberg/html-to-markdown-wasm
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Migration Guide (v2.18.x → v2.19.0)
|
|
32
|
+
|
|
33
|
+
### Breaking Change: Scoped npm Packages
|
|
34
|
+
|
|
35
|
+
In v2.19.0, npm packages were moved to the `@kreuzberg` scope to align with the Kreuzberg.dev organization.
|
|
36
|
+
|
|
37
|
+
#### Package Installation Update
|
|
38
|
+
|
|
39
|
+
**Before (v2.18.x):**
|
|
40
|
+
```bash
|
|
41
|
+
npm install html-to-markdown-node
|
|
42
|
+
npm install html-to-markdown-wasm
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
**After (v2.19.0+):**
|
|
46
|
+
```bash
|
|
47
|
+
npm install @kreuzberg/html-to-markdown-node
|
|
48
|
+
npm install @kreuzberg/html-to-markdown-wasm
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
#### Import Statement Update
|
|
52
|
+
|
|
53
|
+
**Before:**
|
|
54
|
+
```typescript
|
|
55
|
+
import { convert } from 'html-to-markdown-node';
|
|
56
|
+
import { convert } from 'html-to-markdown-wasm';
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
**After:**
|
|
60
|
+
```typescript
|
|
61
|
+
import { convert } from '@kreuzberg/html-to-markdown-node';
|
|
62
|
+
import { convert } from '@kreuzberg/html-to-markdown-wasm';
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
#### TypeScript Declaration Update
|
|
66
|
+
|
|
67
|
+
Update your TypeScript configuration if you have imports from the old package name:
|
|
68
|
+
|
|
69
|
+
**Before (tsconfig.json or import aliases):**
|
|
70
|
+
```json
|
|
71
|
+
{
|
|
72
|
+
"compilerOptions": {
|
|
73
|
+
"paths": {
|
|
74
|
+
"html-to-markdown": ["node_modules/html-to-markdown-node"]
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
**After:**
|
|
81
|
+
```json
|
|
82
|
+
{
|
|
83
|
+
"compilerOptions": {
|
|
84
|
+
"paths": {
|
|
85
|
+
"@kreuzberg/html-to-markdown": ["node_modules/@kreuzberg/html-to-markdown-node"]
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
#### Deno Update
|
|
92
|
+
|
|
93
|
+
**Before:**
|
|
94
|
+
```typescript
|
|
95
|
+
import { convert } from "npm:html-to-markdown-wasm";
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
**After:**
|
|
99
|
+
```typescript
|
|
100
|
+
import { convert } from "npm:@kreuzberg/html-to-markdown-wasm";
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
#### Summary of Changes
|
|
104
|
+
|
|
105
|
+
- All npm packages now use `@kreuzberg` scope
|
|
106
|
+
- `html-to-markdown-node` → `@kreuzberg/html-to-markdown-node`
|
|
107
|
+
- `html-to-markdown-wasm` → `@kreuzberg/html-to-markdown-wasm`
|
|
108
|
+
- TypeScript types and APIs are identical
|
|
109
|
+
- No functional changes to the library
|
|
110
|
+
|
|
111
|
+
## Quick Start
|
|
112
|
+
|
|
113
|
+
**Basic conversion with type safety:**
|
|
114
|
+
```typescript
|
|
115
|
+
import { convert } from '@kreuzberg/html-to-markdown';
|
|
116
|
+
|
|
117
|
+
const markdown: string = convert('<h1>Hello World</h1>');
|
|
118
|
+
console.log(markdown); // # Hello World
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
**With conversion options:**
|
|
122
|
+
```typescript
|
|
123
|
+
import { convert, ConversionOptions } from '@kreuzberg/html-to-markdown';
|
|
124
|
+
|
|
125
|
+
const options: ConversionOptions = {
|
|
126
|
+
headingStyle: 'atx',
|
|
127
|
+
listIndentWidth: 2,
|
|
128
|
+
wrap: true,
|
|
129
|
+
};
|
|
130
|
+
|
|
131
|
+
const markdown = convert('<h1>Title</h1><p>Content</p>', options);
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
**TypeScript interfaces for type safety:**
|
|
135
|
+
```typescript
|
|
136
|
+
interface ConversionOptions {
|
|
137
|
+
headingStyle?: 'atx' | 'setext';
|
|
138
|
+
listIndentWidth?: number;
|
|
139
|
+
wrap?: boolean;
|
|
140
|
+
wrapWidth?: number;
|
|
141
|
+
// ... more options
|
|
142
|
+
}
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
**File and buffer helpers:**
|
|
146
|
+
```typescript
|
|
147
|
+
import { convertFile, convertBuffer } from '@kreuzberg/html-to-markdown';
|
|
148
|
+
|
|
149
|
+
// From file
|
|
150
|
+
const markdown = await convertFile('page.html');
|
|
151
|
+
|
|
152
|
+
// From Buffer/Uint8Array
|
|
153
|
+
const buffer = Buffer.from('<h1>Title</h1>');
|
|
154
|
+
const markdownFromBuffer = convertBuffer(buffer);
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## API Reference
|
|
158
|
+
|
|
159
|
+
### Core Functions
|
|
160
|
+
|
|
161
|
+
#### `convert(html: string, options?: ConversionOptions): string`
|
|
162
|
+
Convert HTML string to Markdown.
|
|
163
|
+
|
|
164
|
+
#### `convertBuffer(buffer: Buffer | Uint8Array, options?: ConversionOptions): string`
|
|
165
|
+
Convert HTML from Buffer/Uint8Array (avoids string allocation overhead).
|
|
166
|
+
|
|
167
|
+
#### `convertFile(filePath: string, options?: ConversionOptions): Promise<string>`
|
|
168
|
+
Asynchronously convert an HTML file to Markdown.
|
|
169
|
+
|
|
170
|
+
#### `convertStream(stream: NodeJS.ReadableStream, options?: ConversionOptions): Promise<string>`
|
|
171
|
+
Convert HTML from a readable stream (stdin, file stream, network).
|
|
172
|
+
|
|
173
|
+
### Metadata Extraction Functions
|
|
174
|
+
|
|
175
|
+
Requires `metadata` feature flag.
|
|
176
|
+
|
|
177
|
+
#### `convertWithMetadata(html: string, options?, metadataConfig?): { markdown: string; metadata: JsExtendedMetadata }`
|
|
178
|
+
Convert and extract document metadata, headers, links, images, and structured data.
|
|
179
|
+
|
|
180
|
+
#### `convertWithMetadataBuffer(buffer: Buffer | Uint8Array, options?, metadataConfig?): JsMetadataExtraction`
|
|
181
|
+
Convert from Buffer with metadata extraction.
|
|
182
|
+
|
|
183
|
+
#### `convertFileWithMetadata(filePath: string, options?, metadataConfig?): Promise<JsMetadataExtraction>`
|
|
184
|
+
Convert HTML file with metadata extraction.
|
|
185
|
+
|
|
186
|
+
#### `convertStreamWithMetadata(stream: NodeJS.ReadableStream, options?, metadataConfig?): Promise<JsMetadataExtraction>`
|
|
187
|
+
Convert stream with metadata extraction.
|
|
188
|
+
|
|
189
|
+
#### `hasMetadataSupport(): boolean`
|
|
190
|
+
Check if metadata extraction is available at runtime.
|
|
191
|
+
|
|
192
|
+
### Visitor Pattern Functions
|
|
193
|
+
|
|
194
|
+
Custom element callbacks for fine-grained conversion control.
|
|
195
|
+
|
|
196
|
+
#### `convertWithVisitor(html: string, config: { visitor: Visitor; options?: ConversionOptions }): string | Promise<string>`
|
|
197
|
+
Convert with visitor callbacks for element interception.
|
|
198
|
+
|
|
199
|
+
#### `convertWithAsyncVisitor(html: string, config: { visitor: AsyncVisitor; options?: ConversionOptions }): Promise<string>`
|
|
200
|
+
Convert with async visitor methods for I/O operations.
|
|
201
|
+
|
|
202
|
+
## Type Definitions
|
|
203
|
+
|
|
204
|
+
### ConversionOptions
|
|
205
|
+
```typescript
|
|
206
|
+
interface ConversionOptions {
|
|
207
|
+
headingStyle?: 'atx' | 'setext'; // # Style or underline style
|
|
208
|
+
bulletListMarker?: '-' | '*' | '+'; // List marker
|
|
209
|
+
codeBlockStyle?: 'fenced' | 'indented'; // Code block format
|
|
210
|
+
horizontalRule?: string; // --- or *** or ___
|
|
211
|
+
listIndentWidth?: number; // Indentation (default: 4)
|
|
212
|
+
wrap?: boolean; // Enable text wrapping
|
|
213
|
+
wrapWidth?: number; // Wrap column width
|
|
214
|
+
preserveNotices?: boolean; // Keep HTML comments
|
|
215
|
+
sanitize?: boolean; // Remove unsafe HTML (default: true)
|
|
216
|
+
headingPrefix?: string; // Prefix for headings
|
|
217
|
+
strongDelimiter?: string; // ** or __
|
|
218
|
+
emDelimiter?: string; // * or _
|
|
219
|
+
}
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
### Metadata Types
|
|
223
|
+
```typescript
|
|
224
|
+
interface JsMetadataConfig {
|
|
225
|
+
extractHeaders?: boolean; // h1-h6 elements
|
|
226
|
+
extractLinks?: boolean; // <a> elements
|
|
227
|
+
extractImages?: boolean; // <img> and inline SVG
|
|
228
|
+
extractStructuredData?: boolean; // JSON-LD, Microdata, RDFa
|
|
229
|
+
maxStructuredDataSize?: number; // Size limit (default: 1MB)
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
interface JsExtendedMetadata {
|
|
233
|
+
document: JsDocumentMetadata;
|
|
234
|
+
headers: JsHeaderMetadata[];
|
|
235
|
+
links: JsLinkMetadata[];
|
|
236
|
+
images: JsImageMetadata[];
|
|
237
|
+
structuredData: JsStructuredData[];
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
interface JsDocumentMetadata {
|
|
241
|
+
title?: string;
|
|
242
|
+
description?: string;
|
|
243
|
+
keywords: string[];
|
|
244
|
+
author?: string;
|
|
245
|
+
canonicalUrl?: string;
|
|
246
|
+
language?: string;
|
|
247
|
+
textDirection?: 'ltr' | 'rtl' | 'auto';
|
|
248
|
+
openGraph: Record<string, string>;
|
|
249
|
+
twitterCard: Record<string, string>;
|
|
250
|
+
metaTags: Record<string, string>;
|
|
251
|
+
}
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
### Visitor Types
|
|
255
|
+
```typescript
|
|
256
|
+
interface Visitor {
|
|
257
|
+
visitText?(ctx: NodeContext, text: string): VisitResult;
|
|
258
|
+
visitLink?(ctx: NodeContext, href: string, text: string, title?: string): VisitResult;
|
|
259
|
+
visitImage?(ctx: NodeContext, src: string, alt?: string, title?: string): VisitResult;
|
|
260
|
+
visitHeading?(ctx: NodeContext, level: number, text: string, id?: string): VisitResult;
|
|
261
|
+
visitCodeBlock?(ctx: NodeContext, lang?: string, code?: string): VisitResult;
|
|
262
|
+
// ... 41 total methods for fine-grained control
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
interface NodeContext {
|
|
266
|
+
nodeType: string;
|
|
267
|
+
tagName: string;
|
|
268
|
+
attributes: Record<string, string>;
|
|
269
|
+
depth: number;
|
|
270
|
+
indexInParent: number;
|
|
271
|
+
parentTag: string | null;
|
|
272
|
+
isInline: boolean;
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
type VisitResult =
|
|
276
|
+
| { type: 'continue' }
|
|
277
|
+
| { type: 'custom'; output: string }
|
|
278
|
+
| { type: 'skip' }
|
|
279
|
+
| { type: 'preserveHtml' }
|
|
280
|
+
| { type: 'error'; message: string };
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
## Error Handling
|
|
284
|
+
|
|
285
|
+
```typescript
|
|
286
|
+
try {
|
|
287
|
+
const markdown = convert(html);
|
|
288
|
+
} catch (error) {
|
|
289
|
+
if (error instanceof Error) {
|
|
290
|
+
console.error('Conversion failed:', error.message);
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
Binary input (for example, PDF bytes coerced to a string) raises an error whose message is `Invalid input`.
|
|
296
|
+
|
|
297
|
+
## Examples
|
|
298
|
+
|
|
299
|
+
See comprehensive guides in the examples directory:
|
|
300
|
+
|
|
301
|
+
- **[Visitor Pattern](../../examples/visitor-pattern/)** - Custom callbacks, filtering, transformations, analytics
|
|
302
|
+
- **[Metadata Extraction](../../examples/metadata-extraction/)** - SEO metadata, TOC generation, link validation
|
|
303
|
+
- **[Performance](../../examples/performance/)** - Benchmarks, optimization strategies
|
|
304
|
+
|
|
305
|
+
## TypeScript Configuration
|
|
306
|
+
|
|
307
|
+
For strict type checking:
|
|
308
|
+
|
|
309
|
+
```json
|
|
310
|
+
{
|
|
311
|
+
"compilerOptions": {
|
|
312
|
+
"strict": true,
|
|
313
|
+
"noUncheckedIndexedAccess": true,
|
|
314
|
+
"exactOptionalPropertyTypes": true,
|
|
315
|
+
"noImplicitAny": true,
|
|
316
|
+
"noImplicitThis": true,
|
|
317
|
+
"strictNullChecks": true,
|
|
318
|
+
"strictFunctionTypes": true,
|
|
319
|
+
"strictPropertyInitialization": true,
|
|
320
|
+
"noImplicitReturns": true
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
All bindings are fully typed with no `any` types. Leverage TypeScript for compile-time safety.
|
|
326
|
+
|
|
327
|
+
## Performance
|
|
328
|
+
|
|
329
|
+
Benchmarks from Apple M4 (ops/sec):
|
|
330
|
+
|
|
331
|
+
| Document | Size | ops/sec |
|
|
332
|
+
| ------------------- | ------- | ------- |
|
|
333
|
+
| Small (Intro) | 463 KB | 627 |
|
|
334
|
+
| Medium (Python) | 657 KB | 460 |
|
|
335
|
+
| Large (Rust) | 567 KB | 554 |
|
|
336
|
+
| Lists (Timeline) | 129 KB | 3,137 |
|
|
337
|
+
| Tables (Countries) | 360 KB | 932 |
|
|
338
|
+
|
|
339
|
+
Run `task bench:harness -- --frameworks node` to benchmark locally.
|
|
340
|
+
|
|
341
|
+
## Links
|
|
342
|
+
|
|
343
|
+
- [GitHub](https://github.com/kreuzberg-dev/html-to-markdown)
|
|
344
|
+
- [npm Package](https://www.npmjs.com/package/@kreuzberg/html-to-markdown)
|
|
345
|
+
- [WASM Package](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm)
|
|
346
|
+
- [Discord Community](https://discord.gg/pXxagNK2zN)
|
|
347
|
+
|
|
348
|
+
## License
|
|
349
|
+
|
|
350
|
+
MIT
|
package/dist/cli.d.ts
ADDED
package/dist/cli.js
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { promises as fs } from "node:fs";
|
|
3
|
+
import { stderr, stdin, stdout } from "node:process";
|
|
4
|
+
import { convertJson as convertHtmlJson, convertWithInlineImagesJson, } from "@kreuzberg/html-to-markdown-node";
|
|
5
|
+
import { convertStream, convertStreamWithInlineImages } from "./index";
|
|
6
|
+
/**
 * Replacer for JSON.stringify: bigint values are converted with Number(),
 * every other value is passed through untouched.
 */
const jsonReplacer = (_key, value) => {
    if (typeof value === "bigint") {
        return Number(value);
    }
    return value;
};
/**
 * Serialize a value to a JSON string using jsonReplacer.
 *
 * @returns the JSON text, or undefined when the value is null or undefined
 */
const toJson = (value) => (value === null || value === undefined ? undefined : JSON.stringify(value, jsonReplacer));
|
|
13
|
+
function printUsage() {
|
|
14
|
+
stdout.write(`html-to-markdown CLI\n\n`);
|
|
15
|
+
stdout.write(`Usage:\n`);
|
|
16
|
+
stdout.write(` html-to-markdown [--input <file>] [--output <file>] [--options '{...}'] [--inline-images]\n\n`);
|
|
17
|
+
stdout.write(`Options:\n`);
|
|
18
|
+
stdout.write(` --input <file> Read HTML from a file instead of stdin\n`);
|
|
19
|
+
stdout.write(` --output <file> Write Markdown to a file instead of stdout\n`);
|
|
20
|
+
stdout.write(` --options <json> JSON encoded conversion options\n`);
|
|
21
|
+
stdout.write(` --inline-images Collect inline images (writes JSON to <output>.images.json)\n`);
|
|
22
|
+
stdout.write(` --inline-image-config JSON encoded inline image extraction options\n`);
|
|
23
|
+
stdout.write(` -h, --help Show this help message\n`);
|
|
24
|
+
stdout.write(` -v, --version Print the package version\n`);
|
|
25
|
+
}
|
|
26
|
+
async function loadJson(value, label) {
|
|
27
|
+
try {
|
|
28
|
+
return JSON.parse(value);
|
|
29
|
+
}
|
|
30
|
+
catch (error) {
|
|
31
|
+
throw new Error(`Failed to parse ${label} JSON: ${error.message}`);
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
/**
 * Return the argument that follows position `index`.
 *
 * @param args full argument list
 * @param index position of the flag whose value is wanted
 * @param flagName flag name used in the error message
 * @throws Error when there is no argument after `index`
 */
function getNextArg(args, index, flagName) {
    const following = args[index + 1];
    if (following !== undefined) {
        return following;
    }
    throw new Error(`Missing value for ${flagName}`);
}
|
|
41
|
+
/**
 * Parse process.argv (minus the first two entries) into CLI settings.
 *
 * @returns "help" or "version" as soon as one of those flags is seen,
 *          otherwise an object with any of: input, output, options,
 *          inlineImages, inlineImageConfig
 * @throws Error for an unknown argument, a flag missing its value, or
 *         invalid JSON passed to --options / --inline-image-config
 */
async function parseArgs() {
    const argv = process.argv.slice(2);
    const collected = {};
    let cursor = 0;
    while (cursor < argv.length) {
        const flag = argv[cursor];
        if (flag === undefined) {
            cursor += 1;
            continue;
        }
        if (flag === "-h" || flag === "--help") {
            return "help";
        }
        if (flag === "-v" || flag === "--version") {
            return "version";
        }
        if (flag === "--inline-images") {
            // Boolean switch: consumes only itself.
            collected.inlineImages = true;
            cursor += 1;
        }
        else if (flag === "--input" || flag === "--output") {
            const raw = getNextArg(argv, cursor, flag);
            if (flag === "--input") {
                collected.input = raw;
            }
            else {
                collected.output = raw;
            }
            // Skip the flag and the value it consumed.
            cursor += 2;
        }
        else if (flag === "--options" || flag === "--inline-image-config") {
            const decoded = await loadJson(getNextArg(argv, cursor, flag), flag);
            if (flag === "--options") {
                collected.options = decoded;
            }
            else {
                collected.inlineImageConfig = decoded;
            }
            cursor += 2;
        }
        else {
            throw new Error(`Unknown argument: ${flag}`);
        }
    }
    return collected;
}
|
|
80
|
+
async function readInput(path) {
|
|
81
|
+
if (path) {
|
|
82
|
+
return fs.readFile(path, "utf8");
|
|
83
|
+
}
|
|
84
|
+
if (!stdin.isTTY) {
|
|
85
|
+
return stdin;
|
|
86
|
+
}
|
|
87
|
+
throw new Error("No input provided. Pass --input <file> or pipe HTML via stdin.");
|
|
88
|
+
}
|
|
89
|
+
/**
 * Emit converted Markdown.
 *
 * With a path, the content is written to that file as UTF-8. Without one it
 * goes to stdout, followed by a newline when the content does not already
 * end with one.
 */
async function writeOutput(content, path) {
    if (path) {
        await fs.writeFile(path, content, "utf8");
        return;
    }
    stdout.write(content);
    const endsWithNewline = content.endsWith("\n");
    if (!endsWithNewline) {
        stdout.write("\n");
    }
}
|
|
99
|
+
/**
 * Write the markdown, inlineImages and warnings fields of an extraction
 * result to `extractionPath` as pretty-printed JSON with a trailing newline.
 */
async function writeInlineImages(extractionPath, inlineData) {
    const { markdown, inlineImages, warnings } = inlineData;
    const serialized = JSON.stringify({ markdown, inlineImages, warnings }, null, 2);
    await fs.writeFile(extractionPath, `${serialized}\n`, "utf8");
}
|
|
107
|
+
/**
 * CLI entry point.
 *
 * Parses the arguments, handles --help / --version, reads the HTML input
 * (file contents or the stdin stream), converts it, and writes the Markdown
 * to the output file or stdout. With --inline-images the extraction result is
 * also written: to `<output>.images.json` when an output file is given,
 * otherwise as JSON on stdout after the Markdown.
 *
 * Any failure is reported on stderr as `Error: <message>` and sets
 * process.exitCode to 1; the function itself never rejects.
 */
async function main() {
    try {
        const parsed = await parseArgs();
        if (parsed === "help") {
            printUsage();
            return;
        }
        if (parsed === "version") {
            // NOTE(review): this file uses `import` syntax, and `require` is not
            // defined when Node loads a file as an ES module, so `--version`
            // would fail there with a ReferenceError. Confirm the intended
            // module format/runtime for the published build.
            const { version } = require("../package.json");
            stdout.write(`${version}\n`);
            return;
        }
        const { input, output, options, inlineImages, inlineImageConfig } = parsed;
        // A string when --input was given, otherwise the stdin stream.
        const inputContent = await readInput(input);
        if (inlineImages) {
            const inlineResult = typeof inputContent === "string"
                ? convertWithInlineImagesJson(inputContent, toJson(options), toJson(inlineImageConfig))
                : await convertStreamWithInlineImages(inputContent, options, inlineImageConfig);
            // writeOutput already handles both the file and the stdout case
            // (including the trailing newline on stdout).
            await writeOutput(inlineResult.markdown, output);
            if (output) {
                const imagePath = `${output}.images.json`;
                await writeInlineImages(imagePath, inlineResult);
                stdout.write(`Inline images written to ${imagePath}\n`);
            }
            else {
                stdout.write(`${JSON.stringify({ inlineImages: inlineResult.inlineImages, warnings: inlineResult.warnings }, null, 2)}\n`);
            }
        }
        else {
            const markdown = typeof inputContent === "string"
                ? convertHtmlJson(inputContent, toJson(options))
                : await convertStream(inputContent, options);
            await writeOutput(markdown, output);
        }
    }
    catch (error) {
        // A thrown value is not guaranteed to be an Error; avoid printing
        // "Error: undefined" for strings or other non-Error throwables.
        const message = error instanceof Error ? error.message : String(error);
        stderr.write(`Error: ${message}\n`);
        process.exitCode = 1;
    }
}
|
|
153
|
+
void main();
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import type { Readable } from "node:stream";
|
|
2
|
+
import { type JsConversionOptions, type JsHtmlExtraction, type JsInlineImageConfig, type JsMetadataConfig, type JsMetadataExtraction } from "@kreuzberg/html-to-markdown-node";
|
|
3
|
+
export * from "@kreuzberg/html-to-markdown-node";
|
|
4
|
+
/**
|
|
5
|
+
* Check if metadata extraction functionality is available.
|
|
6
|
+
*
|
|
7
|
+
* @returns true if convertWithMetadata is available, false otherwise
|
|
8
|
+
*/
|
|
9
|
+
export declare function hasMetadataSupport(): boolean;
|
|
10
|
+
/**
|
|
11
|
+
* Convert the contents of an HTML file to Markdown.
|
|
12
|
+
*/
|
|
13
|
+
export declare function convertFile(filePath: string, options?: JsConversionOptions | null | undefined): Promise<string>;
|
|
14
|
+
/**
|
|
15
|
+
* Convert an HTML file and collect inline images.
|
|
16
|
+
*/
|
|
17
|
+
export declare function convertFileWithInlineImages(filePath: string, options?: JsConversionOptions | null | undefined, imageConfig?: JsInlineImageConfig | null | undefined): Promise<JsHtmlExtraction>;
|
|
18
|
+
/**
|
|
19
|
+
* Convert HTML streamed from stdin or another readable stream.
|
|
20
|
+
*/
|
|
21
|
+
export declare function convertStream(stream: Readable | AsyncIterable<string | Buffer>, options?: JsConversionOptions | null | undefined): Promise<string>;
|
|
22
|
+
/**
|
|
23
|
+
* Convert HTML from a stream and collect inline images.
|
|
24
|
+
*/
|
|
25
|
+
export declare function convertStreamWithInlineImages(stream: Readable | AsyncIterable<string | Buffer>, options?: JsConversionOptions | null | undefined, imageConfig?: JsInlineImageConfig | null | undefined): Promise<JsHtmlExtraction>;
|
|
26
|
+
/**
|
|
27
|
+
* Convert HTML to Markdown and extract comprehensive metadata.
|
|
28
|
+
*
|
|
29
|
+
* Extracts document metadata (title, description, language, etc.), headers,
|
|
30
|
+
* links, images, and structured data (JSON-LD, Microdata, RDFa).
|
|
31
|
+
*
|
|
32
|
+
* @param html HTML content to convert
|
|
33
|
+
* @param options Optional conversion configuration
|
|
34
|
+
* @param metadataConfig Optional metadata extraction configuration
|
|
35
|
+
* @returns Object with converted markdown and extracted metadata
|
|
36
|
+
*
|
|
37
|
+
* @example
|
|
38
|
+
* ```ts
|
|
39
|
+
* import { convertWithMetadata } from '@kreuzberg/html-to-markdown';
|
|
40
|
+
*
|
|
41
|
+
* const html = `
|
|
42
|
+
* <html lang="en">
|
|
43
|
+
* <head>
|
|
44
|
+
* <title>My Article</title>
|
|
45
|
+
* <meta name="description" content="An interesting article">
|
|
46
|
+
* </head>
|
|
47
|
+
* <body>
|
|
48
|
+
* <h1>Main Title</h1>
|
|
49
|
+
* <p>Content with <a href="/page">link</a></p>
|
|
50
|
+
* </body>
|
|
51
|
+
* </html>
|
|
52
|
+
* `;
|
|
53
|
+
*
|
|
54
|
+
* const { markdown, metadata } = convertWithMetadata(html, undefined, {
|
|
55
|
+
* extractHeaders: true,
|
|
56
|
+
* extractLinks: true,
|
|
57
|
+
* extractImages: true,
|
|
58
|
+
* });
|
|
59
|
+
*
|
|
60
|
+
* console.log(metadata.document.title); // "My Article"
|
|
61
|
+
* console.log(metadata.headers.length); // 1
|
|
62
|
+
* console.log(metadata.links.length); // 1
|
|
63
|
+
* ```
|
|
64
|
+
*/
|
|
65
|
+
export declare function convertWithMetadata(html: string, options?: JsConversionOptions | null | undefined, metadataConfig?: JsMetadataConfig | null | undefined): JsMetadataExtraction;
|
|
66
|
+
/**
|
|
67
|
+
* Convert HTML from Buffer/Uint8Array to Markdown with metadata extraction.
|
|
68
|
+
*
|
|
69
|
+
* Avoids creating intermediate JavaScript strings by accepting raw bytes.
|
|
70
|
+
* Auto-detects UTF-8 encoding.
|
|
71
|
+
*/
|
|
72
|
+
export declare function convertWithMetadataBuffer(html: Buffer | Uint8Array, options?: JsConversionOptions | null | undefined, metadataConfig?: JsMetadataConfig | null | undefined): JsMetadataExtraction;
|
|
73
|
+
/**
|
|
74
|
+
* Convert the contents of an HTML file to Markdown with metadata extraction.
|
|
75
|
+
*/
|
|
76
|
+
export declare function convertFileWithMetadata(filePath: string, options?: JsConversionOptions | null | undefined, metadataConfig?: JsMetadataConfig | null | undefined): Promise<JsMetadataExtraction>;
|
|
77
|
+
/**
|
|
78
|
+
* Convert HTML streamed from stdin or another readable stream with metadata extraction.
|
|
79
|
+
*/
|
|
80
|
+
export declare function convertStreamWithMetadata(stream: Readable | AsyncIterable<string | Buffer>, options?: JsConversionOptions | null | undefined, metadataConfig?: JsMetadataConfig | null | undefined): Promise<JsMetadataExtraction>;
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import { readFile } from "node:fs/promises";
|
|
2
|
+
import { convertJson as convertHtmlJson, convertWithInlineImagesJson as convertHtmlWithInlineImagesJson, convertWithMetadataBufferJson as convertHtmlWithMetadataBufferJson, convertWithMetadataJson as convertHtmlWithMetadataJson, } from "@kreuzberg/html-to-markdown-node";
|
|
3
|
+
export * from "@kreuzberg/html-to-markdown-node";
|
|
4
|
+
/**
 * JSON.stringify replacer that turns bigint values into numbers via Number()
 * and leaves all other values unchanged.
 */
const jsonReplacer = (_key, value) => {
    const isBigInt = typeof value === "bigint";
    return isBigInt ? Number(value) : value;
};
/**
 * Serialize `value` to JSON text with jsonReplacer.
 * null and undefined both yield undefined instead of a string.
 */
const toJson = (value) => {
    if (value === undefined || value === null) {
        return undefined;
    }
    const text = JSON.stringify(value, jsonReplacer);
    return text;
};
|
|
11
|
+
/**
 * Report whether metadata extraction is usable.
 *
 * @returns true when the imported metadata conversion binding is a function;
 *          false otherwise, including when the check itself throws
 */
export function hasMetadataSupport() {
    let supported = false;
    try {
        supported = typeof convertHtmlWithMetadataJson === "function";
    }
    catch {
        supported = false;
    }
    return supported;
}
|
|
24
|
+
/**
 * Read an HTML file as UTF-8 and convert its contents to Markdown.
 *
 * @param filePath path of the HTML file
 * @param options optional conversion options (null is treated like undefined)
 * @returns the converted Markdown
 */
export async function convertFile(filePath, options) {
    const source = await readFile(filePath, "utf8");
    const serializedOptions = toJson(options);
    return convertHtmlJson(source, serializedOptions);
}
|
|
31
|
+
/**
 * Read an HTML file as UTF-8, convert it, and collect inline images.
 *
 * @param filePath path of the HTML file
 * @param options optional conversion options (null is treated like undefined)
 * @param imageConfig optional inline image extraction options
 * @returns the extraction result produced by the native binding
 */
export async function convertFileWithInlineImages(filePath, options, imageConfig) {
    const source = await readFile(filePath, "utf8");
    const serializedOptions = toJson(options);
    const serializedImageConfig = toJson(imageConfig);
    return convertHtmlWithInlineImagesJson(source, serializedOptions, serializedImageConfig);
}
|
|
38
|
+
/**
 * Convert HTML streamed from stdin or another readable stream.
 *
 * The whole stream is buffered into one string before conversion. Byte chunks
 * are decoded with a single streaming UTF-8 decoder, so a multi-byte
 * character split across two chunks is reassembled instead of being decoded
 * as replacement characters.
 *
 * @param stream async iterable yielding string or Buffer chunks
 * @param options optional conversion options (null is treated like undefined)
 * @returns the converted Markdown
 */
export async function convertStream(stream, options) {
    // ignoreBOM keeps a leading BOM in the text, as Buffer#toString("utf8") does.
    const decoder = new TextDecoder("utf-8", { ignoreBOM: true });
    let html = "";
    for await (const chunk of stream) {
        if (typeof chunk === "string") {
            // Flush any bytes still pending from earlier binary chunks so the
            // text stays in arrival order.
            html += decoder.decode() + chunk;
        }
        else {
            html += decoder.decode(chunk, { stream: true });
        }
    }
    // Flush a trailing incomplete sequence, if any.
    html += decoder.decode();
    return convertHtmlJson(html, toJson(options ?? undefined));
}
|
|
48
|
+
/**
 * Convert HTML from a stream and collect inline images.
 *
 * The whole stream is buffered into one string before conversion. Byte chunks
 * are decoded with a single streaming UTF-8 decoder, so a multi-byte
 * character split across two chunks is reassembled instead of being decoded
 * as replacement characters.
 *
 * @param stream async iterable yielding string or Buffer chunks
 * @param options optional conversion options (null is treated like undefined)
 * @param imageConfig optional inline image extraction options
 * @returns the extraction result produced by the native binding
 */
export async function convertStreamWithInlineImages(stream, options, imageConfig) {
    // ignoreBOM keeps a leading BOM in the text, as Buffer#toString("utf8") does.
    const decoder = new TextDecoder("utf-8", { ignoreBOM: true });
    let html = "";
    for await (const chunk of stream) {
        if (typeof chunk === "string") {
            // Flush any bytes still pending from earlier binary chunks so the
            // text stays in arrival order.
            html += decoder.decode() + chunk;
        }
        else {
            html += decoder.decode(chunk, { stream: true });
        }
    }
    // Flush a trailing incomplete sequence, if any.
    html += decoder.decode();
    return convertHtmlWithInlineImagesJson(html, toJson(options ?? undefined), toJson(imageConfig ?? undefined));
}
|
|
58
|
+
/**
|
|
59
|
+
* Convert HTML to Markdown and extract comprehensive metadata.
|
|
60
|
+
*
|
|
61
|
+
* Extracts document metadata (title, description, language, etc.), headers,
|
|
62
|
+
* links, images, and structured data (JSON-LD, Microdata, RDFa).
|
|
63
|
+
*
|
|
64
|
+
* @param html HTML content to convert
|
|
65
|
+
* @param options Optional conversion configuration
|
|
66
|
+
* @param metadataConfig Optional metadata extraction configuration
|
|
67
|
+
* @returns Object with converted markdown and extracted metadata
|
|
68
|
+
*
|
|
69
|
+
* @example
|
|
70
|
+
* ```ts
|
|
71
|
+
* import { convertWithMetadata } from '@kreuzberg/html-to-markdown';
|
|
72
|
+
*
|
|
73
|
+
* const html = `
|
|
74
|
+
* <html lang="en">
|
|
75
|
+
* <head>
|
|
76
|
+
* <title>My Article</title>
|
|
77
|
+
* <meta name="description" content="An interesting article">
|
|
78
|
+
* </head>
|
|
79
|
+
* <body>
|
|
80
|
+
* <h1>Main Title</h1>
|
|
81
|
+
* <p>Content with <a href="/page">link</a></p>
|
|
82
|
+
* </body>
|
|
83
|
+
* </html>
|
|
84
|
+
* `;
|
|
85
|
+
*
|
|
86
|
+
* const { markdown, metadata } = convertWithMetadata(html, undefined, {
|
|
87
|
+
* extractHeaders: true,
|
|
88
|
+
* extractLinks: true,
|
|
89
|
+
* extractImages: true,
|
|
90
|
+
* });
|
|
91
|
+
*
|
|
92
|
+
* console.log(metadata.document.title); // "My Article"
|
|
93
|
+
* console.log(metadata.headers.length); // 1
|
|
94
|
+
* console.log(metadata.links.length); // 1
|
|
95
|
+
* ```
|
|
96
|
+
*/
|
|
97
|
+
export function convertWithMetadata(html, options, metadataConfig) {
    // Both option objects cross into the native binding as JSON text;
    // toJson maps null/undefined to undefined.
    const serializedOptions = toJson(options);
    const serializedMetadataConfig = toJson(metadataConfig);
    return convertHtmlWithMetadataJson(html, serializedOptions, serializedMetadataConfig);
}
|
|
100
|
+
/**
 * Convert HTML supplied as raw bytes to Markdown with metadata extraction.
 *
 * A Buffer is passed to the native binding as-is; any other input is first
 * turned into a Buffer with Buffer.from.
 *
 * @param html HTML bytes (Buffer or Uint8Array)
 * @param options optional conversion options (null is treated like undefined)
 * @param metadataConfig optional metadata extraction options
 * @returns the extraction result produced by the native binding
 */
export function convertWithMetadataBuffer(html, options, metadataConfig) {
    let bytes = html;
    if (!Buffer.isBuffer(bytes)) {
        bytes = Buffer.from(bytes);
    }
    const serializedOptions = toJson(options);
    const serializedMetadataConfig = toJson(metadataConfig);
    return convertHtmlWithMetadataBufferJson(bytes, serializedOptions, serializedMetadataConfig);
}
|
|
110
|
+
/**
 * Read an HTML file as UTF-8 and convert it to Markdown with metadata
 * extraction, by delegating to convertWithMetadata.
 *
 * @param filePath path of the HTML file
 * @param options optional conversion options
 * @param metadataConfig optional metadata extraction options
 */
export async function convertFileWithMetadata(filePath, options, metadataConfig) {
    const source = await readFile(filePath, "utf8");
    const normalizedOptions = options ?? undefined;
    const normalizedMetadataConfig = metadataConfig ?? undefined;
    return convertWithMetadata(source, normalizedOptions, normalizedMetadataConfig);
}
|
|
117
|
+
/**
 * Convert HTML streamed from stdin or another readable stream with metadata
 * extraction, by delegating to convertWithMetadata.
 *
 * The whole stream is buffered into one string before conversion. Byte chunks
 * are decoded with a single streaming UTF-8 decoder, so a multi-byte
 * character split across two chunks is reassembled instead of being decoded
 * as replacement characters.
 *
 * @param stream async iterable yielding string or Buffer chunks
 * @param options optional conversion options
 * @param metadataConfig optional metadata extraction options
 */
export async function convertStreamWithMetadata(stream, options, metadataConfig) {
    // ignoreBOM keeps a leading BOM in the text, as Buffer#toString("utf8") does.
    const decoder = new TextDecoder("utf-8", { ignoreBOM: true });
    let html = "";
    for await (const chunk of stream) {
        if (typeof chunk === "string") {
            // Flush any bytes still pending from earlier binary chunks so the
            // text stays in arrival order.
            html += decoder.decode() + chunk;
        }
        else {
            html += decoder.decode(chunk, { stream: true });
        }
    }
    // Flush a trailing incomplete sequence, if any.
    html += decoder.decode();
    return convertWithMetadata(html, options ?? undefined, metadataConfig ?? undefined);
}
|
package/package.json
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@kreuzberg/html-to-markdown",
|
|
3
|
+
"version": "2.19.0-rc.1",
|
|
4
|
+
"description": "High-performance HTML to Markdown converter for TypeScript/Node.js with a Rust core.",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"html",
|
|
7
|
+
"markdown",
|
|
8
|
+
"converter",
|
|
9
|
+
"rust",
|
|
10
|
+
"cli",
|
|
11
|
+
"napi",
|
|
12
|
+
"typescript"
|
|
13
|
+
],
|
|
14
|
+
"license": "MIT",
|
|
15
|
+
"author": {
|
|
16
|
+
"name": "Na'aman Hirschfeld",
|
|
17
|
+
"email": "nhirschfeld@gmail.com"
|
|
18
|
+
},
|
|
19
|
+
"homepage": "https://github.com/kreuzberg-dev/html-to-markdown",
|
|
20
|
+
"repository": {
|
|
21
|
+
"type": "git",
|
|
22
|
+
"url": "https://github.com/kreuzberg-dev/html-to-markdown"
|
|
23
|
+
},
|
|
24
|
+
"bugs": {
|
|
25
|
+
"url": "https://github.com/kreuzberg-dev/html-to-markdown/issues"
|
|
26
|
+
},
|
|
27
|
+
"main": "dist/index.js",
|
|
28
|
+
"types": "dist/index.d.ts",
|
|
29
|
+
"exports": {
|
|
30
|
+
".": {
|
|
31
|
+
"types": "./dist/index.d.ts",
|
|
32
|
+
"default": "./dist/index.js"
|
|
33
|
+
},
|
|
34
|
+
"./cli": {
|
|
35
|
+
"types": "./dist/cli.d.ts",
|
|
36
|
+
"default": "./dist/cli.js"
|
|
37
|
+
}
|
|
38
|
+
},
|
|
39
|
+
"bin": {
|
|
40
|
+
"html-to-markdown": "./dist/cli.js"
|
|
41
|
+
},
|
|
42
|
+
"scripts": {
|
|
43
|
+
"build": "pnpm --filter @kreuzberg/html-to-markdown-node run build && tsc --project tsconfig.json",
|
|
44
|
+
"clean": "rm -rf dist",
|
|
45
|
+
"lint": "biome check src",
|
|
46
|
+
"lint:fix": "biome check --write src",
|
|
47
|
+
"format:fix": "biome check --write src",
|
|
48
|
+
"format:check": "biome check src",
|
|
49
|
+
"test": "pnpm --filter @kreuzberg/html-to-markdown-node run build && vitest run",
|
|
50
|
+
"test:coverage": "pnpm --filter @kreuzberg/html-to-markdown-node run build && vitest run --coverage",
|
|
51
|
+
"test:watch": "vitest"
|
|
52
|
+
},
|
|
53
|
+
"files": [
|
|
54
|
+
"dist",
|
|
55
|
+
"README.md"
|
|
56
|
+
],
|
|
57
|
+
"dependencies": {
|
|
58
|
+
"@kreuzberg/html-to-markdown-node": "2.19.0-rc.1"
|
|
59
|
+
},
|
|
60
|
+
"devDependencies": {
|
|
61
|
+
"@biomejs/biome": "^2.3.10",
|
|
62
|
+
"@types/node": "^25.0.3",
|
|
63
|
+
"@vitest/coverage-v8": "^4.0.16",
|
|
64
|
+
"typescript": "^5.9.3",
|
|
65
|
+
"vitest": "^4.0.16"
|
|
66
|
+
},
|
|
67
|
+
"engines": {
|
|
68
|
+
"node": ">=18"
|
|
69
|
+
},
|
|
70
|
+
"publishConfig": {
|
|
71
|
+
"registry": "https://registry.npmjs.org/",
|
|
72
|
+
"access": "public"
|
|
73
|
+
}
|
|
74
|
+
}
|