html-to-markdown-wasm 2.12.1 → 2.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +107 -0
- package/dist/README.md +420 -13
- package/dist/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist/package.json +1 -1
- package/dist-node/README.md +420 -13
- package/dist-node/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist-node/package.json +1 -1
- package/dist-web/README.md +420 -13
- package/dist-web/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist-web/package.json +1 -1
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -291,6 +291,113 @@ for (const img of result.inlineImages) {
|
|
|
291
291
|
}
|
|
292
292
|
```
|
|
293
293
|
|
|
294
|
+
## Metadata Extraction
|
|
295
|
+
|
|
296
|
+
Extract document metadata (headers, links, images, structured data) alongside Markdown conversion:
|
|
297
|
+
|
|
298
|
+
```typescript
|
|
299
|
+
import { convertWithMetadata, WasmMetadataConfig } from 'html-to-markdown-wasm';
|
|
300
|
+
|
|
301
|
+
const html = `
|
|
302
|
+
<html lang="en">
|
|
303
|
+
<head><title>My Article</title></head>
|
|
304
|
+
<body>
|
|
305
|
+
<h1>Main Title</h1>
|
|
306
|
+
<p>Content with <a href="https://example.com">a link</a></p>
|
|
307
|
+
<img src="https://example.com/image.jpg" alt="Example image">
|
|
308
|
+
</body>
|
|
309
|
+
</html>
|
|
310
|
+
`;
|
|
311
|
+
|
|
312
|
+
const config = new WasmMetadataConfig();
|
|
313
|
+
config.extractHeaders = true;
|
|
314
|
+
config.extractLinks = true;
|
|
315
|
+
config.extractImages = true;
|
|
316
|
+
config.extractStructuredData = true;
|
|
317
|
+
config.maxStructuredDataSize = 1_000_000; // 1MB limit
|
|
318
|
+
|
|
319
|
+
const result = convertWithMetadata(html, null, config);
|
|
320
|
+
|
|
321
|
+
console.log(result.markdown);
|
|
322
|
+
console.log('Document metadata:', result.metadata.document);
|
|
323
|
+
// {
|
|
324
|
+
// title: 'My Article',
|
|
325
|
+
// language: 'en',
|
|
326
|
+
// ...
|
|
327
|
+
// }
|
|
328
|
+
|
|
329
|
+
console.log('Headers:', result.metadata.headers);
|
|
330
|
+
// [
|
|
331
|
+
// { level: 1, text: 'Main Title', id: undefined, depth: 0, htmlOffset: ... }
|
|
332
|
+
// ]
|
|
333
|
+
|
|
334
|
+
console.log('Links:', result.metadata.links);
|
|
335
|
+
// [
|
|
336
|
+
// {
|
|
337
|
+
// href: 'https://example.com',
|
|
338
|
+
// text: 'a link',
|
|
339
|
+
// linkType: 'external',
|
|
340
|
+
// rel: [],
|
|
341
|
+
// ...
|
|
342
|
+
// }
|
|
343
|
+
// ]
|
|
344
|
+
|
|
345
|
+
console.log('Images:', result.metadata.images);
|
|
346
|
+
// [
|
|
347
|
+
// {
|
|
348
|
+
// src: 'https://example.com/image.jpg',
|
|
349
|
+
// alt: 'Example image',
|
|
350
|
+
// imageType: 'external',
|
|
351
|
+
// ...
|
|
352
|
+
// }
|
|
353
|
+
// ]
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
### Metadata Configuration
|
|
357
|
+
|
|
358
|
+
The `WasmMetadataConfig` class controls what metadata is extracted:
|
|
359
|
+
|
|
360
|
+
```typescript
|
|
361
|
+
import { WasmMetadataConfig } from 'html-to-markdown-wasm';
|
|
362
|
+
|
|
363
|
+
const config = new WasmMetadataConfig();
|
|
364
|
+
|
|
365
|
+
// Enable/disable extraction types
|
|
366
|
+
config.extractHeaders = true; // h1-h6 elements
|
|
367
|
+
config.extractLinks = true; // <a> elements with link type classification
|
|
368
|
+
config.extractImages = true; // <img> and <svg> elements
|
|
369
|
+
config.extractStructuredData = true; // JSON-LD, Microdata, RDFa
|
|
370
|
+
|
|
371
|
+
// Limit structured data size to prevent memory exhaustion
|
|
372
|
+
config.maxStructuredDataSize = 1_000_000; // 1MB default
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
### Metadata Structure
|
|
376
|
+
|
|
377
|
+
The returned metadata object includes:
|
|
378
|
+
|
|
379
|
+
- **document**: Document-level metadata (title, description, keywords, language, Open Graph tags, Twitter cards, etc.)
|
|
380
|
+
- **headers**: Array of header elements with level, text, id, and document position
|
|
381
|
+
- **links**: Array of links with href, text, type (anchor/internal/external/email/phone), and rel attributes
|
|
382
|
+
- **images**: Array of images with src, alt text, dimensions, and type classification (dataUri/external/relative/svg)
|
|
383
|
+
- **structuredData**: Array of JSON-LD, Microdata, and RDFa blocks
|
|
384
|
+
|
|
385
|
+
### Byte-Based Input
|
|
386
|
+
|
|
387
|
+
Convert bytes directly with metadata extraction:
|
|
388
|
+
|
|
389
|
+
```typescript
|
|
390
|
+
import { convertBytesWithMetadata, WasmMetadataConfig } from 'html-to-markdown-wasm';
|
|
391
|
+
import { readFileSync } from 'node:fs';
|
|
392
|
+
|
|
393
|
+
const htmlBytes = readFileSync('article.html');
|
|
394
|
+
const config = new WasmMetadataConfig();
|
|
395
|
+
|
|
396
|
+
const result = convertBytesWithMetadata(htmlBytes, null, config);
|
|
397
|
+
console.log(result.markdown);
|
|
398
|
+
console.log(result.metadata);
|
|
399
|
+
```
|
|
400
|
+
|
|
294
401
|
## Build Targets
|
|
295
402
|
|
|
296
403
|
Three build targets are provided for different environments:
|
package/dist/README.md
CHANGED
|
@@ -28,6 +28,7 @@ Experience WebAssembly-powered HTML to Markdown conversion instantly in your bro
|
|
|
28
28
|
- **Blazing Fast**: Rust-powered core delivers 10-80× faster conversion than pure Python alternatives
|
|
29
29
|
- **Universal**: Works everywhere - Node.js, Bun, Deno, browsers, Python, Rust, and standalone CLI
|
|
30
30
|
- **Smart Conversion**: Handles complex documents including nested tables, code blocks, task lists, and hOCR OCR output
|
|
31
|
+
- **Metadata Extraction**: Extract document metadata (title, description, headers, links, images) alongside conversion
|
|
31
32
|
- **Highly Configurable**: Control heading styles, code block fences, list formatting, whitespace handling, and HTML sanitization
|
|
32
33
|
- **Tag Preservation**: Keep specific HTML tags unconverted when markdown isn't expressive enough
|
|
33
34
|
- **Secure by Default**: Built-in HTML sanitization prevents malicious content
|
|
@@ -35,19 +36,22 @@ Experience WebAssembly-powered HTML to Markdown conversion instantly in your bro
|
|
|
35
36
|
|
|
36
37
|
## Documentation
|
|
37
38
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
- **
|
|
43
|
-
- **PHP
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
- **
|
|
47
|
-
- **Elixir
|
|
48
|
-
- **Rust
|
|
49
|
-
|
|
50
|
-
|
|
39
|
+
**Language Guides & API References:**
|
|
40
|
+
|
|
41
|
+
- **Python** – [README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/python/README.md) with metadata extraction, inline images, hOCR workflows
|
|
42
|
+
- **JavaScript/TypeScript** – [Node.js](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown-node/README.md) | [TypeScript](https://github.com/Goldziher/html-to-markdown/blob/main/packages/typescript/README.md) | [WASM](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown-wasm/README.md)
|
|
43
|
+
- **Ruby** – [README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/ruby/README.md) with RBS types, Steep type checking
|
|
44
|
+
- **PHP** – [Package](https://github.com/Goldziher/html-to-markdown/blob/main/packages/php/README.md) | [Extension (PIE)](https://github.com/Goldziher/html-to-markdown/blob/main/packages/php-ext/README.md)
|
|
45
|
+
- **Go** – [README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/go/README.md) with FFI bindings
|
|
46
|
+
- **Java** – [README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/java/README.md) with Panama FFI, Maven/Gradle setup
|
|
47
|
+
- **C#/.NET** – [README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/csharp/README.md) with NuGet distribution
|
|
48
|
+
- **Elixir** – [README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/elixir/README.md) with Rustler NIF bindings
|
|
49
|
+
- **Rust** – [README](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown/README.md) with core API, error handling, advanced features
|
|
50
|
+
|
|
51
|
+
**Project Resources:**
|
|
52
|
+
|
|
53
|
+
- **Contributing** – [CONTRIBUTING.md](https://github.com/Goldziher/html-to-markdown/blob/main/CONTRIBUTING.md) ⭐ Start here for development
|
|
54
|
+
- **Changelog** – [CHANGELOG.md](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md) – Version history and breaking changes
|
|
51
55
|
|
|
52
56
|
## Installation
|
|
53
57
|
|
|
@@ -102,6 +106,44 @@ See the JavaScript guides for full API documentation:
|
|
|
102
106
|
- [Node.js/Bun guide](https://github.com/Goldziher/html-to-markdown/tree/main/crates/html-to-markdown-node)
|
|
103
107
|
- [WebAssembly guide](https://github.com/Goldziher/html-to-markdown/tree/main/crates/html-to-markdown-wasm)
|
|
104
108
|
|
|
109
|
+
### Metadata extraction (all languages)
|
|
110
|
+
|
|
111
|
+
```typescript
|
|
112
|
+
import { convertWithMetadata } from 'html-to-markdown-node';
|
|
113
|
+
|
|
114
|
+
const html = `
|
|
115
|
+
<html>
|
|
116
|
+
<head>
|
|
117
|
+
<title>Example</title>
|
|
118
|
+
<meta name="description" content="Demo page">
|
|
119
|
+
<link rel="canonical" href="https://example.com/page">
|
|
120
|
+
</head>
|
|
121
|
+
<body>
|
|
122
|
+
<h1 id="welcome">Welcome</h1>
|
|
123
|
+
<a href="https://example.com" rel="nofollow external">Example link</a>
|
|
124
|
+
<img src="https://example.com/image.jpg" alt="Hero" width="640" height="480">
|
|
125
|
+
</body>
|
|
126
|
+
</html>
|
|
127
|
+
`;
|
|
128
|
+
|
|
129
|
+
const { markdown, metadata } = await convertWithMetadata(
|
|
130
|
+
html,
|
|
131
|
+
{ headingStyle: 'Atx' },
|
|
132
|
+
{ extract_links: true, extract_images: true, extract_headers: true },
|
|
133
|
+
);
|
|
134
|
+
|
|
135
|
+
console.log(markdown);
|
|
136
|
+
// metadata.document.title === 'Example'
|
|
137
|
+
// metadata.links[0].rel → ['nofollow', 'external']
|
|
138
|
+
// metadata.images[0].dimensions → [640, 480]
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
Equivalent APIs are available in every binding:
|
|
142
|
+
|
|
143
|
+
- Python: `convert_with_metadata(html, options=None, metadata_config=None)`
|
|
144
|
+
- Ruby: `HtmlToMarkdown.convert_with_metadata(html, options = nil, metadata_config = nil)`
|
|
145
|
+
- PHP: `convert_with_metadata(string $html, ?array $options = null, ?array $metadataConfig = null)`
|
|
146
|
+
|
|
105
147
|
### CLI
|
|
106
148
|
|
|
107
149
|
```bash
|
|
@@ -119,6 +161,371 @@ html-to-markdown --url https://example.com > output.md
|
|
|
119
161
|
html-to-markdown --url https://example.com --user-agent "Mozilla/5.0" > output.md
|
|
120
162
|
```
|
|
121
163
|
|
|
164
|
+
### Metadata Extraction
|
|
165
|
+
|
|
166
|
+
Extract document metadata alongside HTML-to-Markdown conversion. All bindings expose equivalent APIs:
|
|
167
|
+
|
|
168
|
+
#### CLI Examples
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
# Basic metadata extraction with conversion
|
|
172
|
+
html-to-markdown input.html --with-metadata -o output.json
|
|
173
|
+
|
|
174
|
+
# Extract document metadata (title, description, language, etc.)
|
|
175
|
+
html-to-markdown input.html --with-metadata --extract-document
|
|
176
|
+
|
|
177
|
+
# Extract headers and links
|
|
178
|
+
html-to-markdown input.html --with-metadata --extract-headers --extract-links
|
|
179
|
+
|
|
180
|
+
# Extract all metadata types with conversion
|
|
181
|
+
html-to-markdown input.html --with-metadata \
|
|
182
|
+
--extract-document \
|
|
183
|
+
--extract-headers \
|
|
184
|
+
--extract-links \
|
|
185
|
+
--extract-images \
|
|
186
|
+
--extract-structured-data \
|
|
187
|
+
-o metadata.json
|
|
188
|
+
|
|
189
|
+
# Fetch and extract from remote URL
|
|
190
|
+
html-to-markdown --url https://example.com --with-metadata -o output.json
|
|
191
|
+
|
|
192
|
+
# Web scraping with preprocessing and metadata
|
|
193
|
+
html-to-markdown page.html --preprocess --preset aggressive \
|
|
194
|
+
--with-metadata --extract-links --extract-images
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Output format (JSON):
|
|
198
|
+
|
|
199
|
+
```json
|
|
200
|
+
{
|
|
201
|
+
"markdown": "# Title\n\nContent here...",
|
|
202
|
+
"metadata": {
|
|
203
|
+
"document": {
|
|
204
|
+
"title": "Page Title",
|
|
205
|
+
"description": "Meta description",
|
|
206
|
+
"charset": "utf-8",
|
|
207
|
+
"language": "en"
|
|
208
|
+
},
|
|
209
|
+
"headers": [
|
|
210
|
+
{ "level": 1, "text": "Title", "id": "title" }
|
|
211
|
+
],
|
|
212
|
+
"links": [
|
|
213
|
+
{
|
|
214
|
+
"text": "Example",
|
|
215
|
+
"href": "https://example.com",
|
|
216
|
+
"title": null,
|
|
217
|
+
"rel": ["external"]
|
|
218
|
+
}
|
|
219
|
+
],
|
|
220
|
+
"images": [
|
|
221
|
+
{
|
|
222
|
+
"src": "https://example.com/image.jpg",
|
|
223
|
+
"alt": "Hero image",
|
|
224
|
+
"title": null,
|
|
225
|
+
"dimensions": [640, 480]
|
|
226
|
+
}
|
|
227
|
+
]
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
#### Python Example
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
from html_to_markdown import convert_with_metadata
|
|
236
|
+
|
|
237
|
+
html = '''
|
|
238
|
+
<html>
|
|
239
|
+
<head>
|
|
240
|
+
<title>Product Guide</title>
|
|
241
|
+
<meta name="description" content="Complete product documentation">
|
|
242
|
+
</head>
|
|
243
|
+
<body>
|
|
244
|
+
<h1>Getting Started</h1>
|
|
245
|
+
<p>Visit our <a href="https://example.com">website</a> for more.</p>
|
|
246
|
+
<img src="https://example.com/guide.jpg" alt="Setup diagram" width="800" height="600">
|
|
247
|
+
</body>
|
|
248
|
+
</html>
|
|
249
|
+
'''
|
|
250
|
+
|
|
251
|
+
markdown, metadata = convert_with_metadata(
|
|
252
|
+
html,
|
|
253
|
+
options={'heading_style': 'Atx'},
|
|
254
|
+
metadata_config={
|
|
255
|
+
'extract_document': True,
|
|
256
|
+
'extract_headers': True,
|
|
257
|
+
'extract_links': True,
|
|
258
|
+
'extract_images': True,
|
|
259
|
+
}
|
|
260
|
+
)
|
|
261
|
+
|
|
262
|
+
print(markdown)
|
|
263
|
+
print(f"Title: {metadata['document']['title']}")
|
|
264
|
+
print(f"Links found: {len(metadata['links'])}")
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
#### TypeScript/Node.js Example
|
|
268
|
+
|
|
269
|
+
```typescript
|
|
270
|
+
import { convertWithMetadata } from 'html-to-markdown-node';
|
|
271
|
+
|
|
272
|
+
const html = `
|
|
273
|
+
<html>
|
|
274
|
+
<head>
|
|
275
|
+
<title>Article</title>
|
|
276
|
+
<meta name="description" content="Tech article">
|
|
277
|
+
</head>
|
|
278
|
+
<body>
|
|
279
|
+
<h1>Web Performance</h1>
|
|
280
|
+
<p>Read our <a href="/blog">blog</a> for tips.</p>
|
|
281
|
+
<img src="/perf.png" alt="Chart" width="1200" height="630">
|
|
282
|
+
</body>
|
|
283
|
+
</html>
|
|
284
|
+
`;
|
|
285
|
+
|
|
286
|
+
const { markdown, metadata } = await convertWithMetadata(html, {
|
|
287
|
+
headingStyle: 'Atx',
|
|
288
|
+
}, {
|
|
289
|
+
extract_document: true,
|
|
290
|
+
extract_headers: true,
|
|
291
|
+
extract_links: true,
|
|
292
|
+
extract_images: true,
|
|
293
|
+
});
|
|
294
|
+
|
|
295
|
+
console.log(markdown);
|
|
296
|
+
console.log(`Found ${metadata.headers.length} headers`);
|
|
297
|
+
console.log(`Found ${metadata.links.length} links`);
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
#### Ruby Example
|
|
301
|
+
|
|
302
|
+
```ruby
|
|
303
|
+
require 'html_to_markdown'
|
|
304
|
+
|
|
305
|
+
html = <<~HTML
|
|
306
|
+
<html>
|
|
307
|
+
<head>
|
|
308
|
+
<title>Documentation</title>
|
|
309
|
+
<meta name="description" content="API Reference">
|
|
310
|
+
</head>
|
|
311
|
+
<body>
|
|
312
|
+
<h2>Installation</h2>
|
|
313
|
+
<p>See our <a href="https://github.com">GitHub</a>.</p>
|
|
314
|
+
<img src="https://example.com/diagram.svg" alt="Architecture" width="960" height="540">
|
|
315
|
+
</body>
|
|
316
|
+
</html>
|
|
317
|
+
HTML
|
|
318
|
+
|
|
319
|
+
markdown, metadata = HtmlToMarkdown.convert_with_metadata(
|
|
320
|
+
html,
|
|
321
|
+
{ heading_style: :atx },
|
|
322
|
+
{
|
|
323
|
+
extract_document: true,
|
|
324
|
+
extract_headers: true,
|
|
325
|
+
extract_links: true,
|
|
326
|
+
extract_images: true,
|
|
327
|
+
}
|
|
328
|
+
)
|
|
329
|
+
|
|
330
|
+
puts markdown
|
|
331
|
+
puts "Title: #{metadata[:document][:title]}"
|
|
332
|
+
puts "Images: #{metadata[:images].length}"
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
#### PHP Example
|
|
336
|
+
|
|
337
|
+
```php
|
|
338
|
+
<?php
|
|
339
|
+
use HtmlToMarkdown\HtmlToMarkdown;
|
|
340
|
+
|
|
341
|
+
$html = <<<HTML
|
|
342
|
+
<html>
|
|
343
|
+
<head>
|
|
344
|
+
<title>Tutorial</title>
|
|
345
|
+
<meta name="description" content="Step-by-step guide">
|
|
346
|
+
</head>
|
|
347
|
+
<body>
|
|
348
|
+
<h1>Getting Started</h1>
|
|
349
|
+
<p>Check our <a href="https://example.com/guide">guide</a>.</p>
|
|
350
|
+
<img src="https://example.com/steps.png" alt="Steps" width="1024" height="768">
|
|
351
|
+
</body>
|
|
352
|
+
</html>
|
|
353
|
+
HTML;
|
|
354
|
+
|
|
355
|
+
[$markdown, $metadata] = convert_with_metadata(
|
|
356
|
+
$html,
|
|
357
|
+
options: ['heading_style' => 'Atx'],
|
|
358
|
+
metadataConfig: [
|
|
359
|
+
'extract_document' => true,
|
|
360
|
+
'extract_headers' => true,
|
|
361
|
+
'extract_links' => true,
|
|
362
|
+
'extract_images' => true,
|
|
363
|
+
]
|
|
364
|
+
);
|
|
365
|
+
|
|
366
|
+
echo "Title: " . $metadata['document']['title'] . "\n";
|
|
367
|
+
echo "Found " . count($metadata['links']) . " links\n";
|
|
368
|
+
```
|
|
369
|
+
|
|
370
|
+
#### Go Example
|
|
371
|
+
|
|
372
|
+
```go
|
|
373
|
+
package main
|
|
374
|
+
|
|
375
|
+
import (
|
|
376
|
+
"encoding/json"
|
|
377
|
+
"fmt"
|
|
378
|
+
"log"
|
|
379
|
+
|
|
380
|
+
"github.com/Goldziher/html-to-markdown/packages/go/htmltomarkdown"
|
|
381
|
+
)
|
|
382
|
+
|
|
383
|
+
func main() {
|
|
384
|
+
html := `
|
|
385
|
+
<html>
|
|
386
|
+
<head>
|
|
387
|
+
<title>Developer Guide</title>
|
|
388
|
+
<meta name="description" content="Complete API reference">
|
|
389
|
+
</head>
|
|
390
|
+
<body>
|
|
391
|
+
<h1>API Overview</h1>
|
|
392
|
+
<p>Learn more at our <a href="https://api.example.com/docs">API docs</a>.</p>
|
|
393
|
+
<img src="https://example.com/api-flow.png" alt="API Flow" width="1280" height="720">
|
|
394
|
+
</body>
|
|
395
|
+
</html>
|
|
396
|
+
`
|
|
397
|
+
|
|
398
|
+
markdown, metadata, err := htmltomarkdown.ConvertWithMetadata(html, &htmltomarkdown.MetadataConfig{
|
|
399
|
+
ExtractDocument: true,
|
|
400
|
+
ExtractHeaders: true,
|
|
401
|
+
ExtractLinks: true,
|
|
402
|
+
ExtractImages: true,
|
|
403
|
+
ExtractStructuredData: false,
|
|
404
|
+
})
|
|
405
|
+
if err != nil {
|
|
406
|
+
log.Fatal(err)
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
fmt.Println("Markdown:", markdown)
|
|
410
|
+
fmt.Printf("Title: %s\n", metadata.Document.Title)
|
|
411
|
+
fmt.Printf("Found %d links\n", len(metadata.Links))
|
|
412
|
+
|
|
413
|
+
// Marshal to JSON if needed
|
|
414
|
+
jsonBytes, _ := json.MarshalIndent(metadata, "", " ")
|
|
415
|
+
fmt.Println(string(jsonBytes))
|
|
416
|
+
}
|
|
417
|
+
```
|
|
418
|
+
|
|
419
|
+
#### Java Example
|
|
420
|
+
|
|
421
|
+
```java
|
|
422
|
+
import io.github.goldziher.htmltomarkdown.HtmlToMarkdown;
|
|
423
|
+
import io.github.goldziher.htmltomarkdown.ConversionResult;
|
|
424
|
+
import com.google.gson.Gson;
|
|
425
|
+
import com.google.gson.GsonBuilder;
|
|
426
|
+
|
|
427
|
+
public class MetadataExample {
|
|
428
|
+
public static void main(String[] args) {
|
|
429
|
+
String html = """
|
|
430
|
+
<html>
|
|
431
|
+
<head>
|
|
432
|
+
<title>Java Guide</title>
|
|
433
|
+
<meta name="description" content="Complete Java bindings documentation">
|
|
434
|
+
</head>
|
|
435
|
+
<body>
|
|
436
|
+
<h1>Quick Start</h1>
|
|
437
|
+
<p>Visit our <a href="https://github.com/Goldziher/html-to-markdown">GitHub</a>.</p>
|
|
438
|
+
<img src="https://example.com/java-flow.png" alt="Flow diagram" width="1024" height="576">
|
|
439
|
+
</body>
|
|
440
|
+
</html>
|
|
441
|
+
""";
|
|
442
|
+
|
|
443
|
+
try {
|
|
444
|
+
ConversionResult result = HtmlToMarkdown.convertWithMetadata(
|
|
445
|
+
html,
|
|
446
|
+
new HtmlToMarkdown.MetadataOptions()
|
|
447
|
+
.extractDocument(true)
|
|
448
|
+
.extractHeaders(true)
|
|
449
|
+
.extractLinks(true)
|
|
450
|
+
.extractImages(true)
|
|
451
|
+
);
|
|
452
|
+
|
|
453
|
+
System.out.println("Markdown:\n" + result.getMarkdown());
|
|
454
|
+
System.out.println("Title: " + result.getMetadata().getDocument().getTitle());
|
|
455
|
+
System.out.println("Links found: " + result.getMetadata().getLinks().size());
|
|
456
|
+
|
|
457
|
+
// Pretty-print metadata as JSON
|
|
458
|
+
Gson gson = new GsonBuilder().setPrettyPrinting().create();
|
|
459
|
+
System.out.println(gson.toJson(result.getMetadata()));
|
|
460
|
+
} catch (HtmlToMarkdown.ConversionException e) {
|
|
461
|
+
System.err.println("Conversion failed: " + e.getMessage());
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
```
|
|
466
|
+
|
|
467
|
+
#### C# Example
|
|
468
|
+
|
|
469
|
+
```csharp
|
|
470
|
+
using HtmlToMarkdown;
|
|
471
|
+
using System.Text.Json;
|
|
472
|
+
|
|
473
|
+
var html = @"
|
|
474
|
+
<html>
|
|
475
|
+
<head>
|
|
476
|
+
<title>C# Guide</title>
|
|
477
|
+
<meta name=""description"" content=""Official C# bindings documentation"">
|
|
478
|
+
</head>
|
|
479
|
+
<body>
|
|
480
|
+
<h1>Introduction</h1>
|
|
481
|
+
<p>See our <a href=""https://github.com/Goldziher/html-to-markdown"">repository</a>.</p>
|
|
482
|
+
<img src=""https://example.com/csharp-arch.png"" alt=""Architecture"" width=""1200"" height=""675"">
|
|
483
|
+
</body>
|
|
484
|
+
</html>
|
|
485
|
+
";
|
|
486
|
+
|
|
487
|
+
try
|
|
488
|
+
{
|
|
489
|
+
var result = HtmlToMarkdownConverter.ConvertWithMetadata(
|
|
490
|
+
html,
|
|
491
|
+
new MetadataConfig
|
|
492
|
+
{
|
|
493
|
+
ExtractDocument = true,
|
|
494
|
+
ExtractHeaders = true,
|
|
495
|
+
ExtractLinks = true,
|
|
496
|
+
ExtractImages = true,
|
|
497
|
+
}
|
|
498
|
+
);
|
|
499
|
+
|
|
500
|
+
Console.WriteLine("Markdown:");
|
|
501
|
+
Console.WriteLine(result.Markdown);
|
|
502
|
+
|
|
503
|
+
Console.WriteLine($"Title: {result.Metadata.Document.Title}");
|
|
504
|
+
Console.WriteLine($"Links found: {result.Metadata.Links.Count}");
|
|
505
|
+
|
|
506
|
+
// Serialize metadata to JSON
|
|
507
|
+
var options = new JsonSerializerOptions { WriteIndented = true };
|
|
508
|
+
var json = JsonSerializer.Serialize(result.Metadata, options);
|
|
509
|
+
Console.WriteLine(json);
|
|
510
|
+
}
|
|
511
|
+
catch (HtmlToMarkdownException ex)
|
|
512
|
+
{
|
|
513
|
+
Console.Error.WriteLine($"Conversion failed: {ex.Message}");
|
|
514
|
+
}
|
|
515
|
+
```
|
|
516
|
+
|
|
517
|
+
See the individual binding READMEs for detailed metadata extraction options:
|
|
518
|
+
|
|
519
|
+
- **Python** – [Python README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/python/README.md)
|
|
520
|
+
- **TypeScript/Node.js** – [Node.js README](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown-node/README.md) | [TypeScript README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/typescript/README.md)
|
|
521
|
+
- **Ruby** – [Ruby README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/ruby/README.md)
|
|
522
|
+
- **PHP** – [PHP README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/php/README.md)
|
|
523
|
+
- **Go** – [Go README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/go/README.md)
|
|
524
|
+
- **Java** – [Java README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/java/README.md)
|
|
525
|
+
- **C#/.NET** – [C# README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/csharp/README.md)
|
|
526
|
+
- **WebAssembly** – [WASM README](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown-wasm/README.md)
|
|
527
|
+
- **Rust** – [Rust README](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown/README.md)
|
|
528
|
+
|
|
122
529
|
### Python (v2 API)
|
|
123
530
|
|
|
124
531
|
```python
|
|
Binary file
|