mdream 0.17.1 → 1.0.0-beta.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +116 -226
- package/bin/mdream.mjs +25 -1
- package/dist/browser.d.mts +13 -0
- package/dist/browser.mjs +30 -0
- package/dist/edge.d.mts +12 -0
- package/dist/edge.mjs +21 -0
- package/dist/iife.js +371 -13
- package/dist/index.d.mts +73 -174
- package/dist/index.mjs +114 -5
- package/dist/worker.d.mts +91 -0
- package/dist/worker.mjs +109 -0
- package/napi/index.d.mts +105 -0
- package/napi/index.d.ts +105 -0
- package/napi/index.mjs +587 -0
- package/package.json +33 -52
- package/wasm/mdream_edge.d.ts +53 -0
- package/wasm/mdream_edge.js +416 -0
- package/wasm/mdream_edge_bg.wasm +0 -0
- package/wasm/mdream_edge_bg.wasm.d.ts +14 -0
- package/wasm/package.json +17 -0
- package/dist/_chunks/const.mjs +0 -126
- package/dist/_chunks/extraction.mjs +0 -142
- package/dist/_chunks/markdown-processor.mjs +0 -1387
- package/dist/_chunks/plugin.d.mts +0 -11
- package/dist/_chunks/plugin.mjs +0 -11
- package/dist/_chunks/plugins.mjs +0 -513
- package/dist/_chunks/src.mjs +0 -48
- package/dist/_chunks/types.d.mts +0 -304
- package/dist/cli.d.mts +0 -1
- package/dist/cli.mjs +0 -67
- package/dist/llms-txt.d.mts +0 -84
- package/dist/llms-txt.mjs +0 -469
- package/dist/negotiate.d.mts +0 -26
- package/dist/negotiate.mjs +0 -92
- package/dist/plugins.d.mts +0 -77
- package/dist/plugins.mjs +0 -5
- package/dist/preset/minimal.d.mts +0 -12
- package/dist/preset/minimal.mjs +0 -40
- package/dist/splitter.d.mts +0 -15
- package/dist/splitter.mjs +0 -211
package/README.md
CHANGED
|
@@ -20,12 +20,12 @@
|
|
|
20
20
|
|
|
21
21
|
## Features
|
|
22
22
|
|
|
23
|
-
- 🧠 Optimized HTML To Markdown Conversion (~50% fewer tokens with
|
|
23
|
+
- 🧠 Optimized HTML To Markdown Conversion (~50% fewer tokens with Minimal preset)
|
|
24
24
|
- 🔍 Generates GitHub Flavored Markdown: Frontmatter, Nested & HTML markup support.
|
|
25
|
-
- 🚀 Fast:
|
|
26
|
-
- ⚡ Tiny:
|
|
25
|
+
- 🚀 Fast: Convert 1.8MB of HTML to markdown in ~8ms (Rust), ~62ms (JS). Up to 7.9x speedup.
|
|
26
|
+
- ⚡ Tiny: 10kB gzip JS core, 45kB gzip with Rust WASM engine. Zero dependencies.
|
|
27
27
|
- ⚙️ Run anywhere: CLI, edge workers, browsers, Node, etc.
|
|
28
|
-
- 🔌 Extensible:
|
|
28
|
+
- 🔌 Extensible: Declarative plugin config for both engines, hook-based plugins via `@mdream/js`.
|
|
29
29
|
|
|
30
30
|
## What is Mdream?
|
|
31
31
|
|
|
@@ -39,7 +39,7 @@ Mdream produces high-quality Markdown for LLMs efficiently with no core dependen
|
|
|
39
39
|
## Installation
|
|
40
40
|
|
|
41
41
|
```bash
|
|
42
|
-
pnpm add mdream
|
|
42
|
+
pnpm add mdream@beta
|
|
43
43
|
```
|
|
44
44
|
|
|
45
45
|
## CLI Usage
|
|
@@ -53,7 +53,7 @@ Fetches the [Markdown Wikipedia page](https://en.wikipedia.org/wiki/Markdown) an
|
|
|
53
53
|
|
|
54
54
|
```bash
|
|
55
55
|
curl -s https://en.wikipedia.org/wiki/Markdown \
|
|
56
|
-
| npx mdream --origin https://en.wikipedia.org --preset minimal \
|
|
56
|
+
| npx mdream@beta --origin https://en.wikipedia.org --preset minimal \
|
|
57
57
|
| tee streaming.md
|
|
58
58
|
```
|
|
59
59
|
|
|
@@ -65,7 +65,7 @@ Converts a local HTML file to a Markdown file, using `tee` to write the output t
|
|
|
65
65
|
|
|
66
66
|
```bash
|
|
67
67
|
cat index.html \
|
|
68
|
-
| npx mdream --preset minimal \
|
|
68
|
+
| npx mdream@beta --preset minimal \
|
|
69
69
|
| tee streaming.md
|
|
70
70
|
```
|
|
71
71
|
|
|
@@ -82,33 +82,54 @@ Mdream provides two main functions for working with HTML:
|
|
|
82
82
|
- `htmlToMarkdown`: Useful if you already have the entire HTML payload you want to convert.
|
|
83
83
|
- `streamHtmlToMarkdown`: Best practice if you are fetching or reading from a local file.
|
|
84
84
|
|
|
85
|
-
|
|
85
|
+
### Engines
|
|
86
86
|
|
|
87
|
-
|
|
87
|
+
Mdream includes two rendering engines, automatically selecting the best one for your environment:
|
|
88
|
+
- **Rust Engine** (default in Node.js): Native NAPI performance, 5.6-7.9x faster. WASM build for edge/browser runtimes.
|
|
89
|
+
- **JavaScript Engine** (`@mdream/js`): Zero-dependencies, supports custom hook-based plugins.
|
|
90
|
+
|
|
91
|
+
```ts
|
|
92
|
+
import { htmlToMarkdown } from 'mdream'
|
|
93
|
+
|
|
94
|
+
// Rust NAPI engine used automatically in Node.js
|
|
95
|
+
// JS engine used in browser/edge runtimes
|
|
96
|
+
const markdown = htmlToMarkdown('<h1>Hello World</h1>')
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Browser & Edge Usage
|
|
100
|
+
|
|
101
|
+
For browser environments and edge runtimes (Cloudflare Workers, Vercel Edge), mdream compiles to WebAssembly. Export conditions (`workerd`, `edge-light`, `browser`) select the correct build automatically, or use `mdream/worker` directly:
|
|
102
|
+
|
|
103
|
+
```ts
|
|
104
|
+
import { htmlToMarkdown } from 'mdream/worker'
|
|
105
|
+
|
|
106
|
+
const markdown = await htmlToMarkdown('<h1>Hello World</h1>')
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Browser CDN Usage
|
|
110
|
+
|
|
111
|
+
Use mdream directly via CDN with no build step. The IIFE bundle uses the Rust WASM engine. Call `init()` once to load the WASM binary, then use `htmlToMarkdown()` synchronously.
|
|
88
112
|
|
|
89
113
|
```html
|
|
90
|
-
|
|
91
|
-
<
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
</
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
</html>
|
|
114
|
+
<script src="https://unpkg.com/mdream/dist/iife.js"></script>
|
|
115
|
+
<script>
|
|
116
|
+
// init() fetches the .wasm file from the same CDN path automatically
|
|
117
|
+
await window.mdream.init()
|
|
118
|
+
const markdown = window.mdream.htmlToMarkdown('<h1>Hello</h1><p>World</p>')
|
|
119
|
+
console.log(markdown) // # Hello\n\nWorld
|
|
120
|
+
</script>
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
You can also pass a custom WASM URL or `ArrayBuffer` to `init()`:
|
|
124
|
+
|
|
125
|
+
```js
|
|
126
|
+
await window.mdream.init('https://cdn.example.com/mdream_edge_bg.wasm')
|
|
104
127
|
```
|
|
105
128
|
|
|
106
129
|
**CDN Options:**
|
|
107
130
|
- **unpkg**: `https://unpkg.com/mdream/dist/iife.js`
|
|
108
131
|
- **jsDelivr**: `https://cdn.jsdelivr.net/npm/mdream/dist/iife.js`
|
|
109
132
|
|
|
110
|
-
The browser build includes the core `htmlToMarkdown` function and is optimized for size (44kB uncompressed, 10.3kB gzipped).
|
|
111
|
-
|
|
112
133
|
**Convert existing HTML**
|
|
113
134
|
|
|
114
135
|
```ts
|
|
@@ -137,12 +158,12 @@ for await (const chunk of markdownGenerator) {
|
|
|
137
158
|
}
|
|
138
159
|
```
|
|
139
160
|
|
|
140
|
-
**Pure HTML Parser**
|
|
161
|
+
**Pure HTML Parser (JS Engine)**
|
|
141
162
|
|
|
142
|
-
If you only need to parse HTML into a DOM-like AST without converting to Markdown, use `parseHtml
|
|
163
|
+
If you only need to parse HTML into a DOM-like AST without converting to Markdown, use `parseHtml` from the JS engine:
|
|
143
164
|
|
|
144
165
|
```ts
|
|
145
|
-
import { parseHtml } from 'mdream'
|
|
166
|
+
import { parseHtml } from '@mdream/js'
|
|
146
167
|
|
|
147
168
|
const html = '<div><h1>Title</h1><p>Content</p></div>'
|
|
148
169
|
const { events, remainingHtml } = parseHtml(html)
|
|
@@ -163,57 +184,65 @@ The `parseHtml` function provides:
|
|
|
163
184
|
|
|
164
185
|
## Presets
|
|
165
186
|
|
|
166
|
-
Presets are pre-configured combinations of plugins for common use cases.
|
|
167
|
-
|
|
168
187
|
### Minimal Preset
|
|
169
188
|
|
|
170
189
|
The `minimal` preset optimizes for token reduction and cleaner output by removing non-essential content:
|
|
171
190
|
|
|
172
191
|
```ts
|
|
173
|
-
import {
|
|
192
|
+
import { htmlToMarkdown } from 'mdream'
|
|
174
193
|
|
|
175
|
-
const
|
|
176
|
-
origin: 'https://example.com'
|
|
194
|
+
const markdown = htmlToMarkdown(html, {
|
|
195
|
+
origin: 'https://example.com',
|
|
196
|
+
minimal: true,
|
|
177
197
|
})
|
|
178
198
|
```
|
|
179
199
|
|
|
180
|
-
**
|
|
181
|
-
- `
|
|
182
|
-
- `
|
|
183
|
-
- `
|
|
184
|
-
- `
|
|
200
|
+
**Enables:**
|
|
201
|
+
- `isolateMain` - Extracts main content area
|
|
202
|
+
- `frontmatter` - Generates YAML frontmatter from meta tags
|
|
203
|
+
- `tailwind` - Converts Tailwind classes to Markdown
|
|
204
|
+
- `filter` - Excludes forms, navigation, buttons, footers, and other non-content elements
|
|
185
205
|
|
|
186
206
|
**CLI Usage:**
|
|
187
207
|
```bash
|
|
188
|
-
curl -s https://example.com | npx mdream --preset minimal --origin https://example.com
|
|
208
|
+
curl -s https://example.com | npx mdream@beta --preset minimal --origin https://example.com
|
|
189
209
|
```
|
|
190
210
|
|
|
191
|
-
##
|
|
192
|
-
|
|
193
|
-
The plugin system allows you to customize HTML to Markdown conversion by hooking into the processing pipeline. Plugins can filter content, extract data, transform nodes, or add custom behavior.
|
|
211
|
+
## Declarative Options
|
|
194
212
|
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
Mdream includes several built-in plugins that can be used individually or combined:
|
|
198
|
-
|
|
199
|
-
- **[`extractionPlugin`](./src/plugins/extraction.ts)**: Extract specific elements using CSS selectors for data analysis
|
|
200
|
-
- **[`filterPlugin`](./src/plugins/filter.ts)**: Include or exclude elements based on CSS selectors or tag IDs
|
|
201
|
-
- **[`frontmatterPlugin`](./src/plugins/frontmatter.ts)**: Generate YAML frontmatter from HTML head elements (title, meta tags)
|
|
202
|
-
- **[`isolateMainPlugin`](./src/plugins/isolate-main.ts)**: Isolate main content using `<main>` elements or header-to-footer boundaries
|
|
203
|
-
- **[`tailwindPlugin`](./src/plugins/tailwind.ts)**: Convert Tailwind CSS classes to Markdown formatting (bold, italic, etc.)
|
|
213
|
+
Both engines accept the same declarative configuration:
|
|
204
214
|
|
|
205
215
|
```ts
|
|
206
|
-
import {
|
|
216
|
+
import { htmlToMarkdown } from 'mdream'
|
|
207
217
|
|
|
208
218
|
const markdown = htmlToMarkdown(html, {
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
]
|
|
219
|
+
origin: 'https://example.com',
|
|
220
|
+
minimal: true, // enables frontmatter, isolateMain, tailwind, filter
|
|
221
|
+
clean: true, // enable all post-processing cleanup
|
|
222
|
+
frontmatter: fm => console.log(fm), // callback for extracted frontmatter
|
|
223
|
+
filter: { exclude: ['nav', '.sidebar'] },
|
|
224
|
+
extraction: {
|
|
225
|
+
'h2': el => console.log('Heading:', el.textContent),
|
|
226
|
+
'img[alt]': el => console.log('Image:', el.attributes.src),
|
|
227
|
+
},
|
|
228
|
+
tagOverrides: { 'custom-tag': { alias: 'div' } },
|
|
214
229
|
})
|
|
215
230
|
```
|
|
216
231
|
|
|
232
|
+
### Available Options
|
|
233
|
+
|
|
234
|
+
| Option | Type | Description |
|
|
235
|
+
|--------|------|-------------|
|
|
236
|
+
| `origin` | `string` | Base URL for resolving relative links/images |
|
|
237
|
+
| `minimal` | `boolean` | Enable minimal preset (frontmatter, isolateMain, tailwind, filter) |
|
|
238
|
+
| `clean` | `boolean \| CleanOptions` | Post-processing cleanup (`true` for all, or pick specific) |
|
|
239
|
+
| `frontmatter` | `boolean \| (fm) => void \| FrontmatterConfig` | Extract frontmatter from HTML head |
|
|
240
|
+
| `isolateMain` | `boolean` | Isolate main content area |
|
|
241
|
+
| `tailwind` | `boolean` | Convert Tailwind classes to Markdown |
|
|
242
|
+
| `filter` | `{ include?, exclude?, processChildren? }` | Filter elements by CSS selectors |
|
|
243
|
+
| `extraction` | `Record<string, (el) => void>` | Extract elements matching CSS selectors |
|
|
244
|
+
| `tagOverrides` | `Record<string, TagOverride \| string>` | Override tag rendering behavior |
|
|
245
|
+
|
|
217
246
|
### Content Extraction with Readability
|
|
218
247
|
|
|
219
248
|
For advanced content extraction (article detection, boilerplate removal), use [@mozilla/readability](https://github.com/mozilla/readability) before mdream:
|
|
@@ -234,101 +263,36 @@ if (article) {
|
|
|
234
263
|
|
|
235
264
|
This pipeline gives you battle-tested content extraction + fast markdown conversion.
|
|
236
265
|
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
- `beforeNodeProcess`: Called before any node processing, can skip nodes
|
|
240
|
-
- `onNodeEnter`: Called when entering an element node
|
|
241
|
-
- `onNodeExit`: Called when exiting an element node
|
|
242
|
-
- `processTextNode`: Called for each text node
|
|
243
|
-
- `processAttributes`: Called to process element attributes
|
|
244
|
-
|
|
245
|
-
### Creating a Plugin
|
|
266
|
+
## Hook-Based Plugins (JS Engine)
|
|
246
267
|
|
|
247
|
-
|
|
268
|
+
For custom hook-based plugins, use `@mdream/js`:
|
|
248
269
|
|
|
249
270
|
```ts
|
|
250
|
-
import
|
|
251
|
-
import {
|
|
252
|
-
import { createPlugin } from 'mdream/plugins'
|
|
271
|
+
import { htmlToMarkdown } from '@mdream/js'
|
|
272
|
+
import { createPlugin } from '@mdream/js/plugins'
|
|
253
273
|
|
|
254
274
|
const myPlugin = createPlugin({
|
|
255
|
-
onNodeEnter(node
|
|
256
|
-
if (node.name === 'h1')
|
|
257
|
-
return '
|
|
258
|
-
}
|
|
275
|
+
onNodeEnter(node) {
|
|
276
|
+
if (node.name === 'h1')
|
|
277
|
+
return '** '
|
|
259
278
|
},
|
|
260
|
-
|
|
261
|
-
processTextNode(textNode: TextNode) {
|
|
262
|
-
// Transform text content
|
|
279
|
+
processTextNode(textNode) {
|
|
263
280
|
if (textNode.parent?.attributes?.id === 'highlight') {
|
|
264
|
-
return {
|
|
265
|
-
content: `**${textNode.value}**`,
|
|
266
|
-
skip: false
|
|
267
|
-
}
|
|
281
|
+
return { content: `**${textNode.value}**`, skip: false }
|
|
268
282
|
}
|
|
269
283
|
}
|
|
270
284
|
})
|
|
271
285
|
|
|
272
|
-
|
|
273
|
-
const html: string = '<div id="highlight">Important text</div>'
|
|
274
|
-
const markdown: string = htmlToMarkdown(html, { plugins: [myPlugin] })
|
|
286
|
+
const markdown = htmlToMarkdown(html, { hooks: [myPlugin] })
|
|
275
287
|
```
|
|
276
288
|
|
|
277
|
-
###
|
|
278
|
-
|
|
279
|
-
```ts
|
|
280
|
-
import type { ElementNode, NodeEvent } from 'mdream'
|
|
281
|
-
import { ELEMENT_NODE } from 'mdream'
|
|
282
|
-
import { createPlugin } from 'mdream/plugins'
|
|
283
|
-
|
|
284
|
-
const adBlockPlugin = createPlugin({
|
|
285
|
-
beforeNodeProcess(event: NodeEvent) {
|
|
286
|
-
const { node } = event
|
|
287
|
-
|
|
288
|
-
if (node.type === ELEMENT_NODE && node.name === 'div') {
|
|
289
|
-
const element = node as ElementNode
|
|
290
|
-
// Skip ads and promotional content
|
|
291
|
-
if (element.attributes?.class?.includes('ad')
|
|
292
|
-
|| element.attributes?.id?.includes('promo')) {
|
|
293
|
-
return { skip: true }
|
|
294
|
-
}
|
|
295
|
-
}
|
|
296
|
-
}
|
|
297
|
-
})
|
|
298
|
-
```
|
|
299
|
-
|
|
300
|
-
### Extraction Plugin
|
|
301
|
-
|
|
302
|
-
Extract specific elements and their content during HTML processing for data analysis or content discovery:
|
|
303
|
-
|
|
304
|
-
```ts
|
|
305
|
-
import { extractionPlugin, htmlToMarkdown } from 'mdream'
|
|
306
|
-
|
|
307
|
-
const html: string = `
|
|
308
|
-
<article>
|
|
309
|
-
<h2>Getting Started</h2>
|
|
310
|
-
<p>This is a tutorial about web scraping.</p>
|
|
311
|
-
<img src="/hero.jpg" alt="Hero image" />
|
|
312
|
-
</article>
|
|
313
|
-
`
|
|
314
|
-
|
|
315
|
-
// Extract elements using CSS selectors
|
|
316
|
-
const plugin = extractionPlugin({
|
|
317
|
-
'h2': (element: ExtractedElement, state: MdreamRuntimeState) => {
|
|
318
|
-
console.log('Heading:', element.textContent) // "Getting Started"
|
|
319
|
-
console.log('Depth:', state.depth) // Current nesting depth
|
|
320
|
-
},
|
|
321
|
-
'img[alt]': (element: ExtractedElement, state: MdreamRuntimeState) => {
|
|
322
|
-
console.log('Image:', element.attributes.src, element.attributes.alt)
|
|
323
|
-
// "Image: /hero.jpg Hero image"
|
|
324
|
-
console.log('Context:', state.options) // Access to conversion options
|
|
325
|
-
}
|
|
326
|
-
})
|
|
327
|
-
|
|
328
|
-
htmlToMarkdown(html, { plugins: [plugin] })
|
|
329
|
-
```
|
|
289
|
+
### Plugin Hooks
|
|
330
290
|
|
|
331
|
-
|
|
291
|
+
- `beforeNodeProcess`: Called before any node processing, can skip nodes
|
|
292
|
+
- `onNodeEnter`: Called when entering an element node
|
|
293
|
+
- `onNodeExit`: Called when exiting an element node
|
|
294
|
+
- `processTextNode`: Called for each text node
|
|
295
|
+
- `processAttributes`: Called to process element attributes
|
|
332
296
|
|
|
333
297
|
## Markdown Splitting
|
|
334
298
|
|
|
@@ -337,8 +301,8 @@ Split HTML into chunks during conversion for LLM context windows, vector databas
|
|
|
337
301
|
### Basic Chunking
|
|
338
302
|
|
|
339
303
|
```ts
|
|
340
|
-
import { TAG_H2 } from 'mdream'
|
|
341
|
-
import { htmlToMarkdownSplitChunks } from 'mdream/splitter'
|
|
304
|
+
import { TAG_H2 } from '@mdream/js'
|
|
305
|
+
import { htmlToMarkdownSplitChunks } from '@mdream/js/splitter'
|
|
342
306
|
|
|
343
307
|
const html = `
|
|
344
308
|
<h1>Documentation</h1>
|
|
@@ -369,7 +333,7 @@ chunks.forEach((chunk) => {
|
|
|
369
333
|
For large documents, use the generator version to process chunks one at a time:
|
|
370
334
|
|
|
371
335
|
```ts
|
|
372
|
-
import { htmlToMarkdownSplitChunksStream } from 'mdream/splitter'
|
|
336
|
+
import { htmlToMarkdownSplitChunksStream } from '@mdream/js/splitter'
|
|
373
337
|
|
|
374
338
|
// Process chunks incrementally - lower memory usage
|
|
375
339
|
for (const chunk of htmlToMarkdownSplitChunksStream(html, options)) {
|
|
@@ -404,7 +368,7 @@ interface SplitterOptions {
|
|
|
404
368
|
|
|
405
369
|
// Standard options
|
|
406
370
|
origin?: string // Base URL for links/images
|
|
407
|
-
|
|
371
|
+
hooks?: TransformPlugin[] // Apply hook-based plugins during conversion (@mdream/js only)
|
|
408
372
|
}
|
|
409
373
|
```
|
|
410
374
|
|
|
@@ -430,9 +394,9 @@ interface MarkdownChunk {
|
|
|
430
394
|
Combine splitting with presets for optimized output:
|
|
431
395
|
|
|
432
396
|
```ts
|
|
433
|
-
import { TAG_H2 } from 'mdream'
|
|
434
|
-
import { withMinimalPreset } from 'mdream/preset/minimal'
|
|
435
|
-
import { htmlToMarkdownSplitChunks } from 'mdream/splitter'
|
|
397
|
+
import { TAG_H2 } from '@mdream/js'
|
|
398
|
+
import { withMinimalPreset } from '@mdream/js/preset/minimal'
|
|
399
|
+
import { htmlToMarkdownSplitChunks } from '@mdream/js/splitter'
|
|
436
400
|
|
|
437
401
|
const chunks = htmlToMarkdownSplitChunks(html, withMinimalPreset({
|
|
438
402
|
headersToSplitOn: [TAG_H2],
|
|
@@ -443,98 +407,24 @@ const chunks = htmlToMarkdownSplitChunks(html, withMinimalPreset({
|
|
|
443
407
|
|
|
444
408
|
## llms.txt Generation
|
|
445
409
|
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
### createLlmsTxtStream
|
|
449
|
-
|
|
450
|
-
Stream llms.txt generation without keeping full content in memory:
|
|
410
|
+
For llms.txt artifact generation, use `@mdream/llms-txt`. It accepts pre-converted markdown and generates `llms.txt` and `llms-full.txt` artifacts.
|
|
451
411
|
|
|
452
412
|
```ts
|
|
453
|
-
import {
|
|
454
|
-
|
|
455
|
-
const stream = createLlmsTxtStream({
|
|
456
|
-
siteName: 'My Docs',
|
|
457
|
-
description: 'Documentation site',
|
|
458
|
-
origin: 'https://example.com',
|
|
459
|
-
outputDir: './dist',
|
|
460
|
-
generateFull: true, // Also generate llms-full.txt
|
|
461
|
-
sections: [
|
|
462
|
-
{
|
|
463
|
-
title: 'Getting Started',
|
|
464
|
-
description: 'Quick start guide',
|
|
465
|
-
links: [
|
|
466
|
-
{ title: 'Installation', href: '/install', description: 'How to install' },
|
|
467
|
-
{ title: 'Quick Start', href: '/quickstart' },
|
|
468
|
-
],
|
|
469
|
-
},
|
|
470
|
-
],
|
|
471
|
-
notes: ['Generated by mdream', 'Last updated: 2024'],
|
|
472
|
-
})
|
|
473
|
-
|
|
474
|
-
const writer = stream.getWriter()
|
|
475
|
-
await writer.write({
|
|
476
|
-
title: 'Home',
|
|
477
|
-
content: '# Welcome\n\nHome page content.',
|
|
478
|
-
url: '/',
|
|
479
|
-
metadata: {
|
|
480
|
-
description: 'Welcome page',
|
|
481
|
-
},
|
|
482
|
-
})
|
|
483
|
-
await writer.close()
|
|
484
|
-
```
|
|
485
|
-
|
|
486
|
-
This creates:
|
|
487
|
-
- `llms.txt` - Links to all pages with metadata
|
|
488
|
-
- `llms-full.txt` - Complete content with frontmatter (if `generateFull: true`)
|
|
489
|
-
|
|
490
|
-
### generateLlmsTxtArtifacts
|
|
491
|
-
|
|
492
|
-
Process HTML files or ProcessedFile objects:
|
|
493
|
-
|
|
494
|
-
```ts
|
|
495
|
-
import { generateLlmsTxtArtifacts } from 'mdream'
|
|
413
|
+
import { generateLlmsTxtArtifacts } from '@mdream/llms-txt'
|
|
414
|
+
import { htmlToMarkdown } from 'mdream'
|
|
496
415
|
|
|
497
416
|
const result = await generateLlmsTxtArtifacts({
|
|
498
|
-
|
|
417
|
+
files: [
|
|
418
|
+
{ title: 'Home', url: '/', content: htmlToMarkdown(homeHtml) },
|
|
419
|
+
{ title: 'About', url: '/about', content: htmlToMarkdown(aboutHtml) },
|
|
420
|
+
],
|
|
499
421
|
siteName: 'My Site',
|
|
500
422
|
origin: 'https://example.com',
|
|
501
423
|
generateFull: true,
|
|
502
|
-
sections: [
|
|
503
|
-
{
|
|
504
|
-
title: 'Resources',
|
|
505
|
-
links: [
|
|
506
|
-
{ title: 'Docs', href: '/docs' },
|
|
507
|
-
],
|
|
508
|
-
},
|
|
509
|
-
],
|
|
510
|
-
notes: 'Footer notes',
|
|
511
424
|
})
|
|
512
425
|
|
|
513
426
|
console.log(result.llmsTxt) // llms.txt content
|
|
514
427
|
console.log(result.llmsFullTxt) // llms-full.txt content
|
|
515
|
-
console.log(result.processedFiles) // Array of processed files
|
|
516
|
-
```
|
|
517
|
-
|
|
518
|
-
### Structure
|
|
519
|
-
|
|
520
|
-
llms.txt follows this structure:
|
|
521
|
-
|
|
522
|
-
```markdown
|
|
523
|
-
# Site Name
|
|
524
|
-
|
|
525
|
-
> Site description
|
|
526
|
-
|
|
527
|
-
## Custom Section
|
|
528
|
-
|
|
529
|
-
Section description
|
|
530
|
-
|
|
531
|
-
- [Link Title](url): Optional description
|
|
532
|
-
|
|
533
|
-
## Pages
|
|
534
|
-
|
|
535
|
-
- [Page Title](url): Page description
|
|
536
|
-
|
|
537
|
-
Custom notes
|
|
538
428
|
```
|
|
539
429
|
|
|
540
430
|
## Credits
|
package/bin/mdream.mjs
CHANGED
|
@@ -1,2 +1,26 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import
|
|
2
|
+
import { Readable } from 'node:stream'
|
|
3
|
+
import { streamHtmlToMarkdown } from 'mdream'
|
|
4
|
+
|
|
5
|
+
const args = process.argv.slice(2)
|
|
6
|
+
let origin
|
|
7
|
+
let preset
|
|
8
|
+
for (let i = 0; i < args.length; i++) {
|
|
9
|
+
if (args[i] === '--origin' && args[i + 1]) {
|
|
10
|
+
origin = args[++i]
|
|
11
|
+
}
|
|
12
|
+
else if (args[i] === '--preset' && args[i + 1]) {
|
|
13
|
+
preset = args[++i]
|
|
14
|
+
}
|
|
15
|
+
else if (args[i] === '-h' || args[i] === '--help') {
|
|
16
|
+
process.stdout.write('Usage: mdream [--origin <url>] [--preset minimal]\nPipe HTML via stdin, outputs Markdown to stdout.\n')
|
|
17
|
+
process.exit(0)
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
const options = { origin, minimal: preset === 'minimal' }
|
|
22
|
+
const stream = Readable.toWeb(process.stdin)
|
|
23
|
+
for await (const chunk of streamHtmlToMarkdown(stream, options)) {
|
|
24
|
+
if (chunk?.length)
|
|
25
|
+
process.stdout.write(chunk)
|
|
26
|
+
}
|
package/dist/browser.d.mts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { HtmlToMarkdownOptions, MdreamNapiResult } from "../napi/index.js";
|
|
2
|
+
|
|
3
|
+
//#region src/browser.d.ts
|
|
4
|
+
declare function htmlToMarkdown(html: string, options?: HtmlToMarkdownOptions): Promise<MdreamNapiResult>;
|
|
5
|
+
declare function createMarkdownStream(options?: HtmlToMarkdownOptions): Promise<MarkdownStream>;
|
|
6
|
+
declare class MarkdownStream {
|
|
7
|
+
private _inner;
|
|
8
|
+
constructor(options?: HtmlToMarkdownOptions);
|
|
9
|
+
processChunk(chunk: string): string;
|
|
10
|
+
finish(): string;
|
|
11
|
+
}
|
|
12
|
+
//#endregion
|
|
13
|
+
export { MarkdownStream, createMarkdownStream, htmlToMarkdown };
|
package/dist/browser.mjs
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import init, { MarkdownStream as MarkdownStream$1, htmlToMarkdownResult } from "../wasm/mdream_edge.js";
|
|
2
|
+
//#region src/browser.ts
|
|
3
|
+
let _initPromise;
|
|
4
|
+
function ensureInit() {
|
|
5
|
+
if (!_initPromise) _initPromise = init();
|
|
6
|
+
return _initPromise;
|
|
7
|
+
}
|
|
8
|
+
ensureInit();
|
|
9
|
+
async function htmlToMarkdown(html, options) {
|
|
10
|
+
await ensureInit();
|
|
11
|
+
return htmlToMarkdownResult(html, options || {});
|
|
12
|
+
}
|
|
13
|
+
async function createMarkdownStream(options) {
|
|
14
|
+
await ensureInit();
|
|
15
|
+
return new MarkdownStream(options);
|
|
16
|
+
}
|
|
17
|
+
var MarkdownStream = class {
|
|
18
|
+
_inner;
|
|
19
|
+
constructor(options) {
|
|
20
|
+
this._inner = new MarkdownStream$1(options || {});
|
|
21
|
+
}
|
|
22
|
+
processChunk(chunk) {
|
|
23
|
+
return this._inner.processChunk(chunk);
|
|
24
|
+
}
|
|
25
|
+
finish() {
|
|
26
|
+
return this._inner.finish();
|
|
27
|
+
}
|
|
28
|
+
};
|
|
29
|
+
//#endregion
|
|
30
|
+
export { MarkdownStream, createMarkdownStream, htmlToMarkdown };
|
package/dist/edge.d.mts
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { HtmlToMarkdownOptions } from "../napi/index.js";
|
|
2
|
+
|
|
3
|
+
//#region src/edge.d.ts
|
|
4
|
+
declare function htmlToMarkdown(html: string, options?: HtmlToMarkdownOptions): string;
|
|
5
|
+
declare class MarkdownStream {
|
|
6
|
+
private _inner;
|
|
7
|
+
constructor(options?: HtmlToMarkdownOptions);
|
|
8
|
+
processChunk(chunk: string): string;
|
|
9
|
+
finish(): string;
|
|
10
|
+
}
|
|
11
|
+
//#endregion
|
|
12
|
+
export { MarkdownStream, htmlToMarkdown };
|
package/dist/edge.mjs
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { MarkdownStream as MarkdownStream$1, htmlToMarkdownResult, initSync } from "../wasm/mdream_edge.js";
|
|
2
|
+
import wasmModule from "../wasm/mdream_edge_bg.wasm";
|
|
3
|
+
//#region src/edge.ts
|
|
4
|
+
initSync({ module: wasmModule });
|
|
5
|
+
function htmlToMarkdown(html, options) {
|
|
6
|
+
return htmlToMarkdownResult(html, options || {}).markdown || "";
|
|
7
|
+
}
|
|
8
|
+
var MarkdownStream = class {
|
|
9
|
+
_inner;
|
|
10
|
+
constructor(options) {
|
|
11
|
+
this._inner = new MarkdownStream$1(options || {});
|
|
12
|
+
}
|
|
13
|
+
processChunk(chunk) {
|
|
14
|
+
return this._inner.processChunk(chunk);
|
|
15
|
+
}
|
|
16
|
+
finish() {
|
|
17
|
+
return this._inner.finish();
|
|
18
|
+
}
|
|
19
|
+
};
|
|
20
|
+
//#endregion
|
|
21
|
+
export { MarkdownStream, htmlToMarkdown };
|