mdream 0.7.2 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +303 -0
- package/dist/_chunks/{src-Dbe3WLUq.mjs → src-CrlAO7kH.mjs} +14 -5
- package/dist/cli.mjs +1 -1
- package/dist/index.d.mts +1 -0
- package/dist/index.mjs +1 -1
- package/package.json +1 -1
package/README.md
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
# mdream
|
|
2
|
+
|
|
3
|
+
[![npm version][npm-version-src]](https://npmjs.com/package/mdream)
|
|
4
|
+
[![npm downloads][npm-downloads-src]](https://npm.chart.dev/mdream)
|
|
5
|
+
[![license][license-src]](https://github.com/harlan-zw/mdream/blob/main/LICENSE.md)
|
|
6
|
+
|
|
7
|
+
> Ultra-performant HTML to Markdown Converter Optimized for LLMs. Generate llms.txt artifacts using CLI, GitHub Actions, Vite Plugin and more.
|
|
8
|
+
|
|
9
|
+
<img src="../../.github/logo.png" alt="mdream logo" width="200">
|
|
10
|
+
|
|
11
|
+
<p align="center">
|
|
12
|
+
<table>
|
|
13
|
+
<tbody>
|
|
14
|
+
<td align="center">
|
|
15
|
+
<sub>Made possible by my <a href="https://github.com/sponsors/harlan-zw">Sponsor Program 💖</a><br> Follow me <a href="https://twitter.com/harlan_zw">@harlan_zw</a> 🐦 • Join <a href="https://discord.gg/275MBUBvgP">Discord</a> for help</sub><br>
|
|
16
|
+
</td>
|
|
17
|
+
</tbody>
|
|
18
|
+
</table>
|
|
19
|
+
</p>
|
|
20
|
+
|
|
21
|
+
## Features
|
|
22
|
+
|
|
23
|
+
- 🧠 Optimized HTML To Markdown Conversion (~50% fewer tokens with [Minimal preset](./src/preset/minimal.ts))
|
|
24
|
+
- 🔍 Generates GitHub Flavored Markdown: Frontmatter, Nested & HTML markup support.
|
|
25
|
+
- 🚀 Fast: Stream 1.4MB of HTML to markdown in ~50ms.
|
|
26
|
+
- ⚡ Tiny: 5kB gzip, zero dependency core.
|
|
27
|
+
- ⚙️ Run anywhere: CLI, edge workers, browsers, Node, etc.
|
|
28
|
+
- 🔌 Extensible: [Plugin system](#plugin-system) for customizing and extending functionality.
|
|
29
|
+
|
|
30
|
+
## What is Mdream?
|
|
31
|
+
|
|
32
|
+
Traditional HTML to Markdown converters were not built for LLMs or humans. They tend to be slow and bloated and produce output that's poorly suited for LLM token usage or for
|
|
33
|
+
human readability.
|
|
34
|
+
|
|
35
|
+
Other LLM-specific converters focus on supporting _all_ document formats, resulting in larger bundles and lower quality Markdown output.
|
|
36
|
+
|
|
37
|
+
Mdream produces high-quality Markdown for LLMs efficiently with no core dependencies. It includes a plugin system to customize the conversion process, allowing you to parse, extract, transform, and filter as needed.
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pnpm add mdream
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## CLI Usage
|
|
46
|
+
|
|
47
|
+
Mdream provides a CLI designed to work exclusively with Unix pipes,
|
|
48
|
+
providing flexibility and freedom to integrate with other tools.
|
|
49
|
+
|
|
50
|
+
**Pipe Site to Markdown**
|
|
51
|
+
|
|
52
|
+
Fetches the [Markdown Wikipedia page](https://en.wikipedia.org/wiki/Markdown) and converts it to Markdown preserving the original links and images.
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
curl -s https://en.wikipedia.org/wiki/Markdown \
|
|
56
|
+
| npx mdream --origin https://en.wikipedia.org --preset minimal \
|
|
57
|
+
| tee streaming.md
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
_Tip: The `--origin` flag will fix relative image and link paths_
|
|
61
|
+
|
|
62
|
+
**Local File to Markdown**
|
|
63
|
+
|
|
64
|
+
Converts a local HTML file to a Markdown file, using `tee` to write the output to a file and display it in the terminal.
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
cat index.html \
|
|
68
|
+
| npx mdream --preset minimal \
|
|
69
|
+
| tee streaming.md
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### CLI Options
|
|
73
|
+
|
|
74
|
+
- `--origin <url>`: Base URL for resolving relative links and images
|
|
75
|
+
- `--preset <preset>`: Conversion presets: minimal
|
|
76
|
+
- `--help`: Display help information
|
|
77
|
+
- `--version`: Display version information
|
|
78
|
+
|
|
79
|
+
## API Usage
|
|
80
|
+
|
|
81
|
+
Mdream provides two main functions for working with HTML:
|
|
82
|
+
- `htmlToMarkdown`: Useful if you already have the entire HTML payload you want to convert.
|
|
83
|
+
- `streamHtmlToMarkdown`: Best practice if you are fetching or reading from a local file.
|
|
84
|
+
|
|
85
|
+
**Convert existing HTML**
|
|
86
|
+
|
|
87
|
+
```ts
|
|
88
|
+
import { htmlToMarkdown } from 'mdream'
|
|
89
|
+
|
|
90
|
+
// Simple conversion
|
|
91
|
+
const markdown = htmlToMarkdown('<h1>Hello World</h1>')
|
|
92
|
+
console.log(markdown) // # Hello World
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
**Convert from Fetch**
|
|
96
|
+
|
|
97
|
+
```ts
|
|
98
|
+
import { streamHtmlToMarkdown } from 'mdream'
|
|
99
|
+
|
|
100
|
+
// Using fetch with streaming
|
|
101
|
+
const response = await fetch('https://example.com')
|
|
102
|
+
const htmlStream = response.body
|
|
103
|
+
const markdownGenerator = streamHtmlToMarkdown(htmlStream, {
|
|
104
|
+
origin: 'https://example.com'
|
|
105
|
+
})
|
|
106
|
+
|
|
107
|
+
// Process chunks as they arrive
|
|
108
|
+
for await (const chunk of markdownGenerator) {
|
|
109
|
+
console.log(chunk)
|
|
110
|
+
}
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
**Pure HTML Parser**
|
|
114
|
+
|
|
115
|
+
If you only need to parse HTML into a DOM-like AST without converting to Markdown, use `parseHtml`:
|
|
116
|
+
|
|
117
|
+
```ts
|
|
118
|
+
import { parseHtml } from 'mdream'
|
|
119
|
+
|
|
120
|
+
const html = '<div><h1>Title</h1><p>Content</p></div>'
|
|
121
|
+
const { events, remainingHtml } = parseHtml(html)
|
|
122
|
+
|
|
123
|
+
// Process the parsed events
|
|
124
|
+
events.forEach((event) => {
|
|
125
|
+
if (event.type === 'enter' && event.node.type === 'element') {
|
|
126
|
+
console.log('Entering element:', event.node.tagName)
|
|
127
|
+
}
|
|
128
|
+
})
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
The `parseHtml` function provides:
|
|
132
|
+
- **Pure AST parsing** - No markdown generation overhead
|
|
133
|
+
- **DOM events** - Enter/exit events for each element and text node
|
|
134
|
+
- **Plugin support** - Can apply plugins during parsing
|
|
135
|
+
- **Streaming compatible** - Works with the same plugin system
|
|
136
|
+
|
|
137
|
+
## Presets
|
|
138
|
+
|
|
139
|
+
Presets are pre-configured combinations of plugins for common use cases.
|
|
140
|
+
|
|
141
|
+
### Minimal Preset
|
|
142
|
+
|
|
143
|
+
The `minimal` preset optimizes for token reduction and cleaner output by removing non-essential content:
|
|
144
|
+
|
|
145
|
+
```ts
|
|
146
|
+
import { withMinimalPreset } from 'mdream/preset/minimal'
|
|
147
|
+
|
|
148
|
+
const options = withMinimalPreset({
|
|
149
|
+
origin: 'https://example.com'
|
|
150
|
+
})
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
**Plugins included:**
|
|
154
|
+
- `isolateMainPlugin()` - Extracts main content area
|
|
155
|
+
- `frontmatterPlugin()` - Generates YAML frontmatter from meta tags
|
|
156
|
+
- `tailwindPlugin()` - Converts Tailwind classes to Markdown
|
|
157
|
+
- `filterPlugin()` - Excludes forms, navigation, buttons, footers, and other non-content elements
|
|
158
|
+
|
|
159
|
+
**CLI Usage:**
|
|
160
|
+
```bash
|
|
161
|
+
curl -s https://example.com | npx mdream --preset minimal --origin https://example.com
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## Plugin System
|
|
165
|
+
|
|
166
|
+
The plugin system allows you to customize HTML to Markdown conversion by hooking into the processing pipeline. Plugins can filter content, extract data, transform nodes, or add custom behavior.
|
|
167
|
+
|
|
168
|
+
### Built-in Plugins
|
|
169
|
+
|
|
170
|
+
Mdream includes several built-in plugins that can be used individually or combined:
|
|
171
|
+
|
|
172
|
+
- **[`extractionPlugin`](./src/plugins/extraction.ts)**: Extract specific elements using CSS selectors for data analysis
|
|
173
|
+
- **[`filterPlugin`](./src/plugins/filter.ts)**: Include or exclude elements based on CSS selectors or tag IDs
|
|
174
|
+
- **[`frontmatterPlugin`](./src/plugins/frontmatter.ts)**: Generate YAML frontmatter from HTML head elements (title, meta tags)
|
|
175
|
+
- **[`isolateMainPlugin`](./src/plugins/isolate-main.ts)**: Isolate main content using `<main>` elements or header-to-footer boundaries
|
|
176
|
+
- **[`tailwindPlugin`](./src/plugins/tailwind.ts)**: Convert Tailwind CSS classes to Markdown formatting (bold, italic, etc.)
|
|
177
|
+
- **[`readabilityPlugin`](./src/plugins/readability.ts)**: Content scoring and extraction (experimental)
|
|
178
|
+
|
|
179
|
+
```ts
|
|
180
|
+
import { filterPlugin, frontmatterPlugin, isolateMainPlugin } from 'mdream/plugins'
|
|
181
|
+
|
|
182
|
+
const markdown = htmlToMarkdown(html, {
|
|
183
|
+
plugins: [
|
|
184
|
+
isolateMainPlugin(),
|
|
185
|
+
frontmatterPlugin(),
|
|
186
|
+
filterPlugin({ exclude: ['nav', '.sidebar', '#footer'] })
|
|
187
|
+
]
|
|
188
|
+
})
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### Plugin Hooks
|
|
192
|
+
|
|
193
|
+
- `beforeNodeProcess`: Called before any node processing, can skip nodes
|
|
194
|
+
- `onNodeEnter`: Called when entering an element node
|
|
195
|
+
- `onNodeExit`: Called when exiting an element node
|
|
196
|
+
- `processTextNode`: Called for each text node
|
|
197
|
+
- `processAttributes`: Called to process element attributes
|
|
198
|
+
|
|
199
|
+
### Creating a Plugin
|
|
200
|
+
|
|
201
|
+
Use `createPlugin()` to create a plugin with type safety:
|
|
202
|
+
|
|
203
|
+
```ts
|
|
204
|
+
import type { ElementNode, TextNode } from 'mdream'
|
|
205
|
+
import { htmlToMarkdown } from 'mdream'
|
|
206
|
+
import { createPlugin } from 'mdream/plugins'
|
|
207
|
+
|
|
208
|
+
const myPlugin = createPlugin({
|
|
209
|
+
onNodeEnter(node: ElementNode) {
|
|
210
|
+
if (node.name === 'h1') {
|
|
211
|
+
return '🔥 '
|
|
212
|
+
}
|
|
213
|
+
},
|
|
214
|
+
|
|
215
|
+
processTextNode(textNode: TextNode) {
|
|
216
|
+
// Transform text content
|
|
217
|
+
if (textNode.parent?.attributes?.id === 'highlight') {
|
|
218
|
+
return {
|
|
219
|
+
content: `**${textNode.value}**`,
|
|
220
|
+
skip: false
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
})
|
|
225
|
+
|
|
226
|
+
// Use the plugin
|
|
227
|
+
const html: string = '<div id="highlight">Important text</div>'
|
|
228
|
+
const markdown: string = htmlToMarkdown(html, { plugins: [myPlugin] })
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
### Example: Content Filter Plugin
|
|
232
|
+
|
|
233
|
+
```ts
|
|
234
|
+
import type { ElementNode, NodeEvent } from 'mdream'
|
|
235
|
+
import { ELEMENT_NODE } from 'mdream'
|
|
236
|
+
import { createPlugin } from 'mdream/plugins'
|
|
237
|
+
|
|
238
|
+
const adBlockPlugin = createPlugin({
|
|
239
|
+
beforeNodeProcess(event: NodeEvent) {
|
|
240
|
+
const { node } = event
|
|
241
|
+
|
|
242
|
+
if (node.type === ELEMENT_NODE && node.name === 'div') {
|
|
243
|
+
const element = node as ElementNode
|
|
244
|
+
// Skip ads and promotional content
|
|
245
|
+
if (element.attributes?.class?.includes('ad')
|
|
246
|
+
|| element.attributes?.id?.includes('promo')) {
|
|
247
|
+
return { skip: true }
|
|
248
|
+
}
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
})
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
### Extraction Plugin
|
|
255
|
+
|
|
256
|
+
Extract specific elements and their content during HTML processing for data analysis or content discovery:
|
|
257
|
+
|
|
258
|
+
```ts
|
|
259
|
+
import { extractionPlugin, htmlToMarkdown } from 'mdream'
|
|
260
|
+
|
|
261
|
+
const html: string = `
|
|
262
|
+
<article>
|
|
263
|
+
<h2>Getting Started</h2>
|
|
264
|
+
<p>This is a tutorial about web scraping.</p>
|
|
265
|
+
<img src="/hero.jpg" alt="Hero image" />
|
|
266
|
+
</article>
|
|
267
|
+
`
|
|
268
|
+
|
|
269
|
+
// Extract elements using CSS selectors
|
|
270
|
+
const plugin = extractionPlugin({
|
|
271
|
+
'h2': (element: ExtractedElement, state: MdreamRuntimeState) => {
|
|
272
|
+
console.log('Heading:', element.textContent) // "Getting Started"
|
|
273
|
+
console.log('Depth:', state.depth) // Current nesting depth
|
|
274
|
+
},
|
|
275
|
+
'img[alt]': (element: ExtractedElement, state: MdreamRuntimeState) => {
|
|
276
|
+
console.log('Image:', element.attributes.src, element.attributes.alt)
|
|
277
|
+
// "Image: /hero.jpg Hero image"
|
|
278
|
+
console.log('Context:', state.options) // Access to conversion options
|
|
279
|
+
}
|
|
280
|
+
})
|
|
281
|
+
|
|
282
|
+
htmlToMarkdown(html, { plugins: [plugin] })
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
The extraction plugin provides memory-efficient element extraction with full text content and attributes, perfect for SEO analysis, content discovery, and data mining.
|
|
286
|
+
|
|
287
|
+
## Credits
|
|
288
|
+
|
|
289
|
+
- [ultrahtml](https://github.com/natemoo-re/ultrahtml): HTML parsing inspiration
|
|
290
|
+
|
|
291
|
+
## License
|
|
292
|
+
|
|
293
|
+
Licensed under the [MIT license](https://github.com/harlan-zw/mdream/blob/main/LICENSE.md).
|
|
294
|
+
|
|
295
|
+
<!-- Badges -->
|
|
296
|
+
[npm-version-src]: https://img.shields.io/npm/v/mdream/latest.svg?style=flat&colorA=18181B&colorB=4C9BE0
|
|
297
|
+
[npm-version-href]: https://npmjs.com/package/mdream
|
|
298
|
+
|
|
299
|
+
[npm-downloads-src]: https://img.shields.io/npm/dm/mdream.svg?style=flat&colorA=18181B&colorB=4C9BE0
|
|
300
|
+
[npm-downloads-href]: https://npmjs.com/package/mdream
|
|
301
|
+
|
|
302
|
+
[license-src]: https://img.shields.io/github/license/harlan-zw/mdream.svg?style=flat&colorA=18181B&colorB=4C9BE0
|
|
303
|
+
[license-href]: https://github.com/harlan-zw/mdream/blob/main/LICENSE.md
|
|
@@ -1478,10 +1478,15 @@ function generateLlmsTxtContent(files, options) {
|
|
|
1478
1478
|
if (files.length > 0) {
|
|
1479
1479
|
content += `## Pages\n\n`;
|
|
1480
1480
|
for (const file of files) {
|
|
1481
|
-
const url = origin + file.url;
|
|
1482
1481
|
const desc = file.metadata?.description;
|
|
1483
1482
|
const descText = desc ? `: ${desc.substring(0, 100)}${desc.length > 100 ? "..." : ""}` : "";
|
|
1484
|
-
|
|
1483
|
+
if (file.filePath && options.outputDir && file.filePath.endsWith(".md")) {
|
|
1484
|
+
const relativePath = relative(options.outputDir, file.filePath);
|
|
1485
|
+
content += `- [${file.title}](${relativePath})${descText}\n`;
|
|
1486
|
+
} else {
|
|
1487
|
+
const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin + file.url;
|
|
1488
|
+
content += `- [${file.title}](${url})${descText}\n`;
|
|
1489
|
+
}
|
|
1485
1490
|
}
|
|
1486
1491
|
}
|
|
1487
1492
|
return content;
|
|
@@ -1501,10 +1506,14 @@ function generateLlmsFullTxtContent(files, options) {
|
|
|
1501
1506
|
}
|
|
1502
1507
|
content += `\n---\n\n`;
|
|
1503
1508
|
for (const file of files) {
|
|
1504
|
-
const url = origin ? origin + file.url : file.url;
|
|
1509
|
+
const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin ? origin + file.url : file.url;
|
|
1505
1510
|
content += `## ${file.title}\n\n`;
|
|
1506
|
-
content += `**URL:** ${url}\n
|
|
1507
|
-
|
|
1511
|
+
content += `**URL:** ${url}\n`;
|
|
1512
|
+
if (file.filePath && options.outputDir) {
|
|
1513
|
+
const relativePath = relative(options.outputDir, file.filePath);
|
|
1514
|
+
content += `**File:** ${relativePath}\n`;
|
|
1515
|
+
} else if (file.filePath) content += `**File:** ${file.filePath}\n`;
|
|
1516
|
+
content += `\n${file.content}\n\n---\n\n`;
|
|
1508
1517
|
}
|
|
1509
1518
|
}
|
|
1510
1519
|
return content;
|
package/dist/cli.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import "./_chunks/extraction-D28Kr1J3.mjs";
|
|
2
|
-
import { generateLlmsTxtArtifacts, streamHtmlToMarkdown } from "./_chunks/src-
|
|
2
|
+
import { generateLlmsTxtArtifacts, streamHtmlToMarkdown } from "./_chunks/src-CrlAO7kH.mjs";
|
|
3
3
|
import "./_chunks/plugins-DXY-fo9h.mjs";
|
|
4
4
|
import { withMinimalPreset } from "./_chunks/minimal-CCnrG7a1.mjs";
|
|
5
5
|
import { mkdir, writeFile } from "node:fs/promises";
|
package/dist/index.d.mts
CHANGED
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
1
|
import { TagIdMap, createPlugin } from "./_chunks/extraction-D28Kr1J3.mjs";
|
|
2
|
-
import { MarkdownProcessor, generateLlmsTxtArtifacts, htmlToMarkdown, parseHtml, streamHtmlToMarkdown } from "./_chunks/src-
|
|
2
|
+
import { MarkdownProcessor, generateLlmsTxtArtifacts, htmlToMarkdown, parseHtml, streamHtmlToMarkdown } from "./_chunks/src-CrlAO7kH.mjs";
|
|
3
3
|
|
|
4
4
|
export { MarkdownProcessor, TagIdMap, createPlugin, generateLlmsTxtArtifacts, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|