mdream 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/minimal-Ru8PBNVI.mjs +40 -0
- package/dist/_chunks/{plugin-DCJFRZej.mjs → plugin-Bqz9GKOA.mjs} +1 -1
- package/dist/_chunks/plugin-D45YAMmt.d.mts +12 -0
- package/dist/_chunks/plugins-D305pIpW.mjs +844 -0
- package/dist/_chunks/{stream-BeojJNLt.mjs → stream-IeCVDuTy.mjs} +53 -35
- package/dist/_chunks/{types-BHoibuoP.d.mts → types-D9VKEbix.d.mts} +29 -8
- package/dist/cli.mjs +14 -8
- package/dist/index.d.mts +75 -3
- package/dist/index.mjs +2 -2
- package/dist/plugins.d.mts +2 -2
- package/dist/plugins.mjs +2 -3
- package/dist/preset/minimal.d.mts +1 -1
- package/dist/preset/minimal.mjs +3 -39
- package/package.json +3 -17
- package/README.md +0 -252
- package/dist/_chunks/index-VTwTBxk0.d.mts +0 -58
- package/dist/_chunks/plugins-DGakgpSl.mjs +0 -582
- package/dist/_chunks/readability-BfCjcbbx.mjs +0 -271
package/README.md
DELETED
|
@@ -1,252 +0,0 @@
|
|
|
1
|
-
<h1>mdream</h1>
|
|
2
|
-
|
|
3
|
-
[](https://npmjs.com/package/mdream)
|
|
4
|
-
[](https://npm.chart.dev/mdream)
|
|
5
|
-
[](https://github.com/harlan-zw/mdream/blob/main/LICENSE.md)
|
|
6
|
-
|
|
7
|
-
> Ultra-performant JavaScript HTML to Markdown converter optimized for LLMs.
|
|
8
|
-
|
|
9
|
-
<img src=".github/logo.png" alt="mdream logo" width="200">
|
|
10
|
-
|
|
11
|
-
<p align="center">
|
|
12
|
-
<table>
|
|
13
|
-
<tbody>
|
|
14
|
-
<td align="center">
|
|
15
|
-
<sub>Made possible by my <a href="https://github.com/sponsors/harlan-zw">Sponsor Program 💖</a><br> Follow me <a href="https://twitter.com/harlan_zw">@harlan_zw</a> 🐦 • Join <a href="https://discord.gg/275MBUBvgP">Discord</a> for help</sub><br>
|
|
16
|
-
</td>
|
|
17
|
-
</tbody>
|
|
18
|
-
</table>
|
|
19
|
-
</p>
|
|
20
|
-
|
|
21
|
-
## Features
|
|
22
|
-
|
|
23
|
-
- 🧠 Content Extraction: [Readability.js]() scoring heuristics for [~50% fewer tokens*]() and improved accuracy.
|
|
24
|
-
- 🔍 GitHub Flavored Markdown: Frontmatter, Nested & HTML markup support.
|
|
25
|
-
|
|
26
|
-
**Ultra Performant**
|
|
27
|
-
- 🚀 Convert 1.4MB of HTML in [~50ms*]() with advanced streaming support, including content-based buffering.
|
|
28
|
-
- ⚡ 5kB gzip, zero dependencies.
|
|
29
|
-
|
|
30
|
-
**Adaptable**
|
|
31
|
-
|
|
32
|
-
- ⚙️ Run anywhere: CLI, edge workers, browsers, Node, etc.
|
|
33
|
-
- 🔌 Extensible: [Plugin system](#plugin-system) for customizing and extending functionality.
|
|
34
|
-
|
|
35
|
-
**CLI**
|
|
36
|
-
|
|
37
|
-
- Integrates with crawlee to provide Markdown dumps of entire sites
|
|
38
|
-
- Run an MCP web server
|
|
39
|
-
|
|
40
|
-
## Why Mdream?
|
|
41
|
-
|
|
42
|
-
Traditional HTML to Markdown converters were not built for LLMs or humans. They tend to be slow and bloated and produce output that's poorly suited for LLM token usage or for
|
|
43
|
-
human readability.
|
|
44
|
-
|
|
45
|
-
Other LLM-specific converters focus on supporting _all_ document formats, resulting in larger bundles and lower-quality Markdown output.
|
|
46
|
-
|
|
47
|
-
Mdream is an HTML parser + Markdown generator built specifically for producing high-quality Markdown for LLMs as quickly as possible. It provides
|
|
48
|
-
a powerful plugin system to customize the conversion process, allowing you to extract, transform, and filter content as needed.
|
|
49
|
-
|
|
50
|
-
## CLI Usage
|
|
51
|
-
|
|
52
|
-
The Mdream CLI is designed to work exclusively with Unix pipes, providing flexibility and freedom to integrate with other tools.
|
|
53
|
-
|
|
54
|
-
**Pipe Site to Markdown**
|
|
55
|
-
|
|
56
|
-
Fetches the [Markdown Wikipedia page](https://en.wikipedia.org/wiki/Markdown) and converts it to Markdown, preserving the original links and images.
|
|
57
|
-
|
|
58
|
-
```bash
|
|
59
|
-
curl -s https://en.wikipedia.org/wiki/Markdown \
|
|
60
|
-
| npx mdream --origin https://en.wikipedia.org --filters minimal-from-first-header \
|
|
61
|
-
| tee streaming.md
|
|
62
|
-
```
|
|
63
|
-
|
|
64
|
-
_Tip: The `--origin` flag will fix relative image and link paths_
|
|
65
|
-
|
|
66
|
-
**Local File to Markdown**
|
|
67
|
-
|
|
68
|
-
Converts a local HTML file to a Markdown file, using `tee` to write the output to a file and display it in the terminal.
|
|
69
|
-
|
|
70
|
-
```bash
|
|
71
|
-
cat index.html \
|
|
72
|
-
| npx mdream \
|
|
73
|
-
| tee streaming.md
|
|
74
|
-
```
|
|
75
|
-
|
|
76
|
-
### CLI Options
|
|
77
|
-
|
|
78
|
-
- `--origin <url>`: Base URL for resolving relative links and images
|
|
79
|
-
- `-v, --verbose`: Enable verbose debug logging to stderr
|
|
80
|
-
- `--help`: Display help information
|
|
81
|
-
- `--version`: Display version information
|
|
82
|
-
|
|
83
|
-
## API Usage
|
|
84
|
-
|
|
85
|
-
### Installation
|
|
86
|
-
|
|
87
|
-
```bash
|
|
88
|
-
# npm
|
|
89
|
-
npm install mdream
|
|
90
|
-
|
|
91
|
-
# yarn
|
|
92
|
-
yarn add mdream
|
|
93
|
-
|
|
94
|
-
# pnpm
|
|
95
|
-
pnpm add mdream
|
|
96
|
-
```
|
|
97
|
-
|
|
98
|
-
### Usage
|
|
99
|
-
|
|
100
|
-
Mdream provides two utils for working with HTML; both process content as a stream.
|
|
101
|
-
- `htmlToMarkdown`: Useful if you already have the entire HTML payload you want to convert.
|
|
102
|
-
- `streamHtmlToMarkdown`: Best practice if you are fetching or reading from a local file.
|
|
103
|
-
|
|
104
|
-
**Convert existing HTML**
|
|
105
|
-
|
|
106
|
-
```ts
|
|
107
|
-
import { htmlToMarkdown } from 'mdream'
|
|
108
|
-
|
|
109
|
-
// Simple conversion
|
|
110
|
-
const markdown = htmlToMarkdown('<h1>Hello World</h1>')
|
|
111
|
-
console.log(markdown) // # Hello World
|
|
112
|
-
```
|
|
113
|
-
|
|
114
|
-
**Convert from Fetch**
|
|
115
|
-
|
|
116
|
-
```ts
|
|
117
|
-
import { streamHtmlToMarkdown } from 'mdream'
|
|
118
|
-
|
|
119
|
-
// Using fetch with streaming
|
|
120
|
-
const response = await fetch('https://example.com')
|
|
121
|
-
const htmlStream = response.body
|
|
122
|
-
const markdownGenerator = streamHtmlToMarkdown(htmlStream, {
|
|
123
|
-
origin: 'https://example.com',
|
|
124
|
-
filters: 'minimal-from-first-header'
|
|
125
|
-
})
|
|
126
|
-
|
|
127
|
-
// Process chunks as they arrive
|
|
128
|
-
for await (const chunk of markdownGenerator) {
|
|
129
|
-
console.log(chunk)
|
|
130
|
-
}
|
|
131
|
-
```
|
|
132
|
-
|
|
133
|
-
## Documentation
|
|
134
|
-
|
|
135
|
-
### Plugin System
|
|
136
|
-
|
|
137
|
-
The plugin system allows you to customize HTML to Markdown conversion by hooking into the processing pipeline. Plugins can filter content, extract data, transform nodes, or add custom behavior.
|
|
138
|
-
|
|
139
|
-
#### Plugin Hooks
|
|
140
|
-
|
|
141
|
-
- `beforeNodeProcess`: Called before any node processing, can skip nodes
|
|
142
|
-
- `onNodeEnter`: Called when entering an element node
|
|
143
|
-
- `onNodeExit`: Called when exiting an element node
|
|
144
|
-
- `processTextNode`: Called for each text node
|
|
145
|
-
- `processAttributes`: Called to process element attributes
|
|
146
|
-
|
|
147
|
-
#### Creating a Plugin
|
|
148
|
-
|
|
149
|
-
Use `createPlugin()` to create a plugin with type safety:
|
|
150
|
-
|
|
151
|
-
```ts
|
|
152
|
-
import type { ElementNode, TextNode } from 'mdream'
|
|
153
|
-
import { htmlToMarkdown } from 'mdream'
|
|
154
|
-
import { createPlugin } from 'mdream/plugins'
|
|
155
|
-
|
|
156
|
-
const myPlugin = createPlugin({
|
|
157
|
-
onNodeEnter(node: ElementNode): string | undefined {
|
|
158
|
-
if (node.name === 'h1') {
|
|
159
|
-
return '🔥 '
|
|
160
|
-
}
|
|
161
|
-
},
|
|
162
|
-
|
|
163
|
-
processTextNode(textNode: TextNode): { content: string, skip: boolean } | undefined {
|
|
164
|
-
// Transform text content
|
|
165
|
-
if (textNode.parent?.attributes?.id === 'highlight') {
|
|
166
|
-
return {
|
|
167
|
-
content: `**${textNode.value}**`,
|
|
168
|
-
skip: false
|
|
169
|
-
}
|
|
170
|
-
}
|
|
171
|
-
}
|
|
172
|
-
})
|
|
173
|
-
|
|
174
|
-
// Use the plugin
|
|
175
|
-
const html: string = '<div id="highlight">Important text</div>'
|
|
176
|
-
const markdown: string = htmlToMarkdown(html, { plugins: [myPlugin] })
|
|
177
|
-
```
|
|
178
|
-
|
|
179
|
-
#### Example: Content Filter Plugin
|
|
180
|
-
|
|
181
|
-
```ts
|
|
182
|
-
import type { ElementNode, NodeEvent } from 'mdream'
|
|
183
|
-
import { ELEMENT_NODE } from 'mdream'
|
|
184
|
-
import { createPlugin } from 'mdream/plugins'
|
|
185
|
-
|
|
186
|
-
const adBlockPlugin = createPlugin({
|
|
187
|
-
beforeNodeProcess(event: NodeEvent): { skip: boolean } | undefined {
|
|
188
|
-
const { node } = event
|
|
189
|
-
|
|
190
|
-
if (node.type === ELEMENT_NODE && node.name === 'div') {
|
|
191
|
-
const element = node as ElementNode
|
|
192
|
-
// Skip ads and promotional content
|
|
193
|
-
if (element.attributes?.class?.includes('ad')
|
|
194
|
-
|| element.attributes?.id?.includes('promo')) {
|
|
195
|
-
return { skip: true }
|
|
196
|
-
}
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
})
|
|
200
|
-
```
|
|
201
|
-
|
|
202
|
-
#### Extraction Plugin
|
|
203
|
-
|
|
204
|
-
Extract specific elements and their content during HTML processing for data analysis or content discovery:
|
|
205
|
-
|
|
206
|
-
```ts
|
|
207
|
-
import type { ExtractedElement, MdreamRuntimeState } from 'mdream/plugins'
|
|
208
|
-
import { extractionPlugin, htmlToMarkdown } from 'mdream'
|
|
209
|
-
|
|
210
|
-
const html: string = `
|
|
211
|
-
<article>
|
|
212
|
-
<h2>Getting Started</h2>
|
|
213
|
-
<p>This is a tutorial about web scraping.</p>
|
|
214
|
-
<img src="/hero.jpg" alt="Hero image" />
|
|
215
|
-
</article>
|
|
216
|
-
`
|
|
217
|
-
|
|
218
|
-
// Extract elements using CSS selectors
|
|
219
|
-
const plugin = extractionPlugin({
|
|
220
|
-
'h2': (element: ExtractedElement, state: MdreamRuntimeState): void => {
|
|
221
|
-
console.log('Heading:', element.textContent) // "Getting Started"
|
|
222
|
-
console.log('Depth:', state.depth) // Current nesting depth
|
|
223
|
-
},
|
|
224
|
-
'img[alt]': (element: ExtractedElement, state: MdreamRuntimeState): void => {
|
|
225
|
-
console.log('Image:', element.attributes.src, element.attributes.alt)
|
|
226
|
-
// "Image: /hero.jpg Hero image"
|
|
227
|
-
console.log('Context:', state.options) // Access to conversion options
|
|
228
|
-
}
|
|
229
|
-
})
|
|
230
|
-
|
|
231
|
-
htmlToMarkdown(html, { plugins: [plugin] })
|
|
232
|
-
```
|
|
233
|
-
|
|
234
|
-
The extraction plugin provides memory-efficient element extraction with full text content and attributes, perfect for SEO analysis, content discovery, and data mining.
|
|
235
|
-
|
|
236
|
-
## Credits
|
|
237
|
-
|
|
238
|
-
- [ultrahtml](https://github.com/natemoo-re/ultrahtml): HTML parsing inspiration
|
|
239
|
-
|
|
240
|
-
## License
|
|
241
|
-
|
|
242
|
-
Licensed under the [MIT license](https://github.com/harlan-zw/mdream/blob/main/LICENSE.md).
|
|
243
|
-
|
|
244
|
-
<!-- Badges -->
|
|
245
|
-
[npm-version-src]: https://img.shields.io/npm/v/mdream/latest.svg?style=flat&colorA=18181B&colorB=4C9BE0
|
|
246
|
-
[npm-version-href]: https://npmjs.com/package/mdream
|
|
247
|
-
|
|
248
|
-
[npm-downloads-src]: https://img.shields.io/npm/dm/mdream.svg?style=flat&colorA=18181B&colorB=4C9BE0
|
|
249
|
-
[npm-downloads-href]: https://npmjs.com/package/mdream
|
|
250
|
-
|
|
251
|
-
[license-src]: https://img.shields.io/github/license/harlan-zw/mdream.svg?style=flat&colorA=18181B&colorB=4C9BE0
|
|
252
|
-
[license-href]: https://github.com/harlan-zw/mdream/blob/main/LICENSE.md
|
|
@@ -1,58 +0,0 @@
|
|
|
1
|
-
import { HTMLToMarkdownOptions, NodeEvent, Plugin } from "./types-BHoibuoP.mjs";
|
|
2
|
-
import { ReadableStream } from "node:stream/web";
|
|
3
|
-
|
|
4
|
-
//#region src/const.d.ts
|
|
5
|
-
|
|
6
|
-
declare const TagIdMap: Record<string, number>;
|
|
7
|
-
//#endregion
|
|
8
|
-
//#region src/markdown-processor.d.ts
|
|
9
|
-
|
|
10
|
-
/**
|
|
11
|
-
* Creates a markdown processor that consumes DOM events and generates markdown
|
|
12
|
-
*/
|
|
13
|
-
declare function createMarkdownProcessor(options?: HTMLToMarkdownOptions): {
|
|
14
|
-
processEvent: (event: NodeEvent) => void;
|
|
15
|
-
processHtml: (html: string) => void;
|
|
16
|
-
getMarkdown: () => string;
|
|
17
|
-
getMarkdownChunk: () => string;
|
|
18
|
-
};
|
|
19
|
-
declare const MarkdownProcessor: typeof createMarkdownProcessor;
|
|
20
|
-
//#endregion
|
|
21
|
-
//#region src/parse.d.ts
|
|
22
|
-
interface ParseOptions {
|
|
23
|
-
plugins?: Plugin[];
|
|
24
|
-
}
|
|
25
|
-
interface ParseResult {
|
|
26
|
-
events: NodeEvent[];
|
|
27
|
-
remainingHtml: string;
|
|
28
|
-
}
|
|
29
|
-
/**
|
|
30
|
-
* Pure HTML parser that emits DOM events
|
|
31
|
-
* Completely decoupled from markdown generation
|
|
32
|
-
*/
|
|
33
|
-
declare function parseHtml(html: string, options?: ParseOptions): ParseResult;
|
|
34
|
-
/**
|
|
35
|
-
* Streaming HTML parser - calls onEvent for each DOM event
|
|
36
|
-
*/
|
|
37
|
-
//#endregion
|
|
38
|
-
//#region src/pluggable/plugin.d.ts
|
|
39
|
-
/**
|
|
40
|
-
* Create a plugin that implements the Plugin interface with improved type inference
|
|
41
|
-
*
|
|
42
|
-
* @returns A complete plugin implementation
|
|
43
|
-
*/
|
|
44
|
-
declare function createPlugin<T extends Partial<Plugin>>(plugin: T): Plugin;
|
|
45
|
-
//#endregion
|
|
46
|
-
//#region src/stream.d.ts
|
|
47
|
-
/**
|
|
48
|
-
* Creates a markdown stream from an HTML stream
|
|
49
|
-
* @param htmlStream - ReadableStream of HTML content (as Uint8Array or string)
|
|
50
|
-
* @param options - Configuration options for conversion
|
|
51
|
-
* @returns An async generator yielding markdown chunks
|
|
52
|
-
*/
|
|
53
|
-
declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options?: HTMLToMarkdownOptions): AsyncIterable<string>;
|
|
54
|
-
//#endregion
|
|
55
|
-
//#region src/index.d.ts
|
|
56
|
-
declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
|
|
57
|
-
//#endregion
|
|
58
|
-
export { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|