mdream 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +63 -25
- package/dist/cli.mjs +1 -1
- package/dist/index.d.mts +2 -2
- package/dist/index.d.ts +2 -2
- package/dist/index.mjs +2 -2
- package/dist/plugins.d.mts +1 -1
- package/dist/plugins.d.ts +1 -1
- package/dist/preset/minimal.d.mts +1 -1
- package/dist/preset/minimal.d.ts +1 -1
- package/dist/shared/{mdream.a2AvjJLp.d.mts → mdream.C9ruFMrk.d.mts} +2 -0
- package/dist/shared/{mdream.a2AvjJLp.d.ts → mdream.C9ruFMrk.d.ts} +2 -0
- package/dist/shared/{mdream.N3Qlh-YP.mjs → mdream.CRBi8vE8.mjs} +7 -2
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -44,7 +44,10 @@
|
|
|
44
44
|
Traditional HTML to Markdown converters were not built for LLMs or humans. They tend to be slow and bloated and produce output that's poorly suited for LLMs token usage or for
|
|
45
45
|
human readability.
|
|
46
46
|
|
|
47
|
-
|
|
47
|
+
Other LLM specific convertors focus on supporting _all_ document formats, resulting in larger bundles and lower quality Markdown output.
|
|
48
|
+
|
|
49
|
+
Mdream is an ultra-performant HTML to Markdown converter built specifically for producing high-quality Markdown for LLMs as quickly as possible. It provides
|
|
50
|
+
a powerful plugin system to customize the conversion process, allowing you to extract, transform, and filter content as needed.
|
|
48
51
|
|
|
49
52
|
Perfect for: RAG systems, web scraping, content extraction, ChatGPT/Claude integration, and large-scale document processing.
|
|
50
53
|
|
|
@@ -110,7 +113,7 @@ import { htmlToMarkdown } from 'mdream'
|
|
|
110
113
|
// Simple conversion
|
|
111
114
|
const markdown = htmlToMarkdown('<h1>Hello World</h1>')
|
|
112
115
|
console.log(markdown) // # Hello World
|
|
113
|
-
|
|
116
|
+
```
|
|
114
117
|
|
|
115
118
|
**Convert from Fetch**
|
|
116
119
|
|
|
@@ -135,33 +138,69 @@ for await (const chunk of markdownGenerator) {
|
|
|
135
138
|
|
|
136
139
|
### Plugin System
|
|
137
140
|
|
|
138
|
-
|
|
141
|
+
The plugin system allows you to customize HTML to Markdown conversion by hooking into the processing pipeline. Plugins can filter content, extract data, transform nodes, or add custom behavior.
|
|
142
|
+
|
|
143
|
+
#### Plugin Hooks
|
|
144
|
+
|
|
145
|
+
- `beforeNodeProcess`: Called before any node processing, can skip nodes
|
|
146
|
+
- `onNodeEnter`: Called when entering an element node
|
|
147
|
+
- `onNodeExit`: Called when exiting an element node
|
|
148
|
+
- `processTextNode`: Called for each text node
|
|
149
|
+
- `processAttributes`: Called to process element attributes
|
|
150
|
+
|
|
151
|
+
#### Creating a Plugin
|
|
152
|
+
|
|
153
|
+
Use `createPlugin()` to create a plugin with type safety:
|
|
139
154
|
|
|
140
155
|
```ts
|
|
141
|
-
import {
|
|
156
|
+
import type { ElementNode, TextNode } from 'mdream'
|
|
157
|
+
import { htmlToMarkdown } from 'mdream'
|
|
158
|
+
import { createPlugin } from 'mdream/plugins'
|
|
142
159
|
|
|
143
|
-
// Create a custom plugin
|
|
144
160
|
const myPlugin = createPlugin({
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
161
|
+
onNodeEnter(node: ElementNode): string | undefined {
|
|
162
|
+
if (node.name === 'h1') {
|
|
163
|
+
return '🔥 '
|
|
164
|
+
}
|
|
165
|
+
},
|
|
166
|
+
|
|
167
|
+
processTextNode(textNode: TextNode): { content: string, skip: boolean } | undefined {
|
|
168
|
+
// Transform text content
|
|
169
|
+
if (textNode.parent?.attributes?.id === 'highlight') {
|
|
170
|
+
return {
|
|
171
|
+
content: `**${textNode.value}**`,
|
|
172
|
+
skip: false
|
|
173
|
+
}
|
|
149
174
|
}
|
|
150
|
-
return content
|
|
151
175
|
}
|
|
152
176
|
})
|
|
153
177
|
|
|
154
|
-
// Use
|
|
155
|
-
const html = '<div
|
|
156
|
-
const markdown = htmlToMarkdown(html, {
|
|
157
|
-
|
|
158
|
-
withTailwind(), // Apply Tailwind class processing
|
|
159
|
-
filterUnsupportedTags(), // Filter out unsupported tags
|
|
160
|
-
myPlugin // Apply custom transformations
|
|
161
|
-
]
|
|
162
|
-
})
|
|
178
|
+
// Use the plugin
|
|
179
|
+
const html: string = '<div id="highlight">Important text</div>'
|
|
180
|
+
const markdown: string = htmlToMarkdown(html, { plugins: [myPlugin] })
|
|
181
|
+
```
|
|
163
182
|
|
|
164
|
-
|
|
183
|
+
#### Example: Content Filter Plugin
|
|
184
|
+
|
|
185
|
+
```ts
|
|
186
|
+
import type { ElementNode, NodeEvent } from 'mdream'
|
|
187
|
+
import { ELEMENT_NODE } from 'mdream'
|
|
188
|
+
import { createPlugin } from 'mdream/plugins'
|
|
189
|
+
|
|
190
|
+
const adBlockPlugin = createPlugin({
|
|
191
|
+
beforeNodeProcess(event: NodeEvent): { skip: boolean } | undefined {
|
|
192
|
+
const { node } = event
|
|
193
|
+
|
|
194
|
+
if (node.type === ELEMENT_NODE && node.name === 'div') {
|
|
195
|
+
const element = node as ElementNode
|
|
196
|
+
// Skip ads and promotional content
|
|
197
|
+
if (element.attributes?.class?.includes('ad')
|
|
198
|
+
|| element.attributes?.id?.includes('promo')) {
|
|
199
|
+
return { skip: true }
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
})
|
|
165
204
|
```
|
|
166
205
|
|
|
167
206
|
#### Extraction Plugin
|
|
@@ -169,9 +208,10 @@ console.log(markdown) // "⚠️ **Important message** ⚠️"
|
|
|
169
208
|
Extract specific elements and their content during HTML processing for data analysis or content discovery:
|
|
170
209
|
|
|
171
210
|
```ts
|
|
211
|
+
import type { ExtractedElement } from 'mdream/plugins'
|
|
172
212
|
import { extractionPlugin, htmlToMarkdown } from 'mdream'
|
|
173
213
|
|
|
174
|
-
const html = `
|
|
214
|
+
const html: string = `
|
|
175
215
|
<article>
|
|
176
216
|
<h2>Getting Started</h2>
|
|
177
217
|
<p>This is a tutorial about web scraping.</p>
|
|
@@ -181,10 +221,10 @@ const html = `
|
|
|
181
221
|
|
|
182
222
|
// Extract elements using CSS selectors
|
|
183
223
|
const plugin = extractionPlugin({
|
|
184
|
-
'h2': (element) => {
|
|
224
|
+
'h2': (element: ExtractedElement): void => {
|
|
185
225
|
console.log('Heading:', element.textContent) // "Getting Started"
|
|
186
226
|
},
|
|
187
|
-
'img[alt]': (element) => {
|
|
227
|
+
'img[alt]': (element: ExtractedElement): void => {
|
|
188
228
|
console.log('Image:', element.attributes.src, element.attributes.alt)
|
|
189
229
|
// "Image: /hero.jpg Hero image"
|
|
190
230
|
}
|
|
@@ -195,8 +235,6 @@ htmlToMarkdown(html, { plugins: [plugin] })
|
|
|
195
235
|
|
|
196
236
|
The extraction plugin provides memory-efficient element extraction with full text content and attributes, perfect for SEO analysis, content discovery, and data mining.
|
|
197
237
|
|
|
198
|
-
For more details, see the [plugin documentation](./docs/plugins.md).
|
|
199
|
-
|
|
200
238
|
## Credits
|
|
201
239
|
|
|
202
240
|
- [ultrahtml](https://github.com/natemoo-re/ultrahtml): HTML parsing inspiration
|
package/dist/cli.mjs
CHANGED
|
@@ -2,7 +2,7 @@ import { Readable } from 'node:stream';
|
|
|
2
2
|
import { cac } from 'cac';
|
|
3
3
|
import { f as frontmatterPlugin } from './shared/mdream.C6Z2rfeq.mjs';
|
|
4
4
|
import { r as readabilityPlugin } from './shared/mdream.DMUbnRbh.mjs';
|
|
5
|
-
import { s as streamHtmlToMarkdown } from './shared/mdream.N3Qlh-YP.mjs';
|
|
5
|
+
import { s as streamHtmlToMarkdown } from './shared/mdream.CRBi8vE8.mjs';
|
|
6
6
|
import './shared/mdream.Ch6B8TEB.mjs';
|
|
7
7
|
|
|
8
8
|
async function streamingConvert(options = {}) {
|
package/dist/index.d.mts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { H as HTMLToMarkdownOptions } from './shared/mdream.a2AvjJLp.mjs';
|
|
2
|
-
export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.a2AvjJLp.mjs';
|
|
1
|
+
import { H as HTMLToMarkdownOptions } from './shared/mdream.C9ruFMrk.mjs';
|
|
2
|
+
export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.C9ruFMrk.mjs';
|
|
3
3
|
import { ReadableStream } from 'node:stream/web';
|
|
4
4
|
|
|
5
5
|
/**
|
package/dist/index.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { H as HTMLToMarkdownOptions } from './shared/mdream.a2AvjJLp.js';
|
|
2
|
-
export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.a2AvjJLp.js';
|
|
1
|
+
import { H as HTMLToMarkdownOptions } from './shared/mdream.C9ruFMrk.js';
|
|
2
|
+
export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.C9ruFMrk.js';
|
|
3
3
|
import { ReadableStream } from 'node:stream/web';
|
|
4
4
|
|
|
5
5
|
/**
|
package/dist/index.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { p as processPartialHTMLToMarkdown } from './shared/mdream.N3Qlh-YP.mjs';
|
|
2
|
-
export { s as streamHtmlToMarkdown } from './shared/mdream.N3Qlh-YP.mjs';
|
|
1
|
+
import { p as processPartialHTMLToMarkdown } from './shared/mdream.CRBi8vE8.mjs';
|
|
2
|
+
export { s as streamHtmlToMarkdown } from './shared/mdream.CRBi8vE8.mjs';
|
|
3
3
|
import './shared/mdream.Ch6B8TEB.mjs';
|
|
4
4
|
|
|
5
5
|
function htmlToMarkdown(html, options = {}) {
|
package/dist/plugins.d.mts
CHANGED
package/dist/plugins.d.ts
CHANGED
package/dist/preset/minimal.d.ts
CHANGED
|
@@ -82,6 +82,8 @@ interface TextNode extends Node {
|
|
|
82
82
|
value: string;
|
|
83
83
|
/** Custom data added by plugins */
|
|
84
84
|
context?: Record<string, any>;
|
|
85
|
+
/** Whether this text node should be excluded from markdown output (for script/style elements) */
|
|
86
|
+
excludedFromMarkdown?: boolean;
|
|
85
87
|
}
|
|
86
88
|
/**
|
|
87
89
|
* Base DOM node interface
|
|
@@ -82,6 +82,8 @@ interface TextNode extends Node {
|
|
|
82
82
|
value: string;
|
|
83
83
|
/** Custom data added by plugins */
|
|
84
84
|
context?: Record<string, any>;
|
|
85
|
+
/** Whether this text node should be excluded from markdown output (for script/style elements) */
|
|
86
|
+
excludedFromMarkdown?: boolean;
|
|
85
87
|
}
|
|
86
88
|
/**
|
|
87
89
|
* Base DOM node interface
|
|
@@ -48,6 +48,9 @@ function processHtmlEventToMarkdown(event, state) {
|
|
|
48
48
|
textNode.value = pluginResult.content;
|
|
49
49
|
}
|
|
50
50
|
}
|
|
51
|
+
if (textNode.excludedFromMarkdown) {
|
|
52
|
+
return;
|
|
53
|
+
}
|
|
51
54
|
if (textNode.value === " " && lastChar === "\n") {
|
|
52
55
|
return;
|
|
53
56
|
}
|
|
@@ -1024,9 +1027,10 @@ function processTextBuffer(textBuffer, state, handleEvent) {
|
|
|
1024
1027
|
const containsWhitespace = state.textBufferContainsWhitespace;
|
|
1025
1028
|
state.textBufferContainsNonWhitespace = false;
|
|
1026
1029
|
state.textBufferContainsWhitespace = false;
|
|
1027
|
-
if (!state.currentNode
|
|
1030
|
+
if (!state.currentNode) {
|
|
1028
1031
|
return;
|
|
1029
1032
|
}
|
|
1033
|
+
const excludesTextNodes = state.currentNode?.tagHandler?.excludesTextNodes;
|
|
1030
1034
|
const inPreTag = state.depthMap[TAG_PRE] > 0;
|
|
1031
1035
|
if (!inPreTag && !containsNonWhitespace && !state.currentNode.childTextNodeIndex) {
|
|
1032
1036
|
return;
|
|
@@ -1057,7 +1061,8 @@ function processTextBuffer(textBuffer, state, handleEvent) {
|
|
|
1057
1061
|
regionId: state.currentNode?.regionId,
|
|
1058
1062
|
index: state.currentNode.currentWalkIndex++,
|
|
1059
1063
|
depth: state.depth,
|
|
1060
|
-
containsWhitespace
|
|
1064
|
+
containsWhitespace,
|
|
1065
|
+
excludedFromMarkdown: excludesTextNodes
|
|
1061
1066
|
};
|
|
1062
1067
|
for (const parent of parentsToIncrement) {
|
|
1063
1068
|
parent.childTextNodeIndex = (parent.childTextNodeIndex || 0) + 1;
|