mdream 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -44,7 +44,10 @@
44
44
  Traditional HTML to Markdown converters were not built for LLMs or humans. They tend to be slow and bloated and produce output that's poorly suited for LLM token usage or for
45
45
  human readability.
46
46
 
47
- Mdream is an ultra-performant HTML to Markdown converter built specifically for LLM Content Analysis & Human Readability. With zero dependencies, streaming built-in and opinionated output optimized for both human readability and AI processing.
47
+ Other LLM-specific converters focus on supporting _all_ document formats, resulting in larger bundles and lower-quality Markdown output.
48
+
49
+ Mdream is an ultra-performant HTML to Markdown converter built specifically for producing high-quality Markdown for LLMs as quickly as possible. It provides
50
+ a powerful plugin system to customize the conversion process, allowing you to extract, transform, and filter content as needed.
48
51
 
49
52
  Perfect for: RAG systems, web scraping, content extraction, ChatGPT/Claude integration, and large-scale document processing.
50
53
 
@@ -110,7 +113,7 @@ import { htmlToMarkdown } from 'mdream'
110
113
  // Simple conversion
111
114
  const markdown = htmlToMarkdown('<h1>Hello World</h1>')
112
115
  console.log(markdown) // # Hello World
113
- ````
116
+ ```
114
117
 
115
118
  **Convert from Fetch**
116
119
 
@@ -135,33 +138,69 @@ for await (const chunk of markdownGenerator) {
135
138
 
136
139
  ### Plugin System
137
140
 
138
- Mdream now features a powerful plugin system that allows you to customize and extend the HTML-to-Markdown conversion process.
141
+ The plugin system allows you to customize HTML to Markdown conversion by hooking into the processing pipeline. Plugins can filter content, extract data, transform nodes, or add custom behavior.
142
+
143
+ #### Plugin Hooks
144
+
145
+ - `beforeNodeProcess`: Called before any node processing, can skip nodes
146
+ - `onNodeEnter`: Called when entering an element node
147
+ - `onNodeExit`: Called when exiting an element node
148
+ - `processTextNode`: Called for each text node
149
+ - `processAttributes`: Called to process element attributes
150
+
151
+ #### Creating a Plugin
152
+
153
+ Use `createPlugin()` to create a plugin with type safety:
139
154
 
140
155
  ```ts
141
- import { createPlugin, filterUnsupportedTags, htmlToMarkdown, withTailwind } from 'mdream'
156
+ import type { ElementNode, TextNode } from 'mdream'
157
+ import { htmlToMarkdown } from 'mdream'
158
+ import { createPlugin } from 'mdream/plugins'
142
159
 
143
- // Create a custom plugin
144
160
  const myPlugin = createPlugin({
145
- name: 'my-plugin',
146
- transformContent: (content, node) => {
147
- if (node.type === 1 && node.name === 'div' && node.attributes?.role === 'alert') {
148
- return `⚠️ ${content} ⚠️`
161
+ onNodeEnter(node: ElementNode): string | undefined {
162
+ if (node.name === 'h1') {
163
+ return '🔥 '
164
+ }
165
+ },
166
+
167
+ processTextNode(textNode: TextNode): { content: string, skip: boolean } | undefined {
168
+ // Transform text content
169
+ if (textNode.parent?.attributes?.id === 'highlight') {
170
+ return {
171
+ content: `**${textNode.value}**`,
172
+ skip: false
173
+ }
149
174
  }
150
- return content
151
175
  }
152
176
  })
153
177
 
154
- // Use multiple plugins together
155
- const html = '<div role="alert" class="font-bold">Important message</div>'
156
- const markdown = htmlToMarkdown(html, {
157
- plugins: [
158
- withTailwind(), // Apply Tailwind class processing
159
- filterUnsupportedTags(), // Filter out unsupported tags
160
- myPlugin // Apply custom transformations
161
- ]
162
- })
178
+ // Use the plugin
179
+ const html: string = '<div id="highlight">Important text</div>'
180
+ const markdown: string = htmlToMarkdown(html, { plugins: [myPlugin] })
181
+ ```
163
182
 
164
- console.log(markdown) // "⚠️ **Important message** ⚠️"
183
+ #### Example: Content Filter Plugin
184
+
185
+ ```ts
186
+ import type { ElementNode, NodeEvent } from 'mdream'
187
+ import { ELEMENT_NODE } from 'mdream'
188
+ import { createPlugin } from 'mdream/plugins'
189
+
190
+ const adBlockPlugin = createPlugin({
191
+ beforeNodeProcess(event: NodeEvent): { skip: boolean } | undefined {
192
+ const { node } = event
193
+
194
+ if (node.type === ELEMENT_NODE && node.name === 'div') {
195
+ const element = node as ElementNode
196
+ // Skip ads and promotional content
197
+ if (element.attributes?.class?.includes('ad')
198
+ || element.attributes?.id?.includes('promo')) {
199
+ return { skip: true }
200
+ }
201
+ }
202
+ }
203
+ })
165
204
  ```
166
205
 
167
206
  #### Extraction Plugin
@@ -169,9 +208,10 @@ console.log(markdown) // "⚠️ **Important message** ⚠️"
169
208
  Extract specific elements and their content during HTML processing for data analysis or content discovery:
170
209
 
171
210
  ```ts
211
+ import type { ExtractedElement } from 'mdream/plugins'
172
212
  import { extractionPlugin, htmlToMarkdown } from 'mdream'
173
213
 
174
- const html = `
214
+ const html: string = `
175
215
  <article>
176
216
  <h2>Getting Started</h2>
177
217
  <p>This is a tutorial about web scraping.</p>
@@ -181,10 +221,10 @@ const html = `
181
221
 
182
222
  // Extract elements using CSS selectors
183
223
  const plugin = extractionPlugin({
184
- 'h2': (element) => {
224
+ 'h2': (element: ExtractedElement): void => {
185
225
  console.log('Heading:', element.textContent) // "Getting Started"
186
226
  },
187
- 'img[alt]': (element) => {
227
+ 'img[alt]': (element: ExtractedElement): void => {
188
228
  console.log('Image:', element.attributes.src, element.attributes.alt)
189
229
  // "Image: /hero.jpg Hero image"
190
230
  }
@@ -195,8 +235,6 @@ htmlToMarkdown(html, { plugins: [plugin] })
195
235
 
196
236
  The extraction plugin provides memory-efficient element extraction with full text content and attributes, perfect for SEO analysis, content discovery, and data mining.
197
237
 
198
- For more details, see the [plugin documentation](./docs/plugins.md).
199
-
200
238
  ## Credits
201
239
 
202
240
  - [ultrahtml](https://github.com/natemoo-re/ultrahtml): HTML parsing inspiration
package/dist/cli.mjs CHANGED
@@ -2,7 +2,7 @@ import { Readable } from 'node:stream';
2
2
  import { cac } from 'cac';
3
3
  import { f as frontmatterPlugin } from './shared/mdream.C6Z2rfeq.mjs';
4
4
  import { r as readabilityPlugin } from './shared/mdream.DMUbnRbh.mjs';
5
- import { s as streamHtmlToMarkdown } from './shared/mdream.N3Qlh-YP.mjs';
5
+ import { s as streamHtmlToMarkdown } from './shared/mdream.CRBi8vE8.mjs';
6
6
  import './shared/mdream.Ch6B8TEB.mjs';
7
7
 
8
8
  async function streamingConvert(options = {}) {
package/dist/index.d.mts CHANGED
@@ -1,5 +1,5 @@
1
- import { H as HTMLToMarkdownOptions } from './shared/mdream.a2AvjJLp.mjs';
2
- export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.a2AvjJLp.mjs';
1
+ import { H as HTMLToMarkdownOptions } from './shared/mdream.C9ruFMrk.mjs';
2
+ export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.C9ruFMrk.mjs';
3
3
  import { ReadableStream } from 'node:stream/web';
4
4
 
5
5
  /**
package/dist/index.d.ts CHANGED
@@ -1,5 +1,5 @@
1
- import { H as HTMLToMarkdownOptions } from './shared/mdream.a2AvjJLp.js';
2
- export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.a2AvjJLp.js';
1
+ import { H as HTMLToMarkdownOptions } from './shared/mdream.C9ruFMrk.js';
2
+ export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.C9ruFMrk.js';
3
3
  import { ReadableStream } from 'node:stream/web';
4
4
 
5
5
  /**
package/dist/index.mjs CHANGED
@@ -1,5 +1,5 @@
1
- import { p as processPartialHTMLToMarkdown } from './shared/mdream.N3Qlh-YP.mjs';
2
- export { s as streamHtmlToMarkdown } from './shared/mdream.N3Qlh-YP.mjs';
1
+ import { p as processPartialHTMLToMarkdown } from './shared/mdream.CRBi8vE8.mjs';
2
+ export { s as streamHtmlToMarkdown } from './shared/mdream.CRBi8vE8.mjs';
3
3
  import './shared/mdream.Ch6B8TEB.mjs';
4
4
 
5
5
  function htmlToMarkdown(html, options = {}) {
@@ -1,4 +1,4 @@
1
- import { P as Plugin, b as ElementNode } from './shared/mdream.a2AvjJLp.mjs';
1
+ import { P as Plugin, b as ElementNode } from './shared/mdream.C9ruFMrk.mjs';
2
2
 
3
3
  /**
4
4
  * Create a plugin that implements the Plugin interface with improved type inference
package/dist/plugins.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { P as Plugin, b as ElementNode } from './shared/mdream.a2AvjJLp.js';
1
+ import { P as Plugin, b as ElementNode } from './shared/mdream.C9ruFMrk.js';
2
2
 
3
3
  /**
4
4
  * Create a plugin that implements the Plugin interface with improved type inference
@@ -1,4 +1,4 @@
1
- import { H as HTMLToMarkdownOptions } from '../shared/mdream.a2AvjJLp.mjs';
1
+ import { H as HTMLToMarkdownOptions } from '../shared/mdream.C9ruFMrk.mjs';
2
2
 
3
3
  /**
4
4
  * Creates a configurable minimal preset with advanced options
@@ -1,4 +1,4 @@
1
- import { H as HTMLToMarkdownOptions } from '../shared/mdream.a2AvjJLp.js';
1
+ import { H as HTMLToMarkdownOptions } from '../shared/mdream.C9ruFMrk.js';
2
2
 
3
3
  /**
4
4
  * Creates a configurable minimal preset with advanced options
@@ -82,6 +82,8 @@ interface TextNode extends Node {
82
82
  value: string;
83
83
  /** Custom data added by plugins */
84
84
  context?: Record<string, any>;
85
+ /** Whether this text node should be excluded from markdown output (for script/style elements) */
86
+ excludedFromMarkdown?: boolean;
85
87
  }
86
88
  /**
87
89
  * Base DOM node interface
@@ -82,6 +82,8 @@ interface TextNode extends Node {
82
82
  value: string;
83
83
  /** Custom data added by plugins */
84
84
  context?: Record<string, any>;
85
+ /** Whether this text node should be excluded from markdown output (for script/style elements) */
86
+ excludedFromMarkdown?: boolean;
85
87
  }
86
88
  /**
87
89
  * Base DOM node interface
@@ -48,6 +48,9 @@ function processHtmlEventToMarkdown(event, state) {
48
48
  textNode.value = pluginResult.content;
49
49
  }
50
50
  }
51
+ if (textNode.excludedFromMarkdown) {
52
+ return;
53
+ }
51
54
  if (textNode.value === " " && lastChar === "\n") {
52
55
  return;
53
56
  }
@@ -1024,9 +1027,10 @@ function processTextBuffer(textBuffer, state, handleEvent) {
1024
1027
  const containsWhitespace = state.textBufferContainsWhitespace;
1025
1028
  state.textBufferContainsNonWhitespace = false;
1026
1029
  state.textBufferContainsWhitespace = false;
1027
- if (!state.currentNode || state.currentNode?.tagHandler?.excludesTextNodes) {
1030
+ if (!state.currentNode) {
1028
1031
  return;
1029
1032
  }
1033
+ const excludesTextNodes = state.currentNode?.tagHandler?.excludesTextNodes;
1030
1034
  const inPreTag = state.depthMap[TAG_PRE] > 0;
1031
1035
  if (!inPreTag && !containsNonWhitespace && !state.currentNode.childTextNodeIndex) {
1032
1036
  return;
@@ -1057,7 +1061,8 @@ function processTextBuffer(textBuffer, state, handleEvent) {
1057
1061
  regionId: state.currentNode?.regionId,
1058
1062
  index: state.currentNode.currentWalkIndex++,
1059
1063
  depth: state.depth,
1060
- containsWhitespace
1064
+ containsWhitespace,
1065
+ excludedFromMarkdown: excludesTextNodes
1061
1066
  };
1062
1067
  for (const parent of parentsToIncrement) {
1063
1068
  parent.childTextNodeIndex = (parent.childTextNodeIndex || 0) + 1;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "mdream",
3
3
  "type": "module",
4
- "version": "0.2.2",
4
+ "version": "0.2.3",
5
5
  "description": "Ultra-performant JavaScript HTML to Markdown converter optimized for LLMs.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",