mdream 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -44,9 +44,10 @@
44
44
  Traditional HTML to Markdown converters were not built for LLMs or humans. They tend to be slow and bloated and produce output that's poorly suited for LLM token usage or for
45
45
  human readability.
46
46
 
47
- Mdream is an ultra-performant HTML to Markdown converter built specifically for LLM Content Analysis & Human Readability. With zero dependencies, streaming built-in and opinionated output optimized for both human readability and AI processing.
47
+ Other LLM-specific converters focus on supporting _all_ document formats, resulting in larger bundles and lower-quality Markdown output.
48
48
 
49
- Perfect for: RAG systems, web scraping, content extraction, ChatGPT/Claude integration, and large-scale document processing.
49
+ Mdream is an ultra-performant HTML to Markdown converter built specifically for producing high-quality Markdown for LLMs as quickly as possible. It provides
50
+ a powerful plugin system to customize the conversion process, allowing you to extract, transform, and filter content as needed.
50
51
 
51
52
  ## CLI Usage
52
53
 
@@ -110,7 +111,7 @@ import { htmlToMarkdown } from 'mdream'
110
111
  // Simple conversion
111
112
  const markdown = htmlToMarkdown('<h1>Hello World</h1>')
112
113
  console.log(markdown) // # Hello World
113
- ````
114
+ ```
114
115
 
115
116
  **Convert from Fetch**
116
117
 
@@ -135,33 +136,69 @@ for await (const chunk of markdownGenerator) {
135
136
 
136
137
  ### Plugin System
137
138
 
138
- Mdream now features a powerful plugin system that allows you to customize and extend the HTML-to-Markdown conversion process.
139
+ The plugin system allows you to customize HTML to Markdown conversion by hooking into the processing pipeline. Plugins can filter content, extract data, transform nodes, or add custom behavior.
140
+
141
+ #### Plugin Hooks
142
+
143
+ - `beforeNodeProcess`: Called before any node is processed; can skip nodes
144
+ - `onNodeEnter`: Called when entering an element node
145
+ - `onNodeExit`: Called when exiting an element node
146
+ - `processTextNode`: Called for each text node
147
+ - `processAttributes`: Called to process element attributes
148
+
149
+ #### Creating a Plugin
150
+
151
+ Use `createPlugin()` to create a plugin with type safety:
139
152
 
140
153
  ```ts
141
- import { createPlugin, filterUnsupportedTags, htmlToMarkdown, withTailwind } from 'mdream'
154
+ import type { ElementNode, TextNode } from 'mdream'
155
+ import { htmlToMarkdown } from 'mdream'
156
+ import { createPlugin } from 'mdream/plugins'
142
157
 
143
- // Create a custom plugin
144
158
  const myPlugin = createPlugin({
145
- name: 'my-plugin',
146
- transformContent: (content, node) => {
147
- if (node.type === 1 && node.name === 'div' && node.attributes?.role === 'alert') {
148
- return `⚠️ ${content} ⚠️`
159
+ onNodeEnter(node: ElementNode): string | undefined {
160
+ if (node.name === 'h1') {
161
+ return '🔥 '
162
+ }
163
+ },
164
+
165
+ processTextNode(textNode: TextNode): { content: string, skip: boolean } | undefined {
166
+ // Transform text content
167
+ if (textNode.parent?.attributes?.id === 'highlight') {
168
+ return {
169
+ content: `**${textNode.value}**`,
170
+ skip: false
171
+ }
149
172
  }
150
- return content
151
173
  }
152
174
  })
153
175
 
154
- // Use multiple plugins together
155
- const html = '<div role="alert" class="font-bold">Important message</div>'
156
- const markdown = htmlToMarkdown(html, {
157
- plugins: [
158
- withTailwind(), // Apply Tailwind class processing
159
- filterUnsupportedTags(), // Filter out unsupported tags
160
- myPlugin // Apply custom transformations
161
- ]
162
- })
176
+ // Use the plugin
177
+ const html: string = '<div id="highlight">Important text</div>'
178
+ const markdown: string = htmlToMarkdown(html, { plugins: [myPlugin] })
179
+ ```
163
180
 
164
- console.log(markdown) // "⚠️ **Important message** ⚠️"
181
+ #### Example: Content Filter Plugin
182
+
183
+ ```ts
184
+ import type { ElementNode, NodeEvent } from 'mdream'
185
+ import { ELEMENT_NODE } from 'mdream'
186
+ import { createPlugin } from 'mdream/plugins'
187
+
188
+ const adBlockPlugin = createPlugin({
189
+ beforeNodeProcess(event: NodeEvent): { skip: boolean } | undefined {
190
+ const { node } = event
191
+
192
+ if (node.type === ELEMENT_NODE && node.name === 'div') {
193
+ const element = node as ElementNode
194
+ // Skip ads and promotional content
195
+ if (element.attributes?.class?.includes('ad')
196
+ || element.attributes?.id?.includes('promo')) {
197
+ return { skip: true }
198
+ }
199
+ }
200
+ }
201
+ })
165
202
  ```
166
203
 
167
204
  #### Extraction Plugin
@@ -169,9 +206,10 @@ console.log(markdown) // "⚠️ **Important message** ⚠️"
169
206
  Extract specific elements and their content during HTML processing for data analysis or content discovery:
170
207
 
171
208
  ```ts
209
+ import type { ExtractedElement } from 'mdream/plugins'
172
210
  import { extractionPlugin, htmlToMarkdown } from 'mdream'
173
211
 
174
- const html = `
212
+ const html: string = `
175
213
  <article>
176
214
  <h2>Getting Started</h2>
177
215
  <p>This is a tutorial about web scraping.</p>
@@ -181,10 +219,10 @@ const html = `
181
219
 
182
220
  // Extract elements using CSS selectors
183
221
  const plugin = extractionPlugin({
184
- 'h2': (element) => {
222
+ 'h2': (element: ExtractedElement): void => {
185
223
  console.log('Heading:', element.textContent) // "Getting Started"
186
224
  },
187
- 'img[alt]': (element) => {
225
+ 'img[alt]': (element: ExtractedElement): void => {
188
226
  console.log('Image:', element.attributes.src, element.attributes.alt)
189
227
  // "Image: /hero.jpg Hero image"
190
228
  }
@@ -195,8 +233,6 @@ htmlToMarkdown(html, { plugins: [plugin] })
195
233
 
196
234
  The extraction plugin provides memory-efficient element extraction with full text content and attributes, perfect for SEO analysis, content discovery, and data mining.
197
235
 
198
- For more details, see the [plugin documentation](./docs/plugins.md).
199
-
200
236
  ## Credits
201
237
 
202
238
  - [ultrahtml](https://github.com/natemoo-re/ultrahtml): HTML parsing inspiration
package/dist/cli.mjs CHANGED
@@ -2,7 +2,7 @@ import { Readable } from 'node:stream';
2
2
  import { cac } from 'cac';
3
3
  import { f as frontmatterPlugin } from './shared/mdream.C6Z2rfeq.mjs';
4
4
  import { r as readabilityPlugin } from './shared/mdream.DMUbnRbh.mjs';
5
- import { s as streamHtmlToMarkdown } from './shared/mdream.N3Qlh-YP.mjs';
5
+ import { s as streamHtmlToMarkdown } from './shared/mdream.CsDVbUMp.mjs';
6
6
  import './shared/mdream.Ch6B8TEB.mjs';
7
7
 
8
8
  async function streamingConvert(options = {}) {
package/dist/index.d.mts CHANGED
@@ -1,5 +1,5 @@
1
- import { H as HTMLToMarkdownOptions } from './shared/mdream.a2AvjJLp.mjs';
2
- export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.a2AvjJLp.mjs';
1
+ import { H as HTMLToMarkdownOptions } from './shared/mdream.-SGj02be.mjs';
2
+ export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.-SGj02be.mjs';
3
3
  import { ReadableStream } from 'node:stream/web';
4
4
 
5
5
  /**
package/dist/index.d.ts CHANGED
@@ -1,5 +1,5 @@
1
- import { H as HTMLToMarkdownOptions } from './shared/mdream.a2AvjJLp.js';
2
- export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.a2AvjJLp.js';
1
+ import { H as HTMLToMarkdownOptions } from './shared/mdream.-SGj02be.js';
2
+ export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.-SGj02be.js';
3
3
  import { ReadableStream } from 'node:stream/web';
4
4
 
5
5
  /**
package/dist/index.mjs CHANGED
@@ -1,5 +1,5 @@
1
- import { p as processPartialHTMLToMarkdown } from './shared/mdream.N3Qlh-YP.mjs';
2
- export { s as streamHtmlToMarkdown } from './shared/mdream.N3Qlh-YP.mjs';
1
+ import { p as processPartialHTMLToMarkdown } from './shared/mdream.CsDVbUMp.mjs';
2
+ export { s as streamHtmlToMarkdown } from './shared/mdream.CsDVbUMp.mjs';
3
3
  import './shared/mdream.Ch6B8TEB.mjs';
4
4
 
5
5
  function htmlToMarkdown(html, options = {}) {
@@ -1,4 +1,4 @@
1
- import { P as Plugin, b as ElementNode } from './shared/mdream.a2AvjJLp.mjs';
1
+ import { P as Plugin, b as ElementNode } from './shared/mdream.-SGj02be.mjs';
2
2
 
3
3
  /**
4
4
  * Create a plugin that implements the Plugin interface with improved type inference
package/dist/plugins.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { P as Plugin, b as ElementNode } from './shared/mdream.a2AvjJLp.js';
1
+ import { P as Plugin, b as ElementNode } from './shared/mdream.-SGj02be.js';
2
2
 
3
3
  /**
4
4
  * Create a plugin that implements the Plugin interface with improved type inference
@@ -1,4 +1,4 @@
1
- import { H as HTMLToMarkdownOptions } from '../shared/mdream.a2AvjJLp.mjs';
1
+ import { H as HTMLToMarkdownOptions } from '../shared/mdream.-SGj02be.mjs';
2
2
 
3
3
  /**
4
4
  * Creates a configurable minimal preset with advanced options
@@ -1,4 +1,4 @@
1
- import { H as HTMLToMarkdownOptions } from '../shared/mdream.a2AvjJLp.js';
1
+ import { H as HTMLToMarkdownOptions } from '../shared/mdream.-SGj02be.js';
2
2
 
3
3
  /**
4
4
  * Creates a configurable minimal preset with advanced options
@@ -82,6 +82,8 @@ interface TextNode extends Node {
82
82
  value: string;
83
83
  /** Custom data added by plugins */
84
84
  context?: Record<string, any>;
85
+ /** Whether this text node should be excluded from markdown output (for script/style elements) */
86
+ excludedFromMarkdown?: boolean;
85
87
  }
86
88
  /**
87
89
  * Base DOM node interface
@@ -145,6 +147,12 @@ interface MdreamProcessingState {
145
147
  isFirstTextInElement?: boolean;
146
148
  /** Reference to the last processed text node - for context tracking */
147
149
  lastTextNode?: Node;
150
+ /** Quote state tracking for non-nesting tags - avoids backward scanning */
151
+ inSingleQuote?: boolean;
152
+ inDoubleQuote?: boolean;
153
+ inBacktick?: boolean;
154
+ /** Backslash escaping state tracking - avoids checking previous character */
155
+ lastCharWasBackslash?: boolean;
148
156
  /** Plugin instances array for efficient iteration */
149
157
  plugins?: Plugin[];
150
158
  /** Configuration options for conversion */
@@ -82,6 +82,8 @@ interface TextNode extends Node {
82
82
  value: string;
83
83
  /** Custom data added by plugins */
84
84
  context?: Record<string, any>;
85
+ /** Whether this text node should be excluded from markdown output (for script/style elements) */
86
+ excludedFromMarkdown?: boolean;
85
87
  }
86
88
  /**
87
89
  * Base DOM node interface
@@ -145,6 +147,12 @@ interface MdreamProcessingState {
145
147
  isFirstTextInElement?: boolean;
146
148
  /** Reference to the last processed text node - for context tracking */
147
149
  lastTextNode?: Node;
150
+ /** Quote state tracking for non-nesting tags - avoids backward scanning */
151
+ inSingleQuote?: boolean;
152
+ inDoubleQuote?: boolean;
153
+ inBacktick?: boolean;
154
+ /** Backslash escaping state tracking - avoids checking previous character */
155
+ lastCharWasBackslash?: boolean;
148
156
  /** Plugin instances array for efficient iteration */
149
157
  plugins?: Plugin[];
150
158
  /** Configuration options for conversion */
@@ -48,6 +48,9 @@ function processHtmlEventToMarkdown(event, state) {
48
48
  textNode.value = pluginResult.content;
49
49
  }
50
50
  }
51
+ if (textNode.excludedFromMarkdown) {
52
+ return;
53
+ }
51
54
  if (textNode.value === " " && lastChar === "\n") {
52
55
  return;
53
56
  }
@@ -869,6 +872,10 @@ const SPACE_CHAR = 32;
869
872
  const TAB_CHAR = 9;
870
873
  const NEWLINE_CHAR = 10;
871
874
  const CARRIAGE_RETURN_CHAR = 13;
875
+ const BACKTICK_CHAR = 96;
876
+ const PIPE_CHAR = 124;
877
+ const OPEN_BRACKET_CHAR = 91;
878
+ const CLOSE_BRACKET_CHAR = 93;
872
879
  const EMPTY_ATTRIBUTES = Object.freeze({});
873
880
  function copyDepthMap(depthMap) {
874
881
  return new Uint8Array(depthMap);
@@ -892,6 +899,7 @@ function parseHTML(htmlChunk, state, handleEvent) {
892
899
  state.lastCharWasWhitespace ??= true;
893
900
  state.justClosedTag ??= false;
894
901
  state.isFirstTextInElement ??= false;
902
+ state.lastCharWasBackslash ??= false;
895
903
  let i = 0;
896
904
  const chunkLength = htmlChunk.length;
897
905
  while (i < chunkLength) {
@@ -919,23 +927,36 @@ function parseHTML(htmlChunk, state, handleEvent) {
919
927
  }
920
928
  state.lastCharWasWhitespace = true;
921
929
  state.textBufferContainsWhitespace = true;
930
+ state.lastCharWasBackslash = false;
922
931
  } else {
923
932
  state.textBufferContainsNonWhitespace = true;
924
933
  state.lastCharWasWhitespace = false;
925
934
  state.justClosedTag = false;
926
- if (currentCharCode === 124 && state.depthMap[TAG_TABLE]) {
935
+ if (currentCharCode === PIPE_CHAR && state.depthMap[TAG_TABLE]) {
927
936
  textBuffer += "\\|";
928
- } else if (currentCharCode === 96 && (state.depthMap[TAG_CODE] || state.depthMap[TAG_PRE])) {
937
+ } else if (currentCharCode === BACKTICK_CHAR && (state.depthMap[TAG_CODE] || state.depthMap[TAG_PRE])) {
929
938
  textBuffer += "\\`";
930
- } else if (currentCharCode === 91 && state.depthMap[TAG_A]) {
939
+ } else if (currentCharCode === OPEN_BRACKET_CHAR && state.depthMap[TAG_A]) {
931
940
  textBuffer += "\\[";
932
- } else if (currentCharCode === 93 && state.depthMap[TAG_A]) {
941
+ } else if (currentCharCode === CLOSE_BRACKET_CHAR && state.depthMap[TAG_A]) {
933
942
  textBuffer += "\\]";
934
- } else if (currentCharCode === 62 && state.depthMap[TAG_BLOCKQUOTE]) {
943
+ } else if (currentCharCode === GT_CHAR && state.depthMap[TAG_BLOCKQUOTE]) {
935
944
  textBuffer += "\\>";
936
945
  } else {
937
946
  textBuffer += htmlChunk[i];
938
947
  }
948
+ if (state.currentNode?.tagHandler?.isNonNesting) {
949
+ if (!state.lastCharWasBackslash) {
950
+ if (currentCharCode === APOS_CHAR && !state.inDoubleQuote && !state.inBacktick) {
951
+ state.inSingleQuote = !state.inSingleQuote;
952
+ } else if (currentCharCode === QUOTE_CHAR && !state.inSingleQuote && !state.inBacktick) {
953
+ state.inDoubleQuote = !state.inDoubleQuote;
954
+ } else if (currentCharCode === BACKTICK_CHAR && !state.inSingleQuote && !state.inDoubleQuote) {
955
+ state.inBacktick = !state.inBacktick;
956
+ }
957
+ }
958
+ }
959
+ state.lastCharWasBackslash = currentCharCode === BACKSLASH_CHAR;
939
960
  }
940
961
  i++;
941
962
  continue;
@@ -958,6 +979,12 @@ function parseHTML(htmlChunk, state, handleEvent) {
958
979
  break;
959
980
  }
960
981
  } else if (nextCharCode === SLASH_CHAR) {
982
+ const inQuotes = state.inSingleQuote || state.inDoubleQuote || state.inBacktick;
983
+ if (state.currentNode?.tagHandler?.isNonNesting && inQuotes) {
984
+ textBuffer += htmlChunk[i];
985
+ i++;
986
+ continue;
987
+ }
961
988
  if (textBuffer.length > 0) {
962
989
  processTextBuffer(textBuffer, state, handleEvent);
963
990
  textBuffer = "";
@@ -1024,9 +1051,10 @@ function processTextBuffer(textBuffer, state, handleEvent) {
1024
1051
  const containsWhitespace = state.textBufferContainsWhitespace;
1025
1052
  state.textBufferContainsNonWhitespace = false;
1026
1053
  state.textBufferContainsWhitespace = false;
1027
- if (!state.currentNode || state.currentNode?.tagHandler?.excludesTextNodes) {
1054
+ if (!state.currentNode) {
1028
1055
  return;
1029
1056
  }
1057
+ const excludesTextNodes = state.currentNode?.tagHandler?.excludesTextNodes;
1030
1058
  const inPreTag = state.depthMap[TAG_PRE] > 0;
1031
1059
  if (!inPreTag && !containsNonWhitespace && !state.currentNode.childTextNodeIndex) {
1032
1060
  return;
@@ -1039,7 +1067,7 @@ function processTextBuffer(textBuffer, state, handleEvent) {
1039
1067
  const firstBlockParent = parentsToIncrement[parentsToIncrement.length - 1];
1040
1068
  if (containsWhitespace && !firstBlockParent?.childTextNodeIndex) {
1041
1069
  let start = 0;
1042
- while (start < text.length && (inPreTag ? text.charCodeAt(start) === 10 || text.charCodeAt(start) === 13 : isWhitespace(text.charCodeAt(start)))) {
1070
+ while (start < text.length && (inPreTag ? text.charCodeAt(start) === NEWLINE_CHAR || text.charCodeAt(start) === CARRIAGE_RETURN_CHAR : isWhitespace(text.charCodeAt(start)))) {
1043
1071
  start++;
1044
1072
  }
1045
1073
  if (start > 0) {
@@ -1057,7 +1085,8 @@ function processTextBuffer(textBuffer, state, handleEvent) {
1057
1085
  regionId: state.currentNode?.regionId,
1058
1086
  index: state.currentNode.currentWalkIndex++,
1059
1087
  depth: state.depth,
1060
- containsWhitespace
1088
+ containsWhitespace,
1089
+ excludedFromMarkdown: excludesTextNodes
1061
1090
  };
1062
1091
  for (const parent of parentsToIncrement) {
1063
1092
  parent.childTextNodeIndex = (parent.childTextNodeIndex || 0) + 1;
@@ -1104,7 +1133,7 @@ function processClosingTag(htmlChunk, position, state, handleEvent) {
1104
1133
  }
1105
1134
  }
1106
1135
  if (curr) {
1107
- closeNode(state.currentNode, state, handleEvent);
1136
+ closeNode(curr, state, handleEvent);
1108
1137
  }
1109
1138
  state.justClosedTag = true;
1110
1139
  return {
@@ -1138,6 +1167,12 @@ function closeNode(node, state, handleEvent) {
1138
1167
  if (node.tagId) {
1139
1168
  state.depthMap[node.tagId] = Math.max(0, state.depthMap[node.tagId] - 1);
1140
1169
  }
1170
+ if (node.tagHandler?.isNonNesting) {
1171
+ state.inSingleQuote = false;
1172
+ state.inDoubleQuote = false;
1173
+ state.inBacktick = false;
1174
+ state.lastCharWasBackslash = false;
1175
+ }
1141
1176
  state.depth--;
1142
1177
  handleEvent({ type: NodeEventExit, node });
1143
1178
  state.currentNode = state.currentNode.parent;
@@ -1228,6 +1263,12 @@ function processOpeningTag(tagName, tagId, htmlChunk, i, state, handleEvent) {
1228
1263
  parentNode.currentWalkIndex = 0;
1229
1264
  state.currentNode = parentNode;
1230
1265
  state.hasEncodedHtmlEntity = false;
1266
+ if (tagHandler?.isNonNesting && !result.selfClosing) {
1267
+ state.inSingleQuote = false;
1268
+ state.inDoubleQuote = false;
1269
+ state.inBacktick = false;
1270
+ state.lastCharWasBackslash = false;
1271
+ }
1231
1272
  if (result.selfClosing) {
1232
1273
  closeNode(tag, state, handleEvent);
1233
1274
  state.justClosedTag = true;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "mdream",
3
3
  "type": "module",
4
- "version": "0.2.2",
4
+ "version": "0.2.4",
5
5
  "description": "Ultra-performant JavaScript HTML to Markdown converter optimized for LLMs.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",