npm - mdream - Versions diffs - 0.2.2 → 0.2.4 - Mend

mdream 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/README.md +62 -26
package/dist/cli.mjs +1 -1
package/dist/index.d.mts +2 -2
package/dist/index.d.ts +2 -2
package/dist/index.mjs +2 -2
package/dist/plugins.d.mts +1 -1
package/dist/plugins.d.ts +1 -1
package/dist/preset/minimal.d.mts +1 -1
package/dist/preset/minimal.d.ts +1 -1
package/dist/shared/{mdream.a2AvjJLp.d.mts → mdream.-SGj02be.d.mts} +8 -0
package/dist/shared/{mdream.a2AvjJLp.d.ts → mdream.-SGj02be.d.ts} +8 -0
package/dist/shared/{mdream.N3Qlh-YP.mjs → mdream.CsDVbUMp.mjs} +50 -9
package/package.json +1 -1

package/README.md CHANGED Viewed

@@ -44,9 +44,10 @@
 Traditional HTML to Markdown converters were not built for LLMs or humans. They tend to be slow and bloated and produce output that's poorly suited for LLM token usage or for
 human readability.
-Mdream is an ultra-performant HTML to Markdown converter built specifically for LLM Content Analysis & Human Readability. With zero dependencies, streaming built-in and opinionated output optimized for both human readability and AI processing.
+Other LLM-specific converters focus on supporting _all_ document formats, resulting in larger bundles and lower-quality Markdown output.
-Perfect for: RAG systems, web scraping, content extraction, ChatGPT/Claude integration, and large-scale document processing.
+Mdream is an ultra-performant HTML to Markdown converter built specifically for producing high-quality Markdown for LLMs as quickly as possible. It provides
+a powerful plugin system to customize the conversion process, allowing you to extract, transform, and filter content as needed.
 ## CLI Usage
@@ -110,7 +111,7 @@ import { htmlToMarkdown } from 'mdream'
 // Simple conversion
 const markdown = htmlToMarkdown('<h1>Hello World</h1>')
 console.log(markdown) // # Hello World
-````
+```
 **Convert from Fetch**
@@ -135,33 +136,69 @@ for await (const chunk of markdownGenerator) {
 ### Plugin System
-Mdream now features a powerful plugin system that allows you to customize and extend the HTML-to-Markdown conversion process.
+The plugin system allows you to customize HTML to Markdown conversion by hooking into the processing pipeline. Plugins can filter content, extract data, transform nodes, or add custom behavior.
+#### Plugin Hooks
+- `beforeNodeProcess`: Called before any node processing, can skip nodes
+- `onNodeEnter`: Called when entering an element node
+- `onNodeExit`: Called when exiting an element node
+- `processTextNode`: Called for each text node
+- `processAttributes`: Called to process element attributes
+#### Creating a Plugin
+Use `createPlugin()` to create a plugin with type safety:
 ```ts
-import { createPlugin, filterUnsupportedTags, htmlToMarkdown, withTailwind } from 'mdream'
+import type { ElementNode, TextNode } from 'mdream'
+import { htmlToMarkdown } from 'mdream'
+import { createPlugin } from 'mdream/plugins'
-// Create a custom plugin
 const myPlugin = createPlugin({
-  name: 'my-plugin',
-  transformContent: (content, node) => {
-    if (node.type === 1 && node.name === 'div' && node.attributes?.role === 'alert') {
-      return `⚠️ ${content} ⚠️`
+  onNodeEnter(node: ElementNode): string | undefined {
+    if (node.name === 'h1') {
+      return '🔥 '
+    }
+  },
+  processTextNode(textNode: TextNode): { content: string, skip: boolean } | undefined {
+    // Transform text content
+    if (textNode.parent?.attributes?.id === 'highlight') {
+      return {
+        content: `**${textNode.value}**`,
+        skip: false
+      }
     }
-    return content
   }
 })
-// Use multiple plugins together
-const html = '<div role="alert" class="font-bold">Important message</div>'
-const markdown = htmlToMarkdown(html, {
-  plugins: [
-    withTailwind(), // Apply Tailwind class processing
-    filterUnsupportedTags(), // Filter out unsupported tags
-    myPlugin // Apply custom transformations
-  ]
-})
+// Use the plugin
+const html: string = '<div id="highlight">Important text</div>'
+const markdown: string = htmlToMarkdown(html, { plugins: [myPlugin] })
+```
-console.log(markdown) // "⚠️ **Important message** ⚠️"
+#### Example: Content Filter Plugin
+```ts
+import type { ElementNode, NodeEvent } from 'mdream'
+import { ELEMENT_NODE } from 'mdream'
+import { createPlugin } from 'mdream/plugins'
+const adBlockPlugin = createPlugin({
+  beforeNodeProcess(event: NodeEvent): { skip: boolean } | undefined {
+    const { node } = event
+    if (node.type === ELEMENT_NODE && node.name === 'div') {
+      const element = node as ElementNode
+      // Skip ads and promotional content
+      if (element.attributes?.class?.includes('ad')
+        || element.attributes?.id?.includes('promo')) {
+        return { skip: true }
+      }
+    }
+  }
+})
 ```
 #### Extraction Plugin
@@ -169,9 +206,10 @@ console.log(markdown) // "⚠️ **Important message** ⚠️"
 Extract specific elements and their content during HTML processing for data analysis or content discovery:
 ```ts
+import type { ExtractedElement } from 'mdream/plugins'
 import { extractionPlugin, htmlToMarkdown } from 'mdream'
-const html = `
+const html: string = `
   <article>
     <h2>Getting Started</h2>
     <p>This is a tutorial about web scraping.</p>
@@ -181,10 +219,10 @@ const html = `
 // Extract elements using CSS selectors
 const plugin = extractionPlugin({
-  'h2': (element) => {
+  'h2': (element: ExtractedElement): void => {
     console.log('Heading:', element.textContent) // "Getting Started"
   },
-  'img[alt]': (element) => {
+  'img[alt]': (element: ExtractedElement): void => {
     console.log('Image:', element.attributes.src, element.attributes.alt)
     // "Image: /hero.jpg Hero image"
   }
@@ -195,8 +233,6 @@ htmlToMarkdown(html, { plugins: [plugin] })
 The extraction plugin provides memory-efficient element extraction with full text content and attributes, perfect for SEO analysis, content discovery, and data mining.
-For more details, see the [plugin documentation](./docs/plugins.md).
 ## Credits
 - [ultrahtml](https://github.com/natemoo-re/ultrahtml): HTML parsing inspiration

package/dist/cli.mjs CHANGED Viewed

@@ -2,7 +2,7 @@ import { Readable } from 'node:stream';
 import { cac } from 'cac';
 import { f as frontmatterPlugin } from './shared/mdream.C6Z2rfeq.mjs';
 import { r as readabilityPlugin } from './shared/mdream.DMUbnRbh.mjs';
-import { s as streamHtmlToMarkdown } from './shared/mdream.N3Qlh-YP.mjs';
+import { s as streamHtmlToMarkdown } from './shared/mdream.CsDVbUMp.mjs';
 import './shared/mdream.Ch6B8TEB.mjs';
 async function streamingConvert(options = {}) {

package/dist/index.d.mts CHANGED Viewed

@@ -1,5 +1,5 @@
-import { H as HTMLToMarkdownOptions } from './shared/mdream.a2AvjJLp.mjs';
-export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.a2AvjJLp.mjs';
+import { H as HTMLToMarkdownOptions } from './shared/mdream.-SGj02be.mjs';
+export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.-SGj02be.mjs';
 import { ReadableStream } from 'node:stream/web';
 /**

package/dist/index.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
-import { H as HTMLToMarkdownOptions } from './shared/mdream.a2AvjJLp.js';
-export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.a2AvjJLp.js';
+import { H as HTMLToMarkdownOptions } from './shared/mdream.-SGj02be.js';
+export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.-SGj02be.js';
 import { ReadableStream } from 'node:stream/web';
 /**

package/dist/index.mjs CHANGED Viewed

@@ -1,5 +1,5 @@
-import { p as processPartialHTMLToMarkdown } from './shared/mdream.N3Qlh-YP.mjs';
-export { s as streamHtmlToMarkdown } from './shared/mdream.N3Qlh-YP.mjs';
+import { p as processPartialHTMLToMarkdown } from './shared/mdream.CsDVbUMp.mjs';
+export { s as streamHtmlToMarkdown } from './shared/mdream.CsDVbUMp.mjs';
 import './shared/mdream.Ch6B8TEB.mjs';
 function htmlToMarkdown(html, options = {}) {

package/dist/plugins.d.mts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { P as Plugin, b as ElementNode } from './shared/mdream.a2AvjJLp.mjs';
+import { P as Plugin, b as ElementNode } from './shared/mdream.-SGj02be.mjs';
 /**
  * Create a plugin that implements the Plugin interface with improved type inference

package/dist/plugins.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { P as Plugin, b as ElementNode } from './shared/mdream.a2AvjJLp.js';
+import { P as Plugin, b as ElementNode } from './shared/mdream.-SGj02be.js';
 /**
  * Create a plugin that implements the Plugin interface with improved type inference

package/dist/preset/minimal.d.mts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { H as HTMLToMarkdownOptions } from '../shared/mdream.a2AvjJLp.mjs';
+import { H as HTMLToMarkdownOptions } from '../shared/mdream.-SGj02be.mjs';
 /**
  * Creates a configurable minimal preset with advanced options

package/dist/preset/minimal.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { H as HTMLToMarkdownOptions } from '../shared/mdream.a2AvjJLp.js';
+import { H as HTMLToMarkdownOptions } from '../shared/mdream.-SGj02be.js';
 /**
  * Creates a configurable minimal preset with advanced options

package/dist/shared/{mdream.a2AvjJLp.d.mts → mdream.-SGj02be.d.mts} RENAMED Viewed

@@ -82,6 +82,8 @@ interface TextNode extends Node {
     value: string;
     /** Custom data added by plugins */
     context?: Record<string, any>;
+    /** Whether this text node should be excluded from markdown output (for script/style elements) */
+    excludedFromMarkdown?: boolean;
 }
 /**
  * Base DOM node interface
@@ -145,6 +147,12 @@ interface MdreamProcessingState {
     isFirstTextInElement?: boolean;
     /** Reference to the last processed text node - for context tracking */
     lastTextNode?: Node;
+    /** Quote state tracking for non-nesting tags - avoids backward scanning */
+    inSingleQuote?: boolean;
+    inDoubleQuote?: boolean;
+    inBacktick?: boolean;
+    /** Backslash escaping state tracking - avoids checking previous character */
+    lastCharWasBackslash?: boolean;
     /** Plugin instances array for efficient iteration */
     plugins?: Plugin[];
     /** Configuration options for conversion */

package/dist/shared/{mdream.a2AvjJLp.d.ts → mdream.-SGj02be.d.ts} RENAMED Viewed

@@ -82,6 +82,8 @@ interface TextNode extends Node {
     value: string;
     /** Custom data added by plugins */
     context?: Record<string, any>;
+    /** Whether this text node should be excluded from markdown output (for script/style elements) */
+    excludedFromMarkdown?: boolean;
 }
 /**
  * Base DOM node interface
@@ -145,6 +147,12 @@ interface MdreamProcessingState {
     isFirstTextInElement?: boolean;
     /** Reference to the last processed text node - for context tracking */
     lastTextNode?: Node;
+    /** Quote state tracking for non-nesting tags - avoids backward scanning */
+    inSingleQuote?: boolean;
+    inDoubleQuote?: boolean;
+    inBacktick?: boolean;
+    /** Backslash escaping state tracking - avoids checking previous character */
+    lastCharWasBackslash?: boolean;
     /** Plugin instances array for efficient iteration */
     plugins?: Plugin[];
     /** Configuration options for conversion */

package/dist/shared/{mdream.N3Qlh-YP.mjs → mdream.CsDVbUMp.mjs} RENAMED Viewed

@@ -48,6 +48,9 @@ function processHtmlEventToMarkdown(event, state) {
           textNode.value = pluginResult.content;
         }
       }
+      if (textNode.excludedFromMarkdown) {
+        return;
+      }
       if (textNode.value === " " && lastChar === "\n") {
         return;
       }
@@ -869,6 +872,10 @@ const SPACE_CHAR = 32;
 const TAB_CHAR = 9;
 const NEWLINE_CHAR = 10;
 const CARRIAGE_RETURN_CHAR = 13;
+const BACKTICK_CHAR = 96;
+const PIPE_CHAR = 124;
+const OPEN_BRACKET_CHAR = 91;
+const CLOSE_BRACKET_CHAR = 93;
 const EMPTY_ATTRIBUTES = Object.freeze({});
 function copyDepthMap(depthMap) {
   return new Uint8Array(depthMap);
@@ -892,6 +899,7 @@ function parseHTML(htmlChunk, state, handleEvent) {
   state.lastCharWasWhitespace ??= true;
   state.justClosedTag ??= false;
   state.isFirstTextInElement ??= false;
+  state.lastCharWasBackslash ??= false;
   let i = 0;
   const chunkLength = htmlChunk.length;
   while (i < chunkLength) {
@@ -919,23 +927,36 @@ function parseHTML(htmlChunk, state, handleEvent) {
         }
         state.lastCharWasWhitespace = true;
         state.textBufferContainsWhitespace = true;
+        state.lastCharWasBackslash = false;
       } else {
         state.textBufferContainsNonWhitespace = true;
         state.lastCharWasWhitespace = false;
         state.justClosedTag = false;
-        if (currentCharCode === 124 && state.depthMap[TAG_TABLE]) {
+        if (currentCharCode === PIPE_CHAR && state.depthMap[TAG_TABLE]) {
           textBuffer += "\\|";
-        } else if (currentCharCode === 96 && (state.depthMap[TAG_CODE] || state.depthMap[TAG_PRE])) {
+        } else if (currentCharCode === BACKTICK_CHAR && (state.depthMap[TAG_CODE] || state.depthMap[TAG_PRE])) {
           textBuffer += "\\`";
-        } else if (currentCharCode === 91 && state.depthMap[TAG_A]) {
+        } else if (currentCharCode === OPEN_BRACKET_CHAR && state.depthMap[TAG_A]) {
           textBuffer += "\\[";
-        } else if (currentCharCode === 93 && state.depthMap[TAG_A]) {
+        } else if (currentCharCode === CLOSE_BRACKET_CHAR && state.depthMap[TAG_A]) {
           textBuffer += "\\]";
-        } else if (currentCharCode === 62 && state.depthMap[TAG_BLOCKQUOTE]) {
+        } else if (currentCharCode === GT_CHAR && state.depthMap[TAG_BLOCKQUOTE]) {
           textBuffer += "\\>";
         } else {
           textBuffer += htmlChunk[i];
         }
+        if (state.currentNode?.tagHandler?.isNonNesting) {
+          if (!state.lastCharWasBackslash) {
+            if (currentCharCode === APOS_CHAR && !state.inDoubleQuote && !state.inBacktick) {
+              state.inSingleQuote = !state.inSingleQuote;
+            } else if (currentCharCode === QUOTE_CHAR && !state.inSingleQuote && !state.inBacktick) {
+              state.inDoubleQuote = !state.inDoubleQuote;
+            } else if (currentCharCode === BACKTICK_CHAR && !state.inSingleQuote && !state.inDoubleQuote) {
+              state.inBacktick = !state.inBacktick;
+            }
+          }
+        }
+        state.lastCharWasBackslash = currentCharCode === BACKSLASH_CHAR;
       }
       i++;
       continue;
@@ -958,6 +979,12 @@ function parseHTML(htmlChunk, state, handleEvent) {
         break;
       }
     } else if (nextCharCode === SLASH_CHAR) {
+      const inQuotes = state.inSingleQuote || state.inDoubleQuote || state.inBacktick;
+      if (state.currentNode?.tagHandler?.isNonNesting && inQuotes) {
+        textBuffer += htmlChunk[i];
+        i++;
+        continue;
+      }
       if (textBuffer.length > 0) {
         processTextBuffer(textBuffer, state, handleEvent);
         textBuffer = "";
@@ -1024,9 +1051,10 @@ function processTextBuffer(textBuffer, state, handleEvent) {
   const containsWhitespace = state.textBufferContainsWhitespace;
   state.textBufferContainsNonWhitespace = false;
   state.textBufferContainsWhitespace = false;
-  if (!state.currentNode || state.currentNode?.tagHandler?.excludesTextNodes) {
+  if (!state.currentNode) {
     return;
   }
+  const excludesTextNodes = state.currentNode?.tagHandler?.excludesTextNodes;
   const inPreTag = state.depthMap[TAG_PRE] > 0;
   if (!inPreTag && !containsNonWhitespace && !state.currentNode.childTextNodeIndex) {
     return;
@@ -1039,7 +1067,7 @@ function processTextBuffer(textBuffer, state, handleEvent) {
   const firstBlockParent = parentsToIncrement[parentsToIncrement.length - 1];
   if (containsWhitespace && !firstBlockParent?.childTextNodeIndex) {
     let start = 0;
-    while (start < text.length && (inPreTag ? text.charCodeAt(start) === 10 || text.charCodeAt(start) === 13 : isWhitespace(text.charCodeAt(start)))) {
+    while (start < text.length && (inPreTag ? text.charCodeAt(start) === NEWLINE_CHAR || text.charCodeAt(start) === CARRIAGE_RETURN_CHAR : isWhitespace(text.charCodeAt(start)))) {
       start++;
     }
     if (start > 0) {
@@ -1057,7 +1085,8 @@ function processTextBuffer(textBuffer, state, handleEvent) {
     regionId: state.currentNode?.regionId,
     index: state.currentNode.currentWalkIndex++,
     depth: state.depth,
-    containsWhitespace
+    containsWhitespace,
+    excludedFromMarkdown: excludesTextNodes
   };
   for (const parent of parentsToIncrement) {
     parent.childTextNodeIndex = (parent.childTextNodeIndex || 0) + 1;
@@ -1104,7 +1133,7 @@ function processClosingTag(htmlChunk, position, state, handleEvent) {
     }
   }
   if (curr) {
-    closeNode(state.currentNode, state, handleEvent);
+    closeNode(curr, state, handleEvent);
   }
   state.justClosedTag = true;
   return {
@@ -1138,6 +1167,12 @@ function closeNode(node, state, handleEvent) {
   if (node.tagId) {
     state.depthMap[node.tagId] = Math.max(0, state.depthMap[node.tagId] - 1);
   }
+  if (node.tagHandler?.isNonNesting) {
+    state.inSingleQuote = false;
+    state.inDoubleQuote = false;
+    state.inBacktick = false;
+    state.lastCharWasBackslash = false;
+  }
   state.depth--;
   handleEvent({ type: NodeEventExit, node });
   state.currentNode = state.currentNode.parent;
@@ -1228,6 +1263,12 @@ function processOpeningTag(tagName, tagId, htmlChunk, i, state, handleEvent) {
   parentNode.currentWalkIndex = 0;
   state.currentNode = parentNode;
   state.hasEncodedHtmlEntity = false;
+  if (tagHandler?.isNonNesting && !result.selfClosing) {
+    state.inSingleQuote = false;
+    state.inDoubleQuote = false;
+    state.inBacktick = false;
+    state.lastCharWasBackslash = false;
+  }
   if (result.selfClosing) {
     closeNode(tag, state, handleEvent);
     state.justClosedTag = true;

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "mdream",
   "type": "module",
-  "version": "0.2.2",
+  "version": "0.2.4",
   "description": "Ultra-performant JavaScript HTML to Markdown converter optimized for LLMs.",
   "author": {
     "name": "Harlan Wilton",