mdream 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -44,7 +44,10 @@
44
44
  Traditional HTML to Markdown converters were not built for LLMs or humans. They tend to be slow and bloated and produce output that's poorly suited for LLM token usage or for
45
45
  human readability.
46
46
 
47
- Mdream is an ultra-performant HTML to Markdown converter built specifically for LLM Content Analysis & Human Readability. With zero dependencies, streaming built-in and opinionated output optimized for both human readability and AI processing.
47
+ Other LLM-specific converters focus on supporting _all_ document formats, resulting in larger bundles and lower-quality Markdown output.
48
+
49
+ Mdream is an ultra-performant HTML to Markdown converter built specifically for producing high-quality Markdown for LLMs as quickly as possible. It provides
50
+ a powerful plugin system to customize the conversion process, allowing you to extract, transform, and filter content as needed.
48
51
 
49
52
  Perfect for: RAG systems, web scraping, content extraction, ChatGPT/Claude integration, and large-scale document processing.
50
53
 
@@ -110,7 +113,7 @@ import { htmlToMarkdown } from 'mdream'
110
113
  // Simple conversion
111
114
  const markdown = htmlToMarkdown('<h1>Hello World</h1>')
112
115
  console.log(markdown) // # Hello World
113
- ````
116
+ ```
114
117
 
115
118
  **Convert from Fetch**
116
119
 
@@ -135,33 +138,69 @@ for await (const chunk of markdownGenerator) {
135
138
 
136
139
  ### Plugin System
137
140
 
138
- Mdream now features a powerful plugin system that allows you to customize and extend the HTML-to-Markdown conversion process.
141
+ The plugin system allows you to customize HTML to Markdown conversion by hooking into the processing pipeline. Plugins can filter content, extract data, transform nodes, or add custom behavior.
142
+
143
+ #### Plugin Hooks
144
+
145
+ - `beforeNodeProcess`: Called before any node processing, can skip nodes
146
+ - `onNodeEnter`: Called when entering an element node
147
+ - `onNodeExit`: Called when exiting an element node
148
+ - `processTextNode`: Called for each text node
149
+ - `processAttributes`: Called to process element attributes
150
+
151
+ #### Creating a Plugin
152
+
153
+ Use `createPlugin()` to create a plugin with type safety:
139
154
 
140
155
  ```ts
141
- import { createPlugin, filterUnsupportedTags, htmlToMarkdown, withTailwind } from 'mdream'
156
+ import type { ElementNode, TextNode } from 'mdream'
157
+ import { htmlToMarkdown } from 'mdream'
158
+ import { createPlugin } from 'mdream/plugins'
142
159
 
143
- // Create a custom plugin
144
160
  const myPlugin = createPlugin({
145
- name: 'my-plugin',
146
- transformContent: (content, node) => {
147
- if (node.type === 1 && node.name === 'div' && node.attributes?.role === 'alert') {
148
- return `⚠️ ${content} ⚠️`
161
+ onNodeEnter(node: ElementNode): string | undefined {
162
+ if (node.name === 'h1') {
163
+ return '🔥 '
164
+ }
165
+ },
166
+
167
+ processTextNode(textNode: TextNode): { content: string, skip: boolean } | undefined {
168
+ // Transform text content
169
+ if (textNode.parent?.attributes?.id === 'highlight') {
170
+ return {
171
+ content: `**${textNode.value}**`,
172
+ skip: false
173
+ }
149
174
  }
150
- return content
151
175
  }
152
176
  })
153
177
 
154
- // Use multiple plugins together
155
- const html = '<div role="alert" class="font-bold">Important message</div>'
156
- const markdown = htmlToMarkdown(html, {
157
- plugins: [
158
- withTailwind(), // Apply Tailwind class processing
159
- filterUnsupportedTags(), // Filter out unsupported tags
160
- myPlugin // Apply custom transformations
161
- ]
162
- })
178
+ // Use the plugin
179
+ const html: string = '<div id="highlight">Important text</div>'
180
+ const markdown: string = htmlToMarkdown(html, { plugins: [myPlugin] })
181
+ ```
163
182
 
164
- console.log(markdown) // "⚠️ **Important message** ⚠️"
183
+ #### Example: Content Filter Plugin
184
+
185
+ ```ts
186
+ import type { ElementNode, NodeEvent } from 'mdream'
187
+ import { ELEMENT_NODE } from 'mdream'
188
+ import { createPlugin } from 'mdream/plugins'
189
+
190
+ const adBlockPlugin = createPlugin({
191
+ beforeNodeProcess(event: NodeEvent): { skip: boolean } | undefined {
192
+ const { node } = event
193
+
194
+ if (node.type === ELEMENT_NODE && node.name === 'div') {
195
+ const element = node as ElementNode
196
+ // Skip ads and promotional content
197
+ if (element.attributes?.class?.includes('ad')
198
+ || element.attributes?.id?.includes('promo')) {
199
+ return { skip: true }
200
+ }
201
+ }
202
+ }
203
+ })
165
204
  ```
166
205
 
167
206
  #### Extraction Plugin
@@ -169,9 +208,10 @@ console.log(markdown) // "⚠️ **Important message** ⚠️"
169
208
  Extract specific elements and their content during HTML processing for data analysis or content discovery:
170
209
 
171
210
  ```ts
211
+ import type { ExtractedElement } from 'mdream/plugins'
172
212
  import { extractionPlugin, htmlToMarkdown } from 'mdream'
173
213
 
174
- const html = `
214
+ const html: string = `
175
215
  <article>
176
216
  <h2>Getting Started</h2>
177
217
  <p>This is a tutorial about web scraping.</p>
@@ -181,10 +221,10 @@ const html = `
181
221
 
182
222
  // Extract elements using CSS selectors
183
223
  const plugin = extractionPlugin({
184
- 'h2': (element) => {
224
+ 'h2': (element: ExtractedElement): void => {
185
225
  console.log('Heading:', element.textContent) // "Getting Started"
186
226
  },
187
- 'img[alt]': (element) => {
227
+ 'img[alt]': (element: ExtractedElement): void => {
188
228
  console.log('Image:', element.attributes.src, element.attributes.alt)
189
229
  // "Image: /hero.jpg Hero image"
190
230
  }
@@ -195,8 +235,6 @@ htmlToMarkdown(html, { plugins: [plugin] })
195
235
 
196
236
  The extraction plugin provides memory-efficient element extraction with full text content and attributes, perfect for SEO analysis, content discovery, and data mining.
197
237
 
198
- For more details, see the [plugin documentation](./docs/plugins.md).
199
-
200
238
  ## Credits
201
239
 
202
240
  - [ultrahtml](https://github.com/natemoo-re/ultrahtml): HTML parsing inspiration
package/dist/cli.mjs CHANGED
@@ -1,9 +1,9 @@
1
1
  import { Readable } from 'node:stream';
2
2
  import { cac } from 'cac';
3
- import { f as frontmatterPlugin } from './shared/mdream.cpEmpxyh.mjs';
4
- import { r as readabilityPlugin } from './shared/mdream.C8ruysN5.mjs';
5
- import { s as streamHtmlToMarkdown } from './shared/mdream.DUeWbUFG.mjs';
6
- import './shared/mdream.-hdaPj9a.mjs';
3
+ import { f as frontmatterPlugin } from './shared/mdream.C6Z2rfeq.mjs';
4
+ import { r as readabilityPlugin } from './shared/mdream.DMUbnRbh.mjs';
5
+ import { s as streamHtmlToMarkdown } from './shared/mdream.CRBi8vE8.mjs';
6
+ import './shared/mdream.Ch6B8TEB.mjs';
7
7
 
8
8
  async function streamingConvert(options = {}) {
9
9
  const outputStream = process.stdout;
package/dist/index.d.mts CHANGED
@@ -1,5 +1,5 @@
1
- import { H as HTMLToMarkdownOptions } from './shared/mdream.a2AvjJLp.mjs';
2
- export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.a2AvjJLp.mjs';
1
+ import { H as HTMLToMarkdownOptions } from './shared/mdream.C9ruFMrk.mjs';
2
+ export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.C9ruFMrk.mjs';
3
3
  import { ReadableStream } from 'node:stream/web';
4
4
 
5
5
  /**
package/dist/index.d.ts CHANGED
@@ -1,5 +1,5 @@
1
- import { H as HTMLToMarkdownOptions } from './shared/mdream.a2AvjJLp.js';
2
- export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.a2AvjJLp.js';
1
+ import { H as HTMLToMarkdownOptions } from './shared/mdream.C9ruFMrk.js';
2
+ export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.C9ruFMrk.js';
3
3
  import { ReadableStream } from 'node:stream/web';
4
4
 
5
5
  /**
package/dist/index.mjs CHANGED
@@ -1,6 +1,6 @@
1
- import { p as processPartialHTMLToMarkdown } from './shared/mdream.DUeWbUFG.mjs';
2
- export { s as streamHtmlToMarkdown } from './shared/mdream.DUeWbUFG.mjs';
3
- import './shared/mdream.-hdaPj9a.mjs';
1
+ import { p as processPartialHTMLToMarkdown } from './shared/mdream.CRBi8vE8.mjs';
2
+ export { s as streamHtmlToMarkdown } from './shared/mdream.CRBi8vE8.mjs';
3
+ import './shared/mdream.Ch6B8TEB.mjs';
4
4
 
5
5
  function htmlToMarkdown(html, options = {}) {
6
6
  const state = {
@@ -1,4 +1,4 @@
1
- import { P as Plugin } from './shared/mdream.a2AvjJLp.mjs';
1
+ import { P as Plugin, b as ElementNode } from './shared/mdream.C9ruFMrk.mjs';
2
2
 
3
3
  /**
4
4
  * Create a plugin that implements the Plugin interface with improved type inference
@@ -7,6 +7,11 @@ import { P as Plugin } from './shared/mdream.a2AvjJLp.mjs';
7
7
  */
8
8
  declare function createPlugin<T extends Partial<Plugin>>(plugin: T): Plugin;
9
9
 
10
+ interface ExtractedElement extends ElementNode {
11
+ textContent: string;
12
+ }
13
+ declare function extractionPlugin(selectors: Record<string, (element: ExtractedElement) => void>): Plugin;
14
+
10
15
  /**
11
16
  * Plugin that filters nodes based on CSS selectors.
12
17
  * Allows including or excluding nodes based on selectors.
@@ -85,4 +90,4 @@ declare function readabilityPlugin(): Plugin;
85
90
  */
86
91
  declare function tailwindPlugin(): Plugin;
87
92
 
88
- export { createPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
93
+ export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
package/dist/plugins.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { P as Plugin } from './shared/mdream.a2AvjJLp.js';
1
+ import { P as Plugin, b as ElementNode } from './shared/mdream.C9ruFMrk.js';
2
2
 
3
3
  /**
4
4
  * Create a plugin that implements the Plugin interface with improved type inference
@@ -7,6 +7,11 @@ import { P as Plugin } from './shared/mdream.a2AvjJLp.js';
7
7
  */
8
8
  declare function createPlugin<T extends Partial<Plugin>>(plugin: T): Plugin;
9
9
 
10
+ interface ExtractedElement extends ElementNode {
11
+ textContent: string;
12
+ }
13
+ declare function extractionPlugin(selectors: Record<string, (element: ExtractedElement) => void>): Plugin;
14
+
10
15
  /**
11
16
  * Plugin that filters nodes based on CSS selectors.
12
17
  * Allows including or excluding nodes based on selectors.
@@ -85,4 +90,4 @@ declare function readabilityPlugin(): Plugin;
85
90
  */
86
91
  declare function tailwindPlugin(): Plugin;
87
92
 
88
- export { createPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
93
+ export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
package/dist/plugins.mjs CHANGED
@@ -1,4 +1,46 @@
1
- export { c as createPlugin, f as frontmatterPlugin } from './shared/mdream.cpEmpxyh.mjs';
2
- export { f as filterPlugin, i as isolateMainPlugin, t as tailwindPlugin } from './shared/mdream.DEM9pag4.mjs';
3
- export { r as readabilityPlugin } from './shared/mdream.C8ruysN5.mjs';
4
- import './shared/mdream.-hdaPj9a.mjs';
1
+ import { c as createPlugin } from './shared/mdream.C6Z2rfeq.mjs';
2
+ export { f as frontmatterPlugin } from './shared/mdream.C6Z2rfeq.mjs';
3
+ import { p as parseSelector } from './shared/mdream.D5zBVbP9.mjs';
4
+ export { f as filterPlugin, i as isolateMainPlugin, t as tailwindPlugin } from './shared/mdream.D5zBVbP9.mjs';
5
+ export { r as readabilityPlugin } from './shared/mdream.DMUbnRbh.mjs';
6
+ import './shared/mdream.Ch6B8TEB.mjs';
7
+
8
+ function extractionPlugin(selectors) {
9
+ const matcherCallbacks = Object.entries(selectors).map(([selector, callback]) => ({
10
+ matcher: parseSelector(selector),
11
+ callback
12
+ }));
13
+ const trackedElements = /* @__PURE__ */ new Map();
14
+ return createPlugin({
15
+ onNodeEnter(element) {
16
+ matcherCallbacks.forEach(({ matcher, callback }) => {
17
+ if (matcher.matches(element)) {
18
+ trackedElements.set(element, { textContent: "", callback });
19
+ }
20
+ });
21
+ },
22
+ processTextNode(textNode) {
23
+ let currentParent = textNode.parent;
24
+ while (currentParent) {
25
+ const tracked = trackedElements.get(currentParent);
26
+ if (tracked) {
27
+ tracked.textContent += textNode.value;
28
+ }
29
+ currentParent = currentParent.parent;
30
+ }
31
+ },
32
+ onNodeExit(element) {
33
+ const tracked = trackedElements.get(element);
34
+ if (tracked) {
35
+ const extractedElement = {
36
+ ...element,
37
+ textContent: tracked.textContent.trim()
38
+ };
39
+ tracked.callback(extractedElement);
40
+ trackedElements.delete(element);
41
+ }
42
+ }
43
+ });
44
+ }
45
+
46
+ export { createPlugin, extractionPlugin };
@@ -1,4 +1,4 @@
1
- import { H as HTMLToMarkdownOptions } from '../shared/mdream.a2AvjJLp.mjs';
1
+ import { H as HTMLToMarkdownOptions } from '../shared/mdream.C9ruFMrk.mjs';
2
2
 
3
3
  /**
4
4
  * Creates a configurable minimal preset with advanced options
@@ -1,4 +1,4 @@
1
- import { H as HTMLToMarkdownOptions } from '../shared/mdream.a2AvjJLp.js';
1
+ import { H as HTMLToMarkdownOptions } from '../shared/mdream.C9ruFMrk.js';
2
2
 
3
3
  /**
4
4
  * Creates a configurable minimal preset with advanced options
@@ -1,6 +1,6 @@
1
- import { y as TAG_FORM, t as TAG_FIELDSET, q as TAG_OBJECT, r as TAG_EMBED, a0 as TAG_FIGURE, B as TAG_FOOTER, z as TAG_ASIDE, s as TAG_IFRAME, w as TAG_INPUT, v as TAG_TEXTAREA, u as TAG_SELECT, x as TAG_BUTTON, A as TAG_NAV } from '../shared/mdream.-hdaPj9a.mjs';
2
- import { i as isolateMainPlugin, t as tailwindPlugin, f as filterPlugin } from '../shared/mdream.DEM9pag4.mjs';
3
- import { f as frontmatterPlugin } from '../shared/mdream.cpEmpxyh.mjs';
1
+ import { a9 as TAG_FORM, V as TAG_FIELDSET, b2 as TAG_OBJECT, a4 as TAG_EMBED, b6 as TAG_FIGURE, aa as TAG_FOOTER, u as TAG_ASIDE, P as TAG_IFRAME, a3 as TAG_INPUT, X as TAG_TEXTAREA, Y as TAG_SELECT, ah as TAG_BUTTON, aj as TAG_NAV } from '../shared/mdream.Ch6B8TEB.mjs';
2
+ import { i as isolateMainPlugin, t as tailwindPlugin, f as filterPlugin } from '../shared/mdream.D5zBVbP9.mjs';
3
+ import { f as frontmatterPlugin } from '../shared/mdream.C6Z2rfeq.mjs';
4
4
 
5
5
  function withMinimalPreset(options = {}) {
6
6
  const plugins = [
@@ -1,4 +1,4 @@
1
- import { aa as ELEMENT_NODE, b as TAG_HEAD, ab as collectNodeContent, ac as TAG_TITLE, ad as TAG_META } from './mdream.-hdaPj9a.mjs';
1
+ import { E as ELEMENT_NODE, aU as TAG_HEAD, c as collectNodeContent, aR as TAG_TITLE, aO as TAG_META } from './mdream.Ch6B8TEB.mjs';
2
2
 
3
3
  function createPlugin(plugin) {
4
4
  return plugin;
@@ -82,6 +82,8 @@ interface TextNode extends Node {
82
82
  value: string;
83
83
  /** Custom data added by plugins */
84
84
  context?: Record<string, any>;
85
+ /** Whether this text node should be excluded from markdown output (for script/style elements) */
86
+ excludedFromMarkdown?: boolean;
85
87
  }
86
88
  /**
87
89
  * Base DOM node interface
@@ -82,6 +82,8 @@ interface TextNode extends Node {
82
82
  value: string;
83
83
  /** Custom data added by plugins */
84
84
  context?: Record<string, any>;
85
+ /** Whether this text node should be excluded from markdown output (for script/style elements) */
86
+ excludedFromMarkdown?: boolean;
85
87
  }
86
88
  /**
87
89
  * Base DOM node interface
@@ -1,4 +1,4 @@
1
- import { ae as TEXT_NODE, af as NodeEventEnter, ab as collectNodeContent, aa as ELEMENT_NODE, a3 as TAG_PRE, M as TAG_LI, a4 as TAG_BLOCKQUOTE, ag as NO_SPACING, ah as DEFAULT_BLOCK_SPACING, ai as TABLE_ROW_SPACING, aj as LIST_ITEM_SPACING, ak as BLOCKQUOTE_SPACING, al as MARKDOWN_STRIKETHROUGH, am as MARKDOWN_HORIZONTAL_RULE, J as TAG_DD, K as TAG_DT, L as TAG_DL, e as TAG_ADDRESS, an as TAG_RP, ao as TAG_RT, ap as TAG_RUBY, aq as TAG_BDO, ar as TAG_TIME, as as TAG_VAR, at as TAG_DFN, au as TAG_CITE, av as TAG_U, z as TAG_ASIDE, aw as TAG_PLAINTEXT, ax as TAG_XMP, ay as TAG_NOFRAMES, az as TAG_NOSCRIPT, aA as TAG_SMALL, aB as TAG_SAMP, aC as TAG_Q, aD as TAG_MARK, aE as TAG_ABBR, aF as TAG_TEMPLATE, aG as TAG_PROGRESS, aH as TAG_METER, aI as TAG_DIALOG, aJ as TAG_MAP, s as TAG_IFRAME, aK as TAG_CANVAS, _ as TAG_VIDEO, Z as TAG_AUDIO, aL as TAG_LEGEND, t as TAG_FIELDSET, aM as TAG_OPTION, v as TAG_TEXTAREA, u as TAG_SELECT, Y as TAG_SVG, aN as TAG_WBR, aO as TAG_TRACK, aP as TAG_SOURCE, aQ as TAG_PARAM, aR as TAG_KEYGEN, w as TAG_INPUT, r as TAG_EMBED, aS as TAG_COL, aT as TAG_BASE, aU as TAG_AREA, aV as TAG_LINK, y as TAG_FORM, B as TAG_FOOTER, aW as TAG_KBD, S as TAG_TFOOT, U as TAG_TBODY, aX as TAG_CENTER, X as TAG_TABLE, T as TAG_BODY, x as TAG_BUTTON, aY as TAG_LABEL, A as TAG_NAV, j as TAG_SPAN, a5 as TAG_DIV, a6 as TAG_P, P as TAG_TD, Q as TAG_TH, R as TAG_TR, V as TAG_THEAD, a1 as TAG_IMG, d as TAG_A, O as TAG_UL, N as TAG_OL, a2 as TAG_CODE, aZ as MARKDOWN_CODE_BLOCK, a_ as MARKDOWN_INLINE_CODE, a$ as TAG_INS, b0 as TAG_SUP, b1 as TAG_SUB, b2 as TAG_DEL, m as TAG_I, n as TAG_EM, o as TAG_B, p as TAG_STRONG, l as TAG_HR, D as TAG_H6, E as TAG_H5, F as TAG_H4, G as TAG_H3, H as TAG_H2, I as TAG_H1, k as TAG_BR, ad as TAG_META, h as TAG_STYLE, i as TAG_SCRIPT, ac as TAG_TITLE, f as TAG_SUMMARY, g as TAG_DETAILS, b as TAG_HEAD, b3 as MARKDOWN_EMPHASIS, b4 as MARKDOWN_STRONG, b5 as HTML_ENTITIES, b6 as MAX_TAG_ID, b7 as assembleBufferedContent, b8 as 
TagIdMap, b9 as NodeEventExit } from './mdream.-hdaPj9a.mjs';
1
+ import { T as TEXT_NODE, N as NodeEventEnter, c as collectNodeContent, E as ELEMENT_NODE, a as TAG_PRE, b as TAG_LI, d as TAG_BLOCKQUOTE, e as NO_SPACING, D as DEFAULT_BLOCK_SPACING, f as TABLE_ROW_SPACING, L as LIST_ITEM_SPACING, B as BLOCKQUOTE_SPACING, M as MARKDOWN_STRIKETHROUGH, g as MARKDOWN_HORIZONTAL_RULE, h as TAG_DD, i as TAG_DT, j as TAG_DL, k as TAG_ADDRESS, l as TAG_RP, m as TAG_RT, n as TAG_RUBY, o as TAG_BDO, p as TAG_TIME, q as TAG_VAR, r as TAG_DFN, s as TAG_CITE, t as TAG_U, u as TAG_ASIDE, v as TAG_PLAINTEXT, w as TAG_XMP, x as TAG_NOFRAMES, y as TAG_NOSCRIPT, z as TAG_SMALL, A as TAG_SAMP, C as TAG_Q, F as TAG_MARK, G as TAG_ABBR, H as TAG_TEMPLATE, I as TAG_PROGRESS, J as TAG_METER, K as TAG_DIALOG, O as TAG_MAP, P as TAG_IFRAME, Q as TAG_CANVAS, R as TAG_VIDEO, S as TAG_AUDIO, U as TAG_LEGEND, V as TAG_FIELDSET, W as TAG_OPTION, X as TAG_TEXTAREA, Y as TAG_SELECT, Z as TAG_SVG, _ as TAG_WBR, $ as TAG_TRACK, a0 as TAG_SOURCE, a1 as TAG_PARAM, a2 as TAG_KEYGEN, a3 as TAG_INPUT, a4 as TAG_EMBED, a5 as TAG_COL, a6 as TAG_BASE, a7 as TAG_AREA, a8 as TAG_LINK, a9 as TAG_FORM, aa as TAG_FOOTER, ab as TAG_KBD, ac as TAG_TFOOT, ad as TAG_TBODY, ae as TAG_CENTER, af as TAG_TABLE, ag as TAG_BODY, ah as TAG_BUTTON, ai as TAG_LABEL, aj as TAG_NAV, ak as TAG_SPAN, al as TAG_DIV, am as TAG_P, an as TAG_TD, ao as TAG_TH, ap as TAG_TR, aq as TAG_THEAD, ar as TAG_IMG, as as TAG_A, at as TAG_UL, au as TAG_OL, av as TAG_CODE, aw as MARKDOWN_CODE_BLOCK, ax as MARKDOWN_INLINE_CODE, ay as TAG_INS, az as TAG_SUP, aA as TAG_SUB, aB as TAG_DEL, aC as TAG_I, aD as TAG_EM, aE as TAG_B, aF as TAG_STRONG, aG as TAG_HR, aH as TAG_H6, aI as TAG_H5, aJ as TAG_H4, aK as TAG_H3, aL as TAG_H2, aM as TAG_H1, aN as TAG_BR, aO as TAG_META, aP as TAG_STYLE, aQ as TAG_SCRIPT, aR as TAG_TITLE, aS as TAG_SUMMARY, aT as TAG_DETAILS, aU as TAG_HEAD, aV as MARKDOWN_EMPHASIS, aW as MARKDOWN_STRONG, aX as HTML_ENTITIES, aY as MAX_TAG_ID, aZ as assembleBufferedContent, a_ as TagIdMap, a$ 
as NodeEventExit } from './mdream.Ch6B8TEB.mjs';
2
2
 
3
3
  function needsSpacing(lastChar, firstChar) {
4
4
  const noSpaceLastChars = /* @__PURE__ */ new Set(["\n", " ", "[", ">", "_", "*", "`", "|", "#", "<", "("]);
@@ -48,6 +48,9 @@ function processHtmlEventToMarkdown(event, state) {
48
48
  textNode.value = pluginResult.content;
49
49
  }
50
50
  }
51
+ if (textNode.excludedFromMarkdown) {
52
+ return;
53
+ }
51
54
  if (textNode.value === " " && lastChar === "\n") {
52
55
  return;
53
56
  }
@@ -1024,9 +1027,10 @@ function processTextBuffer(textBuffer, state, handleEvent) {
1024
1027
  const containsWhitespace = state.textBufferContainsWhitespace;
1025
1028
  state.textBufferContainsNonWhitespace = false;
1026
1029
  state.textBufferContainsWhitespace = false;
1027
- if (!state.currentNode || state.currentNode?.tagHandler?.excludesTextNodes) {
1030
+ if (!state.currentNode) {
1028
1031
  return;
1029
1032
  }
1033
+ const excludesTextNodes = state.currentNode?.tagHandler?.excludesTextNodes;
1030
1034
  const inPreTag = state.depthMap[TAG_PRE] > 0;
1031
1035
  if (!inPreTag && !containsNonWhitespace && !state.currentNode.childTextNodeIndex) {
1032
1036
  return;
@@ -1057,7 +1061,8 @@ function processTextBuffer(textBuffer, state, handleEvent) {
1057
1061
  regionId: state.currentNode?.regionId,
1058
1062
  index: state.currentNode.currentWalkIndex++,
1059
1063
  depth: state.depth,
1060
- containsWhitespace
1064
+ containsWhitespace,
1065
+ excludedFromMarkdown: excludesTextNodes
1061
1066
  };
1062
1067
  for (const parent of parentsToIncrement) {
1063
1068
  parent.childTextNodeIndex = (parent.childTextNodeIndex || 0) + 1;
@@ -277,4 +277,4 @@ const BLOCKQUOTE_SPACING = [1, 1];
277
277
  const LIST_ITEM_SPACING = [1, 0];
278
278
  const TABLE_ROW_SPACING = [0, 1];
279
279
 
280
- export { TAG_FIGCAPTION as $, TAG_NAV as A, TAG_FOOTER as B, TAG_HEADER as C, TAG_H6 as D, TAG_H5 as E, TAG_H4 as F, TAG_H3 as G, TAG_H2 as H, TAG_H1 as I, TAG_DD as J, TAG_DT as K, TAG_DL as L, TAG_LI as M, TAG_OL as N, TAG_UL as O, TAG_TD as P, TAG_TH as Q, TAG_TR as R, TAG_TFOOT as S, TAG_BODY as T, TAG_TBODY as U, TAG_THEAD as V, TAG_CAPTION as W, TAG_TABLE as X, TAG_SVG as Y, TAG_AUDIO as Z, TAG_VIDEO as _, TAG_HTML as a, TAG_INS as a$, TAG_FIGURE as a0, TAG_IMG as a1, TAG_CODE as a2, TAG_PRE as a3, TAG_BLOCKQUOTE as a4, TAG_DIV as a5, TAG_P as a6, TAG_MAIN as a7, TAG_SECTION as a8, TAG_ARTICLE as a9, TAG_SMALL as aA, TAG_SAMP as aB, TAG_Q as aC, TAG_MARK as aD, TAG_ABBR as aE, TAG_TEMPLATE as aF, TAG_PROGRESS as aG, TAG_METER as aH, TAG_DIALOG as aI, TAG_MAP as aJ, TAG_CANVAS as aK, TAG_LEGEND as aL, TAG_OPTION as aM, TAG_WBR as aN, TAG_TRACK as aO, TAG_SOURCE as aP, TAG_PARAM as aQ, TAG_KEYGEN as aR, TAG_COL as aS, TAG_BASE as aT, TAG_AREA as aU, TAG_LINK as aV, TAG_KBD as aW, TAG_CENTER as aX, TAG_LABEL as aY, MARKDOWN_CODE_BLOCK as aZ, MARKDOWN_INLINE_CODE as a_, ELEMENT_NODE as aa, collectNodeContent as ab, TAG_TITLE as ac, TAG_META as ad, TEXT_NODE as ae, NodeEventEnter as af, NO_SPACING as ag, DEFAULT_BLOCK_SPACING as ah, TABLE_ROW_SPACING as ai, LIST_ITEM_SPACING as aj, BLOCKQUOTE_SPACING as ak, MARKDOWN_STRIKETHROUGH as al, MARKDOWN_HORIZONTAL_RULE as am, TAG_RP as an, TAG_RT as ao, TAG_RUBY as ap, TAG_BDO as aq, TAG_TIME as ar, TAG_VAR as as, TAG_DFN as at, TAG_CITE as au, TAG_U as av, TAG_PLAINTEXT as aw, TAG_XMP as ax, TAG_NOFRAMES as ay, TAG_NOSCRIPT as az, TAG_HEAD as b, TAG_SUP as b0, TAG_SUB as b1, TAG_DEL as b2, MARKDOWN_EMPHASIS as b3, MARKDOWN_STRONG as b4, HTML_ENTITIES as b5, MAX_TAG_ID as b6, assembleBufferedContent as b7, TagIdMap as b8, NodeEventExit as b9, createBufferRegion as c, TAG_A as d, TAG_ADDRESS as e, TAG_SUMMARY as f, TAG_DETAILS as g, TAG_STYLE as h, TAG_SCRIPT as i, TAG_SPAN as j, TAG_BR as k, TAG_HR as l, TAG_I as m, 
TAG_EM as n, TAG_B as o, TAG_STRONG as p, TAG_OBJECT as q, TAG_EMBED as r, TAG_IFRAME as s, TAG_FIELDSET as t, TAG_SELECT as u, TAG_TEXTAREA as v, TAG_INPUT as w, TAG_BUTTON as x, TAG_FORM as y, TAG_ASIDE as z };
280
+ export { TAG_TRACK as $, TAG_SAMP as A, BLOCKQUOTE_SPACING as B, TAG_Q as C, DEFAULT_BLOCK_SPACING as D, ELEMENT_NODE as E, TAG_MARK as F, TAG_ABBR as G, TAG_TEMPLATE as H, TAG_PROGRESS as I, TAG_METER as J, TAG_DIALOG as K, LIST_ITEM_SPACING as L, MARKDOWN_STRIKETHROUGH as M, NodeEventEnter as N, TAG_MAP as O, TAG_IFRAME as P, TAG_CANVAS as Q, TAG_VIDEO as R, TAG_AUDIO as S, TEXT_NODE as T, TAG_LEGEND as U, TAG_FIELDSET as V, TAG_OPTION as W, TAG_TEXTAREA as X, TAG_SELECT as Y, TAG_SVG as Z, TAG_WBR as _, TAG_PRE as a, NodeEventExit as a$, TAG_SOURCE as a0, TAG_PARAM as a1, TAG_KEYGEN as a2, TAG_INPUT as a3, TAG_EMBED as a4, TAG_COL as a5, TAG_BASE as a6, TAG_AREA as a7, TAG_LINK as a8, TAG_FORM as a9, TAG_SUB as aA, TAG_DEL as aB, TAG_I as aC, TAG_EM as aD, TAG_B as aE, TAG_STRONG as aF, TAG_HR as aG, TAG_H6 as aH, TAG_H5 as aI, TAG_H4 as aJ, TAG_H3 as aK, TAG_H2 as aL, TAG_H1 as aM, TAG_BR as aN, TAG_META as aO, TAG_STYLE as aP, TAG_SCRIPT as aQ, TAG_TITLE as aR, TAG_SUMMARY as aS, TAG_DETAILS as aT, TAG_HEAD as aU, MARKDOWN_EMPHASIS as aV, MARKDOWN_STRONG as aW, HTML_ENTITIES as aX, MAX_TAG_ID as aY, assembleBufferedContent as aZ, TagIdMap as a_, TAG_FOOTER as aa, TAG_KBD as ab, TAG_TFOOT as ac, TAG_TBODY as ad, TAG_CENTER as ae, TAG_TABLE as af, TAG_BODY as ag, TAG_BUTTON as ah, TAG_LABEL as ai, TAG_NAV as aj, TAG_SPAN as ak, TAG_DIV as al, TAG_P as am, TAG_TD as an, TAG_TH as ao, TAG_TR as ap, TAG_THEAD as aq, TAG_IMG as ar, TAG_A as as, TAG_UL as at, TAG_OL as au, TAG_CODE as av, MARKDOWN_CODE_BLOCK as aw, MARKDOWN_INLINE_CODE as ax, TAG_INS as ay, TAG_SUP as az, TAG_LI as b, TAG_HTML as b0, createBufferRegion as b1, TAG_OBJECT as b2, TAG_HEADER as b3, TAG_CAPTION as b4, TAG_FIGCAPTION as b5, TAG_FIGURE as b6, TAG_MAIN as b7, TAG_SECTION as b8, TAG_ARTICLE as b9, collectNodeContent as c, TAG_BLOCKQUOTE as d, NO_SPACING as e, TABLE_ROW_SPACING as f, MARKDOWN_HORIZONTAL_RULE as g, TAG_DD as h, TAG_DT as i, TAG_DL as j, TAG_ADDRESS as k, TAG_RP as l, TAG_RT 
as m, TAG_RUBY as n, TAG_BDO as o, TAG_TIME as p, TAG_VAR as q, TAG_DFN as r, TAG_CITE as s, TAG_U as t, TAG_ASIDE as u, TAG_PLAINTEXT as v, TAG_XMP as w, TAG_NOFRAMES as x, TAG_NOSCRIPT as y, TAG_SMALL as z };
@@ -1,5 +1,5 @@
1
- import { ae as TEXT_NODE, aa as ELEMENT_NODE, a7 as TAG_MAIN, C as TAG_HEADER, B as TAG_FOOTER, I as TAG_H1, H as TAG_H2, G as TAG_H3, F as TAG_H4, E as TAG_H5, D as TAG_H6 } from './mdream.-hdaPj9a.mjs';
2
- import { c as createPlugin } from './mdream.cpEmpxyh.mjs';
1
+ import { T as TEXT_NODE, E as ELEMENT_NODE, b7 as TAG_MAIN, b3 as TAG_HEADER, aa as TAG_FOOTER, aM as TAG_H1, aL as TAG_H2, aK as TAG_H3, aJ as TAG_H4, aI as TAG_H5, aH as TAG_H6 } from './mdream.Ch6B8TEB.mjs';
2
+ import { c as createPlugin } from './mdream.C6Z2rfeq.mjs';
3
3
 
4
4
  class TagSelector {
5
5
  constructor(tagName) {
@@ -498,4 +498,4 @@ function tailwindPlugin() {
498
498
  });
499
499
  }
500
500
 
501
- export { filterPlugin as f, isolateMainPlugin as i, tailwindPlugin as t };
501
+ export { filterPlugin as f, isolateMainPlugin as i, parseSelector as p, tailwindPlugin as t };
@@ -1,5 +1,5 @@
1
- import { T as TAG_BODY, a as TAG_HTML, b as TAG_HEAD, c as createBufferRegion, d as TAG_A, e as TAG_ADDRESS, f as TAG_SUMMARY, g as TAG_DETAILS, h as TAG_STYLE, i as TAG_SCRIPT, j as TAG_SPAN, k as TAG_BR, l as TAG_HR, m as TAG_I, n as TAG_EM, o as TAG_B, p as TAG_STRONG, q as TAG_OBJECT, r as TAG_EMBED, s as TAG_IFRAME, t as TAG_FIELDSET, u as TAG_SELECT, v as TAG_TEXTAREA, w as TAG_INPUT, x as TAG_BUTTON, y as TAG_FORM, z as TAG_ASIDE, A as TAG_NAV, B as TAG_FOOTER, C as TAG_HEADER, D as TAG_H6, E as TAG_H5, F as TAG_H4, G as TAG_H3, H as TAG_H2, I as TAG_H1, J as TAG_DD, K as TAG_DT, L as TAG_DL, M as TAG_LI, N as TAG_OL, O as TAG_UL, P as TAG_TD, Q as TAG_TH, R as TAG_TR, S as TAG_TFOOT, U as TAG_TBODY, V as TAG_THEAD, W as TAG_CAPTION, X as TAG_TABLE, Y as TAG_SVG, Z as TAG_AUDIO, _ as TAG_VIDEO, $ as TAG_FIGCAPTION, a0 as TAG_FIGURE, a1 as TAG_IMG, a2 as TAG_CODE, a3 as TAG_PRE, a4 as TAG_BLOCKQUOTE, a5 as TAG_DIV, a6 as TAG_P, a7 as TAG_MAIN, a8 as TAG_SECTION, a9 as TAG_ARTICLE } from './mdream.-hdaPj9a.mjs';
2
- import { c as createPlugin } from './mdream.cpEmpxyh.mjs';
1
+ import { ag as TAG_BODY, b0 as TAG_HTML, aU as TAG_HEAD, b1 as createBufferRegion, as as TAG_A, k as TAG_ADDRESS, aS as TAG_SUMMARY, aT as TAG_DETAILS, aP as TAG_STYLE, aQ as TAG_SCRIPT, ak as TAG_SPAN, aN as TAG_BR, aG as TAG_HR, aC as TAG_I, aD as TAG_EM, aE as TAG_B, aF as TAG_STRONG, b2 as TAG_OBJECT, a4 as TAG_EMBED, P as TAG_IFRAME, V as TAG_FIELDSET, Y as TAG_SELECT, X as TAG_TEXTAREA, a3 as TAG_INPUT, ah as TAG_BUTTON, a9 as TAG_FORM, u as TAG_ASIDE, aj as TAG_NAV, aa as TAG_FOOTER, b3 as TAG_HEADER, aH as TAG_H6, aI as TAG_H5, aJ as TAG_H4, aK as TAG_H3, aL as TAG_H2, aM as TAG_H1, h as TAG_DD, i as TAG_DT, j as TAG_DL, b as TAG_LI, au as TAG_OL, at as TAG_UL, an as TAG_TD, ao as TAG_TH, ap as TAG_TR, ac as TAG_TFOOT, ad as TAG_TBODY, aq as TAG_THEAD, b4 as TAG_CAPTION, af as TAG_TABLE, Z as TAG_SVG, S as TAG_AUDIO, R as TAG_VIDEO, b5 as TAG_FIGCAPTION, b6 as TAG_FIGURE, ar as TAG_IMG, av as TAG_CODE, a as TAG_PRE, d as TAG_BLOCKQUOTE, al as TAG_DIV, am as TAG_P, b7 as TAG_MAIN, b8 as TAG_SECTION, b9 as TAG_ARTICLE } from './mdream.Ch6B8TEB.mjs';
2
+ import { c as createPlugin } from './mdream.C6Z2rfeq.mjs';
3
3
 
4
4
  const REGEXPS = {
5
5
  // Positive patterns that suggest high-quality content
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "mdream",
3
3
  "type": "module",
4
- "version": "0.2.1",
4
+ "version": "0.2.3",
5
5
  "description": "Ultra-performant JavaScript HTML to Markdown converter optimized for LLMs.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",