mdream 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -49,8 +49,6 @@ Other LLM specific convertors focus on supporting _all_ document formats, result
49
49
  Mdream is an ultra-performant HTML to Markdown converter built specifically for producing high-quality Markdown for LLMs as quickly as possible. It provides
50
50
  a powerful plugin system to customize the conversion process, allowing you to extract, transform, and filter content as needed.
51
51
 
52
- Perfect for: RAG systems, web scraping, content extraction, ChatGPT/Claude integration, and large-scale document processing.
53
-
54
52
  ## CLI Usage
55
53
 
56
54
  The Mdream CLI is designed to work exclusively with Unix pipes, providing flexibility and freedom to integrate with other tools.
package/dist/cli.mjs CHANGED
@@ -2,7 +2,7 @@ import { Readable } from 'node:stream';
2
2
  import { cac } from 'cac';
3
3
  import { f as frontmatterPlugin } from './shared/mdream.C6Z2rfeq.mjs';
4
4
  import { r as readabilityPlugin } from './shared/mdream.DMUbnRbh.mjs';
5
- import { s as streamHtmlToMarkdown } from './shared/mdream.CRBi8vE8.mjs';
5
+ import { s as streamHtmlToMarkdown } from './shared/mdream.CsDVbUMp.mjs';
6
6
  import './shared/mdream.Ch6B8TEB.mjs';
7
7
 
8
8
  async function streamingConvert(options = {}) {
package/dist/index.d.mts CHANGED
@@ -1,5 +1,5 @@
1
- import { H as HTMLToMarkdownOptions } from './shared/mdream.C9ruFMrk.mjs';
2
- export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.C9ruFMrk.mjs';
1
+ import { H as HTMLToMarkdownOptions } from './shared/mdream.-SGj02be.mjs';
2
+ export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.-SGj02be.mjs';
3
3
  import { ReadableStream } from 'node:stream/web';
4
4
 
5
5
  /**
package/dist/index.d.ts CHANGED
@@ -1,5 +1,5 @@
1
- import { H as HTMLToMarkdownOptions } from './shared/mdream.C9ruFMrk.js';
2
- export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.C9ruFMrk.js';
1
+ import { H as HTMLToMarkdownOptions } from './shared/mdream.-SGj02be.js';
2
+ export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.-SGj02be.js';
3
3
  import { ReadableStream } from 'node:stream/web';
4
4
 
5
5
  /**
package/dist/index.mjs CHANGED
@@ -1,5 +1,5 @@
1
- import { p as processPartialHTMLToMarkdown } from './shared/mdream.CRBi8vE8.mjs';
2
- export { s as streamHtmlToMarkdown } from './shared/mdream.CRBi8vE8.mjs';
1
+ import { p as processPartialHTMLToMarkdown } from './shared/mdream.CsDVbUMp.mjs';
2
+ export { s as streamHtmlToMarkdown } from './shared/mdream.CsDVbUMp.mjs';
3
3
  import './shared/mdream.Ch6B8TEB.mjs';
4
4
 
5
5
  function htmlToMarkdown(html, options = {}) {
package/dist/plugins.d.mts CHANGED
@@ -1,4 +1,4 @@
1
- import { P as Plugin, b as ElementNode } from './shared/mdream.C9ruFMrk.mjs';
1
+ import { P as Plugin, b as ElementNode } from './shared/mdream.-SGj02be.mjs';
2
2
 
3
3
  /**
4
4
  * Create a plugin that implements the Plugin interface with improved type inference
package/dist/plugins.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { P as Plugin, b as ElementNode } from './shared/mdream.C9ruFMrk.js';
1
+ import { P as Plugin, b as ElementNode } from './shared/mdream.-SGj02be.js';
2
2
 
3
3
  /**
4
4
  * Create a plugin that implements the Plugin interface with improved type inference
@@ -1,4 +1,4 @@
1
- import { H as HTMLToMarkdownOptions } from '../shared/mdream.C9ruFMrk.mjs';
1
+ import { H as HTMLToMarkdownOptions } from '../shared/mdream.-SGj02be.mjs';
2
2
 
3
3
  /**
4
4
  * Creates a configurable minimal preset with advanced options
@@ -1,4 +1,4 @@
1
- import { H as HTMLToMarkdownOptions } from '../shared/mdream.C9ruFMrk.js';
1
+ import { H as HTMLToMarkdownOptions } from '../shared/mdream.-SGj02be.js';
2
2
 
3
3
  /**
4
4
  * Creates a configurable minimal preset with advanced options
@@ -147,6 +147,12 @@ interface MdreamProcessingState {
147
147
  isFirstTextInElement?: boolean;
148
148
  /** Reference to the last processed text node - for context tracking */
149
149
  lastTextNode?: Node;
150
+ /** Quote state tracking for non-nesting tags - avoids backward scanning */
151
+ inSingleQuote?: boolean;
152
+ inDoubleQuote?: boolean;
153
+ inBacktick?: boolean;
154
+ /** Backslash escaping state tracking - avoids checking previous character */
155
+ lastCharWasBackslash?: boolean;
150
156
  /** Plugin instances array for efficient iteration */
151
157
  plugins?: Plugin[];
152
158
  /** Configuration options for conversion */
@@ -147,6 +147,12 @@ interface MdreamProcessingState {
147
147
  isFirstTextInElement?: boolean;
148
148
  /** Reference to the last processed text node - for context tracking */
149
149
  lastTextNode?: Node;
150
+ /** Quote state tracking for non-nesting tags - avoids backward scanning */
151
+ inSingleQuote?: boolean;
152
+ inDoubleQuote?: boolean;
153
+ inBacktick?: boolean;
154
+ /** Backslash escaping state tracking - avoids checking previous character */
155
+ lastCharWasBackslash?: boolean;
150
156
  /** Plugin instances array for efficient iteration */
151
157
  plugins?: Plugin[];
152
158
  /** Configuration options for conversion */
@@ -872,6 +872,10 @@ const SPACE_CHAR = 32;
872
872
  const TAB_CHAR = 9;
873
873
  const NEWLINE_CHAR = 10;
874
874
  const CARRIAGE_RETURN_CHAR = 13;
875
+ const BACKTICK_CHAR = 96;
876
+ const PIPE_CHAR = 124;
877
+ const OPEN_BRACKET_CHAR = 91;
878
+ const CLOSE_BRACKET_CHAR = 93;
875
879
  const EMPTY_ATTRIBUTES = Object.freeze({});
876
880
  function copyDepthMap(depthMap) {
877
881
  return new Uint8Array(depthMap);
@@ -895,6 +899,7 @@ function parseHTML(htmlChunk, state, handleEvent) {
895
899
  state.lastCharWasWhitespace ??= true;
896
900
  state.justClosedTag ??= false;
897
901
  state.isFirstTextInElement ??= false;
902
+ state.lastCharWasBackslash ??= false;
898
903
  let i = 0;
899
904
  const chunkLength = htmlChunk.length;
900
905
  while (i < chunkLength) {
@@ -922,23 +927,36 @@ function parseHTML(htmlChunk, state, handleEvent) {
922
927
  }
923
928
  state.lastCharWasWhitespace = true;
924
929
  state.textBufferContainsWhitespace = true;
930
+ state.lastCharWasBackslash = false;
925
931
  } else {
926
932
  state.textBufferContainsNonWhitespace = true;
927
933
  state.lastCharWasWhitespace = false;
928
934
  state.justClosedTag = false;
929
- if (currentCharCode === 124 && state.depthMap[TAG_TABLE]) {
935
+ if (currentCharCode === PIPE_CHAR && state.depthMap[TAG_TABLE]) {
930
936
  textBuffer += "\\|";
931
- } else if (currentCharCode === 96 && (state.depthMap[TAG_CODE] || state.depthMap[TAG_PRE])) {
937
+ } else if (currentCharCode === BACKTICK_CHAR && (state.depthMap[TAG_CODE] || state.depthMap[TAG_PRE])) {
932
938
  textBuffer += "\\`";
933
- } else if (currentCharCode === 91 && state.depthMap[TAG_A]) {
939
+ } else if (currentCharCode === OPEN_BRACKET_CHAR && state.depthMap[TAG_A]) {
934
940
  textBuffer += "\\[";
935
- } else if (currentCharCode === 93 && state.depthMap[TAG_A]) {
941
+ } else if (currentCharCode === CLOSE_BRACKET_CHAR && state.depthMap[TAG_A]) {
936
942
  textBuffer += "\\]";
937
- } else if (currentCharCode === 62 && state.depthMap[TAG_BLOCKQUOTE]) {
943
+ } else if (currentCharCode === GT_CHAR && state.depthMap[TAG_BLOCKQUOTE]) {
938
944
  textBuffer += "\\>";
939
945
  } else {
940
946
  textBuffer += htmlChunk[i];
941
947
  }
948
+ if (state.currentNode?.tagHandler?.isNonNesting) {
949
+ if (!state.lastCharWasBackslash) {
950
+ if (currentCharCode === APOS_CHAR && !state.inDoubleQuote && !state.inBacktick) {
951
+ state.inSingleQuote = !state.inSingleQuote;
952
+ } else if (currentCharCode === QUOTE_CHAR && !state.inSingleQuote && !state.inBacktick) {
953
+ state.inDoubleQuote = !state.inDoubleQuote;
954
+ } else if (currentCharCode === BACKTICK_CHAR && !state.inSingleQuote && !state.inDoubleQuote) {
955
+ state.inBacktick = !state.inBacktick;
956
+ }
957
+ }
958
+ }
959
+ state.lastCharWasBackslash = currentCharCode === BACKSLASH_CHAR;
942
960
  }
943
961
  i++;
944
962
  continue;
@@ -961,6 +979,12 @@ function parseHTML(htmlChunk, state, handleEvent) {
961
979
  break;
962
980
  }
963
981
  } else if (nextCharCode === SLASH_CHAR) {
982
+ const inQuotes = state.inSingleQuote || state.inDoubleQuote || state.inBacktick;
983
+ if (state.currentNode?.tagHandler?.isNonNesting && inQuotes) {
984
+ textBuffer += htmlChunk[i];
985
+ i++;
986
+ continue;
987
+ }
964
988
  if (textBuffer.length > 0) {
965
989
  processTextBuffer(textBuffer, state, handleEvent);
966
990
  textBuffer = "";
@@ -1043,7 +1067,7 @@ function processTextBuffer(textBuffer, state, handleEvent) {
1043
1067
  const firstBlockParent = parentsToIncrement[parentsToIncrement.length - 1];
1044
1068
  if (containsWhitespace && !firstBlockParent?.childTextNodeIndex) {
1045
1069
  let start = 0;
1046
- while (start < text.length && (inPreTag ? text.charCodeAt(start) === 10 || text.charCodeAt(start) === 13 : isWhitespace(text.charCodeAt(start)))) {
1070
+ while (start < text.length && (inPreTag ? text.charCodeAt(start) === NEWLINE_CHAR || text.charCodeAt(start) === CARRIAGE_RETURN_CHAR : isWhitespace(text.charCodeAt(start)))) {
1047
1071
  start++;
1048
1072
  }
1049
1073
  if (start > 0) {
@@ -1109,7 +1133,7 @@ function processClosingTag(htmlChunk, position, state, handleEvent) {
1109
1133
  }
1110
1134
  }
1111
1135
  if (curr) {
1112
- closeNode(state.currentNode, state, handleEvent);
1136
+ closeNode(curr, state, handleEvent);
1113
1137
  }
1114
1138
  state.justClosedTag = true;
1115
1139
  return {
@@ -1143,6 +1167,12 @@ function closeNode(node, state, handleEvent) {
1143
1167
  if (node.tagId) {
1144
1168
  state.depthMap[node.tagId] = Math.max(0, state.depthMap[node.tagId] - 1);
1145
1169
  }
1170
+ if (node.tagHandler?.isNonNesting) {
1171
+ state.inSingleQuote = false;
1172
+ state.inDoubleQuote = false;
1173
+ state.inBacktick = false;
1174
+ state.lastCharWasBackslash = false;
1175
+ }
1146
1176
  state.depth--;
1147
1177
  handleEvent({ type: NodeEventExit, node });
1148
1178
  state.currentNode = state.currentNode.parent;
@@ -1233,6 +1263,12 @@ function processOpeningTag(tagName, tagId, htmlChunk, i, state, handleEvent) {
1233
1263
  parentNode.currentWalkIndex = 0;
1234
1264
  state.currentNode = parentNode;
1235
1265
  state.hasEncodedHtmlEntity = false;
1266
+ if (tagHandler?.isNonNesting && !result.selfClosing) {
1267
+ state.inSingleQuote = false;
1268
+ state.inDoubleQuote = false;
1269
+ state.inBacktick = false;
1270
+ state.lastCharWasBackslash = false;
1271
+ }
1236
1272
  if (result.selfClosing) {
1237
1273
  closeNode(tag, state, handleEvent);
1238
1274
  state.justClosedTag = true;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "mdream",
3
3
  "type": "module",
4
- "version": "0.2.3",
4
+ "version": "0.2.4",
5
5
  "description": "Ultra-performant JavaScript HTML to Markdown converter optimized for LLMs.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",