mdream 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -2
- package/dist/cli.mjs +1 -1
- package/dist/index.d.mts +2 -2
- package/dist/index.d.ts +2 -2
- package/dist/index.mjs +2 -2
- package/dist/plugins.d.mts +1 -1
- package/dist/plugins.d.ts +1 -1
- package/dist/preset/minimal.d.mts +1 -1
- package/dist/preset/minimal.d.ts +1 -1
- package/dist/shared/{mdream.C9ruFMrk.d.mts → mdream.-SGj02be.d.mts} +6 -0
- package/dist/shared/{mdream.C9ruFMrk.d.ts → mdream.-SGj02be.d.ts} +6 -0
- package/dist/shared/{mdream.CRBi8vE8.mjs → mdream.CsDVbUMp.mjs} +43 -7
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -49,8 +49,6 @@ Other LLM specific convertors focus on supporting _all_ document formats, result
|
|
|
49
49
|
Mdream is an ultra-performant HTML to Markdown converter built specifically for producing high-quality Markdown for LLMs as quickly as possible. It provides
|
|
50
50
|
a powerful plugin system to customize the conversion process, allowing you to extract, transform, and filter content as needed.
|
|
51
51
|
|
|
52
|
-
Perfect for: RAG systems, web scraping, content extraction, ChatGPT/Claude integration, and large-scale document processing.
|
|
53
|
-
|
|
54
52
|
## CLI Usage
|
|
55
53
|
|
|
56
54
|
The Mdream CLI is designed to work exclusively with Unix pipes, providing flexibility and freedom to integrate with other tools.
|
package/dist/cli.mjs
CHANGED
|
@@ -2,7 +2,7 @@ import { Readable } from 'node:stream';
|
|
|
2
2
|
import { cac } from 'cac';
|
|
3
3
|
import { f as frontmatterPlugin } from './shared/mdream.C6Z2rfeq.mjs';
|
|
4
4
|
import { r as readabilityPlugin } from './shared/mdream.DMUbnRbh.mjs';
|
|
5
|
-
import { s as streamHtmlToMarkdown } from './shared/mdream.CRBi8vE8.mjs';
|
|
5
|
+
import { s as streamHtmlToMarkdown } from './shared/mdream.CsDVbUMp.mjs';
|
|
6
6
|
import './shared/mdream.Ch6B8TEB.mjs';
|
|
7
7
|
|
|
8
8
|
async function streamingConvert(options = {}) {
|
package/dist/index.d.mts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { H as HTMLToMarkdownOptions } from './shared/mdream.C9ruFMrk.mjs';
|
|
2
|
-
export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.C9ruFMrk.mjs';
|
|
1
|
+
import { H as HTMLToMarkdownOptions } from './shared/mdream.-SGj02be.mjs';
|
|
2
|
+
export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.-SGj02be.mjs';
|
|
3
3
|
import { ReadableStream } from 'node:stream/web';
|
|
4
4
|
|
|
5
5
|
/**
|
package/dist/index.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { H as HTMLToMarkdownOptions } from './shared/mdream.C9ruFMrk.js';
|
|
2
|
-
export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.C9ruFMrk.js';
|
|
1
|
+
import { H as HTMLToMarkdownOptions } from './shared/mdream.-SGj02be.js';
|
|
2
|
+
export { B as BufferRegion, E as ELEMENT_NODE, b as ElementNode, f as HandlerContext, M as MdreamProcessingState, d as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.-SGj02be.js';
|
|
3
3
|
import { ReadableStream } from 'node:stream/web';
|
|
4
4
|
|
|
5
5
|
/**
|
package/dist/index.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { p as processPartialHTMLToMarkdown } from './shared/mdream.CRBi8vE8.mjs';
|
|
2
|
-
export { s as streamHtmlToMarkdown } from './shared/mdream.CRBi8vE8.mjs';
|
|
1
|
+
import { p as processPartialHTMLToMarkdown } from './shared/mdream.CsDVbUMp.mjs';
|
|
2
|
+
export { s as streamHtmlToMarkdown } from './shared/mdream.CsDVbUMp.mjs';
|
|
3
3
|
import './shared/mdream.Ch6B8TEB.mjs';
|
|
4
4
|
|
|
5
5
|
function htmlToMarkdown(html, options = {}) {
|
package/dist/plugins.d.mts
CHANGED
package/dist/plugins.d.ts
CHANGED
package/dist/preset/minimal.d.ts
CHANGED
|
@@ -147,6 +147,12 @@ interface MdreamProcessingState {
|
|
|
147
147
|
isFirstTextInElement?: boolean;
|
|
148
148
|
/** Reference to the last processed text node - for context tracking */
|
|
149
149
|
lastTextNode?: Node;
|
|
150
|
+
/** Quote state tracking for non-nesting tags - avoids backward scanning */
|
|
151
|
+
inSingleQuote?: boolean;
|
|
152
|
+
inDoubleQuote?: boolean;
|
|
153
|
+
inBacktick?: boolean;
|
|
154
|
+
/** Backslash escaping state tracking - avoids checking previous character */
|
|
155
|
+
lastCharWasBackslash?: boolean;
|
|
150
156
|
/** Plugin instances array for efficient iteration */
|
|
151
157
|
plugins?: Plugin[];
|
|
152
158
|
/** Configuration options for conversion */
|
|
@@ -147,6 +147,12 @@ interface MdreamProcessingState {
|
|
|
147
147
|
isFirstTextInElement?: boolean;
|
|
148
148
|
/** Reference to the last processed text node - for context tracking */
|
|
149
149
|
lastTextNode?: Node;
|
|
150
|
+
/** Quote state tracking for non-nesting tags - avoids backward scanning */
|
|
151
|
+
inSingleQuote?: boolean;
|
|
152
|
+
inDoubleQuote?: boolean;
|
|
153
|
+
inBacktick?: boolean;
|
|
154
|
+
/** Backslash escaping state tracking - avoids checking previous character */
|
|
155
|
+
lastCharWasBackslash?: boolean;
|
|
150
156
|
/** Plugin instances array for efficient iteration */
|
|
151
157
|
plugins?: Plugin[];
|
|
152
158
|
/** Configuration options for conversion */
|
|
@@ -872,6 +872,10 @@ const SPACE_CHAR = 32;
|
|
|
872
872
|
const TAB_CHAR = 9;
|
|
873
873
|
const NEWLINE_CHAR = 10;
|
|
874
874
|
const CARRIAGE_RETURN_CHAR = 13;
|
|
875
|
+
const BACKTICK_CHAR = 96;
|
|
876
|
+
const PIPE_CHAR = 124;
|
|
877
|
+
const OPEN_BRACKET_CHAR = 91;
|
|
878
|
+
const CLOSE_BRACKET_CHAR = 93;
|
|
875
879
|
const EMPTY_ATTRIBUTES = Object.freeze({});
|
|
876
880
|
function copyDepthMap(depthMap) {
|
|
877
881
|
return new Uint8Array(depthMap);
|
|
@@ -895,6 +899,7 @@ function parseHTML(htmlChunk, state, handleEvent) {
|
|
|
895
899
|
state.lastCharWasWhitespace ??= true;
|
|
896
900
|
state.justClosedTag ??= false;
|
|
897
901
|
state.isFirstTextInElement ??= false;
|
|
902
|
+
state.lastCharWasBackslash ??= false;
|
|
898
903
|
let i = 0;
|
|
899
904
|
const chunkLength = htmlChunk.length;
|
|
900
905
|
while (i < chunkLength) {
|
|
@@ -922,23 +927,36 @@ function parseHTML(htmlChunk, state, handleEvent) {
|
|
|
922
927
|
}
|
|
923
928
|
state.lastCharWasWhitespace = true;
|
|
924
929
|
state.textBufferContainsWhitespace = true;
|
|
930
|
+
state.lastCharWasBackslash = false;
|
|
925
931
|
} else {
|
|
926
932
|
state.textBufferContainsNonWhitespace = true;
|
|
927
933
|
state.lastCharWasWhitespace = false;
|
|
928
934
|
state.justClosedTag = false;
|
|
929
|
-
if (currentCharCode ===
|
|
935
|
+
if (currentCharCode === PIPE_CHAR && state.depthMap[TAG_TABLE]) {
|
|
930
936
|
textBuffer += "\\|";
|
|
931
|
-
} else if (currentCharCode ===
|
|
937
|
+
} else if (currentCharCode === BACKTICK_CHAR && (state.depthMap[TAG_CODE] || state.depthMap[TAG_PRE])) {
|
|
932
938
|
textBuffer += "\\`";
|
|
933
|
-
} else if (currentCharCode ===
|
|
939
|
+
} else if (currentCharCode === OPEN_BRACKET_CHAR && state.depthMap[TAG_A]) {
|
|
934
940
|
textBuffer += "\\[";
|
|
935
|
-
} else if (currentCharCode ===
|
|
941
|
+
} else if (currentCharCode === CLOSE_BRACKET_CHAR && state.depthMap[TAG_A]) {
|
|
936
942
|
textBuffer += "\\]";
|
|
937
|
-
} else if (currentCharCode ===
|
|
943
|
+
} else if (currentCharCode === GT_CHAR && state.depthMap[TAG_BLOCKQUOTE]) {
|
|
938
944
|
textBuffer += "\\>";
|
|
939
945
|
} else {
|
|
940
946
|
textBuffer += htmlChunk[i];
|
|
941
947
|
}
|
|
948
|
+
if (state.currentNode?.tagHandler?.isNonNesting) {
|
|
949
|
+
if (!state.lastCharWasBackslash) {
|
|
950
|
+
if (currentCharCode === APOS_CHAR && !state.inDoubleQuote && !state.inBacktick) {
|
|
951
|
+
state.inSingleQuote = !state.inSingleQuote;
|
|
952
|
+
} else if (currentCharCode === QUOTE_CHAR && !state.inSingleQuote && !state.inBacktick) {
|
|
953
|
+
state.inDoubleQuote = !state.inDoubleQuote;
|
|
954
|
+
} else if (currentCharCode === BACKTICK_CHAR && !state.inSingleQuote && !state.inDoubleQuote) {
|
|
955
|
+
state.inBacktick = !state.inBacktick;
|
|
956
|
+
}
|
|
957
|
+
}
|
|
958
|
+
}
|
|
959
|
+
state.lastCharWasBackslash = currentCharCode === BACKSLASH_CHAR;
|
|
942
960
|
}
|
|
943
961
|
i++;
|
|
944
962
|
continue;
|
|
@@ -961,6 +979,12 @@ function parseHTML(htmlChunk, state, handleEvent) {
|
|
|
961
979
|
break;
|
|
962
980
|
}
|
|
963
981
|
} else if (nextCharCode === SLASH_CHAR) {
|
|
982
|
+
const inQuotes = state.inSingleQuote || state.inDoubleQuote || state.inBacktick;
|
|
983
|
+
if (state.currentNode?.tagHandler?.isNonNesting && inQuotes) {
|
|
984
|
+
textBuffer += htmlChunk[i];
|
|
985
|
+
i++;
|
|
986
|
+
continue;
|
|
987
|
+
}
|
|
964
988
|
if (textBuffer.length > 0) {
|
|
965
989
|
processTextBuffer(textBuffer, state, handleEvent);
|
|
966
990
|
textBuffer = "";
|
|
@@ -1043,7 +1067,7 @@ function processTextBuffer(textBuffer, state, handleEvent) {
|
|
|
1043
1067
|
const firstBlockParent = parentsToIncrement[parentsToIncrement.length - 1];
|
|
1044
1068
|
if (containsWhitespace && !firstBlockParent?.childTextNodeIndex) {
|
|
1045
1069
|
let start = 0;
|
|
1046
|
-
while (start < text.length && (inPreTag ? text.charCodeAt(start) ===
|
|
1070
|
+
while (start < text.length && (inPreTag ? text.charCodeAt(start) === NEWLINE_CHAR || text.charCodeAt(start) === CARRIAGE_RETURN_CHAR : isWhitespace(text.charCodeAt(start)))) {
|
|
1047
1071
|
start++;
|
|
1048
1072
|
}
|
|
1049
1073
|
if (start > 0) {
|
|
@@ -1109,7 +1133,7 @@ function processClosingTag(htmlChunk, position, state, handleEvent) {
|
|
|
1109
1133
|
}
|
|
1110
1134
|
}
|
|
1111
1135
|
if (curr) {
|
|
1112
|
-
closeNode(
|
|
1136
|
+
closeNode(curr, state, handleEvent);
|
|
1113
1137
|
}
|
|
1114
1138
|
state.justClosedTag = true;
|
|
1115
1139
|
return {
|
|
@@ -1143,6 +1167,12 @@ function closeNode(node, state, handleEvent) {
|
|
|
1143
1167
|
if (node.tagId) {
|
|
1144
1168
|
state.depthMap[node.tagId] = Math.max(0, state.depthMap[node.tagId] - 1);
|
|
1145
1169
|
}
|
|
1170
|
+
if (node.tagHandler?.isNonNesting) {
|
|
1171
|
+
state.inSingleQuote = false;
|
|
1172
|
+
state.inDoubleQuote = false;
|
|
1173
|
+
state.inBacktick = false;
|
|
1174
|
+
state.lastCharWasBackslash = false;
|
|
1175
|
+
}
|
|
1146
1176
|
state.depth--;
|
|
1147
1177
|
handleEvent({ type: NodeEventExit, node });
|
|
1148
1178
|
state.currentNode = state.currentNode.parent;
|
|
@@ -1233,6 +1263,12 @@ function processOpeningTag(tagName, tagId, htmlChunk, i, state, handleEvent) {
|
|
|
1233
1263
|
parentNode.currentWalkIndex = 0;
|
|
1234
1264
|
state.currentNode = parentNode;
|
|
1235
1265
|
state.hasEncodedHtmlEntity = false;
|
|
1266
|
+
if (tagHandler?.isNonNesting && !result.selfClosing) {
|
|
1267
|
+
state.inSingleQuote = false;
|
|
1268
|
+
state.inDoubleQuote = false;
|
|
1269
|
+
state.inBacktick = false;
|
|
1270
|
+
state.lastCharWasBackslash = false;
|
|
1271
|
+
}
|
|
1236
1272
|
if (result.selfClosing) {
|
|
1237
1273
|
closeNode(tag, state, handleEvent);
|
|
1238
1274
|
state.justClosedTag = true;
|