mdream 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -22,12 +22,10 @@
22
22
 
23
23
  - 🧠 Content Extraction: [Readability.js]() scoring heuristics for [~50% fewer tokens*]() and improved accuracy.
24
24
  - 🔍 GitHub Flavored Markdown: Frontmatter, Nested & HTML markup support.
25
- - Tailwind CSS: Converts Tailwind CSS classes to Markdown for better readability.
26
25
 
27
26
  **Ultra Performant**
28
27
  - 🚀 Convert 1.4MB of HTML in [~50ms*]() with advanced streaming support, including content-based buffering.
29
28
  - ⚡ 5kB gzip, zero dependencies.
30
- - Streaming support
31
29
 
32
30
  **Adaptable**
33
31
 
@@ -46,7 +44,7 @@ human readability.
46
44
 
47
45
  Other LLM specific convertors focus on supporting _all_ document formats, resulting in larger bundles and lower quality Markdown output.
48
46
 
49
- Mdream is an ultra-performant HTML to Markdown converter built specifically for producing high-quality Markdown for LLMs as quickly as possible. It provides
47
+ Mdream is an HTML parser + Markdown generator built specifically for producing high-quality Markdown for LLMs as quickly as possible. It provides
50
48
  a powerful plugin system to customize the conversion process, allowing you to extract, transform, and filter content as needed.
51
49
 
52
50
  ## CLI Usage
@@ -0,0 +1,58 @@
1
+ import { HTMLToMarkdownOptions, NodeEvent, Plugin } from "./types-BHoibuoP.mjs";
2
+ import { ReadableStream } from "node:stream/web";
3
+
4
+ //#region src/const.d.ts
5
+
6
+ declare const TagIdMap: Record<string, number>;
7
+ //#endregion
8
+ //#region src/markdown-processor.d.ts
9
+
10
+ /**
11
+ * Creates a markdown processor that consumes DOM events and generates markdown
12
+ */
13
+ declare function createMarkdownProcessor(options?: HTMLToMarkdownOptions): {
14
+ processEvent: (event: NodeEvent) => void;
15
+ processHtml: (html: string) => void;
16
+ getMarkdown: () => string;
17
+ getMarkdownChunk: () => string;
18
+ };
19
+ declare const MarkdownProcessor: typeof createMarkdownProcessor;
20
+ //#endregion
21
+ //#region src/parse.d.ts
22
+ interface ParseOptions {
23
+ plugins?: Plugin[];
24
+ }
25
+ interface ParseResult {
26
+ events: NodeEvent[];
27
+ remainingHtml: string;
28
+ }
29
+ /**
30
+ * Pure HTML parser that emits DOM events
31
+ * Completely decoupled from markdown generation
32
+ */
33
+ declare function parseHtml(html: string, options?: ParseOptions): ParseResult;
34
+ /**
35
+ * Streaming HTML parser - calls onEvent for each DOM event
36
+ */
37
+ //#endregion
38
+ //#region src/pluggable/plugin.d.ts
39
+ /**
40
+ * Create a plugin that implements the Plugin interface with improved type inference
41
+ *
42
+ * @returns A complete plugin implementation
43
+ */
44
+ declare function createPlugin<T extends Partial<Plugin>>(plugin: T): Plugin;
45
+ //#endregion
46
+ //#region src/stream.d.ts
47
+ /**
48
+ * Creates a markdown stream from an HTML stream
49
+ * @param htmlStream - ReadableStream of HTML content (as Uint8Array or string)
50
+ * @param options - Configuration options for conversion
51
+ * @returns An async generator yielding markdown chunks
52
+ */
53
+ declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options?: HTMLToMarkdownOptions): AsyncIterable<string>;
54
+ //#endregion
55
+ //#region src/index.d.ts
56
+ declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
57
+ //#endregion
58
+ export { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
@@ -0,0 +1,299 @@
1
+ //#region src/buffer-region.ts
2
+ /**
3
+ * Creates a new buffer region
4
+ * Returns null if node already has a region assigned
5
+ */
6
+ function createBufferRegion(node, state, include) {
7
+ if (node.regionId) return null;
8
+ const id = state.regionToggles.size + 1;
9
+ node.regionId = id;
10
+ state.regionToggles.set(id, include);
11
+ state.regionContentBuffers.set(id, []);
12
+ return id;
13
+ }
14
+ /**
15
+ * Collects content for a node into appropriate buffer (optimized)
16
+ */
17
+ function collectNodeContent(node, content, state) {
18
+ if (!content) return;
19
+ const regionId = node.regionId || 0;
20
+ const targetBuffer = state.regionContentBuffers.get(regionId);
21
+ if (targetBuffer) {
22
+ targetBuffer.push(content);
23
+ state.lastContentCache = content;
24
+ }
25
+ }
26
+ /**
27
+ * Assembles final content from buffer regions and clears them after use
28
+ * Ensures frontmatter (regionId -1) appears first, followed by other included regions
29
+ */
30
+ function assembleBufferedContent(state) {
31
+ const fragments = [];
32
+ for (const [regionId, content] of Array.from(state.regionContentBuffers.entries())) {
33
+ const include = state.regionToggles.get(regionId);
34
+ if (include) fragments.push(...content);
35
+ }
36
+ state.regionToggles.clear();
37
+ state.regionContentBuffers.clear();
38
+ return fragments.join("").trimStart();
39
+ }
40
+
41
+ //#endregion
42
+ //#region src/const.ts
// Numeric ids for the HTML tag names known to this module. The ids are
// contiguous, running from 0 (TAG_HTML) to 107 (TAG_CAPTION).

// Document structure and metadata
const TAG_HTML = 0;
const TAG_HEAD = 1;
const TAG_DETAILS = 2;
const TAG_SUMMARY = 3;
const TAG_TITLE = 4;
const TAG_META = 5;
const TAG_BR = 6;

// Headings and rules
const TAG_H1 = 7;
const TAG_H2 = 8;
const TAG_H3 = 9;
const TAG_H4 = 10;
const TAG_H5 = 11;
const TAG_H6 = 12;
const TAG_HR = 13;

// Inline formatting
const TAG_STRONG = 14;
const TAG_B = 15;
const TAG_EM = 16;
const TAG_I = 17;
const TAG_DEL = 18;
const TAG_SUB = 19;
const TAG_SUP = 20;
const TAG_INS = 21;

// Quotes, code, lists, links, images
const TAG_BLOCKQUOTE = 22;
const TAG_CODE = 23;
const TAG_UL = 24;
const TAG_LI = 25;
const TAG_A = 26;
const TAG_IMG = 27;

// Tables
const TAG_TABLE = 28;
const TAG_THEAD = 29;
const TAG_TR = 30;
const TAG_TH = 31;
const TAG_TD = 32;

// Remaining tags, in id order
const TAG_OL = 33;
const TAG_PRE = 34;
const TAG_P = 35;
const TAG_DIV = 36;
const TAG_SPAN = 37;
const TAG_TBODY = 38;
const TAG_TFOOT = 39;
const TAG_FORM = 40;
const TAG_NAV = 41;
const TAG_LABEL = 42;
const TAG_BUTTON = 43;
const TAG_BODY = 44;
const TAG_CENTER = 45;
const TAG_KBD = 46;
const TAG_FOOTER = 47;
const TAG_PATH = 48;
const TAG_SVG = 49;
const TAG_ARTICLE = 50;
const TAG_SECTION = 51;
const TAG_SCRIPT = 52;
const TAG_STYLE = 53;
const TAG_LINK = 54;
const TAG_AREA = 55;
const TAG_BASE = 56;
const TAG_COL = 57;
const TAG_EMBED = 58;
const TAG_INPUT = 59;
const TAG_KEYGEN = 60;
const TAG_PARAM = 61;
const TAG_SOURCE = 62;
const TAG_TRACK = 63;
const TAG_WBR = 64;
const TAG_SELECT = 65;
const TAG_TEXTAREA = 66;
const TAG_OPTION = 67;
const TAG_FIELDSET = 68;
const TAG_LEGEND = 69;
const TAG_AUDIO = 70;
const TAG_VIDEO = 71;
const TAG_CANVAS = 72;
const TAG_IFRAME = 73;
const TAG_MAP = 74;
const TAG_DIALOG = 75;
const TAG_METER = 76;
const TAG_PROGRESS = 77;
const TAG_TEMPLATE = 78;
const TAG_ABBR = 79;
const TAG_MARK = 80;
const TAG_Q = 81;
const TAG_SAMP = 82;
const TAG_SMALL = 83;
const TAG_NOSCRIPT = 84;
const TAG_NOFRAMES = 85;
const TAG_XMP = 86;
const TAG_PLAINTEXT = 87;
const TAG_ASIDE = 88;
const TAG_U = 89;
const TAG_CITE = 90;
const TAG_DFN = 91;
const TAG_VAR = 92;
const TAG_TIME = 93;
const TAG_BDO = 94;
const TAG_RUBY = 95;
const TAG_RT = 96;
const TAG_RP = 97;
const TAG_DD = 98;
const TAG_DT = 99;
const TAG_ADDRESS = 100;
const TAG_DL = 101;
const TAG_FIGURE = 102;
const TAG_OBJECT = 103;
const TAG_MAIN = 104;
const TAG_HEADER = 105;
const TAG_FIGCAPTION = 106;
const TAG_CAPTION = 107;

// One more than the highest id assigned above.
const MAX_TAG_ID = 108;

// Replacement text for the named/numeric entities handled here.
// Note that "&nbsp;" maps to a regular space.
const HTML_ENTITIES = {
  "&amp;": "&",
  "&lt;": "<",
  "&gt;": ">",
  "&quot;": '"',
  "&#39;": "'",
  "&apos;": "'",
  "&nbsp;": " "
};

// Node kinds used by this module.
const ELEMENT_NODE = 1;
const TEXT_NODE = 2;

// Event kinds: entering and leaving a node.
const NodeEventEnter = 0;
const NodeEventExit = 1;

// Lookup from lowercase tag name to its numeric id.
const TagIdMap = {
  html: TAG_HTML,
  head: TAG_HEAD,
  details: TAG_DETAILS,
  summary: TAG_SUMMARY,
  title: TAG_TITLE,
  meta: TAG_META,
  br: TAG_BR,
  h1: TAG_H1,
  h2: TAG_H2,
  h3: TAG_H3,
  h4: TAG_H4,
  h5: TAG_H5,
  h6: TAG_H6,
  hr: TAG_HR,
  strong: TAG_STRONG,
  b: TAG_B,
  em: TAG_EM,
  i: TAG_I,
  del: TAG_DEL,
  sub: TAG_SUB,
  sup: TAG_SUP,
  ins: TAG_INS,
  blockquote: TAG_BLOCKQUOTE,
  code: TAG_CODE,
  ul: TAG_UL,
  li: TAG_LI,
  a: TAG_A,
  img: TAG_IMG,
  table: TAG_TABLE,
  thead: TAG_THEAD,
  tr: TAG_TR,
  th: TAG_TH,
  td: TAG_TD,
  ol: TAG_OL,
  pre: TAG_PRE,
  p: TAG_P,
  div: TAG_DIV,
  span: TAG_SPAN,
  tbody: TAG_TBODY,
  tfoot: TAG_TFOOT,
  form: TAG_FORM,
  nav: TAG_NAV,
  label: TAG_LABEL,
  button: TAG_BUTTON,
  body: TAG_BODY,
  center: TAG_CENTER,
  kbd: TAG_KBD,
  footer: TAG_FOOTER,
  path: TAG_PATH,
  svg: TAG_SVG,
  article: TAG_ARTICLE,
  section: TAG_SECTION,
  script: TAG_SCRIPT,
  style: TAG_STYLE,
  link: TAG_LINK,
  area: TAG_AREA,
  base: TAG_BASE,
  col: TAG_COL,
  embed: TAG_EMBED,
  input: TAG_INPUT,
  keygen: TAG_KEYGEN,
  param: TAG_PARAM,
  source: TAG_SOURCE,
  track: TAG_TRACK,
  wbr: TAG_WBR,
  select: TAG_SELECT,
  textarea: TAG_TEXTAREA,
  option: TAG_OPTION,
  fieldset: TAG_FIELDSET,
  legend: TAG_LEGEND,
  audio: TAG_AUDIO,
  video: TAG_VIDEO,
  canvas: TAG_CANVAS,
  iframe: TAG_IFRAME,
  map: TAG_MAP,
  dialog: TAG_DIALOG,
  meter: TAG_METER,
  progress: TAG_PROGRESS,
  template: TAG_TEMPLATE,
  abbr: TAG_ABBR,
  mark: TAG_MARK,
  q: TAG_Q,
  samp: TAG_SAMP,
  small: TAG_SMALL,
  noscript: TAG_NOSCRIPT,
  noframes: TAG_NOFRAMES,
  xmp: TAG_XMP,
  plaintext: TAG_PLAINTEXT,
  aside: TAG_ASIDE,
  u: TAG_U,
  cite: TAG_CITE,
  dfn: TAG_DFN,
  var: TAG_VAR,
  time: TAG_TIME,
  bdo: TAG_BDO,
  ruby: TAG_RUBY,
  rt: TAG_RT,
  rp: TAG_RP,
  dd: TAG_DD,
  dt: TAG_DT,
  dl: TAG_DL,
  address: TAG_ADDRESS,
  figure: TAG_FIGURE,
  object: TAG_OBJECT,
  main: TAG_MAIN,
  header: TAG_HEADER,
  figcaption: TAG_FIGCAPTION,
  caption: TAG_CAPTION
};

// Markdown tokens emitted for the corresponding constructs.
const MARKDOWN_STRONG = "**";
const MARKDOWN_EMPHASIS = "_";
const MARKDOWN_STRIKETHROUGH = "~~";
const MARKDOWN_CODE_BLOCK = "```";
const MARKDOWN_INLINE_CODE = "`";
const MARKDOWN_HORIZONTAL_RULE = "---";

// Spacing pairs. NOTE(review): presumably [newlines before, newlines after]
// an element — confirm against the code that consumes them.
const NO_SPACING = [0, 0];
const DEFAULT_BLOCK_SPACING = [2, 2];
const BLOCKQUOTE_SPACING = [1, 1];
const LIST_ITEM_SPACING = [1, 0];
const TABLE_ROW_SPACING = [0, 1];
286
+
287
+ //#endregion
288
+ //#region src/pluggable/plugin.ts
/**
 * Identity helper for declaring a plugin.
 *
 * At runtime the argument is handed back unchanged (same object reference);
 * the function performs no validation and fills in no defaults.
 *
 * @param plugin - The plugin definition.
 * @returns The very same `plugin` object that was passed in.
 */
function createPlugin(plugin) {
  return plugin;
}
297
+
298
+ //#endregion
299
+ export { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CAPTION, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAIN, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap as TagIdMap$1, assembleBufferedContent, collectNodeContent, createBufferRegion, createPlugin as createPlugin$1 };