mdream 0.13.3 → 0.15.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
package/dist/splitter.mjs CHANGED
@@ -1,5 +1,5 @@
1
- import { ELEMENT_NODE, NodeEventEnter, NodeEventExit, TAG_CODE, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HR, TAG_PRE, TEXT_NODE } from "./_chunks/const-BOAJ1T5c.mjs";
2
- import { createMarkdownProcessor, parseHtmlStream, processPluginsForEvent } from "./_chunks/markdown-processor-f7XT0--8.mjs";
1
+ import { $ as TAG_H2, F as TAG_CODE, Nt as TAG_PRE, Q as TAG_H1, et as TAG_H3, gn as TEXT_NODE, h as NodeEventExit, m as NodeEventEnter, nt as TAG_H5, ot as TAG_HR, r as ELEMENT_NODE, rt as TAG_H6, tt as TAG_H4 } from "./_chunks/const-Bf_XN9U9.mjs";
2
+ import { a as parseHtmlStream, n as createMarkdownProcessor, r as processPluginsForEvent } from "./_chunks/markdown-processor-D26Uo5td.mjs";
3
3
 
4
4
  //#region src/splitter.ts
5
5
  const DEFAULT_HEADERS_TO_SPLIT_ON = [
@@ -36,26 +36,22 @@ function shouldSplitOnHeader(tagId, options) {
36
36
  */
37
37
  function getCurrentMarkdown(state) {
38
38
  const fragments = [];
39
- for (const [regionId, content] of state.regionContentBuffers.entries()) {
40
- const include = state.regionToggles.get(regionId);
41
- if (include) fragments.push(...content);
42
- }
39
+ for (const [regionId, content] of state.regionContentBuffers.entries()) if (state.regionToggles.get(regionId)) fragments.push(...content);
43
40
  return fragments.join("").trimStart();
44
41
  }
45
42
  /**
46
43
  * Convert HTML to Markdown and split into chunks in single pass
47
- * Chunks are created during HTML event processing
44
+ * Yields chunks during HTML event processing for better memory efficiency
48
45
  */
49
- function htmlToMarkdownSplitChunks(html, options = {}) {
46
+ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
50
47
  const opts = createOptions(options);
51
48
  if (opts.chunkOverlap >= opts.chunkSize) throw new Error("chunkOverlap must be less than chunkSize");
52
49
  const processor = createMarkdownProcessor({
53
50
  origin: opts.origin,
54
51
  plugins: opts.plugins
55
52
  });
56
- const chunks = [];
57
- const headerHierarchy = new Map();
58
- const seenSplitHeaders = new Set();
53
+ const headerHierarchy = /* @__PURE__ */ new Map();
54
+ const seenSplitHeaders = /* @__PURE__ */ new Set();
59
55
  let currentChunkCodeLanguage = "";
60
56
  let collectingHeaderText = false;
61
57
  let currentHeaderTagId = null;
@@ -63,19 +59,27 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
63
59
  let lineNumber = 1;
64
60
  let lastChunkEndPosition = 0;
65
61
  let lastSplitPosition = 0;
66
- function flushChunk(endPosition, applyOverlap = false) {
62
+ function* flushChunk(endPosition, applyOverlap = false) {
67
63
  const currentMd = getCurrentMarkdown(processor.state);
68
64
  const chunkEnd = endPosition ?? currentMd.length;
69
- const chunkContent = currentMd.slice(lastChunkEndPosition, chunkEnd);
70
- if (!chunkContent.trim()) {
65
+ const originalChunkContent = currentMd.slice(lastChunkEndPosition, chunkEnd);
66
+ if (!originalChunkContent.trim()) {
71
67
  lastChunkEndPosition = chunkEnd;
72
68
  return;
73
69
  }
70
+ let chunkContent = originalChunkContent;
71
+ if (opts.stripHeaders) {
72
+ chunkContent = chunkContent.split("\n").filter((line) => !line.match(/^#{1,6}\s+/)).join("\n").trim();
73
+ if (!chunkContent) {
74
+ lastChunkEndPosition = chunkEnd;
75
+ return;
76
+ }
77
+ }
74
78
  const chunk = {
75
79
  content: chunkContent.trimEnd(),
76
80
  metadata: { loc: { lines: {
77
81
  from: lineNumber,
78
- to: lineNumber + (chunkContent.match(/\n/g) || []).length
82
+ to: lineNumber + (originalChunkContent.match(/\n/g) || []).length
79
83
  } } }
80
84
  };
81
85
  if (headerHierarchy.size > 0) {
@@ -86,22 +90,25 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
86
90
  }
87
91
  }
88
92
  if (currentChunkCodeLanguage) chunk.metadata.code = currentChunkCodeLanguage;
89
- chunks.push(chunk);
93
+ yield chunk;
90
94
  currentChunkCodeLanguage = "";
91
95
  lastSplitPosition = chunkEnd;
92
96
  if (applyOverlap && opts.chunkOverlap > 0) {
93
- const maxOverlap = Math.max(0, chunkContent.length - 1);
94
- const actualOverlap = Math.min(opts.chunkOverlap, maxOverlap);
95
- lastChunkEndPosition = chunkEnd - actualOverlap;
97
+ const maxOverlap = Math.max(0, originalChunkContent.length - 1);
98
+ lastChunkEndPosition = chunkEnd - Math.min(opts.chunkOverlap, maxOverlap);
96
99
  } else lastChunkEndPosition = chunkEnd;
97
- lineNumber += (chunkContent.match(/\n/g) || []).length;
100
+ lineNumber += (originalChunkContent.match(/\n/g) || []).length;
98
101
  }
99
102
  const parseState = {
100
103
  depthMap: processor.state.depthMap,
101
104
  depth: 0,
102
105
  plugins: opts.plugins
103
106
  };
107
+ const eventBuffer = [];
104
108
  parseHtmlStream(html, parseState, (event) => {
109
+ eventBuffer.push(event);
110
+ });
111
+ for (const event of eventBuffer) {
105
112
  const { type: eventType, node } = event;
106
113
  if (node.type === ELEMENT_NODE) {
107
114
  const element = node;
@@ -113,7 +120,7 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
113
120
  currentHeaderText = "";
114
121
  if (shouldSplitOnHeader(tagId, opts)) {
115
122
  if (seenSplitHeaders.has(tagId)) {
116
- flushChunk();
123
+ yield* flushChunk();
117
124
  for (let i = tagId; i <= TAG_H6; i++) headerHierarchy.delete(i);
118
125
  }
119
126
  seenSplitHeaders.add(tagId);
@@ -130,17 +137,13 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
130
137
  if (lang && !currentChunkCodeLanguage) currentChunkCodeLanguage = lang;
131
138
  }
132
139
  }
133
- if (tagId === TAG_HR && eventType === NodeEventEnter) flushChunk();
134
- }
135
- if (collectingHeaderText && node.type === TEXT_NODE) {
136
- const textNode = node;
137
- currentHeaderText += textNode.value;
140
+ if (tagId === TAG_HR && eventType === NodeEventEnter) yield* flushChunk();
138
141
  }
142
+ if (collectingHeaderText && node.type === TEXT_NODE) currentHeaderText += node.value;
139
143
  processPluginsForEvent(event, opts.plugins, processor.state, processor.processEvent);
140
144
  if (!opts.returnEachLine) {
141
145
  const currentMd = getCurrentMarkdown(processor.state);
142
- const currentChunkSize = opts.lengthFunction(currentMd.slice(lastChunkEndPosition));
143
- if (currentChunkSize > opts.chunkSize) {
146
+ if (opts.lengthFunction(currentMd.slice(lastChunkEndPosition)) > opts.chunkSize) {
144
147
  const idealSplitPos = lastChunkEndPosition + opts.chunkSize;
145
148
  const separators = [
146
149
  "\n\n",
@@ -168,11 +171,20 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
168
171
  }
169
172
  }
170
173
  if (splitPosition === -1 || splitPosition <= lastChunkEndPosition) splitPosition = currentMd.length;
171
- flushChunk(splitPosition, true);
174
+ yield* flushChunk(splitPosition, true);
172
175
  }
173
176
  }
174
- });
175
- flushChunk();
177
+ }
178
+ yield* flushChunk();
179
+ }
180
+ /**
181
+ * Convert HTML to Markdown and split into chunks in single pass
182
+ * Chunks are created during HTML event processing
183
+ */
184
+ function htmlToMarkdownSplitChunks(html, options = {}) {
185
+ const opts = createOptions(options);
186
+ const chunks = [];
187
+ for (const chunk of htmlToMarkdownSplitChunksStream(html, options)) chunks.push(chunk);
176
188
  if (opts.returnEachLine && chunks.length > 0) {
177
189
  const lineChunks = [];
178
190
  for (const chunk of chunks) {
@@ -194,9 +206,8 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
194
206
  }
195
207
  return lineChunks;
196
208
  }
197
- if (opts.stripHeaders) for (const chunk of chunks) chunk.content = chunk.content.split("\n").filter((line) => !line.match(/^#{1,6}\s+/)).join("\n").trim();
198
- return chunks.filter((chunk) => chunk.content.length > 0);
209
+ return chunks;
199
210
  }
200
211
 
201
212
  //#endregion
202
- export { htmlToMarkdownSplitChunks };
213
+ export { htmlToMarkdownSplitChunks, htmlToMarkdownSplitChunksStream };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "mdream",
3
3
  "type": "module",
4
- "version": "0.13.3",
4
+ "version": "0.15.0",
5
5
  "description": "Ultra-performant HTML to Markdown Convertor Optimized for LLMs and llm.txt artifacts.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",