mdream 0.13.0 → 0.13.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/splitter.mjs +45 -9
  2. package/package.json +1 -1
package/dist/splitter.mjs CHANGED
@@ -62,10 +62,15 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
62
62
  let currentHeaderText = "";
63
63
  let lineNumber = 1;
64
64
  let lastChunkEndPosition = 0;
65
- function flushChunk() {
65
+ let lastSplitPosition = 0;
66
+ function flushChunk(endPosition, applyOverlap = false) {
66
67
  const currentMd = getCurrentMarkdown(processor.state);
67
- const chunkContent = currentMd.slice(lastChunkEndPosition);
68
- if (!chunkContent.trim()) return;
68
+ const chunkEnd = endPosition ?? currentMd.length;
69
+ const chunkContent = currentMd.slice(lastChunkEndPosition, chunkEnd);
70
+ if (!chunkContent.trim()) {
71
+ lastChunkEndPosition = chunkEnd;
72
+ return;
73
+ }
69
74
  const chunk = {
70
75
  content: chunkContent.trimEnd(),
71
76
  metadata: { loc: { lines: {
@@ -83,10 +88,12 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
83
88
  if (currentChunkCodeLanguage) chunk.metadata.code = currentChunkCodeLanguage;
84
89
  chunks.push(chunk);
85
90
  currentChunkCodeLanguage = "";
86
- if (opts.chunkOverlap > 0) {
87
- const overlapText = chunkContent.slice(-opts.chunkOverlap);
88
- lastChunkEndPosition = currentMd.length - overlapText.length;
89
- } else lastChunkEndPosition = currentMd.length;
91
+ lastSplitPosition = chunkEnd;
92
+ if (applyOverlap && opts.chunkOverlap > 0) {
93
+ const maxOverlap = Math.max(0, chunkContent.length - 1);
94
+ const actualOverlap = Math.min(opts.chunkOverlap, maxOverlap);
95
+ lastChunkEndPosition = chunkEnd - actualOverlap;
96
+ } else lastChunkEndPosition = chunkEnd;
90
97
  lineNumber += (chunkContent.match(/\n/g) || []).length;
91
98
  }
92
99
  const parseState = {
@@ -133,7 +140,36 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
133
140
  if (!opts.returnEachLine) {
134
141
  const currentMd = getCurrentMarkdown(processor.state);
135
142
  const currentChunkSize = opts.lengthFunction(currentMd.slice(lastChunkEndPosition));
136
- if (currentChunkSize > opts.chunkSize) flushChunk();
143
+ if (currentChunkSize > opts.chunkSize) {
144
+ const idealSplitPos = lastChunkEndPosition + opts.chunkSize;
145
+ const separators = [
146
+ "\n\n",
147
+ "```\n",
148
+ "\n",
149
+ " "
150
+ ];
151
+ let splitPosition = -1;
152
+ for (const sep of separators) {
153
+ const idx = currentMd.lastIndexOf(sep, idealSplitPos);
154
+ const candidateSplitPos = idx + sep.length;
155
+ if (idx >= 0) {
156
+ const beforeSplit = currentMd.slice(0, candidateSplitPos);
157
+ let backtickCount = 0;
158
+ let pos = 0;
159
+ while ((pos = beforeSplit.indexOf("```", pos)) !== -1) {
160
+ backtickCount++;
161
+ pos += 3;
162
+ }
163
+ if (backtickCount % 2 === 1) continue;
164
+ }
165
+ if (idx >= 0 && candidateSplitPos > lastSplitPosition) {
166
+ splitPosition = candidateSplitPos;
167
+ break;
168
+ }
169
+ }
170
+ if (splitPosition === -1 || splitPosition <= lastChunkEndPosition) splitPosition = currentMd.length;
171
+ flushChunk(splitPosition, true);
172
+ }
137
173
  }
138
174
  });
139
175
  flushChunk();
@@ -159,7 +195,7 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
159
195
  return lineChunks;
160
196
  }
161
197
  if (opts.stripHeaders) for (const chunk of chunks) chunk.content = chunk.content.split("\n").filter((line) => !line.match(/^#{1,6}\s+/)).join("\n").trim();
162
- return chunks;
198
+ return chunks.filter((chunk) => chunk.content.length > 0);
163
199
  }
164
200
 
165
201
  //#endregion
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "mdream",
3
3
  "type": "module",
4
- "version": "0.13.0",
4
+ "version": "0.13.2",
5
5
  "description": "Ultra-performant HTML to Markdown Convertor Optimized for LLMs and llm.txt artifacts.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",