mdream 0.13.0 → 0.13.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/splitter.mjs +45 -9
- package/package.json +1 -1
package/dist/splitter.mjs
CHANGED
|
@@ -62,10 +62,15 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
62
62
|
let currentHeaderText = "";
|
|
63
63
|
let lineNumber = 1;
|
|
64
64
|
let lastChunkEndPosition = 0;
|
|
65
|
-
|
|
65
|
+
let lastSplitPosition = 0;
|
|
66
|
+
function flushChunk(endPosition, applyOverlap = false) {
|
|
66
67
|
const currentMd = getCurrentMarkdown(processor.state);
|
|
67
|
-
const
|
|
68
|
-
|
|
68
|
+
const chunkEnd = endPosition ?? currentMd.length;
|
|
69
|
+
const chunkContent = currentMd.slice(lastChunkEndPosition, chunkEnd);
|
|
70
|
+
if (!chunkContent.trim()) {
|
|
71
|
+
lastChunkEndPosition = chunkEnd;
|
|
72
|
+
return;
|
|
73
|
+
}
|
|
69
74
|
const chunk = {
|
|
70
75
|
content: chunkContent.trimEnd(),
|
|
71
76
|
metadata: { loc: { lines: {
|
|
@@ -83,10 +88,12 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
83
88
|
if (currentChunkCodeLanguage) chunk.metadata.code = currentChunkCodeLanguage;
|
|
84
89
|
chunks.push(chunk);
|
|
85
90
|
currentChunkCodeLanguage = "";
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
91
|
+
lastSplitPosition = chunkEnd;
|
|
92
|
+
if (applyOverlap && opts.chunkOverlap > 0) {
|
|
93
|
+
const maxOverlap = Math.max(0, chunkContent.length - 1);
|
|
94
|
+
const actualOverlap = Math.min(opts.chunkOverlap, maxOverlap);
|
|
95
|
+
lastChunkEndPosition = chunkEnd - actualOverlap;
|
|
96
|
+
} else lastChunkEndPosition = chunkEnd;
|
|
90
97
|
lineNumber += (chunkContent.match(/\n/g) || []).length;
|
|
91
98
|
}
|
|
92
99
|
const parseState = {
|
|
@@ -133,7 +140,36 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
133
140
|
if (!opts.returnEachLine) {
|
|
134
141
|
const currentMd = getCurrentMarkdown(processor.state);
|
|
135
142
|
const currentChunkSize = opts.lengthFunction(currentMd.slice(lastChunkEndPosition));
|
|
136
|
-
if (currentChunkSize > opts.chunkSize)
|
|
143
|
+
if (currentChunkSize > opts.chunkSize) {
|
|
144
|
+
const idealSplitPos = lastChunkEndPosition + opts.chunkSize;
|
|
145
|
+
const separators = [
|
|
146
|
+
"\n\n",
|
|
147
|
+
"```\n",
|
|
148
|
+
"\n",
|
|
149
|
+
" "
|
|
150
|
+
];
|
|
151
|
+
let splitPosition = -1;
|
|
152
|
+
for (const sep of separators) {
|
|
153
|
+
const idx = currentMd.lastIndexOf(sep, idealSplitPos);
|
|
154
|
+
const candidateSplitPos = idx + sep.length;
|
|
155
|
+
if (idx >= 0) {
|
|
156
|
+
const beforeSplit = currentMd.slice(0, candidateSplitPos);
|
|
157
|
+
let backtickCount = 0;
|
|
158
|
+
let pos = 0;
|
|
159
|
+
while ((pos = beforeSplit.indexOf("```", pos)) !== -1) {
|
|
160
|
+
backtickCount++;
|
|
161
|
+
pos += 3;
|
|
162
|
+
}
|
|
163
|
+
if (backtickCount % 2 === 1) continue;
|
|
164
|
+
}
|
|
165
|
+
if (idx >= 0 && candidateSplitPos > lastSplitPosition) {
|
|
166
|
+
splitPosition = candidateSplitPos;
|
|
167
|
+
break;
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
if (splitPosition === -1 || splitPosition <= lastChunkEndPosition) splitPosition = currentMd.length;
|
|
171
|
+
flushChunk(splitPosition, true);
|
|
172
|
+
}
|
|
137
173
|
}
|
|
138
174
|
});
|
|
139
175
|
flushChunk();
|
|
@@ -159,7 +195,7 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
159
195
|
return lineChunks;
|
|
160
196
|
}
|
|
161
197
|
if (opts.stripHeaders) for (const chunk of chunks) chunk.content = chunk.content.split("\n").filter((line) => !line.match(/^#{1,6}\s+/)).join("\n").trim();
|
|
162
|
-
return chunks;
|
|
198
|
+
return chunks.filter((chunk) => chunk.content.length > 0);
|
|
163
199
|
}
|
|
164
200
|
|
|
165
201
|
//#endregion
|