mdream 0.13.3 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +111 -0
- package/dist/_chunks/{const-BOAJ1T5c.mjs → const-Bf_XN9U9.mjs} +2 -5
- package/dist/_chunks/{extraction-BPaDGYvv.mjs → extraction-BA9MDtq3.mjs} +4 -6
- package/dist/_chunks/{llms-txt-DC12yO2l.mjs → llms-txt-T79S7X24.mjs} +123 -35
- package/dist/_chunks/{markdown-processor-f7XT0--8.mjs → markdown-processor-D26Uo5td.mjs} +35 -64
- package/dist/_chunks/{minimal-co1tIZYm.mjs → minimal-BiDhcwif.mjs} +3 -3
- package/dist/_chunks/{plugin-DrovQriD.mjs → plugin-CjWWQTuL.mjs} +1 -1
- package/dist/_chunks/{plugin-CgnpSqtP.d.mts → plugin-D5soyEXm.d.mts} +2 -2
- package/dist/_chunks/{plugins-C5_irVJs.mjs → plugins-DJnqR2fA.mjs} +23 -41
- package/dist/_chunks/{src-C3QpB75q.mjs → src-BJpipdul.mjs} +3 -4
- package/dist/_chunks/{types-DqiI86yW.d.mts → types-CT4ZxeOH.d.mts} +1 -1
- package/dist/cli.mjs +12 -18
- package/dist/iife.js +8 -18
- package/dist/index.d.mts +2 -5
- package/dist/index.mjs +4 -4
- package/dist/llms-txt.d.mts +45 -1
- package/dist/llms-txt.mjs +7 -7
- package/dist/plugins.d.mts +2 -2
- package/dist/plugins.mjs +4 -4
- package/dist/preset/minimal.d.mts +1 -1
- package/dist/preset/minimal.mjs +5 -5
- package/dist/splitter.d.mts +7 -2
- package/dist/splitter.mjs +45 -34
- package/package.json +1 -1
package/dist/splitter.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
1
|
+
import { $ as TAG_H2, F as TAG_CODE, Nt as TAG_PRE, Q as TAG_H1, et as TAG_H3, gn as TEXT_NODE, h as NodeEventExit, m as NodeEventEnter, nt as TAG_H5, ot as TAG_HR, r as ELEMENT_NODE, rt as TAG_H6, tt as TAG_H4 } from "./_chunks/const-Bf_XN9U9.mjs";
|
|
2
|
+
import { a as parseHtmlStream, n as createMarkdownProcessor, r as processPluginsForEvent } from "./_chunks/markdown-processor-D26Uo5td.mjs";
|
|
3
3
|
|
|
4
4
|
//#region src/splitter.ts
|
|
5
5
|
const DEFAULT_HEADERS_TO_SPLIT_ON = [
|
|
@@ -36,26 +36,22 @@ function shouldSplitOnHeader(tagId, options) {
|
|
|
36
36
|
*/
|
|
37
37
|
function getCurrentMarkdown(state) {
|
|
38
38
|
const fragments = [];
|
|
39
|
-
for (const [regionId, content] of state.regionContentBuffers.entries())
|
|
40
|
-
const include = state.regionToggles.get(regionId);
|
|
41
|
-
if (include) fragments.push(...content);
|
|
42
|
-
}
|
|
39
|
+
for (const [regionId, content] of state.regionContentBuffers.entries()) if (state.regionToggles.get(regionId)) fragments.push(...content);
|
|
43
40
|
return fragments.join("").trimStart();
|
|
44
41
|
}
|
|
45
42
|
/**
|
|
46
43
|
* Convert HTML to Markdown and split into chunks in single pass
|
|
47
|
-
*
|
|
44
|
+
* Yields chunks during HTML event processing for better memory efficiency
|
|
48
45
|
*/
|
|
49
|
-
function
|
|
46
|
+
function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
50
47
|
const opts = createOptions(options);
|
|
51
48
|
if (opts.chunkOverlap >= opts.chunkSize) throw new Error("chunkOverlap must be less than chunkSize");
|
|
52
49
|
const processor = createMarkdownProcessor({
|
|
53
50
|
origin: opts.origin,
|
|
54
51
|
plugins: opts.plugins
|
|
55
52
|
});
|
|
56
|
-
const
|
|
57
|
-
const
|
|
58
|
-
const seenSplitHeaders = new Set();
|
|
53
|
+
const headerHierarchy = /* @__PURE__ */ new Map();
|
|
54
|
+
const seenSplitHeaders = /* @__PURE__ */ new Set();
|
|
59
55
|
let currentChunkCodeLanguage = "";
|
|
60
56
|
let collectingHeaderText = false;
|
|
61
57
|
let currentHeaderTagId = null;
|
|
@@ -63,19 +59,27 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
63
59
|
let lineNumber = 1;
|
|
64
60
|
let lastChunkEndPosition = 0;
|
|
65
61
|
let lastSplitPosition = 0;
|
|
66
|
-
function flushChunk(endPosition, applyOverlap = false) {
|
|
62
|
+
function* flushChunk(endPosition, applyOverlap = false) {
|
|
67
63
|
const currentMd = getCurrentMarkdown(processor.state);
|
|
68
64
|
const chunkEnd = endPosition ?? currentMd.length;
|
|
69
|
-
const
|
|
70
|
-
if (!
|
|
65
|
+
const originalChunkContent = currentMd.slice(lastChunkEndPosition, chunkEnd);
|
|
66
|
+
if (!originalChunkContent.trim()) {
|
|
71
67
|
lastChunkEndPosition = chunkEnd;
|
|
72
68
|
return;
|
|
73
69
|
}
|
|
70
|
+
let chunkContent = originalChunkContent;
|
|
71
|
+
if (opts.stripHeaders) {
|
|
72
|
+
chunkContent = chunkContent.split("\n").filter((line) => !line.match(/^#{1,6}\s+/)).join("\n").trim();
|
|
73
|
+
if (!chunkContent) {
|
|
74
|
+
lastChunkEndPosition = chunkEnd;
|
|
75
|
+
return;
|
|
76
|
+
}
|
|
77
|
+
}
|
|
74
78
|
const chunk = {
|
|
75
79
|
content: chunkContent.trimEnd(),
|
|
76
80
|
metadata: { loc: { lines: {
|
|
77
81
|
from: lineNumber,
|
|
78
|
-
to: lineNumber + (
|
|
82
|
+
to: lineNumber + (originalChunkContent.match(/\n/g) || []).length
|
|
79
83
|
} } }
|
|
80
84
|
};
|
|
81
85
|
if (headerHierarchy.size > 0) {
|
|
@@ -86,22 +90,25 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
86
90
|
}
|
|
87
91
|
}
|
|
88
92
|
if (currentChunkCodeLanguage) chunk.metadata.code = currentChunkCodeLanguage;
|
|
89
|
-
|
|
93
|
+
yield chunk;
|
|
90
94
|
currentChunkCodeLanguage = "";
|
|
91
95
|
lastSplitPosition = chunkEnd;
|
|
92
96
|
if (applyOverlap && opts.chunkOverlap > 0) {
|
|
93
|
-
const maxOverlap = Math.max(0,
|
|
94
|
-
|
|
95
|
-
lastChunkEndPosition = chunkEnd - actualOverlap;
|
|
97
|
+
const maxOverlap = Math.max(0, originalChunkContent.length - 1);
|
|
98
|
+
lastChunkEndPosition = chunkEnd - Math.min(opts.chunkOverlap, maxOverlap);
|
|
96
99
|
} else lastChunkEndPosition = chunkEnd;
|
|
97
|
-
lineNumber += (
|
|
100
|
+
lineNumber += (originalChunkContent.match(/\n/g) || []).length;
|
|
98
101
|
}
|
|
99
102
|
const parseState = {
|
|
100
103
|
depthMap: processor.state.depthMap,
|
|
101
104
|
depth: 0,
|
|
102
105
|
plugins: opts.plugins
|
|
103
106
|
};
|
|
107
|
+
const eventBuffer = [];
|
|
104
108
|
parseHtmlStream(html, parseState, (event) => {
|
|
109
|
+
eventBuffer.push(event);
|
|
110
|
+
});
|
|
111
|
+
for (const event of eventBuffer) {
|
|
105
112
|
const { type: eventType, node } = event;
|
|
106
113
|
if (node.type === ELEMENT_NODE) {
|
|
107
114
|
const element = node;
|
|
@@ -113,7 +120,7 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
113
120
|
currentHeaderText = "";
|
|
114
121
|
if (shouldSplitOnHeader(tagId, opts)) {
|
|
115
122
|
if (seenSplitHeaders.has(tagId)) {
|
|
116
|
-
flushChunk();
|
|
123
|
+
yield* flushChunk();
|
|
117
124
|
for (let i = tagId; i <= TAG_H6; i++) headerHierarchy.delete(i);
|
|
118
125
|
}
|
|
119
126
|
seenSplitHeaders.add(tagId);
|
|
@@ -130,17 +137,13 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
130
137
|
if (lang && !currentChunkCodeLanguage) currentChunkCodeLanguage = lang;
|
|
131
138
|
}
|
|
132
139
|
}
|
|
133
|
-
if (tagId === TAG_HR && eventType === NodeEventEnter) flushChunk();
|
|
134
|
-
}
|
|
135
|
-
if (collectingHeaderText && node.type === TEXT_NODE) {
|
|
136
|
-
const textNode = node;
|
|
137
|
-
currentHeaderText += textNode.value;
|
|
140
|
+
if (tagId === TAG_HR && eventType === NodeEventEnter) yield* flushChunk();
|
|
138
141
|
}
|
|
142
|
+
if (collectingHeaderText && node.type === TEXT_NODE) currentHeaderText += node.value;
|
|
139
143
|
processPluginsForEvent(event, opts.plugins, processor.state, processor.processEvent);
|
|
140
144
|
if (!opts.returnEachLine) {
|
|
141
145
|
const currentMd = getCurrentMarkdown(processor.state);
|
|
142
|
-
|
|
143
|
-
if (currentChunkSize > opts.chunkSize) {
|
|
146
|
+
if (opts.lengthFunction(currentMd.slice(lastChunkEndPosition)) > opts.chunkSize) {
|
|
144
147
|
const idealSplitPos = lastChunkEndPosition + opts.chunkSize;
|
|
145
148
|
const separators = [
|
|
146
149
|
"\n\n",
|
|
@@ -168,11 +171,20 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
168
171
|
}
|
|
169
172
|
}
|
|
170
173
|
if (splitPosition === -1 || splitPosition <= lastChunkEndPosition) splitPosition = currentMd.length;
|
|
171
|
-
flushChunk(splitPosition, true);
|
|
174
|
+
yield* flushChunk(splitPosition, true);
|
|
172
175
|
}
|
|
173
176
|
}
|
|
174
|
-
}
|
|
175
|
-
flushChunk();
|
|
177
|
+
}
|
|
178
|
+
yield* flushChunk();
|
|
179
|
+
}
|
|
180
|
+
/**
|
|
181
|
+
* Convert HTML to Markdown and split into chunks in single pass
|
|
182
|
+
* Chunks are created during HTML event processing
|
|
183
|
+
*/
|
|
184
|
+
function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
185
|
+
const opts = createOptions(options);
|
|
186
|
+
const chunks = [];
|
|
187
|
+
for (const chunk of htmlToMarkdownSplitChunksStream(html, options)) chunks.push(chunk);
|
|
176
188
|
if (opts.returnEachLine && chunks.length > 0) {
|
|
177
189
|
const lineChunks = [];
|
|
178
190
|
for (const chunk of chunks) {
|
|
@@ -194,9 +206,8 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
194
206
|
}
|
|
195
207
|
return lineChunks;
|
|
196
208
|
}
|
|
197
|
-
|
|
198
|
-
return chunks.filter((chunk) => chunk.content.length > 0);
|
|
209
|
+
return chunks;
|
|
199
210
|
}
|
|
200
211
|
|
|
201
212
|
//#endregion
|
|
202
|
-
export { htmlToMarkdownSplitChunks };
|
|
213
|
+
export { htmlToMarkdownSplitChunks, htmlToMarkdownSplitChunksStream };
|