@storepress/llm-md-text-splitter 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +816 -0
- package/package.json +79 -0
- package/src/MarkdownTextSplitter.d.ts +304 -0
- package/src/MarkdownTextSplitter.js +1432 -0
|
@@ -0,0 +1,1432 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MarkdownTextSplitter.js (Browser-Native ES Module)
|
|
3
|
+
* ====================================================
|
|
4
|
+
* A high-performance, streaming Markdown text splitter for LLM consumption.
|
|
5
|
+
* Runs entirely in the browser — zero Node.js dependencies.
|
|
6
|
+
*
|
|
7
|
+
* KEY FEATURES:
|
|
8
|
+
* - Zero Sequence Loss: Code blocks, tables, reference links, and video embeds
|
|
9
|
+
* are NEVER split apart. They stay as atomic semantic units with their context.
|
|
10
|
+
* - Stream-based: Uses browser-native fetch() + ReadableStream + TextDecoderStream
|
|
11
|
+
* to process 100K+ line files without loading everything into RAM.
|
|
12
|
+
* - Pluggable Strategies: Ships with 5 splitting strategies and accepts custom ones:
|
|
13
|
+
* 1. SemanticStrategy — Split by Markdown structure (headings, paragraphs, code)
|
|
14
|
+
* 2. DelimiterStrategy — Split on a custom delimiter string (e.g. '---', '===')
|
|
15
|
+
* 3. CharLimitStrategy — Split by character count
|
|
16
|
+
* 4. WordLimitStrategy — Split by word count
|
|
17
|
+
* 5. TokenLimitStrategy — Split by estimated LLM token count
|
|
18
|
+
* - Rich Metadata: Each chunk carries positional, structural, and relational metadata.
|
|
19
|
+
* - Every function is commented with explanation.
|
|
20
|
+
* - Fully configurable via constructor options.
|
|
21
|
+
*
|
|
22
|
+
* BROWSER COMPATIBILITY: Chrome 71+, Firefox 65+, Safari 14.1+, Edge 79+
|
|
23
|
+
*
|
|
24
|
+
* @module MarkdownTextSplitter
|
|
25
|
+
* @version 0.0.1
|
|
26
|
+
* @license MIT
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
30
|
+
// SEMANTIC BLOCK TYPES
|
|
31
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
32
|
+
|
|
33
|
+
/** Package version stamped into splitter output (kept in sync with package.json). */
const SPLITTER_VERSION = '0.0.1';

/**
 * Enum of all Markdown semantic block types the parser can identify.
 * Used by the SemanticStrategy to classify content and enforce atomic rules.
 * Other strategies reference these for metadata enrichment.
 *
 * NOTE: declaration order is preserved deliberately — Object.keys(BlockType)
 * enumerates in this order.
 *
 * @readonly
 * @enum {string}
 */
export const BlockType = Object.freeze({
  HEADING     : "heading",        // # Heading lines (h1–h6, ATX style)
  PARAGRAPH   : "paragraph",      // Regular text paragraphs (parser fallback type)
  CODE_BLOCK  : "code_block",     // Fenced ``` or ~~~ code blocks (ATOMIC)
  LIST        : "list",           // Ordered/unordered list items
  BLOCKQUOTE  : "blockquote",     // > Blockquoted text
  TABLE       : "table",          // Markdown tables (ATOMIC)
  LINK_REF    : "link_reference", // [id]: url reference definitions
  VIDEO_EMBED : "video_embed",    // YouTube/Vimeo embeds (ATOMIC)
  HR          : "hr",             // --- or *** horizontal rules
  EMPTY       : "empty",          // Blank lines (separators)
  FRONTMATTER : "frontmatter",    // YAML frontmatter --- blocks (ATOMIC)
  HTML_BLOCK  : "html_block",     // Raw HTML blocks
  IMAGE       : "image",          //  images
});

/**
 * Set of block types that must NEVER be split across chunk boundaries.
 * These are "atomic" — the entire block goes into one chunk or not at all.
 * This is the foundation of the zero-sequence-loss guarantee.
 *
 * Consulted by SemanticParser._block() to pre-mark blocks as isAtomic.
 */
const ATOMIC_BLOCKS = new Set([
  BlockType.CODE_BLOCK,
  BlockType.TABLE,
  BlockType.VIDEO_EMBED,
  BlockType.FRONTMATTER,
]);
|
|
70
|
+
|
|
71
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
72
|
+
// DEFAULT CONFIGURATION
|
|
73
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
74
|
+
|
|
75
|
+
/**
 * Default configuration for the splitter.
 * Every option is overridable via the constructor. Strategies may also
 * define their own additional config keys under `strategyOptions`.
 *
 * @typedef {Object} SplitterConfig
 * @property {number} maxChunkTokens - Target max tokens per chunk (~4 chars/token)
 * @property {number} overlapTokens - Tokens of overlap between consecutive chunks
 * @property {number} charsPerToken - Characters-per-token ratio for estimation
 * @property {number} fetchTimeoutMs - HTTP fetch timeout in milliseconds
 * @property {boolean} preserveCodeContext - Group code blocks with surrounding text
 * @property {boolean} preserveLinks - Group reference links with their sections
 * @property {boolean} preserveVideos - Group video embeds with their context
 * @property {string} chunkIdPrefix - Prefix for generated chunk IDs
 * @property {RegExp} videoPattern - Regex to detect video embed lines
 * @property {RegExp} linkRefPattern - Regex to detect reference-style link defs
 * @property {string} strategy - Active strategy name: 'semantic'|'delimiter'|'char'|'word'|'token'
 * @property {Object} strategyOptions - Strategy-specific options (see each strategy)
 */
export const DEFAULT_CONFIG = Object.freeze({
  maxChunkTokens      : 1500,
  overlapTokens       : 150,
  charsPerToken       : 4,       // heuristic ratio consumed by estimateTokens()
  fetchTimeoutMs      : 60_000,
  preserveCodeContext : true,
  preserveLinks       : true,
  preserveVideos      : true,
  chunkIdPrefix       : "chunk",
  // Matches a [label](youtube/vimeo URL) link OR a <iframe ...youtube/vimeo...>
  // embed on a single line. Case-insensitive; no /g flag so .test() is stateless.
  videoPattern        :
      /(?:\[.*?\]\((?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/watch\?v=|youtu\.be\/|vimeo\.com\/)[\w-]+.*?\))|(?:<iframe[^>]*(?:youtube|vimeo)[^>]*>)/i,
  // Captures "[id]: url" reference-style link definitions: [1]=id, [2]=url.
  linkRefPattern      : /^\s*\[([^\]]+)\]:\s+(.+)$/,
  strategy            : "semantic",
  strategyOptions     : {},
});
|
|
109
|
+
|
|
110
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
111
|
+
// UTILITY FUNCTIONS
|
|
112
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
113
|
+
|
|
114
|
+
/**
 * Rough token-count estimate for a text string via a characters-per-token
 * ratio. Heuristic only — swap in a real tokenizer (e.g. a tiktoken /
 * cl100k_base WASM build) for production-grade counts.
 *
 * @param {string} text - Text to estimate
 * @param {number} [charsPerToken=4] - Chars-per-token ratio
 * @returns {number} Estimated token count (0 for empty/falsy input)
 */
export function estimateTokens(text, charsPerToken = 4) {
  return text ? Math.ceil(text.length / charsPerToken) : 0;
}
|
|
126
|
+
|
|
127
|
+
/**
 * Counts whitespace-delimited words in a text string.
 * Splitting on /\s+/ covers Unicode whitespace characters as well.
 *
 * @param {string} text - Text to count words in
 * @returns {number} Word count (0 for empty/falsy input)
 */
export function countWords(text) {
  if (!text) {
    return 0;
  }
  // Leading/trailing whitespace yields empty fragments; skip them.
  let total = 0;
  for (const fragment of text.split(/\s+/)) {
    if (fragment) {
      total += 1;
    }
  }
  return total;
}
|
|
137
|
+
|
|
138
|
+
/**
 * Generates a deterministic hash-based chunk ID.
 *
 * Implements the FNV-1a 32-bit hash — fast, deterministic, no async needed
 * (SubtleCrypto is async and we need sync IDs).
 *
 * FIX: the multiply now uses Math.imul. A plain `hash * 0x01000193` on 32-bit
 * operands produces values up to ~2^56, which exceeds the 53-bit double
 * mantissa, so the low bits — exactly the bits `>>> 0` keeps — were silently
 * rounded and the result was NOT FNV-1a. Math.imul performs true 32-bit
 * integer multiplication, and the output now matches the published FNV-1a
 * test vectors (e.g. "a" → 0xe40c292c). Note: IDs produced by the previous
 * version differ for most inputs.
 *
 * @param {string} content - Content to hash (hashed per UTF-16 code unit)
 * @param {number} index - Sequential chunk index
 * @param {string} [prefix="chunk"] - ID prefix
 * @returns {string} Unique chunk ID like "chunk_a1b2c3d4_0042"
 */
export function generateChunkId(content, index, prefix = "chunk") {
  let hash = 0x811c9dc5; // FNV-1a 32-bit offset basis
  for (let i = 0; i < content.length; i++) {
    hash ^= content.charCodeAt(i);
    // 0x01000193 is the FNV 32-bit prime; imul keeps the product in int32,
    // >>> 0 normalizes back to an unsigned 32-bit value.
    hash = Math.imul(hash, 0x01000193) >>> 0;
  }
  const hex = hash.toString(16).padStart(8, "0");
  return `${prefix}_${hex}_${String(index).padStart(4, "0")}`;
}
|
|
158
|
+
|
|
159
|
+
/**
 * Extracts all inline Markdown links from text.
 * Returns objects with display text and URL for metadata enrichment.
 *
 * FIX: image embeds `` share the `[...](...)` syntax and were
 * previously misreported as links (their alt text returned as link text).
 * A captured optional leading "!" lets us detect and skip them without a
 * lookbehind assertion, preserving the older-browser support claimed in the
 * file header.
 *
 * @param {string} text - Markdown text to scan
 * @returns {Array<{text: string, url: string}>} Extracted link objects
 */
export function extractLinks(text) {
  const links = [];
  // Group 1: optional "!" (image marker), group 2: label, group 3: URL.
  const regex = /(!?)\[([^\]]+)\]\(([^)]+)\)/g;
  let m;
  while ((m = regex.exec(text)) !== null) {
    if (m[1] === "!") continue; //  is an image, not a link
    links.push({text : m[2], url : m[3]});
  }
  return links;
}
|
|
175
|
+
|
|
176
|
+
/**
 * Extracts video embed URLs (YouTube, Vimeo) from text.
 * Returns structured objects with platform, canonical URL, and video ID.
 * All YouTube matches are listed before all Vimeo matches.
 *
 * @param {string} text - Text to scan for video URLs
 * @returns {Array<{platform: string, url: string, videoId: string}>}
 */
export function extractVideos(text) {
  const found = [];

  // Run one global pattern to exhaustion, normalizing each hit to a
  // canonical URL for its platform.
  const collect = (pattern, platform, toUrl) => {
    let match;
    while ((match = pattern.exec(text)) !== null) {
      found.push({platform, url : toUrl(match[1]), videoId : match[1]});
    }
  };

  collect(
      /(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/watch\?v=|youtu\.be\/)([\w-]+)/g,
      "youtube",
      (id) => `https://www.youtube.com/watch?v=${id}`,
  );
  collect(
      /(?:https?:\/\/)?(?:www\.)?vimeo\.com\/([\d]+)/g,
      "vimeo",
      (id) => `https://vimeo.com/${id}`,
  );

  return found;
}
|
|
196
|
+
|
|
197
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
198
|
+
// STREAM → LINE ITERATOR (Browser-native)
|
|
199
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
200
|
+
|
|
201
|
+
/**
 * Converts a ReadableStream<Uint8Array> into an async iterator of lines.
 * Decoding goes through the native TextDecoderStream, which correctly
 * reassembles multi-byte UTF-8 sequences split across stream chunks.
 * Only the current partial line is buffered, so memory stays flat even
 * for very large inputs.
 *
 * Handles:
 * - \r\n (Windows) and \n (Unix) line endings (trailing \r is stripped)
 * - Files whose last line has no terminating newline
 *
 * @param {ReadableStream<Uint8Array>} byteStream - Raw byte stream
 * @yields {{ lineNumber: number, text: string }} One line at a time
 */
export async function* streamToLines(byteStream) {
  const reader = byteStream.pipeThrough(new TextDecoderStream("utf-8")).getReader();
  let carry = "";      // partial line awaiting its newline
  let lineNumber = 0;

  // Number the line and strip a trailing carriage return.
  const toLine = (raw) => ({lineNumber : ++lineNumber, text : raw.replace(/\r$/, "")});

  try {
    for (;;) {
      const {done, value} = await reader.read();
      if (done) break;
      const pieces = (carry + value).split("\n");
      carry = pieces.pop() || "";
      for (const piece of pieces) {
        yield toLine(piece);
      }
    }
    // EOF: emit the final unterminated line, if any.
    if (carry.length > 0) {
      yield toLine(carry);
    }
  } finally {
    reader.releaseLock();
  }
}
|
|
241
|
+
|
|
242
|
+
/**
 * Converts an in-memory string into an async iterator of lines by wrapping
 * it in a ReadableStream and delegating to streamToLines(). Useful for
 * tests or content that is already fully loaded.
 *
 * @param {string} text - The full markdown string
 * @yields {{ lineNumber: number, text: string }} One line at a time
 */
export async function* stringToLines(text) {
  const SLICE = 65536; // 64KB chunks to simulate realistic network streaming
  const bytes = new TextEncoder().encode(text);
  const stream = new ReadableStream({
    start(controller) {
      let offset = 0;
      while (offset < bytes.length) {
        controller.enqueue(bytes.slice(offset, offset + SLICE));
        offset += SLICE;
      }
      controller.close();
    },
  });
  yield* streamToLines(stream);
}
|
|
265
|
+
|
|
266
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
267
|
+
// SEMANTIC PARSER
|
|
268
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
269
|
+
|
|
270
|
+
/**
 * SemanticParser: Classifies Markdown lines into typed semantic blocks.
 *
 * Consumes an async line iterator and yields structured block objects.
 * Tracks heading hierarchy for breadcrumb metadata.
 * Identifies and enforces atomic blocks (code, tables, videos, frontmatter).
 *
 * Each emitted block:
 * {
 *   type, content, lines: {start,end}, heading, headingLevel, headingPath,
 *   language, isAtomic, links, videos, metadata
 * }
 *
 * NOTE: an instance is single-use per document — parse() mutates
 * headingStack, linkDefinitions, inFrontmatter and contentStarted, so
 * construct a fresh parser for each input stream.
 *
 * @class
 * @param {Object} config - Splitter configuration
 */
export class SemanticParser {
  constructor(config) {
    /** @private Splitter config reference */
    this.config = config;
    /** @private Heading breadcrumb stack */
    this.headingStack = [];
    /** @private Accumulated reference-style link definitions */
    this.linkDefinitions = new Map();
    /** @private Frontmatter tracking state */
    this.inFrontmatter = false;
    /** @private Whether we've seen non-empty content */
    this.contentStarted = false;
  }

  /**
   * Main parse generator. Yields one semantic block at a time.
   * Handles all Markdown constructs including fenced code blocks,
   * tables, frontmatter, blockquotes, lists, images, HTML, and videos.
   *
   * Branch order is load-bearing: frontmatter → code fences → tables →
   * blank lines → headings → HR → video embeds → link refs → images →
   * blockquotes → lists → HTML → paragraph fallback. Each branch
   * `continue`s, so a line is classified by the first rule that matches.
   *
   * @param {AsyncIterable<{lineNumber: number, text: string}>} lineIterator
   * @yields {Object} Semantic block objects
   */
  async* parse(lineIterator) {
    let currentBlock = null;
    let inCodeBlock = false;
    let codeFence = "";
    let codeLanguage = "";
    let inTable = false;

    for await (const {lineNumber, text} of lineIterator) {
      // ── FRONTMATTER (--- at line 1) ──
      // Only an exact "---" on the very first line opens frontmatter.
      if (!this.contentStarted && lineNumber === 1 && text.trim() === "---") {
        this.inFrontmatter = true;
        currentBlock = this._block(BlockType.FRONTMATTER, text, lineNumber);
        continue;
      }
      if (this.inFrontmatter) {
        currentBlock.content += "\n" + text;
        currentBlock.lines.end = lineNumber;
        if (text.trim() === "---" && lineNumber > 1) {
          this.inFrontmatter = false;
          currentBlock.metadata.frontmatter = this._parseFrontmatter(currentBlock.content);
          yield currentBlock;
          currentBlock = null;
        }
        continue;
      }
      this.contentStarted = true;

      // ── FENCED CODE BLOCK (``` or ~~~) — highest priority ──
      const fenceMatch = text.match(/^(\s*)(```|~~~)(.*)$/);
      if (fenceMatch && !inCodeBlock) {
        if (currentBlock) yield currentBlock;
        inCodeBlock = true;
        codeFence = fenceMatch[2];
        // First word of the info string is the language (e.g. "```js title=x").
        codeLanguage = fenceMatch[3].trim().split(/\s+/)[0] || "";
        currentBlock = this._block(BlockType.CODE_BLOCK, text, lineNumber);
        currentBlock.language = codeLanguage;
        currentBlock.isAtomic = true;
        continue;
      }
      if (inCodeBlock) {
        currentBlock.content += "\n" + text;
        currentBlock.lines.end = lineNumber;
        const trimmed = text.trim();
        // Closing fence: starts with the same marker and is at most one char
        // longer. NOTE(review): a closing fence 2+ chars longer than the
        // opener falls through as content — confirm against CommonMark if
        // strict compliance matters.
        if (trimmed.startsWith(codeFence) && trimmed.length <= codeFence.length + 1) {
          inCodeBlock = false;
          yield currentBlock;
          currentBlock = null;
        }
        continue;
      }

      // ── TABLE ──
      // A table line starts and ends with '|' (leading whitespace allowed).
      const isTableRow = /^\s*\|.*\|\s*$/.test(text);
      const isTableSep = /^\s*\|[\s:|-]+\|\s*$/.test(text);
      if (isTableRow || isTableSep) {
        if (!inTable) {
          if (currentBlock) yield currentBlock;
          inTable = true;
          currentBlock = this._block(BlockType.TABLE, text, lineNumber);
          currentBlock.isAtomic = true;
        }
        else {
          currentBlock.content += "\n" + text;
          currentBlock.lines.end = lineNumber;
        }
        continue;
      }
      else if (inTable) {
        // First non-table line closes the table block; the line itself then
        // falls through to the remaining classifiers below.
        inTable = false;
        yield currentBlock;
        currentBlock = null;
      }

      // ── EMPTY LINE ── blank lines terminate the current block.
      if (text.trim() === "") {
        if (currentBlock) {
          yield currentBlock;
          currentBlock = null;
        }
        continue;
      }

      // ── HEADING ── ATX style only (# … ######). Setext underline headings
      // are not recognized: a bare "---" line is classified as HR instead.
      const hMatch = text.match(/^(#{1,6})\s+(.+)$/);
      if (hMatch) {
        if (currentBlock) yield currentBlock;
        const level = hMatch[1].length;
        const hText = hMatch[2].trim();
        this._pushHeading(level, hText);
        const blk = this._block(BlockType.HEADING, text, lineNumber);
        blk.headingLevel = level;
        blk.heading = hText;
        // Snapshot the breadcrumb AFTER pushing so the path includes this heading.
        blk.headingPath = this.headingStack.map((h) => h.text);
        yield blk;
        currentBlock = null;
        continue;
      }

      // ── HORIZONTAL RULE ── three or more -, * or _, optionally spaced.
      if (/^(\s*[-*_]\s*){3,}$/.test(text)) {
        if (currentBlock) yield currentBlock;
        yield this._block(BlockType.HR, text, lineNumber);
        currentBlock = null;
        continue;
      }

      // ── VIDEO EMBED ── single-line match against the configurable pattern.
      if (this.config.videoPattern.test(text)) {
        if (currentBlock) yield currentBlock;
        const vb = this._block(BlockType.VIDEO_EMBED, text, lineNumber);
        vb.isAtomic = true;
        vb.videos = extractVideos(text);
        yield vb;
        currentBlock = null;
        continue;
      }

      // ── REFERENCE LINK DEFINITION ── consecutive defs merge into one block;
      // every definition is also recorded in this.linkDefinitions.
      const lrMatch = text.match(this.config.linkRefPattern);
      if (lrMatch) {
        this.linkDefinitions.set(lrMatch[1], lrMatch[2].trim());
        if (!currentBlock || currentBlock.type !== BlockType.LINK_REF) {
          if (currentBlock) yield currentBlock;
          currentBlock = this._block(BlockType.LINK_REF, text, lineNumber);
        }
        else {
          currentBlock.content += "\n" + text;
          currentBlock.lines.end = lineNumber;
        }
        continue;
      }

      // ── IMAGE ── a line consisting solely of one  embed.
      if (/^\s*!\[.*\]\(.*\)\s*$/.test(text)) {
        if (currentBlock) yield currentBlock;
        yield this._block(BlockType.IMAGE, text, lineNumber);
        currentBlock = null;
        continue;
      }

      // ── BLOCKQUOTE ── consecutive '>' lines merge into one block.
      if (text.startsWith(">")) {
        if (!currentBlock || currentBlock.type !== BlockType.BLOCKQUOTE) {
          if (currentBlock) yield currentBlock;
          currentBlock = this._block(BlockType.BLOCKQUOTE, text, lineNumber);
        }
        else {
          currentBlock.content += "\n" + text;
          currentBlock.lines.end = lineNumber;
        }
        continue;
      }

      // ── LIST ── bullet (-, *, +) or ordered (1.) items merge into one block.
      if (/^\s*[-*+]\s+|^\s*\d+\.\s+/.test(text)) {
        if (!currentBlock || currentBlock.type !== BlockType.LIST) {
          if (currentBlock) yield currentBlock;
          currentBlock = this._block(BlockType.LIST, text, lineNumber);
        }
        else {
          currentBlock.content += "\n" + text;
          currentBlock.lines.end = lineNumber;
        }
        continue;
      }

      // ── HTML BLOCK ── line opens with a tag; "<a " is excluded so plain
      // anchor links stay inside their paragraph.
      if (/^\s*<[a-zA-Z]/.test(text) && !/^\s*<a\s/.test(text)) {
        if (!currentBlock || currentBlock.type !== BlockType.HTML_BLOCK) {
          if (currentBlock) yield currentBlock;
          currentBlock = this._block(BlockType.HTML_BLOCK, text, lineNumber);
        }
        else {
          currentBlock.content += "\n" + text;
          currentBlock.lines.end = lineNumber;
        }
        continue;
      }

      // ── PARAGRAPH (default) ── anything unclassified accumulates here.
      if (!currentBlock || currentBlock.type !== BlockType.PARAGRAPH) {
        if (currentBlock) yield currentBlock;
        currentBlock = this._block(BlockType.PARAGRAPH, text, lineNumber);
      }
      else {
        currentBlock.content += "\n" + text;
        currentBlock.lines.end = lineNumber;
      }
    }

    // EOF: flush whatever is still open (also covers an unclosed code fence
    // or a table/frontmatter that ran to end-of-file).
    if (currentBlock) yield currentBlock;
  }

  /**
   * Creates a new semantic block with heading context inherited from the
   * current top of the breadcrumb stack.
   * @private
   */
  _block(type, content, lineNumber) {
    const cur = this.headingStack.length > 0
        ? this.headingStack[this.headingStack.length - 1]
        : null;
    return {
      type, content,
      lines : {start : lineNumber, end : lineNumber},
      heading : cur?.text || null,
      headingLevel : cur?.level || null,
      headingPath : this.headingStack.map((h) => h.text),
      language : null,
      isAtomic : ATOMIC_BLOCKS.has(type),
      links : [], videos : [], metadata : {},
    };
  }

  /**
   * Maintains heading breadcrumb stack. Pops headings at same or deeper level
   * before pushing the new one so the path always reflects nesting.
   * @private
   */
  _pushHeading(level, text) {
    while (this.headingStack.length > 0 &&
           this.headingStack[this.headingStack.length - 1].level >= level) {
      this.headingStack.pop();
    }
    this.headingStack.push({level, text});
  }

  /**
   * Simple frontmatter key:value parser. slice(1, -1) drops the opening and
   * closing "---" delimiter lines; values keep everything after the first ":".
   * Not a YAML parser — nested structures and lists are not interpreted.
   * @private
   */
  _parseFrontmatter(content) {
    const result = {};
    for (const line of content.split("\n").slice(1, -1)) {
      const i = line.indexOf(":");
      if (i > 0) result[line.slice(0, i).trim()] = line.slice(i + 1).trim();
    }
    return result;
  }

  /** Returns a defensive copy of all accumulated [id]: url link definitions. */
  getLinkDefinitions() { return new Map(this.linkDefinitions); }
}
|
|
550
|
+
|
|
551
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
552
|
+
// SPLITTING STRATEGIES
|
|
553
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
554
|
+
|
|
555
|
+
/**
|
|
556
|
+
* ──────────────────────────────────────────────────────────────────────────
|
|
557
|
+
* STRATEGY 1: SemanticStrategy (Default)
|
|
558
|
+
* ──────────────────────────────────────────────────────────────────────────
|
|
559
|
+
* Splits by Markdown structure with context grouping.
|
|
560
|
+
* Groups code blocks with their explanatory text, keeps tables and videos
|
|
561
|
+
* atomic, and respects heading boundaries.
|
|
562
|
+
*
|
|
563
|
+
* Options:
|
|
564
|
+
* - maxChunkTokens: number (target chunk size in tokens)
|
|
565
|
+
* - overlapTokens: number (overlap between chunks)
|
|
566
|
+
*
|
|
567
|
+
* This is the most intelligent strategy and guarantees zero sequence loss.
|
|
568
|
+
*/
|
|
569
|
+
export class SemanticStrategy {
|
|
570
|
+
/**
 * @param {Object} config - Full splitter config (see DEFAULT_CONFIG for keys)
 */
constructor(config) {
  /** Splitter configuration shared by every pipeline stage. */
  this.config = config;
  /** Public strategy identifier. */
  this.name = "semantic";
}
|
|
577
|
+
|
|
578
|
+
/**
|
|
579
|
+
* Processes an async line iterator through the full semantic pipeline:
|
|
580
|
+
* Lines → SemanticParser → ContextGrouper → ChunkAssembler
|
|
581
|
+
*
|
|
582
|
+
* @param {AsyncIterable} lineIterator - Async line iterator
|
|
583
|
+
* @yields {Object} Final chunk objects with metadata
|
|
584
|
+
*/
|
|
585
|
+
async* process(lineIterator) {
|
|
586
|
+
const parser = new SemanticParser(this.config);
|
|
587
|
+
const blocks = parser.parse(lineIterator);
|
|
588
|
+
const groups = this._groupBlocks(blocks);
|
|
589
|
+
yield* this._assembleChunks(groups, parser);
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
/**
 * Groups related semantic blocks to prevent splitting related content.
 * Rules enforce: code+text together, videos with context, links with sections.
 *
 * Rule order matters: heading-boundary flush is checked before the
 * keep-together rules, and the token-budget checks come last so atomic
 * pairings win over size limits.
 *
 * @private
 */
async* _groupBlocks(blocks) {
  const buf = [];          // blocks accumulated for the group being built
  let lastHeading = null;  // most recent heading seen, even if already flushed

  // Drain buf into a group object; returns null when buf is empty.
  const flush = () => {
    if (buf.length === 0) return null;
    const content = buf.map((b) => b.content).join("\n\n");
    const group = {
      blocks : [...buf],
      content,
      lines : {start : buf[0].lines.start, end : buf[buf.length - 1].lines.end},
      // Prefer the last heading seen globally; fall back to the first
      // block's inherited heading.
      heading : lastHeading?.text || buf[0].heading,
      headingLevel : lastHeading?.level || buf[0].headingLevel,
      headingPath : buf[0].headingPath,
      hasCode : buf.some((b) => b.type === BlockType.CODE_BLOCK),
      hasVideo : buf.some((b) => b.type === BlockType.VIDEO_EMBED),
      hasTable : buf.some((b) => b.type === BlockType.TABLE),
      links : buf.flatMap((b) => extractLinks(b.content)),
      videos : buf.flatMap((b) => extractVideos(b.content)),
      languages : [...new Set(buf.filter((b) => b.language).map((b) => b.language))],
      tokenEstimate : estimateTokens(content, this.config.charsPerToken),
      isAtomic : buf.some((b) => b.isAtomic),
    };
    buf.length = 0; // reset in place so the closure keeps working
    return group;
  };

  for await (const block of blocks) {
    if (block.type === BlockType.HEADING) {
      lastHeading = {text : block.heading, level : block.headingLevel};
    }

    // Empty buffer: unconditionally start a new group with this block.
    if (buf.length === 0) {
      buf.push(block);
      continue;
    }

    const last = buf[buf.length - 1];
    const bufTokens = estimateTokens(buf.map((b) => b.content).join("\n\n"), this.config.charsPerToken);

    // New heading at the same or shallower level → section boundary, flush.
    if (block.type === BlockType.HEADING && block.headingLevel <= (last.headingLevel || 99)) {
      const g = flush();
      if (g) yield g;
      buf.push(block);
      continue;
    }

    // Code after text or text after code → keep together (ZERO LOSS)
    if (block.type === BlockType.CODE_BLOCK && this.config.preserveCodeContext &&
        [BlockType.PARAGRAPH, BlockType.HEADING, BlockType.LIST].includes(last.type)) {
      buf.push(block);
      continue;
    }
    if ([BlockType.PARAGRAPH, BlockType.LIST].includes(block.type) &&
        last.type === BlockType.CODE_BLOCK && this.config.preserveCodeContext) {
      buf.push(block);
      continue;
    }

    // Video/link with context → keep together
    if (block.type === BlockType.VIDEO_EMBED && this.config.preserveVideos) {
      buf.push(block);
      continue;
    }
    if (block.type === BlockType.LINK_REF && this.config.preserveLinks) {
      buf.push(block);
      continue;
    }

    // Consecutive code blocks (e.g., input/output) → keep together
    if (block.type === BlockType.CODE_BLOCK && last.type === BlockType.CODE_BLOCK) {
      buf.push(block);
      continue;
    }

    // HR = explicit break. Note the HR block itself is dropped: it is never
    // pushed to buf, so rule lines do not appear in any group's content.
    if (block.type === BlockType.HR) {
      const g = flush();
      if (g) yield g;
      continue;
    }

    // Buffer already far over budget (2x) → flush before adding more.
    if (bufTokens > this.config.maxChunkTokens * 2) {
      const g = flush();
      if (g) yield g;
      buf.push(block);
      continue;
    }

    // Fits within the soft budget (1.5x) → keep grouping, else flush first.
    const newTokens = bufTokens + estimateTokens(block.content, this.config.charsPerToken);
    if (newTokens <= this.config.maxChunkTokens * 1.5) {
      buf.push(block);
    }
    else {
      const g = flush();
      if (g) yield g;
      buf.push(block);
    }
  }

  // Flush the trailing group at end of input.
  const g = flush();
  if (g) yield g;
}
|
|
703
|
+
|
|
704
|
+
/**
 * Merges context groups into final LLM-ready chunks with overlap.
 * Oversized atomic blocks (huge code blocks) emit as-is with a warning flag.
 *
 * Note: the `parser` argument is threaded through to _buildChunkObject's
 * caller scope but is not referenced in this method body — presumably
 * reserved for link-definition enrichment (TODO confirm).
 *
 * @private
 */
async* _assembleChunks(groups, parser) {
  // pending*: accumulator for the chunk currently being filled.
  // prevTail: trailing slice of the previous chunk, prepended as overlap.
  let idx = 0, pending = "", pendingGroups = [], pendingTokens = 0, prevTail = "";

  // Builds a chunk object (with overlap prefix) and advances the index.
  const makeChunk = (content, srcGroups, overlap = "") => {
    const full = overlap ? overlap + "\n\n" + content : content;
    const c = this._buildChunkObject(full, content, overlap, srcGroups, idx);
    idx++;
    return c;
  };

  // Returns the last ~overlapTokens worth of content, trimmed forward to the
  // first paragraph break so the overlap starts on a paragraph boundary.
  const tail = (content) => {
    if (this.config.overlapTokens <= 0) return "";
    const chars = this.config.overlapTokens * this.config.charsPerToken;
    if (content.length <= chars) return content;
    const t = content.slice(-chars);
    const br = t.indexOf("\n\n");
    return br > 0 ? t.slice(br + 2) : t;
  };

  for await (const group of groups) {
    const gTokens = group.tokenEstimate;

    // Atomic oversized → emit standalone (never split; flush pending first).
    if (group.isAtomic && gTokens > this.config.maxChunkTokens) {
      if (pendingGroups.length > 0) {
        yield makeChunk(pending, pendingGroups, prevTail);
        prevTail = tail(pending);
        pending = "";
        pendingGroups = [];
        pendingTokens = 0;
      }
      yield makeChunk(group.content, [group], prevTail);
      prevTail = tail(group.content);
      continue;
    }

    // Adding this group would exceed the budget → flush the pending chunk.
    if (pendingTokens + gTokens > this.config.maxChunkTokens && pendingGroups.length > 0) {
      yield makeChunk(pending, pendingGroups, prevTail);
      prevTail = tail(pending);
      pending = "";
      pendingGroups = [];
      pendingTokens = 0;
    }

    // Accumulate the group into the pending chunk.
    pending += (pending ? "\n\n" : "") + group.content;
    pendingGroups.push(group);
    pendingTokens += gTokens;
  }

  // Emit whatever is still pending at end of input.
  if (pendingGroups.length > 0) yield makeChunk(pending, pendingGroups, prevTail);
}
|
|
761
|
+
|
|
762
|
+
/**
|
|
763
|
+
* Constructs a fully enriched chunk object with all metadata fields.
|
|
764
|
+
* @private
|
|
765
|
+
*/
|
|
766
|
+
_buildChunkObject(fullContent, rawContent, overlap, groups, index) {
|
|
767
|
+
return {
|
|
768
|
+
id : generateChunkId(fullContent, index, this.config.chunkIdPrefix),
|
|
769
|
+
index,
|
|
770
|
+
content : fullContent,
|
|
771
|
+
tokenEstimate : estimateTokens(fullContent, this.config.charsPerToken),
|
|
772
|
+
overlapTokens : estimateTokens(overlap, this.config.charsPerToken),
|
|
773
|
+
charCount : fullContent.length,
|
|
774
|
+
wordCount : countWords(fullContent),
|
|
775
|
+
lines : {
|
|
776
|
+
start : groups[0]?.lines?.start || 0,
|
|
777
|
+
end : groups[groups.length - 1]?.lines?.end || 0,
|
|
778
|
+
},
|
|
779
|
+
heading : groups[0]?.heading || null,
|
|
780
|
+
headingPath : groups[0]?.headingPath || [],
|
|
781
|
+
headingLevel : groups[0]?.headingLevel || null,
|
|
782
|
+
hasCode : groups.some((g) => g.hasCode),
|
|
783
|
+
hasVideo : groups.some((g) => g.hasVideo),
|
|
784
|
+
hasTable : groups.some((g) => g.hasTable),
|
|
785
|
+
languages : [...new Set(groups.flatMap((g) => g.languages || []))],
|
|
786
|
+
links : groups.flatMap((g) => g.links || []),
|
|
787
|
+
videos : groups.flatMap((g) => g.videos || []),
|
|
788
|
+
isOversized : estimateTokens(fullContent, this.config.charsPerToken) > this.config.maxChunkTokens * 1.5,
|
|
789
|
+
containsAtomicBlock : groups.some((g) => g.isAtomic),
|
|
790
|
+
blockTypes : [...new Set(groups.flatMap((g) => g.blocks ? g.blocks.map((b) => b.type) : []))],
|
|
791
|
+
strategy : "semantic",
|
|
792
|
+
metadata : {},
|
|
793
|
+
};
|
|
794
|
+
}
|
|
795
|
+
}
|
|
796
|
+
|
|
797
|
+
/**
 * ──────────────────────────────────────────────────────────────────────────
 * STRATEGY 2: DelimiterStrategy
 * ──────────────────────────────────────────────────────────────────────────
 * Splits on a custom delimiter string (e.g., '---', '===', '<!-- split -->').
 * The delimiter line itself is NOT included in any chunk unless keepDelimiter
 * is set. Code blocks that contain the delimiter are NOT split (atomic
 * protection).
 *
 * Options (via strategyOptions):
 *   - delimiter: string (default: '---')
 *   - keepDelimiter: boolean (include delimiter in output, default: false)
 *   - trimChunks: boolean (trim whitespace from chunk edges, default: true)
 */
export class DelimiterStrategy {
  /**
   * @param {Object} config - Splitter config; reads strategyOptions,
   *        chunkIdPrefix, charsPerToken and videoPattern.
   */
  constructor(config) {
    this.config = config;
    this.delimiter = config.strategyOptions?.delimiter || "---";
    // `??` (not `||`) so only null/undefined fall back — matches the other
    // strategies' option handling.
    this.keepDelimiter = config.strategyOptions?.keepDelimiter ?? false;
    this.trimChunks = config.strategyOptions?.trimChunks !== false;
    this.name = "delimiter";
  }

  /**
   * Accumulates lines and splits whenever the delimiter is encountered.
   * Tracks code fence state to avoid splitting inside code blocks.
   *
   * @param {AsyncIterable} lineIterator - Yields {lineNumber, text} records.
   * @yields {Object} Chunk objects
   */
  async* process(lineIterator) {
    let buffer = [];
    let startLine = 1;
    let idx = 0;
    let inCodeBlock = false;
    let codeFence = "";

    for await (const {lineNumber, text} of lineIterator) {
      // Track code blocks to protect them from delimiter splitting.
      const fenceMatch = text.match(/^(\s*)(```|~~~)/);
      if (fenceMatch && !inCodeBlock) {
        inCodeBlock = true;
        codeFence = fenceMatch[2];
      }
      // FIX: length guard added (consistent with CharLimit/WordLimit
      // strategies). A closing fence carries no info string; without the
      // guard a line like "```js" inside the block would close it early.
      else if (inCodeBlock && text.trim().startsWith(codeFence) && text.trim().length <= codeFence.length + 1) {
        inCodeBlock = false;
      }

      // Only split on the delimiter if NOT inside a code block.
      if (!inCodeBlock && text.trim() === this.delimiter) {
        if (buffer.length > 0) {
          yield this._makeChunk(buffer, startLine, lineNumber - 1, idx++);
        }
        if (this.keepDelimiter) {
          buffer = [text];
          // FIX: the kept delimiter line belongs to the next chunk, so its
          // start line is the delimiter's own line (was lineNumber + 1).
          startLine = lineNumber;
        } else {
          buffer = [];
          startLine = lineNumber + 1;
        }
        continue;
      }

      if (buffer.length === 0) startLine = lineNumber;
      buffer.push(text);
    }

    // Flush the final chunk after the source is exhausted.
    if (buffer.length > 0) {
      yield this._makeChunk(buffer, startLine, startLine + buffer.length - 1, idx);
    }
  }

  /**
   * Builds a chunk object from the buffered lines.
   * @private
   * @param {string[]} lines - Buffered raw lines.
   * @param {number} startLine - 1-based first source line of the chunk.
   * @param {number} endLine - 1-based last source line of the chunk.
   * @param {number} index - Zero-based chunk index.
   * @returns {Object} Chunk object.
   */
  _makeChunk(lines, startLine, endLine, index) {
    let content = lines.join("\n");
    if (this.trimChunks) content = content.trim();
    return {
      id: generateChunkId(content, index, this.config.chunkIdPrefix),
      index, content,
      tokenEstimate: estimateTokens(content, this.config.charsPerToken),
      overlapTokens: 0,
      charCount: content.length,
      wordCount: countWords(content),
      lines: {start: startLine, end: endLine},
      heading: null, headingPath: [], headingLevel: null,
      hasCode: /```[\s\S]*?```/.test(content),
      hasVideo: this.config.videoPattern.test(content),
      hasTable: /^\s*\|.*\|\s*$/m.test(content),
      languages: [...(content.matchAll(/```(\w+)/g))].map((m) => m[1]),
      links: extractLinks(content),
      videos: extractVideos(content),
      isOversized: false,
      containsAtomicBlock: false,
      blockTypes: [],
      strategy: "delimiter",
      metadata: {delimiter: this.delimiter},
    };
  }
}
|
|
890
|
+
|
|
891
|
+
/**
 * ──────────────────────────────────────────────────────────────────────────
 * STRATEGY 3: CharLimitStrategy
 * ──────────────────────────────────────────────────────────────────────────
 * Splits by character count. Respects code blocks as atomic units.
 * When a split point falls inside a code block, the split is deferred
 * until after the code block closes (zero sequence loss).
 *
 * Options (via strategyOptions):
 *   - charLimit: number (max characters per chunk; default derived from
 *     config.maxChunkTokens * config.charsPerToken)
 *   - overlap: number (character overlap between chunks; default derived from
 *     config.overlapTokens * config.charsPerToken; an explicit 0 disables it)
 */
export class CharLimitStrategy {
  /**
   * @param {Object} config - Splitter config; reads strategyOptions,
   *        maxChunkTokens, overlapTokens, charsPerToken, chunkIdPrefix,
   *        videoPattern.
   */
  constructor(config) {
    this.config = config;
    // FIX: `??` instead of `||` so an explicit 0 (e.g. overlap: 0) is honored
    // rather than silently replaced by the config-derived default.
    this.charLimit = config.strategyOptions?.charLimit ?? config.maxChunkTokens * config.charsPerToken;
    this.overlap = config.strategyOptions?.overlap ?? config.overlapTokens * config.charsPerToken;
    this.name = "char";
  }

  /**
   * Accumulates lines until charLimit is reached, then emits.
   * Never splits inside fenced code blocks.
   *
   * @param {AsyncIterable} lineIterator - Yields {lineNumber, text} records.
   * @yields {Object} Chunk objects
   */
  async* process(lineIterator) {
    let buffer = [];
    let bufferChars = 0;
    let startLine = 1;
    let idx = 0;
    let inCodeBlock = false;
    let codeFence = "";
    let prevOverlap = "";

    for await (const {lineNumber, text} of lineIterator) {
      // Track code fences; the length guard keeps a fenced info-string line
      // (e.g. "```js") inside a block from being mistaken for a closer.
      const fm = text.match(/^(\s*)(```|~~~)/);
      if (fm && !inCodeBlock) {
        inCodeBlock = true;
        codeFence = fm[2];
      }
      else if (inCodeBlock && text.trim().startsWith(codeFence) && text.trim().length <= codeFence.length + 1) {
        inCodeBlock = false;
      }

      if (buffer.length === 0) startLine = lineNumber;
      buffer.push(text);
      bufferChars += text.length + 1; // +1 for the joining "\n"

      // Only emit if over the limit AND not inside a code block.
      if (bufferChars >= this.charLimit && !inCodeBlock) {
        const content = (prevOverlap ? prevOverlap + "\n" : "") + buffer.join("\n");
        yield this._makeChunk(content, startLine, lineNumber, idx++, prevOverlap);
        prevOverlap = this.overlap > 0 ? buffer.join("\n").slice(-this.overlap) : "";
        buffer = [];
        bufferChars = 0;
      }
    }

    // Flush the final partial chunk.
    if (buffer.length > 0) {
      const content = (prevOverlap ? prevOverlap + "\n" : "") + buffer.join("\n");
      yield this._makeChunk(content, startLine, startLine + buffer.length - 1, idx, prevOverlap);
    }
  }

  /**
   * Builds a chunk object for this strategy.
   * @private
   * @param {string} content - Chunk text (overlap prefix included).
   * @param {number} startLine - 1-based first source line.
   * @param {number} endLine - 1-based last source line.
   * @param {number} index - Zero-based chunk index.
   * @param {string} [overlap] - Overlap text prepended to the content.
   * @returns {Object} Chunk object.
   */
  _makeChunk(content, startLine, endLine, index, overlap = "") {
    return {
      id: generateChunkId(content, index, this.config.chunkIdPrefix),
      index, content,
      tokenEstimate: estimateTokens(content, this.config.charsPerToken),
      overlapTokens: estimateTokens(overlap, this.config.charsPerToken),
      charCount: content.length,
      wordCount: countWords(content),
      lines: {start: startLine, end: endLine},
      heading: null, headingPath: [], headingLevel: null,
      hasCode: /```[\s\S]*?```/.test(content),
      hasVideo: this.config.videoPattern.test(content),
      hasTable: /^\s*\|.*\|\s*$/m.test(content),
      languages: [...(content.matchAll(/```(\w+)/g))].map((m) => m[1]),
      links: extractLinks(content),
      videos: extractVideos(content),
      isOversized: content.length > this.charLimit * 1.5,
      containsAtomicBlock: false,
      blockTypes: [],
      strategy: this.name,
      metadata: {charLimit: this.charLimit},
    };
  }
}
|
|
984
|
+
|
|
985
|
+
/**
 * ──────────────────────────────────────────────────────────────────────────
 * STRATEGY 4: WordLimitStrategy
 * ──────────────────────────────────────────────────────────────────────────
 * Splits by word count. Respects code blocks as atomic units.
 *
 * Options (via strategyOptions):
 *   - wordLimit: number (max words per chunk, default: 1000)
 *   - overlap: number (word overlap between chunks, default: 50;
 *     an explicit 0 disables overlap)
 */
export class WordLimitStrategy {
  /**
   * @param {Object} config - Splitter config; reads strategyOptions,
   *        chunkIdPrefix, charsPerToken, videoPattern.
   */
  constructor(config) {
    this.config = config;
    // FIX: `??` instead of `||` — with `||`, an explicit overlap of 0 was
    // silently replaced by the default of 50 and overlap was applied anyway.
    this.wordLimit = config.strategyOptions?.wordLimit ?? 1000;
    this.overlap = config.strategyOptions?.overlap ?? 50;
    this.name = "word";
  }

  /**
   * Accumulates lines until the word limit is reached.
   * Code blocks are counted but never split mid-block.
   *
   * @param {AsyncIterable} lineIterator - Yields {lineNumber, text} records.
   * @yields {Object} Chunk objects
   */
  async* process(lineIterator) {
    let buffer = [];
    let bufferWords = 0;
    let startLine = 1;
    let idx = 0;
    let inCodeBlock = false;
    let codeFence = "";
    let prevOverlapText = "";

    for await (const {lineNumber, text} of lineIterator) {
      // Track code fences; the length guard keeps a fenced info-string line
      // inside a block from being mistaken for the closing fence.
      const fm = text.match(/^(\s*)(```|~~~)/);
      if (fm && !inCodeBlock) {
        inCodeBlock = true;
        codeFence = fm[2];
      }
      else if (inCodeBlock && text.trim().startsWith(codeFence) && text.trim().length <= codeFence.length + 1) {
        inCodeBlock = false;
      }

      if (buffer.length === 0) startLine = lineNumber;
      buffer.push(text);
      bufferWords += countWords(text);

      // Emit only when over the limit AND outside any code block.
      if (bufferWords >= this.wordLimit && !inCodeBlock) {
        const raw = buffer.join("\n");
        const content = prevOverlapText ? prevOverlapText + "\n" + raw : raw;
        yield this._makeChunk(content, startLine, lineNumber, idx++, prevOverlapText);
        // Overlap for the next chunk: the last N words of this chunk's raw text.
        if (this.overlap > 0) {
          const words = raw.split(/\s+/).filter(Boolean);
          prevOverlapText = words.slice(-this.overlap).join(" ");
        } else {
          prevOverlapText = "";
        }
        buffer = [];
        bufferWords = 0;
      }
    }

    // Flush the final partial chunk.
    if (buffer.length > 0) {
      const raw = buffer.join("\n");
      const content = prevOverlapText ? prevOverlapText + "\n" + raw : raw;
      yield this._makeChunk(content, startLine, startLine + buffer.length - 1, idx, prevOverlapText);
    }
  }

  /**
   * Builds a chunk object for this strategy.
   * @private
   * @param {string} content - Chunk text (overlap prefix included).
   * @param {number} startLine - 1-based first source line.
   * @param {number} endLine - 1-based last source line.
   * @param {number} index - Zero-based chunk index.
   * @param {string} [overlap] - Overlap text prepended to the content.
   * @returns {Object} Chunk object.
   */
  _makeChunk(content, startLine, endLine, index, overlap = "") {
    return {
      id: generateChunkId(content, index, this.config.chunkIdPrefix),
      index, content,
      tokenEstimate: estimateTokens(content, this.config.charsPerToken),
      overlapTokens: estimateTokens(overlap, this.config.charsPerToken),
      charCount: content.length,
      wordCount: countWords(content),
      lines: {start: startLine, end: endLine},
      heading: null, headingPath: [], headingLevel: null,
      hasCode: /```[\s\S]*?```/.test(content),
      hasVideo: this.config.videoPattern.test(content),
      hasTable: /^\s*\|.*\|\s*$/m.test(content),
      languages: [...(content.matchAll(/```(\w+)/g))].map((m) => m[1]),
      links: extractLinks(content),
      videos: extractVideos(content),
      isOversized: countWords(content) > this.wordLimit * 1.5,
      containsAtomicBlock: false,
      blockTypes: [],
      strategy: this.name,
      metadata: {wordLimit: this.wordLimit},
    };
  }
}
|
|
1083
|
+
|
|
1084
|
+
/**
 * ──────────────────────────────────────────────────────────────────────────
 * STRATEGY 5: TokenLimitStrategy
 * ──────────────────────────────────────────────────────────────────────────
 * Splits by estimated token count. Same as CharLimit but uses token math.
 * Delegates to CharLimitStrategy internally with token→char conversion.
 *
 * Options (via strategyOptions):
 *   - tokenLimit: number (max tokens per chunk, default: config.maxChunkTokens)
 *   - overlap: number (token overlap; default config.overlapTokens;
 *     an explicit 0 disables overlap)
 */
export class TokenLimitStrategy extends CharLimitStrategy {
  /**
   * @param {Object} config - Splitter config; tokenLimit/overlap are converted
   *        to character units and passed to the CharLimitStrategy base.
   */
  constructor(config) {
    // FIX: `??` instead of `||` so explicit zero options are honored rather
    // than replaced by the config-level defaults.
    const tokenLimit = config.strategyOptions?.tokenLimit ?? config.maxChunkTokens;
    // Convert the token budget to a character budget for the base class.
    super({
      ...config,
      strategyOptions: {
        ...config.strategyOptions,
        charLimit: tokenLimit * config.charsPerToken,
        overlap: (config.strategyOptions?.overlap ?? config.overlapTokens) * config.charsPerToken,
      },
    });
    this.name = "token";
  }
}
|
|
1109
|
+
|
|
1110
|
+
// ─────────────────────────────────────────────────────────────────────────────
// STRATEGY REGISTRY
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Maps strategy names to their classes.
 * Custom strategies can be registered at runtime via
 * MarkdownTextSplitter.registerStrategy().
 *
 * Each strategy class must implement:
 *   constructor(config)
 *   async *process(lineIterator) → yields chunk objects
 *
 * A Map (rather than a plain object) is used so arbitrary user-chosen names
 * are safe (no prototype-key collisions) and insertion order is preserved
 * for MarkdownTextSplitter.getAvailableStrategies().
 */
const STRATEGY_REGISTRY = new Map([
  ["semantic", SemanticStrategy],
  ["delimiter", DelimiterStrategy],
  ["char", CharLimitStrategy],
  ["word", WordLimitStrategy],
  ["token", TokenLimitStrategy],
]);
|
|
1130
|
+
|
|
1131
|
+
// ─────────────────────────────────────────────────────────────────────────────
// LINK DEFINITION RESOLVER
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Post-processes chunks to append reference-style link definitions ([id]: url)
 * to every chunk that references them. Ensures zero link loss across chunks.
 *
 * Fixes vs. the previous version:
 *   - a reference id used several times in one chunk now emits its
 *     definition line only once (Set-based dedupe, insertion order kept);
 *   - the enriched content string is built once instead of twice;
 *   - charCount is refreshed to match the enriched content;
 *   - the chars-per-token divisor is a parameter (was a hard-coded 4),
 *     defaulting to 4 for backward compatibility.
 *
 * @param {Object[]} chunks - Array of chunk objects
 * @param {Map<string,string>} defs - Link definitions from the parser
 * @param {number} [charsPerToken=4] - Divisor for the token re-estimate
 * @returns {Object[]} Enriched chunks with resolved link definitions
 */
function resolveLinkDefinitions(chunks, defs, charsPerToken = 4) {
  if (defs.size === 0) return chunks;
  return chunks.map((chunk) => {
    // Matches [text][id], collapsed [id][] (empty second group falls back to
    // the first), and shorthand [id] not followed by "(" (i.e. not an inline
    // link). Fresh regex per chunk: /g regexes keep lastIndex state.
    const re = /\[([^\]]+)\]\[([^\]]*)\]|\[([^\]]+)\](?!\()/g;
    const refs = new Set();
    let m;
    while ((m = re.exec(chunk.content)) !== null) {
      const id = m[2] || m[1] || m[3];
      if (id && defs.has(id)) refs.add(id);
    }
    if (refs.size === 0) return chunk;

    const defBlock = [...refs].map((id) => `[${id}]: ${defs.get(id)}`).join("\n");
    const enriched = chunk.content + "\n\n" + defBlock;
    return {
      ...chunk,
      content: enriched,
      tokenEstimate: estimateTokens(enriched, charsPerToken),
      charCount: enriched.length,
      metadata: {...chunk.metadata, resolvedLinkRefs: [...refs]},
    };
  });
}
|
|
1165
|
+
|
|
1166
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
1167
|
+
// MAIN SPLITTER CLASS
|
|
1168
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
1169
|
+
|
|
1170
|
+
/**
 * MarkdownTextSplitter: The main entry point for browser usage.
 *
 * Orchestrates the full pipeline using a configurable splitting strategy:
 *   URL/String/File → Stream → Lines → Strategy.process() → Chunks
 *
 * USAGE (browser):
 *   import MarkdownTextSplitter from './MarkdownTextSplitter.js';
 *
 *   // Semantic (default):
 *   const splitter = new MarkdownTextSplitter();
 *   const chunks = await splitter.splitFromUrl('https://example.com/docs.md');
 *
 *   // Delimiter:
 *   new MarkdownTextSplitter({ strategy: 'delimiter', strategyOptions: { delimiter: '---' } });
 *
 *   // Character limit:
 *   new MarkdownTextSplitter({ strategy: 'char', strategyOptions: { charLimit: 5000, overlap: 200 } });
 *
 *   // Word limit:
 *   new MarkdownTextSplitter({ strategy: 'word', strategyOptions: { wordLimit: 800, overlap: 50 } });
 *
 *   // Custom strategy:
 *   MarkdownTextSplitter.registerStrategy('myStrategy', MyStrategyClass);
 *   new MarkdownTextSplitter({ strategy: 'myStrategy' });
 *
 * @class
 * @param {Partial<SplitterConfig>} [userConfig] - Override defaults
 */
export class MarkdownTextSplitter {
  constructor(userConfig = {}) {
    /** Merged configuration (shallow merge: a user-supplied strategyOptions
     *  object replaces the default one wholesale). */
    this.config = {...DEFAULT_CONFIG, ...userConfig};

    /** Active strategy instance (throws for unknown strategy names). */
    this.strategy = this._createStrategy();

    /** Processing statistics (populated after splitting). */
    this.stats = this._emptyStats();
  }

  /**
   * Registers a custom splitting strategy globally.
   * The class must implement: constructor(config) and async *process(lineIterator).
   *
   * @static
   * @param {string} name - Strategy identifier
   * @param {Function} strategyClass - Class constructor
   */
  static registerStrategy(name, strategyClass) {
    STRATEGY_REGISTRY.set(name, strategyClass);
  }

  /**
   * Lists all available strategy names (built-in + custom).
   * @static
   * @returns {string[]}
   */
  static getAvailableStrategies() {
    return [...STRATEGY_REGISTRY.keys()];
  }

  /**
   * Splits a remote Markdown file fetched via streaming HTTP and collects
   * every chunk into an array. Memory usage is O(result size); use
   * streamFromUrl() for chunk-at-a-time processing.
   *
   * @param {string} url - URL of the markdown file
   * @param {Object} [fetchOpts] - Additional fetch() options (headers, auth)
   * @returns {Promise<Object[]>} Array of chunk objects
   */
  async splitFromUrl(url, fetchOpts = {}) {
    const chunks = [];
    for await (const chunk of this.streamFromUrl(url, fetchOpts)) chunks.push(chunk);
    return chunks;
  }

  /**
   * Splits a Markdown string (for testing or small inputs).
   *
   * @param {string} markdown - Markdown content string
   * @returns {Promise<Object[]>} Array of chunk objects
   */
  async splitFromString(markdown) {
    const chunks = [];
    for await (const chunk of this.streamFromString(markdown)) chunks.push(chunk);
    return chunks;
  }

  /**
   * STREAMING: Yields chunks one-at-a-time from a URL.
   * Use with `for await` for memory-efficient processing of huge files.
   *
   * @param {string} url
   * @param {Object} [fetchOpts]
   * @yields {Object} Chunk objects
   */
  async* streamFromUrl(url, fetchOpts = {}) {
    const start = performance.now();
    this.stats = this._emptyStats();

    const controller = new AbortController();
    // NOTE(review): this timeout covers the WHOLE streaming consumption, not
    // just the initial response headers — very slow consumers of very large
    // files can be aborted mid-stream. Confirm that is the intended contract.
    const tid = setTimeout(() => controller.abort(), this.config.fetchTimeoutMs);

    try {
      const res = await fetch(url, {
        ...fetchOpts,
        signal: controller.signal,
        headers: {Accept: "text/markdown, text/plain, */*", ...fetchOpts.headers},
      });
      if (!res.ok) throw new Error(`HTTP ${res.status}: ${res.statusText}`);
      // res.body can be null (e.g. some opaque/no-body responses); fail loudly
      // instead of letting streamToLines crash on null.
      if (!res.body) throw new Error(`No response body stream for ${url}`);

      yield* this._run(streamToLines(res.body));
    } finally {
      clearTimeout(tid);
      this.stats.processingTimeMs = performance.now() - start;
      this.stats.source = url;
    }
  }

  /**
   * STREAMING: Yields chunks one-at-a-time from a string.
   *
   * @param {string} markdown
   * @yields {Object} Chunk objects
   */
  async* streamFromString(markdown) {
    const start = performance.now();
    this.stats = this._emptyStats();

    yield* this._run(stringToLines(markdown));

    this.stats.processingTimeMs = performance.now() - start;
    this.stats.source = "(string)";
  }

  /**
   * STREAMING: Yields chunks from a File or Blob (e.g. from <input type="file">).
   * Uses the browser File API stream() method.
   *
   * @param {File|Blob} file - A File or Blob object
   * @yields {Object} Chunk objects
   */
  async* streamFromFile(file) {
    const start = performance.now();
    this.stats = this._emptyStats();

    yield* this._run(streamToLines(file.stream()));

    this.stats.processingTimeMs = performance.now() - start;
    this.stats.source = file.name || "(blob)";
  }

  /**
   * Splits a File/Blob and returns all chunks as an array.
   *
   * @param {File|Blob} file
   * @returns {Promise<Object[]>}
   */
  async splitFromFile(file) {
    const chunks = [];
    for await (const chunk of this.streamFromFile(file)) chunks.push(chunk);
    return chunks;
  }

  /**
   * Internal: runs the active strategy pipeline, stamps per-chunk metadata,
   * and accumulates stats while forwarding each chunk to the caller.
   *
   * FIX: the previous version pushed every chunk into an unused `allChunks`
   * array, retaining the entire output in memory and defeating the documented
   * O(chunk_size) streaming guarantee. Chunks are now forwarded without being
   * retained.
   *
   * @private
   * @param {AsyncIterable} lineIterator
   * @yields {Object} Chunk objects
   */
  async* _run(lineIterator) {
    for await (const chunk of this.strategy.process(lineIterator)) {
      chunk.metadata.splitterVersion = SPLITTER_VERSION;
      chunk.metadata.strategy = this.strategy.name;

      this.stats.totalChunks++;
      this.stats.totalTokens += chunk.tokenEstimate;
      this.stats.totalChars += chunk.charCount;
      this.stats.totalWords += chunk.wordCount;
      if (chunk.isOversized) this.stats.oversizedChunks++;
      if (chunk.hasCode) this.stats.codeBlockChunks++;
      if (chunk.hasTable) this.stats.tableChunks++;
      if (chunk.hasVideo) this.stats.videoChunks++;

      yield chunk;
    }
    // NOTE(review): cross-chunk reference-link resolution for the semantic
    // strategy is not wired into this streaming path (resolveLinkDefinitions
    // exists but is not called here) — TODO confirm intended behavior.
  }

  /**
   * Returns processing stats from the last split operation.
   * @returns {Object} A shallow copy of the stats record.
   */
  getStats() { return {...this.stats}; }

  /**
   * Resets internal state for reuse.
   */
  reset() { this.stats = this._emptyStats(); }

  /**
   * Switches to a different strategy at runtime.
   *
   * @param {string} strategyName - Name of the strategy
   * @param {Object} [options] - Strategy-specific options (merged over current)
   */
  setStrategy(strategyName, options = {}) {
    this.config.strategy = strategyName;
    this.config.strategyOptions = {...this.config.strategyOptions, ...options};
    this.strategy = this._createStrategy();
  }

  /**
   * Instantiates the configured strategy from the registry.
   * @private
   * @returns {Object} Strategy instance
   * @throws {Error} If the configured strategy name is not registered.
   */
  _createStrategy() {
    const Cls = STRATEGY_REGISTRY.get(this.config.strategy);
    if (!Cls) {
      throw new Error(
        `Unknown strategy "${this.config.strategy}". Available: ${[...STRATEGY_REGISTRY.keys()].join(", ")}`
      );
    }
    return new Cls(this.config);
  }

  /** @private Fresh zeroed stats record. */
  _emptyStats() {
    return {
      totalChunks: 0, totalTokens: 0, totalChars: 0, totalWords: 0,
      oversizedChunks: 0, codeBlockChunks: 0, tableChunks: 0, videoChunks: 0,
      processingTimeMs: 0, source: "",
    };
  }
}
|
|
1431
|
+
|
|
1432
|
+
export default MarkdownTextSplitter;
|