@storepress/llm-md-text-splitter 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1432 @@
1
+ /**
2
+ * MarkdownTextSplitter.js (Browser-Native ES Module)
3
+ * ====================================================
4
+ * A high-performance, streaming Markdown text splitter for LLM consumption.
5
+ * Runs entirely in the browser — zero Node.js dependencies.
6
+ *
7
+ * KEY FEATURES:
8
+ * - Zero Sequence Loss: Code blocks, tables, reference links, and video embeds
9
+ * are NEVER split apart. They stay as atomic semantic units with their context.
10
+ * - Stream-based: Uses browser-native fetch() + ReadableStream + TextDecoderStream
11
+ * to process 100K+ line files without loading everything into RAM.
12
+ * - Pluggable Strategies: Ships with 5 splitting strategies and accepts custom ones:
13
+ * 1. SemanticStrategy — Split by Markdown structure (headings, paragraphs, code)
14
+ * 2. DelimiterStrategy — Split on a custom delimiter string (e.g. '---', '===')
15
+ * 3. CharLimitStrategy — Split by character count
16
+ * 4. WordLimitStrategy — Split by word count
17
+ * 5. TokenLimitStrategy — Split by estimated LLM token count
18
+ * - Rich Metadata: Each chunk carries positional, structural, and relational metadata.
19
+ * - Every function is commented with explanation.
20
+ * - Fully configurable via constructor options.
21
+ *
22
+ * BROWSER COMPATIBILITY: Chrome 71+, Firefox 65+, Safari 14.1+, Edge 79+
23
+ *
24
+ * @module MarkdownTextSplitter
25
+ * @version 0.0.1
26
+ * @license MIT
27
+ */
28
+
29
+ // ─────────────────────────────────────────────────────────────────────────────
30
+ // SEMANTIC BLOCK TYPES
31
+ // ─────────────────────────────────────────────────────────────────────────────
32
+
33
/** Version string of this splitter module (matches the `@version` tag in the file header). */
const SPLITTER_VERSION = "0.0.1";
34
+
35
/**
 * Frozen lookup table of every Markdown block kind the parser can emit.
 *
 * The SemanticStrategy uses these values to classify content and to decide
 * which blocks are atomic; the other strategies reference them for metadata.
 *
 * @readonly
 * @enum {string}
 */
export const BlockType = Object.freeze({
  // `#` heading lines (h1–h6)
  HEADING: "heading",
  // Regular text paragraphs
  PARAGRAPH: "paragraph",
  // Fenced ``` or ~~~ code blocks (ATOMIC)
  CODE_BLOCK: "code_block",
  // Ordered / unordered list items
  LIST: "list",
  // `>` blockquoted text
  BLOCKQUOTE: "blockquote",
  // Markdown tables (ATOMIC)
  TABLE: "table",
  // `[id]: url` reference definitions
  LINK_REF: "link_reference",
  // YouTube / Vimeo embeds (ATOMIC)
  VIDEO_EMBED: "video_embed",
  // `---` or `***` horizontal rules
  HR: "hr",
  // Blank lines (separators)
  EMPTY: "empty",
  // YAML frontmatter `---` blocks (ATOMIC)
  FRONTMATTER: "frontmatter",
  // Raw HTML blocks
  HTML_BLOCK: "html_block",
  // `![alt](url)` images
  IMAGE: "image",
});
58
+
59
/**
 * Block types that are treated as indivisible units.
 *
 * A block whose type is in this set is flagged `isAtomic` by the parser, and
 * the chunk assembler emits it whole instead of splitting it. This set is
 * what the "zero sequence loss" guarantee is built on.
 */
const ATOMIC_BLOCKS = new Set()
  .add(BlockType.CODE_BLOCK)
  .add(BlockType.TABLE)
  .add(BlockType.VIDEO_EMBED)
  .add(BlockType.FRONTMATTER);
70
+
71
+ // ─────────────────────────────────────────────────────────────────────────────
72
+ // DEFAULT CONFIGURATION
73
+ // ─────────────────────────────────────────────────────────────────────────────
74
+
75
/**
 * Baseline splitter settings.
 *
 * Every key can be overridden through the constructor. Strategy-specific
 * settings live under `strategyOptions`. The object itself is frozen
 * (shallowly), so callers must copy it before changing anything.
 *
 * @typedef {Object} SplitterConfig
 * @property {number}  maxChunkTokens      - Target max tokens per chunk (~4 chars/token)
 * @property {number}  overlapTokens       - Tokens of overlap between consecutive chunks
 * @property {number}  charsPerToken       - Characters-per-token ratio used for estimation
 * @property {number}  fetchTimeoutMs      - HTTP fetch timeout in milliseconds
 * @property {boolean} preserveCodeContext - Group code blocks with surrounding text
 * @property {boolean} preserveLinks       - Group reference links with their sections
 * @property {boolean} preserveVideos      - Group video embeds with their context
 * @property {string}  chunkIdPrefix       - Prefix for generated chunk IDs
 * @property {RegExp}  videoPattern        - Regex that detects video embed lines
 * @property {RegExp}  linkRefPattern      - Regex that detects reference-style link definitions
 * @property {string}  strategy            - Active strategy name: 'semantic'|'delimiter'|'char'|'word'|'token'
 * @property {Object}  strategyOptions     - Strategy-specific options (see each strategy)
 */
export const DEFAULT_CONFIG = Object.freeze({
  // Sizing
  maxChunkTokens: 1500,
  overlapTokens: 150,
  charsPerToken: 4,

  // Network
  fetchTimeoutMs: 60_000,

  // Context-grouping switches
  preserveCodeContext: true,
  preserveLinks: true,
  preserveVideos: true,

  // Chunk identity
  chunkIdPrefix: "chunk",

  // Markdown link to YouTube/Vimeo, or an <iframe> mentioning either platform
  videoPattern: /(?:\[.*?\]\((?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/watch\?v=|youtu\.be\/|vimeo\.com\/)[\w-]+.*?\))|(?:<iframe[^>]*(?:youtube|vimeo)[^>]*>)/i,
  // "[id]: target" reference definitions; group 1 = id, group 2 = target
  linkRefPattern: /^\s*\[([^\]]+)\]:\s+(.+)$/,

  // Strategy selection
  strategy: "semantic",
  strategyOptions: {},
});
109
+
110
+ // ─────────────────────────────────────────────────────────────────────────────
111
+ // UTILITY FUNCTIONS
112
+ // ─────────────────────────────────────────────────────────────────────────────
113
+
114
/**
 * Rough token-count estimate: text length divided by a fixed
 * characters-per-token ratio, rounded up.
 *
 * This is a heuristic only; swap in a real tokenizer when exact counts matter.
 *
 * @param {string} text - Text to measure; any falsy value counts as 0 tokens
 * @param {number} charsPerToken - Characters assumed per token (default 4)
 * @returns {number} Estimated number of tokens
 */
export function estimateTokens(text, charsPerToken = 4) {
  return text ? Math.ceil(text.length / charsPerToken) : 0;
}
126
+
127
/**
 * Counts whitespace-separated words.
 *
 * A "word" is any maximal run of non-whitespace characters; `\s`/`\S` cover
 * Unicode whitespace, so non-ASCII separators are handled too.
 *
 * @param {string} text - Text to count words in; any falsy value yields 0
 * @returns {number} Number of words
 */
export function countWords(text) {
  if (!text) {
    return 0;
  }
  const words = text.match(/\S+/g);
  return words === null ? 0 : words.length;
}
137
+
138
/**
 * Builds a deterministic chunk ID from a 32-bit FNV-1a hash of the content
 * plus the chunk's sequential index.
 *
 * FNV-1a is used instead of SubtleCrypto because IDs are needed
 * synchronously. The hash runs over UTF-16 code units (`charCodeAt`), not
 * UTF-8 bytes; for ASCII content this matches the standard FNV-1a vectors.
 *
 * @param {string} content - Content to hash
 * @param {number} index - Sequential chunk index (zero-padded to 4 digits)
 * @param {string} prefix - ID prefix
 * @returns {string} ID such as "chunk_a1b2c3d4_0042"
 */
export function generateChunkId(content, index, prefix = "chunk") {
  const FNV_OFFSET_BASIS = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;

  let hash = FNV_OFFSET_BASIS;
  for (let i = 0; i < content.length; i++) {
    hash ^= content.charCodeAt(i);
    // Math.imul does a true 32-bit multiply. A plain `*` would overflow the
    // 53-bit float mantissa and silently round away the low bits of the hash.
    hash = Math.imul(hash, FNV_PRIME);
  }

  // `>>> 0` reinterprets the signed 32-bit result as unsigned before hex formatting.
  const hex = (hash >>> 0).toString(16).padStart(8, "0");
  return `${prefix}_${hex}_${String(index).padStart(4, "0")}`;
}
158
+
159
/**
 * Collects every inline `[text](url)` occurrence in a Markdown string.
 *
 * The pattern does not look at the character before `[`, so image syntax
 * (`![alt](src)`) is reported as well.
 *
 * @param {string} text - Markdown text to scan
 * @returns {Array<{text: string, url: string}>} Links in order of appearance
 */
export function extractLinks(text) {
  const inlineLink = /\[([^\]]+)\]\(([^)]+)\)/g;
  const found = [];
  for (let hit = inlineLink.exec(text); hit !== null; hit = inlineLink.exec(text)) {
    const [, label, target] = hit;
    found.push({text: label, url: target});
  }
  return found;
}
175
+
176
/**
 * Extracts YouTube and Vimeo video references from text.
 *
 * Recognised URL shapes (scheme and `www.` are optional):
 *   - youtube.com/watch?v=ID, youtube.com/embed/ID, youtu.be/ID
 *   - vimeo.com/ID, player.vimeo.com/video/ID
 *
 * The embed/player forms are what `<iframe>` embeds use. Without them a line
 * classified as a video embed because of its iframe would yield no entries.
 *
 * All YouTube matches are returned first, followed by all Vimeo matches
 * (each group in order of appearance). URLs are normalised to the canonical
 * watch/page form.
 *
 * @param {string} text - Text to scan for video URLs
 * @returns {Array<{platform: string, url: string, videoId: string}>}
 */
export function extractVideos(text) {
  const youtubeRe = /(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:watch\?v=|embed\/)|youtu\.be\/)([\w-]+)/g;
  const vimeoRe = /(?:https?:\/\/)?(?:www\.|player\.)?vimeo\.com\/(?:video\/)?(\d+)/g;

  const videos = [];

  // Runs one global regex to exhaustion, turning each captured ID into an entry.
  const collect = (pattern, platform, toUrl) => {
    for (let m = pattern.exec(text); m !== null; m = pattern.exec(text)) {
      const videoId = m[1];
      videos.push({platform, url: toUrl(videoId), videoId});
    }
  };

  collect(youtubeRe, "youtube", (id) => `https://www.youtube.com/watch?v=${id}`);
  collect(vimeoRe, "vimeo", (id) => `https://vimeo.com/${id}`);

  return videos;
}
196
+
197
+ // ─────────────────────────────────────────────────────────────────────────────
198
+ // STREAM → LINE ITERATOR (Browser-native)
199
+ // ─────────────────────────────────────────────────────────────────────────────
200
+
201
/**
 * Converts a ReadableStream<Uint8Array> into an async iterator of lines.
 *
 * Bytes are decoded with TextDecoderStream, so multi-byte UTF-8 sequences
 * that straddle chunk boundaries are reassembled before splitting. Only the
 * current decoded chunk plus the unfinished trailing line are buffered.
 *
 * Handles:
 *   - `\n` and `\r\n` line endings (a trailing `\r` is stripped from each line)
 *   - A final line that is not newline-terminated
 *   - Early termination: if the consumer stops iterating (`break`, `return`,
 *     or a thrown error) the underlying stream is cancelled, so an upstream
 *     source such as a fetch body does not keep producing data nobody reads.
 *
 * @param {ReadableStream<Uint8Array>} byteStream - Raw byte stream
 * @yields {{ lineNumber: number, text: string }} One line at a time (1-based numbering)
 */
export async function* streamToLines(byteStream) {
  const textStream = byteStream.pipeThrough(new TextDecoderStream("utf-8"));
  const reader = textStream.getReader();
  let buffer = "";
  let lineNumber = 0;
  let drained = false; // becomes true only when the stream was read to its end

  try {
    while (true) {
      const {done, value} = await reader.read();
      if (done) break;

      buffer += value;
      const lines = buffer.split("\n");
      // The last piece has no terminating "\n" yet; carry it into the next read.
      buffer = lines.pop() ?? "";

      for (const line of lines) {
        lineNumber++;
        yield {lineNumber, text: line.replace(/\r$/, "")};
      }
    }

    // Flush a final line that was not newline-terminated.
    if (buffer.length > 0) {
      lineNumber++;
      yield {lineNumber, text: buffer.replace(/\r$/, "")};
    }
    drained = true;
  } finally {
    if (!drained) {
      // Early exit or failure: tell the source to stop producing.
      try {
        await reader.cancel();
      } catch {
        // The stream is already errored; the original error (if any) is the
        // one that should reach the caller, so the cancel failure is ignored.
      }
    }
    reader.releaseLock();
  }
}
241
+
242
/**
 * Line iterator over an in-memory string.
 *
 * The string is UTF-8 encoded, fed through a ReadableStream in 64 KB slices
 * (to mimic how a network body arrives), and then handed to `streamToLines`,
 * so in-memory input goes through the same pipeline as streamed input.
 * Handy for tests and for content that is already loaded.
 *
 * @param {string} text - The full markdown string
 * @yields {{ lineNumber: number, text: string }} One line at a time
 */
export async function* stringToLines(text) {
  const SLICE_BYTES = 65536;
  const payload = new TextEncoder().encode(text);

  const source = new ReadableStream({
    start(controller) {
      let offset = 0;
      while (offset < payload.length) {
        controller.enqueue(payload.slice(offset, offset + SLICE_BYTES));
        offset += SLICE_BYTES;
      }
      controller.close();
    },
  });

  yield* streamToLines(source);
}
265
+
266
+ // ─────────────────────────────────────────────────────────────────────────────
267
+ // SEMANTIC PARSER
268
+ // ─────────────────────────────────────────────────────────────────────────────
269
+
270
/**
 * SemanticParser: classifies Markdown lines into typed semantic blocks.
 *
 * Consumes an async line iterator and yields one block object per Markdown
 * construct, tracking the heading hierarchy so each block carries its
 * breadcrumb path. Fenced code, tables, video embeds and frontmatter are
 * flagged atomic.
 *
 * Each emitted block:
 *   {
 *     type, content, lines: {start, end}, heading, headingLevel, headingPath,
 *     language, isAtomic, links, videos, metadata
 *   }
 *
 * Parsing rules worth knowing:
 *   - Fences follow CommonMark: the opening run (3+ backticks or tildes) is
 *     remembered in full, and only a line made solely of the same character,
 *     at least as long, closes the block. A backtick "fence" whose info
 *     string contains a backtick (e.g. inline ```code```) is not a fence.
 *   - A horizontal rule is 3+ of the SAME marker (-, * or _), optionally
 *     separated by whitespace.
 *
 * @class
 * @param {Object} config - Splitter configuration (reads `videoPattern` and `linkRefPattern`)
 */
export class SemanticParser {
  constructor(config) {
    /** @private Splitter config reference */
    this.config = config;
    /** @private Heading breadcrumb stack of {level, text} */
    this.headingStack = [];
    /** @private Accumulated reference-style link definitions (id → target) */
    this.linkDefinitions = new Map();
    /** @private True while inside the leading frontmatter block */
    this.inFrontmatter = false;
    /** @private Whether any non-frontmatter line has been seen */
    this.contentStarted = false;
  }

  /**
   * Main parse generator. Yields one semantic block at a time.
   *
   * Classification priority per line: frontmatter → fenced code → table →
   * blank line → heading → horizontal rule → video embed → link reference →
   * image → blockquote → list → HTML → paragraph.
   *
   * @param {AsyncIterable<{lineNumber: number, text: string}>} lineIterator
   * @yields {Object} Semantic block objects
   */
  async* parse(lineIterator) {
    // Patterns are created once per parse rather than once per line.
    // Backtick fences may not have backticks in the info string; tilde fences may.
    const fenceOpenRe = /^\s*(?:(`{3,})([^`]*)|(~{3,})(.*))$/;
    // A separator row (|---|:-:|) also satisfies this, so one test covers both.
    const tableRowRe = /^\s*\|.*\|\s*$/;
    const headingRe = /^(#{1,6})\s+(.+)$/;
    // Back-reference keeps the match linear-time and requires one marker kind.
    const hrRe = /^\s*([-*_])(?:\s*\1){2,}\s*$/;
    const imageRe = /^\s*!\[.*\]\(.*\)\s*$/;
    const listItemRe = /^\s*[-*+]\s+|^\s*\d+\.\s+/;
    const htmlOpenRe = /^\s*<[a-zA-Z]/;
    const anchorOpenRe = /^\s*<a\s/;

    let currentBlock = null;
    let inCodeBlock = false;
    let codeFence = "";
    let inTable = false;

    // Adds one more source line to a multi-line block.
    const append = (block, text, lineNumber) => {
      block.content += "\n" + text;
      block.lines.end = lineNumber;
    };

    for await (const {lineNumber, text} of lineIterator) {
      // ── FRONTMATTER (--- on line 1) ──
      if (!this.contentStarted && lineNumber === 1 && text.trim() === "---") {
        this.inFrontmatter = true;
        currentBlock = this._block(BlockType.FRONTMATTER, text, lineNumber);
        continue;
      }
      if (this.inFrontmatter) {
        append(currentBlock, text, lineNumber);
        if (text.trim() === "---" && lineNumber > 1) {
          this.inFrontmatter = false;
          currentBlock.metadata.frontmatter = this._parseFrontmatter(currentBlock.content);
          yield currentBlock;
          currentBlock = null;
        }
        continue;
      }
      this.contentStarted = true;

      // ── INSIDE A FENCED CODE BLOCK — everything is content until the closer ──
      if (inCodeBlock) {
        append(currentBlock, text, lineNumber);
        if (this._isClosingFence(text, codeFence)) {
          inCodeBlock = false;
          yield currentBlock;
          currentBlock = null;
        }
        continue;
      }

      // ── FENCED CODE BLOCK OPENER (``` or ~~~) — highest priority ──
      const fenceMatch = text.match(fenceOpenRe);
      if (fenceMatch) {
        if (currentBlock) yield currentBlock;
        // A fence directly after a table row ends the table. Without this
        // reset the table state would outlive its block and the next line
        // would dereference or yield a null block.
        inTable = false;
        inCodeBlock = true;
        codeFence = fenceMatch[1] ?? fenceMatch[3];
        const info = fenceMatch[2] ?? fenceMatch[4] ?? "";
        currentBlock = this._block(BlockType.CODE_BLOCK, text, lineNumber);
        currentBlock.language = info.trim().split(/\s+/)[0] || "";
        currentBlock.isAtomic = true;
        continue;
      }

      // ── TABLE ──
      if (tableRowRe.test(text)) {
        if (inTable) {
          append(currentBlock, text, lineNumber);
        } else {
          if (currentBlock) yield currentBlock;
          inTable = true;
          currentBlock = this._block(BlockType.TABLE, text, lineNumber);
          currentBlock.isAtomic = true;
        }
        continue;
      }
      if (inTable) {
        // First non-table line after a table: emit the finished table.
        inTable = false;
        if (currentBlock) yield currentBlock;
        currentBlock = null;
      }

      // ── EMPTY LINE — terminates whatever block is open ──
      if (text.trim() === "") {
        if (currentBlock) {
          yield currentBlock;
          currentBlock = null;
        }
        continue;
      }

      // ── HEADING ──
      const hMatch = text.match(headingRe);
      if (hMatch) {
        if (currentBlock) yield currentBlock;
        const level = hMatch[1].length;
        const hText = hMatch[2].trim();
        this._pushHeading(level, hText);
        const blk = this._block(BlockType.HEADING, text, lineNumber);
        blk.headingLevel = level;
        blk.heading = hText;
        blk.headingPath = this.headingStack.map((h) => h.text);
        yield blk;
        currentBlock = null;
        continue;
      }

      // ── HORIZONTAL RULE ──
      if (hrRe.test(text)) {
        if (currentBlock) yield currentBlock;
        yield this._block(BlockType.HR, text, lineNumber);
        currentBlock = null;
        continue;
      }

      // ── VIDEO EMBED ──
      if (this.config.videoPattern.test(text)) {
        if (currentBlock) yield currentBlock;
        const vb = this._block(BlockType.VIDEO_EMBED, text, lineNumber);
        vb.isAtomic = true;
        vb.videos = extractVideos(text);
        yield vb;
        currentBlock = null;
        continue;
      }

      // ── MULTI-LINE RUN TYPES (consecutive lines of one kind merge) ──
      let runType;
      const lrMatch = text.match(this.config.linkRefPattern);
      if (lrMatch) {
        // Reference definitions are also recorded for getLinkDefinitions().
        this.linkDefinitions.set(lrMatch[1], lrMatch[2].trim());
        runType = BlockType.LINK_REF;
      } else if (imageRe.test(text)) {
        // Images are single-line blocks; they never merge.
        if (currentBlock) yield currentBlock;
        yield this._block(BlockType.IMAGE, text, lineNumber);
        currentBlock = null;
        continue;
      } else if (text.startsWith(">")) {
        runType = BlockType.BLOCKQUOTE;
      } else if (listItemRe.test(text)) {
        runType = BlockType.LIST;
      } else if (htmlOpenRe.test(text) && !anchorOpenRe.test(text)) {
        // A line starting with <a ...> is treated as inline content, not an HTML block.
        runType = BlockType.HTML_BLOCK;
      } else {
        runType = BlockType.PARAGRAPH;
      }

      if (currentBlock && currentBlock.type === runType) {
        append(currentBlock, text, lineNumber);
      } else {
        if (currentBlock) yield currentBlock;
        currentBlock = this._block(runType, text, lineNumber);
      }
    }

    // Emit whatever is still open at end of input (including unterminated
    // code blocks, tables and frontmatter).
    if (currentBlock) yield currentBlock;
  }

  /**
   * Tells whether a line closes the currently open fence: after trimming it
   * must consist only of the fence character and be at least as long as the
   * opening run.
   * @private
   * @param {string} text - Candidate line
   * @param {string} fence - The full opening fence run (e.g. "````")
   * @returns {boolean}
   */
  _isClosingFence(text, fence) {
    const trimmed = text.trim();
    if (trimmed.length < fence.length) return false;
    const marker = fence[0];
    for (const ch of trimmed) {
      if (ch !== marker) return false;
    }
    return true;
  }

  /**
   * Creates a new semantic block that inherits the current heading context.
   * @private
   * @param {string} type - One of the BlockType values
   * @param {string} content - First line of the block
   * @param {number} lineNumber - Line on which the block starts
   * @returns {Object} Block with `isAtomic` derived from ATOMIC_BLOCKS
   */
  _block(type, content, lineNumber) {
    const cur = this.headingStack.length > 0
      ? this.headingStack[this.headingStack.length - 1]
      : null;
    return {
      type,
      content,
      lines: {start: lineNumber, end: lineNumber},
      heading: cur?.text || null,
      headingLevel: cur?.level || null,
      headingPath: this.headingStack.map((h) => h.text),
      language: null,
      isAtomic: ATOMIC_BLOCKS.has(type),
      links: [],
      videos: [],
      metadata: {},
    };
  }

  /**
   * Maintains the heading breadcrumb stack: headings at the same or a deeper
   * level are popped before the new one is pushed, so the stack always
   * reflects the current nesting.
   * @private
   * @param {number} level - Heading level (1–6)
   * @param {string} text - Heading text
   */
  _pushHeading(level, text) {
    while (this.headingStack.length > 0 &&
      this.headingStack[this.headingStack.length - 1].level >= level) {
      this.headingStack.pop();
    }
    this.headingStack.push({level, text});
  }

  /**
   * Minimal `key: value` frontmatter parser. The first and last lines (the
   * `---` delimiters) are skipped; lines without a colon after the first
   * character are ignored. Values are kept as trimmed strings.
   * @private
   * @param {string} content - Full frontmatter block including delimiters
   * @returns {Object<string, string>}
   */
  _parseFrontmatter(content) {
    const result = {};
    for (const line of content.split("\n").slice(1, -1)) {
      const i = line.indexOf(":");
      if (i > 0) result[line.slice(0, i).trim()] = line.slice(i + 1).trim();
    }
    return result;
  }

  /** Returns a copy of all accumulated `[id]: url` link definitions. */
  getLinkDefinitions() {
    return new Map(this.linkDefinitions);
  }
}
550
+
551
+ // ─────────────────────────────────────────────────────────────────────────────
552
+ // SPLITTING STRATEGIES
553
+ // ─────────────────────────────────────────────────────────────────────────────
554
+
555
/**
 * ──────────────────────────────────────────────────────────────────────────
 * STRATEGY 1: SemanticStrategy (Default)
 * ──────────────────────────────────────────────────────────────────────────
 * Splits by Markdown structure with context grouping: code blocks stay with
 * their explanatory text, tables and videos stay whole, and a new heading of
 * the same or a higher rank starts a new group.
 *
 * Config keys read:
 *   - maxChunkTokens: number (target chunk size in tokens)
 *   - overlapTokens: number (overlap carried from one chunk into the next)
 *   - charsPerToken, chunkIdPrefix
 *   - preserveCodeContext, preserveLinks, preserveVideos
 */
export class SemanticStrategy {
  /**
   * @param {Object} config - Full splitter config
   */
  constructor(config) {
    this.config = config;
    this.name = "semantic";
  }

  /**
   * Runs the full semantic pipeline:
   *   lines → SemanticParser → block grouping → chunk assembly
   *
   * @param {AsyncIterable} lineIterator - Async line iterator
   * @yields {Object} Final chunk objects with metadata
   */
  async* process(lineIterator) {
    const parser = new SemanticParser(this.config);
    const blocks = parser.parse(lineIterator);
    const groups = this._groupBlocks(blocks);
    yield* this._assembleChunks(groups, parser);
  }

  /**
   * Decides whether a block must stay in the same group as the block before
   * it, regardless of size budget.
   * @private
   * @param {Object} block - Incoming block
   * @param {Object} last - Most recently buffered block
   * @returns {boolean}
   */
  _staysWithPrevious(block, last) {
    const {preserveCodeContext, preserveVideos, preserveLinks} = this.config;
    const isCode = block.type === BlockType.CODE_BLOCK;
    const lastIsCode = last.type === BlockType.CODE_BLOCK;

    // Code following its lead-in text, or text following the code it explains.
    if (preserveCodeContext && isCode &&
      [BlockType.PARAGRAPH, BlockType.HEADING, BlockType.LIST].includes(last.type)) {
      return true;
    }
    if (preserveCodeContext && lastIsCode &&
      [BlockType.PARAGRAPH, BlockType.LIST].includes(block.type)) {
      return true;
    }
    // Videos and reference links ride along with whatever precedes them.
    if (preserveVideos && block.type === BlockType.VIDEO_EMBED) return true;
    if (preserveLinks && block.type === BlockType.LINK_REF) return true;
    // Consecutive code blocks (e.g. input/output pairs).
    return isCode && lastIsCode;
  }

  /**
   * Groups related semantic blocks so related content is never separated.
   *
   * A group's `heading`/`headingLevel` describe the most recent heading that
   * is part of this or an earlier group. The heading that TRIGGERS a flush
   * belongs to the next group and must not label the one being flushed, so
   * the tracker is updated only when a block is actually buffered.
   *
   * @private
   * @param {AsyncIterable<Object>} blocks - Blocks from SemanticParser
   * @yields {Object} Group objects
   */
  async* _groupBlocks(blocks) {
    const buf = [];
    let lastHeading = null;

    const tokensOf = (text) => estimateTokens(text, this.config.charsPerToken);

    // Buffers a block and records it as the current heading if it is one.
    const add = (block) => {
      buf.push(block);
      if (block.type === BlockType.HEADING) {
        lastHeading = {text: block.heading, level: block.headingLevel};
      }
    };

    // Turns the buffer into a group object and empties it; null when empty.
    const flush = () => {
      if (buf.length === 0) return null;
      const content = buf.map((b) => b.content).join("\n\n");
      const group = {
        blocks: [...buf],
        content,
        lines: {start: buf[0].lines.start, end: buf[buf.length - 1].lines.end},
        heading: lastHeading?.text || buf[0].heading,
        headingLevel: lastHeading?.level || buf[0].headingLevel,
        headingPath: buf[0].headingPath,
        hasCode: buf.some((b) => b.type === BlockType.CODE_BLOCK),
        hasVideo: buf.some((b) => b.type === BlockType.VIDEO_EMBED),
        hasTable: buf.some((b) => b.type === BlockType.TABLE),
        links: buf.flatMap((b) => extractLinks(b.content)),
        videos: buf.flatMap((b) => extractVideos(b.content)),
        languages: [...new Set(buf.filter((b) => b.language).map((b) => b.language))],
        tokenEstimate: tokensOf(content),
        isAtomic: buf.some((b) => b.isAtomic),
      };
      buf.length = 0;
      return group;
    };

    for await (const block of blocks) {
      if (buf.length === 0) {
        add(block);
        continue;
      }

      const last = buf[buf.length - 1];

      // A heading of the same or a higher rank than the current context
      // starts a new group.
      if (block.type === BlockType.HEADING && block.headingLevel <= (last.headingLevel || 99)) {
        const g = flush();
        if (g) yield g;
        add(block);
        continue;
      }

      // Keep-together rules override the size budget (zero sequence loss).
      if (this._staysWithPrevious(block, last)) {
        add(block);
        continue;
      }

      // A horizontal rule is an explicit break; the rule itself is dropped.
      if (block.type === BlockType.HR) {
        const g = flush();
        if (g) yield g;
        continue;
      }

      // The buffer size is only needed from here on, so it is computed lazily.
      const bufTokens = tokensOf(buf.map((b) => b.content).join("\n\n"));

      // Buffer already far over budget (keep-together rules can cause this).
      if (bufTokens > this.config.maxChunkTokens * 2) {
        const g = flush();
        if (g) yield g;
        add(block);
        continue;
      }

      // Otherwise keep grouping while within 1.5× the target size.
      const newTokens = bufTokens + tokensOf(block.content);
      if (newTokens > this.config.maxChunkTokens * 1.5) {
        const g = flush();
        if (g) yield g;
      }
      add(block);
    }

    const g = flush();
    if (g) yield g;
  }

  /**
   * Merges groups into final LLM-ready chunks, prefixing each chunk with an
   * overlap taken from the end of the previous one. An atomic group that is
   * larger than the budget is emitted on its own, unsplit.
   *
   * @private
   * @param {AsyncIterable<Object>} groups - Groups from `_groupBlocks`
   * @param {SemanticParser} parser - Accepted for interface compatibility; not used here
   * @yields {Object} Chunk objects
   */
  async* _assembleChunks(groups, parser) {
    let idx = 0;
    let pending = "";
    let pendingGroups = [];
    let pendingTokens = 0;
    let prevTail = "";

    const makeChunk = (content, srcGroups, overlap = "") => {
      const full = overlap ? overlap + "\n\n" + content : content;
      const chunk = this._buildChunkObject(full, content, overlap, srcGroups, idx);
      idx++;
      return chunk;
    };

    // Returns the trailing text of `content` to carry into the next chunk.
    const tail = (content) => {
      if (this.config.overlapTokens <= 0) return "";
      const chars = this.config.overlapTokens * this.config.charsPerToken;
      if (content.length <= chars) return content;
      const t = content.slice(-chars);
      // Start at the first paragraph break so the overlap begins on a whole
      // paragraph. `>= 0` also covers a break at the very start of the slice.
      const br = t.indexOf("\n\n");
      return br >= 0 ? t.slice(br + 2) : t;
    };

    // Emits the accumulated groups as one chunk and resets the accumulator.
    const drainPending = () => {
      const chunk = makeChunk(pending, pendingGroups, prevTail);
      prevTail = tail(pending);
      pending = "";
      pendingGroups = [];
      pendingTokens = 0;
      return chunk;
    };

    for await (const group of groups) {
      const gTokens = group.tokenEstimate;

      // Oversized atomic group → flush what we have, then emit it standalone.
      if (group.isAtomic && gTokens > this.config.maxChunkTokens) {
        if (pendingGroups.length > 0) yield drainPending();
        yield makeChunk(group.content, [group], prevTail);
        prevTail = tail(group.content);
        continue;
      }

      // Adding this group would exceed the budget → flush first.
      if (pendingGroups.length > 0 && pendingTokens + gTokens > this.config.maxChunkTokens) {
        yield drainPending();
      }

      pending += (pending ? "\n\n" : "") + group.content;
      pendingGroups.push(group);
      pendingTokens += gTokens;
    }

    if (pendingGroups.length > 0) yield makeChunk(pending, pendingGroups, prevTail);
  }

  /**
   * Constructs a fully enriched chunk object.
   *
   * @private
   * @param {string} fullContent - Overlap + content, as delivered in the chunk
   * @param {string} rawContent - Content without the overlap prefix (currently unused)
   * @param {string} overlap - Overlap text carried from the previous chunk
   * @param {Array<Object>} groups - Groups merged into this chunk
   * @param {number} index - Sequential chunk index
   * @returns {Object} Chunk with positional, structural and size metadata
   */
  _buildChunkObject(fullContent, rawContent, overlap, groups, index) {
    const {charsPerToken, maxChunkTokens, chunkIdPrefix} = this.config;
    const first = groups[0];
    const final = groups[groups.length - 1];
    const tokenEstimate = estimateTokens(fullContent, charsPerToken);

    return {
      id: generateChunkId(fullContent, index, chunkIdPrefix),
      index,
      content: fullContent,
      tokenEstimate,
      overlapTokens: estimateTokens(overlap, charsPerToken),
      charCount: fullContent.length,
      wordCount: countWords(fullContent),
      lines: {
        start: first?.lines?.start || 0,
        end: final?.lines?.end || 0,
      },
      heading: first?.heading || null,
      headingPath: first?.headingPath || [],
      headingLevel: first?.headingLevel || null,
      hasCode: groups.some((g) => g.hasCode),
      hasVideo: groups.some((g) => g.hasVideo),
      hasTable: groups.some((g) => g.hasTable),
      languages: [...new Set(groups.flatMap((g) => g.languages || []))],
      links: groups.flatMap((g) => g.links || []),
      videos: groups.flatMap((g) => g.videos || []),
      isOversized: tokenEstimate > maxChunkTokens * 1.5,
      containsAtomicBlock: groups.some((g) => g.isAtomic),
      blockTypes: [...new Set(groups.flatMap((g) => g.blocks ? g.blocks.map((b) => b.type) : []))],
      strategy: "semantic",
      metadata: {},
    };
  }
}
796
+
797
/**
 * ──────────────────────────────────────────────────────────────────────────
 * STRATEGY 2: DelimiterStrategy
 * ──────────────────────────────────────────────────────────────────────────
 * Splits on a custom delimiter line (e.g. '---', '===', '<!-- split -->').
 * A line counts as a delimiter when its trimmed text equals the delimiter
 * exactly. Delimiter lines inside fenced code blocks never split.
 *
 * Options (via strategyOptions):
 *   - delimiter: string (default: '---')
 *   - keepDelimiter: boolean (keep the delimiter line as the first line of
 *     the following chunk; default: false)
 *   - trimChunks: boolean (trim whitespace from chunk edges, default: true)
 */
export class DelimiterStrategy {
  /**
   * @param {Object} config - Full splitter config
   */
  constructor(config) {
    this.config = config;
    this.delimiter = config.strategyOptions?.delimiter || "---";
    this.keepDelimiter = config.strategyOptions?.keepDelimiter || false;
    this.trimChunks = config.strategyOptions?.trimChunks !== false;
    this.name = "delimiter";
  }

  /**
   * Accumulates lines and emits a chunk whenever the delimiter is met
   * outside a fenced code block.
   *
   * Fence tracking follows CommonMark: the full opening run (3+ backticks or
   * tildes) is remembered, and only a line made solely of that character, at
   * least as long, closes the block. Inner lines such as "```js" inside a
   * longer fence therefore do not end the protection early.
   *
   * @param {AsyncIterable} lineIterator
   * @yields {Object} Chunk objects
   */
  async* process(lineIterator) {
    // Backtick fences may not carry backticks in their info string.
    const fenceOpenRe = /^\s*(?:(`{3,})[^`]*|(~{3,}).*)$/;

    let buffer = [];
    let startLine = 1;
    let idx = 0;
    let openFence = null; // full opening fence run while inside a code block

    for await (const {lineNumber, text} of lineIterator) {
      const trimmed = text.trim();

      if (openFence === null) {
        const opener = text.match(fenceOpenRe);
        if (opener) openFence = opener[1] ?? opener[2];
      } else if (this._isClosingFence(trimmed, openFence)) {
        openFence = null;
      }

      // Only split on the delimiter when NOT inside a code block.
      if (openFence === null && trimmed === this.delimiter) {
        if (buffer.length > 0) {
          yield this._makeChunk(buffer, startLine, lineNumber - 1, idx++);
        }
        if (this.keepDelimiter) {
          // The delimiter line is the first line of the next chunk, so that
          // chunk starts on this line, not the one after it.
          buffer = [text];
          startLine = lineNumber;
        } else {
          buffer = [];
          startLine = lineNumber + 1;
        }
        continue;
      }

      if (buffer.length === 0) startLine = lineNumber;
      buffer.push(text);
    }

    if (buffer.length > 0) {
      yield this._makeChunk(buffer, startLine, startLine + buffer.length - 1, idx);
    }
  }

  /**
   * Tells whether an already-trimmed line closes the open fence.
   * @private
   * @param {string} trimmed - Trimmed line text
   * @param {string} fence - Full opening fence run
   * @returns {boolean}
   */
  _isClosingFence(trimmed, fence) {
    if (trimmed.length < fence.length) return false;
    const marker = fence[0];
    for (const ch of trimmed) {
      if (ch !== marker) return false;
    }
    return true;
  }

  /**
   * Builds a chunk object from a buffer of lines.
   *
   * Structural flags are derived from the text itself, since this strategy
   * does not run the semantic parser: `hasCode` looks for a matched pair of
   * ``` or ~~~ fences, and `languages` lists each distinct fence language
   * once.
   *
   * @private
   * @param {string[]} lines - Buffered source lines
   * @param {number} startLine - Line number of the first buffered line
   * @param {number} endLine - Line number of the last buffered line
   * @param {number} index - Sequential chunk index
   * @returns {Object} Chunk object
   */
  _makeChunk(lines, startLine, endLine, index) {
    const joined = lines.join("\n");
    const content = this.trimChunks ? joined.trim() : joined;
    const fenceLanguages = [...content.matchAll(/(?:```|~~~)(\w+)/g)].map((m) => m[1]);

    return {
      id: generateChunkId(content, index, this.config.chunkIdPrefix),
      index,
      content,
      tokenEstimate: estimateTokens(content, this.config.charsPerToken),
      overlapTokens: 0,
      charCount: content.length,
      wordCount: countWords(content),
      lines: {start: startLine, end: endLine},
      heading: null,
      headingPath: [],
      headingLevel: null,
      hasCode: /(```|~~~)[\s\S]*?\1/.test(content),
      hasVideo: this.config.videoPattern.test(content),
      hasTable: /^\s*\|.*\|\s*$/m.test(content),
      languages: [...new Set(fenceLanguages)],
      links: extractLinks(content),
      videos: extractVideos(content),
      isOversized: false,
      containsAtomicBlock: false,
      blockTypes: [],
      strategy: "delimiter",
      metadata: {delimiter: this.delimiter},
    };
  }
}
890
+
891
/**
 * ──────────────────────────────────────────────────────────────────────────
 * STRATEGY 3: CharLimitStrategy
 * ──────────────────────────────────────────────────────────────────────────
 * Splits by character count. Fenced code blocks are atomic: when the limit is
 * reached inside a fence, the split is deferred until the fence closes.
 *
 * Options (via strategyOptions):
 *   - charLimit: number (characters buffered before a chunk is emitted;
 *                default: config.maxChunkTokens * config.charsPerToken)
 *   - overlap:   number (characters of the previous chunk prepended to the
 *                next one; default: config.overlapTokens * config.charsPerToken;
 *                pass 0 to disable overlap)
 */
export class CharLimitStrategy {
	/**
	 * @param {Object} config - Splitter configuration (reads `strategyOptions`,
	 *   `maxChunkTokens`, `overlapTokens`, `charsPerToken`, `chunkIdPrefix`, `videoPattern`)
	 */
	constructor(config) {
		this.config = config;
		this.charLimit = config.strategyOptions?.charLimit || config.maxChunkTokens * config.charsPerToken;
		// `??` rather than `||`: an explicit overlap of 0 means "no overlap"
		// and must not be replaced by the default.
		this.overlap = config.strategyOptions?.overlap ?? config.overlapTokens * config.charsPerToken;
		this.name = "char";
	}

	/**
	 * Accumulates lines until the buffered character count (each line counted
	 * with one extra character for its newline) reaches `charLimit`, then emits.
	 * Emission is deferred while a fenced code block is open. Tables get no
	 * special treatment here.
	 *
	 * The overlap text is prepended to the emitted content but is not counted
	 * towards the limit, so a chunk can exceed `charLimit` by up to `overlap`.
	 *
	 * @param {AsyncIterable<{lineNumber: number, text: string}>} lineIterator
	 * @yields {Object} Chunk objects
	 */
	async* process(lineIterator) {
		let buffer = [];
		let bufferChars = 0;
		let startLine = 1;
		let lastLine = 0; // line number of the most recent buffered line
		let idx = 0;
		let openFence = null; // marker run of the currently open fence, or null
		let prevOverlap = "";

		for await (const {lineNumber, text} of lineIterator) {
			openFence = this._advanceFence(openFence, text);

			if (buffer.length === 0) startLine = lineNumber;
			buffer.push(text);
			lastLine = lineNumber;
			bufferChars += text.length + 1; // +1 for \n

			// Only emit if over the limit AND not inside a code block
			if (bufferChars >= this.charLimit && openFence === null) {
				const raw = buffer.join("\n");
				const content = prevOverlap ? `${prevOverlap}\n${raw}` : raw;
				yield this._makeChunk(content, startLine, lineNumber, idx++, prevOverlap);
				prevOverlap = this.overlap > 0 ? raw.slice(-this.overlap) : "";
				buffer = [];
				bufferChars = 0;
			}
		}

		if (buffer.length > 0) {
			const raw = buffer.join("\n");
			const content = prevOverlap ? `${prevOverlap}\n${raw}` : raw;
			yield this._makeChunk(content, startLine, lastLine, idx, prevOverlap);
		}
	}

	/**
	 * @private
	 * Computes the code-fence state after consuming one line.
	 *
	 * A fence opens on a run of three or more backticks or tildes. It closes
	 * only on a line made of the same marker character, at least as long as the
	 * opening run, with nothing but whitespace after it. Fences of any length
	 * therefore close correctly.
	 *
	 * @param {string|null} openFence - Marker run that opened the current block, or null
	 * @param {string} text - The line being consumed
	 * @returns {string|null} Marker run still open after this line, or null
	 */
	_advanceFence(openFence, text) {
		const run = text.match(/^\s*(`{3,}|~{3,})/);
		if (!run) return openFence;
		const marker = run[1];
		const rest = text.slice(run[0].length);
		if (openFence === null) {
			// A backtick fence cannot have backticks in its info string; such a
			// line is an inline code span, not the start of a block.
			return marker[0] === "`" && rest.includes("`") ? null : marker;
		}
		const closes = marker[0] === openFence[0]
			&& marker.length >= openFence.length
			&& rest.trim() === "";
		return closes ? null : openFence;
	}

	/**
	 * @private
	 * Builds a chunk object.
	 *
	 * @param {string} content - Final chunk text (overlap already prepended)
	 * @param {number} startLine - Source line number of the first buffered line
	 * @param {number} endLine - Source line number of the last buffered line
	 * @param {number} index - Zero-based chunk index
	 * @param {string} [overlap=""] - Overlap text that was prepended to `content`
	 * @returns {Object} Chunk object
	 */
	_makeChunk(content, startLine, endLine, index, overlap = "") {
		return {
			id: generateChunkId(content, index, this.config.chunkIdPrefix),
			index,
			content,
			tokenEstimate: estimateTokens(content, this.config.charsPerToken),
			overlapTokens: estimateTokens(overlap, this.config.charsPerToken),
			charCount: content.length,
			wordCount: countWords(content),
			lines: {start: startLine, end: endLine},
			heading: null,
			headingPath: [],
			headingLevel: null,
			hasCode: /```[\s\S]*?```/.test(content),
			hasVideo: this.config.videoPattern.test(content),
			hasTable: /^\s*\|.*\|\s*$/m.test(content),
			languages: [...(content.matchAll(/```(\w+)/g))].map((m) => m[1]),
			links: extractLinks(content),
			videos: extractVideos(content),
			isOversized: content.length > this.charLimit * 1.5,
			containsAtomicBlock: false,
			blockTypes: [],
			// `this.name` so subclasses that rename themselves report their own name
			strategy: this.name,
			metadata: {charLimit: this.charLimit},
		};
	}
}
984
+
985
/**
 * ──────────────────────────────────────────────────────────────────────────
 * STRATEGY 4: WordLimitStrategy
 * ──────────────────────────────────────────────────────────────────────────
 * Splits by word count. Fenced code blocks are atomic: when the limit is
 * reached inside a fence, the split is deferred until the fence closes.
 *
 * Options (via strategyOptions):
 *   - wordLimit: number (words buffered before a chunk is emitted, default: 1000)
 *   - overlap:   number (trailing words of the previous chunk prepended to the
 *                next one, default: 50; pass 0 to disable overlap)
 */
export class WordLimitStrategy {
	/**
	 * @param {Object} config - Splitter configuration (reads `strategyOptions`,
	 *   `charsPerToken`, `chunkIdPrefix`, `videoPattern`)
	 */
	constructor(config) {
		this.config = config;
		this.wordLimit = config.strategyOptions?.wordLimit || 1000;
		// `??` rather than `||`: an explicit overlap of 0 means "no overlap"
		// and must not be replaced by the default.
		this.overlap = config.strategyOptions?.overlap ?? 50;
		this.name = "word";
	}

	/**
	 * Accumulates lines until the buffered word count reaches `wordLimit`.
	 * Words inside code blocks are counted, but emission is deferred while a
	 * fence is open.
	 *
	 * The overlap is the last `overlap` whitespace-separated words of the
	 * previous buffer, re-joined with single spaces (original line breaks in
	 * the overlap text are not preserved).
	 *
	 * @param {AsyncIterable<{lineNumber: number, text: string}>} lineIterator
	 * @yields {Object} Chunk objects
	 */
	async* process(lineIterator) {
		let buffer = [];
		let bufferWords = 0;
		let startLine = 1;
		let lastLine = 0; // line number of the most recent buffered line
		let idx = 0;
		let openFence = null; // marker run of the currently open fence, or null
		let prevOverlapText = "";

		for await (const {lineNumber, text} of lineIterator) {
			openFence = this._advanceFence(openFence, text);

			if (buffer.length === 0) startLine = lineNumber;
			buffer.push(text);
			lastLine = lineNumber;
			bufferWords += countWords(text);

			if (bufferWords >= this.wordLimit && openFence === null) {
				const raw = buffer.join("\n");
				const content = prevOverlapText ? `${prevOverlapText}\n${raw}` : raw;
				yield this._makeChunk(content, startLine, lineNumber, idx++, prevOverlapText);

				// Overlap for the next chunk: the last N words of this buffer
				prevOverlapText = this.overlap > 0
					? raw.split(/\s+/).filter(Boolean).slice(-this.overlap).join(" ")
					: "";
				buffer = [];
				bufferWords = 0;
			}
		}

		if (buffer.length > 0) {
			const raw = buffer.join("\n");
			const content = prevOverlapText ? `${prevOverlapText}\n${raw}` : raw;
			yield this._makeChunk(content, startLine, lastLine, idx, prevOverlapText);
		}
	}

	/**
	 * @private
	 * Computes the code-fence state after consuming one line.
	 *
	 * A fence opens on a run of three or more backticks or tildes. It closes
	 * only on a line made of the same marker character, at least as long as the
	 * opening run, with nothing but whitespace after it. Fences of any length
	 * therefore close correctly.
	 *
	 * @param {string|null} openFence - Marker run that opened the current block, or null
	 * @param {string} text - The line being consumed
	 * @returns {string|null} Marker run still open after this line, or null
	 */
	_advanceFence(openFence, text) {
		const run = text.match(/^\s*(`{3,}|~{3,})/);
		if (!run) return openFence;
		const marker = run[1];
		const rest = text.slice(run[0].length);
		if (openFence === null) {
			// A backtick fence cannot have backticks in its info string; such a
			// line is an inline code span, not the start of a block.
			return marker[0] === "`" && rest.includes("`") ? null : marker;
		}
		const closes = marker[0] === openFence[0]
			&& marker.length >= openFence.length
			&& rest.trim() === "";
		return closes ? null : openFence;
	}

	/**
	 * @private
	 * Builds a chunk object.
	 *
	 * @param {string} content - Final chunk text (overlap already prepended)
	 * @param {number} startLine - Source line number of the first buffered line
	 * @param {number} endLine - Source line number of the last buffered line
	 * @param {number} index - Zero-based chunk index
	 * @param {string} [overlap=""] - Overlap text that was prepended to `content`
	 * @returns {Object} Chunk object
	 */
	_makeChunk(content, startLine, endLine, index, overlap = "") {
		// Counted once and reused for both wordCount and isOversized
		const words = countWords(content);
		return {
			id: generateChunkId(content, index, this.config.chunkIdPrefix),
			index,
			content,
			tokenEstimate: estimateTokens(content, this.config.charsPerToken),
			overlapTokens: estimateTokens(overlap, this.config.charsPerToken),
			charCount: content.length,
			wordCount: words,
			lines: {start: startLine, end: endLine},
			heading: null,
			headingPath: [],
			headingLevel: null,
			hasCode: /```[\s\S]*?```/.test(content),
			hasVideo: this.config.videoPattern.test(content),
			hasTable: /^\s*\|.*\|\s*$/m.test(content),
			languages: [...(content.matchAll(/```(\w+)/g))].map((m) => m[1]),
			links: extractLinks(content),
			videos: extractVideos(content),
			isOversized: words > this.wordLimit * 1.5,
			containsAtomicBlock: false,
			blockTypes: [],
			strategy: this.name,
			metadata: {wordLimit: this.wordLimit},
		};
	}
}
1083
+
1084
/**
 * ──────────────────────────────────────────────────────────────────────────
 * STRATEGY 5: TokenLimitStrategy
 * ──────────────────────────────────────────────────────────────────────────
 * Splits by estimated token count. Token figures are converted to character
 * figures with `config.charsPerToken`, and all splitting is inherited from
 * CharLimitStrategy.
 *
 * Options (via strategyOptions):
 *   - tokenLimit: number (max tokens per chunk, default: config.maxChunkTokens)
 *   - overlap:    number (overlap between chunks, in TOKENS,
 *                 default: config.overlapTokens; pass 0 to disable overlap)
 */
export class TokenLimitStrategy extends CharLimitStrategy {
	/**
	 * @param {Object} config - Splitter configuration (reads `strategyOptions`,
	 *   `maxChunkTokens`, `overlapTokens`, `charsPerToken`)
	 */
	constructor(config) {
		const tokenLimit = config.strategyOptions?.tokenLimit || config.maxChunkTokens;
		// `??` rather than `||`: an explicit overlap of 0 tokens is a valid request
		const overlapTokens = config.strategyOptions?.overlap ?? config.overlapTokens;
		const overlapChars = overlapTokens * config.charsPerToken;

		// Convert token figures to character figures for the parent class
		super({
			...config,
			strategyOptions: {
				...config.strategyOptions,
				charLimit: tokenLimit * config.charsPerToken,
				overlap: overlapChars,
			},
		});

		// Assigned directly so a zero overlap survives whatever falsy-value
		// defaulting the parent constructor applies to `strategyOptions.overlap`.
		this.overlap = overlapChars;
		this.name = "token";
	}
}
1109
+
1110
+ // ─────────────────────────────────────────────────────────────────────────────
1111
+ // STRATEGY REGISTRY
1112
+ // ─────────────────────────────────────────────────────────────────────────────
1113
+
1114
/**
 * Registry of splitting strategies, keyed by strategy name.
 * Additional strategies can be added at runtime through
 * MarkdownTextSplitter.registerStrategy().
 *
 * A strategy class is expected to provide:
 *   constructor(config)
 *   async *process(lineIterator)  → yields chunk objects
 */
const STRATEGY_REGISTRY = new Map()
	.set("semantic", SemanticStrategy)
	.set("delimiter", DelimiterStrategy)
	.set("char", CharLimitStrategy)
	.set("word", WordLimitStrategy)
	.set("token", TokenLimitStrategy);
1130
+
1131
+ // ─────────────────────────────────────────────────────────────────────────────
1132
+ // LINK DEFINITION RESOLVER
1133
+ // ─────────────────────────────────────────────────────────────────────────────
1134
+
1135
/**
 * Post-processes chunks to append reference-style link definitions ([id]: url)
 * to every chunk that references them, so a chunk stays resolvable on its own.
 *
 * For each chunk, every `[text][id]`, `[text][]` and `[id]` occurrence (the
 * latter only when not directly followed by `(`) is looked up in `defs`.
 * Each matching id is appended once, in order of first appearance, and ids
 * whose definition line is already present in the chunk are skipped.
 * Chunks without resolvable references are returned untouched (same object).
 *
 * Ids are looked up exactly as written; this function does no case
 * normalisation.
 *
 * @param {Object[]} chunks - Array of chunk objects (each with a `content` string)
 * @param {Map<string,string>} defs - Link definitions (id → url)
 * @param {number} [charsPerToken=4] - Ratio used to re-estimate tokens for enriched chunks
 * @returns {Object[]} Chunks, enriched where link definitions were appended
 */
function resolveLinkDefinitions(chunks, defs, charsPerToken = 4) {
	if (defs.size === 0) return chunks;

	return chunks.map((chunk) => {
		// Ids whose definition line already lives in this chunk
		const alreadyDefined = new Set();
		for (const d of chunk.content.matchAll(/^[ \t]*\[([^\]]+)\]:/gm)) {
			alreadyDefined.add(d[1]);
		}

		// A Set keeps first-seen order and drops repeated references
		const refs = new Set();
		const re = /\[([^\]]+)\]\[([^\]]*)\]|\[([^\]]+)\](?!\()/g;
		for (const m of chunk.content.matchAll(re)) {
			// [text][id] → id, [text][] → text, [id] → id
			const id = m[2] || m[1] || m[3];
			if (id && defs.has(id) && !alreadyDefined.has(id)) refs.add(id);
		}
		if (refs.size === 0) return chunk;

		const ids = [...refs];
		const block = ids.map((id) => `[${id}]: ${defs.get(id)}`).join("\n");
		const content = `${chunk.content}\n\n${block}`;
		return {
			...chunk,
			content,
			// Keep the size metrics in step with the enlarged content
			tokenEstimate: estimateTokens(content, charsPerToken),
			charCount: content.length,
			wordCount: countWords(content),
			metadata: {...chunk.metadata, resolvedLinkRefs: ids},
		};
	});
}
1165
+
1166
+ // ─────────────────────────────────────────────────────────────────────────────
1167
+ // MAIN SPLITTER CLASS
1168
+ // ─────────────────────────────────────────────────────────────────────────────
1169
+
1170
/**
 * MarkdownTextSplitter: The main entry point for browser usage.
 *
 * Orchestrates the full pipeline using a configurable splitting strategy:
 *   URL/String/File → Stream → Lines → Strategy.process() → Chunks
 *
 * The `stream*` methods yield chunks one at a time and hold no reference to
 * chunks already yielded. The `split*` methods are conveniences that collect
 * every chunk into an array.
 *
 * USAGE (browser):
 *   import MarkdownTextSplitter from './MarkdownTextSplitter.js';
 *
 *   // Strategy taken from the default configuration:
 *   const splitter = new MarkdownTextSplitter();
 *   const chunks = await splitter.splitFromUrl('https://example.com/docs.md');
 *
 *   // Delimiter:
 *   const splitter = new MarkdownTextSplitter({
 *     strategy: 'delimiter',
 *     strategyOptions: { delimiter: '---' }
 *   });
 *
 *   // Character limit:
 *   const splitter = new MarkdownTextSplitter({
 *     strategy: 'char',
 *     strategyOptions: { charLimit: 5000, overlap: 200 }
 *   });
 *
 *   // Word limit:
 *   const splitter = new MarkdownTextSplitter({
 *     strategy: 'word',
 *     strategyOptions: { wordLimit: 800, overlap: 50 }
 *   });
 *
 *   // Custom strategy:
 *   MarkdownTextSplitter.registerStrategy('myStrategy', MyStrategyClass);
 *   const splitter = new MarkdownTextSplitter({ strategy: 'myStrategy' });
 *
 * @class
 * @param {Partial<SplitterConfig>} [userConfig] - Override defaults
 */
export class MarkdownTextSplitter {
	/**
	 * @param {Partial<SplitterConfig>} [userConfig] - Shallow-merged over DEFAULT_CONFIG
	 * @throws {Error} If the configured strategy name is not registered
	 */
	constructor(userConfig = {}) {
		/** Merged configuration */
		this.config = {...DEFAULT_CONFIG, ...userConfig};

		/** Active strategy instance */
		this.strategy = this._createStrategy();

		/** Processing statistics (populated after splitting) */
		this.stats = this._emptyStats();
	}

	/**
	 * Registers a custom splitting strategy globally.
	 * The class must implement: constructor(config) and async *process(lineIterator).
	 * Registering an existing name replaces the previous entry.
	 *
	 * @static
	 * @param {string} name - Strategy identifier
	 * @param {Function} strategyClass - Class constructor
	 *
	 * @example
	 * class MySplitter {
	 *   constructor(config) { this.config = config; this.name = 'mine'; }
	 *   async *process(lines) {
	 *     let buf = [];
	 *     for await (const {text} of lines) { buf.push(text); }
	 *     yield { index: 0, content: buf.join('\n'), ... };
	 *   }
	 * }
	 * MarkdownTextSplitter.registerStrategy('mine', MySplitter);
	 */
	static registerStrategy(name, strategyClass) {
		STRATEGY_REGISTRY.set(name, strategyClass);
	}

	/**
	 * Lists all available strategy names (built-in + custom).
	 * @static
	 * @returns {string[]}
	 */
	static getAvailableStrategies() {
		return [...STRATEGY_REGISTRY.keys()];
	}

	/**
	 * Splits a remote Markdown file and returns every chunk in one array.
	 * The result holds all chunks in memory; iterate streamFromUrl() instead
	 * when chunks should be handled one at a time.
	 *
	 * @param {string} url - URL of the markdown file
	 * @param {Object} [fetchOpts] - Additional fetch() options (headers, auth)
	 * @returns {Promise<Object[]>} Array of chunk objects
	 */
	async splitFromUrl(url, fetchOpts = {}) {
		const chunks = [];
		for await (const chunk of this.streamFromUrl(url, fetchOpts)) chunks.push(chunk);
		return chunks;
	}

	/**
	 * Splits a Markdown string (for testing or small inputs).
	 *
	 * @param {string} markdown - Markdown content string
	 * @returns {Promise<Object[]>} Array of chunk objects
	 */
	async splitFromString(markdown) {
		const chunks = [];
		for await (const chunk of this.streamFromString(markdown)) chunks.push(chunk);
		return chunks;
	}

	/**
	 * STREAMING: Yields chunks one-at-a-time from a URL.
	 * Use with `for await` for memory-efficient processing of huge files.
	 *
	 * The request is aborted when `config.fetchTimeoutMs` elapses or when a
	 * caller-supplied `fetchOpts.signal` aborts. The timer keeps running until
	 * the generator finishes, so it bounds the download and the time the
	 * consumer spends between chunks together.
	 *
	 * @param {string} url
	 * @param {Object} [fetchOpts] - fetch() options; `headers` may be a plain
	 *   object, an array of pairs or a Headers instance
	 * @yields {Object} Chunk objects
	 * @throws {Error} On a non-2xx response (`HTTP <status>: <statusText>`)
	 */
	async* streamFromUrl(url, fetchOpts = {}) {
		const start = performance.now();
		this.stats = this._emptyStats();

		const controller = new AbortController();
		const tid = setTimeout(() => controller.abort(), this.config.fetchTimeoutMs);

		// Our own signal is what fetch() sees, so forward the caller's
		// cancellation to it instead of silently discarding it.
		const callerSignal = fetchOpts.signal;
		const forwardAbort = () => controller.abort();

		try {
			if (callerSignal) {
				if (callerSignal.aborted) controller.abort();
				else callerSignal.addEventListener("abort", forwardAbort, {once: true});
			}

			// Headers() accepts every init shape fetch() does; object spread
			// would lose the entries of a Headers instance or an array of pairs.
			const headers = new Headers(fetchOpts.headers ?? undefined);
			if (!headers.has("Accept")) headers.set("Accept", "text/markdown, text/plain, */*");

			const res = await fetch(url, {
				...fetchOpts,
				signal: controller.signal,
				headers,
			});
			if (!res.ok) throw new Error(`HTTP ${res.status}: ${res.statusText}`);

			const lines = streamToLines(res.body);
			yield* this._run(lines);
		} finally {
			clearTimeout(tid);
			if (callerSignal) callerSignal.removeEventListener("abort", forwardAbort);
			this.stats.processingTimeMs = performance.now() - start;
			this.stats.source = url;
		}
	}

	/**
	 * STREAMING: Yields chunks one-at-a-time from a string.
	 *
	 * @param {string} markdown
	 * @yields {Object} Chunk objects
	 */
	async* streamFromString(markdown) {
		const start = performance.now();
		this.stats = this._emptyStats();

		// finally: stats are completed even if the consumer stops early or the
		// strategy throws
		try {
			const lines = stringToLines(markdown);
			yield* this._run(lines);
		} finally {
			this.stats.processingTimeMs = performance.now() - start;
			this.stats.source = "(string)";
		}
	}

	/**
	 * STREAMING: Yields chunks from a File or Blob object (e.g., from <input type="file">).
	 * Uses the object's stream() method.
	 *
	 * @param {File|Blob} file - A File or Blob object
	 * @yields {Object} Chunk objects
	 */
	async* streamFromFile(file) {
		const start = performance.now();
		this.stats = this._emptyStats();

		// finally: stats are completed even if the consumer stops early or the
		// strategy throws
		try {
			const lines = streamToLines(file.stream());
			yield* this._run(lines);
		} finally {
			this.stats.processingTimeMs = performance.now() - start;
			this.stats.source = file.name || "(blob)";
		}
	}

	/**
	 * Splits a File/Blob and returns all chunks as an array.
	 *
	 * @param {File|Blob} file
	 * @returns {Promise<Object[]>}
	 */
	async splitFromFile(file) {
		const chunks = [];
		for await (const chunk of this.streamFromFile(file)) chunks.push(chunk);
		return chunks;
	}

	/**
	 * Internal: runs the active strategy over the lines, stamps each chunk's
	 * metadata with the splitter version and strategy name, and updates stats.
	 *
	 * Chunks are handed to the consumer as soon as the strategy produces them
	 * and are not retained here. Reference-style link definitions are not
	 * resolved in this streaming path.
	 *
	 * @private
	 * @param {AsyncIterable<{lineNumber: number, text: string}>} lineIterator
	 * @yields {Object} Chunk objects
	 */
	async* _run(lineIterator) {
		for await (const chunk of this.strategy.process(lineIterator)) {
			// Custom strategies may yield chunks without a metadata bag
			if (!chunk.metadata) chunk.metadata = {};
			chunk.metadata.splitterVersion = SPLITTER_VERSION;
			chunk.metadata.strategy = this.strategy.name;

			this.stats.totalChunks++;
			// `?? 0` keeps totals numeric when a custom chunk omits a counter
			this.stats.totalTokens += chunk.tokenEstimate ?? 0;
			this.stats.totalChars += chunk.charCount ?? 0;
			this.stats.totalWords += chunk.wordCount ?? 0;
			if (chunk.isOversized) this.stats.oversizedChunks++;
			if (chunk.hasCode) this.stats.codeBlockChunks++;
			if (chunk.hasTable) this.stats.tableChunks++;
			if (chunk.hasVideo) this.stats.videoChunks++;

			yield chunk;
		}
	}

	/**
	 * Returns a copy of the processing stats from the last split operation.
	 * @returns {Object}
	 */
	getStats() {
		return {...this.stats};
	}

	/**
	 * Resets the processing stats for reuse.
	 */
	reset() {
		this.stats = this._emptyStats();
	}

	/**
	 * Switches to a different strategy at runtime. `options` are merged over
	 * the existing `strategyOptions`.
	 *
	 * @param {string} strategyName - Name of the strategy
	 * @param {Object} [options] - Strategy-specific options
	 * @throws {Error} If `strategyName` is not registered; configuration and
	 *   the active strategy are left unchanged in that case
	 */
	setStrategy(strategyName, options = {}) {
		// Resolve first so an unknown name cannot leave config half-updated
		const Cls = this._resolveStrategyClass(strategyName);
		this.config.strategy = strategyName;
		this.config.strategyOptions = {...this.config.strategyOptions, ...options};
		this.strategy = new Cls(this.config);
	}

	/**
	 * Instantiates the strategy named by `config.strategy`.
	 * @private
	 * @returns {Object} Strategy instance
	 * @throws {Error} If the name is not registered
	 */
	_createStrategy() {
		const Cls = this._resolveStrategyClass(this.config.strategy);
		return new Cls(this.config);
	}

	/**
	 * Looks a strategy class up in the registry.
	 * @private
	 * @param {string} name - Strategy identifier
	 * @returns {Function} Strategy class
	 * @throws {Error} If the name is not registered
	 */
	_resolveStrategyClass(name) {
		const Cls = STRATEGY_REGISTRY.get(name);
		if (!Cls) {
			throw new Error(
				`Unknown strategy "${name}". Available: ${[...STRATEGY_REGISTRY.keys()].join(", ")}`
			);
		}
		return Cls;
	}

	/**
	 * Returns a zeroed stats record.
	 * @private
	 * @returns {Object}
	 */
	_emptyStats() {
		return {
			totalChunks: 0,
			totalTokens: 0,
			totalChars: 0,
			totalWords: 0,
			oversizedChunks: 0,
			codeBlockChunks: 0,
			tableChunks: 0,
			videoChunks: 0,
			processingTimeMs: 0,
			source: "",
		};
	}
}
1431
+
1432
// Default export of the MarkdownTextSplitter class (the class is also available as a named export).
+ export default MarkdownTextSplitter;