@lov3kaizen/agentsea-embeddings 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1408 @@
1
+ "use strict";
2
// Bundler-emitted CommonJS interop helpers.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Defines every entry of `all` on `target` as an enumerable getter, so the
 * exported value is resolved at access time.
 */
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

/**
 * Mirrors the own properties of `from` onto `to` as getters that read through
 * to `from`. Keys already present on `to`, and the key named by `except`, are
 * left alone. Returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = (from && typeof from === "object") || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};

/** Wraps `mod` in a fresh object flagged with a non-enumerable `__esModule: true`. */
var __toCommonJS = (mod) => {
  const target = __defProp({}, "__esModule", { value: true });
  return __copyProps(target, mod);
};
19
+
20
+ // src/chunking/index.ts
21
+ var chunking_exports = {};
22
+ __export(chunking_exports, {
23
+ BaseChunker: () => BaseChunker,
24
+ CodeChunker: () => CodeChunker,
25
+ FixedChunker: () => FixedChunker,
26
+ MarkdownChunker: () => MarkdownChunker,
27
+ RecursiveChunker: () => RecursiveChunker,
28
+ SemanticChunker: () => SemanticChunker,
29
+ chunk: () => chunk,
30
+ createChunker: () => createChunker,
31
+ createCodeChunker: () => createCodeChunker,
32
+ createFixedChunker: () => createFixedChunker,
33
+ createMarkdownChunker: () => createMarkdownChunker,
34
+ createRecursiveChunker: () => createRecursiveChunker,
35
+ createSemanticChunker: () => createSemanticChunker,
36
+ defaultTokenCounter: () => defaultTokenCounter,
37
+ mergeSmallChunks: () => mergeSmallChunks,
38
+ splitLargeChunks: () => splitLargeChunks
39
+ });
40
+ module.exports = __toCommonJS(chunking_exports);
41
+
42
+ // src/chunking/BaseChunker.ts
43
+ var import_nanoid = require("nanoid");
44
/**
 * Estimates a token count as one token per four characters, rounded up.
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count (0 for an empty string).
 */
var defaultTokenCounter = (text) => Math.ceil(text.length / 4);
47
/**
 * Shared base for the chunking strategies: option defaulting, chunk-record
 * construction and a generic overlap splitter. `chunkWithResult` relies on
 * `this.chunk(text, options)` and `this.strategyType`, which this class does
 * not define itself.
 */
var BaseChunker = class {
  /** Default options */
  defaultOptions = {
    chunkSize: 512,
    chunkOverlap: 50,
    minChunkSize: 100,
    maxChunkSize: 2e3,
    tokenCounter: defaultTokenCounter
  };

  /**
   * Get merged options with defaults.
   * @param {object} [options] - Caller overrides; nullish fields fall back to `defaultOptions`.
   * @returns {object} Options with sizes and `tokenCounter` always set; `metadata` defaults to `{}`.
   */
  getOptions(options) {
    const defaults = this.defaultOptions;
    return {
      chunkSize: options?.chunkSize ?? defaults.chunkSize,
      chunkOverlap: options?.chunkOverlap ?? defaults.chunkOverlap,
      minChunkSize: options?.minChunkSize ?? defaults.minChunkSize,
      maxChunkSize: options?.maxChunkSize ?? defaults.maxChunkSize,
      tokenCounter: options?.tokenCounter ?? defaults.tokenCounter,
      documentId: options?.documentId,
      source: options?.source,
      type: options?.type,
      metadata: options?.metadata ?? {}
    };
  }

  /**
   * Create a chunk object.
   * @param {string} text - Chunk text.
   * @param {number} index - Position of the chunk in its sequence.
   * @param {number} startPosition - Offset of the chunk; `endPosition` is `startPosition + text.length`.
   * @param {object} options - Options as returned by `getOptions`.
   * @param {object} [additionalMetadata] - Extra metadata; wins over `options.metadata` on key clashes.
   * @returns {object} Chunk record with a fresh nanoid `id` and zeroed overlap fields.
   */
  createChunk(text, index, startPosition, options, additionalMetadata) {
    const tokenCounter = options.tokenCounter ?? defaultTokenCounter;
    const metadata = {
      ...options.metadata,
      ...additionalMetadata
    };
    // Only truthy values are copied, so empty strings never reach the metadata.
    if (options.documentId) metadata.documentId = options.documentId;
    if (options.source) metadata.source = options.source;
    if (options.type) metadata.type = options.type;
    return {
      id: (0, import_nanoid.nanoid)(),
      text,
      index,
      startPosition,
      endPosition: startPosition + text.length,
      tokenCount: tokenCounter(text),
      charCount: text.length,
      overlapPrev: 0,
      overlapNext: 0,
      metadata
    };
  }

  /**
   * Process chunks and set overlap information: every adjacent pair gets
   * `overlapChars` recorded on both sides. Mutates the chunks in place.
   */
  setOverlapInfo(chunks, overlapChars) {
    for (let i = 1; i < chunks.length; i++) {
      chunks[i].overlapPrev = overlapChars;
      chunks[i - 1].overlapNext = overlapChars;
    }
  }

  /**
   * Split text with overlap.
   * @param {string} text - Text to split.
   * @param {number} chunkSize - Token budget per chunk, measured with `tokenCounter`.
   * @param {number} overlap - Overlap in tokens; converted to characters at 4 chars per token.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {string[]} Trimmed, non-empty chunk texts.
   */
  splitWithOverlap(text, chunkSize, overlap, tokenCounter) {
    const chunks = [];
    const overlapChars = Math.floor(overlap * 4);
    let start = 0;
    while (start < text.length) {
      let end = start;
      let tokens = 0;
      // Grow the window one character at a time until the budget is reached.
      while (end < text.length && tokens < chunkSize) {
        end++;
        tokens = tokenCounter(text.slice(start, end));
      }
      if (end < text.length) {
        // Prefer to cut just after the last space inside the window.
        const lastSpace = text.lastIndexOf(" ", end);
        if (lastSpace > start) {
          end = lastSpace + 1;
        }
      }
      chunks.push(text.slice(start, end).trim());
      // Once a chunk reaches the end of the text we are done. Stepping back by
      // the overlap here would only re-emit ever-shorter suffixes of the tail.
      if (end >= text.length) break;
      // Always move forward by at least one character, even for huge overlaps.
      start = Math.max(start + 1, end - overlapChars);
    }
    return chunks.filter((c) => c.length > 0);
  }

  /**
   * Chunk text and return a result object with the chunks plus summary
   * statistics (counts, average tokens per chunk, elapsed milliseconds).
   */
  async chunkWithResult(text, options) {
    const startTime = performance.now();
    const chunks = await this.chunk(text, options);
    const processingTimeMs = performance.now() - startTime;
    const totalTokens = chunks.reduce((sum, c) => sum + c.tokenCount, 0);
    return {
      chunks,
      totalChunks: chunks.length,
      totalTokens,
      avgChunkSize: chunks.length > 0 ? totalTokens / chunks.length : 0,
      processingTimeMs,
      strategy: this.strategyType,
      originalLength: text.length
    };
  }
};
151
/**
 * Merges each chunk that is below `minTokens` with the chunk(s) following it.
 *
 * Chunks are copied before being modified, so the input records are not
 * mutated. A merged chunk keeps the first chunk's fields except `text`,
 * `tokenCount`, `charCount` and `endPosition`. Texts are joined with "\n".
 * The result is re-indexed from 0. Inputs of length 0 or 1 are returned as-is.
 *
 * @param {object[]} chunks - Chunk records in document order.
 * @param {number} minTokens - Chunks with fewer tokens absorb their successor.
 * @param {(text: string) => number} tokenCounter - Used to re-count merged text.
 * @returns {object[]} Merged chunk records.
 */
function mergeSmallChunks(chunks, minTokens, tokenCounter) {
  if (chunks.length <= 1) return chunks;
  const merged = [];
  let current = null;
  for (const chunk2 of chunks) {
    if (!current) {
      current = { ...chunk2 };
      continue;
    }
    if (current.tokenCount < minTokens) {
      // Build and count the combined text only when a merge actually happens;
      // token counting can be expensive for real tokenizers.
      const combinedText = current.text + "\n" + chunk2.text;
      current.text = combinedText;
      current.tokenCount = tokenCounter(combinedText);
      current.charCount = combinedText.length;
      current.endPosition = chunk2.endPosition;
    } else {
      merged.push(current);
      current = { ...chunk2 };
    }
  }
  if (current) {
    merged.push(current);
  }
  return merged.map((c, i) => ({ ...c, index: i }));
}
177
/**
 * Splits every chunk whose `tokenCount` exceeds `maxTokens` at sentence
 * boundaries (whitespace following `.`, `!` or `?`).
 *
 * Sentences are packed greedily into pieces that stay within `maxTokens`.
 * A single sentence that is itself over the limit is emitted unsplit. Each
 * piece copies the source chunk's fields, gets a fresh id, and has its text,
 * positions and counts recomputed. Chunks within the limit pass through
 * unchanged. The result is re-indexed from 0.
 *
 * @param {object[]} chunks - Chunk records in document order.
 * @param {number} maxTokens - Upper token bound per chunk.
 * @param {(text: string) => number} tokenCounter - Token counting function.
 * @returns {object[]} Chunk records after splitting.
 */
function splitLargeChunks(chunks, maxTokens, tokenCounter) {
  const result = [];
  const emitPiece = (source, text, offset) => {
    const startPosition = source.startPosition + offset;
    result.push({
      ...source,
      id: (0, import_nanoid.nanoid)(),
      text,
      startPosition,
      endPosition: startPosition + text.length,
      tokenCount: tokenCounter(text),
      charCount: text.length
    });
  };
  for (const chunk2 of chunks) {
    if (chunk2.tokenCount <= maxTokens) {
      result.push(chunk2);
      continue;
    }
    const sentences = chunk2.text.split(/(?<=[.!?])\s+/);
    let currentText = "";
    let currentOffset = 0; // offset of the pending piece inside chunk2.text
    let cursor = 0; // where to look for the next sentence inside chunk2.text
    for (const sentence of sentences) {
      // Locate the sentence in the source text so positions stay exact no
      // matter how much whitespace separated the sentences.
      const found = chunk2.text.indexOf(sentence, cursor);
      const sentenceOffset = found === -1 ? cursor : found;
      cursor = sentenceOffset + sentence.length;
      const testText = currentText ? currentText + " " + sentence : sentence;
      if (currentText && tokenCounter(testText) > maxTokens) {
        emitPiece(chunk2, currentText, currentOffset);
        currentText = sentence;
        currentOffset = sentenceOffset;
      } else {
        if (!currentText) {
          currentOffset = sentenceOffset;
        }
        currentText = testText;
      }
    }
    if (currentText) {
      emitPiece(chunk2, currentText, currentOffset);
    }
  }
  return result.map((c, i) => ({ ...c, index: i }));
}
220
+
221
+ // src/chunking/FixedChunker.ts
222
/**
 * Fixed-size chunking strategy.
 *
 * With `splitByChars` the text is cut into character windows of
 * `chunkSize * 4` characters that overlap by `chunkOverlap * 4` characters.
 * Otherwise the text is split on `separator` and parts are packed into chunks
 * of at most `chunkSize` tokens, each new chunk seeded with trailing sentences
 * of the previous one.
 */
var FixedChunker = class extends BaseChunker {
  strategyType = "fixed";

  /**
   * @param {string} text - Text to chunk.
   * @param {object} [options] - Base options plus `splitByChars` (default false),
   *   `separator` (default "\n") and `keepSeparator` (default false).
   * @returns {Promise<object[]>} Chunk records with overlap info set.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const splitByChars = options?.splitByChars ?? false;
    const separator = options?.separator ?? "\n";
    const keepSeparator = options?.keepSeparator ?? false;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const chunks = [];
    if (splitByChars) {
      const chunkSize = opts.chunkSize * 4;
      const overlap = opts.chunkOverlap * 4;
      let start = 0;
      while (start < text.length) {
        const end = Math.min(start + chunkSize, text.length);
        const chunkText = text.slice(start, end).trim();
        if (chunkText.length > 0) {
          chunks.push(this.createChunk(chunkText, chunks.length, start, opts));
        }
        // The window reached the end of the text: stop. Stepping back by the
        // overlap here would re-emit the same tail forever.
        if (end >= text.length) break;
        // Guarantee forward progress even when overlap >= chunkSize.
        start = Math.max(start + 1, end - overlap);
      }
    } else {
      const parts = text.split(separator);
      let currentChunk = "";
      let chunkStart = 0;
      let position = 0; // offset of the current part within `text`
      for (let i = 0; i < parts.length; i++) {
        const part = parts[i];
        const partWithSep = keepSeparator && i < parts.length - 1 ? part + separator : part;
        const testChunk = currentChunk
          ? currentChunk + (keepSeparator ? "" : separator) + partWithSep
          : partWithSep;
        if (tokenCounter(testChunk) > opts.chunkSize && currentChunk) {
          chunks.push(
            this.createChunk(currentChunk.trim(), chunks.length, chunkStart, opts)
          );
          const overlapText = this.getOverlapText(
            currentChunk,
            opts.chunkOverlap,
            tokenCounter
          );
          currentChunk = overlapText + (overlapText ? separator : "") + partWithSep;
          chunkStart = position - overlapText.length;
        } else {
          currentChunk = testChunk;
        }
        position += part.length + separator.length;
      }
      if (currentChunk.trim()) {
        chunks.push(
          this.createChunk(currentChunk.trim(), chunks.length, chunkStart, opts)
        );
      }
    }
    this.setOverlapInfo(chunks, opts.chunkOverlap * 4);
    return chunks;
  }

  /**
   * Get text for overlap from the end of a chunk.
   *
   * Walks sentences backwards and keeps adding them while the result stays
   * within `overlapTokens`. The last sentence is always included, even when
   * it alone exceeds the budget.
   */
  getOverlapText(text, overlapTokens, tokenCounter) {
    if (overlapTokens <= 0) return "";
    const sentences = text.split(/(?<=[.!?])\s+/);
    let overlapText = "";
    for (let i = sentences.length - 1; i >= 0; i--) {
      const testText = sentences[i] + (overlapText ? " " + overlapText : "");
      if (tokenCounter(testText) > overlapTokens && overlapText) {
        break;
      }
      overlapText = testText;
    }
    return overlapText;
  }
};
307
/**
 * Factory for the fixed-size chunking strategy.
 * @returns {FixedChunker} A new chunker instance on every call.
 */
function createFixedChunker() {
  const chunker = new FixedChunker();
  return chunker;
}
310
+
311
+ // src/chunking/RecursiveChunker.ts
312
/**
 * Separators tried in order by the recursive chunker, from coarsest to finest:
 * paragraphs, lines, sentences, clauses, words and finally single characters
 * (the empty string).
 */
var DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", ", ", " ", ""];
326
/**
 * Recursive chunking strategy: splits on the coarsest separator present and
 * recurses into pieces that are still over `chunkSize` with the remaining,
 * finer separators. Small chunks are optionally merged, then word-level
 * overlap from the previous chunk is prepended to each chunk.
 */
var RecursiveChunker = class extends BaseChunker {
  strategyType = "recursive";

  /**
   * @param {string} text - Text to chunk.
   * @param {object} [options] - Base options plus `separators`
   *   (default DEFAULT_SEPARATORS), `keepSeparator` (default true) and
   *   `mergeSmallChunks` (default true).
   * @returns {Promise<object[]>} Chunk records.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const separators = options?.separators ?? DEFAULT_SEPARATORS;
    const keepSeparator = options?.keepSeparator ?? true;
    const mergeSmall = options?.mergeSmallChunks ?? true;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const texts = this.splitRecursively(
      text,
      separators,
      opts.chunkSize,
      keepSeparator,
      tokenCounter
    );
    let position = 0;
    let chunks = [];
    for (const piece of texts) {
      const chunkText = piece.trim();
      if (chunkText) {
        chunks.push(this.createChunk(chunkText, chunks.length, position, opts));
        position += piece.length;
      }
    }
    if (mergeSmall) {
      chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
    }
    chunks = this.addOverlap(chunks, opts.chunkOverlap, tokenCounter);
    return chunks;
  }

  /**
   * Recursively split text.
   * Returns `[text]` when it already fits. Otherwise the first separator that
   * occurs in the text is used; the empty-string separator, or running out of
   * separators, falls back to `splitByChars`.
   */
  splitRecursively(text, separators, chunkSize, keepSeparator, tokenCounter) {
    if (tokenCounter(text) <= chunkSize) {
      return [text];
    }
    for (let i = 0; i < separators.length; i++) {
      const separator = separators[i];
      if (separator === "") {
        return this.splitByChars(text, chunkSize, tokenCounter);
      }
      if (!text.includes(separator)) {
        continue;
      }
      const finerSeparators = separators.slice(i + 1);
      const result = [];
      for (const split of this.splitBySeparator(text, separator, keepSeparator)) {
        if (tokenCounter(split) <= chunkSize) {
          result.push(split);
        } else {
          result.push(
            ...this.splitRecursively(
              split,
              finerSeparators,
              chunkSize,
              keepSeparator,
              tokenCounter
            )
          );
        }
      }
      return result;
    }
    return this.splitByChars(text, chunkSize, tokenCounter);
  }

  /**
   * Split by separator, dropping whitespace-only parts. With `keepSeparator`
   * the separator stays attached to the end of every part but the last.
   */
  splitBySeparator(text, separator, keepSeparator) {
    const parts = text.split(separator);
    if (!keepSeparator) {
      return parts.filter((p) => p.trim());
    }
    const lastIndex = parts.length - 1;
    return parts
      .map((part, i) => (i < lastIndex ? part + separator : part))
      .filter((p) => p.trim());
  }

  /**
   * Split by characters (last resort), preferring to cut at the last space
   * inside each window. Whitespace-only pieces are dropped.
   */
  splitByChars(text, chunkSize, tokenCounter) {
    const chunks = [];
    let start = 0;
    while (start < text.length) {
      let end = start;
      while (end < text.length && tokenCounter(text.slice(start, end)) < chunkSize) {
        end++;
      }
      if (end < text.length) {
        const lastSpace = text.lastIndexOf(" ", end);
        if (lastSpace > start) {
          end = lastSpace;
        }
      }
      // A budget that admits no characters (e.g. chunkSize <= 0) would leave
      // end === start and spin forever; always consume at least one character.
      if (end <= start) {
        end = start + 1;
      }
      chunks.push(text.slice(start, end));
      start = end;
    }
    return chunks.filter((c) => c.trim());
  }

  /**
   * Add overlap between chunks: each chunk after the first is prefixed with
   * trailing words of the previous chunk's text, and its `startPosition`,
   * `tokenCount` and `charCount` are adjusted to match.
   */
  addOverlap(chunks, overlapTokens, tokenCounter) {
    if (overlapTokens <= 0 || chunks.length <= 1) {
      return chunks;
    }
    const lastIndex = chunks.length - 1;
    return chunks.map((chunk2, i) => {
      let chunkText = chunk2.text;
      let startOffset = 0;
      if (i > 0) {
        const overlapText = this.getEndOverlap(
          chunks[i - 1].text,
          overlapTokens,
          tokenCounter
        );
        if (overlapText) {
          chunkText = overlapText + " " + chunkText;
          // One extra character for the joining space.
          startOffset = -overlapText.length - 1;
        }
      }
      return {
        ...chunk2,
        text: chunkText,
        startPosition: chunk2.startPosition + startOffset,
        tokenCount: tokenCounter(chunkText),
        charCount: chunkText.length,
        overlapPrev: i > 0 ? overlapTokens : 0,
        overlapNext: i < lastIndex ? overlapTokens : 0
      };
    });
  }

  /**
   * Get overlap text from end of string: the longest run of trailing words
   * that stays within `overlapTokens` (possibly empty).
   */
  getEndOverlap(text, overlapTokens, tokenCounter) {
    const words = text.split(/\s+/);
    let overlap = "";
    for (let i = words.length - 1; i >= 0; i--) {
      const testOverlap = words[i] + (overlap ? " " + overlap : "");
      if (tokenCounter(testOverlap) > overlapTokens) {
        break;
      }
      overlap = testOverlap;
    }
    return overlap;
  }
};
477
/**
 * Factory for the recursive chunking strategy.
 * @returns {RecursiveChunker} A new chunker instance on every call.
 */
function createRecursiveChunker() {
  const chunker = new RecursiveChunker();
  return chunker;
}
480
+
481
+ // src/chunking/MarkdownChunker.ts
482
/**
 * Markdown-aware chunking strategy: the document is cut into sections at
 * headings, each section is chunked by paragraph when it does not fit in
 * `chunkSize`, and fenced code blocks are kept intact unless
 * `splitCodeBlocks` is set.
 */
var MarkdownChunker = class extends BaseChunker {
  strategyType = "markdown";

  /**
   * @param {string} text - Markdown source.
   * @param {object} [options] - Base options plus `preserveHeaders` (default
   *   true), `includeHeaderHierarchy` (default true), `headingLevels`
   *   (default 1-6) and `splitCodeBlocks` (default false).
   * @returns {Promise<object[]>} Chunk records, re-indexed from 0.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const preserveHeaders = options?.preserveHeaders ?? true;
    const includeHeaderHierarchy = options?.includeHeaderHierarchy ?? true;
    const headingLevels = options?.headingLevels ?? [1, 2, 3, 4, 5, 6];
    const splitCodeBlocks = options?.splitCodeBlocks ?? false;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const sections = this.parseMarkdown(text, headingLevels);
    let chunks = [];
    for (const section of sections) {
      const sectionChunks = await this.chunkSection(
        section,
        opts,
        preserveHeaders,
        includeHeaderHierarchy,
        splitCodeBlocks,
        tokenCounter
      );
      chunks.push(...sectionChunks);
    }
    chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
    return chunks.map((c, i) => ({ ...c, index: i }));
  }

  /**
   * Parse markdown into sections.
   *
   * A new section starts at every heading whose level is in `headingLevels`;
   * other lines are appended to the current section's content. Lines inside a
   * fenced code block are never treated as headings, so `# comment` lines in
   * shell or Python snippets do not split the document.
   *
   * @returns {object[]} Sections with `heading`, `headingLevel`, `content`,
   *   `startPosition` and `path` (texts of the enclosing headings).
   */
  parseMarkdown(text, headingLevels) {
    const sections = [];
    const lines = text.split("\n");
    const headingRegex = /^(#{1,6})\s+(.+)$/;
    const fenceRegex = /^\s{0,3}(`{3,}|~{3,})/;
    let currentSection = {
      headingLevel: 0,
      content: "",
      startPosition: 0,
      path: []
    };
    const headingStack = [];
    let position = 0;
    let openFence = null; // fence character of the code block we are inside, if any
    for (const line of lines) {
      const fenceMatch = line.match(fenceRegex);
      if (fenceMatch) {
        const fenceChar = fenceMatch[1][0];
        if (openFence === null) {
          // A line that opens and closes the fence itself (```code```) does
          // not start a multi-line block.
          const rest = line.slice(fenceMatch[0].length);
          if (!rest.includes(fenceChar.repeat(3))) {
            openFence = fenceChar;
          }
        } else if (openFence === fenceChar) {
          openFence = null;
        }
      }
      const headingMatch = openFence === null ? line.match(headingRegex) : null;
      const level = headingMatch ? headingMatch[1].length : 0;
      if (headingMatch && headingLevels.includes(level)) {
        const headingText = headingMatch[2];
        if (currentSection.content.trim()) {
          sections.push({ ...currentSection });
        }
        // Drop headings at the same or a deeper level; they are no longer ancestors.
        while (headingStack.length > 0 && headingStack[headingStack.length - 1].level >= level) {
          headingStack.pop();
        }
        headingStack.push({ level, text: headingText });
        currentSection = {
          heading: headingText,
          headingLevel: level,
          content: "",
          startPosition: position,
          path: headingStack.map((h) => h.text)
        };
      } else {
        currentSection.content += line + "\n";
      }
      position += line.length + 1;
    }
    if (currentSection.content.trim() || currentSection.heading) {
      sections.push(currentSection);
    }
    return sections;
  }

  /**
   * Puts extracted code blocks back in place of their placeholders.
   * A replacer function is used so `$&`, `$'`, `$$` and similar sequences
   * inside the code are inserted literally instead of being interpreted as
   * replacement patterns.
   */
  restoreCodeBlocks(text, codeBlocks) {
    let restored = text;
    for (const block of codeBlocks) {
      restored = restored.replace(block.placeholder, () => block.content);
    }
    return restored;
  }

  /**
   * Chunk a markdown section.
   *
   * The section becomes one chunk when header prefix plus content fit in
   * `options.chunkSize`; otherwise paragraphs are packed into chunks, each
   * starting with the header prefix.
   */
  async chunkSection(section, options, preserveHeaders, includeHeaderHierarchy, splitCodeBlocks, tokenCounter) {
    const chunks = [];
    const sectionMetadata = () => ({
      section: section.heading,
      headingLevel: section.headingLevel,
      path: section.path
    });
    let content = section.content;
    let headerPrefix = "";
    if (preserveHeaders && section.heading) {
      if (includeHeaderHierarchy && section.path.length > 1) {
        headerPrefix = section.path.map((h, i) => "#".repeat(i + 1) + " " + h).join("\n") + "\n\n";
      } else {
        headerPrefix = "#".repeat(section.headingLevel) + " " + section.heading + "\n\n";
      }
    }
    const codeBlocks = [];
    if (!splitCodeBlocks) {
      // Swap every fenced block for a placeholder in one pass. Replacing while
      // iterating a stateful /g regex over the mutated string would skip later
      // blocks whenever the string got shorter.
      content = content.replace(/```[\s\S]*?```/g, (codeBlock) => {
        const placeholder = `__CODE_BLOCK_${codeBlocks.length}__`;
        codeBlocks.push({ placeholder, content: codeBlock });
        return placeholder;
      });
    }
    const fullContent = headerPrefix + content;
    if (tokenCounter(fullContent) <= options.chunkSize) {
      const finalContent = this.restoreCodeBlocks(fullContent, codeBlocks);
      chunks.push(
        this.createChunk(
          finalContent.trim(),
          0,
          section.startPosition,
          options,
          sectionMetadata()
        )
      );
      return chunks;
    }
    const paragraphs = content.split(/\n\n+/);
    let currentContent = headerPrefix;
    let chunkStart = section.startPosition;
    for (const paragraph of paragraphs) {
      const para = this.restoreCodeBlocks(paragraph, codeBlocks);
      const testContent = currentContent + para + "\n\n";
      if (tokenCounter(testContent) > options.chunkSize && currentContent !== headerPrefix) {
        chunks.push(
          this.createChunk(
            currentContent.trim(),
            chunks.length,
            chunkStart,
            options,
            sectionMetadata()
          )
        );
        currentContent = headerPrefix + para + "\n\n";
        chunkStart = section.startPosition + content.indexOf(paragraph);
      } else {
        currentContent = testContent;
      }
    }
    if (currentContent.trim() && currentContent !== headerPrefix.trim()) {
      chunks.push(
        this.createChunk(
          currentContent.trim(),
          chunks.length,
          chunkStart,
          options,
          sectionMetadata()
        )
      );
    }
    return chunks;
  }
};
649
/**
 * Factory for the markdown chunking strategy.
 * @returns {MarkdownChunker} A new chunker instance on every call.
 */
function createMarkdownChunker() {
  const chunker = new MarkdownChunker();
  return chunker;
}
652
+
653
+ // src/chunking/CodeChunker.ts
654
/**
 * Per-language regular expressions used by the code chunker.
 *
 * - `functionStart`: start of a function. The name is in capture group 1
 *   (declarations) or group 2 (arrow functions assigned to a variable). Arrow
 *   functions are recognised with a parenthesised parameter list or with a
 *   single bare identifier parameter of any length.
 * - `classStart`: start of a class-like definition; name in group 1.
 * - `importPattern`: global pattern collecting import statements from a file.
 *   For JavaScript this covers `import … from "x"`, `import "x"`,
 *   `const x = require("x")` and bare `require("x")` / `import("x")` at the
 *   start of a line.
 * - `commentPattern`: global pattern matching comments.
 * - `blockEnd`: marker for the end of a top-level block.
 */
var LANGUAGE_PATTERNS = {
  typescript: {
    functionStart: /^(?:export\s+)?(?:async\s+)?function\s+(\w+)|^(?:export\s+)?const\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[A-Za-z_$][\w$]*)\s*=>/m,
    classStart: /^(?:export\s+)?(?:abstract\s+)?class\s+(\w+)/m,
    importPattern: /^import\s+.*?(?:from\s+['"][^'"]+['"]|['"][^'"]+['"])/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  },
  javascript: {
    functionStart: /^(?:export\s+)?(?:async\s+)?function\s+(\w+)|^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[A-Za-z_$][\w$]*)\s*=>/m,
    classStart: /^(?:export\s+)?class\s+(\w+)/m,
    importPattern: /^import\s+.*?(?:from\s+['"][^'"]+['"]|['"][^'"]+['"])|^(?:(?:const|let|var)\s+[^=\n]+=\s*)?(?:import|require)\s*\(?\s*['"][^'"]+['"]\)?/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  },
  python: {
    functionStart: /^(?:async\s+)?def\s+(\w+)/m,
    classStart: /^class\s+(\w+)/m,
    importPattern: /^(?:from\s+\S+\s+)?import\s+.+$/gm,
    commentPattern: /'''[\s\S]*?'''|"""[\s\S]*?"""|#.*/g,
    // Python uses indentation: a block ends at the next non-indented line.
    blockEnd: /^(?=\S)/m
  },
  go: {
    functionStart: /^func\s+(?:\([^)]+\)\s+)?(\w+)/m,
    classStart: /^type\s+(\w+)\s+struct/m,
    importPattern: /^import\s+(?:\([\s\S]*?\)|"[^"]+")/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  },
  rust: {
    functionStart: /^(?:pub\s+)?(?:async\s+)?fn\s+(\w+)/m,
    classStart: /^(?:pub\s+)?(?:struct|impl|trait)\s+(\w+)/m,
    importPattern: /^use\s+.+;$/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  }
};
692
/**
 * Source-code chunking strategy: the file is cut into function, class and
 * "other" blocks using per-language patterns and brace counting. Blocks that
 * exceed `chunkSize` are split line by line. When `includeImports` is set,
 * the file's import statements are prepended to every chunk.
 */
var CodeChunker = class extends BaseChunker {
  strategyType = "code";

  /**
   * @param {string} text - Source code.
   * @param {object} [options] - Base options plus `language` (detected when
   *   omitted), `splitBy` ("function", "class" or "auto"; default "auto"),
   *   `includeComments` (default true) and `includeImports` (default true).
   * @returns {Promise<object[]>} Chunk records, re-indexed from 0.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const language = options?.language ?? this.detectLanguage(text);
    const splitBy = options?.splitBy ?? "auto";
    const includeComments = options?.includeComments ?? true;
    const includeImports = options?.includeImports ?? true;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    // Unknown languages fall back to the TypeScript patterns.
    const patterns = LANGUAGE_PATTERNS[language] ?? LANGUAGE_PATTERNS.typescript;
    const blocks = this.parseCode(text, patterns, splitBy, includeComments);
    let importBlock = "";
    if (includeImports) {
      const imports = text.match(patterns.importPattern);
      if (imports) {
        importBlock = imports.join("\n") + "\n\n";
      }
    }
    let chunks = [];
    for (const block of blocks) {
      if (block.type === "import") continue;
      const blockContent = includeImports && block.type !== "comment"
        ? importBlock + block.content
        : block.content;
      if (tokenCounter(blockContent) <= opts.chunkSize) {
        chunks.push(
          this.createChunk(
            blockContent.trim(),
            chunks.length,
            block.startPosition,
            opts,
            {
              language,
              blockType: block.type,
              blockName: block.name
            }
          )
        );
      } else {
        chunks.push(
          ...this.splitLargeBlock(block, importBlock, opts, tokenCounter, language)
        );
      }
    }
    chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
    return chunks.map((c, i) => ({ ...c, index: i }));
  }

  /**
   * Detect programming language from simple substring heuristics, checked in
   * the order TypeScript, Python, Go, Rust, JavaScript. Defaults to
   * "typescript" when nothing matches.
   */
  detectLanguage(text) {
    const has = (needle) => text.includes(needle);
    if (has("import type") || has(": string") || has("interface ")) {
      return "typescript";
    }
    if (has("def ") && has(":")) {
      return "python";
    }
    if (has("func ") && has("package ")) {
      return "go";
    }
    if (has("fn ") && (has("let mut") || has("pub fn"))) {
      return "rust";
    }
    if (has("const ") || has("function ") || has("require(")) {
      return "javascript";
    }
    return "typescript";
  }

  /**
   * Parse code into blocks.
   *
   * A function or class start line opens a new block (closing any open one).
   * Following lines are appended until the running `{` / `}` balance drops to
   * zero or below. Any other non-blank line outside a block opens an "other"
   * block. With `includeComments` false, comments are stripped from every
   * block's content.
   *
   * @returns {object[]} Blocks with `type`, optional `name`, `content` and `startPosition`.
   */
  parseCode(text, patterns, splitBy, includeComments) {
    const blocks = [];
    const lines = text.split("\n");
    const braceDelta = (line) =>
      (line.match(/{/g) || []).length - (line.match(/}/g) || []).length;
    const wantFunctions = splitBy === "function" || splitBy === "auto";
    const wantClasses = splitBy === "class" || splitBy === "auto";
    let currentBlock = null;
    let braceCount = 0;
    let position = 0;
    for (const line of lines) {
      const lineStart = position;
      position += line.length + 1;
      // Function starts take precedence over class starts on the same line.
      const funcMatch = wantFunctions ? line.match(patterns.functionStart) : null;
      const classMatch = !funcMatch && wantClasses ? line.match(patterns.classStart) : null;
      if (funcMatch || classMatch) {
        if (currentBlock) {
          blocks.push(currentBlock);
        }
        currentBlock = {
          type: funcMatch ? "function" : "class",
          name: funcMatch ? funcMatch[1] || funcMatch[2] : classMatch[1],
          content: line + "\n",
          startPosition: lineStart
        };
        braceCount = braceDelta(line);
        continue;
      }
      if (currentBlock) {
        currentBlock.content += line + "\n";
        braceCount += braceDelta(line);
        if (braceCount <= 0) {
          blocks.push(currentBlock);
          currentBlock = null;
          braceCount = 0;
        }
      } else if (line.trim()) {
        currentBlock = {
          type: "other",
          content: line + "\n",
          startPosition: lineStart
        };
      }
    }
    if (currentBlock) {
      blocks.push(currentBlock);
    }
    if (!includeComments) {
      return blocks.map((block) => ({
        ...block,
        content: block.content.replace(patterns.commentPattern, "")
      }));
    }
    return blocks;
  }

  /**
   * Split a large code block line by line into chunks of at most
   * `options.chunkSize` tokens, each starting with `importBlock`. Chunks are
   * flagged `partial`; the flag is false only when the block ends up as a
   * single chunk.
   */
  splitLargeBlock(block, importBlock, options, tokenCounter, language) {
    const chunks = [];
    const lines = block.content.split("\n");
    const describe = (partial) => ({
      language,
      blockType: block.type,
      blockName: block.name,
      partial
    });
    let currentContent = importBlock;
    let chunkStart = block.startPosition;
    // Running offset of the current line inside block.content. Looking the
    // line up with indexOf would return its first occurrence, which is wrong
    // for blank or repeated lines such as "}".
    let lineOffset = 0;
    for (const line of lines) {
      const testContent = currentContent + line + "\n";
      if (tokenCounter(testContent) > options.chunkSize && currentContent !== importBlock) {
        chunks.push(
          this.createChunk(
            currentContent.trim(),
            chunks.length,
            chunkStart,
            options,
            describe(true)
          )
        );
        currentContent = importBlock + line + "\n";
        chunkStart = block.startPosition + lineOffset;
      } else {
        currentContent = testContent;
      }
      lineOffset += line.length + 1;
    }
    if (currentContent.trim() && currentContent !== importBlock.trim()) {
      chunks.push(
        this.createChunk(
          currentContent.trim(),
          chunks.length,
          chunkStart,
          options,
          describe(chunks.length > 0)
        )
      );
    }
    return chunks;
  }
};
887
/**
 * Factory for the source-code chunking strategy.
 * @returns {CodeChunker} A new chunker instance on every call.
 */
function createCodeChunker() {
  const chunker = new CodeChunker();
  return chunker;
}
890
+
891
+ // src/core/EmbeddingModel.ts
892
/**
 * Base class for embedding models. The accessors read from `this.info`, which
 * this class does not set itself. Also hosts static vector helpers
 * (similarity, distance, normalisation, averaging).
 */
var EmbeddingModel = class {
  /** Get model dimensions */
  get dimensions() {
    return this.info.dimensions;
  }

  /** Get max tokens */
  get maxTokens() {
    return this.info.maxTokens;
  }

  /** Get max batch size */
  get maxBatchSize() {
    return this.info.maxBatchSize;
  }

  /** Get model name */
  get name() {
    return this.info.name;
  }

  /** Get provider name */
  get provider() {
    return this.info.provider;
  }

  /**
   * Count tokens in text (default implementation: one token per four
   * characters, rounded up). Subclasses should override for accurate counting.
   */
  countTokens(text) {
    return Math.ceil(text.length / 4);
  }

  /** Check if text exceeds max tokens */
  exceedsMaxTokens(text) {
    return this.countTokens(text) > this.maxTokens;
  }

  /**
   * Truncate text to max tokens.
   *
   * The text is cut proportionally to the token excess, with a 5% safety
   * margin. The cut is repeated until `countTokens` reports a value within
   * the limit, because one proportional cut does not guarantee that when a
   * subclass counts tokens non-uniformly.
   *
   * @param {string} text - Text to truncate.
   * @returns {string} `text` itself when it fits, otherwise a prefix of it.
   */
  truncateToMaxTokens(text) {
    const limit = this.maxTokens;
    let tokens = this.countTokens(text);
    if (tokens <= limit) {
      return text;
    }
    let truncated = text;
    do {
      const ratio = limit / tokens;
      const targetLength = Math.floor(truncated.length * ratio * 0.95);
      truncated = truncated.slice(0, targetLength);
      tokens = this.countTokens(truncated);
    } while (truncated.length > 0 && tokens > limit);
    return truncated;
  }

  /**
   * Calculate cosine similarity between two vectors.
   * @returns {number} Similarity, or 0 when either vector has zero magnitude.
   * @throws {Error} When the vectors differ in length.
   */
  static cosineSimilarity(a, b) {
    if (a.length !== b.length) {
      throw new Error(`Vector dimensions mismatch: ${a.length} vs ${b.length}`);
    }
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
    if (magnitude === 0) {
      return 0;
    }
    return dotProduct / magnitude;
  }

  /**
   * Calculate Euclidean distance between two vectors.
   * @throws {Error} When the vectors differ in length.
   */
  static euclideanDistance(a, b) {
    if (a.length !== b.length) {
      throw new Error(`Vector dimensions mismatch: ${a.length} vs ${b.length}`);
    }
    let sum = 0;
    for (let i = 0; i < a.length; i++) {
      const diff = a[i] - b[i];
      sum += diff * diff;
    }
    return Math.sqrt(sum);
  }

  /**
   * Calculate dot product of two vectors.
   * @throws {Error} When the vectors differ in length.
   */
  static dotProduct(a, b) {
    if (a.length !== b.length) {
      throw new Error(`Vector dimensions mismatch: ${a.length} vs ${b.length}`);
    }
    let result = 0;
    for (let i = 0; i < a.length; i++) {
      result += a[i] * b[i];
    }
    return result;
  }

  /**
   * Normalize a vector to unit length. A zero vector is returned as a copy.
   * The input is never mutated.
   */
  static normalize(vector) {
    let norm = 0;
    for (let i = 0; i < vector.length; i++) {
      norm += vector[i] * vector[i];
    }
    norm = Math.sqrt(norm);
    if (norm === 0) {
      return vector.slice();
    }
    return vector.map((v) => v / norm);
  }

  /**
   * Average multiple vectors component-wise.
   * @throws {Error} When `vectors` is empty or the vectors differ in length.
   */
  static average(vectors) {
    if (vectors.length === 0) {
      throw new Error("Cannot average empty array of vectors");
    }
    const dimensions = vectors[0].length;
    const result = new Array(dimensions).fill(0);
    for (const vector of vectors) {
      if (vector.length !== dimensions) {
        throw new Error(
          `Vector dimensions mismatch: expected ${dimensions}, got ${vector.length}`
        );
      }
      for (let i = 0; i < dimensions; i++) {
        result[i] += vector[i];
      }
    }
    for (let i = 0; i < dimensions; i++) {
      result[i] /= vectors.length;
    }
    return result;
  }

  /**
   * Weighted average of vectors: sum of `vector * weight` divided by the
   * total weight.
   * @throws {Error} When `vectors` is empty, the arrays differ in length, the
   *   vectors differ in length, or the weights sum to zero.
   */
  static weightedAverage(vectors, weights) {
    if (vectors.length === 0) {
      throw new Error("Cannot average empty array of vectors");
    }
    if (vectors.length !== weights.length) {
      throw new Error("Vectors and weights arrays must have same length");
    }
    const dimensions = vectors[0].length;
    const result = new Array(dimensions).fill(0);
    let totalWeight = 0;
    for (let j = 0; j < vectors.length; j++) {
      const vector = vectors[j];
      const weight = weights[j];
      totalWeight += weight;
      if (vector.length !== dimensions) {
        throw new Error(
          `Vector dimensions mismatch: expected ${dimensions}, got ${vector.length}`
        );
      }
      for (let i = 0; i < dimensions; i++) {
        result[i] += vector[i] * weight;
      }
    }
    if (totalWeight === 0) {
      throw new Error("Total weight cannot be zero");
    }
    for (let i = 0; i < dimensions; i++) {
      result[i] /= totalWeight;
    }
    return result;
  }
};
1069
/**
 * In-memory registry of embedding models, keyed by "<provider>:<name>".
 *
 * One registered key can be marked as the default. The first model ever
 * registered becomes the default automatically.
 */
var ModelRegistry = class {
  /** Registered models, keyed by "<provider>:<name>". */
  models = /* @__PURE__ */ new Map();
  /** Key of the default model, or null when there is none. */
  defaultModel = null;

  /**
   * Add a model to the registry, replacing any model stored under the same key.
   *
   * @param {object} model - Model exposing `provider` and `name` properties.
   * @param {boolean} [isDefault=false] - Force this model to become the default.
   *   A model also becomes the default when no default is currently set.
   */
  register(model, isDefault = false) {
    const registryKey = `${model.provider}:${model.name}`;
    this.models.set(registryKey, model);
    const hasNoDefault = this.defaultModel === null;
    if (isDefault || hasNoDefault) {
      this.defaultModel = registryKey;
    }
  }

  /**
   * Look up a model by provider and name.
   *
   * @returns {object|undefined} The model, or undefined when not registered.
   */
  get(provider, name) {
    const registryKey = `${provider}:${name}`;
    return this.models.get(registryKey);
  }

  /**
   * Look up a model by its full "<provider>:<name>" key.
   *
   * @returns {object|undefined} The model, or undefined when not registered.
   */
  getByKey(key) {
    return this.models.get(key);
  }

  /**
   * Fetch the current default model.
   *
   * @returns {object|undefined} The default model, or undefined when none is set.
   */
  getDefault() {
    return this.defaultModel === null
      ? void 0
      : this.models.get(this.defaultModel);
  }

  /**
   * Mark an already registered model as the default.
   *
   * @throws {Error} If no model is registered under the given provider and name.
   */
  setDefault(provider, name) {
    const registryKey = `${provider}:${name}`;
    const isRegistered = this.models.has(registryKey);
    if (!isRegistered) {
      throw new Error(`Model ${registryKey} not found in registry`);
    }
    this.defaultModel = registryKey;
  }

  /**
   * Collect the `info` property of every registered model, in insertion order.
   *
   * @returns {Array} One `info` value per registered model.
   */
  list() {
    const infos = [];
    for (const registered of this.models.values()) {
      infos.push(registered.info);
    }
    return infos;
  }

  /**
   * Report whether a model is registered under the given provider and name.
   *
   * @returns {boolean}
   */
  has(provider, name) {
    const registryKey = `${provider}:${name}`;
    return this.models.has(registryKey);
  }

  /**
   * Unregister a model. If it was the default, the default is cleared;
   * no other model is promoted in its place.
   *
   * @returns {boolean} True when a model was actually removed.
   */
  remove(provider, name) {
    const registryKey = `${provider}:${name}`;
    const wasDefault = this.defaultModel === registryKey;
    if (wasDefault) {
      this.defaultModel = null;
    }
    return this.models.delete(registryKey);
  }

  /**
   * Drop every registered model and clear the default.
   */
  clear() {
    this.models.clear();
    this.defaultModel = null;
  }
};

// Shared module-level registry instance.
var modelRegistry = new ModelRegistry();
1144
+
1145
+ // src/chunking/SemanticChunker.ts
1146
/**
 * Chunker that groups sentences by meaning.
 *
 * When an embedding function is supplied, chunk boundaries are placed where
 * the embeddings of neighbouring sentences diverge the most. Without one, it
 * falls back to packing whole sentences up to the configured chunk size.
 *
 * Relies on `getOptions` and `createChunk`, which this class does not define
 * (presumably inherited from BaseChunker).
 */
var SemanticChunker = class extends BaseChunker {
  strategyType = "semantic";

  /**
   * Split `text` into semantically coherent chunks.
   *
   * Options read directly from `options`:
   * - `similarityThreshold` (default 0.5): a gap only counts as a boundary
   *   when its cosine distance is at least `1 - similarityThreshold`.
   * - `breakpointPercentileThreshold` (default 95): percentile of the observed
   *   distances that a gap must also reach to count as a boundary.
   * - `bufferSize` (default 1): number of sentences averaged on each side of
   *   a gap when measuring its distance.
   * - `embeddingFn`: called once with the array of sentence texts; its awaited
   *   result is read by index, one embedding per sentence.
   *
   * @param {string} text - Text to chunk.
   * @param {object} [options] - Chunking options (see above).
   * @returns {Promise<Array>} The chunks; empty when `text` yields no sentences.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const similarityThreshold = options?.similarityThreshold ?? 0.5;
    const breakpointPercentile = options?.breakpointPercentileThreshold ?? 95;
    const bufferSize = options?.bufferSize ?? 1;
    const embeddingFn = options?.embeddingFn;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const sentences = this.splitSentences(text);
    if (sentences.length === 0) {
      return [];
    }
    if (!embeddingFn) {
      return this.fallbackChunk(sentences, opts, tokenCounter);
    }
    const sentenceTexts = sentences.map((s) => s.text);
    const embeddings = await embeddingFn(sentenceTexts);
    const sentencesWithEmbeddings = sentences.map((s, i) => ({
      ...s,
      embedding: embeddings[i]
    }));
    const distances = this.calculateDistances(
      sentencesWithEmbeddings,
      bufferSize
    );
    // A Set keeps the per-sentence membership test O(1) instead of rescanning
    // the breakpoint array for every sentence.
    const breakpoints = new Set(
      this.findBreakpoints(distances, breakpointPercentile, similarityThreshold)
    );
    let chunks = [];
    let chunkStart = 0;
    let chunkText = "";
    let chunkPosition = sentences[0]?.position ?? 0;
    for (let i = 0; i < sentences.length; i++) {
      chunkText += (chunkText ? " " : "") + sentences[i].text;
      // Breakpoint i means "cut after sentence i"; the last sentence always
      // closes the chunk in progress.
      if (breakpoints.has(i) || i === sentences.length - 1) {
        if (chunkText.trim()) {
          chunks.push(
            this.createChunk(
              chunkText.trim(),
              chunks.length,
              chunkPosition,
              opts,
              {
                boundaryType: "semantic",
                sentenceCount: i - chunkStart + 1
              }
            )
          );
        }
        if (i < sentences.length - 1) {
          chunkStart = i + 1;
          chunkText = "";
          chunkPosition = sentences[i + 1].position;
        }
      }
    }
    chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
    chunks = this.splitLargeChunks(chunks, opts.maxChunkSize, tokenCounter);
    return chunks;
  }

  /**
   * Split text into sentences.
   *
   * A sentence is a run of characters other than `.`, `!`, `?` followed by
   * one or more of those terminators. A final run with no terminator is also
   * returned as a sentence, so trailing text is never dropped.
   *
   * @param {string} text - Text to split.
   * @returns {{text: string, position: number}[]} Trimmed sentences with the
   *   offset in `text` where each match started (before trimming).
   */
  splitSentences(text) {
    // The `$` alternative captures an unterminated tail such as
    // "Done. And then" -> ["Done.", "And then"]; without it the tail is lost.
    const sentenceRegex = /[^.!?]+(?:[.!?]+|$)/g;
    const sentences = [];
    let match;
    while ((match = sentenceRegex.exec(text)) !== null) {
      const sentence = match[0].trim();
      if (sentence) {
        sentences.push({
          text: sentence,
          position: match.index
        });
      }
    }
    // Text made only of terminators (e.g. "???") matches nothing above;
    // keep it as a single sentence rather than returning nothing.
    if (sentences.length === 0 && text.trim()) {
      sentences.push({
        text: text.trim(),
        position: 0
      });
    }
    return sentences;
  }

  /**
   * Measure the cosine distance across every gap between adjacent sentences.
   *
   * For the gap after sentence i, up to `bufferSize` embeddings ending at i
   * are averaged and compared with the average of up to `bufferSize`
   * embeddings starting at i + 1.
   *
   * @param {Array} sentences - Sentences carrying an optional `embedding`.
   * @param {number} bufferSize - Sentences to average on each side of a gap.
   * @returns {number[]} One distance (1 - cosine similarity) per gap; 0 when
   *   either side has no embedding.
   */
  calculateDistances(sentences, bufferSize) {
    const distances = [];
    for (let i = 0; i < sentences.length - 1; i++) {
      const leftStart = Math.max(0, i - bufferSize + 1);
      const rightEnd = Math.min(sentences.length, i + bufferSize + 1);
      const leftEmbeddings = sentences
        .slice(leftStart, i + 1)
        .map((s) => s.embedding)
        .filter((e) => e !== void 0);
      const rightEmbeddings = sentences
        .slice(i + 1, rightEnd)
        .map((s) => s.embedding)
        .filter((e) => e !== void 0);
      if (leftEmbeddings.length > 0 && rightEmbeddings.length > 0) {
        const leftAvg = EmbeddingModel.average(leftEmbeddings);
        const rightAvg = EmbeddingModel.average(rightEmbeddings);
        const similarity = EmbeddingModel.cosineSimilarity(leftAvg, rightAvg);
        distances.push(1 - similarity);
      } else {
        distances.push(0);
      }
    }
    return distances;
  }

  /**
   * Pick the gaps whose distance is large enough to start a new chunk.
   *
   * The cut-off is the larger of the requested percentile of the observed
   * distances and `1 - minThreshold`.
   *
   * @param {number[]} distances - Distance per gap between adjacent sentences.
   * @param {number} percentile - Percentile (0-100) of distances to use.
   * @param {number} minThreshold - Similarity threshold; converted to a
   *   distance floor of `1 - minThreshold`.
   * @returns {number[]} Indices i such that a boundary follows sentence i.
   */
  findBreakpoints(distances, percentile, minThreshold) {
    if (distances.length === 0) return [];
    const sortedDistances = [...distances].sort((a, b) => a - b);
    const percentileIndex = Math.floor(
      percentile / 100 * sortedDistances.length
    );
    // An index past the end (e.g. percentile 100) falls back to the maximum.
    const percentileThreshold =
      sortedDistances[percentileIndex] ??
      sortedDistances[sortedDistances.length - 1];
    const threshold = Math.max(percentileThreshold, 1 - minThreshold);
    const breakpoints = [];
    for (let i = 0; i < distances.length; i++) {
      if (distances[i] >= threshold) {
        breakpoints.push(i);
      }
    }
    return breakpoints;
  }

  /**
   * Chunk without embeddings: pack whole sentences into each chunk until
   * adding another would exceed `options.chunkSize` tokens.
   *
   * A single sentence longer than `chunkSize` still becomes its own chunk.
   *
   * @param {{text: string, position: number}[]} sentences - Sentences to pack.
   * @param {object} options - Resolved options; `chunkSize` is read here.
   * @param {Function} tokenCounter - Returns the token count of a string.
   * @returns {Array} Chunks tagged with `boundaryType: "sentence"`.
   */
  fallbackChunk(sentences, options, tokenCounter) {
    const chunks = [];
    let currentText = "";
    let chunkPosition = sentences[0]?.position ?? 0;
    for (const sentence of sentences) {
      const testText = currentText
        ? currentText + " " + sentence.text
        : sentence.text;
      if (tokenCounter(testText) > options.chunkSize && currentText) {
        chunks.push(
          this.createChunk(
            currentText.trim(),
            chunks.length,
            chunkPosition,
            options,
            { boundaryType: "sentence" }
          )
        );
        currentText = sentence.text;
        chunkPosition = sentence.position;
      } else {
        currentText = testText;
      }
    }
    if (currentText.trim()) {
      chunks.push(
        this.createChunk(
          currentText.trim(),
          chunks.length,
          chunkPosition,
          options,
          { boundaryType: "sentence" }
        )
      );
    }
    return chunks;
  }

  /**
   * Break up chunks whose `tokenCount` exceeds `maxTokens`.
   *
   * Oversized chunks are re-split into sentences and re-packed into pieces of
   * at most `maxTokens` tokens (a single longer sentence stays whole). Chunks
   * within the limit are passed through unchanged.
   *
   * @param {Array} chunks - Chunks to check.
   * @param {number} maxTokens - Maximum tokens allowed per chunk.
   * @param {Function} tokenCounter - Returns the token count of a string.
   * @returns {Array} New chunk list with oversized chunks replaced by pieces.
   */
  splitLargeChunks(chunks, maxTokens, tokenCounter) {
    const result = [];
    for (const chunk2 of chunks) {
      if (chunk2.tokenCount <= maxTokens) {
        result.push(chunk2);
        continue;
      }
      const sentences = this.splitSentences(chunk2.text);
      let currentText = "";
      let currentStart = chunk2.startPosition;
      for (const sentence of sentences) {
        const testText = currentText
          ? currentText + " " + sentence.text
          : sentence.text;
        if (tokenCounter(testText) > maxTokens && currentText) {
          result.push({
            ...chunk2,
            id: chunk2.id + "_" + result.length,
            text: currentText.trim(),
            startPosition: currentStart,
            endPosition: currentStart + currentText.length,
            tokenCount: tokenCounter(currentText),
            charCount: currentText.length,
            index: result.length
          });
          currentText = sentence.text;
          // sentence.position is an offset within chunk2.text, so it is
          // shifted by the parent chunk's start position.
          currentStart = chunk2.startPosition + sentence.position;
        } else {
          currentText = testText;
        }
      }
      if (currentText.trim()) {
        result.push({
          ...chunk2,
          id: chunk2.id + "_" + result.length,
          text: currentText.trim(),
          startPosition: currentStart,
          endPosition: currentStart + currentText.length,
          tokenCount: tokenCounter(currentText),
          charCount: currentText.length,
          index: result.length
        });
      }
    }
    return result;
  }
};
1361
/**
 * Factory for a new SemanticChunker.
 *
 * @returns {SemanticChunker} A fresh chunker instance.
 */
function createSemanticChunker() {
  const chunker = new SemanticChunker();
  return chunker;
}
1364
+
1365
+ // src/chunking/index.ts
1366
/**
 * Build a new chunker for the named strategy.
 *
 * "sentence" and "paragraph" currently resolve to the fixed chunker, and any
 * unrecognized strategy falls back to the recursive chunker.
 *
 * @param {string} strategy - Strategy name, e.g. "fixed", "recursive",
 *   "markdown", "code", "semantic", "sentence" or "paragraph".
 * @returns {object} A new chunker instance.
 */
function createChunker(strategy) {
  // A Map (not a plain object) so inherited keys such as "toString" never match.
  const chunkerByStrategy = new Map([
    ["fixed", FixedChunker],
    ["recursive", RecursiveChunker],
    ["markdown", MarkdownChunker],
    ["code", CodeChunker],
    ["semantic", SemanticChunker],
    ["sentence", FixedChunker],
    ["paragraph", FixedChunker]
  ]);
  const ChunkerClass = chunkerByStrategy.get(strategy) ?? RecursiveChunker;
  return new ChunkerClass();
}
1386
/**
 * Chunk text in one call: create a chunker for `strategy` and run it.
 *
 * @param {string} text - Text to chunk.
 * @param {string} [strategy="recursive"] - Strategy name passed to createChunker.
 * @param {object} [options] - Options forwarded unchanged to the chunker.
 * @returns {Promise<Array>} Whatever the selected chunker's `chunk` produces.
 */
async function chunk(text, strategy = "recursive", options) {
  return createChunker(strategy).chunk(text, options);
}
1390
+ // Annotate the CommonJS export names for ESM import in node:
1391
+ 0 && (module.exports = {
1392
+ BaseChunker,
1393
+ CodeChunker,
1394
+ FixedChunker,
1395
+ MarkdownChunker,
1396
+ RecursiveChunker,
1397
+ SemanticChunker,
1398
+ chunk,
1399
+ createChunker,
1400
+ createCodeChunker,
1401
+ createFixedChunker,
1402
+ createMarkdownChunker,
1403
+ createRecursiveChunker,
1404
+ createSemanticChunker,
1405
+ defaultTokenCounter,
1406
+ mergeSmallChunks,
1407
+ splitLargeChunks
1408
+ });