@lov3kaizen/agentsea-embeddings 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1117 @@
1
+ import {
2
+ EmbeddingModel
3
+ } from "./chunk-QAITLJ2E.mjs";
4
+
5
+ // src/chunking/BaseChunker.ts
6
+ import { nanoid } from "nanoid";
7
/**
 * Fallback token estimator: assumes roughly four characters per token and
 * rounds up.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count.
 */
const defaultTokenCounter = (text) => Math.ceil(text.length / 4);
10
/**
 * Shared behaviour for the chunking strategies in this module: option
 * defaults, chunk construction, overlap bookkeeping and a timed
 * `chunkWithResult` wrapper.
 *
 * `chunkWithResult` calls `this.chunk(text, options)` and reads
 * `this.strategyType`, so subclasses are expected to provide both.
 */
class BaseChunker {
  /** Default options (sizes are compared against token counts). */
  defaultOptions = {
    chunkSize: 512,
    chunkOverlap: 50,
    minChunkSize: 100,
    maxChunkSize: 2e3,
    tokenCounter: defaultTokenCounter
  };

  /**
   * Merge caller options with the defaults.
   *
   * @param {object} [options] - Partial options; nullish fields fall back to defaults.
   * @returns {object} Fully populated options (metadata defaults to `{}`).
   */
  getOptions(options) {
    const defaults = this.defaultOptions;
    return {
      chunkSize: options?.chunkSize ?? defaults.chunkSize,
      chunkOverlap: options?.chunkOverlap ?? defaults.chunkOverlap,
      minChunkSize: options?.minChunkSize ?? defaults.minChunkSize,
      maxChunkSize: options?.maxChunkSize ?? defaults.maxChunkSize,
      tokenCounter: options?.tokenCounter ?? defaults.tokenCounter,
      documentId: options?.documentId,
      source: options?.source,
      type: options?.type,
      metadata: options?.metadata ?? {}
    };
  }

  /**
   * Build a chunk record for `text`.
   *
   * @param {string} text - Chunk text.
   * @param {number} index - Position of the chunk in the output list.
   * @param {number} startPosition - Offset the chunk is reported to start at.
   * @param {object} options - Options (tokenCounter, metadata, documentId, source, type).
   * @param {object} [additionalMetadata] - Extra metadata; overrides `options.metadata` keys.
   * @returns {object} Chunk with a fresh id, counts, zeroed overlap fields and metadata.
   */
  createChunk(text, index, startPosition, options, additionalMetadata) {
    const tokenCounter = options.tokenCounter ?? defaultTokenCounter;
    const metadata = {
      ...options.metadata,
      ...additionalMetadata
    };
    // Document-level fields are written last, so they win over metadata keys.
    if (options.documentId) metadata.documentId = options.documentId;
    if (options.source) metadata.source = options.source;
    if (options.type) metadata.type = options.type;
    return {
      id: nanoid(),
      text,
      index,
      startPosition,
      endPosition: startPosition + text.length,
      tokenCount: tokenCounter(text),
      charCount: text.length,
      overlapPrev: 0,
      overlapNext: 0,
      metadata
    };
  }

  /**
   * Record the same overlap size on every pair of neighbouring chunks.
   * Mutates the chunks in place.
   *
   * @param {object[]} chunks - Chunks in document order.
   * @param {number} overlapChars - Value stored in `overlapPrev` / `overlapNext`.
   */
  setOverlapInfo(chunks, overlapChars) {
    for (let i = 1; i < chunks.length; i++) {
      chunks[i].overlapPrev = overlapChars;
      chunks[i - 1].overlapNext = overlapChars;
    }
  }

  /**
   * Split text into pieces of at most roughly `chunkSize` tokens, stepping
   * back by `overlap` tokens (approximated as 4 characters each) between
   * consecutive pieces. Pieces are cut at the last space when one is available.
   *
   * @param {string} text - Text to split.
   * @param {number} chunkSize - Target size in tokens.
   * @param {number} overlap - Overlap in tokens; negative values are treated as 0.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {string[]} Trimmed, non-empty pieces.
   */
  splitWithOverlap(text, chunkSize, overlap, tokenCounter) {
    const pieces = [];
    const overlapChars = Math.max(0, Math.floor(overlap * 4));
    let start = 0;
    while (start < text.length) {
      let end = start;
      let tokens = 0;
      while (end < text.length && tokens < chunkSize) {
        end++;
        tokens = tokenCounter(text.slice(start, end));
      }
      if (end < text.length) {
        const lastSpace = text.lastIndexOf(" ", end);
        if (lastSpace > start) {
          end = lastSpace + 1;
        }
      }
      pieces.push(text.slice(start, end).trim());
      // Once the end of the text is covered, stop. Stepping back by the
      // overlap here would re-emit ever-shorter copies of the final piece.
      if (end >= text.length) break;
      // Always advance by at least one character so the loop terminates.
      start = Math.max(start + 1, end - overlapChars);
    }
    return pieces.filter((piece) => piece.length > 0);
  }

  /**
   * Run `this.chunk` and wrap the chunks with summary statistics.
   *
   * @param {string} text - Text to chunk.
   * @param {object} [options] - Options forwarded to `this.chunk`.
   * @returns {Promise<object>} Chunks plus totals, average token count per
   *   chunk, elapsed milliseconds, strategy name and original text length.
   */
  async chunkWithResult(text, options) {
    const startTime = performance.now();
    const chunks = await this.chunk(text, options);
    const processingTimeMs = performance.now() - startTime;
    const totalTokens = chunks.reduce((sum, c) => sum + c.tokenCount, 0);
    return {
      chunks,
      totalChunks: chunks.length,
      totalTokens,
      avgChunkSize: chunks.length > 0 ? totalTokens / chunks.length : 0,
      processingTimeMs,
      strategy: this.strategyType,
      originalLength: text.length
    };
  }
}
114
/**
 * Fold chunks forward until each accumulated chunk reaches `minTokens`.
 *
 * A chunk whose token count is below `minTokens` absorbs the chunk that
 * follows it (texts joined with a newline). The input chunks are not
 * mutated; merged results are shallow copies re-indexed from 0. A trailing
 * chunk that is still too small is kept as-is.
 *
 * @param {object[]} chunks - Chunks in document order.
 * @param {number} minTokens - Minimum token count a chunk should reach.
 * @param {(text: string) => number} tokenCounter - Token counting function.
 * @returns {object[]} Merged chunks (the same array when it has 0 or 1 items).
 */
function mergeSmallChunks(chunks, minTokens, tokenCounter) {
  if (chunks.length <= 1) return chunks;
  const merged = [];
  let current = null;
  for (const next of chunks) {
    if (current === null) {
      current = { ...next };
      continue;
    }
    if (current.tokenCount < minTokens) {
      // Tokenize only when a merge actually happens; the counter may be costly.
      const combinedText = current.text + "\n" + next.text;
      current.text = combinedText;
      current.tokenCount = tokenCounter(combinedText);
      current.charCount = combinedText.length;
      current.endPosition = next.endPosition;
    } else {
      merged.push(current);
      current = { ...next };
    }
  }
  if (current !== null) {
    merged.push(current);
  }
  return merged.map((c, i) => ({ ...c, index: i }));
}
140
/**
 * Break chunks whose `tokenCount` exceeds `maxTokens` into sentence groups.
 *
 * Oversized chunks are split on whitespace that follows `.`, `!` or `?`;
 * sentences are re-joined with single spaces while the group stays within
 * `maxTokens`. A single sentence larger than `maxTokens` is emitted on its
 * own. Every piece inherits the source chunk's other fields, gets a new id,
 * and the whole result is re-indexed from 0.
 *
 * @param {object[]} chunks - Chunks in document order.
 * @param {number} maxTokens - Maximum tokens allowed per chunk.
 * @param {(text: string) => number} tokenCounter - Token counting function.
 * @returns {object[]} Chunks with oversized entries replaced by their pieces.
 */
function splitLargeChunks(chunks, maxTokens, tokenCounter) {
  const result = [];
  for (const source of chunks) {
    if (source.tokenCount <= maxTokens) {
      result.push(source);
      continue;
    }
    const sentences = source.text.split(/(?<=[.!?])\s+/);
    let currentText = "";
    let currentStart = source.startPosition;
    // Offset inside source.text up to which sentences have been located.
    let cursor = 0;
    const emit = () => {
      result.push({
        ...source,
        id: nanoid(),
        text: currentText,
        startPosition: currentStart,
        endPosition: currentStart + currentText.length,
        tokenCount: tokenCounter(currentText),
        charCount: currentText.length
      });
    };
    for (const sentence of sentences) {
      // Locate the sentence in the source text so the reported position is
      // exact even when sentences are separated by more than one character.
      const found = source.text.indexOf(sentence, cursor);
      const sentenceOffset = found === -1 ? cursor : found;
      cursor = sentenceOffset + sentence.length;

      const candidate = currentText ? currentText + " " + sentence : sentence;
      if (tokenCounter(candidate) > maxTokens && currentText) {
        emit();
        currentText = sentence;
        currentStart = source.startPosition + sentenceOffset;
      } else {
        currentText = candidate;
      }
    }
    if (currentText) {
      emit();
    }
  }
  return result.map((c, i) => ({ ...c, index: i }));
}
183
+
184
+ // src/chunking/FixedChunker.ts
185
/**
 * Fixed-size chunker.
 *
 * Two modes, selected by `options.splitByChars`:
 * - character mode slices the text into windows of `chunkSize * 4`
 *   characters that overlap by `chunkOverlap * 4` characters;
 * - separator mode (default) splits on `options.separator` (default "\n")
 *   and packs parts into chunks of at most `chunkSize` tokens, seeding each
 *   new chunk with trailing sentences of the previous one as overlap.
 */
class FixedChunker extends BaseChunker {
  strategyType = "fixed";

  /**
   * @param {string} text - Text to chunk.
   * @param {object} [options] - Base options plus `splitByChars`, `separator`, `keepSeparator`.
   * @returns {Promise<object[]>} Chunks in document order.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const splitByChars = options?.splitByChars ?? false;
    const separator = options?.separator ?? "\n";
    const keepSeparator = options?.keepSeparator ?? false;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const chunks = [];
    let position = 0;
    if (splitByChars) {
      const chunkSize = opts.chunkSize * 4;
      const overlap = opts.chunkOverlap * 4;
      let start = 0;
      while (start < text.length) {
        const end = Math.min(start + chunkSize, text.length);
        const chunkText = text.slice(start, end).trim();
        if (chunkText.length > 0) {
          chunks.push(this.createChunk(chunkText, chunks.length, start, opts));
        }
        // The final window has been emitted. Stepping back by the overlap
        // here would re-enter the loop forever.
        if (end >= text.length) break;
        // Guarantee forward progress even when overlap >= window size.
        start = Math.max(end - overlap, start + 1);
      }
    } else {
      const parts = text.split(separator);
      let currentChunk = "";
      let chunkStart = 0;
      for (let i = 0; i < parts.length; i++) {
        const part = parts[i];
        const partWithSep = keepSeparator && i < parts.length - 1 ? part + separator : part;
        const testChunk = currentChunk ? currentChunk + (keepSeparator ? "" : separator) + partWithSep : partWithSep;
        const testTokens = tokenCounter(testChunk);
        if (testTokens > opts.chunkSize && currentChunk) {
          chunks.push(
            this.createChunk(currentChunk.trim(), chunks.length, chunkStart, opts)
          );
          const overlapText = this.getOverlapText(
            currentChunk,
            opts.chunkOverlap,
            tokenCounter
          );
          currentChunk = overlapText + (overlapText ? separator : "") + partWithSep;
          // `position` is the offset of the current part; back up by the overlap carried over.
          chunkStart = position - (overlapText?.length ?? 0);
        } else {
          currentChunk = testChunk;
        }
        position += part.length + separator.length;
      }
      if (currentChunk.trim()) {
        chunks.push(
          this.createChunk(currentChunk.trim(), chunks.length, chunkStart, opts)
        );
      }
    }
    this.setOverlapInfo(chunks, opts.chunkOverlap * 4);
    return chunks;
  }

  /**
   * Collect whole sentences from the end of `text` until adding another
   * would exceed `overlapTokens`. At least one sentence is returned when
   * `overlapTokens > 0`, even if that sentence alone is over the budget.
   *
   * @param {string} text - Text to take the overlap from.
   * @param {number} overlapTokens - Overlap budget in tokens.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {string} Overlap text ("" when `overlapTokens <= 0`).
   */
  getOverlapText(text, overlapTokens, tokenCounter) {
    if (overlapTokens <= 0) return "";
    const sentences = text.split(/(?<=[.!?])\s+/);
    let overlapText = "";
    for (let i = sentences.length - 1; i >= 0; i--) {
      const testText = sentences[i] + (overlapText ? " " + overlapText : "");
      const testTokens = tokenCounter(testText);
      if (testTokens > overlapTokens && overlapText) {
        break;
      }
      overlapText = testText;
    }
    return overlapText;
  }
}
270
/**
 * Factory for the fixed-size strategy.
 *
 * @returns {FixedChunker} A newly constructed chunker.
 */
function createFixedChunker() {
  const chunker = new FixedChunker();
  return chunker;
}
273
+
274
+ // src/chunking/RecursiveChunker.ts
275
/**
 * Separators tried in order by the recursive chunker, coarsest first:
 * paragraphs, lines, sentences, clauses, words, then single characters
 * (the empty string).
 */
const DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", ", ", " ", ""];
289
/**
 * Recursive chunker: splits on the coarsest separator present, recursing
 * into finer separators for pieces that are still larger than `chunkSize`
 * tokens, and finally falling back to character slicing.
 */
class RecursiveChunker extends BaseChunker {
  strategyType = "recursive";

  /**
   * @param {string} text - Text to chunk.
   * @param {object} [options] - Base options plus `separators`,
   *   `keepSeparator` (default true) and `mergeSmallChunks` (default true).
   * @returns {Promise<object[]>} Chunks in document order, with overlap text
   *   prepended when `chunkOverlap > 0`.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const separators = options?.separators ?? DEFAULT_SEPARATORS;
    const keepSeparator = options?.keepSeparator ?? true;
    const mergeSmall = options?.mergeSmallChunks ?? true;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const pieces = this.splitRecursively(
      text,
      separators,
      opts.chunkSize,
      keepSeparator,
      tokenCounter
    );
    let chunks = [];
    // Pieces are in-order substrings of `text`. Locating each trimmed piece
    // from a moving cursor gives its real offset, even when separators or
    // whitespace were dropped along the way.
    let cursor = 0;
    for (const piece of pieces) {
      const chunkText = piece.trim();
      if (!chunkText) continue;
      const found = text.indexOf(chunkText, cursor);
      const startPosition = found === -1 ? cursor : found;
      chunks.push(this.createChunk(chunkText, chunks.length, startPosition, opts));
      cursor = startPosition + chunkText.length;
    }
    if (mergeSmall) {
      chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
    }
    return this.addOverlap(chunks, opts.chunkOverlap, tokenCounter);
  }

  /**
   * Split `text` until every piece is at most `chunkSize` tokens.
   *
   * @param {string} text - Text to split.
   * @param {string[]} separators - Separators to try, coarsest first; "" means per-character.
   * @param {number} chunkSize - Maximum tokens per piece.
   * @param {boolean} keepSeparator - Keep the separator attached to the preceding piece.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {string[]} Pieces in document order.
   */
  splitRecursively(text, separators, chunkSize, keepSeparator, tokenCounter) {
    if (tokenCounter(text) <= chunkSize) {
      return [text];
    }
    for (let i = 0; i < separators.length; i++) {
      const separator = separators[i];
      if (separator === "") {
        return this.splitByChars(text, chunkSize, tokenCounter);
      }
      if (!text.includes(separator)) {
        continue;
      }
      const result = [];
      for (const split of this.splitBySeparator(text, separator, keepSeparator)) {
        if (tokenCounter(split) <= chunkSize) {
          result.push(split);
        } else {
          // Still too large: retry with only the finer separators.
          result.push(
            ...this.splitRecursively(
              split,
              separators.slice(i + 1),
              chunkSize,
              keepSeparator,
              tokenCounter
            )
          );
        }
      }
      return result;
    }
    return this.splitByChars(text, chunkSize, tokenCounter);
  }

  /**
   * Split on a literal separator, dropping whitespace-only parts.
   *
   * @param {string} text - Text to split.
   * @param {string} separator - Literal separator.
   * @param {boolean} keepSeparator - Re-attach the separator to every part but the last.
   * @returns {string[]} Non-blank parts.
   */
  splitBySeparator(text, separator, keepSeparator) {
    const parts = text.split(separator);
    const pieces = keepSeparator
      ? parts.map((part, i) => (i < parts.length - 1 ? part + separator : part))
      : parts;
    return pieces.filter((p) => p.trim());
  }

  /**
   * Last-resort split: grow a window one character at a time until it
   * reaches `chunkSize` tokens, then cut at the last space if there is one.
   *
   * @param {string} text - Text to split.
   * @param {number} chunkSize - Target tokens per piece.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {string[]} Non-blank pieces.
   */
  splitByChars(text, chunkSize, tokenCounter) {
    const pieces = [];
    let start = 0;
    while (start < text.length) {
      let end = start;
      while (end < text.length && tokenCounter(text.slice(start, end)) < chunkSize) {
        end++;
      }
      // If the window could not grow at all (e.g. chunkSize <= 0), take one
      // character anyway so the outer loop always makes progress.
      if (end === start) {
        end = start + 1;
      }
      if (end < text.length) {
        const lastSpace = text.lastIndexOf(" ", end);
        if (lastSpace > start) {
          end = lastSpace;
        }
      }
      pieces.push(text.slice(start, end));
      start = end;
    }
    return pieces.filter((c) => c.trim());
  }

  /**
   * Prepend the tail of each chunk's predecessor to it.
   *
   * @param {object[]} chunks - Chunks in document order.
   * @param {number} overlapTokens - Overlap budget in tokens.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {object[]} New chunk objects (the input array when there is nothing to do).
   */
  addOverlap(chunks, overlapTokens, tokenCounter) {
    if (overlapTokens <= 0 || chunks.length <= 1) {
      return chunks;
    }
    const lastIndex = chunks.length - 1;
    return chunks.map((chunk, i) => {
      let chunkText = chunk.text;
      let startOffset = 0;
      if (i > 0) {
        const overlapText = this.getEndOverlap(
          chunks[i - 1].text,
          overlapTokens,
          tokenCounter
        );
        if (overlapText) {
          chunkText = overlapText + " " + chunkText;
          // The chunk now begins earlier by the overlap plus the joining space.
          startOffset = -overlapText.length - 1;
        }
      }
      return {
        ...chunk,
        text: chunkText,
        startPosition: chunk.startPosition + startOffset,
        tokenCount: tokenCounter(chunkText),
        charCount: chunkText.length,
        overlapPrev: i > 0 ? overlapTokens : 0,
        overlapNext: i < lastIndex ? overlapTokens : 0
      };
    });
  }

  /**
   * Take whole words from the end of `text` while they fit in `overlapTokens`.
   *
   * @param {string} text - Text to take the overlap from.
   * @param {number} overlapTokens - Overlap budget in tokens.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {string} Overlap text ("" when even the last word is over budget).
   */
  getEndOverlap(text, overlapTokens, tokenCounter) {
    const words = text.split(/\s+/);
    let overlap = "";
    for (let i = words.length - 1; i >= 0; i--) {
      const testOverlap = words[i] + (overlap ? " " + overlap : "");
      if (tokenCounter(testOverlap) > overlapTokens) {
        break;
      }
      overlap = testOverlap;
    }
    return overlap;
  }
}
440
/**
 * Factory for the recursive strategy.
 *
 * @returns {RecursiveChunker} A newly constructed chunker.
 */
function createRecursiveChunker() {
  const chunker = new RecursiveChunker();
  return chunker;
}
443
+
444
+ // src/chunking/MarkdownChunker.ts
445
/**
 * Markdown-aware chunker: splits a document into heading-delimited
 * sections, then packs each section's paragraphs into chunks of at most
 * `chunkSize` tokens, optionally repeating the heading (or the full heading
 * hierarchy) at the top of every chunk.
 */
class MarkdownChunker extends BaseChunker {
  strategyType = "markdown";

  /**
   * @param {string} text - Markdown source.
   * @param {object} [options] - Base options plus `preserveHeaders` (true),
   *   `includeHeaderHierarchy` (true), `headingLevels` ([1..6]) and
   *   `splitCodeBlocks` (false).
   * @returns {Promise<object[]>} Chunks in document order, indexed from 0.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const preserveHeaders = options?.preserveHeaders ?? true;
    const includeHeaderHierarchy = options?.includeHeaderHierarchy ?? true;
    const headingLevels = options?.headingLevels ?? [1, 2, 3, 4, 5, 6];
    const splitCodeBlocks = options?.splitCodeBlocks ?? false;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const sections = this.parseMarkdown(text, headingLevels);
    let chunks = [];
    for (const section of sections) {
      const sectionChunks = await this.chunkSection(
        section,
        opts,
        preserveHeaders,
        includeHeaderHierarchy,
        splitCodeBlocks,
        tokenCounter
      );
      chunks.push(...sectionChunks);
    }
    chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
    return chunks.map((c, i) => ({ ...c, index: i }));
  }

  /**
   * Split markdown into sections at headings whose level is in `headingLevels`.
   *
   * Lines inside fenced code blocks (``` or ~~~) are never treated as
   * headings, so a `# comment` in a code sample stays in the section body.
   *
   * @param {string} text - Markdown source.
   * @param {number[]} headingLevels - Heading levels (1-6) that start a new section.
   * @returns {object[]} Sections with `heading`, `headingLevel`, `content`,
   *   `startPosition`, `path` (ancestor heading texts) and `pathLevels`
   *   (the matching heading levels).
   */
  parseMarkdown(text, headingLevels) {
    const sections = [];
    const headingRegex = /^(#{1,6})\s+(.+)$/;
    const fenceRegex = /^\s*(```|~~~)/;
    const headingStack = [];
    let currentSection = {
      headingLevel: 0,
      content: "",
      startPosition: 0,
      path: [],
      pathLevels: []
    };
    // Marker of the currently open fence, or null outside fenced code.
    let openFence = null;
    let position = 0;
    for (const line of text.split("\n")) {
      const fenceMatch = line.match(fenceRegex);
      if (fenceMatch) {
        if (openFence === null) {
          openFence = fenceMatch[1];
        } else if (openFence === fenceMatch[1]) {
          openFence = null;
        }
      }
      const headingMatch = openFence === null ? line.match(headingRegex) : null;
      const level = headingMatch ? headingMatch[1].length : 0;
      if (headingMatch && headingLevels.includes(level)) {
        const headingText = headingMatch[2];
        if (currentSection.content.trim()) {
          sections.push({ ...currentSection });
        }
        // Drop headings at the same or a deeper level; they are no longer ancestors.
        while (headingStack.length > 0 && headingStack[headingStack.length - 1].level >= level) {
          headingStack.pop();
        }
        headingStack.push({ level, text: headingText });
        currentSection = {
          heading: headingText,
          headingLevel: level,
          content: "",
          startPosition: position,
          path: headingStack.map((h) => h.text),
          pathLevels: headingStack.map((h) => h.level)
        };
      } else {
        currentSection.content += line + "\n";
      }
      position += line.length + 1;
    }
    if (currentSection.content.trim() || currentSection.heading) {
      sections.push(currentSection);
    }
    return sections;
  }

  /**
   * Turn one section into chunks.
   *
   * Unless `splitCodeBlocks` is set, fenced ``` blocks are swapped for
   * placeholders while paragraphs are split, so a blank line inside a code
   * block cannot break it apart, and are restored verbatim afterwards.
   *
   * @param {object} section - Section produced by `parseMarkdown`.
   * @param {object} options - Merged chunker options.
   * @param {boolean} preserveHeaders - Prefix chunks with the section heading.
   * @param {boolean} includeHeaderHierarchy - Prefix with all ancestor headings instead.
   * @param {boolean} splitCodeBlocks - Allow code blocks to be split like prose.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {Promise<object[]>} Chunks for this section.
   */
  async chunkSection(section, options, preserveHeaders, includeHeaderHierarchy, splitCodeBlocks, tokenCounter) {
    const chunks = [];
    const sectionMetadata = () => ({
      section: section.heading,
      headingLevel: section.headingLevel,
      path: section.path
    });

    let headerPrefix = "";
    if (preserveHeaders && section.heading) {
      if (includeHeaderHierarchy && section.path.length > 1) {
        // Render each ancestor at its real heading level; fall back to the
        // nesting depth for sections that carry no level information.
        headerPrefix = section.path
          .map((h, i) => "#".repeat(section.pathLevels?.[i] ?? i + 1) + " " + h)
          .join("\n") + "\n\n";
      } else {
        headerPrefix = "#".repeat(section.headingLevel) + " " + section.heading + "\n\n";
      }
    }

    let content = section.content;
    const codeBlocks = [];
    if (!splitCodeBlocks) {
      // One replace pass with a callback: every block gets a placeholder and
      // no regex state is carried across a string that is being rewritten.
      content = content.replace(/```[\s\S]*?```/g, (block) => {
        const placeholder = `__CODE_BLOCK_${codeBlocks.length}__`;
        codeBlocks.push({ placeholder, content: block });
        return placeholder;
      });
    }
    // A function replacement inserts the code verbatim; a string replacement
    // would expand sequences such as `$&` or `$$` found in the code.
    const restoreCodeBlocks = (value) =>
      codeBlocks.reduce(
        (acc, block) => acc.replace(block.placeholder, () => block.content),
        value
      );

    const fullContent = headerPrefix + content;
    if (tokenCounter(fullContent) <= options.chunkSize) {
      chunks.push(
        this.createChunk(
          restoreCodeBlocks(fullContent).trim(),
          0,
          section.startPosition,
          options,
          sectionMetadata()
        )
      );
      return chunks;
    }

    const paragraphs = content.split(/\n\n+/);
    let currentContent = headerPrefix;
    let chunkStart = section.startPosition;
    // Offset inside `content` up to which paragraphs have been located.
    let cursor = 0;
    for (const paragraph of paragraphs) {
      const found = content.indexOf(paragraph, cursor);
      const paragraphOffset = found === -1 ? cursor : found;
      cursor = paragraphOffset + paragraph.length;

      const para = restoreCodeBlocks(paragraph);
      const testContent = currentContent + para + "\n\n";
      if (tokenCounter(testContent) > options.chunkSize && currentContent !== headerPrefix) {
        chunks.push(
          this.createChunk(
            currentContent.trim(),
            chunks.length,
            chunkStart,
            options,
            sectionMetadata()
          )
        );
        currentContent = headerPrefix + para + "\n\n";
        chunkStart = section.startPosition + paragraphOffset;
      } else {
        currentContent = testContent;
      }
    }
    // Skip a remainder that holds nothing but the repeated heading.
    const remainder = currentContent.trim();
    if (remainder && remainder !== headerPrefix.trim()) {
      chunks.push(
        this.createChunk(
          remainder,
          chunks.length,
          chunkStart,
          options,
          sectionMetadata()
        )
      );
    }
    return chunks;
  }
}
612
/**
 * Factory for the markdown strategy.
 *
 * @returns {MarkdownChunker} A newly constructed chunker.
 */
function createMarkdownChunker() {
  const chunker = new MarkdownChunker();
  return chunker;
}
615
+
616
+ // src/chunking/CodeChunker.ts
617
/**
 * Per-language regular expressions used by the code chunker.
 *
 * For every language:
 * - `functionStart` matches the first line of a top-level function; the
 *   name is in capture group 1 (declarations) or group 2 (arrow functions
 *   assigned to a variable).
 * - `classStart` matches the first line of a class-like declaration (name in group 1).
 * - `importPattern` (global, multiline) matches import statements.
 * - `commentPattern` (global) matches block and line comments.
 * - `blockEnd` matches the line that ends a block.
 */
const LANGUAGE_PATTERNS = {
  typescript: {
    // Arrow functions may take a parenthesised parameter list (optionally
    // followed by a return-type annotation) or a bare identifier.
    functionStart: /^(?:export\s+)?(?:async\s+)?function\s+(\w+)|^(?:export\s+)?const\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)(?:\s*:\s*[^=]+?)?|[\w$]+)\s*=>/m,
    classStart: /^(?:export\s+)?(?:abstract\s+)?class\s+(\w+)/m,
    importPattern: /^import\s+.*?(?:from\s+['"][^'"]+['"]|['"][^'"]+['"])/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  },
  javascript: {
    functionStart: /^(?:export\s+)?(?:async\s+)?function\s+(\w+)|^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[\w$]+)\s*=>/m,
    classStart: /^(?:export\s+)?class\s+(\w+)/m,
    // Three forms: ES module imports, bare `import "x"` / `require("x")`
    // statements, and `const x = require("x")` bindings.
    importPattern: /^import\s+.*?(?:from\s+['"][^'"]+['"]|['"][^'"]+['"])|^(?:import|require)\s*\(?\s*['"][^'"]+['"]\)?|^(?:const|let|var)\s+[^=\n]+=\s*require\s*\(\s*['"][^'"]+['"]\s*\)/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  },
  python: {
    functionStart: /^(?:async\s+)?def\s+(\w+)/m,
    classStart: /^class\s+(\w+)/m,
    importPattern: /^(?:from\s+\S+\s+)?import\s+.+$/gm,
    commentPattern: /'''[\s\S]*?'''|"""[\s\S]*?"""|#.*/g,
    // Python uses indentation: a block ends at the first non-indented line.
    blockEnd: /^(?=\S)/m
  },
  go: {
    functionStart: /^func\s+(?:\([^)]+\)\s+)?(\w+)/m,
    classStart: /^type\s+(\w+)\s+struct/m,
    importPattern: /^import\s+(?:\([\s\S]*?\)|"[^"]+")/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  },
  rust: {
    functionStart: /^(?:pub\s+)?(?:async\s+)?fn\s+(\w+)/m,
    classStart: /^(?:pub\s+)?(?:struct|impl|trait)\s+(\w+)/m,
    importPattern: /^use\s+.+;$/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  }
};
655
/**
 * Source-code chunker: groups lines into function / class / import / other
 * blocks using the per-language patterns, prefixes each chunk with the
 * file's import statements (when `includeImports` is on) and splits blocks
 * that exceed `chunkSize` tokens line by line.
 */
class CodeChunker extends BaseChunker {
  strategyType = "code";

  /**
   * @param {string} text - Source code.
   * @param {object} [options] - Base options plus `language` (auto-detected
   *   when omitted), `splitBy` ("function" | "class" | "auto"),
   *   `includeComments` (true) and `includeImports` (true).
   * @returns {Promise<object[]>} Chunks in document order, indexed from 0.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const language = options?.language ?? this.detectLanguage(text);
    const splitBy = options?.splitBy ?? "auto";
    const includeComments = options?.includeComments ?? true;
    const includeImports = options?.includeImports ?? true;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const patterns = LANGUAGE_PATTERNS[language] ?? LANGUAGE_PATTERNS.typescript;
    const blocks = this.parseCode(text, patterns, splitBy, includeComments);
    let importBlock = "";
    if (includeImports) {
      const imports = text.match(patterns.importPattern);
      if (imports) {
        importBlock = imports.join("\n") + "\n\n";
      }
    }
    let chunks = [];
    for (const block of blocks) {
      // Imports already travel as the prefix of every chunk; emitting them
      // again as their own chunk would only duplicate them. With
      // includeImports off they are kept as ordinary chunks so no source
      // text is lost.
      if (block.type === "import" && includeImports) continue;
      // Nothing left to emit (e.g. a comment-only block after stripping).
      if (!block.content.trim()) continue;
      const blockContent = includeImports && block.type !== "comment" ? importBlock + block.content : block.content;
      if (tokenCounter(blockContent) <= opts.chunkSize) {
        chunks.push(
          this.createChunk(
            blockContent.trim(),
            chunks.length,
            block.startPosition,
            opts,
            {
              language,
              blockType: block.type,
              blockName: block.name
            }
          )
        );
      } else {
        chunks.push(
          ...this.splitLargeBlock(block, importBlock, opts, tokenCounter, language)
        );
      }
    }
    if (chunks.length === 0 && importBlock.trim()) {
      // The source held nothing but imports: return them as a single chunk.
      const firstImport = blocks.find((b) => b.type === "import");
      chunks.push(
        this.createChunk(
          importBlock.trim(),
          0,
          firstImport?.startPosition ?? 0,
          opts,
          {
            language,
            blockType: "import",
            blockName: undefined
          }
        )
      );
    }
    chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
    return chunks.map((c, i) => ({ ...c, index: i }));
  }

  /**
   * Guess the language from characteristic substrings.
   * Checks run in order (TypeScript, Python, Go, Rust, JavaScript) and
   * fall back to "typescript".
   *
   * @param {string} text - Source code.
   * @returns {string} Language key into the pattern table.
   */
  detectLanguage(text) {
    if (text.includes("import type") || text.includes(": string") || text.includes("interface ")) {
      return "typescript";
    }
    if (text.includes("def ") && text.includes(":")) {
      return "python";
    }
    if (text.includes("func ") && text.includes("package ")) {
      return "go";
    }
    if (text.includes("fn ") && (text.includes("let mut") || text.includes("pub fn"))) {
      return "rust";
    }
    if (text.includes("const ") || text.includes("function ") || text.includes("require(")) {
      return "javascript";
    }
    return "typescript";
  }

  /**
   * Group source lines into blocks.
   *
   * - A line matching `functionStart` / `classStart` opens a new block.
   * - For brace languages a block stays open while its braces are
   *   unbalanced (braces inside strings or comments are counted too).
   * - For Python, a function/class block continues over indented and blank
   *   lines and ends at the first non-indented line.
   * - Single-line import statements outside any open brace become "import" blocks.
   * - Everything else is collected into "other" blocks.
   *
   * @param {string} text - Source code.
   * @param {object} patterns - Pattern set for the language.
   * @param {string} splitBy - "function", "class" or "auto" (both).
   * @param {boolean} includeComments - When false, comments are stripped from block content.
   * @returns {object[]} Blocks with `type`, optional `name`, `content` and `startPosition`.
   */
  parseCode(text, patterns, splitBy, includeComments) {
    const blocks = [];
    const indentDelimited = patterns === LANGUAGE_PATTERNS.python;
    const detectFunctions = splitBy === "function" || splitBy === "auto";
    const detectClasses = splitBy === "class" || splitBy === "auto";
    const braceDelta = (line) => (line.match(/{/g) || []).length - (line.match(/}/g) || []).length;

    let currentBlock = null;
    let braceCount = 0;
    let position = 0;
    const closeCurrent = () => {
      if (currentBlock) {
        blocks.push(currentBlock);
      }
      currentBlock = null;
      braceCount = 0;
    };

    for (const line of text.split("\n")) {
      const lineStart = position;
      position += line.length + 1;

      const funcMatch = detectFunctions ? line.match(patterns.functionStart) : null;
      const classMatch = funcMatch || !detectClasses ? null : line.match(patterns.classStart);
      if (funcMatch || classMatch) {
        closeCurrent();
        currentBlock = funcMatch
          ? { type: "function", name: funcMatch[1] || funcMatch[2], content: line + "\n", startPosition: lineStart }
          : { type: "class", name: classMatch[1], content: line + "\n", startPosition: lineStart };
        braceCount = braceDelta(line);
        continue;
      }

      const bodyByIndent = indentDelimited && currentBlock !== null && currentBlock.type !== "other";
      if (bodyByIndent && line.trim() !== "" && !/^\s/.test(line)) {
        // A dedent ends a Python def/class body; this line starts something new.
        closeCurrent();
      }

      // `search` ignores the regex's lastIndex, so the shared global
      // pattern can be reused safely here.
      if (braceCount <= 0 && line.search(patterns.importPattern) !== -1) {
        closeCurrent();
        blocks.push({ type: "import", content: line + "\n", startPosition: lineStart });
        continue;
      }

      if (currentBlock) {
        currentBlock.content += line + "\n";
        if (indentDelimited && currentBlock.type !== "other") {
          continue; // the body runs until the next dedent
        }
        braceCount += braceDelta(line);
        if (braceCount <= 0) {
          closeCurrent();
        }
      } else if (line.trim()) {
        currentBlock = {
          type: "other",
          content: line + "\n",
          startPosition: lineStart
        };
        // Count braces on the opening line too, so a multi-line literal or
        // call that starts here is kept together.
        braceCount = braceDelta(line);
      }
    }
    closeCurrent();

    if (!includeComments) {
      return blocks.map((block) => ({
        ...block,
        content: block.content.replace(patterns.commentPattern, "")
      }));
    }
    return blocks;
  }

  /**
   * Split an oversized block line by line into chunks of at most
   * `chunkSize` tokens, each prefixed with `importBlock`.
   *
   * @param {object} block - Block produced by `parseCode`.
   * @param {string} importBlock - Import prefix ("" when imports are not included).
   * @param {object} options - Merged chunker options.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @param {string} language - Language stored in chunk metadata.
   * @returns {object[]} Chunks; every chunk except possibly the only one is marked `partial`.
   */
  splitLargeBlock(block, importBlock, options, tokenCounter, language) {
    const chunks = [];
    const pushChunk = (content, start, partial) => {
      chunks.push(
        this.createChunk(content, chunks.length, start, options, {
          language,
          blockType: block.type,
          blockName: block.name,
          partial
        })
      );
    };

    let currentContent = importBlock;
    let chunkStart = block.startPosition;
    // Running offset of the current line inside block.content. Searching
    // for the line text instead would hit earlier identical lines.
    let offset = 0;
    for (const line of block.content.split("\n")) {
      const lineOffset = offset;
      offset += line.length + 1;

      const testContent = currentContent + line + "\n";
      if (tokenCounter(testContent) > options.chunkSize && currentContent !== importBlock) {
        pushChunk(currentContent.trim(), chunkStart, true);
        currentContent = importBlock + line + "\n";
        chunkStart = block.startPosition + lineOffset;
      } else {
        currentContent = testContent;
      }
    }
    // Skip a remainder that holds nothing but the import prefix.
    const remainder = currentContent.trim();
    if (remainder && remainder !== importBlock.trim()) {
      pushChunk(remainder, chunkStart, chunks.length > 0);
    }
    return chunks;
  }
}
850
/**
 * Factory for the source-code strategy.
 *
 * @returns {CodeChunker} A newly constructed chunker.
 */
function createCodeChunker() {
  const chunker = new CodeChunker();
  return chunker;
}
853
+
854
+ // src/chunking/SemanticChunker.ts
855
/**
 * Semantic chunker: splits text into sentences, embeds them with the
 * caller-supplied `embeddingFn`, and starts a new chunk wherever the
 * distance between neighbouring sentence windows is large. Without an
 * embedding function it falls back to packing sentences up to `chunkSize`
 * tokens.
 */
class SemanticChunker extends BaseChunker {
  strategyType = "semantic";

  /**
   * @param {string} text - Text to chunk.
   * @param {object} [options] - Base options plus `similarityThreshold`
   *   (0.5), `breakpointPercentileThreshold` (95), `bufferSize` (1) and
   *   `embeddingFn` (async function mapping sentence texts to embeddings).
   * @returns {Promise<object[]>} Chunks in document order.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const similarityThreshold = options?.similarityThreshold ?? 0.5;
    const breakpointPercentile = options?.breakpointPercentileThreshold ?? 95;
    const bufferSize = options?.bufferSize ?? 1;
    const embeddingFn = options?.embeddingFn;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const sentences = this.splitSentences(text);
    if (sentences.length === 0) {
      return [];
    }
    if (!embeddingFn) {
      return this.fallbackChunk(sentences, opts, tokenCounter);
    }
    const embeddings = await embeddingFn(sentences.map((s) => s.text));
    const sentencesWithEmbeddings = sentences.map((s, i) => ({
      ...s,
      embedding: embeddings[i]
    }));
    const distances = this.calculateDistances(sentencesWithEmbeddings, bufferSize);
    // A Set keeps the per-sentence membership test constant-time.
    const breakpoints = new Set(
      this.findBreakpoints(distances, breakpointPercentile, similarityThreshold)
    );
    let chunks = [];
    let chunkStart = 0;
    let chunkText = "";
    let chunkPosition = sentences[0]?.position ?? 0;
    const lastIndex = sentences.length - 1;
    for (let i = 0; i < sentences.length; i++) {
      chunkText += (chunkText ? " " : "") + sentences[i].text;
      if (!breakpoints.has(i) && i !== lastIndex) continue;
      if (chunkText.trim()) {
        chunks.push(
          this.createChunk(
            chunkText.trim(),
            chunks.length,
            chunkPosition,
            opts,
            {
              boundaryType: "semantic",
              sentenceCount: i - chunkStart + 1
            }
          )
        );
      }
      if (i < lastIndex) {
        chunkStart = i + 1;
        chunkText = "";
        chunkPosition = sentences[i + 1].position;
      }
    }
    chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
    chunks = this.splitLargeChunks(chunks, opts.maxChunkSize, tokenCounter);
    return chunks;
  }

  /**
   * Split text into sentences ending in `.`, `!` or `?`.
   *
   * Text after the last terminator (or the whole text when there is none)
   * is returned as a final sentence instead of being dropped. `position` is
   * the offset of the sentence's first non-whitespace character.
   *
   * @param {string} text - Text to split.
   * @returns {{text: string, position: number}[]} Trimmed, non-empty sentences.
   */
  splitSentences(text) {
    const sentenceRegex = /[^.!?]+[.!?]+/g;
    const sentences = [];
    const addSentence = (raw, offset) => {
      const trimmed = raw.trim();
      if (trimmed) {
        const leadingWhitespace = raw.length - raw.trimStart().length;
        sentences.push({ text: trimmed, position: offset + leadingWhitespace });
      }
    };
    // End offset of the last regex match; anything after it is a fragment
    // that lacks terminal punctuation.
    let consumed = 0;
    for (const match of text.matchAll(sentenceRegex)) {
      addSentence(match[0], match.index);
      consumed = match.index + match[0].length;
    }
    if (consumed < text.length) {
      addSentence(text.slice(consumed), consumed);
    }
    return sentences;
  }

  /**
   * Distance (1 - cosine similarity) across each gap between sentences,
   * comparing the averaged embeddings of up to `bufferSize` sentences on
   * either side. A gap with no embeddings on one side gets distance 0.
   *
   * @param {object[]} sentences - Sentences carrying an `embedding`.
   * @param {number} bufferSize - Sentences averaged on each side of a gap.
   * @returns {number[]} One distance per gap (`sentences.length - 1` entries).
   */
  calculateDistances(sentences, bufferSize) {
    const distances = [];
    const embeddingsOf = (from, to) =>
      sentences.slice(from, to).map((s) => s.embedding).filter((e) => e !== void 0);
    for (let i = 0; i < sentences.length - 1; i++) {
      const leftStart = Math.max(0, i - bufferSize + 1);
      const rightEnd = Math.min(sentences.length, i + bufferSize + 1);
      const leftEmbeddings = embeddingsOf(leftStart, i + 1);
      const rightEmbeddings = embeddingsOf(i + 1, rightEnd);
      if (leftEmbeddings.length > 0 && rightEmbeddings.length > 0) {
        const leftAvg = EmbeddingModel.average(leftEmbeddings);
        const rightAvg = EmbeddingModel.average(rightEmbeddings);
        const similarity = EmbeddingModel.cosineSimilarity(leftAvg, rightAvg);
        distances.push(1 - similarity);
      } else {
        distances.push(0);
      }
    }
    return distances;
  }

  /**
   * Pick the gaps whose distance reaches both the given percentile of all
   * distances and `1 - minThreshold`.
   *
   * @param {number[]} distances - Distances from `calculateDistances`.
   * @param {number} percentile - Percentile (0-100) used as a cut-off.
   * @param {number} minThreshold - Similarity threshold; converted to a distance.
   * @returns {number[]} Indices of sentences after which a chunk should end.
   */
  findBreakpoints(distances, percentile, minThreshold) {
    if (distances.length === 0) return [];
    const sortedDistances = [...distances].sort((a, b) => a - b);
    const percentileIndex = Math.floor(percentile / 100 * sortedDistances.length);
    // An index past the end (e.g. percentile 100) falls back to the largest distance.
    const percentileThreshold = sortedDistances[percentileIndex] ?? sortedDistances[sortedDistances.length - 1];
    const threshold = Math.max(percentileThreshold, 1 - minThreshold);
    const breakpoints = [];
    for (let i = 0; i < distances.length; i++) {
      if (distances[i] >= threshold) {
        breakpoints.push(i);
      }
    }
    return breakpoints;
  }

  /**
   * Pack sentences into chunks of at most `chunkSize` tokens; used when no
   * embedding function is available.
   *
   * @param {{text: string, position: number}[]} sentences - Sentences in order.
   * @param {object} options - Merged chunker options.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {object[]} Chunks with `boundaryType: "sentence"` metadata.
   */
  fallbackChunk(sentences, options, tokenCounter) {
    const chunks = [];
    let currentText = "";
    let chunkPosition = sentences[0]?.position ?? 0;
    const flush = () => {
      chunks.push(
        this.createChunk(
          currentText.trim(),
          chunks.length,
          chunkPosition,
          options,
          { boundaryType: "sentence" }
        )
      );
    };
    for (const sentence of sentences) {
      const testText = currentText ? currentText + " " + sentence.text : sentence.text;
      if (tokenCounter(testText) > options.chunkSize && currentText) {
        flush();
        currentText = sentence.text;
        chunkPosition = sentence.position;
      } else {
        currentText = testText;
      }
    }
    if (currentText.trim()) {
      flush();
    }
    return chunks;
  }

  /**
   * Split chunks above `maxTokens` at sentence boundaries.
   *
   * Pieces get ids derived from the source chunk id, and the whole result
   * is re-indexed from 0 so indices stay unique and sequential.
   *
   * @param {object[]} chunks - Chunks in document order.
   * @param {number} maxTokens - Maximum tokens allowed per chunk.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {object[]} Chunks with oversized entries replaced by their pieces.
   */
  splitLargeChunks(chunks, maxTokens, tokenCounter) {
    const result = [];
    for (const source of chunks) {
      if (source.tokenCount <= maxTokens) {
        result.push(source);
        continue;
      }
      let currentText = "";
      let currentStart = source.startPosition;
      const emit = () => {
        result.push({
          ...source,
          id: source.id + "_" + result.length,
          text: currentText.trim(),
          startPosition: currentStart,
          endPosition: currentStart + currentText.length,
          tokenCount: tokenCounter(currentText),
          charCount: currentText.length,
          index: result.length
        });
      };
      for (const sentence of this.splitSentences(source.text)) {
        const testText = currentText ? currentText + " " + sentence.text : sentence.text;
        if (tokenCounter(testText) > maxTokens && currentText) {
          emit();
          currentText = sentence.text;
          currentStart = source.startPosition + sentence.position;
        } else {
          currentText = testText;
        }
      }
      if (currentText.trim()) {
        emit();
      }
    }
    return result.map((c, i) => ({ ...c, index: i }));
  }
}
1070
/**
 * Factory for the semantic strategy.
 *
 * @returns {SemanticChunker} A newly constructed chunker.
 */
function createSemanticChunker() {
  const chunker = new SemanticChunker();
  return chunker;
}
1073
+
1074
+ // src/chunking/index.ts
1075
/**
 * Build the chunker for a strategy name.
 *
 * "sentence" and "paragraph" map to the fixed chunker; any unrecognised
 * name falls back to the recursive chunker.
 *
 * @param {string} strategy - Strategy name.
 * @returns {object} A newly constructed chunker instance.
 */
function createChunker(strategy) {
  // Builders are looked up lazily so only the requested class is instantiated.
  const builders = new Map([
    ["fixed", () => new FixedChunker()],
    ["recursive", () => new RecursiveChunker()],
    ["markdown", () => new MarkdownChunker()],
    ["code", () => new CodeChunker()],
    ["semantic", () => new SemanticChunker()],
    ["sentence", () => new FixedChunker()],
    ["paragraph", () => new FixedChunker()]
  ]);
  const build = builders.get(strategy);
  if (build === undefined) {
    return new RecursiveChunker();
  }
  return build();
}
1095
/**
 * Convenience wrapper: chunk `text` with the chunker for `strategy`.
 *
 * @param {string} text - Text to chunk.
 * @param {string} [strategy="recursive"] - Strategy name passed to `createChunker`.
 * @param {object} [options] - Options forwarded to the chunker's `chunk` method.
 * @returns {Promise<object[]>} The chunks produced by the selected chunker.
 */
async function chunk(text, strategy = "recursive", options) {
  return createChunker(strategy).chunk(text, options);
}
1099
+
1100
+ export {
1101
+ defaultTokenCounter,
1102
+ BaseChunker,
1103
+ mergeSmallChunks,
1104
+ splitLargeChunks,
1105
+ FixedChunker,
1106
+ createFixedChunker,
1107
+ RecursiveChunker,
1108
+ createRecursiveChunker,
1109
+ MarkdownChunker,
1110
+ createMarkdownChunker,
1111
+ CodeChunker,
1112
+ createCodeChunker,
1113
+ SemanticChunker,
1114
+ createSemanticChunker,
1115
+ createChunker,
1116
+ chunk
1117
+ };