@lov3kaizen/agentsea-embeddings 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +475 -0
- package/dist/caching/index.d.mts +286 -0
- package/dist/caching/index.d.ts +286 -0
- package/dist/caching/index.js +1005 -0
- package/dist/caching/index.mjs +27 -0
- package/dist/chunk-3KM32UQK.mjs +207 -0
- package/dist/chunk-DJAURHAS.mjs +1117 -0
- package/dist/chunk-NBHIRTJT.mjs +895 -0
- package/dist/chunk-QAITLJ2E.mjs +259 -0
- package/dist/chunk-TER262ST.mjs +877 -0
- package/dist/chunk-VPSMDBHH.mjs +957 -0
- package/dist/chunking/index.d.mts +1 -0
- package/dist/chunking/index.d.ts +1 -0
- package/dist/chunking/index.js +1408 -0
- package/dist/chunking/index.mjs +37 -0
- package/dist/embedding.types-CCgPVxt1.d.mts +102 -0
- package/dist/embedding.types-CCgPVxt1.d.ts +102 -0
- package/dist/index-CeG6God2.d.mts +297 -0
- package/dist/index-DMaQRn2w.d.mts +172 -0
- package/dist/index-DMaQRn2w.d.ts +172 -0
- package/dist/index-DWddsKRi.d.ts +297 -0
- package/dist/index.d.mts +647 -0
- package/dist/index.d.ts +647 -0
- package/dist/index.js +5259 -0
- package/dist/index.mjs +1028 -0
- package/dist/providers/index.d.mts +2 -0
- package/dist/providers/index.d.ts +2 -0
- package/dist/providers/index.js +1235 -0
- package/dist/providers/index.mjs +32 -0
- package/dist/stores/index.d.mts +298 -0
- package/dist/stores/index.d.ts +298 -0
- package/dist/stores/index.js +1178 -0
- package/dist/stores/index.mjs +26 -0
- package/package.json +102 -0
|
@@ -0,0 +1,1117 @@
|
|
|
1
|
+
import {
|
|
2
|
+
EmbeddingModel
|
|
3
|
+
} from "./chunk-QAITLJ2E.mjs";
|
|
4
|
+
|
|
5
|
+
// src/chunking/BaseChunker.ts
|
|
6
|
+
import { nanoid } from "nanoid";
|
|
7
|
+
/**
 * Fallback token estimator used when the caller supplies no tokenizer.
 * Approximates one token per four characters (`String.prototype.length`
 * units), rounding up.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count.
 */
var defaultTokenCounter = (text) => Math.ceil(text.length / 4);
|
|
10
|
+
/**
 * Shared plumbing for the chunking strategies in this module: option
 * defaults, chunk-record construction, overlap bookkeeping and a generic
 * overlapping splitter. `chunkWithResult` calls `this.chunk(text, options)`
 * and reads `this.strategyType`, neither of which is defined on this class.
 */
var BaseChunker = class {
  /** Values `getOptions` falls back to for any sizing option the caller omits. */
  defaultOptions = {
    chunkSize: 512,
    chunkOverlap: 50,
    minChunkSize: 100,
    maxChunkSize: 2e3,
    tokenCounter: defaultTokenCounter
  };

  /**
   * Merge caller options with `defaultOptions`.
   *
   * @param {object} [options] - Caller overrides; may be undefined.
   * @returns {object} Options with every sizing field and `tokenCounter`
   *   populated. `documentId`, `source` and `type` are passed through
   *   (possibly undefined); `metadata` defaults to an empty object.
   */
  getOptions(options) {
    const defaults = this.defaultOptions;
    return {
      chunkSize: options?.chunkSize ?? defaults.chunkSize,
      chunkOverlap: options?.chunkOverlap ?? defaults.chunkOverlap,
      minChunkSize: options?.minChunkSize ?? defaults.minChunkSize,
      maxChunkSize: options?.maxChunkSize ?? defaults.maxChunkSize,
      tokenCounter: options?.tokenCounter ?? defaults.tokenCounter,
      documentId: options?.documentId,
      source: options?.source,
      type: options?.type,
      metadata: options?.metadata ?? {}
    };
  }

  /**
   * Build a chunk record for `text`.
   *
   * @param {string} text - Chunk text.
   * @param {number} index - Position of the chunk in its sequence.
   * @param {number} startPosition - Offset recorded as the chunk start;
   *   `endPosition` is `startPosition + text.length`.
   * @param {object} options - Merged options (see `getOptions`).
   * @param {object} [additionalMetadata] - Extra metadata; overrides same-named
   *   keys from `options.metadata`.
   * @returns {object} Chunk with a fresh `nanoid()` id and zeroed overlap fields.
   */
  createChunk(text, index, startPosition, options, additionalMetadata) {
    const tokenCounter = options.tokenCounter ?? defaultTokenCounter;
    const metadata = {
      ...options.metadata,
      ...additionalMetadata
    };
    // Document-level identifiers are written last so they win over metadata keys.
    if (options.documentId) metadata.documentId = options.documentId;
    if (options.source) metadata.source = options.source;
    if (options.type) metadata.type = options.type;
    return {
      id: nanoid(),
      text,
      index,
      startPosition,
      endPosition: startPosition + text.length,
      tokenCount: tokenCounter(text),
      charCount: text.length,
      overlapPrev: 0,
      overlapNext: 0,
      metadata
    };
  }

  /**
   * Stamp the same overlap size on every adjacent pair of chunks (mutates the
   * chunk objects in place).
   *
   * @param {object[]} chunks - Chunks in document order.
   * @param {number} overlapChars - Value written to `overlapPrev`/`overlapNext`.
   */
  setOverlapInfo(chunks, overlapChars) {
    for (let i = 1; i < chunks.length; i++) {
      chunks[i].overlapPrev = overlapChars;
      chunks[i - 1].overlapNext = overlapChars;
    }
  }

  /**
   * Split `text` into trimmed pieces of roughly `chunkSize` tokens, each
   * starting `overlap * 4` characters before the previous piece ended.
   *
   * @param {string} text - Text to split.
   * @param {number} chunkSize - Target size in tokens per piece.
   * @param {number} overlap - Overlap in tokens (converted at 4 chars/token).
   * @param {(text: string) => number} tokenCounter - Token measure.
   * @returns {string[]} Non-empty trimmed pieces.
   */
  splitWithOverlap(text, chunkSize, overlap, tokenCounter) {
    const pieces = [];
    const overlapChars = Math.floor(overlap * 4);
    let start = 0;
    while (start < text.length) {
      // Grow the window one character at a time until it reaches the budget.
      let end = start;
      let tokens = 0;
      while (end < text.length && tokens < chunkSize) {
        end++;
        tokens = tokenCounter(text.slice(start, end));
      }
      // Prefer to cut just after the last space so words are not split.
      if (end < text.length) {
        const lastSpace = text.lastIndexOf(" ", end);
        if (lastSpace > start) {
          end = lastSpace + 1;
        }
      }
      pieces.push(text.slice(start, end).trim());
      // Once the tail of the text has been emitted we are done. Stepping back
      // by the overlap here would re-enter the loop and emit a series of
      // redundant suffixes of the final piece.
      if (end >= text.length) break;
      // Always advance by at least one character so the loop terminates.
      start = Math.max(start + 1, end - overlapChars);
    }
    return pieces.filter((piece) => piece.length > 0);
  }

  /**
   * Run `this.chunk` and wrap its output with summary statistics.
   *
   * @param {string} text - Text to chunk.
   * @param {object} [options] - Options forwarded to `this.chunk`.
   * @returns {Promise<object>} Chunks plus totals, average token count per
   *   chunk, elapsed milliseconds, `this.strategyType` and the input length.
   */
  async chunkWithResult(text, options) {
    const startTime = performance.now();
    const chunks = await this.chunk(text, options);
    const processingTimeMs = performance.now() - startTime;
    const totalTokens = chunks.reduce((sum, c) => sum + c.tokenCount, 0);
    return {
      chunks,
      totalChunks: chunks.length,
      totalTokens,
      avgChunkSize: chunks.length > 0 ? totalTokens / chunks.length : 0,
      processingTimeMs,
      strategy: this.strategyType,
      originalLength: text.length
    };
  }
};
|
|
114
|
+
/**
 * Fold undersized chunks into their successor.
 *
 * Walks the chunks in order; while the chunk being accumulated has fewer than
 * `minTokens` tokens, the next chunk's text is appended to it (joined with a
 * newline) and its token/char counts and `endPosition` are refreshed. A small
 * chunk at the very end has no successor and is kept as is.
 *
 * @param {object[]} chunks - Chunks in document order.
 * @param {number} minTokens - Token count below which a chunk absorbs the next one.
 * @param {(text: string) => number} tokenCounter - Token measure for merged text.
 * @returns {object[]} The input array itself when it has at most one chunk;
 *   otherwise new chunk objects re-indexed from 0 (inputs are not mutated).
 */
function mergeSmallChunks(chunks, minTokens, tokenCounter) {
  if (chunks.length <= 1) return chunks;
  const merged = [];
  let current = null;
  for (const next of chunks) {
    if (current !== null && current.tokenCount < minTokens) {
      // Only build and tokenize the combined text when a merge actually
      // happens; tokenizers can be expensive and most pairs are not merged.
      const combinedText = current.text + "\n" + next.text;
      current.text = combinedText;
      current.tokenCount = tokenCounter(combinedText);
      current.charCount = combinedText.length;
      current.endPosition = next.endPosition;
      continue;
    }
    if (current !== null) {
      merged.push(current);
    }
    // Copy so that later merges never mutate the caller's chunk objects.
    current = { ...next };
  }
  if (current) {
    merged.push(current);
  }
  return merged.map((c, i) => ({ ...c, index: i }));
}
|
|
140
|
+
/**
 * Break chunks whose `tokenCount` exceeds `maxTokens` into sentence groups.
 *
 * Oversized chunks are split after `.`, `!` or `?` followed by whitespace;
 * sentences are re-joined with single spaces into groups that stay within
 * `maxTokens` where possible (a single sentence larger than the budget is
 * emitted on its own). Each emitted piece copies the source chunk's other
 * fields, gets a new `nanoid()` id, and has its positions/counts recomputed.
 *
 * @param {object[]} chunks - Chunks in document order.
 * @param {number} maxTokens - Largest token count allowed before splitting.
 * @param {(text: string) => number} tokenCounter - Token measure.
 * @returns {object[]} New array re-indexed from 0; chunks within budget are
 *   carried over (shallow-copied by the re-indexing step).
 */
function splitLargeChunks(chunks, maxTokens, tokenCounter) {
  const result = [];
  for (const chunk2 of chunks) {
    if (chunk2.tokenCount <= maxTokens) {
      result.push(chunk2);
      continue;
    }
    const emit = (text, offset) => {
      const startPosition = chunk2.startPosition + offset;
      result.push({
        ...chunk2,
        id: nanoid(),
        text,
        startPosition,
        endPosition: startPosition + text.length,
        tokenCount: tokenCounter(text),
        charCount: text.length
      });
    };
    const sentences = chunk2.text.split(/(?<=[.!?])\s+/);
    let currentText = "";
    let currentOffset = 0;
    let cursor = 0;
    for (const sentence of sentences) {
      // Locate the sentence in the source text so the recorded start reflects
      // the real offset, however much whitespace separated the sentences.
      const found = chunk2.text.indexOf(sentence, cursor);
      const sentenceOffset = found === -1 ? cursor : found;
      cursor = sentenceOffset + sentence.length;

      const testText = currentText ? currentText + " " + sentence : sentence;
      if (currentText && tokenCounter(testText) > maxTokens) {
        emit(currentText, currentOffset);
        currentText = sentence;
        currentOffset = sentenceOffset;
      } else {
        if (!currentText) {
          currentOffset = sentenceOffset;
        }
        currentText = testText;
      }
    }
    if (currentText) {
      emit(currentText, currentOffset);
    }
  }
  return result.map((c, i) => ({ ...c, index: i }));
}
|
|
183
|
+
|
|
184
|
+
// src/chunking/FixedChunker.ts
|
|
185
|
+
/**
 * Fixed-size chunking strategy.
 *
 * Two modes, selected by `options.splitByChars`:
 *  - character windows of `chunkSize * 4` characters stepping back by
 *    `chunkOverlap * 4` characters, or
 *  - (default) packing `separator`-delimited parts into chunks of at most
 *    `chunkSize` tokens, seeding each new chunk with trailing text from the
 *    previous one.
 */
var FixedChunker = class extends BaseChunker {
  strategyType = "fixed";

  /**
   * @param {string} text - Text to chunk.
   * @param {object} [options] - Base options plus `splitByChars` (default
   *   false), `separator` (default "\n") and `keepSeparator` (default false).
   * @returns {Promise<object[]>} Chunks in document order.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const splitByChars = options?.splitByChars ?? false;
    const separator = options?.separator ?? "\n";
    const keepSeparator = options?.keepSeparator ?? false;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const chunks = [];

    if (splitByChars) {
      const windowChars = opts.chunkSize * 4;
      const overlapChars = opts.chunkOverlap * 4;
      let start = 0;
      while (start < text.length) {
        const end = Math.min(start + windowChars, text.length);
        const chunkText = text.slice(start, end).trim();
        if (chunkText.length > 0) {
          chunks.push(this.createChunk(chunkText, chunks.length, start, opts));
        }
        // The window reached the end of the text: stop. Stepping back by the
        // overlap from here would land on the same window again and loop
        // forever whenever the overlap is non-zero.
        if (end >= text.length) break;
        // Advance by at least one character even if overlap >= window size.
        start = Math.max(start + 1, end - overlapChars);
      }
    } else {
      const parts = text.split(separator);
      let currentChunk = "";
      let chunkStart = 0;
      // Offset of the current part within `text`.
      let position = 0;
      for (let i = 0; i < parts.length; i++) {
        const part = parts[i];
        const isLastPart = i === parts.length - 1;
        const partWithSep = keepSeparator && !isLastPart ? part + separator : part;
        // With keepSeparator the separator already trails the previous part.
        const joiner = keepSeparator ? "" : separator;
        const testChunk = currentChunk ? currentChunk + joiner + partWithSep : partWithSep;
        const testTokens = tokenCounter(testChunk);
        if (testTokens > opts.chunkSize && currentChunk) {
          chunks.push(
            this.createChunk(currentChunk.trim(), chunks.length, chunkStart, opts)
          );
          const overlapText = this.getOverlapText(
            currentChunk,
            opts.chunkOverlap,
            tokenCounter
          );
          currentChunk = overlapText + (overlapText ? separator : "") + partWithSep;
          chunkStart = position - (overlapText?.length ?? 0);
        } else {
          currentChunk = testChunk;
        }
        position += part.length + separator.length;
      }
      if (currentChunk.trim()) {
        chunks.push(
          this.createChunk(currentChunk.trim(), chunks.length, chunkStart, opts)
        );
      }
    }

    // Overlap is recorded in characters (chunkOverlap * 4) for both modes.
    this.setOverlapInfo(chunks, opts.chunkOverlap * 4);
    return chunks;
  }

  /**
   * Pick trailing text of `text` to repeat at the start of the next chunk.
   *
   * Whole trailing sentences are taken while they fit in `overlapTokens`.
   * When even the last sentence is over budget (for instance text with no
   * sentence punctuation, where the "sentence" is the whole chunk), trailing
   * words that fit are used instead, so the overlap never exceeds the budget
   * and chunks do not accumulate all preceding text.
   *
   * @param {string} text - Previous chunk text.
   * @param {number} overlapTokens - Overlap budget in tokens.
   * @param {(text: string) => number} tokenCounter - Token measure.
   * @returns {string} Overlap text, or "" when nothing fits or the budget is <= 0.
   */
  getOverlapText(text, overlapTokens, tokenCounter) {
    if (overlapTokens <= 0) return "";
    const sentences = text.split(/(?<=[.!?])\s+/);
    let overlapText = "";
    for (let i = sentences.length - 1; i >= 0; i--) {
      const candidate = sentences[i] + (overlapText ? " " + overlapText : "");
      if (tokenCounter(candidate) <= overlapTokens) {
        overlapText = candidate;
        continue;
      }
      // Budget exhausted after collecting at least one sentence.
      if (overlapText) break;
      // The last sentence alone is too large: fall back to its trailing words.
      const words = sentences[i].split(/\s+/);
      let tail = "";
      for (let w = words.length - 1; w >= 0; w--) {
        const wordCandidate = words[w] + (tail ? " " + tail : "");
        if (tokenCounter(wordCandidate) > overlapTokens) break;
        tail = wordCandidate;
      }
      return tail;
    }
    return overlapText;
  }
};
|
|
270
|
+
/**
 * Factory for the fixed-size strategy.
 *
 * @returns {FixedChunker} A new `FixedChunker` instance on every call.
 */
function createFixedChunker() {
  const chunker = new FixedChunker();
  return chunker;
}
|
|
273
|
+
|
|
274
|
+
// src/chunking/RecursiveChunker.ts
|
|
275
|
+
/**
 * Separators tried in order by the recursive strategy, from coarsest to
 * finest: paragraphs, lines, sentences, clauses, words, and finally the empty
 * string, which stands for splitting by characters.
 */
var DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", ", ", " ", ""];
|
|
289
|
+
/**
 * Recursive chunking strategy: split on the coarsest separator present,
 * recurse into pieces that are still over budget using the finer separators,
 * and fall back to character windows as a last resort. Small chunks are
 * optionally merged and trailing words of each chunk are prepended to the next.
 */
var RecursiveChunker = class extends BaseChunker {
  strategyType = "recursive";

  /**
   * @param {string} text - Text to chunk.
   * @param {object} [options] - Base options plus `separators` (default
   *   `DEFAULT_SEPARATORS`), `keepSeparator` (default true) and
   *   `mergeSmallChunks` (default true).
   * @returns {Promise<object[]>} Chunks in document order.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const separators = options?.separators ?? DEFAULT_SEPARATORS;
    const keepSeparator = options?.keepSeparator ?? true;
    const mergeSmall = options?.mergeSmallChunks ?? true;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const texts = this.splitRecursively(
      text,
      separators,
      opts.chunkSize,
      keepSeparator,
      tokenCounter
    );
    // Running offset built from the untrimmed piece lengths.
    let position = 0;
    let chunks = [];
    for (let i = 0; i < texts.length; i++) {
      const chunkText = texts[i].trim();
      if (chunkText) {
        chunks.push(this.createChunk(chunkText, i, position, opts));
        position += texts[i].length;
      }
    }
    if (mergeSmall) {
      chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
    }
    return this.addOverlap(chunks, opts.chunkOverlap, tokenCounter);
  }

  /**
   * Split `text` until every piece measures at most `chunkSize` tokens.
   *
   * @param {string} text - Text to split.
   * @param {string[]} separators - Separators to try, coarsest first; "" means
   *   split by characters.
   * @param {number} chunkSize - Token budget per piece.
   * @param {boolean} keepSeparator - Keep the separator at the end of each piece.
   * @param {(text: string) => number} tokenCounter - Token measure.
   * @returns {string[]} Pieces in document order.
   */
  splitRecursively(text, separators, chunkSize, keepSeparator, tokenCounter) {
    if (tokenCounter(text) <= chunkSize) {
      return [text];
    }
    for (let i = 0; i < separators.length; i++) {
      const separator = separators[i];
      if (separator === "") {
        return this.splitByChars(text, chunkSize, tokenCounter);
      }
      if (!text.includes(separator)) {
        continue;
      }
      // Only the finer separators are offered to oversized pieces.
      const finer = separators.slice(i + 1);
      return this.splitBySeparator(text, separator, keepSeparator).flatMap(
        (piece) => tokenCounter(piece) <= chunkSize
          ? [piece]
          : this.splitRecursively(piece, finer, chunkSize, keepSeparator, tokenCounter)
      );
    }
    return this.splitByChars(text, chunkSize, tokenCounter);
  }

  /**
   * Split on `separator`, dropping whitespace-only parts.
   *
   * @param {string} text - Text to split.
   * @param {string} separator - Literal separator.
   * @param {boolean} keepSeparator - When true, every part except the last
   *   keeps the separator as a suffix.
   * @returns {string[]} Non-blank parts.
   */
  splitBySeparator(text, separator, keepSeparator) {
    const parts = text.split(separator);
    const pieces = keepSeparator
      ? parts.map((part, i) => i < parts.length - 1 ? part + separator : part)
      : parts;
    return pieces.filter((p) => p.trim());
  }

  /**
   * Last-resort split into character windows, cutting at the last space
   * inside each window when there is one.
   *
   * @param {string} text - Text to split.
   * @param {number} chunkSize - Token budget per window.
   * @param {(text: string) => number} tokenCounter - Token measure.
   * @returns {string[]} Non-blank windows (untrimmed).
   */
  splitByChars(text, chunkSize, tokenCounter) {
    const pieces = [];
    let start = 0;
    while (start < text.length) {
      let end = start;
      while (end < text.length && tokenCounter(text.slice(start, end)) < chunkSize) {
        end++;
      }
      // Guarantee progress: if the window could not grow at all (chunkSize <= 0
      // or a counter that already meets the budget on an empty string), take
      // one character rather than looping forever on an empty slice.
      if (end === start) {
        end = start + 1;
      }
      if (end < text.length) {
        const lastSpace = text.lastIndexOf(" ", end);
        if (lastSpace > start) {
          end = lastSpace;
        }
      }
      pieces.push(text.slice(start, end));
      start = end;
    }
    return pieces.filter((c) => c.trim());
  }

  /**
   * Prepend trailing words of each chunk to the following chunk.
   *
   * `overlapPrev` / `overlapNext` are set to `overlapTokens` only where
   * overlap text was actually added between the two neighbours, and 0
   * otherwise. `startPosition` moves back by the prepended length plus the
   * joining space.
   *
   * @param {object[]} chunks - Chunks in document order.
   * @param {number} overlapTokens - Overlap budget in tokens.
   * @param {(text: string) => number} tokenCounter - Token measure.
   * @returns {object[]} The input array when no overlap applies (budget <= 0
   *   or fewer than two chunks); otherwise new chunk objects.
   */
  addOverlap(chunks, overlapTokens, tokenCounter) {
    if (overlapTokens <= 0 || chunks.length <= 1) {
      return chunks;
    }
    // prefixes[i] is the text borrowed from chunk i-1 ("" when nothing fits).
    const prefixes = chunks.map((chunk, i) => i === 0
      ? ""
      : this.getEndOverlap(chunks[i - 1].text, overlapTokens, tokenCounter));
    return chunks.map((chunk, i) => {
      const prefix = prefixes[i];
      const chunkText = prefix ? prefix + " " + chunk.text : chunk.text;
      const startOffset = prefix ? -prefix.length - 1 : 0;
      return {
        ...chunk,
        text: chunkText,
        startPosition: chunk.startPosition + startOffset,
        tokenCount: tokenCounter(chunkText),
        charCount: chunkText.length,
        overlapPrev: prefix ? overlapTokens : 0,
        overlapNext: prefixes[i + 1] ? overlapTokens : 0
      };
    });
  }

  /**
   * Collect whole trailing words of `text` that fit in `overlapTokens`.
   *
   * @param {string} text - Source text.
   * @param {number} overlapTokens - Overlap budget in tokens.
   * @param {(text: string) => number} tokenCounter - Token measure.
   * @returns {string} Trailing words joined by single spaces, or "" when even
   *   the last word is over budget.
   */
  getEndOverlap(text, overlapTokens, tokenCounter) {
    const words = text.split(/\s+/);
    let overlap = "";
    for (let i = words.length - 1; i >= 0; i--) {
      const testOverlap = words[i] + (overlap ? " " + overlap : "");
      if (tokenCounter(testOverlap) > overlapTokens) {
        break;
      }
      overlap = testOverlap;
    }
    return overlap;
  }
};
|
|
440
|
+
/**
 * Factory for the recursive strategy.
 *
 * @returns {RecursiveChunker} A new `RecursiveChunker` instance on every call.
 */
function createRecursiveChunker() {
  const chunker = new RecursiveChunker();
  return chunker;
}
|
|
443
|
+
|
|
444
|
+
// src/chunking/MarkdownChunker.ts
|
|
445
|
+
/**
 * Markdown-aware chunking strategy: the document is cut into sections at ATX
 * headings (`#` .. `######`), each section becomes one chunk when it fits the
 * token budget or is packed paragraph by paragraph otherwise, and undersized
 * chunks are merged afterwards.
 */
var MarkdownChunker = class extends BaseChunker {
  strategyType = "markdown";

  /**
   * @param {string} text - Markdown source.
   * @param {object} [options] - Base options plus `preserveHeaders` (default
   *   true), `includeHeaderHierarchy` (default true), `headingLevels` (default
   *   1-6) and `splitCodeBlocks` (default false).
   * @returns {Promise<object[]>} Chunks re-indexed from 0.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const preserveHeaders = options?.preserveHeaders ?? true;
    const includeHeaderHierarchy = options?.includeHeaderHierarchy ?? true;
    const headingLevels = options?.headingLevels ?? [1, 2, 3, 4, 5, 6];
    const splitCodeBlocks = options?.splitCodeBlocks ?? false;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const sections = this.parseMarkdown(text, headingLevels);
    let chunks = [];
    for (const section of sections) {
      const sectionChunks = await this.chunkSection(
        section,
        opts,
        preserveHeaders,
        includeHeaderHierarchy,
        splitCodeBlocks,
        tokenCounter
      );
      chunks.push(...sectionChunks);
    }
    chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
    return chunks.map((c, i) => ({ ...c, index: i }));
  }

  /**
   * Cut markdown into sections at headings whose level is in `headingLevels`.
   *
   * Lines inside fenced code blocks (``` or ~~~) are never treated as
   * headings, and a trailing carriage return is ignored when matching so
   * CRLF documents are parsed too. Heading lines themselves are not part of a
   * section's `content`; headings of other levels are kept as content.
   *
   * @param {string} text - Markdown source.
   * @param {number[]} headingLevels - Heading levels that start a new section.
   * @returns {object[]} Sections with `heading` (absent for text before the
   *   first heading), `headingLevel`, `content`, `startPosition` (offset of
   *   the heading line) and `path` (texts of the enclosing headings).
   */
  parseMarkdown(text, headingLevels) {
    const sections = [];
    const headingRegex = /^(#{1,6})\s+(.+)$/;
    // Returns "`" or "~" when the line opens/closes a fence, otherwise null.
    // A backtick fence line may not contain further backticks after the run,
    // which keeps one-line "```code```" spans from being read as an opener.
    const fenceMarker = (line) => {
      const trimmed = line.trimStart();
      if (trimmed.startsWith("~~~")) return "~";
      if (trimmed.startsWith("```") && !trimmed.replace(/^`+/, "").includes("`")) return "`";
      return null;
    };
    let currentSection = {
      headingLevel: 0,
      content: "",
      startPosition: 0,
      path: []
    };
    const headingStack = [];
    let openFence = null;
    let position = 0;
    for (const line of text.split("\n")) {
      const marker = fenceMarker(line);
      if (marker !== null) {
        if (openFence === null) {
          openFence = marker;
        } else if (openFence === marker) {
          openFence = null;
        }
      }
      // `.` does not match "\r", so strip it before matching the heading.
      const candidate = line.endsWith("\r") ? line.slice(0, -1) : line;
      const headingMatch = openFence === null ? candidate.match(headingRegex) : null;
      const level = headingMatch ? headingMatch[1].length : 0;
      if (headingMatch && headingLevels.includes(level)) {
        const headingText = headingMatch[2];
        if (currentSection.content.trim()) {
          sections.push({ ...currentSection });
        }
        // Drop headings at the same or deeper level; what remains is the path.
        while (headingStack.length > 0 && headingStack[headingStack.length - 1].level >= level) {
          headingStack.pop();
        }
        headingStack.push({ level, text: headingText });
        currentSection = {
          heading: headingText,
          headingLevel: level,
          content: "",
          startPosition: position,
          path: headingStack.map((h) => h.text)
        };
      } else {
        currentSection.content += line + "\n";
      }
      position += line.length + 1;
    }
    if (currentSection.content.trim() || currentSection.heading) {
      sections.push(currentSection);
    }
    return sections;
  }

  /**
   * Turn one section into chunks.
   *
   * Unless `splitCodeBlocks` is set, triple-backtick blocks are swapped for
   * placeholders while paragraphs are split so a blank line inside code never
   * divides a block; sizes are always measured on the restored text.
   *
   * @param {object} section - Section from `parseMarkdown`.
   * @param {object} options - Merged options (see `getOptions`).
   * @param {boolean} preserveHeaders - Prefix each chunk with the heading.
   * @param {boolean} includeHeaderHierarchy - Prefix with every heading on
   *   `section.path` (rendered with one more `#` per nesting step) when the
   *   path has more than one entry.
   * @param {boolean} splitCodeBlocks - Allow paragraph splits inside code blocks.
   * @param {(text: string) => number} tokenCounter - Token measure.
   * @returns {Promise<object[]>} Chunks for this section, indexed from 0.
   */
  async chunkSection(section, options, preserveHeaders, includeHeaderHierarchy, splitCodeBlocks, tokenCounter) {
    const chunks = [];
    const sectionMetadata = () => ({
      section: section.heading,
      headingLevel: section.headingLevel,
      path: section.path
    });

    let headerPrefix = "";
    if (preserveHeaders && section.heading) {
      headerPrefix = includeHeaderHierarchy && section.path.length > 1
        ? section.path.map((h, i) => "#".repeat(i + 1) + " " + h).join("\n") + "\n\n"
        : "#".repeat(section.headingLevel) + " " + section.heading + "\n\n";
    }

    const codeBlocks = [];
    let content = section.content;
    if (!splitCodeBlocks) {
      // A single replace pass with a callback visits every block. Re-running
      // a /g regex with exec() on a string that shrinks after each
      // substitution would skip later blocks.
      content = content.replace(/```[\s\S]*?```/g, (block) => {
        const placeholder = `__CODE_BLOCK_${codeBlocks.length}__`;
        codeBlocks.push({ placeholder, content: block });
        return placeholder;
      });
    }
    // Function replacers insert the code verbatim; a string replacement would
    // interpret sequences such as "$&" or "$1" that appear inside the code.
    const restoreCodeBlocks = (value) => codeBlocks.reduce(
      (restored, block) => restored.replace(block.placeholder, () => block.content),
      value
    );

    const fullContent = restoreCodeBlocks(headerPrefix + content);
    if (tokenCounter(fullContent) <= options.chunkSize) {
      chunks.push(
        this.createChunk(fullContent.trim(), 0, section.startPosition, options, sectionMetadata())
      );
      return chunks;
    }

    const headerOnly = headerPrefix.trim();
    // True once the accumulated text holds something besides the header.
    const hasBody = (value) => value.trim() !== headerOnly;
    let currentContent = headerPrefix;
    let chunkStart = section.startPosition;
    let searchFrom = 0;
    for (const paragraph of content.split(/\n\n+/)) {
      // Search forward from the previous paragraph so repeated paragraphs
      // resolve to their own offset rather than the first occurrence.
      const found = content.indexOf(paragraph, searchFrom);
      const paragraphOffset = found === -1 ? searchFrom : found;
      searchFrom = paragraphOffset + paragraph.length;

      const para = restoreCodeBlocks(paragraph);
      const testContent = currentContent + para + "\n\n";
      if (tokenCounter(testContent) > options.chunkSize && hasBody(currentContent)) {
        chunks.push(
          this.createChunk(currentContent.trim(), chunks.length, chunkStart, options, sectionMetadata())
        );
        currentContent = headerPrefix + para + "\n\n";
        chunkStart = section.startPosition + paragraphOffset;
      } else {
        currentContent = testContent;
      }
    }
    if (hasBody(currentContent)) {
      chunks.push(
        this.createChunk(currentContent.trim(), chunks.length, chunkStart, options, sectionMetadata())
      );
    }
    return chunks;
  }
};
|
|
612
|
+
/**
 * Factory for the markdown strategy.
 *
 * @returns {MarkdownChunker} A new `MarkdownChunker` instance on every call.
 */
function createMarkdownChunker() {
  const chunker = new MarkdownChunker();
  return chunker;
}
|
|
615
|
+
|
|
616
|
+
// src/chunking/CodeChunker.ts
|
|
617
|
+
/**
 * Per-language regular expressions used by the code chunker.
 *
 *  - `functionStart`: start of a function; the name is capture group 1
 *    (declarations) or group 2 (arrow functions bound to a variable).
 *  - `classStart`: start of a class-like definition; name in group 1.
 *  - `importPattern`: import statements (global, multiline).
 *  - `commentPattern`: block and line comments (global).
 *  - `blockEnd`: line that ends a block.
 *
 * All start patterns are anchored at column 0, so indented (nested)
 * definitions do not match.
 */
var LANGUAGE_PATTERNS = {
  typescript: {
    // Arrow form accepts a parenthesised parameter list (optionally followed
    // by a return-type annotation) or a bare identifier of any length.
    functionStart: /^(?:export\s+)?(?:async\s+)?function\s+(\w+)|^(?:export\s+)?const\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)(?:\s*:\s*[^=]+?)?|[\w$]+)\s*=>/m,
    classStart: /^(?:export\s+)?(?:abstract\s+)?class\s+(\w+)/m,
    importPattern: /^import\s+.*?(?:from\s+['"][^'"]+['"]|['"][^'"]+['"])/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  },
  javascript: {
    functionStart: /^(?:export\s+)?(?:async\s+)?function\s+(\w+)|^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[\w$]+)\s*=>/m,
    classStart: /^(?:export\s+)?class\s+(\w+)/m,
    // Covers `import x from "m"`, bare `import "m"` / `require("m")`, and
    // `const x = require("m")` bindings.
    importPattern: /^import\s+.*?from\s+['"][^'"]+['"]|^(?:(?:const|let|var)\s+[^=\n]+=\s*)?(?:import|require)\s*\(?\s*['"][^'"]+['"]\)?/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  },
  python: {
    functionStart: /^(?:async\s+)?def\s+(\w+)/m,
    classStart: /^class\s+(\w+)/m,
    importPattern: /^(?:from\s+\S+\s+)?import\s+.+$/gm,
    commentPattern: /'''[\s\S]*?'''|"""[\s\S]*?"""|#.*/g,
    // Python uses indentation: a block ends at the next non-indented line.
    blockEnd: /^(?=\S)/m
  },
  go: {
    functionStart: /^func\s+(?:\([^)]+\)\s+)?(\w+)/m,
    classStart: /^type\s+(\w+)\s+struct/m,
    importPattern: /^import\s+(?:\([\s\S]*?\)|"[^"]+")/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  },
  rust: {
    functionStart: /^(?:pub\s+)?(?:async\s+)?fn\s+(\w+)/m,
    classStart: /^(?:pub\s+)?(?:struct|impl|trait)\s+(\w+)/m,
    importPattern: /^use\s+.+;$/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  }
};
|
|
655
|
+
/**
 * Source-code chunking strategy: the file is cut into function / class /
 * other blocks using the per-language patterns and brace counting, blocks
 * over the token budget are split line by line, and undersized chunks are
 * merged afterwards.
 */
var CodeChunker = class extends BaseChunker {
  strategyType = "code";

  /**
   * @param {string} text - Source code.
   * @param {object} [options] - Base options plus `language` (default:
   *   `detectLanguage(text)`), `splitBy` ("function" | "class" | "auto",
   *   default "auto"), `includeComments` (default true) and `includeImports`
   *   (default true; prepends the file's import lines to each chunk).
   * @returns {Promise<object[]>} Chunks re-indexed from 0.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const language = options?.language ?? this.detectLanguage(text);
    const splitBy = options?.splitBy ?? "auto";
    const includeComments = options?.includeComments ?? true;
    const includeImports = options?.includeImports ?? true;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    // Unknown languages fall back to the TypeScript patterns.
    const patterns = LANGUAGE_PATTERNS[language] ?? LANGUAGE_PATTERNS.typescript;
    const blocks = this.parseCode(text, patterns, splitBy, includeComments);

    let importBlock = "";
    if (includeImports) {
      const imports = text.match(patterns.importPattern);
      if (imports) {
        importBlock = imports.join("\n") + "\n\n";
      }
    }

    let chunks = [];
    for (const block of blocks) {
      // parseCode in this class only yields "function", "class" and "other"
      // blocks, so the "import"/"comment" checks below are currently inert.
      if (block.type === "import") continue;
      const blockContent = includeImports && block.type !== "comment"
        ? importBlock + block.content
        : block.content;
      if (tokenCounter(blockContent) <= opts.chunkSize) {
        chunks.push(
          this.createChunk(blockContent.trim(), chunks.length, block.startPosition, opts, {
            language,
            blockType: block.type,
            blockName: block.name
          })
        );
      } else {
        chunks.push(...this.splitLargeBlock(block, importBlock, opts, tokenCounter, language));
      }
    }
    chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
    return chunks.map((c, i) => ({ ...c, index: i }));
  }

  /**
   * Guess the language from substrings present in `text`. Checks run in the
   * order TypeScript, Python, Go, Rust, JavaScript; TypeScript is the default.
   *
   * @param {string} text - Source code.
   * @returns {string} One of "typescript", "python", "go", "rust", "javascript".
   */
  detectLanguage(text) {
    const has = (needle) => text.includes(needle);
    if (has("import type") || has(": string") || has("interface ")) {
      return "typescript";
    }
    if (has("def ") && has(":")) {
      return "python";
    }
    if (has("func ") && has("package ")) {
      return "go";
    }
    if (has("fn ") && (has("let mut") || has("pub fn"))) {
      return "rust";
    }
    if (has("const ") || has("function ") || has("require(")) {
      return "javascript";
    }
    return "typescript";
  }

  /**
   * Cut source into blocks.
   *
   * A line matching `patterns.functionStart` / `patterns.classStart` (as
   * enabled by `splitBy`) opens a new block and closes any open one. Other
   * lines are appended to the open block, which closes when its running brace
   * balance drops to zero or below; a non-blank line with no open block opens
   * an "other" block.
   *
   * NOTE(review): block ends are found by counting `{` / `}` only
   * (`patterns.blockEnd` is not consulted), so indentation-delimited code is
   * not grouped by its real block structure — confirm whether that is intended.
   *
   * @param {string} text - Source code.
   * @param {object} patterns - Language entry providing `functionStart`,
   *   `classStart` and `commentPattern`.
   * @param {string} splitBy - "function", "class" or "auto" (both).
   * @param {boolean} includeComments - When false, comments are stripped from
   *   every block's content.
   * @returns {object[]} Blocks with `type`, optional `name`, `content` and
   *   `startPosition` (offset of the block's first line).
   */
  parseCode(text, patterns, splitBy, includeComments) {
    const blocks = [];
    const braceDelta = (line) => (line.match(/{/g) || []).length - (line.match(/}/g) || []).length;
    const detectFunctions = splitBy === "function" || splitBy === "auto";
    const detectClasses = splitBy === "class" || splitBy === "auto";
    let currentBlock = null;
    let braceCount = 0;
    let position = 0;

    const closeBlock = () => {
      blocks.push(currentBlock);
      currentBlock = null;
      braceCount = 0;
    };

    for (const line of text.split("\n")) {
      const lineStart = position;
      position += line.length + 1;

      const funcMatch = detectFunctions ? line.match(patterns.functionStart) : null;
      const classMatch = !funcMatch && detectClasses ? line.match(patterns.classStart) : null;
      if (funcMatch || classMatch) {
        if (currentBlock) {
          blocks.push(currentBlock);
        }
        currentBlock = funcMatch
          ? { type: "function", name: funcMatch[1] || funcMatch[2], content: line + "\n", startPosition: lineStart }
          : { type: "class", name: classMatch[1], content: line + "\n", startPosition: lineStart };
        braceCount = braceDelta(line);
        // A definition whose braces open and close on this same line is
        // complete; leaving it open would absorb the following statement.
        // Definitions with no brace yet (brace on the next line) stay open.
        if (braceCount <= 0 && line.includes("{")) {
          closeBlock();
        }
        continue;
      }

      if (currentBlock) {
        currentBlock.content += line + "\n";
        braceCount += braceDelta(line);
        if (braceCount <= 0) {
          closeBlock();
        }
      } else if (line.trim()) {
        currentBlock = {
          type: "other",
          content: line + "\n",
          startPosition: lineStart
        };
        // Count braces on the opening line too, so a multi-line literal such
        // as `const x = {` stays in one block until its closing brace.
        braceCount = braceDelta(line);
      }
    }
    if (currentBlock) {
      blocks.push(currentBlock);
    }
    if (!includeComments) {
      return blocks.map((block) => ({
        ...block,
        content: block.content.replace(patterns.commentPattern, "")
      }));
    }
    return blocks;
  }

  /**
   * Split one oversized block into line-aligned chunks, each prefixed with
   * `importBlock`.
   *
   * @param {object} block - Block from `parseCode`.
   * @param {string} importBlock - Import text to prepend ("" for none).
   * @param {object} options - Merged options (see `getOptions`).
   * @param {(text: string) => number} tokenCounter - Token measure.
   * @param {string} language - Language recorded in chunk metadata.
   * @returns {object[]} Chunks whose metadata carries `partial: true` when
   *   the block was emitted in more than one piece.
   */
  splitLargeBlock(block, importBlock, options, tokenCounter, language) {
    const chunks = [];
    const importsOnly = importBlock.trim();
    const emit = (content, startPosition, partial) => {
      chunks.push(
        this.createChunk(content.trim(), chunks.length, startPosition, options, {
          language,
          blockType: block.type,
          blockName: block.name,
          partial
        })
      );
    };

    let currentContent = importBlock;
    let chunkStart = block.startPosition;
    // Offset of the current line within block.content. Tracked directly
    // because indexOf(line) would resolve repeated lines (a lone "}" or a
    // blank line, say) to their first occurrence.
    let lineOffset = 0;
    for (const line of block.content.split("\n")) {
      const testContent = currentContent + line + "\n";
      if (tokenCounter(testContent) > options.chunkSize && currentContent !== importBlock) {
        emit(currentContent, chunkStart, true);
        currentContent = importBlock + line + "\n";
        chunkStart = block.startPosition + lineOffset;
      } else {
        currentContent = testContent;
      }
      lineOffset += line.length + 1;
    }
    // Compare trimmed with trimmed so a remainder holding nothing but the
    // import prefix is not emitted as a chunk of its own.
    const remainder = currentContent.trim();
    if (remainder && remainder !== importsOnly) {
      emit(currentContent, chunkStart, chunks.length > 0);
    }
    return chunks;
  }
};
|
|
850
|
+
/**
 * Factory for the code-aware chunker.
 *
 * @returns {CodeChunker} A newly constructed CodeChunker.
 */
function createCodeChunker() {
  const codeChunker = new CodeChunker();
  return codeChunker;
}
|
|
853
|
+
|
|
854
|
+
// src/chunking/SemanticChunker.ts
|
|
855
|
+
var SemanticChunker = class extends BaseChunker {
  strategyType = "semantic";

  /**
   * Chunk text at the points where neighbouring sentences diverge in meaning.
   *
   * Without `options.embeddingFn` the sentences are simply packed up to the
   * chunk size via `fallbackChunk`. Otherwise the sentences are embedded,
   * breakpoints are derived from the embedding distances, and the resulting
   * chunks are passed through `mergeSmallChunks` and `splitLargeChunks`.
   *
   * @param {string} text - Text to chunk.
   * @param {object} [options] - Chunking options. Read directly here:
   *   `similarityThreshold` (default 0.5), `breakpointPercentileThreshold`
   *   (default 95), `bufferSize` (default 1) and `embeddingFn`, which is
   *   awaited with the array of sentence texts and whose result is indexed
   *   per sentence.
   * @returns {Promise<Array>} The chunks.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const similarityThreshold = options?.similarityThreshold ?? 0.5;
    const breakpointPercentile = options?.breakpointPercentileThreshold ?? 95;
    const bufferSize = options?.bufferSize ?? 1;
    const embeddingFn = options?.embeddingFn;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;

    const sentences = this.splitSentences(text);
    if (sentences.length === 0) {
      return [];
    }
    if (!embeddingFn) {
      return this.fallbackChunk(sentences, opts, tokenCounter);
    }

    const embeddings = await embeddingFn(sentences.map((s) => s.text));
    const sentencesWithEmbeddings = sentences.map((s, i) => ({
      ...s,
      embedding: embeddings[i]
    }));
    const distances = this.calculateDistances(sentencesWithEmbeddings, bufferSize);
    // A Set keeps the membership test in the loop below constant-time.
    const breakpoints = new Set(
      this.findBreakpoints(distances, breakpointPercentile, similarityThreshold)
    );

    let chunks = [];
    let chunkStart = 0;
    let chunkText = "";
    let chunkPosition = sentences[0].position;
    const lastIndex = sentences.length - 1;
    for (let i = 0; i <= lastIndex; i++) {
      chunkText += (chunkText ? " " : "") + sentences[i].text;
      // Close the chunk after a breakpoint sentence or at the end of the text.
      if (!breakpoints.has(i) && i !== lastIndex) {
        continue;
      }
      if (chunkText.trim()) {
        chunks.push(
          this.createChunk(chunkText.trim(), chunks.length, chunkPosition, opts, {
            boundaryType: "semantic",
            sentenceCount: i - chunkStart + 1
          })
        );
      }
      if (i < lastIndex) {
        chunkStart = i + 1;
        chunkText = "";
        chunkPosition = sentences[i + 1].position;
      }
    }

    chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
    chunks = this.splitLargeChunks(chunks, opts.maxChunkSize, tokenCounter);
    return chunks;
  }

  /**
   * Split text into sentences.
   *
   * A sentence is a run of characters other than `.`, `!`, `?` followed by
   * one or more of those terminators, or by the end of the text, so a final
   * sentence without a terminator is kept rather than dropped. This is a
   * plain punctuation split: "3.14" is cut after "3.".
   *
   * @param {string} text - Text to split.
   * @returns {Array<{text: string, position: number}>} Trimmed sentences with
   *   the offset of their first character in `text`.
   */
  splitSentences(text) {
    const sentenceRegex = /[^.!?]+(?:[.!?]+|$)/g;
    const sentences = [];
    for (const match of text.matchAll(sentenceRegex)) {
      const raw = match[0];
      const sentence = raw.trim();
      if (!sentence) {
        continue;
      }
      // The raw match starts with the whitespace that separates it from the
      // previous sentence; skip it so position points at the sentence itself.
      const leadingWhitespace = raw.length - raw.trimStart().length;
      sentences.push({
        text: sentence,
        position: match.index + leadingWhitespace
      });
    }
    // Text consisting only of terminators never matches; keep it as one unit.
    if (sentences.length === 0 && text.trim()) {
      sentences.push({
        text: text.trim(),
        position: text.length - text.trimStart().length
      });
    }
    return sentences;
  }

  /**
   * Calculate distances between adjacent sentences.
   *
   * Entry `i` is `1 - cosineSimilarity` between the averaged embeddings of up
   * to `bufferSize` sentences ending at `i` and up to `bufferSize` sentences
   * starting at `i + 1`; it is 0 when either side has no embedding.
   *
   * @param {Array<{embedding?: *}>} sentences - Sentences with embeddings.
   * @param {number} bufferSize - Sentences to average on each side.
   * @returns {number[]} `sentences.length - 1` distances.
   */
  calculateDistances(sentences, bufferSize) {
    const distances = [];
    const embeddingsIn = (from, to) =>
      sentences
        .slice(from, to)
        .map((s) => s.embedding)
        .filter((e) => e !== void 0);
    for (let i = 0; i < sentences.length - 1; i++) {
      const leftStart = Math.max(0, i - bufferSize + 1);
      const rightEnd = Math.min(sentences.length, i + bufferSize + 1);
      const leftEmbeddings = embeddingsIn(leftStart, i + 1);
      const rightEmbeddings = embeddingsIn(i + 1, rightEnd);
      if (leftEmbeddings.length === 0 || rightEmbeddings.length === 0) {
        distances.push(0);
        continue;
      }
      const leftAvg = EmbeddingModel.average(leftEmbeddings);
      const rightAvg = EmbeddingModel.average(rightEmbeddings);
      distances.push(1 - EmbeddingModel.cosineSimilarity(leftAvg, rightAvg));
    }
    return distances;
  }

  /**
   * Find breakpoints based on a distance threshold.
   *
   * The threshold is the larger of the distance at the given percentile and
   * `1 - minThreshold`.
   *
   * @param {number[]} distances - Distances from `calculateDistances`.
   * @param {number} percentile - Percentile (0-100) of the distances.
   * @param {number} minThreshold - Similarity threshold; its complement is
   *   the lowest distance that may count as a break.
   * @returns {number[]} Indexes `i` whose distance reaches the threshold.
   */
  findBreakpoints(distances, percentile, minThreshold) {
    if (distances.length === 0) return [];
    const sortedDistances = [...distances].sort((a, b) => a - b);
    const percentileIndex = Math.floor(percentile / 100 * sortedDistances.length);
    // A percentile of 100 indexes one past the end; fall back to the maximum.
    const percentileThreshold = sortedDistances[percentileIndex] ?? sortedDistances[sortedDistances.length - 1];
    const threshold = Math.max(percentileThreshold, 1 - minThreshold);
    const breakpoints = [];
    distances.forEach((distance, i) => {
      if (distance >= threshold) {
        breakpoints.push(i);
      }
    });
    return breakpoints;
  }

  /**
   * Fallback chunking when no embedding function is available: pack
   * consecutive sentences until the next one would exceed
   * `options.chunkSize`.
   *
   * @param {Array<{text: string, position: number}>} sentences - Sentences.
   * @param {object} options - Resolved options; `chunkSize` is read and the
   *   object is forwarded to `createChunk`.
   * @param {(text: string) => number} tokenCounter - Counts tokens.
   * @returns {Array} Chunks with `boundaryType: "sentence"` metadata.
   */
  fallbackChunk(sentences, options, tokenCounter) {
    const chunks = [];
    let currentText = "";
    let chunkPosition = sentences[0]?.position ?? 0;
    const flush = () => {
      chunks.push(
        this.createChunk(currentText.trim(), chunks.length, chunkPosition, options, {
          boundaryType: "sentence"
        })
      );
    };
    for (const sentence of sentences) {
      const testText = currentText ? currentText + " " + sentence.text : sentence.text;
      if (tokenCounter(testText) > options.chunkSize && currentText) {
        flush();
        currentText = sentence.text;
        chunkPosition = sentence.position;
      } else {
        currentText = testText;
      }
    }
    if (currentText.trim()) {
      flush();
    }
    return chunks;
  }

  /**
   * Split chunks that are too large along sentence boundaries.
   *
   * Chunks within the limit are kept; larger ones are re-split with
   * `splitSentences` and repacked up to `maxTokens`. A single sentence that
   * exceeds the limit on its own is emitted as is. `index` always equals the
   * chunk's position in the returned array.
   *
   * @param {Array<object>} chunks - Chunks to check; `tokenCount`, `text`,
   *   `startPosition`, `id` and `index` are read.
   * @param {number} maxTokens - Maximum tokens per chunk.
   * @param {(text: string) => number} tokenCounter - Counts tokens.
   * @returns {Array<object>} Chunks with oversized entries replaced by pieces.
   */
  splitLargeChunks(chunks, maxTokens, tokenCounter) {
    const result = [];
    for (const chunk2 of chunks) {
      if (chunk2.tokenCount <= maxTokens) {
        // Pieces emitted for an earlier chunk shift everything after them,
        // so renumber instead of keeping a stale or duplicate index.
        result.push(
          chunk2.index === result.length ? chunk2 : { ...chunk2, index: result.length }
        );
        continue;
      }
      let currentText = "";
      let currentStart = chunk2.startPosition;
      const pushPiece = () => {
        const pieceText = currentText.trim();
        if (!pieceText) {
          return;
        }
        result.push({
          ...chunk2,
          id: chunk2.id + "_" + result.length,
          text: pieceText,
          startPosition: currentStart,
          endPosition: currentStart + currentText.length,
          tokenCount: tokenCounter(currentText),
          charCount: currentText.length,
          index: result.length
        });
      };
      for (const sentence of this.splitSentences(chunk2.text)) {
        const testText = currentText ? currentText + " " + sentence.text : sentence.text;
        if (tokenCounter(testText) > maxTokens && currentText) {
          pushPiece();
          currentText = sentence.text;
          currentStart = chunk2.startPosition + sentence.position;
        } else {
          currentText = testText;
        }
      }
      pushPiece();
    }
    return result;
  }
};
|
|
1070
|
+
/**
 * Factory for the embedding-based chunker.
 *
 * @returns {SemanticChunker} A newly constructed SemanticChunker.
 */
function createSemanticChunker() {
  const semanticChunker = new SemanticChunker();
  return semanticChunker;
}
|
|
1073
|
+
|
|
1074
|
+
// src/chunking/index.ts
|
|
1075
|
+
/**
 * Build a chunker for the named strategy.
 *
 * "sentence" and "paragraph" currently resolve to FixedChunker, and any
 * unrecognised strategy resolves to RecursiveChunker.
 *
 * @param {string} strategy - Strategy name.
 * @returns {object} A new chunker instance.
 */
function createChunker(strategy) {
  // Constructors are wrapped in thunks so only the selected one is invoked.
  const factories = new Map([
    ["fixed", () => new FixedChunker()],
    ["recursive", () => new RecursiveChunker()],
    ["markdown", () => new MarkdownChunker()],
    ["code", () => new CodeChunker()],
    ["semantic", () => new SemanticChunker()],
    ["sentence", () => new FixedChunker()],
    ["paragraph", () => new FixedChunker()]
  ]);
  const build = factories.get(strategy);
  if (build === undefined) {
    return new RecursiveChunker();
  }
  return build();
}
|
|
1095
|
+
/**
 * Chunk text with the chunker selected by `strategy`.
 *
 * @param {string} text - Text to chunk.
 * @param {string} [strategy="recursive"] - Strategy name handed to
 *   `createChunker`.
 * @param {object} [options] - Options forwarded to the chunker's `chunk`.
 * @returns {Promise<Array>} Resolves with whatever the chunker produces.
 */
async function chunk(text, strategy = "recursive", options) {
  return createChunker(strategy).chunk(text, options);
}
|
|
1099
|
+
|
|
1100
|
+
export {
|
|
1101
|
+
defaultTokenCounter,
|
|
1102
|
+
BaseChunker,
|
|
1103
|
+
mergeSmallChunks,
|
|
1104
|
+
splitLargeChunks,
|
|
1105
|
+
FixedChunker,
|
|
1106
|
+
createFixedChunker,
|
|
1107
|
+
RecursiveChunker,
|
|
1108
|
+
createRecursiveChunker,
|
|
1109
|
+
MarkdownChunker,
|
|
1110
|
+
createMarkdownChunker,
|
|
1111
|
+
CodeChunker,
|
|
1112
|
+
createCodeChunker,
|
|
1113
|
+
SemanticChunker,
|
|
1114
|
+
createSemanticChunker,
|
|
1115
|
+
createChunker,
|
|
1116
|
+
chunk
|
|
1117
|
+
};
|