@lov3kaizen/agentsea-embeddings 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +475 -0
- package/dist/caching/index.d.mts +286 -0
- package/dist/caching/index.d.ts +286 -0
- package/dist/caching/index.js +1005 -0
- package/dist/caching/index.mjs +27 -0
- package/dist/chunk-3KM32UQK.mjs +207 -0
- package/dist/chunk-DJAURHAS.mjs +1117 -0
- package/dist/chunk-NBHIRTJT.mjs +895 -0
- package/dist/chunk-QAITLJ2E.mjs +259 -0
- package/dist/chunk-TER262ST.mjs +877 -0
- package/dist/chunk-VPSMDBHH.mjs +957 -0
- package/dist/chunking/index.d.mts +1 -0
- package/dist/chunking/index.d.ts +1 -0
- package/dist/chunking/index.js +1408 -0
- package/dist/chunking/index.mjs +37 -0
- package/dist/embedding.types-CCgPVxt1.d.mts +102 -0
- package/dist/embedding.types-CCgPVxt1.d.ts +102 -0
- package/dist/index-CeG6God2.d.mts +297 -0
- package/dist/index-DMaQRn2w.d.mts +172 -0
- package/dist/index-DMaQRn2w.d.ts +172 -0
- package/dist/index-DWddsKRi.d.ts +297 -0
- package/dist/index.d.mts +647 -0
- package/dist/index.d.ts +647 -0
- package/dist/index.js +5259 -0
- package/dist/index.mjs +1028 -0
- package/dist/providers/index.d.mts +2 -0
- package/dist/providers/index.d.ts +2 -0
- package/dist/providers/index.js +1235 -0
- package/dist/providers/index.mjs +32 -0
- package/dist/stores/index.d.mts +298 -0
- package/dist/stores/index.d.ts +298 -0
- package/dist/stores/index.js +1178 -0
- package/dist/stores/index.mjs +26 -0
- package/package.json +102 -0
|
@@ -0,0 +1,1408 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// CommonJS interop helpers emitted by the bundler. They expose the module's
// bindings as live getters on the object handed to `module.exports`.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Define every enumerable key of `all` on `target` as an enumerable getter.
 * @param {object} target - Object receiving the getters.
 * @param {Record<string, () => unknown>} all - Map of export name to getter.
 */
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

/**
 * Mirror the own properties of `from` onto `to` as getters, skipping keys
 * `to` already owns and the single key named by `except`.
 * @param {object} to - Destination object (returned).
 * @param {unknown} from - Source; ignored unless it is an object or function.
 * @param {string} [except] - Key that must not be copied.
 * @param {PropertyDescriptor} [desc] - Scratch slot for the source descriptor.
 * @returns {object} `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const copyable = (from && typeof from === "object") || typeof from === "function";
  if (!copyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};

/**
 * Wrap an export namespace in a fresh object flagged with `__esModule`.
 * @param {object} mod - Namespace built with `__export`.
 * @returns {object} Object suitable for `module.exports`.
 */
var __toCommonJS = (mod) => {
  const target = __defProp({}, "__esModule", { value: true });
  return __copyProps(target, mod);
};
|
|
19
|
+
|
|
20
|
+
// src/chunking/index.ts
|
|
21
|
+
// Public surface of the chunking entry point. Each entry is a getter so the
// bindings declared further down the file are resolved lazily on access.
// Key order is kept as-is because it determines export enumeration order.
var chunking_exports = {};
const chunkingExportGetters = {
  BaseChunker: () => BaseChunker,
  CodeChunker: () => CodeChunker,
  FixedChunker: () => FixedChunker,
  MarkdownChunker: () => MarkdownChunker,
  RecursiveChunker: () => RecursiveChunker,
  SemanticChunker: () => SemanticChunker,
  chunk: () => chunk,
  createChunker: () => createChunker,
  createCodeChunker: () => createCodeChunker,
  createFixedChunker: () => createFixedChunker,
  createMarkdownChunker: () => createMarkdownChunker,
  createRecursiveChunker: () => createRecursiveChunker,
  createSemanticChunker: () => createSemanticChunker,
  defaultTokenCounter: () => defaultTokenCounter,
  mergeSmallChunks: () => mergeSmallChunks,
  splitLargeChunks: () => splitLargeChunks
};
__export(chunking_exports, chunkingExportGetters);
module.exports = __toCommonJS(chunking_exports);
|
|
41
|
+
|
|
42
|
+
// src/chunking/BaseChunker.ts
|
|
43
|
+
var import_nanoid = require("nanoid");
|
|
44
|
+
/**
 * Fallback token estimator: one token per four UTF-16 code units, rounded up.
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count.
 */
var defaultTokenCounter = (text) => Math.ceil(text.length / 4);
|
|
47
|
+
/**
 * Shared base for chunking strategies. Subclasses provide `chunk(text, options)`
 * and a `strategyType` field; this class supplies option merging, chunk
 * construction and a generic overlapping splitter.
 */
var BaseChunker = class {
  /** Default options applied when the caller leaves a field unset. */
  defaultOptions = {
    chunkSize: 512,
    chunkOverlap: 50,
    minChunkSize: 100,
    maxChunkSize: 2e3,
    tokenCounter: defaultTokenCounter
  };

  /**
   * Merge caller options with the defaults.
   * @param {object} [options] - Partial chunking options.
   * @returns {object} Options with every sizing field and `metadata` populated.
   */
  getOptions(options) {
    const defaults = this.defaultOptions;
    return {
      chunkSize: options?.chunkSize ?? defaults.chunkSize,
      chunkOverlap: options?.chunkOverlap ?? defaults.chunkOverlap,
      minChunkSize: options?.minChunkSize ?? defaults.minChunkSize,
      maxChunkSize: options?.maxChunkSize ?? defaults.maxChunkSize,
      tokenCounter: options?.tokenCounter ?? defaults.tokenCounter,
      documentId: options?.documentId,
      source: options?.source,
      type: options?.type,
      metadata: options?.metadata ?? {}
    };
  }

  /**
   * Build a chunk record with a fresh id.
   * @param {string} text - Chunk text.
   * @param {number} index - Position of the chunk in the output list.
   * @param {number} startPosition - Offset of the chunk in the source text.
   * @param {object} options - Merged options (see `getOptions`).
   * @param {object} [additionalMetadata] - Extra metadata; wins over `options.metadata`.
   * @returns {object} The chunk record.
   */
  createChunk(text, index, startPosition, options, additionalMetadata) {
    const tokenCounter = options.tokenCounter ?? defaultTokenCounter;
    const metadata = {
      ...options.metadata,
      ...additionalMetadata
    };
    // Document-level fields override same-named metadata keys when set.
    if (options.documentId) metadata.documentId = options.documentId;
    if (options.source) metadata.source = options.source;
    if (options.type) metadata.type = options.type;
    return {
      id: (0, import_nanoid.nanoid)(),
      text,
      index,
      startPosition,
      endPosition: startPosition + text.length,
      tokenCount: tokenCounter(text),
      charCount: text.length,
      overlapPrev: 0,
      overlapNext: 0,
      metadata
    };
  }

  /**
   * Record the same overlap amount on both sides of every adjacent chunk pair.
   * Mutates the chunks in place.
   * @param {object[]} chunks - Chunks in document order.
   * @param {number} overlapChars - Overlap amount to store.
   */
  setOverlapInfo(chunks, overlapChars) {
    for (let i = 1; i < chunks.length; i++) {
      chunks[i].overlapPrev = overlapChars;
      chunks[i - 1].overlapNext = overlapChars;
    }
  }

  /**
   * Split text into windows of roughly `chunkSize` tokens, each starting
   * `overlap` tokens before the previous window ended.
   * @param {string} text - Text to split.
   * @param {number} chunkSize - Target window size in tokens.
   * @param {number} overlap - Overlap in tokens (converted at 4 chars per token).
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {string[]} Trimmed, non-empty windows.
   */
  splitWithOverlap(text, chunkSize, overlap, tokenCounter) {
    const chunks = [];
    const overlapChars = Math.floor(overlap * 4);
    let start = 0;
    while (start < text.length) {
      // Grow the window one character at a time until the token budget is hit.
      let end = start;
      let tokens = 0;
      while (end < text.length && tokens < chunkSize) {
        end++;
        tokens = tokenCounter(text.slice(start, end));
      }
      // Prefer to cut just after a space when more text follows.
      if (end < text.length) {
        const lastSpace = text.lastIndexOf(" ", end);
        if (lastSpace > start) {
          end = lastSpace + 1;
        }
      }
      chunks.push(text.slice(start, end).trim());
      // The window reached the end of the text: stop here. Stepping back by
      // the overlap would only re-emit shrinking suffixes of this last chunk.
      if (end >= text.length) break;
      // Always advance by at least one character so the loop terminates.
      start = Math.max(start + 1, end - overlapChars);
    }
    return chunks.filter((c) => c.length > 0);
  }

  /**
   * Run `this.chunk` and wrap the chunks with summary statistics.
   * @param {string} text - Text to chunk.
   * @param {object} [options] - Chunking options forwarded to `chunk`.
   * @returns {Promise<object>} Chunks plus counts, average size, timing and strategy.
   */
  async chunkWithResult(text, options) {
    const startTime = performance.now();
    const chunks = await this.chunk(text, options);
    const processingTimeMs = performance.now() - startTime;
    const totalTokens = chunks.reduce((sum, c) => sum + c.tokenCount, 0);
    return {
      chunks,
      totalChunks: chunks.length,
      totalTokens,
      avgChunkSize: chunks.length > 0 ? totalTokens / chunks.length : 0,
      processingTimeMs,
      strategy: this.strategyType,
      originalLength: text.length
    };
  }
};
|
|
151
|
+
/**
 * Merge chunks that fall below a minimum token count into a neighbour.
 *
 * A small chunk absorbs the chunks that follow it until it reaches
 * `minTokens`. A small chunk left at the very end has nothing after it, so it
 * is folded into the preceding output chunk instead. Merged texts are joined
 * with a single newline. Input chunks are not mutated.
 *
 * @param {object[]} chunks - Chunks in document order.
 * @param {number} minTokens - Minimum token count a chunk should reach.
 * @param {(text: string) => number} tokenCounter - Token counting function.
 * @returns {object[]} Re-indexed chunks; the input array itself when it has
 *   at most one element.
 */
function mergeSmallChunks(chunks, minTokens, tokenCounter) {
  if (chunks.length <= 1) return chunks;

  // Append `next` onto `target` and refresh the derived fields.
  const absorb = (target, next) => {
    const combinedText = target.text + "\n" + next.text;
    target.text = combinedText;
    target.tokenCount = tokenCounter(combinedText);
    target.charCount = combinedText.length;
    target.endPosition = next.endPosition;
  };

  const merged = [];
  let current = null;
  for (const chunk2 of chunks) {
    if (!current) {
      current = { ...chunk2 };
      continue;
    }
    if (current.tokenCount < minTokens) {
      absorb(current, chunk2);
    } else {
      merged.push(current);
      current = { ...chunk2 };
    }
  }

  if (current) {
    const previous = merged[merged.length - 1];
    if (previous && current.tokenCount < minTokens) {
      // Trailing undersized chunk: merge backwards rather than emit it alone.
      absorb(previous, current);
    } else {
      merged.push(current);
    }
  }
  return merged.map((c, i) => ({ ...c, index: i }));
}
|
|
177
|
+
/**
 * Split chunks whose token count exceeds `maxTokens` at sentence boundaries.
 *
 * Sentences (text ending in `.`, `!` or `?` followed by whitespace) are packed
 * greedily into pieces of at most `maxTokens`, re-joined with single spaces.
 * A single sentence that alone exceeds the limit is emitted as-is. Each piece
 * copies the parent chunk's fields and gets a new id.
 *
 * @param {object[]} chunks - Chunks in document order.
 * @param {number} maxTokens - Maximum tokens per output chunk.
 * @param {(text: string) => number} tokenCounter - Token counting function.
 * @returns {object[]} Re-indexed chunks.
 */
function splitLargeChunks(chunks, maxTokens, tokenCounter) {
  const result = [];
  for (const chunk2 of chunks) {
    if (chunk2.tokenCount <= maxTokens) {
      result.push(chunk2);
      continue;
    }
    const sentences = chunk2.text.split(/(?<=[.!?])\s+/);
    let currentText = "";
    let currentStart = chunk2.startPosition;

    const emitPiece = () => {
      result.push({
        ...chunk2,
        id: (0, import_nanoid.nanoid)(),
        text: currentText,
        startPosition: currentStart,
        endPosition: currentStart + currentText.length,
        tokenCount: tokenCounter(currentText),
        charCount: currentText.length
      });
    };

    for (const sentence of sentences) {
      const testText = currentText ? currentText + " " + sentence : sentence;
      if (currentText && tokenCounter(testText) > maxTokens) {
        emitPiece();
        // Advance past the piece just emitted plus one separator character.
        // This must happen before `currentText` is replaced by the new sentence.
        currentStart += currentText.length + 1;
        currentText = sentence;
      } else {
        currentText = testText;
      }
    }
    if (currentText) {
      emitPiece();
    }
  }
  return result.map((c, i) => ({ ...c, index: i }));
}
|
|
220
|
+
|
|
221
|
+
// src/chunking/FixedChunker.ts
|
|
222
|
+
/**
 * Fixed-size chunker. Either slices the text into character windows
 * (`splitByChars`) or packs separator-delimited parts into chunks of at most
 * `chunkSize` tokens, carrying trailing sentences over as overlap.
 */
var FixedChunker = class extends BaseChunker {
  strategyType = "fixed";

  /**
   * Chunk text into fixed-size pieces.
   * @param {string} text - Text to chunk.
   * @param {object} [options] - Base options plus `splitByChars` (default false),
   *   `separator` (default "\n") and `keepSeparator` (default false).
   * @returns {Promise<object[]>} Chunks in document order.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const splitByChars = options?.splitByChars ?? false;
    const separator = options?.separator ?? "\n";
    const keepSeparator = options?.keepSeparator ?? false;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const chunks = [];

    if (splitByChars) {
      // Token sizes are converted to characters at 4 chars per token.
      const chunkSize = opts.chunkSize * 4;
      const overlap = opts.chunkOverlap * 4;
      let start = 0;
      while (start < text.length) {
        const end = Math.min(start + chunkSize, text.length);
        const chunkText = text.slice(start, end).trim();
        if (chunkText.length > 0) {
          chunks.push(this.createChunk(chunkText, chunks.length, start, opts));
        }
        // The last window has been emitted. Stepping back by the overlap from
        // here would revisit the same window forever.
        if (end >= text.length) break;
        // Guarantee forward progress even when overlap >= chunkSize.
        start = Math.max(end - overlap, start + 1);
      }
    } else {
      const parts = text.split(separator);
      let position = 0;
      let currentChunk = "";
      let chunkStart = 0;
      for (let i = 0; i < parts.length; i++) {
        const part = parts[i];
        const isLastPart = i === parts.length - 1;
        const partWithSep = keepSeparator && !isLastPart ? part + separator : part;
        const testChunk = currentChunk
          ? currentChunk + (keepSeparator ? "" : separator) + partWithSep
          : partWithSep;
        if (currentChunk && tokenCounter(testChunk) > opts.chunkSize) {
          chunks.push(
            this.createChunk(currentChunk.trim(), chunks.length, chunkStart, opts)
          );
          // Seed the next chunk with the tail of the one just emitted.
          const overlapText = this.getOverlapText(
            currentChunk,
            opts.chunkOverlap,
            tokenCounter
          );
          currentChunk = overlapText + (overlapText ? separator : "") + partWithSep;
          chunkStart = position - overlapText.length;
        } else {
          currentChunk = testChunk;
        }
        position += part.length + separator.length;
      }
      if (currentChunk.trim()) {
        chunks.push(
          this.createChunk(currentChunk.trim(), chunks.length, chunkStart, opts)
        );
      }
    }

    this.setOverlapInfo(chunks, opts.chunkOverlap * 4);
    return chunks;
  }

  /**
   * Collect whole sentences from the end of `text` until adding another would
   * exceed `overlapTokens`.
   * NOTE(review): the final sentence is always taken, even when it alone is
   * larger than the budget, so text without sentence breaks is carried over whole.
   * @param {string} text - Text of the chunk just emitted.
   * @param {number} overlapTokens - Overlap budget in tokens.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {string} Overlap text, or "" when the budget is not positive.
   */
  getOverlapText(text, overlapTokens, tokenCounter) {
    if (overlapTokens <= 0) return "";
    const sentences = text.split(/(?<=[.!?])\s+/);
    let overlapText = "";
    for (let i = sentences.length - 1; i >= 0; i--) {
      const testText = sentences[i] + (overlapText ? " " + overlapText : "");
      const testTokens = tokenCounter(testText);
      if (testTokens > overlapTokens && overlapText) {
        break;
      }
      overlapText = testText;
    }
    return overlapText;
  }
};
|
|
307
|
+
/**
 * Factory for the fixed-size chunking strategy.
 * @returns {FixedChunker} A new chunker instance.
 */
function createFixedChunker() {
  const chunker = new FixedChunker();
  return chunker;
}
|
|
310
|
+
|
|
311
|
+
// src/chunking/RecursiveChunker.ts
|
|
312
|
+
// Separators tried in order by the recursive chunker, coarsest first.
// The empty string means "fall back to splitting by characters".
var DEFAULT_SEPARATORS = [
  "\n\n", // paragraphs
  "\n", // lines
  ". ", // sentences
  ", ", // clauses
  " ", // words
  "" // characters
];
|
|
326
|
+
/**
 * Recursive chunker. Splits on the coarsest separator present, recursing to
 * finer separators for pieces that are still over `chunkSize`, then optionally
 * merges small chunks and prefixes each chunk with the tail of the previous one.
 */
var RecursiveChunker = class extends BaseChunker {
  strategyType = "recursive";

  /**
   * Chunk text recursively.
   * @param {string} text - Text to chunk.
   * @param {object} [options] - Base options plus `separators`
   *   (default `DEFAULT_SEPARATORS`), `keepSeparator` (default true) and
   *   `mergeSmallChunks` (default true).
   * @returns {Promise<object[]>} Chunks in document order.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const separators = options?.separators ?? DEFAULT_SEPARATORS;
    const keepSeparator = options?.keepSeparator ?? true;
    const mergeSmall = options?.mergeSmallChunks ?? true;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const texts = this.splitRecursively(
      text,
      separators,
      opts.chunkSize,
      keepSeparator,
      tokenCounter
    );

    let position = 0;
    let chunks = [];
    for (const piece of texts) {
      const chunkText = piece.trim();
      if (chunkText) {
        // Index by emitted count so skipped blank pieces leave no gaps.
        chunks.push(this.createChunk(chunkText, chunks.length, position, opts));
      }
      // Blank pieces still occupy space in the source text.
      position += piece.length;
    }

    if (mergeSmall) {
      chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
    }
    return this.addOverlap(chunks, opts.chunkOverlap, tokenCounter);
  }

  /**
   * Split `text` until every piece fits in `chunkSize` tokens.
   * @param {string} text - Text to split.
   * @param {string[]} separators - Separators to try, coarsest first.
   * @param {number} chunkSize - Maximum tokens per piece.
   * @param {boolean} keepSeparator - Keep the separator at the end of each piece.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {string[]} Pieces in document order.
   */
  splitRecursively(text, separators, chunkSize, keepSeparator, tokenCounter) {
    if (tokenCounter(text) <= chunkSize) {
      return [text];
    }
    for (let i = 0; i < separators.length; i++) {
      const separator = separators[i];
      if (separator === "") {
        return this.splitByChars(text, chunkSize, tokenCounter);
      }
      if (!text.includes(separator)) {
        continue;
      }
      // Only finer separators are offered to oversized pieces.
      const finerSeparators = separators.slice(i + 1);
      const result = [];
      for (const split of this.splitBySeparator(text, separator, keepSeparator)) {
        if (tokenCounter(split) <= chunkSize) {
          result.push(split);
        } else {
          result.push(
            ...this.splitRecursively(
              split,
              finerSeparators,
              chunkSize,
              keepSeparator,
              tokenCounter
            )
          );
        }
      }
      return result;
    }
    // No separator matched at all.
    return this.splitByChars(text, chunkSize, tokenCounter);
  }

  /**
   * Split on a literal separator and drop whitespace-only pieces.
   * @param {string} text - Text to split.
   * @param {string} separator - Literal separator.
   * @param {boolean} keepSeparator - Re-append the separator to every piece but the last.
   * @returns {string[]} Non-blank pieces.
   */
  splitBySeparator(text, separator, keepSeparator) {
    const parts = text.split(separator);
    const pieces = keepSeparator
      ? parts.map((part, i) => (i < parts.length - 1 ? part + separator : part))
      : parts;
    return pieces.filter((p) => p.trim());
  }

  /**
   * Last-resort splitter: grow a window character by character up to
   * `chunkSize` tokens, cutting at the last space when one is available.
   * @param {string} text - Text to split.
   * @param {number} chunkSize - Maximum tokens per piece.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {string[]} Non-blank pieces.
   */
  splitByChars(text, chunkSize, tokenCounter) {
    const chunks = [];
    let start = 0;
    while (start < text.length) {
      let end = start;
      while (end < text.length && tokenCounter(text.slice(start, end)) < chunkSize) {
        end++;
      }
      if (end < text.length) {
        const lastSpace = text.lastIndexOf(" ", end);
        if (lastSpace > start) {
          end = lastSpace;
        }
      }
      // A non-positive chunkSize leaves the window empty; take one character
      // so the outer loop always terminates.
      if (end === start) {
        end = start + 1;
      }
      chunks.push(text.slice(start, end));
      start = end;
    }
    return chunks.filter((c) => c.trim());
  }

  /**
   * Prefix every chunk after the first with trailing words of its predecessor.
   * @param {object[]} chunks - Chunks in document order.
   * @param {number} overlapTokens - Overlap budget in tokens.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {object[]} New chunk objects, or the input array when there is
   *   nothing to overlap.
   */
  addOverlap(chunks, overlapTokens, tokenCounter) {
    if (overlapTokens <= 0 || chunks.length <= 1) {
      return chunks;
    }
    const lastIndex = chunks.length - 1;
    return chunks.map((chunk2, i) => {
      let chunkText = chunk2.text;
      let startOffset = 0;
      if (i > 0) {
        const overlapText = this.getEndOverlap(
          chunks[i - 1].text,
          overlapTokens,
          tokenCounter
        );
        if (overlapText) {
          chunkText = overlapText + " " + chunkText;
          // The chunk now begins earlier by the overlap plus the joining space.
          startOffset = -overlapText.length - 1;
        }
      }
      return {
        ...chunk2,
        text: chunkText,
        startPosition: chunk2.startPosition + startOffset,
        tokenCount: tokenCounter(chunkText),
        charCount: chunkText.length,
        overlapPrev: i > 0 ? overlapTokens : 0,
        overlapNext: i < lastIndex ? overlapTokens : 0
      };
    });
  }

  /**
   * Take whole words from the end of `text` while they fit in `overlapTokens`.
   * @param {string} text - Text of the preceding chunk.
   * @param {number} overlapTokens - Overlap budget in tokens.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {string} Overlap text; "" when not even the last word fits.
   */
  getEndOverlap(text, overlapTokens, tokenCounter) {
    const words = text.split(/\s+/);
    let overlap = "";
    for (let i = words.length - 1; i >= 0; i--) {
      const testOverlap = words[i] + (overlap ? " " + overlap : "");
      if (tokenCounter(testOverlap) > overlapTokens) {
        break;
      }
      overlap = testOverlap;
    }
    return overlap;
  }
};
|
|
477
|
+
/**
 * Factory for the recursive chunking strategy.
 * @returns {RecursiveChunker} A new chunker instance.
 */
function createRecursiveChunker() {
  const chunker = new RecursiveChunker();
  return chunker;
}
|
|
480
|
+
|
|
481
|
+
// src/chunking/MarkdownChunker.ts
|
|
482
|
+
/**
 * Markdown-aware chunker. Splits a document into heading-delimited sections,
 * then emits each section whole or paragraph by paragraph, optionally
 * repeating the heading (or heading path) at the top of every chunk.
 */
var MarkdownChunker = class extends BaseChunker {
  strategyType = "markdown";

  /**
   * Chunk a markdown document.
   * @param {string} text - Markdown source.
   * @param {object} [options] - Base options plus `preserveHeaders` (default
   *   true), `includeHeaderHierarchy` (default true), `headingLevels`
   *   (default 1-6) and `splitCodeBlocks` (default false).
   * @returns {Promise<object[]>} Re-indexed chunks in document order.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const preserveHeaders = options?.preserveHeaders ?? true;
    const includeHeaderHierarchy = options?.includeHeaderHierarchy ?? true;
    const headingLevels = options?.headingLevels ?? [1, 2, 3, 4, 5, 6];
    const splitCodeBlocks = options?.splitCodeBlocks ?? false;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const sections = this.parseMarkdown(text, headingLevels);
    let chunks = [];
    for (const section of sections) {
      const sectionChunks = await this.chunkSection(
        section,
        opts,
        preserveHeaders,
        includeHeaderHierarchy,
        splitCodeBlocks,
        tokenCounter
      );
      chunks.push(...sectionChunks);
    }
    chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
    // Section-local indices are replaced with document-wide ones.
    return chunks.map((c, i) => ({ ...c, index: i }));
  }

  /**
   * Split markdown into sections at headings of the requested levels.
   * Lines inside ``` fences are never treated as headings.
   * @param {string} text - Markdown source.
   * @param {number[]} headingLevels - Heading levels that start a new section.
   * @returns {object[]} Sections with `heading`, `headingLevel`, `content`,
   *   `startPosition` and `path` (texts of the enclosing headings).
   */
  parseMarkdown(text, headingLevels) {
    const sections = [];
    const lines = text.split("\n");
    const headingRegex = /^(#{1,6})\s+(.+)$/;
    const fenceRegex = /```/g;
    let currentSection = {
      headingLevel: 0,
      content: "",
      startPosition: 0,
      path: []
    };
    const headingStack = [];
    let position = 0;
    let inFence = false;
    for (const line of lines) {
      // An odd number of ``` markers on a line opens or closes a fence. This
      // pairs markers the same way the code-block pattern in chunkSection does.
      const wasInFence = inFence;
      if ((line.match(fenceRegex) || []).length % 2 === 1) {
        inFence = !inFence;
      }
      const headingMatch = wasInFence ? null : line.match(headingRegex);
      const level = headingMatch ? headingMatch[1].length : 0;
      if (headingMatch && headingLevels.includes(level)) {
        const headingText = headingMatch[2];
        if (currentSection.content.trim()) {
          sections.push({ ...currentSection });
        }
        // Drop headings at the same or a deeper level from the path.
        while (headingStack.length > 0 && headingStack[headingStack.length - 1].level >= level) {
          headingStack.pop();
        }
        headingStack.push({ level, text: headingText });
        currentSection = {
          heading: headingText,
          headingLevel: level,
          content: "",
          startPosition: position,
          path: headingStack.map((h) => h.text)
        };
      } else {
        currentSection.content += line + "\n";
      }
      position += line.length + 1;
    }
    if (currentSection.content.trim() || currentSection.heading) {
      sections.push(currentSection);
    }
    return sections;
  }

  /**
   * Turn one section into chunks.
   * @param {object} section - Section produced by `parseMarkdown`.
   * @param {object} options - Merged options (see `getOptions`).
   * @param {boolean} preserveHeaders - Repeat the heading at the top of each chunk.
   * @param {boolean} includeHeaderHierarchy - Repeat the whole heading path instead.
   * @param {boolean} splitCodeBlocks - Allow fenced code to be split at blank lines.
   * @param {(text: string) => number} tokenCounter - Token counting function.
   * @returns {Promise<object[]>} Chunks for this section.
   */
  async chunkSection(section, options, preserveHeaders, includeHeaderHierarchy, splitCodeBlocks, tokenCounter) {
    const chunks = [];
    let content = section.content;
    let headerPrefix = "";
    if (preserveHeaders && section.heading) {
      if (includeHeaderHierarchy && section.path.length > 1) {
        headerPrefix = section.path.map((h, i) => "#".repeat(i + 1) + " " + h).join("\n") + "\n\n";
      } else {
        headerPrefix = "#".repeat(section.headingLevel) + " " + section.heading + "\n\n";
      }
    }

    // Swap fenced code for placeholders so blank lines inside a fence are not
    // mistaken for paragraph breaks. A single replace pass is used because
    // editing the string while iterating a global regex can skip matches.
    const codeBlocks = [];
    if (!splitCodeBlocks) {
      content = content.replace(/```[\s\S]*?```/g, (blockText) => {
        const placeholder = `__CODE_BLOCK_${codeBlocks.length}__`;
        codeBlocks.push({ placeholder, content: blockText });
        return placeholder;
      });
    }
    // Function replacers insert the code verbatim; a string replacement would
    // expand `$&`, `$1`, etc. that appear inside the code.
    const restoreCodeBlocks = (value) =>
      codeBlocks.reduce(
        (restored, block) => restored.replace(block.placeholder, () => block.content),
        value
      );

    const sectionMetadata = {
      section: section.heading,
      headingLevel: section.headingLevel,
      path: section.path
    };
    const pushChunk = (chunkText, index, startPosition) => {
      chunks.push(
        this.createChunk(chunkText.trim(), index, startPosition, options, sectionMetadata)
      );
    };

    const fullContent = headerPrefix + content;
    if (tokenCounter(fullContent) <= options.chunkSize) {
      pushChunk(restoreCodeBlocks(fullContent), 0, section.startPosition);
      return chunks;
    }

    const paragraphs = content.split(/\n\n+/);
    let currentContent = headerPrefix;
    let chunkStart = section.startPosition;
    let searchFrom = 0;
    for (const paragraph of paragraphs) {
      // Search from the end of the previous paragraph so repeated paragraphs
      // resolve to their own offset rather than the first occurrence.
      const paragraphOffset = content.indexOf(paragraph, searchFrom);
      searchFrom = paragraphOffset + paragraph.length;

      const para = restoreCodeBlocks(paragraph);
      const testContent = currentContent + para + "\n\n";
      if (tokenCounter(testContent) > options.chunkSize && currentContent !== headerPrefix) {
        pushChunk(currentContent, chunks.length, chunkStart);
        currentContent = headerPrefix + para + "\n\n";
        chunkStart = section.startPosition + paragraphOffset;
      } else {
        currentContent = testContent;
      }
    }
    // Skip a remainder that holds nothing but the repeated header.
    const remainder = currentContent.trim();
    if (remainder && remainder !== headerPrefix.trim()) {
      pushChunk(currentContent, chunks.length, chunkStart);
    }
    return chunks;
  }
};
|
|
649
|
+
/**
 * Factory for the markdown chunking strategy.
 * @returns {MarkdownChunker} A new chunker instance.
 */
function createMarkdownChunker() {
  const chunker = new MarkdownChunker();
  return chunker;
}
|
|
652
|
+
|
|
653
|
+
// src/chunking/CodeChunker.ts
|
|
654
|
+
// Per-language regular expressions used by the code chunker.
//   functionStart  - matches the first line of a function; the name is in
//                    capture group 1 (declaration) or 2 (arrow function).
//   classStart     - matches the first line of a class-like declaration.
//   importPattern  - global pattern matching import statements.
//   commentPattern - global pattern matching comments.
//   blockEnd       - matches a line that closes a top-level block.
var LANGUAGE_PATTERNS = {
  typescript: {
    // Arrow functions may take a parenthesised list or a bare identifier
    // (`item => ...`), not just a single character.
    functionStart: /^(?:export\s+)?(?:async\s+)?function\s+(\w+)|^(?:export\s+)?const\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[\w$]+|[^=])\s*=>/m,
    classStart: /^(?:export\s+)?(?:abstract\s+)?class\s+(\w+)/m,
    importPattern: /^import\s+.*?(?:from\s+['"][^'"]+['"]|['"][^'"]+['"])/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  },
  javascript: {
    functionStart: /^(?:export\s+)?(?:async\s+)?function\s+(\w+)|^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[\w$]+|[^=])\s*=>/m,
    classStart: /^(?:export\s+)?class\s+(\w+)/m,
    // Three alternatives, in order: ES `import ... from '...'` / `import '...'`,
    // `const x = require('...')` or a bare `require('...')`, and the original
    // loose `import('...')` / `require '...'` form kept for compatibility.
    importPattern: /^import\s+.*?(?:from\s+['"][^'"]+['"]|['"][^'"]+['"])|^(?:(?:const|let|var)\s+[^=\n]+=\s*)?require\s*\(\s*['"][^'"]+['"]\s*\)|^(?:import|require)\s*\(?\s*['"][^'"]+['"]\)?/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  },
  python: {
    functionStart: /^(?:async\s+)?def\s+(\w+)/m,
    classStart: /^class\s+(\w+)/m,
    importPattern: /^(?:from\s+\S+\s+)?import\s+.+$/gm,
    commentPattern: /'''[\s\S]*?'''|"""[\s\S]*?"""|#.*/g,
    // Python uses indentation: a block ends at the next unindented line.
    blockEnd: /^(?=\S)/m
  },
  go: {
    functionStart: /^func\s+(?:\([^)]+\)\s+)?(\w+)/m,
    classStart: /^type\s+(\w+)\s+struct/m,
    importPattern: /^import\s+(?:\([\s\S]*?\)|"[^"]+")/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  },
  rust: {
    functionStart: /^(?:pub\s+)?(?:async\s+)?fn\s+(\w+)/m,
    classStart: /^(?:pub\s+)?(?:struct|impl|trait)\s+(\w+)/m,
    importPattern: /^use\s+.+;$/gm,
    commentPattern: /\/\*[\s\S]*?\*\/|\/\/.*/g,
    blockEnd: /^}/m
  }
};
|
|
692
|
+
var CodeChunker = class extends BaseChunker {
|
|
693
|
+
strategyType = "code";
|
|
694
|
+
async chunk(text, options) {
|
|
695
|
+
const opts = this.getOptions(options);
|
|
696
|
+
const language = options?.language ?? this.detectLanguage(text);
|
|
697
|
+
const splitBy = options?.splitBy ?? "auto";
|
|
698
|
+
const includeComments = options?.includeComments ?? true;
|
|
699
|
+
const includeImports = options?.includeImports ?? true;
|
|
700
|
+
const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
|
|
701
|
+
const patterns = LANGUAGE_PATTERNS[language] ?? LANGUAGE_PATTERNS.typescript;
|
|
702
|
+
const blocks = this.parseCode(text, patterns, splitBy, includeComments);
|
|
703
|
+
let importBlock = "";
|
|
704
|
+
if (includeImports) {
|
|
705
|
+
const imports = text.match(patterns.importPattern);
|
|
706
|
+
if (imports) {
|
|
707
|
+
importBlock = imports.join("\n") + "\n\n";
|
|
708
|
+
}
|
|
709
|
+
}
|
|
710
|
+
let chunks = [];
|
|
711
|
+
for (const block of blocks) {
|
|
712
|
+
if (block.type === "import") continue;
|
|
713
|
+
const blockContent = includeImports && block.type !== "comment" ? importBlock + block.content : block.content;
|
|
714
|
+
if (tokenCounter(blockContent) <= opts.chunkSize) {
|
|
715
|
+
chunks.push(
|
|
716
|
+
this.createChunk(
|
|
717
|
+
blockContent.trim(),
|
|
718
|
+
chunks.length,
|
|
719
|
+
block.startPosition,
|
|
720
|
+
opts,
|
|
721
|
+
{
|
|
722
|
+
language,
|
|
723
|
+
blockType: block.type,
|
|
724
|
+
blockName: block.name
|
|
725
|
+
}
|
|
726
|
+
)
|
|
727
|
+
);
|
|
728
|
+
} else {
|
|
729
|
+
const subChunks = this.splitLargeBlock(
|
|
730
|
+
block,
|
|
731
|
+
importBlock,
|
|
732
|
+
opts,
|
|
733
|
+
tokenCounter,
|
|
734
|
+
language
|
|
735
|
+
);
|
|
736
|
+
chunks.push(...subChunks);
|
|
737
|
+
}
|
|
738
|
+
}
|
|
739
|
+
chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
|
|
740
|
+
return Promise.resolve(chunks.map((c, i) => ({ ...c, index: i })));
|
|
741
|
+
}
|
|
742
|
+
/**
|
|
743
|
+
* Detect programming language
|
|
744
|
+
*/
|
|
745
|
+
detectLanguage(text) {
|
|
746
|
+
if (text.includes("import type") || text.includes(": string") || text.includes("interface ")) {
|
|
747
|
+
return "typescript";
|
|
748
|
+
}
|
|
749
|
+
if (text.includes("def ") && text.includes(":")) {
|
|
750
|
+
return "python";
|
|
751
|
+
}
|
|
752
|
+
if (text.includes("func ") && text.includes("package ")) {
|
|
753
|
+
return "go";
|
|
754
|
+
}
|
|
755
|
+
if (text.includes("fn ") && (text.includes("let mut") || text.includes("pub fn"))) {
|
|
756
|
+
return "rust";
|
|
757
|
+
}
|
|
758
|
+
if (text.includes("const ") || text.includes("function ") || text.includes("require(")) {
|
|
759
|
+
return "javascript";
|
|
760
|
+
}
|
|
761
|
+
return "typescript";
|
|
762
|
+
}
|
|
763
|
+
/**
|
|
764
|
+
* Parse code into blocks
|
|
765
|
+
*/
|
|
766
|
+
parseCode(text, patterns, splitBy, includeComments) {
|
|
767
|
+
const blocks = [];
|
|
768
|
+
const lines = text.split("\n");
|
|
769
|
+
let currentBlock = null;
|
|
770
|
+
let braceCount = 0;
|
|
771
|
+
let position = 0;
|
|
772
|
+
for (let i = 0; i < lines.length; i++) {
|
|
773
|
+
const line = lines[i];
|
|
774
|
+
const lineStart = position;
|
|
775
|
+
position += line.length + 1;
|
|
776
|
+
if (splitBy === "function" || splitBy === "auto") {
|
|
777
|
+
const funcMatch = line.match(patterns.functionStart);
|
|
778
|
+
if (funcMatch) {
|
|
779
|
+
if (currentBlock) {
|
|
780
|
+
blocks.push(currentBlock);
|
|
781
|
+
}
|
|
782
|
+
currentBlock = {
|
|
783
|
+
type: "function",
|
|
784
|
+
name: funcMatch[1] || funcMatch[2],
|
|
785
|
+
content: line + "\n",
|
|
786
|
+
startPosition: lineStart
|
|
787
|
+
};
|
|
788
|
+
braceCount = (line.match(/{/g) || []).length - (line.match(/}/g) || []).length;
|
|
789
|
+
continue;
|
|
790
|
+
}
|
|
791
|
+
}
|
|
792
|
+
if (splitBy === "class" || splitBy === "auto") {
|
|
793
|
+
const classMatch = line.match(patterns.classStart);
|
|
794
|
+
if (classMatch) {
|
|
795
|
+
if (currentBlock) {
|
|
796
|
+
blocks.push(currentBlock);
|
|
797
|
+
}
|
|
798
|
+
currentBlock = {
|
|
799
|
+
type: "class",
|
|
800
|
+
name: classMatch[1],
|
|
801
|
+
content: line + "\n",
|
|
802
|
+
startPosition: lineStart
|
|
803
|
+
};
|
|
804
|
+
braceCount = (line.match(/{/g) || []).length - (line.match(/}/g) || []).length;
|
|
805
|
+
continue;
|
|
806
|
+
}
|
|
807
|
+
}
|
|
808
|
+
if (currentBlock) {
|
|
809
|
+
currentBlock.content += line + "\n";
|
|
810
|
+
braceCount += (line.match(/{/g) || []).length - (line.match(/}/g) || []).length;
|
|
811
|
+
if (braceCount <= 0) {
|
|
812
|
+
blocks.push(currentBlock);
|
|
813
|
+
currentBlock = null;
|
|
814
|
+
braceCount = 0;
|
|
815
|
+
}
|
|
816
|
+
} else {
|
|
817
|
+
if (line.trim()) {
|
|
818
|
+
currentBlock = {
|
|
819
|
+
type: "other",
|
|
820
|
+
content: line + "\n",
|
|
821
|
+
startPosition: lineStart
|
|
822
|
+
};
|
|
823
|
+
}
|
|
824
|
+
}
|
|
825
|
+
}
|
|
826
|
+
if (currentBlock) {
|
|
827
|
+
blocks.push(currentBlock);
|
|
828
|
+
}
|
|
829
|
+
if (!includeComments) {
|
|
830
|
+
return blocks.map((block) => ({
|
|
831
|
+
...block,
|
|
832
|
+
content: block.content.replace(patterns.commentPattern, "")
|
|
833
|
+
}));
|
|
834
|
+
}
|
|
835
|
+
return blocks;
|
|
836
|
+
}
|
|
837
|
+
/**
|
|
838
|
+
* Split a large code block
|
|
839
|
+
*/
|
|
840
|
+
splitLargeBlock(block, importBlock, options, tokenCounter, language) {
|
|
841
|
+
const chunks = [];
|
|
842
|
+
const lines = block.content.split("\n");
|
|
843
|
+
let currentContent = importBlock;
|
|
844
|
+
let chunkStart = block.startPosition;
|
|
845
|
+
for (const line of lines) {
|
|
846
|
+
const testContent = currentContent + line + "\n";
|
|
847
|
+
if (tokenCounter(testContent) > options.chunkSize && currentContent !== importBlock) {
|
|
848
|
+
chunks.push(
|
|
849
|
+
this.createChunk(
|
|
850
|
+
currentContent.trim(),
|
|
851
|
+
chunks.length,
|
|
852
|
+
chunkStart,
|
|
853
|
+
options,
|
|
854
|
+
{
|
|
855
|
+
language,
|
|
856
|
+
blockType: block.type,
|
|
857
|
+
blockName: block.name,
|
|
858
|
+
partial: true
|
|
859
|
+
}
|
|
860
|
+
)
|
|
861
|
+
);
|
|
862
|
+
currentContent = importBlock + line + "\n";
|
|
863
|
+
chunkStart = block.startPosition + block.content.indexOf(line);
|
|
864
|
+
} else {
|
|
865
|
+
currentContent = testContent;
|
|
866
|
+
}
|
|
867
|
+
}
|
|
868
|
+
if (currentContent.trim() && currentContent !== importBlock.trim()) {
|
|
869
|
+
chunks.push(
|
|
870
|
+
this.createChunk(
|
|
871
|
+
currentContent.trim(),
|
|
872
|
+
chunks.length,
|
|
873
|
+
chunkStart,
|
|
874
|
+
options,
|
|
875
|
+
{
|
|
876
|
+
language,
|
|
877
|
+
blockType: block.type,
|
|
878
|
+
blockName: block.name,
|
|
879
|
+
partial: chunks.length > 0
|
|
880
|
+
}
|
|
881
|
+
)
|
|
882
|
+
);
|
|
883
|
+
}
|
|
884
|
+
return chunks;
|
|
885
|
+
}
|
|
886
|
+
};
|
|
887
|
+
/**
 * Factory returning a new CodeChunker instance on every call.
 *
 * @returns {CodeChunker} A freshly constructed chunker.
 */
function createCodeChunker() {
  const chunker = new CodeChunker();
  return chunker;
}
|
|
890
|
+
|
|
891
|
+
// src/core/EmbeddingModel.ts
|
|
892
|
+
/**
 * Base class for embedding models plus static vector helpers.
 *
 * The instance accessors read from `this.info`, which this class never
 * assigns; it is presumably supplied by subclasses as an object with
 * `dimensions`, `maxTokens`, `maxBatchSize`, `name` and `provider`.
 */
var EmbeddingModel = class {
  /**
   * Get model dimensions
   */
  get dimensions() {
    return this.info.dimensions;
  }
  /**
   * Get max tokens
   */
  get maxTokens() {
    return this.info.maxTokens;
  }
  /**
   * Get max batch size
   */
  get maxBatchSize() {
    return this.info.maxBatchSize;
  }
  /**
   * Get model name
   */
  get name() {
    return this.info.name;
  }
  /**
   * Get provider name
   */
  get provider() {
    return this.info.provider;
  }
  /**
   * Count tokens in text (default implementation: one token per 4 UTF-16
   * code units, rounded up). Subclasses should override for accurate counting.
   * @param {string} text
   * @returns {number}
   */
  countTokens(text) {
    return Math.ceil(text.length / 4);
  }
  /**
   * Check if text exceeds max tokens
   * @param {string} text
   * @returns {boolean}
   */
  exceedsMaxTokens(text) {
    return this.countTokens(text) > this.maxTokens;
  }
  /**
   * Truncate text to max tokens.
   *
   * The cut point is estimated proportionally from the token count, with a
   * 5% safety margin. The text is returned unchanged when it already fits.
   * @param {string} text
   * @returns {string}
   */
  truncateToMaxTokens(text) {
    const tokens = this.countTokens(text);
    if (tokens <= this.maxTokens) {
      return text;
    }
    const ratio = this.maxTokens / tokens;
    let targetLength = Math.floor(text.length * ratio * 0.95);
    // Never end on a high surrogate: slicing between the two halves of a
    // surrogate pair would leave a lone, invalid code unit at the end.
    if (targetLength > 0) {
      const lastUnit = text.charCodeAt(targetLength - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        targetLength -= 1;
      }
    }
    return text.slice(0, targetLength);
  }
  /**
   * Calculate cosine similarity between two vectors.
   *
   * Returns 0 when either vector has zero magnitude. The result is clamped
   * to [-1, 1] because floating-point rounding can otherwise produce values
   * marginally outside that range (and thus negative `1 - similarity`
   * distances for callers).
   * @param {ArrayLike<number>} a
   * @param {ArrayLike<number>} b
   * @returns {number}
   * @throws {Error} When the vectors differ in length.
   */
  static cosineSimilarity(a, b) {
    if (a.length !== b.length) {
      throw new Error(`Vector dimensions mismatch: ${a.length} vs ${b.length}`);
    }
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
    if (magnitude === 0) {
      return 0;
    }
    return Math.max(-1, Math.min(1, dotProduct / magnitude));
  }
  /**
   * Calculate Euclidean distance between two vectors
   * @param {ArrayLike<number>} a
   * @param {ArrayLike<number>} b
   * @returns {number}
   * @throws {Error} When the vectors differ in length.
   */
  static euclideanDistance(a, b) {
    if (a.length !== b.length) {
      throw new Error(`Vector dimensions mismatch: ${a.length} vs ${b.length}`);
    }
    let sum = 0;
    for (let i = 0; i < a.length; i++) {
      const diff = a[i] - b[i];
      sum += diff * diff;
    }
    return Math.sqrt(sum);
  }
  /**
   * Calculate dot product of two vectors
   * @param {ArrayLike<number>} a
   * @param {ArrayLike<number>} b
   * @returns {number}
   * @throws {Error} When the vectors differ in length.
   */
  static dotProduct(a, b) {
    if (a.length !== b.length) {
      throw new Error(`Vector dimensions mismatch: ${a.length} vs ${b.length}`);
    }
    let result = 0;
    for (let i = 0; i < a.length; i++) {
      result += a[i] * b[i];
    }
    return result;
  }
  /**
   * Normalize a vector to unit length.
   *
   * A zero vector cannot be normalized; an unchanged copy is returned instead.
   * @param {number[]} vector
   * @returns {number[]} A new array; the input is not modified.
   */
  static normalize(vector) {
    let norm = 0;
    for (let i = 0; i < vector.length; i++) {
      norm += vector[i] * vector[i];
    }
    norm = Math.sqrt(norm);
    if (norm === 0) {
      return vector.slice();
    }
    return vector.map((v) => v / norm);
  }
  /**
   * Average multiple vectors component-wise.
   * @param {Array<ArrayLike<number>>} vectors
   * @returns {number[]}
   * @throws {Error} When `vectors` is empty or the vectors differ in length.
   */
  static average(vectors) {
    if (vectors.length === 0) {
      throw new Error("Cannot average empty array of vectors");
    }
    const dimensions = vectors[0].length;
    const result = new Array(dimensions).fill(0);
    for (const vector of vectors) {
      if (vector.length !== dimensions) {
        throw new Error(
          `Vector dimensions mismatch: expected ${dimensions}, got ${vector.length}`
        );
      }
      for (let i = 0; i < dimensions; i++) {
        result[i] += vector[i];
      }
    }
    for (let i = 0; i < dimensions; i++) {
      result[i] /= vectors.length;
    }
    return result;
  }
  /**
   * Weighted average of vectors.
   * @param {Array<ArrayLike<number>>} vectors
   * @param {number[]} weights - One weight per vector.
   * @returns {number[]}
   * @throws {Error} When `vectors` is empty, the two arrays differ in length,
   *   the vectors differ in length, or the weights sum to zero.
   */
  static weightedAverage(vectors, weights) {
    if (vectors.length === 0) {
      throw new Error("Cannot average empty array of vectors");
    }
    if (vectors.length !== weights.length) {
      throw new Error("Vectors and weights arrays must have same length");
    }
    const dimensions = vectors[0].length;
    const result = new Array(dimensions).fill(0);
    let totalWeight = 0;
    for (let j = 0; j < vectors.length; j++) {
      const vector = vectors[j];
      const weight = weights[j];
      totalWeight += weight;
      if (vector.length !== dimensions) {
        throw new Error(
          `Vector dimensions mismatch: expected ${dimensions}, got ${vector.length}`
        );
      }
      for (let i = 0; i < dimensions; i++) {
        result[i] += vector[i] * weight;
      }
    }
    if (totalWeight === 0) {
      throw new Error("Total weight cannot be zero");
    }
    for (let i = 0; i < dimensions; i++) {
      result[i] /= totalWeight;
    }
    return result;
  }
};
|
|
1069
|
+
/**
 * In-memory registry of models, keyed by the string "provider:name".
 *
 * One registered key may be marked as the default. The first model ever
 * registered (or the first after the default was removed or cleared) becomes
 * the default automatically.
 */
var ModelRegistry = class {
  models = /* @__PURE__ */ new Map();
  defaultModel = null;
  /**
   * Add (or replace) a model under its "provider:name" key.
   * @param {object} model - Reads `provider` and `name`.
   * @param {boolean} [isDefault=false] - Force this model to be the default.
   */
  register(model, isDefault = false) {
    const registryKey = `${model.provider}:${model.name}`;
    this.models.set(registryKey, model);
    const noDefaultYet = this.defaultModel === null;
    if (isDefault || noDefaultYet) {
      this.defaultModel = registryKey;
    }
  }
  /**
   * Look up a model by provider and name.
   * @returns {object|undefined} The model, or undefined when not registered.
   */
  get(provider, name) {
    const registryKey = `${provider}:${name}`;
    return this.models.get(registryKey);
  }
  /**
   * Look up a model by its full "provider:name" key.
   * @returns {object|undefined}
   */
  getByKey(key) {
    return this.models.get(key);
  }
  /**
   * Return the default model, or undefined when none is set.
   * @returns {object|undefined}
   */
  getDefault() {
    return this.defaultModel === null ? void 0 : this.models.get(this.defaultModel);
  }
  /**
   * Mark an already registered model as the default.
   * @throws {Error} When the model is not in the registry.
   */
  setDefault(provider, name) {
    const registryKey = `${provider}:${name}`;
    const isRegistered = this.models.has(registryKey);
    if (!isRegistered) {
      throw new Error(`Model ${registryKey} not found in registry`);
    }
    this.defaultModel = registryKey;
  }
  /**
   * Collect the `info` of every registered model, in insertion order.
   * @returns {Array<object>}
   */
  list() {
    const infos = [];
    for (const model of this.models.values()) {
      infos.push(model.info);
    }
    return infos;
  }
  /**
   * Tell whether a model is registered under the given provider and name.
   * @returns {boolean}
   */
  has(provider, name) {
    const registryKey = `${provider}:${name}`;
    return this.models.has(registryKey);
  }
  /**
   * Remove a model; the default is unset when it pointed at the removed key.
   * @returns {boolean} True when an entry was actually deleted.
   */
  remove(provider, name) {
    const registryKey = `${provider}:${name}`;
    const wasDefault = this.defaultModel === registryKey;
    if (wasDefault) {
      this.defaultModel = null;
    }
    return this.models.delete(registryKey);
  }
  /**
   * Drop every registered model and unset the default.
   */
  clear() {
    this.models.clear();
    this.defaultModel = null;
  }
};
// Module-level registry instance.
var modelRegistry = new ModelRegistry();
|
|
1144
|
+
|
|
1145
|
+
// src/chunking/SemanticChunker.ts
|
|
1146
|
+
/**
 * Chunker that groups sentences by embedding similarity.
 *
 * Text is split into sentences; when an `embeddingFn` option is supplied,
 * adjacent sentence groups are compared and a chunk boundary is placed where
 * the distance is high. Without an `embeddingFn`, sentences are simply packed
 * up to `chunkSize` tokens.
 *
 * `this.getOptions` and `this.createChunk` are not defined in this class
 * (presumably inherited from BaseChunker).
 */
var SemanticChunker = class extends BaseChunker {
  strategyType = "semantic";
  /**
   * Chunk `text` semantically.
   * @param {string} text
   * @param {object} [options] - Besides the common options, reads
   *   `similarityThreshold` (default 0.5), `breakpointPercentileThreshold`
   *   (default 95), `bufferSize` (default 1) and `embeddingFn`
   *   (called with an array of sentence strings, awaited).
   * @returns {Promise<Array<object>>} The resulting chunks.
   */
  async chunk(text, options) {
    const opts = this.getOptions(options);
    const similarityThreshold = options?.similarityThreshold ?? 0.5;
    const breakpointPercentile = options?.breakpointPercentileThreshold ?? 95;
    const bufferSize = options?.bufferSize ?? 1;
    const embeddingFn = options?.embeddingFn;
    const tokenCounter = opts.tokenCounter ?? defaultTokenCounter;
    const sentences = this.splitSentences(text);
    if (sentences.length === 0) {
      return [];
    }
    if (!embeddingFn) {
      return this.fallbackChunk(sentences, opts, tokenCounter);
    }
    const embeddings = await embeddingFn(sentences.map((s) => s.text));
    const sentencesWithEmbeddings = sentences.map((s, i) => ({
      ...s,
      embedding: embeddings[i]
    }));
    const distances = this.calculateDistances(sentencesWithEmbeddings, bufferSize);
    // A Set gives constant-time membership checks inside the sentence loop.
    const breakpoints = new Set(
      this.findBreakpoints(distances, breakpointPercentile, similarityThreshold)
    );
    let chunks = [];
    let chunkStart = 0;
    let chunkText = "";
    let chunkPosition = sentences[0]?.position ?? 0;
    const lastIndex = sentences.length - 1;
    for (let i = 0; i < sentences.length; i++) {
      chunkText += (chunkText ? " " : "") + sentences[i].text;
      if (!breakpoints.has(i) && i !== lastIndex) {
        continue;
      }
      if (chunkText.trim()) {
        chunks.push(
          this.createChunk(chunkText.trim(), chunks.length, chunkPosition, opts, {
            boundaryType: "semantic",
            sentenceCount: i - chunkStart + 1
          })
        );
      }
      if (i < lastIndex) {
        chunkStart = i + 1;
        chunkText = "";
        chunkPosition = sentences[i + 1].position;
      }
    }
    chunks = mergeSmallChunks(chunks, opts.minChunkSize, tokenCounter);
    chunks = this.splitLargeChunks(chunks, opts.maxChunkSize, tokenCounter);
    return chunks;
  }
  /**
   * Split text into sentences.
   *
   * A sentence is a run of non-terminator characters followed by one or more
   * of `.`, `!`, `?`. Any text left after the last terminator is kept as a
   * final sentence rather than dropped.
   * @param {string} text
   * @returns {Array<{text: string, position: number}>} Trimmed sentences;
   *   `position` is the offset in `text` of the sentence's first
   *   non-whitespace character.
   */
  splitSentences(text) {
    const sentenceRegex = /[^.!?]+[.!?]+/g;
    const sentences = [];
    // Records `raw` (found at offset `rawStart`) unless it is blank; the
    // position is shifted past leading whitespace removed by trimming.
    const addSentence = (raw, rawStart) => {
      const trimmed = raw.trim();
      if (!trimmed) {
        return;
      }
      sentences.push({
        text: trimmed,
        position: rawStart + (raw.length - raw.trimStart().length)
      });
    };
    let consumed = 0;
    let match;
    while ((match = sentenceRegex.exec(text)) !== null) {
      addSentence(match[0], match.index);
      consumed = match.index + match[0].length;
    }
    // Trailing text without terminal punctuation (or the whole input when
    // nothing matched) is still a sentence.
    addSentence(text.slice(consumed), consumed);
    return sentences;
  }
  /**
   * Calculate distances between adjacent sentences.
   *
   * For each gap between sentence i and i + 1, the embeddings of up to
   * `bufferSize` sentences on each side are averaged and compared; the
   * distance is `1 - cosineSimilarity`. A gap with no embeddings on one side
   * gets distance 0.
   * @param {Array<object>} sentences - Items with an optional `embedding`.
   * @param {number} bufferSize
   * @returns {number[]} One distance per gap (`sentences.length - 1` entries).
   */
  calculateDistances(sentences, bufferSize) {
    const embeddingsIn = (from, to) =>
      sentences.slice(from, to).map((s) => s.embedding).filter((e) => e !== void 0);
    const distances = [];
    for (let i = 0; i < sentences.length - 1; i++) {
      const leftStart = Math.max(0, i - bufferSize + 1);
      const rightEnd = Math.min(sentences.length, i + bufferSize + 1);
      const leftEmbeddings = embeddingsIn(leftStart, i + 1);
      const rightEmbeddings = embeddingsIn(i + 1, rightEnd);
      if (leftEmbeddings.length === 0 || rightEmbeddings.length === 0) {
        distances.push(0);
        continue;
      }
      const leftAvg = EmbeddingModel.average(leftEmbeddings);
      const rightAvg = EmbeddingModel.average(rightEmbeddings);
      distances.push(1 - EmbeddingModel.cosineSimilarity(leftAvg, rightAvg));
    }
    return distances;
  }
  /**
   * Find breakpoints based on distance threshold.
   *
   * The threshold is the larger of the requested percentile of the
   * distances and `1 - minThreshold`.
   * @param {number[]} distances
   * @param {number} percentile - 0..100.
   * @param {number} minThreshold - Similarity threshold.
   * @returns {number[]} Indices i whose distance is at or above the threshold.
   */
  findBreakpoints(distances, percentile, minThreshold) {
    if (distances.length === 0) return [];
    const sortedDistances = [...distances].sort((a, b) => a - b);
    const percentileIndex = Math.floor(percentile / 100 * sortedDistances.length);
    // An index past the end (e.g. percentile 100) falls back to the maximum.
    const percentileThreshold = sortedDistances[percentileIndex] ?? sortedDistances[sortedDistances.length - 1];
    const threshold = Math.max(percentileThreshold, 1 - minThreshold);
    const breakpoints = [];
    distances.forEach((distance, i) => {
      if (distance >= threshold) {
        breakpoints.push(i);
      }
    });
    return breakpoints;
  }
  /**
   * Fallback chunking when no embedding function is available: pack
   * consecutive sentences until `options.chunkSize` tokens would be exceeded.
   * @param {Array<{text: string, position: number}>} sentences
   * @param {object} options - Reads `chunkSize`; passed on to `createChunk`.
   * @param {function(string): number} tokenCounter
   * @returns {Array<object>}
   */
  fallbackChunk(sentences, options, tokenCounter) {
    const chunks = [];
    let currentText = "";
    let chunkPosition = sentences[0]?.position ?? 0;
    const flush = () => {
      chunks.push(
        this.createChunk(currentText.trim(), chunks.length, chunkPosition, options, {
          boundaryType: "sentence"
        })
      );
    };
    for (const sentence of sentences) {
      const testText = currentText ? currentText + " " + sentence.text : sentence.text;
      if (tokenCounter(testText) > options.chunkSize && currentText) {
        flush();
        currentText = sentence.text;
        chunkPosition = sentence.position;
      } else {
        currentText = testText;
      }
    }
    if (currentText.trim()) {
      flush();
    }
    return chunks;
  }
  /**
   * Split chunks that are too large.
   *
   * Chunks above `maxTokens` are re-split on sentence boundaries; the pieces
   * copy the original chunk's fields and get an id of the form
   * `<original id>_<position>`. Every returned chunk's `index` equals its
   * position in the returned array.
   * @param {Array<object>} chunks
   * @param {number} maxTokens
   * @param {function(string): number} tokenCounter
   * @returns {Array<object>}
   */
  splitLargeChunks(chunks, maxTokens, tokenCounter) {
    const result = [];
    for (const chunk2 of chunks) {
      if (chunk2.tokenCount <= maxTokens) {
        // Earlier splits shift positions, so renumber unsplit chunks too
        // instead of leaving stale or duplicate indices behind.
        result.push(chunk2.index === result.length ? chunk2 : { ...chunk2, index: result.length });
        continue;
      }
      const emitPiece = (pieceText, pieceStart) => {
        result.push({
          ...chunk2,
          id: chunk2.id + "_" + result.length,
          text: pieceText.trim(),
          startPosition: pieceStart,
          endPosition: pieceStart + pieceText.length,
          tokenCount: tokenCounter(pieceText),
          charCount: pieceText.length,
          index: result.length
        });
      };
      let currentText = "";
      let currentStart = chunk2.startPosition;
      for (const sentence of this.splitSentences(chunk2.text)) {
        const testText = currentText ? currentText + " " + sentence.text : sentence.text;
        if (tokenCounter(testText) > maxTokens && currentText) {
          emitPiece(currentText, currentStart);
          currentText = sentence.text;
          currentStart = chunk2.startPosition + sentence.position;
        } else {
          currentText = testText;
        }
      }
      if (currentText.trim()) {
        emitPiece(currentText, currentStart);
      }
    }
    return result;
  }
};
|
|
1361
|
+
/**
 * Factory returning a new SemanticChunker instance on every call.
 *
 * @returns {SemanticChunker} A freshly constructed chunker.
 */
function createSemanticChunker() {
  const chunker = new SemanticChunker();
  return chunker;
}
|
|
1364
|
+
|
|
1365
|
+
// src/chunking/index.ts
|
|
1366
|
+
/**
 * Create a chunker for the given strategy name.
 *
 * "sentence" and "paragraph" currently map to FixedChunker; any unrecognized
 * strategy falls back to RecursiveChunker.
 *
 * @param {string} strategy - "fixed", "recursive", "markdown", "code",
 *   "semantic", "sentence" or "paragraph".
 * @returns {object} A new chunker instance.
 */
function createChunker(strategy) {
  const makeRecursive = () => new RecursiveChunker();
  const makeFixed = () => new FixedChunker();
  const factories = new Map([
    ["fixed", makeFixed],
    ["recursive", makeRecursive],
    ["markdown", () => new MarkdownChunker()],
    ["code", () => new CodeChunker()],
    ["semantic", () => new SemanticChunker()],
    ["sentence", makeFixed],
    ["paragraph", makeFixed]
  ]);
  const build = factories.get(strategy) ?? makeRecursive;
  return build();
}
|
|
1386
|
+
/**
 * Convenience wrapper: build a chunker for `strategy` and chunk `text` with it.
 *
 * @param {string} text - Text to chunk.
 * @param {string} [strategy="recursive"] - Strategy name passed to createChunker.
 * @param {object} [options] - Forwarded unchanged to the chunker's `chunk`.
 * @returns {Promise<*>} Resolves with whatever the chunker's `chunk` yields.
 */
async function chunk(text, strategy = "recursive", options) {
  return createChunker(strategy).chunk(text, options);
}
|
|
1390
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
1391
|
+
0 && (module.exports = {
|
|
1392
|
+
BaseChunker,
|
|
1393
|
+
CodeChunker,
|
|
1394
|
+
FixedChunker,
|
|
1395
|
+
MarkdownChunker,
|
|
1396
|
+
RecursiveChunker,
|
|
1397
|
+
SemanticChunker,
|
|
1398
|
+
chunk,
|
|
1399
|
+
createChunker,
|
|
1400
|
+
createCodeChunker,
|
|
1401
|
+
createFixedChunker,
|
|
1402
|
+
createMarkdownChunker,
|
|
1403
|
+
createRecursiveChunker,
|
|
1404
|
+
createSemanticChunker,
|
|
1405
|
+
defaultTokenCounter,
|
|
1406
|
+
mergeSmallChunks,
|
|
1407
|
+
splitLargeChunks
|
|
1408
|
+
});
|