vectra 0.12.2 → 0.12.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (73)
  1. package/LICENSE +1 -1
  2. package/README.draft.md +499 -0
  3. package/README.draft.outline.md +160 -0
  4. package/README.research.md +2159 -0
  5. package/bin/vectra.js +3 -0
  6. package/lib/FileFetcher.d.ts +5 -0
  7. package/lib/FileFetcher.d.ts.map +1 -0
  8. package/lib/FileFetcher.js +79 -0
  9. package/lib/FileFetcher.js.map +1 -0
  10. package/lib/GPT3Tokenizer.d.ts +9 -0
  11. package/lib/ItemSelector.d.ts +41 -0
  12. package/lib/ItemSelector.d.ts.map +1 -0
  13. package/lib/ItemSelector.js +168 -0
  14. package/lib/ItemSelector.js.map +1 -0
  15. package/lib/LocalDocument.d.ts +54 -0
  16. package/lib/LocalDocument.js +156 -0
  17. package/lib/LocalDocument.js.map +1 -0
  18. package/lib/LocalDocumentIndex.d.ts +132 -0
  19. package/lib/LocalDocumentIndex.js +456 -0
  20. package/lib/LocalDocumentIndex.js.map +1 -0
  21. package/lib/LocalDocumentResult.d.ts +45 -0
  22. package/lib/LocalDocumentResult.js +328 -0
  23. package/lib/LocalDocumentResult.js.map +1 -0
  24. package/lib/LocalIndex.d.ts +150 -0
  25. package/lib/LocalIndex.d.ts.map +1 -1
  26. package/lib/LocalIndex.js +515 -0
  27. package/lib/LocalIndex.js.map +1 -0
  28. package/lib/LocalIndex.spec.d.ts +2 -0
  29. package/lib/LocalIndex.spec.js +218 -7
  30. package/lib/LocalIndex.spec.js.map +1 -1
  31. package/lib/OpenAIEmbeddings.d.ts +126 -0
  32. package/lib/OpenAIEmbeddings.d.ts.map +1 -0
  33. package/lib/OpenAIEmbeddings.js +174 -0
  34. package/lib/OpenAIEmbeddings.js.map +1 -0
  35. package/lib/TextSplitter.d.ts +19 -0
  36. package/lib/TextSplitter.d.ts.map +1 -1
  37. package/lib/TextSplitter.js +457 -0
  38. package/lib/TextSplitter.js.map +1 -0
  39. package/lib/TextSplitter.spec.d.ts +2 -0
  40. package/lib/TextSplitter.spec.d.ts.map +1 -0
  41. package/lib/TextSplitter.spec.js +109 -0
  42. package/lib/TextSplitter.spec.js.map +1 -0
  43. package/lib/WebFetcher.d.ts +15 -0
  44. package/lib/WebFetcher.d.ts.map +1 -0
  45. package/lib/WebFetcher.js +234 -0
  46. package/lib/WebFetcher.js.map +1 -0
  47. package/lib/index.d.ts +12 -0
  48. package/lib/index.js +28 -0
  49. package/lib/index.js.map +1 -0
  50. package/lib/internals/Colorize.d.ts +14 -0
  51. package/lib/internals/Colorize.d.ts.map +1 -0
  52. package/lib/internals/Colorize.js +64 -0
  53. package/lib/internals/Colorize.js.map +1 -0
  54. package/lib/internals/index.d.ts +3 -0
  55. package/lib/internals/index.d.ts.map +1 -0
  56. package/lib/internals/index.js +19 -0
  57. package/lib/internals/index.js.map +1 -0
  58. package/lib/internals/types.d.ts +43 -0
  59. package/lib/internals/types.d.ts.map +1 -0
  60. package/lib/internals/types.js +3 -0
  61. package/lib/internals/types.js.map +1 -0
  62. package/lib/types.d.ts +146 -0
  63. package/lib/types.d.ts.map +1 -0
  64. package/lib/types.js +3 -0
  65. package/lib/types.js.map +1 -0
  66. package/lib/vectra-cli.d.ts +2 -0
  67. package/lib/vectra-cli.js +323 -0
  68. package/lib/vectra-cli.js.map +1 -0
  69. package/package.json +3 -1
  70. package/src/LocalIndex.spec.ts +265 -8
  71. package/src/LocalIndex.ts +1 -0
  72. package/src/TextSplitter.spec.ts +87 -0
  73. package/src/TextSplitter.ts +459 -531
@@ -4,558 +4,486 @@ import { TextChunk, Tokenizer } from "./types";
4
4
  const ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
5
5
 
6
6
  export interface TextSplitterConfig {
7
- separators: string[];
8
- keepSeparators: boolean;
9
- chunkSize: number;
10
- chunkOverlap: number;
11
- tokenizer: Tokenizer;
12
- docType?: string;
7
+ separators: string[];
8
+ keepSeparators: boolean;
9
+ chunkSize: number;
10
+ chunkOverlap: number;
11
+ tokenizer: Tokenizer;
12
+ docType?: string;
13
13
  }
14
14
 
15
15
  export class TextSplitter {
16
- private readonly _config: TextSplitterConfig;
17
-
18
- public constructor(config?: Partial<TextSplitterConfig>) {
19
- this._config = Object.assign({
20
- keepSeparators: false,
21
- chunkSize: 400,
22
- chunkOverlap: 40,
23
- } as TextSplitterConfig, config);
24
-
25
- // Create a default tokenizer if none is provided
26
- if (!this._config.tokenizer) {
27
- this._config.tokenizer = new GPT3Tokenizer();
28
- }
16
+ private readonly _config: TextSplitterConfig;
29
17
 
30
- // Use default separators if none are provided
31
- if (!this._config.separators || this._config.separators.length === 0) {
32
- this._config.separators = this.getSeparators(this._config.docType);
33
- }
18
+ public constructor(config?: Partial<TextSplitterConfig>) {
19
+ this._config = Object.assign({
20
+ keepSeparators: false,
21
+ chunkSize: 400,
22
+ chunkOverlap: 40,
23
+ } as TextSplitterConfig, config);
34
24
 
35
- // Validate the config settings
36
- if (this._config.chunkSize < 1) {
37
- throw new Error("chunkSize must be >= 1");
38
- } else if (this._config.chunkOverlap < 0) {
39
- throw new Error("chunkOverlap must be >= 0");
40
- } else if (this._config.chunkOverlap > this._config.chunkSize) {
41
- throw new Error("chunkOverlap must be <= chunkSize");
42
- }
25
+ // Create a default tokenizer if none is provided
26
+ if (!this._config.tokenizer) {
27
+ this._config.tokenizer = new GPT3Tokenizer();
43
28
  }
44
29
 
45
- public split(text: string): TextChunk[] {
46
- // Get basic chunks
47
- const chunks = this.recursiveSplit(text, this._config.separators, 0);
48
-
49
- const that = this;
50
- function getOverlapTokens(tokens?: number[]): number[] {
51
- if (tokens != undefined) {
52
- const len = tokens.length > that._config.chunkOverlap ? that._config.chunkOverlap : tokens.length;
53
- return tokens.slice(0, len);
54
- } else {
55
- return [];
56
- }
57
- }
30
+ // Use default separators if none are provided
31
+ if (!this._config.separators || this._config.separators.length === 0) {
32
+ this._config.separators = this.getSeparators(this._config.docType);
33
+ }
58
34
 
59
- // Add overlap tokens and text to the start and end of each chunk
60
- if (this._config.chunkOverlap > 0) {
61
- for (let i = 1; i < chunks.length; i++) {
62
- const previousChunk = chunks[i - 1];
63
- const chunk = chunks[i];
64
- const nextChunk = i < chunks.length - 1 ? chunks[i + 1] : undefined;
65
- chunk.startOverlap = getOverlapTokens(previousChunk.tokens.reverse()).reverse();
66
- chunk.endOverlap = getOverlapTokens(nextChunk?.tokens);
67
- }
68
- }
35
+ // Validate the config settings
36
+ if (this._config.chunkSize < 1) {
37
+ throw new Error("chunkSize must be >= 1");
38
+ } else if (this._config.chunkOverlap < 0) {
39
+ throw new Error("chunkOverlap must be >= 0");
40
+ } else if (this._config.chunkOverlap > this._config.chunkSize) {
41
+ throw new Error("chunkOverlap must be <= chunkSize");
42
+ }
43
+ }
44
+
45
+ public split(text: string): TextChunk[] {
46
+ // Get basic chunks
47
+ const chunks = this.recursiveSplit(text, this._config.separators, 0);
69
48
 
70
- return chunks;
49
+ const that = this;
50
+ function getOverlapTokens(tokens?: number[]): number[] {
51
+ if (tokens != undefined) {
52
+ const len = tokens.length > that._config.chunkOverlap ? that._config.chunkOverlap : tokens.length;
53
+ return tokens.slice(0, len);
54
+ } else {
55
+ return [];
56
+ }
71
57
  }
72
58
 
73
- private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
74
- const chunks: TextChunk[] = [];
75
- if (text.length > 0) {
76
- // Split text into parts
77
- let parts: string[];
78
- let separator = '';
79
- const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
80
- if (separators.length > 0) {
81
- // Split by separator
82
- separator = separators[0];
83
- parts = separator == ' ' ? this.splitBySpaces(text) : text.split(separator);
84
- } else {
85
- // Cut text in half
86
- const half = Math.floor(text.length / 2);
87
- parts = [text.substring(0, half), text.substring(half)];
88
- }
89
-
90
- // Iterate over parts
91
- for (let i = 0; i < parts.length; i++) {
92
- const lastChunk = (i === parts.length - 1);
93
-
94
- // Get chunk text and endPos
95
- let chunk = parts[i];
96
- const endPos = (startPos + (chunk.length - 1)) + (lastChunk ? 0 : separator.length);
97
- if (this._config.keepSeparators && !lastChunk) {
98
- chunk += separator;
99
- }
100
-
101
- // Ensure chunk contains text
102
- if (!this.containsAlphanumeric(chunk)) {
103
- continue;
104
- }
105
-
106
- // Optimization to avoid encoding really large chunks
107
- if (chunk.length / 6 > this._config.chunkSize) {
108
- // Break the text into smaller chunks
109
- const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
110
- chunks.push(...subChunks);
111
- } else {
112
- // Encode chunk text
113
- const tokens = this._config.tokenizer.encode(chunk);
114
- if (tokens.length > this._config.chunkSize) {
115
- // Break the text into smaller chunks
116
- const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
117
- chunks.push(...subChunks);
118
- } else {
119
- // Append chunk to output
120
- chunks.push({
121
- text: chunk,
122
- tokens: tokens,
123
- startPos: startPos,
124
- endPos: endPos,
125
- startOverlap: [],
126
- endOverlap: [],
127
- });
128
- }
129
-
130
- }
131
-
132
-
133
- // Update startPos
134
- startPos = endPos + 1;
135
- }
136
- }
59
+ // Add overlap tokens and text to the start and end of each chunk
60
+ if (this._config.chunkOverlap > 0) {
61
+ for (let i = 1; i < chunks.length; i++) {
62
+ const previousChunk = chunks[i - 1];
63
+ const chunk = chunks[i];
64
+ const nextChunk = i < chunks.length - 1 ? chunks[i + 1] : undefined;
137
65
 
138
- return this.combineChunks(chunks);
66
+ // Use copies to avoid reversing in place (preserve token order in previous chunks)
67
+ const prevTokensCopy = previousChunk.tokens.slice();
68
+ chunk.startOverlap = getOverlapTokens(prevTokensCopy.reverse()).reverse();
69
+ chunk.endOverlap = getOverlapTokens(nextChunk?.tokens);
70
+ }
139
71
  }
140
72
 
141
- private combineChunks(chunks: TextChunk[]): TextChunk[] {
142
- const combinedChunks: TextChunk[] = [];
143
- let currentChunk: TextChunk|undefined;
144
- let currentLength = 0;
145
- const separator = this._config.keepSeparators ? '' : ' ';
146
- for (let i = 0; i < chunks.length; i++) {
147
- const chunk = chunks[i];
148
- if (currentChunk) {
149
- const length = currentChunk.tokens.length + chunk.tokens.length;
150
- if (length > this._config.chunkSize) {
151
- combinedChunks.push(currentChunk);
152
- currentChunk = chunk;
153
- currentLength = chunk.tokens.length;
154
- } else {
155
- currentChunk.text += separator + chunk.text;
156
- currentChunk.endPos = chunk.endPos;
157
- currentChunk.tokens.push(...chunk.tokens);
158
- currentLength += chunk.tokens.length;
159
- }
160
- } else {
161
- currentChunk = chunk;
162
- currentLength = chunk.tokens.length;
163
- }
73
+ return chunks;
74
+ }
75
+
76
+ private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
77
+ const chunks: TextChunk[] = [];
78
+
79
+ if (text.length > 0) {
80
+ // Split text into parts
81
+ let parts: string[];
82
+ let separator = '';
83
+ const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
84
+
85
+ if (separators.length > 0) {
86
+ // Split by separator
87
+ separator = separators[0];
88
+ parts = separator == ' ' ? this.splitBySpaces(text) : text.split(separator);
89
+ } else {
90
+ // Cut text in half
91
+ const half = Math.floor(text.length / 2);
92
+ parts = [text.substring(0, half), text.substring(half)];
93
+ }
94
+
95
+ // Iterate over parts
96
+ for (let i = 0; i < parts.length; i++) {
97
+ const lastChunk = (i === parts.length - 1);
98
+
99
+ // Get chunk text and endPos
100
+ let chunk = parts[i];
101
+ const endPos = (startPos + (chunk.length - 1)) + (lastChunk ? 0 : separator.length);
102
+
103
+ if (this._config.keepSeparators && !lastChunk) {
104
+ chunk += separator;
105
+ }
106
+
107
+ // Keep chunks that contain any non-whitespace; drop whitespace-only
108
+ if (!/\S/.test(chunk)) {
109
+ // drop whitespace-only chunks
110
+ startPos = endPos + 1;
111
+ continue;
164
112
  }
165
- if (currentChunk) {
166
- combinedChunks.push(currentChunk);
113
+
114
+ // Optimization to avoid encoding really large chunks
115
+ if (chunk.length / 6 > this._config.chunkSize) {
116
+ // Break the text into smaller chunks
117
+ const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
118
+ chunks.push(...subChunks);
119
+ } else {
120
+ // Encode chunk text
121
+ const tokens = this._config.tokenizer.encode(chunk);
122
+ if (tokens.length > this._config.chunkSize) {
123
+ // Break the text into smaller chunks
124
+ const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
125
+ chunks.push(...subChunks);
126
+ } else {
127
+ // Append chunk to output
128
+ chunks.push({
129
+ text: chunk,
130
+ tokens: tokens,
131
+ startPos: startPos,
132
+ endPos: endPos,
133
+ startOverlap: [],
134
+ endOverlap: [],
135
+ });
136
+ }
167
137
  }
168
- return combinedChunks;
138
+
139
+ // Update startPos
140
+ startPos = endPos + 1;
141
+ }
169
142
  }
170
143
 
171
- private containsAlphanumeric(text: string): boolean {
172
- for (let i = 0; i < text.length; i++) {
173
- if (ALPHANUMERIC_CHARS.includes(text[i])) {
174
- return true;
175
- }
176
- }
177
- return false;
144
+ return this.combineChunks(chunks);
145
+ }
146
+
147
+ private combineChunks(chunks: TextChunk[]): TextChunk[] {
148
+ const combinedChunks: TextChunk[] = [];
149
+ let currentChunk: TextChunk | undefined;
150
+ let currentLength = 0;
151
+
152
+ // When not keeping separators, we previously inserted a space between merged chunks.
153
+ // We will still use a space for normal merges, but we will prevent merging punctuation-only
154
+ // separator chunks (e.g., '---', '***', '====') to preserve them as standalone.
155
+ const separator = this._config.keepSeparators ? '' : ' ';
156
+
157
+ const isWhitespaceOnly = (t: string) => !/\S/.test(t);
158
+ const isPunctuationOnly = (t: string) => /\S/.test(t) && !/[a-zA-Z0-9]/.test(t);
159
+
160
+ for (let i = 0; i < chunks.length; i++) {
161
+ const chunk = chunks[i];
162
+
163
+ if (!currentChunk) {
164
+ currentChunk = chunk;
165
+ currentLength = chunk.tokens.length;
166
+ continue;
167
+ }
168
+
169
+ // If either the current or next chunk is punctuation-only (non-whitespace, no alphanumeric),
170
+ // do not merge; keep them as separate chunks to preserve separators like '---'.
171
+ if (isPunctuationOnly(currentChunk.text) || isPunctuationOnly(chunk.text)) {
172
+ combinedChunks.push(currentChunk);
173
+ currentChunk = chunk;
174
+ currentLength = chunk.tokens.length;
175
+ continue;
176
+ }
177
+
178
+ // Normal merge path constrained by token budget
179
+ const length = currentChunk.tokens.length + chunk.tokens.length;
180
+ if (length > this._config.chunkSize) {
181
+ combinedChunks.push(currentChunk);
182
+ currentChunk = chunk;
183
+ currentLength = chunk.tokens.length;
184
+ } else {
185
+ // Only insert separator if neither chunk is whitespace-only (defensive)
186
+ const joiner = (!this._config.keepSeparators && !isWhitespaceOnly(currentChunk.text) && !isWhitespaceOnly(chunk.text)) ? separator : '';
187
+ currentChunk.text += joiner + chunk.text;
188
+ currentChunk.endPos = chunk.endPos;
189
+ currentChunk.tokens.push(...chunk.tokens);
190
+ currentLength += chunk.tokens.length;
191
+ }
178
192
  }
179
193
 
180
- private splitBySpaces(text: string): string[] {
181
- // Split text by tokens and return parts
182
- const parts: string[] = [];
183
- let tokens = this._config.tokenizer.encode(text);
184
- do {
185
- if (tokens.length <= this._config.chunkSize) {
186
- parts.push(this._config.tokenizer.decode(tokens));
187
- break;
188
- } else {
189
- const span = tokens.splice(0, this._config.chunkSize);
190
- parts.push(this._config.tokenizer.decode(span));
191
- }
192
- } while (true);
193
-
194
- return parts;
194
+ if (currentChunk) {
195
+ combinedChunks.push(currentChunk);
195
196
  }
196
197
 
197
- private getSeparators(docType?: string): string[] {
198
- switch (docType ?? '') {
199
- case "cpp":
200
- return [
201
- // Split along class definitions
202
- "\nclass ",
203
- // Split along function definitions
204
- "\nvoid ",
205
- "\nint ",
206
- "\nfloat ",
207
- "\ndouble ",
208
- // Split along control flow statements
209
- "\nif ",
210
- "\nfor ",
211
- "\nwhile ",
212
- "\nswitch ",
213
- "\ncase ",
214
- // Split by the normal type of lines
215
- "\n\n",
216
- "\n",
217
- " "
218
- ];
219
- case "go":
220
- return [
221
- // Split along function definitions
222
- "\nfunc ",
223
- "\nvar ",
224
- "\nconst ",
225
- "\ntype ",
226
- // Split along control flow statements
227
- "\nif ",
228
- "\nfor ",
229
- "\nswitch ",
230
- "\ncase ",
231
- // Split by the normal type of lines
232
- "\n\n",
233
- "\n",
234
- " "
235
- ];
236
- case "java":
237
- case "c#":
238
- case "csharp":
239
- case "cs":
240
- case "ts":
241
- case "tsx":
242
- case "typescript":
243
- return [
244
- // split along regions
245
- "// LLM-REGION",
246
- "/* LLM-REGION",
247
- "/** LLM-REGION",
248
- // Split along class definitions
249
- "\nclass ",
250
- // Split along method definitions
251
- "\npublic ",
252
- "\nprotected ",
253
- "\nprivate ",
254
- "\nstatic ",
255
- // Split along control flow statements
256
- "\nif ",
257
- "\nfor ",
258
- "\nwhile ",
259
- "\nswitch ",
260
- "\ncase ",
261
- // Split by the normal type of lines
262
- "\n\n",
263
- "\n",
264
- " "
265
- ];
266
- case "js":
267
- case "jsx":
268
- case "javascript":
269
- return [
270
- // split along regions
271
- "// LLM-REGION",
272
- "/* LLM-REGION",
273
- "/** LLM-REGION",
274
- // Split along class definitions
275
- "\nclass ",
276
- // Split along function definitions
277
- "\nfunction ",
278
- "\nconst ",
279
- "\nlet ",
280
- "\nvar ",
281
- "\nclass ",
282
- // Split along control flow statements
283
- "\nif ",
284
- "\nfor ",
285
- "\nwhile ",
286
- "\nswitch ",
287
- "\ncase ",
288
- "\ndefault ",
289
- // Split by the normal type of lines
290
- "\n\n",
291
- "\n",
292
- " "
293
- ];
294
- case "php":
295
- return [
296
- // Split along function definitions
297
- "\nfunction ",
298
- // Split along class definitions
299
- "\nclass ",
300
- // Split along control flow statements
301
- "\nif ",
302
- "\nforeach ",
303
- "\nwhile ",
304
- "\ndo ",
305
- "\nswitch ",
306
- "\ncase ",
307
- // Split by the normal type of lines
308
- "\n\n",
309
- "\n",
310
- " "
311
- ];
312
- case "proto":
313
- return [
314
- // Split along message definitions
315
- "\nmessage ",
316
- // Split along service definitions
317
- "\nservice ",
318
- // Split along enum definitions
319
- "\nenum ",
320
- // Split along option definitions
321
- "\noption ",
322
- // Split along import statements
323
- "\nimport ",
324
- // Split along syntax declarations
325
- "\nsyntax ",
326
- // Split by the normal type of lines
327
- "\n\n",
328
- "\n",
329
- " "
330
- ];
331
- case "python":
332
- case "py":
333
- return [
334
- // First, try to split along class definitions
335
- "\nclass ",
336
- "\ndef ",
337
- "\n\tdef ",
338
- // Now split by the normal type of lines
339
- "\n\n",
340
- "\n",
341
- " "
342
- ];
343
- case "rst":
344
- return [
345
- // Split along section titles
346
- "\n===\n",
347
- "\n---\n",
348
- "\n***\n",
349
- // Split along directive markers
350
- "\n.. ",
351
- // Split by the normal type of lines
352
- "\n\n",
353
- "\n",
354
- " "
355
- ];
356
- case "ruby":
357
- return [
358
- // Split along method definitions
359
- "\ndef ",
360
- "\nclass ",
361
- // Split along control flow statements
362
- "\nif ",
363
- "\nunless ",
364
- "\nwhile ",
365
- "\nfor ",
366
- "\ndo ",
367
- "\nbegin ",
368
- "\nrescue ",
369
- // Split by the normal type of lines
370
- "\n\n",
371
- "\n",
372
- " "
373
- ];
374
- case "rust":
375
- return [
376
- // Split along function definitions
377
- "\nfn ",
378
- "\nconst ",
379
- "\nlet ",
380
- // Split along control flow statements
381
- "\nif ",
382
- "\nwhile ",
383
- "\nfor ",
384
- "\nloop ",
385
- "\nmatch ",
386
- "\nconst ",
387
- // Split by the normal type of lines
388
- "\n\n",
389
- "\n",
390
- " "
391
- ];
392
- case "scala":
393
- return [
394
- // Split along class definitions
395
- "\nclass ",
396
- "\nobject ",
397
- // Split along method definitions
398
- "\ndef ",
399
- "\nval ",
400
- "\nvar ",
401
- // Split along control flow statements
402
- "\nif ",
403
- "\nfor ",
404
- "\nwhile ",
405
- "\nmatch ",
406
- "\ncase ",
407
- // Split by the normal type of lines
408
- "\n\n",
409
- "\n",
410
- " "
411
- ];
412
- case "swift":
413
- return [
414
- // Split along function definitions
415
- "\nfunc ",
416
- // Split along class definitions
417
- "\nclass ",
418
- "\nstruct ",
419
- "\nenum ",
420
- // Split along control flow statements
421
- "\nif ",
422
- "\nfor ",
423
- "\nwhile ",
424
- "\ndo ",
425
- "\nswitch ",
426
- "\ncase ",
427
- // Split by the normal type of lines
428
- "\n\n",
429
- "\n",
430
- " "
431
- ];
432
- case "md":
433
- case "markdown":
434
- return [
435
- // First, try to split along Markdown headings (starting with level 2)
436
- "\n## ",
437
- "\n### ",
438
- "\n#### ",
439
- "\n##### ",
440
- "\n###### ",
441
- // Note the alternative syntax for headings (below) is not handled here
442
- // Heading level 2
443
- // ---------------
444
- // End of code block
445
- "```\n\n",
446
- // Horizontal lines
447
- "\n\n***\n\n",
448
- "\n\n---\n\n",
449
- "\n\n___\n\n",
450
- // Note that this splitter doesn't handle horizontal lines defined
451
- // by *three or more* of ***, ---, or ___, but this is not handled
452
- // Github tables
453
- "<table>",
454
- // "<tr>",
455
- // "<td>",
456
- // "<td ",
457
- "\n\n",
458
- "\n",
459
- " "
460
- ];
461
- case "latex":
462
- return [
463
- // First, try to split along Latex sections
464
- "\n\\chapter{",
465
- "\n\\section{",
466
- "\n\\subsection{",
467
- "\n\\subsubsection{",
468
-
469
- // Now split by environments
470
- "\n\\begin{enumerate}",
471
- "\n\\begin{itemize}",
472
- "\n\\begin{description}",
473
- "\n\\begin{list}",
474
- "\n\\begin{quote}",
475
- "\n\\begin{quotation}",
476
- "\n\\begin{verse}",
477
- "\n\\begin{verbatim}",
478
-
479
- // Now split by math environments
480
- "\n\\begin{align}",
481
- "$$",
482
- "$",
483
-
484
- // Now split by the normal type of lines
485
- "\n\n",
486
- "\n",
487
- " "
488
- ];
489
- case "html":
490
- return [
491
- // First, try to split along HTML tags
492
- "<body>",
493
- "<div>",
494
- "<p>",
495
- "<br>",
496
- "<li>",
497
- "<h1>",
498
- "<h2>",
499
- "<h3>",
500
- "<h4>",
501
- "<h5>",
502
- "<h6>",
503
- "<span>",
504
- "<table>",
505
- "<tr>",
506
- "<td>",
507
- "<th>",
508
- "<ul>",
509
- "<ol>",
510
- "<header>",
511
- "<footer>",
512
- "<nav>",
513
- // Head
514
- "<head>",
515
- "<style>",
516
- "<script>",
517
- "<meta>",
518
- "<title>",
519
- // Normal type of lines
520
- " "
521
- ];
522
- case "sol":
523
- return [
524
- // Split along compiler informations definitions
525
- "\npragma ",
526
- "\nusing ",
527
- // Split along contract definitions
528
- "\ncontract ",
529
- "\ninterface ",
530
- "\nlibrary ",
531
- // Split along method definitions
532
- "\nconstructor ",
533
- "\ntype ",
534
- "\nfunction ",
535
- "\nevent ",
536
- "\nmodifier ",
537
- "\nerror ",
538
- "\nstruct ",
539
- "\nenum ",
540
- // Split along control flow statements
541
- "\nif ",
542
- "\nfor ",
543
- "\nwhile ",
544
- "\ndo while ",
545
- "\nassembly ",
546
- // Split by the normal type of lines
547
- "\n\n",
548
- "\n",
549
- " "
550
- ];
551
- default:
552
- return [
553
- // Split by the normal type of lines
554
- "\n\n",
555
- "\n",
556
- " ",
557
- "",
558
- ];
559
- }
198
+ return combinedChunks;
199
+ }
200
+
201
+ private splitBySpaces(text: string): string[] {
202
+ // Split text by tokens and return parts
203
+ const parts: string[] = [];
204
+ let tokens = this._config.tokenizer.encode(text);
205
+
206
+ do {
207
+ if (tokens.length <= this._config.chunkSize) {
208
+ parts.push(this._config.tokenizer.decode(tokens));
209
+ break;
210
+ } else {
211
+ const span = tokens.splice(0, this._config.chunkSize);
212
+ parts.push(this._config.tokenizer.decode(span));
213
+ }
214
+ } while (true);
215
+
216
+ return parts;
217
+ }
218
+
219
+ private getSeparators(docType?: string): string[] {
220
+ switch (docType ?? '') {
221
+ case "cpp":
222
+ return [
223
+ "\nclass ",
224
+ "\nvoid ",
225
+ "\nint ",
226
+ "\nfloat ",
227
+ "\ndouble ",
228
+ "\nif ",
229
+ "\nfor ",
230
+ "\nwhile ",
231
+ "\nswitch ",
232
+ "\ncase ",
233
+ "\n\n",
234
+ "\n",
235
+ ];
236
+ case "go":
237
+ return [
238
+ "\nfunc ",
239
+ "\nvar ",
240
+ "\nconst ",
241
+ "\ntype ",
242
+ "\nif ",
243
+ "\nfor ",
244
+ "\nswitch ",
245
+ "\ncase ",
246
+ "\n\n",
247
+ "\n",
248
+ ];
249
+ case "java":
250
+ case "c#":
251
+ case "csharp":
252
+ case "cs":
253
+ case "ts":
254
+ case "tsx":
255
+ case "typescript":
256
+ return [
257
+ "// LLM-REGION",
258
+ "/* LLM-REGION",
259
+ "/** LLM-REGION",
260
+ "\nclass ",
261
+ "\npublic ",
262
+ "\nprotected ",
263
+ "\nprivate ",
264
+ "\nstatic ",
265
+ "\nif ",
266
+ "\nfor ",
267
+ "\nwhile ",
268
+ "\nswitch ",
269
+ "\ncase ",
270
+ "\n\n",
271
+ "\n",
272
+ " "
273
+ ];
274
+ case "js":
275
+ case "jsx":
276
+ case "javascript":
277
+ return [
278
+ "// LLM-REGION",
279
+ "/* LLM-REGION",
280
+ "/** LLM-REGION",
281
+ "\nclass ",
282
+ "\nfunction ",
283
+ "\nconst ",
284
+ "\nlet ",
285
+ "\nvar ",
286
+ "\nclass ",
287
+ "\nif ",
288
+ "\nfor ",
289
+ "\nwhile ",
290
+ "\nswitch ",
291
+ "\ncase ",
292
+ "\ndefault ",
293
+ "\n\n",
294
+ "\n",
295
+ ];
296
+ case "php":
297
+ return [
298
+ "\nfunction ",
299
+ "\nclass ",
300
+ "\nif ",
301
+ "\nforeach ",
302
+ "\nwhile ",
303
+ "\ndo ",
304
+ "\nswitch ",
305
+ "\ncase ",
306
+ "\n\n",
307
+ "\n",
308
+ ];
309
+ case "proto":
310
+ return [
311
+ "\nmessage ",
312
+ "\nservice ",
313
+ "\nenum ",
314
+ "\noption ",
315
+ "\nimport ",
316
+ "\nsyntax ",
317
+ "\n\n",
318
+ "\n",
319
+ ];
320
+ case "python":
321
+ case "py":
322
+ return [
323
+ "\nclass ",
324
+ "\ndef ",
325
+ "\n\tdef ",
326
+ "\n\n",
327
+ "\n",
328
+ ];
329
+ case "rst":
330
+ return [
331
+ "\n===\n",
332
+ "\n---\n",
333
+ "\n***\n",
334
+ "\n.. ",
335
+ "\n\n",
336
+ "\n",
337
+ ];
338
+ case "ruby":
339
+ return [
340
+ "\ndef ",
341
+ "\nclass ",
342
+ "\nif ",
343
+ "\nunless ",
344
+ "\nwhile ",
345
+ "\nfor ",
346
+ "\ndo ",
347
+ "\nbegin ",
348
+ "\nrescue ",
349
+ "\n\n",
350
+ "\n",
351
+ ];
352
+ case "rust":
353
+ return [
354
+ "\nfn ",
355
+ "\nconst ",
356
+ "\nlet ",
357
+ "\nif ",
358
+ "\nwhile ",
359
+ "\nfor ",
360
+ "\nloop ",
361
+ "\nmatch ",
362
+ "\nconst ",
363
+ "\n\n",
364
+ "\n",
365
+ ];
366
+ case "scala":
367
+ return [
368
+ "\nclass ",
369
+ "\nobject ",
370
+ "\ndef ",
371
+ "\nval ",
372
+ "\nvar ",
373
+ "\nif ",
374
+ "\nfor ",
375
+ "\nwhile ",
376
+ "\nmatch ",
377
+ "\ncase ",
378
+ "\n\n",
379
+ "\n",
380
+ ];
381
+ case "swift":
382
+ return [
383
+ "\nfunc ",
384
+ "\nclass ",
385
+ "\nstruct ",
386
+ "\nenum ",
387
+ "\nif ",
388
+ "\nfor ",
389
+ "\nwhile ",
390
+ "\ndo ",
391
+ "\nswitch ",
392
+ "\ncase ",
393
+ "\n\n",
394
+ "\n",
395
+ ];
396
+ case "md":
397
+ case "markdown":
398
+ return [
399
+ "\n## ",
400
+ "\n### ",
401
+ "\n#### ",
402
+ "\n##### ",
403
+ "\n###### ",
404
+ "```\n\n",
405
+ "\n\n***\n\n",
406
+ "\n\n---\n\n",
407
+ "\n\n___\n\n",
408
+ "<table>",
409
+ "\n\n",
410
+ "\n",
411
+ ];
412
+ case "latex":
413
+ return [
414
+ "\n\\chapter{",
415
+ "\n\\section{",
416
+ "\n\\subsection{",
417
+ "\n\\subsubsection{",
418
+ "\n\\begin{enumerate}",
419
+ "\n\\begin{itemize}",
420
+ "\n\\begin{description}",
421
+ "\n\\begin{list}",
422
+ "\n\\begin{quote}",
423
+ "\n\\begin{quotation}",
424
+ "\n\\begin{verse}",
425
+ "\n\\begin{verbatim}",
426
+ "\n\\begin{align}",
427
+ "\n\n",
428
+ "\n",
429
+ ];
430
+ case "html":
431
+ return [
432
+ "<body>",
433
+ "<div>",
434
+ "<p>",
435
+ "<br>",
436
+ "<li>",
437
+ "<h1>",
438
+ "<h2>",
439
+ "<h3>",
440
+ "<h4>",
441
+ "<h5>",
442
+ "<h6>",
443
+ "<span>",
444
+ "<table>",
445
+ "<tr>",
446
+ "<td>",
447
+ "<th>",
448
+ "<ul>",
449
+ "<ol>",
450
+ "<header>",
451
+ "<footer>",
452
+ "<nav>",
453
+ "<head>",
454
+ "<style>",
455
+ "<script>",
456
+ "<meta>",
457
+ "<title>",
458
+ ];
459
+ case "sol":
460
+ return [
461
+ "\npragma ",
462
+ "\nusing ",
463
+ "\ncontract ",
464
+ "\ninterface ",
465
+ "\nlibrary ",
466
+ "\nconstructor ",
467
+ "\ntype ",
468
+ "\nfunction ",
469
+ "\nevent ",
470
+ "\nmodifier ",
471
+ "\nerror ",
472
+ "\nstruct ",
473
+ "\nenum ",
474
+ "\nif ",
475
+ "\nfor ",
476
+ "\nwhile ",
477
+ "\ndo while ",
478
+ "\nassembly ",
479
+ "\n\n",
480
+ "\n",
481
+ ];
482
+ default:
483
+ return [
484
+ "\n\n",
485
+ "\n",
486
+ ];
560
487
  }
488
+ }
561
489
  }