vectra 0.12.2 → 0.12.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.draft.md +499 -0
- package/README.draft.outline.md +160 -0
- package/README.research.md +2159 -0
- package/bin/vectra.js +3 -0
- package/lib/FileFetcher.d.ts +5 -0
- package/lib/FileFetcher.d.ts.map +1 -0
- package/lib/FileFetcher.js +79 -0
- package/lib/FileFetcher.js.map +1 -0
- package/lib/GPT3Tokenizer.d.ts +9 -0
- package/lib/ItemSelector.d.ts +41 -0
- package/lib/ItemSelector.d.ts.map +1 -0
- package/lib/ItemSelector.js +168 -0
- package/lib/ItemSelector.js.map +1 -0
- package/lib/LocalDocument.d.ts +54 -0
- package/lib/LocalDocument.js +156 -0
- package/lib/LocalDocument.js.map +1 -0
- package/lib/LocalDocumentIndex.d.ts +132 -0
- package/lib/LocalDocumentIndex.js +456 -0
- package/lib/LocalDocumentIndex.js.map +1 -0
- package/lib/LocalDocumentResult.d.ts +45 -0
- package/lib/LocalDocumentResult.js +328 -0
- package/lib/LocalDocumentResult.js.map +1 -0
- package/lib/LocalIndex.d.ts +150 -0
- package/lib/LocalIndex.d.ts.map +1 -1
- package/lib/LocalIndex.js +515 -0
- package/lib/LocalIndex.js.map +1 -0
- package/lib/LocalIndex.spec.d.ts +2 -0
- package/lib/LocalIndex.spec.js +218 -7
- package/lib/LocalIndex.spec.js.map +1 -1
- package/lib/OpenAIEmbeddings.d.ts +126 -0
- package/lib/OpenAIEmbeddings.d.ts.map +1 -0
- package/lib/OpenAIEmbeddings.js +174 -0
- package/lib/OpenAIEmbeddings.js.map +1 -0
- package/lib/TextSplitter.d.ts +19 -0
- package/lib/TextSplitter.d.ts.map +1 -1
- package/lib/TextSplitter.js +457 -0
- package/lib/TextSplitter.js.map +1 -0
- package/lib/TextSplitter.spec.d.ts +2 -0
- package/lib/TextSplitter.spec.d.ts.map +1 -0
- package/lib/TextSplitter.spec.js +109 -0
- package/lib/TextSplitter.spec.js.map +1 -0
- package/lib/WebFetcher.d.ts +15 -0
- package/lib/WebFetcher.d.ts.map +1 -0
- package/lib/WebFetcher.js +234 -0
- package/lib/WebFetcher.js.map +1 -0
- package/lib/index.d.ts +12 -0
- package/lib/index.js +28 -0
- package/lib/index.js.map +1 -0
- package/lib/internals/Colorize.d.ts +14 -0
- package/lib/internals/Colorize.d.ts.map +1 -0
- package/lib/internals/Colorize.js +64 -0
- package/lib/internals/Colorize.js.map +1 -0
- package/lib/internals/index.d.ts +3 -0
- package/lib/internals/index.d.ts.map +1 -0
- package/lib/internals/index.js +19 -0
- package/lib/internals/index.js.map +1 -0
- package/lib/internals/types.d.ts +43 -0
- package/lib/internals/types.d.ts.map +1 -0
- package/lib/internals/types.js +3 -0
- package/lib/internals/types.js.map +1 -0
- package/lib/types.d.ts +146 -0
- package/lib/types.d.ts.map +1 -0
- package/lib/types.js +3 -0
- package/lib/types.js.map +1 -0
- package/lib/vectra-cli.d.ts +2 -0
- package/lib/vectra-cli.js +323 -0
- package/lib/vectra-cli.js.map +1 -0
- package/package.json +3 -1
- package/src/LocalIndex.spec.ts +265 -8
- package/src/LocalIndex.ts +1 -0
- package/src/TextSplitter.spec.ts +87 -0
- package/src/TextSplitter.ts +459 -531
package/src/TextSplitter.ts
CHANGED
|
@@ -4,558 +4,486 @@ import { TextChunk, Tokenizer } from "./types";
|
|
|
4
4
|
const ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
|
|
5
5
|
|
|
6
6
|
export interface TextSplitterConfig {
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
7
|
+
separators: string[];
|
|
8
|
+
keepSeparators: boolean;
|
|
9
|
+
chunkSize: number;
|
|
10
|
+
chunkOverlap: number;
|
|
11
|
+
tokenizer: Tokenizer;
|
|
12
|
+
docType?: string;
|
|
13
13
|
}
|
|
14
14
|
|
|
15
15
|
export class TextSplitter {
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
public constructor(config?: Partial<TextSplitterConfig>) {
|
|
19
|
-
this._config = Object.assign({
|
|
20
|
-
keepSeparators: false,
|
|
21
|
-
chunkSize: 400,
|
|
22
|
-
chunkOverlap: 40,
|
|
23
|
-
} as TextSplitterConfig, config);
|
|
24
|
-
|
|
25
|
-
// Create a default tokenizer if none is provided
|
|
26
|
-
if (!this._config.tokenizer) {
|
|
27
|
-
this._config.tokenizer = new GPT3Tokenizer();
|
|
28
|
-
}
|
|
16
|
+
private readonly _config: TextSplitterConfig;
|
|
29
17
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
18
|
+
public constructor(config?: Partial<TextSplitterConfig>) {
|
|
19
|
+
this._config = Object.assign({
|
|
20
|
+
keepSeparators: false,
|
|
21
|
+
chunkSize: 400,
|
|
22
|
+
chunkOverlap: 40,
|
|
23
|
+
} as TextSplitterConfig, config);
|
|
34
24
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
} else if (this._config.chunkOverlap < 0) {
|
|
39
|
-
throw new Error("chunkOverlap must be >= 0");
|
|
40
|
-
} else if (this._config.chunkOverlap > this._config.chunkSize) {
|
|
41
|
-
throw new Error("chunkOverlap must be <= chunkSize");
|
|
42
|
-
}
|
|
25
|
+
// Create a default tokenizer if none is provided
|
|
26
|
+
if (!this._config.tokenizer) {
|
|
27
|
+
this._config.tokenizer = new GPT3Tokenizer();
|
|
43
28
|
}
|
|
44
29
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
const that = this;
|
|
50
|
-
function getOverlapTokens(tokens?: number[]): number[] {
|
|
51
|
-
if (tokens != undefined) {
|
|
52
|
-
const len = tokens.length > that._config.chunkOverlap ? that._config.chunkOverlap : tokens.length;
|
|
53
|
-
return tokens.slice(0, len);
|
|
54
|
-
} else {
|
|
55
|
-
return [];
|
|
56
|
-
}
|
|
57
|
-
}
|
|
30
|
+
// Use default separators if none are provided
|
|
31
|
+
if (!this._config.separators || this._config.separators.length === 0) {
|
|
32
|
+
this._config.separators = this.getSeparators(this._config.docType);
|
|
33
|
+
}
|
|
58
34
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
35
|
+
// Validate the config settings
|
|
36
|
+
if (this._config.chunkSize < 1) {
|
|
37
|
+
throw new Error("chunkSize must be >= 1");
|
|
38
|
+
} else if (this._config.chunkOverlap < 0) {
|
|
39
|
+
throw new Error("chunkOverlap must be >= 0");
|
|
40
|
+
} else if (this._config.chunkOverlap > this._config.chunkSize) {
|
|
41
|
+
throw new Error("chunkOverlap must be <= chunkSize");
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
public split(text: string): TextChunk[] {
|
|
46
|
+
// Get basic chunks
|
|
47
|
+
const chunks = this.recursiveSplit(text, this._config.separators, 0);
|
|
69
48
|
|
|
70
|
-
|
|
49
|
+
const that = this;
|
|
50
|
+
function getOverlapTokens(tokens?: number[]): number[] {
|
|
51
|
+
if (tokens != undefined) {
|
|
52
|
+
const len = tokens.length > that._config.chunkOverlap ? that._config.chunkOverlap : tokens.length;
|
|
53
|
+
return tokens.slice(0, len);
|
|
54
|
+
} else {
|
|
55
|
+
return [];
|
|
56
|
+
}
|
|
71
57
|
}
|
|
72
58
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
|
|
80
|
-
if (separators.length > 0) {
|
|
81
|
-
// Split by separator
|
|
82
|
-
separator = separators[0];
|
|
83
|
-
parts = separator == ' ' ? this.splitBySpaces(text) : text.split(separator);
|
|
84
|
-
} else {
|
|
85
|
-
// Cut text in half
|
|
86
|
-
const half = Math.floor(text.length / 2);
|
|
87
|
-
parts = [text.substring(0, half), text.substring(half)];
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
// Iterate over parts
|
|
91
|
-
for (let i = 0; i < parts.length; i++) {
|
|
92
|
-
const lastChunk = (i === parts.length - 1);
|
|
93
|
-
|
|
94
|
-
// Get chunk text and endPos
|
|
95
|
-
let chunk = parts[i];
|
|
96
|
-
const endPos = (startPos + (chunk.length - 1)) + (lastChunk ? 0 : separator.length);
|
|
97
|
-
if (this._config.keepSeparators && !lastChunk) {
|
|
98
|
-
chunk += separator;
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
// Ensure chunk contains text
|
|
102
|
-
if (!this.containsAlphanumeric(chunk)) {
|
|
103
|
-
continue;
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
// Optimization to avoid encoding really large chunks
|
|
107
|
-
if (chunk.length / 6 > this._config.chunkSize) {
|
|
108
|
-
// Break the text into smaller chunks
|
|
109
|
-
const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
|
|
110
|
-
chunks.push(...subChunks);
|
|
111
|
-
} else {
|
|
112
|
-
// Encode chunk text
|
|
113
|
-
const tokens = this._config.tokenizer.encode(chunk);
|
|
114
|
-
if (tokens.length > this._config.chunkSize) {
|
|
115
|
-
// Break the text into smaller chunks
|
|
116
|
-
const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
|
|
117
|
-
chunks.push(...subChunks);
|
|
118
|
-
} else {
|
|
119
|
-
// Append chunk to output
|
|
120
|
-
chunks.push({
|
|
121
|
-
text: chunk,
|
|
122
|
-
tokens: tokens,
|
|
123
|
-
startPos: startPos,
|
|
124
|
-
endPos: endPos,
|
|
125
|
-
startOverlap: [],
|
|
126
|
-
endOverlap: [],
|
|
127
|
-
});
|
|
128
|
-
}
|
|
129
|
-
|
|
130
|
-
}
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
// Update startPos
|
|
134
|
-
startPos = endPos + 1;
|
|
135
|
-
}
|
|
136
|
-
}
|
|
59
|
+
// Add overlap tokens and text to the start and end of each chunk
|
|
60
|
+
if (this._config.chunkOverlap > 0) {
|
|
61
|
+
for (let i = 1; i < chunks.length; i++) {
|
|
62
|
+
const previousChunk = chunks[i - 1];
|
|
63
|
+
const chunk = chunks[i];
|
|
64
|
+
const nextChunk = i < chunks.length - 1 ? chunks[i + 1] : undefined;
|
|
137
65
|
|
|
138
|
-
|
|
66
|
+
// Use copies to avoid reversing in place (preserve token order in previous chunks)
|
|
67
|
+
const prevTokensCopy = previousChunk.tokens.slice();
|
|
68
|
+
chunk.startOverlap = getOverlapTokens(prevTokensCopy.reverse()).reverse();
|
|
69
|
+
chunk.endOverlap = getOverlapTokens(nextChunk?.tokens);
|
|
70
|
+
}
|
|
139
71
|
}
|
|
140
72
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
73
|
+
return chunks;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
|
|
77
|
+
const chunks: TextChunk[] = [];
|
|
78
|
+
|
|
79
|
+
if (text.length > 0) {
|
|
80
|
+
// Split text into parts
|
|
81
|
+
let parts: string[];
|
|
82
|
+
let separator = '';
|
|
83
|
+
const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
|
|
84
|
+
|
|
85
|
+
if (separators.length > 0) {
|
|
86
|
+
// Split by separator
|
|
87
|
+
separator = separators[0];
|
|
88
|
+
parts = separator == ' ' ? this.splitBySpaces(text) : text.split(separator);
|
|
89
|
+
} else {
|
|
90
|
+
// Cut text in half
|
|
91
|
+
const half = Math.floor(text.length / 2);
|
|
92
|
+
parts = [text.substring(0, half), text.substring(half)];
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
// Iterate over parts
|
|
96
|
+
for (let i = 0; i < parts.length; i++) {
|
|
97
|
+
const lastChunk = (i === parts.length - 1);
|
|
98
|
+
|
|
99
|
+
// Get chunk text and endPos
|
|
100
|
+
let chunk = parts[i];
|
|
101
|
+
const endPos = (startPos + (chunk.length - 1)) + (lastChunk ? 0 : separator.length);
|
|
102
|
+
|
|
103
|
+
if (this._config.keepSeparators && !lastChunk) {
|
|
104
|
+
chunk += separator;
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
// Keep chunks that contain any non-whitespace; drop whitespace-only
|
|
108
|
+
if (!/\S/.test(chunk)) {
|
|
109
|
+
// drop whitespace-only chunks
|
|
110
|
+
startPos = endPos + 1;
|
|
111
|
+
continue;
|
|
164
112
|
}
|
|
165
|
-
|
|
166
|
-
|
|
113
|
+
|
|
114
|
+
// Optimization to avoid encoding really large chunks
|
|
115
|
+
if (chunk.length / 6 > this._config.chunkSize) {
|
|
116
|
+
// Break the text into smaller chunks
|
|
117
|
+
const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
|
|
118
|
+
chunks.push(...subChunks);
|
|
119
|
+
} else {
|
|
120
|
+
// Encode chunk text
|
|
121
|
+
const tokens = this._config.tokenizer.encode(chunk);
|
|
122
|
+
if (tokens.length > this._config.chunkSize) {
|
|
123
|
+
// Break the text into smaller chunks
|
|
124
|
+
const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
|
|
125
|
+
chunks.push(...subChunks);
|
|
126
|
+
} else {
|
|
127
|
+
// Append chunk to output
|
|
128
|
+
chunks.push({
|
|
129
|
+
text: chunk,
|
|
130
|
+
tokens: tokens,
|
|
131
|
+
startPos: startPos,
|
|
132
|
+
endPos: endPos,
|
|
133
|
+
startOverlap: [],
|
|
134
|
+
endOverlap: [],
|
|
135
|
+
});
|
|
136
|
+
}
|
|
167
137
|
}
|
|
168
|
-
|
|
138
|
+
|
|
139
|
+
// Update startPos
|
|
140
|
+
startPos = endPos + 1;
|
|
141
|
+
}
|
|
169
142
|
}
|
|
170
143
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
144
|
+
return this.combineChunks(chunks);
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
private combineChunks(chunks: TextChunk[]): TextChunk[] {
|
|
148
|
+
const combinedChunks: TextChunk[] = [];
|
|
149
|
+
let currentChunk: TextChunk | undefined;
|
|
150
|
+
let currentLength = 0;
|
|
151
|
+
|
|
152
|
+
// When not keeping separators, we previously inserted a space between merged chunks.
|
|
153
|
+
// We will still use a space for normal merges, but we will prevent merging punctuation-only
|
|
154
|
+
// separator chunks (e.g., '---', '***', '====') to preserve them as standalone.
|
|
155
|
+
const separator = this._config.keepSeparators ? '' : ' ';
|
|
156
|
+
|
|
157
|
+
const isWhitespaceOnly = (t: string) => !/\S/.test(t);
|
|
158
|
+
const isPunctuationOnly = (t: string) => /\S/.test(t) && !/[a-zA-Z0-9]/.test(t);
|
|
159
|
+
|
|
160
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
161
|
+
const chunk = chunks[i];
|
|
162
|
+
|
|
163
|
+
if (!currentChunk) {
|
|
164
|
+
currentChunk = chunk;
|
|
165
|
+
currentLength = chunk.tokens.length;
|
|
166
|
+
continue;
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
// If either the current or next chunk is punctuation-only (non-whitespace, no alphanumeric),
|
|
170
|
+
// do not merge; keep them as separate chunks to preserve separators like '---'.
|
|
171
|
+
if (isPunctuationOnly(currentChunk.text) || isPunctuationOnly(chunk.text)) {
|
|
172
|
+
combinedChunks.push(currentChunk);
|
|
173
|
+
currentChunk = chunk;
|
|
174
|
+
currentLength = chunk.tokens.length;
|
|
175
|
+
continue;
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
// Normal merge path constrained by token budget
|
|
179
|
+
const length = currentChunk.tokens.length + chunk.tokens.length;
|
|
180
|
+
if (length > this._config.chunkSize) {
|
|
181
|
+
combinedChunks.push(currentChunk);
|
|
182
|
+
currentChunk = chunk;
|
|
183
|
+
currentLength = chunk.tokens.length;
|
|
184
|
+
} else {
|
|
185
|
+
// Only insert separator if neither chunk is whitespace-only (defensive)
|
|
186
|
+
const joiner = (!this._config.keepSeparators && !isWhitespaceOnly(currentChunk.text) && !isWhitespaceOnly(chunk.text)) ? separator : '';
|
|
187
|
+
currentChunk.text += joiner + chunk.text;
|
|
188
|
+
currentChunk.endPos = chunk.endPos;
|
|
189
|
+
currentChunk.tokens.push(...chunk.tokens);
|
|
190
|
+
currentLength += chunk.tokens.length;
|
|
191
|
+
}
|
|
178
192
|
}
|
|
179
193
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
const parts: string[] = [];
|
|
183
|
-
let tokens = this._config.tokenizer.encode(text);
|
|
184
|
-
do {
|
|
185
|
-
if (tokens.length <= this._config.chunkSize) {
|
|
186
|
-
parts.push(this._config.tokenizer.decode(tokens));
|
|
187
|
-
break;
|
|
188
|
-
} else {
|
|
189
|
-
const span = tokens.splice(0, this._config.chunkSize);
|
|
190
|
-
parts.push(this._config.tokenizer.decode(span));
|
|
191
|
-
}
|
|
192
|
-
} while (true);
|
|
193
|
-
|
|
194
|
-
return parts;
|
|
194
|
+
if (currentChunk) {
|
|
195
|
+
combinedChunks.push(currentChunk);
|
|
195
196
|
}
|
|
196
197
|
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
"\n",
|
|
487
|
-
" "
|
|
488
|
-
];
|
|
489
|
-
case "html":
|
|
490
|
-
return [
|
|
491
|
-
// First, try to split along HTML tags
|
|
492
|
-
"<body>",
|
|
493
|
-
"<div>",
|
|
494
|
-
"<p>",
|
|
495
|
-
"<br>",
|
|
496
|
-
"<li>",
|
|
497
|
-
"<h1>",
|
|
498
|
-
"<h2>",
|
|
499
|
-
"<h3>",
|
|
500
|
-
"<h4>",
|
|
501
|
-
"<h5>",
|
|
502
|
-
"<h6>",
|
|
503
|
-
"<span>",
|
|
504
|
-
"<table>",
|
|
505
|
-
"<tr>",
|
|
506
|
-
"<td>",
|
|
507
|
-
"<th>",
|
|
508
|
-
"<ul>",
|
|
509
|
-
"<ol>",
|
|
510
|
-
"<header>",
|
|
511
|
-
"<footer>",
|
|
512
|
-
"<nav>",
|
|
513
|
-
// Head
|
|
514
|
-
"<head>",
|
|
515
|
-
"<style>",
|
|
516
|
-
"<script>",
|
|
517
|
-
"<meta>",
|
|
518
|
-
"<title>",
|
|
519
|
-
// Normal type of lines
|
|
520
|
-
" "
|
|
521
|
-
];
|
|
522
|
-
case "sol":
|
|
523
|
-
return [
|
|
524
|
-
// Split along compiler informations definitions
|
|
525
|
-
"\npragma ",
|
|
526
|
-
"\nusing ",
|
|
527
|
-
// Split along contract definitions
|
|
528
|
-
"\ncontract ",
|
|
529
|
-
"\ninterface ",
|
|
530
|
-
"\nlibrary ",
|
|
531
|
-
// Split along method definitions
|
|
532
|
-
"\nconstructor ",
|
|
533
|
-
"\ntype ",
|
|
534
|
-
"\nfunction ",
|
|
535
|
-
"\nevent ",
|
|
536
|
-
"\nmodifier ",
|
|
537
|
-
"\nerror ",
|
|
538
|
-
"\nstruct ",
|
|
539
|
-
"\nenum ",
|
|
540
|
-
// Split along control flow statements
|
|
541
|
-
"\nif ",
|
|
542
|
-
"\nfor ",
|
|
543
|
-
"\nwhile ",
|
|
544
|
-
"\ndo while ",
|
|
545
|
-
"\nassembly ",
|
|
546
|
-
// Split by the normal type of lines
|
|
547
|
-
"\n\n",
|
|
548
|
-
"\n",
|
|
549
|
-
" "
|
|
550
|
-
];
|
|
551
|
-
default:
|
|
552
|
-
return [
|
|
553
|
-
// Split by the normal type of lines
|
|
554
|
-
"\n\n",
|
|
555
|
-
"\n",
|
|
556
|
-
" ",
|
|
557
|
-
"",
|
|
558
|
-
];
|
|
559
|
-
}
|
|
198
|
+
return combinedChunks;
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
private splitBySpaces(text: string): string[] {
|
|
202
|
+
// Split text by tokens and return parts
|
|
203
|
+
const parts: string[] = [];
|
|
204
|
+
let tokens = this._config.tokenizer.encode(text);
|
|
205
|
+
|
|
206
|
+
do {
|
|
207
|
+
if (tokens.length <= this._config.chunkSize) {
|
|
208
|
+
parts.push(this._config.tokenizer.decode(tokens));
|
|
209
|
+
break;
|
|
210
|
+
} else {
|
|
211
|
+
const span = tokens.splice(0, this._config.chunkSize);
|
|
212
|
+
parts.push(this._config.tokenizer.decode(span));
|
|
213
|
+
}
|
|
214
|
+
} while (true);
|
|
215
|
+
|
|
216
|
+
return parts;
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
private getSeparators(docType?: string): string[] {
|
|
220
|
+
switch (docType ?? '') {
|
|
221
|
+
case "cpp":
|
|
222
|
+
return [
|
|
223
|
+
"\nclass ",
|
|
224
|
+
"\nvoid ",
|
|
225
|
+
"\nint ",
|
|
226
|
+
"\nfloat ",
|
|
227
|
+
"\ndouble ",
|
|
228
|
+
"\nif ",
|
|
229
|
+
"\nfor ",
|
|
230
|
+
"\nwhile ",
|
|
231
|
+
"\nswitch ",
|
|
232
|
+
"\ncase ",
|
|
233
|
+
"\n\n",
|
|
234
|
+
"\n",
|
|
235
|
+
];
|
|
236
|
+
case "go":
|
|
237
|
+
return [
|
|
238
|
+
"\nfunc ",
|
|
239
|
+
"\nvar ",
|
|
240
|
+
"\nconst ",
|
|
241
|
+
"\ntype ",
|
|
242
|
+
"\nif ",
|
|
243
|
+
"\nfor ",
|
|
244
|
+
"\nswitch ",
|
|
245
|
+
"\ncase ",
|
|
246
|
+
"\n\n",
|
|
247
|
+
"\n",
|
|
248
|
+
];
|
|
249
|
+
case "java":
|
|
250
|
+
case "c#":
|
|
251
|
+
case "csharp":
|
|
252
|
+
case "cs":
|
|
253
|
+
case "ts":
|
|
254
|
+
case "tsx":
|
|
255
|
+
case "typescript":
|
|
256
|
+
return [
|
|
257
|
+
"// LLM-REGION",
|
|
258
|
+
"/* LLM-REGION",
|
|
259
|
+
"/** LLM-REGION",
|
|
260
|
+
"\nclass ",
|
|
261
|
+
"\npublic ",
|
|
262
|
+
"\nprotected ",
|
|
263
|
+
"\nprivate ",
|
|
264
|
+
"\nstatic ",
|
|
265
|
+
"\nif ",
|
|
266
|
+
"\nfor ",
|
|
267
|
+
"\nwhile ",
|
|
268
|
+
"\nswitch ",
|
|
269
|
+
"\ncase ",
|
|
270
|
+
"\n\n",
|
|
271
|
+
"\n",
|
|
272
|
+
" "
|
|
273
|
+
];
|
|
274
|
+
case "js":
|
|
275
|
+
case "jsx":
|
|
276
|
+
case "javascript":
|
|
277
|
+
return [
|
|
278
|
+
"// LLM-REGION",
|
|
279
|
+
"/* LLM-REGION",
|
|
280
|
+
"/** LLM-REGION",
|
|
281
|
+
"\nclass ",
|
|
282
|
+
"\nfunction ",
|
|
283
|
+
"\nconst ",
|
|
284
|
+
"\nlet ",
|
|
285
|
+
"\nvar ",
|
|
286
|
+
"\nclass ",
|
|
287
|
+
"\nif ",
|
|
288
|
+
"\nfor ",
|
|
289
|
+
"\nwhile ",
|
|
290
|
+
"\nswitch ",
|
|
291
|
+
"\ncase ",
|
|
292
|
+
"\ndefault ",
|
|
293
|
+
"\n\n",
|
|
294
|
+
"\n",
|
|
295
|
+
];
|
|
296
|
+
case "php":
|
|
297
|
+
return [
|
|
298
|
+
"\nfunction ",
|
|
299
|
+
"\nclass ",
|
|
300
|
+
"\nif ",
|
|
301
|
+
"\nforeach ",
|
|
302
|
+
"\nwhile ",
|
|
303
|
+
"\ndo ",
|
|
304
|
+
"\nswitch ",
|
|
305
|
+
"\ncase ",
|
|
306
|
+
"\n\n",
|
|
307
|
+
"\n",
|
|
308
|
+
];
|
|
309
|
+
case "proto":
|
|
310
|
+
return [
|
|
311
|
+
"\nmessage ",
|
|
312
|
+
"\nservice ",
|
|
313
|
+
"\nenum ",
|
|
314
|
+
"\noption ",
|
|
315
|
+
"\nimport ",
|
|
316
|
+
"\nsyntax ",
|
|
317
|
+
"\n\n",
|
|
318
|
+
"\n",
|
|
319
|
+
];
|
|
320
|
+
case "python":
|
|
321
|
+
case "py":
|
|
322
|
+
return [
|
|
323
|
+
"\nclass ",
|
|
324
|
+
"\ndef ",
|
|
325
|
+
"\n\tdef ",
|
|
326
|
+
"\n\n",
|
|
327
|
+
"\n",
|
|
328
|
+
];
|
|
329
|
+
case "rst":
|
|
330
|
+
return [
|
|
331
|
+
"\n===\n",
|
|
332
|
+
"\n---\n",
|
|
333
|
+
"\n***\n",
|
|
334
|
+
"\n.. ",
|
|
335
|
+
"\n\n",
|
|
336
|
+
"\n",
|
|
337
|
+
];
|
|
338
|
+
case "ruby":
|
|
339
|
+
return [
|
|
340
|
+
"\ndef ",
|
|
341
|
+
"\nclass ",
|
|
342
|
+
"\nif ",
|
|
343
|
+
"\nunless ",
|
|
344
|
+
"\nwhile ",
|
|
345
|
+
"\nfor ",
|
|
346
|
+
"\ndo ",
|
|
347
|
+
"\nbegin ",
|
|
348
|
+
"\nrescue ",
|
|
349
|
+
"\n\n",
|
|
350
|
+
"\n",
|
|
351
|
+
];
|
|
352
|
+
case "rust":
|
|
353
|
+
return [
|
|
354
|
+
"\nfn ",
|
|
355
|
+
"\nconst ",
|
|
356
|
+
"\nlet ",
|
|
357
|
+
"\nif ",
|
|
358
|
+
"\nwhile ",
|
|
359
|
+
"\nfor ",
|
|
360
|
+
"\nloop ",
|
|
361
|
+
"\nmatch ",
|
|
362
|
+
"\nconst ",
|
|
363
|
+
"\n\n",
|
|
364
|
+
"\n",
|
|
365
|
+
];
|
|
366
|
+
case "scala":
|
|
367
|
+
return [
|
|
368
|
+
"\nclass ",
|
|
369
|
+
"\nobject ",
|
|
370
|
+
"\ndef ",
|
|
371
|
+
"\nval ",
|
|
372
|
+
"\nvar ",
|
|
373
|
+
"\nif ",
|
|
374
|
+
"\nfor ",
|
|
375
|
+
"\nwhile ",
|
|
376
|
+
"\nmatch ",
|
|
377
|
+
"\ncase ",
|
|
378
|
+
"\n\n",
|
|
379
|
+
"\n",
|
|
380
|
+
];
|
|
381
|
+
case "swift":
|
|
382
|
+
return [
|
|
383
|
+
"\nfunc ",
|
|
384
|
+
"\nclass ",
|
|
385
|
+
"\nstruct ",
|
|
386
|
+
"\nenum ",
|
|
387
|
+
"\nif ",
|
|
388
|
+
"\nfor ",
|
|
389
|
+
"\nwhile ",
|
|
390
|
+
"\ndo ",
|
|
391
|
+
"\nswitch ",
|
|
392
|
+
"\ncase ",
|
|
393
|
+
"\n\n",
|
|
394
|
+
"\n",
|
|
395
|
+
];
|
|
396
|
+
case "md":
|
|
397
|
+
case "markdown":
|
|
398
|
+
return [
|
|
399
|
+
"\n## ",
|
|
400
|
+
"\n### ",
|
|
401
|
+
"\n#### ",
|
|
402
|
+
"\n##### ",
|
|
403
|
+
"\n###### ",
|
|
404
|
+
"```\n\n",
|
|
405
|
+
"\n\n***\n\n",
|
|
406
|
+
"\n\n---\n\n",
|
|
407
|
+
"\n\n___\n\n",
|
|
408
|
+
"<table>",
|
|
409
|
+
"\n\n",
|
|
410
|
+
"\n",
|
|
411
|
+
];
|
|
412
|
+
case "latex":
|
|
413
|
+
return [
|
|
414
|
+
"\n\\chapter{",
|
|
415
|
+
"\n\\section{",
|
|
416
|
+
"\n\\subsection{",
|
|
417
|
+
"\n\\subsubsection{",
|
|
418
|
+
"\n\\begin{enumerate}",
|
|
419
|
+
"\n\\begin{itemize}",
|
|
420
|
+
"\n\\begin{description}",
|
|
421
|
+
"\n\\begin{list}",
|
|
422
|
+
"\n\\begin{quote}",
|
|
423
|
+
"\n\\begin{quotation}",
|
|
424
|
+
"\n\\begin{verse}",
|
|
425
|
+
"\n\\begin{verbatim}",
|
|
426
|
+
"\n\\begin{align}",
|
|
427
|
+
"\n\n",
|
|
428
|
+
"\n",
|
|
429
|
+
];
|
|
430
|
+
case "html":
|
|
431
|
+
return [
|
|
432
|
+
"<body>",
|
|
433
|
+
"<div>",
|
|
434
|
+
"<p>",
|
|
435
|
+
"<br>",
|
|
436
|
+
"<li>",
|
|
437
|
+
"<h1>",
|
|
438
|
+
"<h2>",
|
|
439
|
+
"<h3>",
|
|
440
|
+
"<h4>",
|
|
441
|
+
"<h5>",
|
|
442
|
+
"<h6>",
|
|
443
|
+
"<span>",
|
|
444
|
+
"<table>",
|
|
445
|
+
"<tr>",
|
|
446
|
+
"<td>",
|
|
447
|
+
"<th>",
|
|
448
|
+
"<ul>",
|
|
449
|
+
"<ol>",
|
|
450
|
+
"<header>",
|
|
451
|
+
"<footer>",
|
|
452
|
+
"<nav>",
|
|
453
|
+
"<head>",
|
|
454
|
+
"<style>",
|
|
455
|
+
"<script>",
|
|
456
|
+
"<meta>",
|
|
457
|
+
"<title>",
|
|
458
|
+
];
|
|
459
|
+
case "sol":
|
|
460
|
+
return [
|
|
461
|
+
"\npragma ",
|
|
462
|
+
"\nusing ",
|
|
463
|
+
"\ncontract ",
|
|
464
|
+
"\ninterface ",
|
|
465
|
+
"\nlibrary ",
|
|
466
|
+
"\nconstructor ",
|
|
467
|
+
"\ntype ",
|
|
468
|
+
"\nfunction ",
|
|
469
|
+
"\nevent ",
|
|
470
|
+
"\nmodifier ",
|
|
471
|
+
"\nerror ",
|
|
472
|
+
"\nstruct ",
|
|
473
|
+
"\nenum ",
|
|
474
|
+
"\nif ",
|
|
475
|
+
"\nfor ",
|
|
476
|
+
"\nwhile ",
|
|
477
|
+
"\ndo while ",
|
|
478
|
+
"\nassembly ",
|
|
479
|
+
"\n\n",
|
|
480
|
+
"\n",
|
|
481
|
+
];
|
|
482
|
+
default:
|
|
483
|
+
return [
|
|
484
|
+
"\n\n",
|
|
485
|
+
"\n",
|
|
486
|
+
];
|
|
560
487
|
}
|
|
488
|
+
}
|
|
561
489
|
}
|