vectra 0.12.2 → 0.12.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.draft.md +499 -0
- package/README.draft.outline.md +160 -0
- package/README.research.md +2159 -0
- package/bin/vectra.js +3 -0
- package/lib/FileFetcher.d.ts +5 -0
- package/lib/FileFetcher.d.ts.map +1 -0
- package/lib/FileFetcher.js +79 -0
- package/lib/FileFetcher.js.map +1 -0
- package/lib/GPT3Tokenizer.d.ts +9 -0
- package/lib/ItemSelector.d.ts +41 -0
- package/lib/ItemSelector.d.ts.map +1 -0
- package/lib/ItemSelector.js +168 -0
- package/lib/ItemSelector.js.map +1 -0
- package/lib/LocalDocument.d.ts +54 -0
- package/lib/LocalDocument.js +156 -0
- package/lib/LocalDocument.js.map +1 -0
- package/lib/LocalDocumentIndex.d.ts +132 -0
- package/lib/LocalDocumentIndex.js +456 -0
- package/lib/LocalDocumentIndex.js.map +1 -0
- package/lib/LocalDocumentResult.d.ts +45 -0
- package/lib/LocalDocumentResult.js +328 -0
- package/lib/LocalDocumentResult.js.map +1 -0
- package/lib/LocalIndex.d.ts +150 -0
- package/lib/LocalIndex.d.ts.map +1 -1
- package/lib/LocalIndex.js +515 -0
- package/lib/LocalIndex.js.map +1 -0
- package/lib/LocalIndex.spec.d.ts +2 -0
- package/lib/LocalIndex.spec.js +218 -7
- package/lib/LocalIndex.spec.js.map +1 -1
- package/lib/OpenAIEmbeddings.d.ts +126 -0
- package/lib/OpenAIEmbeddings.d.ts.map +1 -0
- package/lib/OpenAIEmbeddings.js +174 -0
- package/lib/OpenAIEmbeddings.js.map +1 -0
- package/lib/TextSplitter.d.ts +19 -0
- package/lib/TextSplitter.d.ts.map +1 -1
- package/lib/TextSplitter.js +457 -0
- package/lib/TextSplitter.js.map +1 -0
- package/lib/TextSplitter.spec.d.ts +2 -0
- package/lib/TextSplitter.spec.d.ts.map +1 -0
- package/lib/TextSplitter.spec.js +109 -0
- package/lib/TextSplitter.spec.js.map +1 -0
- package/lib/WebFetcher.d.ts +15 -0
- package/lib/WebFetcher.d.ts.map +1 -0
- package/lib/WebFetcher.js +234 -0
- package/lib/WebFetcher.js.map +1 -0
- package/lib/index.d.ts +12 -0
- package/lib/index.js +28 -0
- package/lib/index.js.map +1 -0
- package/lib/internals/Colorize.d.ts +14 -0
- package/lib/internals/Colorize.d.ts.map +1 -0
- package/lib/internals/Colorize.js +64 -0
- package/lib/internals/Colorize.js.map +1 -0
- package/lib/internals/index.d.ts +3 -0
- package/lib/internals/index.d.ts.map +1 -0
- package/lib/internals/index.js +19 -0
- package/lib/internals/index.js.map +1 -0
- package/lib/internals/types.d.ts +43 -0
- package/lib/internals/types.d.ts.map +1 -0
- package/lib/internals/types.js +3 -0
- package/lib/internals/types.js.map +1 -0
- package/lib/types.d.ts +146 -0
- package/lib/types.d.ts.map +1 -0
- package/lib/types.js +3 -0
- package/lib/types.js.map +1 -0
- package/lib/vectra-cli.d.ts +2 -0
- package/lib/vectra-cli.js +323 -0
- package/lib/vectra-cli.js.map +1 -0
- package/package.json +3 -1
- package/src/LocalIndex.spec.ts +265 -8
- package/src/LocalIndex.ts +1 -0
- package/src/TextSplitter.spec.ts +87 -0
- package/src/TextSplitter.ts +459 -531
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.TextSplitter = void 0;
|
|
4
|
+
const GPT3Tokenizer_1 = require("./GPT3Tokenizer");
|
|
5
|
+
const ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
|
|
6
|
+
/**
 * Splits text into chunks whose token count stays within a configured budget.
 *
 * The text is split on a prioritized list of separators; pieces that are still
 * too large are split again with the remaining separators and, once those are
 * exhausted, by cutting the text in half. Adjacent small pieces are merged back
 * together as long as the merged chunk fits the budget.
 */
class TextSplitter {
    /**
     * @param config Optional settings. Recognized keys: `keepSeparators`
     * (default false), `chunkSize` (default 400), `chunkOverlap` (default 40),
     * `tokenizer`, `separators` and `docType`.
     * @throws Error when `chunkSize < 1`, `chunkOverlap < 0` or
     * `chunkOverlap > chunkSize`.
     */
    constructor(config) {
        this._config = Object.assign({
            keepSeparators: false,
            chunkSize: 400,
            chunkOverlap: 40,
        }, config);
        // Create a default tokenizer if none is provided
        if (!this._config.tokenizer) {
            this._config.tokenizer = new GPT3Tokenizer_1.GPT3Tokenizer();
        }
        // Use the separators for the configured document type if none are provided
        if (!this._config.separators || this._config.separators.length === 0) {
            this._config.separators = this.getSeparators(this._config.docType);
        }
        // Validate the config settings
        if (this._config.chunkSize < 1) {
            throw new Error("chunkSize must be >= 1");
        }
        else if (this._config.chunkOverlap < 0) {
            throw new Error("chunkOverlap must be >= 0");
        }
        else if (this._config.chunkOverlap > this._config.chunkSize) {
            throw new Error("chunkOverlap must be <= chunkSize");
        }
    }
    /**
     * Splits `text` into chunks.
     *
     * @param text Text to split.
     * @returns Array of chunks, each with `text`, `tokens`, `startPos`, `endPos`,
     * `startOverlap` and `endOverlap`. When `chunkOverlap > 0`, `startOverlap`
     * holds the trailing tokens of the previous chunk and `endOverlap` the
     * leading tokens of the next chunk (at most `chunkOverlap` tokens each).
     */
    split(text) {
        const chunks = this.recursiveSplit(text, this._config.separators, 0);
        const overlap = this._config.chunkOverlap;
        if (overlap > 0) {
            // Start at index 0 so the first chunk also receives its endOverlap.
            for (let i = 0; i < chunks.length; i++) {
                const previous = i > 0 ? chunks[i - 1] : undefined;
                const next = i < chunks.length - 1 ? chunks[i + 1] : undefined;
                // slice() copies, so neighbouring chunks' token arrays are never mutated.
                chunks[i].startOverlap = previous ? previous.tokens.slice(-overlap) : [];
                chunks[i].endOverlap = next ? next.tokens.slice(0, overlap) : [];
            }
        }
        return chunks;
    }
    /**
     * Splits `text` on the first separator and recurses into oversized parts
     * with the remaining separators.
     *
     * @param text Text to split.
     * @param separators Separators still available, highest priority first.
     * @param startPos Position assigned to the first character of `text`.
     * @returns The combined chunks for `text`.
     */
    recursiveSplit(text, separators, startPos) {
        const chunks = [];
        if (text.length > 0) {
            let parts;
            let separator = '';
            const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
            if (separators.length > 0) {
                // Split by separator
                separator = separators[0];
                parts = separator === ' ' ? this.splitBySpaces(text) : text.split(separator);
            }
            else {
                // No separators left: cut the text in half
                const half = Math.floor(text.length / 2);
                parts = [text.substring(0, half), text.substring(half)];
            }
            for (let i = 0; i < parts.length; i++) {
                const lastChunk = (i === parts.length - 1);
                let chunk = parts[i];
                // endPos is computed before the separator is re-attached and
                // accounts for the separator consumed by the split.
                const endPos = (startPos + (chunk.length - 1)) + (lastChunk ? 0 : separator.length);
                if (this._config.keepSeparators && !lastChunk) {
                    chunk += separator;
                }
                // Whitespace-only chunks are dropped; only their position is consumed.
                if (/\S/.test(chunk)) {
                    // Optimization to avoid encoding really large chunks
                    const skipEncoding = chunk.length / 6 > this._config.chunkSize;
                    const tokens = skipEncoding ? undefined : this._config.tokenizer.encode(chunk);
                    // A one-character chunk with no separators left cannot shrink any
                    // further; recursing on it would never terminate.
                    const canSplitFurther = nextSeparators.length > 0 || chunk.length > 1;
                    if (tokens === undefined || (tokens.length > this._config.chunkSize && canSplitFurther)) {
                        // Break the text into smaller chunks
                        chunks.push(...this.recursiveSplit(chunk, nextSeparators, startPos));
                    }
                    else {
                        // Append chunk to output (kept as-is when it cannot be split further)
                        chunks.push({
                            text: chunk,
                            tokens: tokens,
                            startPos: startPos,
                            endPos: endPos,
                            startOverlap: [],
                            endOverlap: [],
                        });
                    }
                }
                // Update startPos
                startPos = endPos + 1;
            }
        }
        return this.combineChunks(chunks);
    }
    /**
     * Greedily merges adjacent chunks while the merged token count stays within
     * `chunkSize`. Chunks made only of punctuation (non-whitespace without any
     * ASCII letter or digit, e.g. '---') are never merged with a neighbour.
     *
     * @param chunks Chunks in document order; merged chunks are updated in place.
     * @returns The merged chunk list.
     */
    combineChunks(chunks) {
        const combinedChunks = [];
        let currentChunk;
        const isWhitespaceOnly = (t) => !/\S/.test(t);
        const isPunctuationOnly = (t) => /\S/.test(t) && !/[a-zA-Z0-9]/.test(t);
        for (const chunk of chunks) {
            if (!currentChunk) {
                currentChunk = chunk;
                continue;
            }
            const keepApart = isPunctuationOnly(currentChunk.text) || isPunctuationOnly(chunk.text);
            const overBudget = currentChunk.tokens.length + chunk.tokens.length > this._config.chunkSize;
            if (keepApart || overBudget) {
                combinedChunks.push(currentChunk);
                currentChunk = chunk;
                continue;
            }
            // A single space joins merged text unless separators are kept in the
            // chunk text or either side is whitespace-only (defensive).
            const joiner = (!this._config.keepSeparators && !isWhitespaceOnly(currentChunk.text) && !isWhitespaceOnly(chunk.text)) ? ' ' : '';
            currentChunk.text += joiner + chunk.text;
            currentChunk.endPos = chunk.endPos;
            currentChunk.tokens.push(...chunk.tokens);
        }
        if (currentChunk) {
            combinedChunks.push(currentChunk);
        }
        return combinedChunks;
    }
    /**
     * Encodes `text` and decodes it back in spans of at most `chunkSize` tokens.
     *
     * @param text Text to split.
     * @returns The decoded spans in order; always at least one entry.
     */
    splitBySpaces(text) {
        const parts = [];
        const tokens = this._config.tokenizer.encode(text);
        while (tokens.length > this._config.chunkSize) {
            // splice() removes the span from the front of the remaining tokens
            const span = tokens.splice(0, this._config.chunkSize);
            parts.push(this._config.tokenizer.decode(span));
        }
        parts.push(this._config.tokenizer.decode(tokens));
        return parts;
    }
    /**
     * Returns the default separator list for a document type.
     *
     * @param docType Document type key such as "md", "py" or "html"; unknown or
     * missing values fall back to paragraph and line breaks.
     * @returns Separators ordered from highest to lowest priority.
     */
    getSeparators(docType) {
        switch (docType == null ? '' : docType) {
            case "cpp":
                return [
                    "\nclass ", "\nvoid ", "\nint ", "\nfloat ", "\ndouble ",
                    "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ",
                    "\n\n", "\n",
                ];
            case "go":
                return [
                    "\nfunc ", "\nvar ", "\nconst ", "\ntype ",
                    "\nif ", "\nfor ", "\nswitch ", "\ncase ",
                    "\n\n", "\n",
                ];
            case "java":
            case "c#":
            case "csharp":
            case "cs":
            case "ts":
            case "tsx":
            case "typescript":
                return [
                    "// LLM-REGION", "/* LLM-REGION", "/** LLM-REGION",
                    "\nclass ", "\npublic ", "\nprotected ", "\nprivate ", "\nstatic ",
                    "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ",
                    "\n\n", "\n", " "
                ];
            case "js":
            case "jsx":
            case "javascript":
                return [
                    "// LLM-REGION", "/* LLM-REGION", "/** LLM-REGION",
                    "\nclass ", "\nfunction ", "\nconst ", "\nlet ", "\nvar ", "\nclass ",
                    "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ", "\ndefault ",
                    "\n\n", "\n",
                ];
            case "php":
                return [
                    "\nfunction ", "\nclass ",
                    "\nif ", "\nforeach ", "\nwhile ", "\ndo ", "\nswitch ", "\ncase ",
                    "\n\n", "\n",
                ];
            case "proto":
                return [
                    "\nmessage ", "\nservice ", "\nenum ", "\noption ", "\nimport ", "\nsyntax ",
                    "\n\n", "\n",
                ];
            case "python":
            case "py":
                return [
                    "\nclass ", "\ndef ", "\n\tdef ",
                    "\n\n", "\n",
                ];
            case "rst":
                return [
                    "\n===\n", "\n---\n", "\n***\n", "\n.. ",
                    "\n\n", "\n",
                ];
            case "ruby":
                return [
                    "\ndef ", "\nclass ",
                    "\nif ", "\nunless ", "\nwhile ", "\nfor ", "\ndo ", "\nbegin ", "\nrescue ",
                    "\n\n", "\n",
                ];
            case "rust":
                return [
                    "\nfn ", "\nconst ", "\nlet ",
                    "\nif ", "\nwhile ", "\nfor ", "\nloop ", "\nmatch ", "\nconst ",
                    "\n\n", "\n",
                ];
            case "scala":
                return [
                    "\nclass ", "\nobject ", "\ndef ", "\nval ", "\nvar ",
                    "\nif ", "\nfor ", "\nwhile ", "\nmatch ", "\ncase ",
                    "\n\n", "\n",
                ];
            case "swift":
                return [
                    "\nfunc ", "\nclass ", "\nstruct ", "\nenum ",
                    "\nif ", "\nfor ", "\nwhile ", "\ndo ", "\nswitch ", "\ncase ",
                    "\n\n", "\n",
                ];
            case "md":
            case "markdown":
                return [
                    "\n## ", "\n### ", "\n#### ", "\n##### ", "\n###### ",
                    "```\n\n",
                    "\n\n***\n\n", "\n\n---\n\n", "\n\n___\n\n",
                    "<table>",
                    "\n\n", "\n",
                ];
            case "latex":
                return [
                    "\n\\chapter{", "\n\\section{", "\n\\subsection{", "\n\\subsubsection{",
                    "\n\\begin{enumerate}", "\n\\begin{itemize}", "\n\\begin{description}",
                    "\n\\begin{list}", "\n\\begin{quote}", "\n\\begin{quotation}",
                    "\n\\begin{verse}", "\n\\begin{verbatim}", "\n\\begin{align}",
                    "\n\n", "\n",
                ];
            case "html":
                return [
                    "<body>", "<div>", "<p>", "<br>", "<li>",
                    "<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>",
                    "<span>", "<table>", "<tr>", "<td>", "<th>", "<ul>", "<ol>",
                    "<header>", "<footer>", "<nav>", "<head>",
                    "<style>", "<script>", "<meta>", "<title>",
                ];
            case "sol":
                return [
                    "\npragma ", "\nusing ", "\ncontract ", "\ninterface ", "\nlibrary ",
                    "\nconstructor ", "\ntype ", "\nfunction ", "\nevent ", "\nmodifier ",
                    "\nerror ", "\nstruct ", "\nenum ",
                    "\nif ", "\nfor ", "\nwhile ", "\ndo while ", "\nassembly ",
                    "\n\n", "\n",
                ];
            default:
                return [
                    "\n\n", "\n",
                ];
        }
    }
}
|
|
456
|
+
exports.TextSplitter = TextSplitter;
|
|
457
|
+
//# sourceMappingURL=TextSplitter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"TextSplitter.js","sourceRoot":"","sources":["../src/TextSplitter.ts"],"names":[],"mappings":";;;AAAA,mDAAgD;AAGhD,MAAM,kBAAkB,GAAG,gEAAgE,CAAC;AAW5F,MAAa,YAAY;IAGvB,YAAmB,MAAoC;QACrD,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC;YAC3B,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,GAAG;YACd,YAAY,EAAE,EAAE;SACK,EAAE,MAAM,CAAC,CAAC;QAEjC,iDAAiD;QACjD,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC;YAC5B,IAAI,CAAC,OAAO,CAAC,SAAS,GAAG,IAAI,6BAAa,EAAE,CAAC;QAC/C,CAAC;QAED,8CAA8C;QAC9C,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,IAAI,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACrE,IAAI,CAAC,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QACrE,CAAC;QAED,+BAA+B;QAC/B,IAAI,IAAI,CAAC,OAAO,CAAC,SAAS,GAAG,CAAC,EAAE,CAAC;YAC/B,MAAM,IAAI,KAAK,CAAC,wBAAwB,CAAC,CAAC;QAC5C,CAAC;aAAM,IAAI,IAAI,CAAC,OAAO,CAAC,YAAY,GAAG,CAAC,EAAE,CAAC;YACzC,MAAM,IAAI,KAAK,CAAC,2BAA2B,CAAC,CAAC;QAC/C,CAAC;aAAM,IAAI,IAAI,CAAC,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC;YAC9D,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;QACvD,CAAC;IACH,CAAC;IAEM,KAAK,CAAC,IAAY;QACvB,mBAAmB;QACnB,MAAM,MAAM,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;QAErE,MAAM,IAAI,GAAG,IAAI,CAAC;QAClB,SAAS,gBAAgB,CAAC,MAAiB;YACzC,IAAI,MAAM,IAAI,SAAS,EAAE,CAAC;gBACxB,MAAM,GAAG,GAAG,MAAM,CAAC,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC;gBAClG,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YAC9B,CAAC;iBAAM,CAAC;gBACN,OAAO,EAAE,CAAC;YACZ,CAAC;QACH,CAAC;QAED,iEAAiE;QACjE,IAAI,IAAI,CAAC,OAAO,CAAC,YAAY,GAAG,CAAC,EAAE,CAAC;YAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACvC,MAAM,aAAa,GAAG,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;gBACpC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;gBACxB,MAAM,SAAS,GAAG,CAAC,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;gBAEpE,mFAAmF;gBACnF,MAAM,cAAc,GAAG,aAAa,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;gBACpD,KA
AK,CAAC,YAAY,GAAG,gBAAgB,CAAC,cAAc,CAAC,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC;gBAC1E,KAAK,CAAC,UAAU,GAAG,gBAAgB,CAAC,SAAS,aAAT,SAAS,uBAAT,SAAS,CAAE,MAAM,CAAC,CAAC;YACzD,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAEO,cAAc,CAAC,IAAY,EAAE,UAAoB,EAAE,QAAgB;QACzE,MAAM,MAAM,GAAgB,EAAE,CAAC;QAE/B,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACpB,wBAAwB;YACxB,IAAI,KAAe,CAAC;YACpB,IAAI,SAAS,GAAG,EAAE,CAAC;YACnB,MAAM,cAAc,GAAG,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAExE,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1B,qBAAqB;gBACrB,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;gBAC1B,KAAK,GAAG,SAAS,IAAI,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;YAC9E,CAAC;iBAAM,CAAC;gBACN,mBAAmB;gBACnB,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;gBACzC,KAAK,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC;YAC1D,CAAC;YAED,qBAAqB;YACrB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;gBAE3C,4BAA4B;gBAC5B,IAAI,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACrB,MAAM,MAAM,GAAG,CAAC,QAAQ,GAAG,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;gBAEpF,IAAI,IAAI,CAAC,OAAO,CAAC,cAAc,IAAI,CAAC,SAAS,EAAE,CAAC;oBAC9C,KAAK,IAAI,SAAS,CAAC;gBACrB,CAAC;gBAED,oEAAoE;gBACpE,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;oBACtB,8BAA8B;oBAC9B,QAAQ,GAAG,MAAM,GAAG,CAAC,CAAC;oBACtB,SAAS;gBACX,CAAC;gBAED,qDAAqD;gBACrD,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC;oBAC9C,qCAAqC;oBACrC,MAAM,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,cAAc,EAAE,QAAQ,CAAC,CAAC;oBACvE,MAAM,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;gBAC5B,CAAC;qBAAM,CAAC;oBACN,oBAAoB;oBACpB,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;oBACpD,IAAI,MAAM,CAAC,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EA
AE,CAAC;wBAC3C,qCAAqC;wBACrC,MAAM,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,cAAc,EAAE,QAAQ,CAAC,CAAC;wBACvE,MAAM,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;oBAC5B,CAAC;yBAAM,CAAC;wBACN,yBAAyB;wBACzB,MAAM,CAAC,IAAI,CAAC;4BACV,IAAI,EAAE,KAAK;4BACX,MAAM,EAAE,MAAM;4BACd,QAAQ,EAAE,QAAQ;4BAClB,MAAM,EAAE,MAAM;4BACd,YAAY,EAAE,EAAE;4BAChB,UAAU,EAAE,EAAE;yBACf,CAAC,CAAC;oBACL,CAAC;gBACH,CAAC;gBAED,kBAAkB;gBAClB,QAAQ,GAAG,MAAM,GAAG,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;QAED,OAAO,IAAI,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;IACpC,CAAC;IAEO,aAAa,CAAC,MAAmB;QACvC,MAAM,cAAc,GAAgB,EAAE,CAAC;QACvC,IAAI,YAAmC,CAAC;QACxC,IAAI,aAAa,GAAG,CAAC,CAAC;QAEtB,qFAAqF;QACrF,4FAA4F;QAC5F,gFAAgF;QAChF,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC;QAEzD,MAAM,gBAAgB,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACtD,MAAM,iBAAiB,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAEhF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;YAExB,IAAI,CAAC,YAAY,EAAE,CAAC;gBAClB,YAAY,GAAG,KAAK,CAAC;gBACrB,aAAa,GAAG,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC;gBACpC,SAAS;YACX,CAAC;YAED,6FAA6F;YAC7F,gFAAgF;YAChF,IAAI,iBAAiB,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,iBAAiB,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC1E,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;gBAClC,YAAY,GAAG,KAAK,CAAC;gBACrB,aAAa,GAAG,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC;gBACpC,SAAS;YACX,CAAC;YAED,gDAAgD;YAChD,MAAM,MAAM,GAAG,YAAY,CAAC,MAAM,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC;YAChE,IAAI,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC;gBACpC,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;gBAClC,YAAY,GAAG,KAAK,CAAC;gBACrB,aAAa,GAAG,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC;YACtC,CAAC;iBAAM,CAAC;gBACN,wEAAwE;gBACxE,MAAM,MAAM,GAAG,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,cAAc,IAAI,CAAC,gBAAgB,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC;gBACxI,YAAY,CAAC,IAAI,IAAI,MAAM,GAAG,KAAK,CAAC,I
AAI,CAAC;gBACzC,YAAY,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;gBACnC,YAAY,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC;gBAC1C,aAAa,IAAI,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC;YACvC,CAAC;QACH,CAAC;QAED,IAAI,YAAY,EAAE,CAAC;YACjB,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QACpC,CAAC;QAED,OAAO,cAAc,CAAC;IACxB,CAAC;IAEO,aAAa,CAAC,IAAY;QAChC,wCAAwC;QACxC,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,IAAI,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QAEjD,GAAG,CAAC;YACF,IAAI,MAAM,CAAC,MAAM,IAAI,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC;gBAC5C,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;gBAClD,MAAM;YACR,CAAC;iBAAM,CAAC;gBACN,MAAM,IAAI,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,EAAE,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;gBACtD,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;YAClD,CAAC;QACH,CAAC,QAAQ,IAAI,EAAE;QAEf,OAAO,KAAK,CAAC;IACf,CAAC;IAEO,aAAa,CAAC,OAAgB;QACpC,QAAQ,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,EAAE,CAAC;YACtB,KAAK,KAAK;gBACR,OAAO;oBACL,UAAU;oBACV,SAAS;oBACT,QAAQ;oBACR,UAAU;oBACV,WAAW;oBACX,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,IAAI;gBACP,OAAO;oBACL,SAAS;oBACT,QAAQ;oBACR,UAAU;oBACV,SAAS;oBACT,OAAO;oBACP,QAAQ;oBACR,WAAW;oBACX,SAAS;oBACT,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,MAAM,CAAC;YACZ,KAAK,IAAI,CAAC;YACV,KAAK,QAAQ,CAAC;YACd,KAAK,IAAI,CAAC;YACV,KAAK,IAAI,CAAC;YACV,KAAK,KAAK,CAAC;YACX,KAAK,YAAY;gBACf,OAAO;oBACL,eAAe;oBACf,eAAe;oBACf,gBAAgB;oBAChB,UAAU;oBACV,WAAW;oBACX,cAAc;oBACd,YAAY;oBACZ,WAAW;oBACX,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,MAAM;oBACN,IAAI;oBACJ,GAAG;iBACJ,CAAC;YACJ,KAAK,IAAI,CAAC;YACV,KAAK,KAAK,CAAC;YACX,KAAK,YAAY;gBACf,OAAO;oBACL,eAAe;oBACf,eAAe;oBACf,gBAAgB;oBAChB,UAAU;oBACV,aAAa;oBACb,UAAU;oBACV,QAAQ;oBACR,QAAQ;oBACR,UAAU;oBACV,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,YAAY;oBACZ,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,KAAK;gBACR,OAAO;oBACL,aAAa;oBACb,UAAU;oBACV,OAAO;oBACP,YAAY;oBACZ,UAAU;oBACV,OAAO;oBACP,WAAW;oBACX,SAAS;oBACT,MAAM
;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,OAAO;gBACV,OAAO;oBACL,YAAY;oBACZ,YAAY;oBACZ,SAAS;oBACT,WAAW;oBACX,WAAW;oBACX,WAAW;oBACX,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,QAAQ,CAAC;YACd,KAAK,IAAI;gBACP,OAAO;oBACL,UAAU;oBACV,QAAQ;oBACR,UAAU;oBACV,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,KAAK;gBACR,OAAO;oBACL,SAAS;oBACT,SAAS;oBACT,SAAS;oBACT,OAAO;oBACP,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,MAAM;gBACT,OAAO;oBACL,QAAQ;oBACR,UAAU;oBACV,OAAO;oBACP,WAAW;oBACX,UAAU;oBACV,QAAQ;oBACR,OAAO;oBACP,UAAU;oBACV,WAAW;oBACX,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,MAAM;gBACT,OAAO;oBACL,OAAO;oBACP,UAAU;oBACV,QAAQ;oBACR,OAAO;oBACP,UAAU;oBACV,QAAQ;oBACR,SAAS;oBACT,UAAU;oBACV,UAAU;oBACV,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,OAAO;gBACV,OAAO;oBACL,UAAU;oBACV,WAAW;oBACX,QAAQ;oBACR,QAAQ;oBACR,QAAQ;oBACR,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,UAAU;oBACV,SAAS;oBACT,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,OAAO;gBACV,OAAO;oBACL,SAAS;oBACT,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,OAAO;oBACP,WAAW;oBACX,SAAS;oBACT,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,IAAI,CAAC;YACV,KAAK,UAAU;gBACb,OAAO;oBACL,OAAO;oBACP,QAAQ;oBACR,SAAS;oBACT,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,aAAa;oBACb,aAAa;oBACb,aAAa;oBACb,SAAS;oBACT,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,OAAO;gBACV,OAAO;oBACL,cAAc;oBACd,cAAc;oBACd,iBAAiB;oBACjB,oBAAoB;oBACpB,sBAAsB;oBACtB,oBAAoB;oBACpB,wBAAwB;oBACxB,iBAAiB;oBACjB,kBAAkB;oBAClB,sBAAsB;oBACtB,kBAAkB;oBAClB,qBAAqB;oBACrB,kBAAkB;oBAClB,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,MAAM;gBACT,OAAO;oBACL,QAAQ;oBACR,OAAO;oBACP,KAAK;oBACL,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,QAAQ;oBACR,SAAS;oBACT,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,UAAU;oBACV,UAAU;oBACV,OAAO;oBACP,QAAQ;oBACR,SAAS;oBACT,UAAU;oBACV,QAAQ;oBACR,SAAS;iBACV,CAAC;YACJ,KAAK,KAAK;gBACR,OAAO;oBACL,WAAW;oBACX,UAAU;oBACV,aAAa;oBACb,cAAc;oBACd,YAAY;oBACZ,gBAAgB;oBAChB,SAAS;oBACT,aAAa;oBACb,UAAU;oBACV,aAAa;oBACb,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,aAAa;oBACb,aAAa;oBACb,MAA
M;oBACN,IAAI;iBACL,CAAC;YACJ;gBACE,OAAO;oBACL,MAAM;oBACN,IAAI;iBACL,CAAC;QACN,CAAC;IACH,CAAC;CACF;AA1dD,oCA0dC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"TextSplitter.spec.d.ts","sourceRoot":"","sources":["../src/TextSplitter.spec.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Re-exports property `key` of `source` on `target` under the name `alias`
// (or `key` when no alias is given). An already-installed helper on `this` is reused.
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        var exportName = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        // Fall back to a live getter when the source has no such own property, when an
        // accessor comes from a non-ES module, or when a data property can still change.
        var needsGetter = !descriptor ||
            ("get" in descriptor ? !source.__esModule : descriptor.writable || descriptor.configurable);
        if (needsGetter) {
            descriptor = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, exportName, descriptor);
    }
    : function (target, source, key, alias) {
        // Without Object.create, copy the current value instead of binding it.
        target[alias === undefined ? key : alias] = source[key];
    });
|
|
13
|
+
// Stores the original module object as the `default` export of a namespace object.
// An already-installed helper on `this` is reused.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (namespace, original) {
        Object.defineProperty(namespace, "default", { value: original, enumerable: true });
    }
    : function (namespace, original) {
        namespace["default"] = original;
    });
|
|
18
|
+
// Wraps a required module in a namespace object: ES modules are returned as-is,
// anything else gets its own non-"default" keys bound plus a `default` entry that
// points at the module itself. An already-installed helper on `this` is reused.
var __importStar = (this && this.__importStar) || (function () {
    // Chooses the key-listing strategy on first use and replaces itself with it.
    var listOwnKeys = function (obj) {
        listOwnKeys = Object.getOwnPropertyNames || function (source) {
            var names = [];
            for (var prop in source) {
                if (Object.prototype.hasOwnProperty.call(source, prop)) {
                    names[names.length] = prop;
                }
            }
            return names;
        };
        return listOwnKeys(obj);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var names = listOwnKeys(mod);
            for (var idx = 0; idx < names.length; idx++) {
                if (names[idx] !== "default") {
                    __createBinding(namespace, mod, names[idx]);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
const mocha_1 = require("mocha");
|
|
37
|
+
const assert = __importStar(require("node:assert"));
|
|
38
|
+
const TextSplitter_1 = require("./TextSplitter");
|
|
39
|
+
// Regression suite for TextSplitter: punctuation-only chunks must survive splitting,
// whitespace-only chunks must be dropped, and ordinary content must be unaffected.
(0, mocha_1.describe)('TextSplitter', () => {
    // Builds a splitter with small defaults that individual tests can override.
    const makeSplitter = (opts) => {
        const settings = Object.assign({ chunkSize: 16, chunkOverlap: 0 }, opts);
        return new TextSplitter_1.TextSplitter(settings);
    };
    // Text of every chunk, in order.
    const textsOf = (chunks) => chunks.map(c => c.text);
    (0, mocha_1.it)('keeps a leading punctuation-only chunk ("---")', () => {
        const result = makeSplitter({ chunkSize: 3, chunkOverlap: 0 }).split('---');
        assert.deepStrictEqual(textsOf(result), ['---']);
    });
    (0, mocha_1.it)('keeps punctuation-only separators (---, ***, ====) at start, middle, and end', () => {
        const input = ['---', 'Hello world', '***', 'Middle', '===='].join('\n');
        const result = makeSplitter({ chunkSize: 4, chunkOverlap: 0 }).split(input);
        assert.ok(result.some(c => c.text.includes('---')));
        assert.ok(result.some(c => c.text.includes('***')));
        assert.ok(result.some(c => c.text.includes('====')));
    });
    (0, mocha_1.it)('preserves frontmatter delimiters when chunk size is small and overlap is zero', () => {
        const lines = [
            '---',
            'title: Test',
            'tags: [a, b]',
            '---',
            '# Heading',
            'Body text goes here.'
        ];
        const result = makeSplitter({ chunkSize: 12, chunkOverlap: 0 }).split(lines.join('\n'));
        const joined = textsOf(result).join('\n');
        // Count the lines that consist of exactly '---'.
        const delimiters = joined.match(/^---$/gm);
        const delimiterCount = delimiters == null ? 0 : delimiters.length;
        assert.strictEqual(delimiterCount, 2);
    });
    (0, mocha_1.it)('keeps trailing punctuation-only chunk', () => {
        const result = makeSplitter({ chunkSize: 4, chunkOverlap: 0 }).split('Content\n---');
        assert.ok(result.some(c => c.text.includes('---')));
    });
    (0, mocha_1.it)('drops pure whitespace-only chunks', () => {
        const splitter = makeSplitter({ chunkSize: 10, chunkOverlap: 0 });
        const fromSpacesAndTab = splitter.split(' \t ');
        const fromNewlines = splitter.split('\n\n');
        const fromMixed = splitter.split(' \n \n ');
        assert.strictEqual(fromSpacesAndTab.length, 0);
        assert.strictEqual(fromNewlines.length, 0);
        assert.strictEqual(fromMixed.length, 0);
    });
    (0, mocha_1.it)('still returns alphanumeric chunks normally', () => {
        const result = makeSplitter({ chunkSize: 5, chunkOverlap: 0 }).split('abcde fghij');
        assert.ok(result.length > 0);
        assert.ok(textsOf(result).join(' ').includes('abcde'));
        assert.ok(textsOf(result).join(' ').includes('fghij'));
    });
    (0, mocha_1.it)('does not regress with non-zero overlap', () => {
        const result = makeSplitter({ chunkSize: 5, chunkOverlap: 2 }).split('---\nabcdef');
        assert.ok(result.some(c => c.text.includes('---')));
    });
    (0, mocha_1.it)('handles multiple punctuation-only separators interleaved with content', () => {
        const input = ['***', 'A', '---', 'B', '====', 'C'].join('\n');
        const result = makeSplitter({ chunkSize: 8, chunkOverlap: 0 }).split(input);
        assert.ok(result.some(c => c.text.includes('***')));
        assert.ok(result.some(c => c.text.includes('---')));
        assert.ok(result.some(c => c.text.includes('====')));
        // Content between the separators must still appear on lines of its own.
        const joined = textsOf(result).join('\n');
        assert.ok(joined.includes('\nA\n'));
        assert.ok(joined.includes('\nB\n'));
        assert.ok(joined.includes('\nC'));
    });
});
|
|
109
|
+
//# sourceMappingURL=TextSplitter.spec.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"TextSplitter.spec.js","sourceRoot":"","sources":["../src/TextSplitter.spec.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,iCAAqC;AACrC,oDAAsC;AACtC,iDAA8C;AAE9C,IAAA,gBAAQ,EAAC,cAAc,EAAE,GAAG,EAAE;IAC5B,MAAM,YAAY,GAAG,CAAC,IAA6D,EAAE,EAAE,CACrF,IAAI,2BAAY,iBAAG,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,CAAC,IAAK,IAAI,EAAG,CAAC;IAEhE,IAAA,UAAE,EAAC,gDAAgD,EAAE,GAAG,EAAE;QACxD,MAAM,QAAQ,GAAG,YAAY,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QACjE,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACrC,MAAM,CAAC,eAAe,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC;IAC3D,CAAC,CAAC,CAAC;IAEH,IAAA,UAAE,EAAC,8EAA8E,EAAE,GAAG,EAAE;QACtF,MAAM,QAAQ,GAAG,YAAY,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QACjE,MAAM,IAAI,GAAG,CAAC,KAAK,EAAE,aAAa,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACxE,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAEpC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACpD,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACpD,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;IACvD,CAAC,CAAC,CAAC;IAEH,IAAA,UAAE,EAAC,+EAA+E,EAAE,GAAG,EAAE;;QACvF,MAAM,QAAQ,GAAG,YAAY,CAAC,EAAE,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QAClE,MAAM,EAAE,GAAG;YACT,KAAK;YACL,aAAa;YACb,cAAc;YACd,KAAK;YACL,WAAW;YACX,sBAAsB;SACvB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAEb,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QAClC,MAAM,MAAM,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAElD,MAAM,cAAc,GAAG,CAAC,MAAA,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,mCAAI,EAAE,CAAC,CAAC,MAAM,CAAC;QAC9D,MAAM,CAAC,WAAW,CAAC,cAAc,EAAE,CAAC,CAAC,CAAC;IACxC,CAAC,CAAC,CAAC;IAEH,IAAA,UAAE,EAAC,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,QAAQ,GAAG,YAAY
,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QACjE,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC;QAC9C,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACtD,CAAC,CAAC,CAAC;IAEH,IAAA,UAAE,EAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,QAAQ,GAAG,YAAY,CAAC,EAAE,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QAClE,MAAM,OAAO,GAAG,QAAQ,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;QAC1C,MAAM,OAAO,GAAG,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACvC,MAAM,OAAO,GAAG,QAAQ,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;QAC3C,MAAM,CAAC,WAAW,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;QACtC,MAAM,CAAC,WAAW,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;QACtC,MAAM,CAAC,WAAW,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IACxC,CAAC,CAAC,CAAC;IAEH,IAAA,UAAE,EAAC,4CAA4C,EAAE,GAAG,EAAE;QACpD,MAAM,QAAQ,GAAG,YAAY,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QACjE,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QAC7C,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAC7B,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;QAC/D,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;IACjE,CAAC,CAAC,CAAC;IAEH,IAAA,UAAE,EAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,QAAQ,GAAG,YAAY,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QACjE,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QAC7C,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACtD,CAAC,CAAC,CAAC;IAEH,IAAA,UAAE,EAAC,uEAAuE,EAAE,GAAG,EAAE;QAC/E,MAAM,QAAQ,GAAG,YAAY,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QACjE,MAAM,IAAI,GAAG,CAAC,KAAK,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC9D,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAEpC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,E
AAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACpD,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACpD,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;QAErD,MAAM,MAAM,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAClD,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;QACpC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;QACpC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;IACpC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { AxiosRequestConfig } from "axios";
|
|
2
|
+
import { TextFetcher } from './types';
|
|
3
|
+
export interface WebFetcherConfig {
|
|
4
|
+
headers?: Record<string, string>;
|
|
5
|
+
requestConfig?: AxiosRequestConfig;
|
|
6
|
+
htmlToMarkdown: boolean;
|
|
7
|
+
summarizeHtml: boolean;
|
|
8
|
+
}
|
|
9
|
+
export declare class WebFetcher implements TextFetcher {
|
|
10
|
+
private readonly _config;
|
|
11
|
+
constructor(config?: Partial<WebFetcherConfig>);
|
|
12
|
+
fetch(uri: string, onDocument: (uri: string, text: string, docType?: string) => Promise<boolean>): Promise<boolean>;
|
|
13
|
+
private htmlToMarkdown;
|
|
14
|
+
}
|
|
15
|
+
//# sourceMappingURL=WebFetcher.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"WebFetcher.d.ts","sourceRoot":"","sources":["../src/WebFetcher.ts"],"names":[],"mappings":"AAAA,OAAc,EAAE,kBAAkB,EAAE,MAAM,OAAO,CAAC;AAClD,OAAO,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AA2BtC,MAAM,WAAW,gBAAgB;IAC7B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAC,MAAM,CAAC,CAAC;IAChC,aAAa,CAAC,EAAE,kBAAkB,CAAC;IACnC,cAAc,EAAE,OAAO,CAAC;IACxB,aAAa,EAAE,OAAO,CAAC;CAC1B;AAED,qBAAa,UAAW,YAAW,WAAW;IAC1C,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAmB;gBAExB,MAAM,CAAC,EAAE,OAAO,CAAC,gBAAgB,CAAC;IAOxC,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,MAAM,KAAK,OAAO,CAAC,OAAO,CAAC,GAAG,OAAO,CAAC,OAAO,CAAC;IAyChI,OAAO,CAAC,cAAc;CAmCzB"}
|