vectra 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -0
- package/bin/vectra.js +3 -0
- package/lib/GPT3Tokenizer.d.ts +9 -0
- package/lib/GPT3Tokenizer.d.ts.map +1 -0
- package/lib/GPT3Tokenizer.js +17 -0
- package/lib/GPT3Tokenizer.js.map +1 -0
- package/lib/ItemSelector.d.ts +1 -1
- package/lib/ItemSelector.d.ts.map +1 -1
- package/lib/ItemSelector.js.map +1 -1
- package/lib/LocalDocument.d.ts +16 -0
- package/lib/LocalDocument.d.ts.map +1 -0
- package/lib/LocalDocument.js +99 -0
- package/lib/LocalDocument.js.map +1 -0
- package/lib/LocalDocumentIndex.d.ts +48 -0
- package/lib/LocalDocumentIndex.d.ts.map +1 -0
- package/lib/LocalDocumentIndex.js +367 -0
- package/lib/LocalDocumentIndex.js.map +1 -0
- package/lib/LocalDocumentResult.d.ts +12 -0
- package/lib/LocalDocumentResult.d.ts.map +1 -0
- package/lib/LocalDocumentResult.js +186 -0
- package/lib/LocalDocumentResult.js.map +1 -0
- package/lib/LocalIndex.d.ts +9 -63
- package/lib/LocalIndex.d.ts.map +1 -1
- package/lib/LocalIndex.js +14 -1
- package/lib/LocalIndex.js.map +1 -1
- package/lib/OpenAIEmbeddings.d.ts +98 -0
- package/lib/OpenAIEmbeddings.d.ts.map +1 -0
- package/lib/OpenAIEmbeddings.js +139 -0
- package/lib/OpenAIEmbeddings.js.map +1 -0
- package/lib/TextSplitter.d.ts +17 -0
- package/lib/TextSplitter.d.ts.map +1 -0
- package/lib/TextSplitter.js +460 -0
- package/lib/TextSplitter.js.map +1 -0
- package/lib/WebFetcher.d.ts +16 -0
- package/lib/WebFetcher.d.ts.map +1 -0
- package/lib/WebFetcher.js +144 -0
- package/lib/WebFetcher.js.map +1 -0
- package/lib/index.d.ts +8 -0
- package/lib/index.d.ts.map +1 -1
- package/lib/index.js +13 -1
- package/lib/index.js.map +1 -1
- package/lib/internals/Colorize.d.ts +14 -0
- package/lib/internals/Colorize.d.ts.map +1 -0
- package/lib/internals/Colorize.js +64 -0
- package/lib/internals/Colorize.js.map +1 -0
- package/lib/internals/index.d.ts +3 -0
- package/lib/internals/index.d.ts.map +1 -0
- package/lib/internals/index.js +19 -0
- package/lib/internals/index.js.map +1 -0
- package/lib/internals/types.d.ts +42 -0
- package/lib/internals/types.d.ts.map +1 -0
- package/lib/internals/types.js +3 -0
- package/lib/internals/types.js.map +1 -0
- package/lib/types.d.ts +133 -0
- package/lib/types.d.ts.map +1 -0
- package/lib/types.js +3 -0
- package/lib/types.js.map +1 -0
- package/lib/vectra-cli.d.ts +2 -0
- package/lib/vectra-cli.d.ts.map +1 -0
- package/lib/vectra-cli.js +277 -0
- package/lib/vectra-cli.js.map +1 -0
- package/package.json +21 -3
- package/src/GPT3Tokenizer.ts +15 -0
- package/src/ItemSelector.ts +9 -9
- package/src/LocalDocument.ts +70 -0
- package/src/LocalDocumentIndex.ts +355 -0
- package/src/LocalDocumentResult.ts +206 -0
- package/src/LocalIndex.ts +12 -78
- package/src/OpenAIEmbeddings.ts +205 -0
- package/src/TextSplitter.ts +480 -0
- package/src/WebFetcher.ts +128 -0
- package/src/index.ts +8 -0
- package/src/internals/Colorize.ts +64 -0
- package/src/internals/index.ts +2 -0
- package/src/internals/types.ts +46 -0
- package/src/types.ts +160 -0
- package/src/vectra-cli.ts +238 -0
|
@@ -0,0 +1,480 @@
|
|
|
1
|
+
import { GPT3Tokenizer } from "./GPT3Tokenizer";
|
|
2
|
+
import { TextChunk, Tokenizer } from "./types";
|
|
3
|
+
|
|
4
|
+
/**
 * Settings that control how a `TextSplitter` breaks text into chunks.
 */
export interface TextSplitterConfig {
    /**
     * Ordered list of separators. The splitter tries them in sequence, only
     * moving on to the next one for parts that are still too large.
     * If the list is empty, `TextSplitter` picks one based on `docType`.
     */
    separators: string[];

    /**
     * When `true`, the separator is appended back onto every part except
     * the last one produced by a split.
     */
    keepSeparators: boolean;

    /**
     * Maximum number of tokens allowed in a single chunk. Must be >= 1.
     */
    chunkSize: number;

    /**
     * Number of tokens borrowed from neighbouring chunks.
     * Must be >= 0 and <= `chunkSize`.
     */
    chunkOverlap: number;

    /**
     * Tokenizer used to measure chunks. `TextSplitter` creates a
     * `GPT3Tokenizer` when none is supplied.
     */
    tokenizer: Tokenizer;

    /**
     * Optional document type (e.g. "markdown", "python") used to select a
     * separator list when `separators` is empty.
     */
    docType?: string;
}
|
|
12
|
+
|
|
13
|
+
/**
 * Splits text into token-bounded chunks by recursively applying an ordered
 * list of separators, then annotates each chunk with the tokens it overlaps
 * with its neighbours.
 */
export class TextSplitter {
    private readonly _config: TextSplitterConfig;

    /**
     * Creates a new splitter.
     * @param config Optional settings. Missing values default to
     * `keepSeparators: false`, `chunkSize: 400`, `chunkOverlap: 40`, a
     * `GPT3Tokenizer`, and a separator list chosen from `docType`.
     * @throws Error if `chunkSize < 1`, `chunkOverlap < 0`, or
     * `chunkOverlap > chunkSize`.
     */
    public constructor(config?: Partial<TextSplitterConfig>) {
        // Separators are deliberately NOT defaulted here: merging a default
        // list would make the docType-based selection below unreachable.
        this._config = Object.assign({
            keepSeparators: false,
            chunkSize: 400,
            chunkOverlap: 40,
        } as TextSplitterConfig, config);

        // Create a default tokenizer if none is provided
        if (!this._config.tokenizer) {
            this._config.tokenizer = new GPT3Tokenizer();
        }

        // Pick separators from the docType if none are provided. With no
        // docType this yields the generic ["\n\n", "\n", " ", ""] list.
        if (!this._config.separators || this._config.separators.length === 0) {
            this._config.separators = this.getSeparators(this._config.docType);
        }

        // Validate the config settings
        if (this._config.chunkSize < 1) {
            throw new Error("chunkSize must be >= 1");
        } else if (this._config.chunkOverlap < 0) {
            throw new Error("chunkOverlap must be >= 0");
        } else if (this._config.chunkOverlap > this._config.chunkSize) {
            throw new Error("chunkOverlap must be <= chunkSize");
        }
    }

    /**
     * Splits `text` into chunks of at most `chunkSize` tokens.
     *
     * When `chunkOverlap > 0`, every chunk's `startOverlap` holds the last
     * `chunkOverlap` tokens of the previous chunk and `endOverlap` holds the
     * first `chunkOverlap` tokens of the next chunk (empty at the edges).
     * @param text Text to split.
     * @returns The chunks in document order.
     */
    public split(text: string): TextChunk[] {
        // Get basic chunks
        const chunks = this.recursiveSplit(text, this._config.separators, 0);

        // Add overlap tokens to the start and end of each chunk
        const overlap = this._config.chunkOverlap;
        if (overlap > 0) {
            for (let i = 0; i < chunks.length; i++) {
                const previousChunk = i > 0 ? chunks[i - 1] : undefined;
                const nextChunk = i < chunks.length - 1 ? chunks[i + 1] : undefined;

                // slice() copies, so neighbouring chunks' tokens are never mutated.
                chunks[i].startOverlap = previousChunk ? previousChunk.tokens.slice(-overlap) : [];
                chunks[i].endOverlap = nextChunk ? nextChunk.tokens.slice(0, overlap) : [];
            }
        }

        return chunks;
    }

    /**
     * Splits `text` on the first separator and recurses with the remaining
     * separators into any part whose token count exceeds `chunkSize`.
     *
     * A part that is still too large once the separators are exhausted
     * produces no chunks (the recursive call returns an empty list).
     * @param text Text to split.
     * @param separators Separators still available, in priority order.
     * @param startPos Position assigned to the first character of `text`.
     * @returns Chunks with empty `startOverlap` / `endOverlap`.
     */
    private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
        const chunks: TextChunk[] = [];
        if (text.length > 0 && separators.length > 0) {
            const separator = separators[0];
            const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
            const parts = text.split(separator);
            for (let i = 0; i < parts.length; i++) {
                const lastChunk = (i === parts.length - 1);

                // endPos is computed from the bare part and additionally
                // spans the separator that followed it (if any).
                let chunk = parts[i];
                const endPos = (startPos + (chunk.length - 1)) + (lastChunk ? 0 : separator.length);
                if (this._config.keepSeparators && !lastChunk) {
                    chunk += separator;
                }

                // Encode chunk text
                const tokens = this._config.tokenizer.encode(chunk);
                if (tokens.length > this._config.chunkSize) {
                    // Break the text into smaller chunks
                    const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
                    chunks.push(...subChunks);
                } else {
                    // Append chunk to output
                    chunks.push({
                        text: chunk,
                        tokens: tokens,
                        startPos: startPos,
                        endPos: endPos,
                        startOverlap: [],
                        endOverlap: [],
                    });
                }

                // Update startPos
                startPos = endPos + 1;
            }
        }

        return chunks;
    }

    /**
     * Returns the ordered separator list for a document type.
     * @param docType Document type identifier; unknown or missing values
     * get the generic line-based separators.
     * @returns A fresh array of separators, most significant first.
     */
    private getSeparators(docType?: string): string[] {
        // Generic tail shared by every list except "html".
        const lines = ["\n\n", "\n", " ", ""];

        switch (docType ?? '') {
            case "cpp":
                return [
                    // Split along class definitions
                    "\nclass ",
                    // Split along function definitions
                    "\nvoid ", "\nint ", "\nfloat ", "\ndouble ",
                    // Split along control flow statements
                    "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ",
                    // Split by the normal type of lines
                    ...lines,
                ];
            case "go":
                return [
                    // Split along function definitions
                    "\nfunc ", "\nvar ", "\nconst ", "\ntype ",
                    // Split along control flow statements
                    "\nif ", "\nfor ", "\nswitch ", "\ncase ",
                    // Split by the normal type of lines
                    ...lines,
                ];
            case "java":
            case "c#":
            case "csharp":
            case "cs":
            case "ts":
            case "tsx":
            case "typescript":
                return [
                    // Split along class definitions
                    "\nclass ",
                    // Split along method definitions
                    "\npublic ", "\nprotected ", "\nprivate ", "\nstatic ",
                    // Split along control flow statements
                    "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ",
                    // Split by the normal type of lines
                    ...lines,
                ];
            case "js":
            case "jsx":
            case "javascript":
                return [
                    // Split along class definitions
                    "\nclass ",
                    // Split along function definitions
                    // ("\nclass " appears a second time; kept to preserve the existing list)
                    "\nfunction ", "\nconst ", "\nlet ", "\nvar ", "\nclass ",
                    // Split along control flow statements
                    "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ", "\ndefault ",
                    // Split by the normal type of lines
                    ...lines,
                ];
            case "php":
                return [
                    // Split along function definitions
                    "\nfunction ",
                    // Split along class definitions
                    "\nclass ",
                    // Split along control flow statements
                    "\nif ", "\nforeach ", "\nwhile ", "\ndo ", "\nswitch ", "\ncase ",
                    // Split by the normal type of lines
                    ...lines,
                ];
            case "proto":
                return [
                    // Split along message definitions
                    "\nmessage ",
                    // Split along service definitions
                    "\nservice ",
                    // Split along enum definitions
                    "\nenum ",
                    // Split along option definitions
                    "\noption ",
                    // Split along import statements
                    "\nimport ",
                    // Split along syntax declarations
                    "\nsyntax ",
                    // Split by the normal type of lines
                    ...lines,
                ];
            case "python":
            case "py":
                return [
                    // First, try to split along class and function definitions
                    "\nclass ", "\ndef ", "\n\tdef ",
                    // Now split by the normal type of lines
                    ...lines,
                ];
            case "rst":
                return [
                    // Split along section titles
                    "\n===\n", "\n---\n", "\n***\n",
                    // Split along directive markers
                    "\n.. ",
                    // Split by the normal type of lines
                    ...lines,
                ];
            case "ruby":
                return [
                    // Split along method definitions
                    "\ndef ", "\nclass ",
                    // Split along control flow statements
                    "\nif ", "\nunless ", "\nwhile ", "\nfor ", "\ndo ", "\nbegin ", "\nrescue ",
                    // Split by the normal type of lines
                    ...lines,
                ];
            case "rust":
                return [
                    // Split along function definitions
                    "\nfn ", "\nconst ", "\nlet ",
                    // Split along control flow statements
                    // ("\nconst " appears a second time; kept to preserve the existing list)
                    "\nif ", "\nwhile ", "\nfor ", "\nloop ", "\nmatch ", "\nconst ",
                    // Split by the normal type of lines
                    ...lines,
                ];
            case "scala":
                return [
                    // Split along class definitions
                    "\nclass ", "\nobject ",
                    // Split along method definitions
                    "\ndef ", "\nval ", "\nvar ",
                    // Split along control flow statements
                    "\nif ", "\nfor ", "\nwhile ", "\nmatch ", "\ncase ",
                    // Split by the normal type of lines
                    ...lines,
                ];
            case "swift":
                return [
                    // Split along function definitions
                    "\nfunc ",
                    // Split along class definitions
                    "\nclass ", "\nstruct ", "\nenum ",
                    // Split along control flow statements
                    "\nif ", "\nfor ", "\nwhile ", "\ndo ", "\nswitch ", "\ncase ",
                    // Split by the normal type of lines
                    ...lines,
                ];
            case "markdown":
                return [
                    // First, try to split along Markdown headings (starting with level 2).
                    // The alternative underline syntax for headings is not handled.
                    "\n## ", "\n### ", "\n#### ", "\n##### ", "\n###### ",
                    // End of code block
                    "```\n\n",
                    // Horizontal lines (only the exact three-character forms are handled)
                    "\n\n***\n\n", "\n\n---\n\n", "\n\n___\n\n",
                    // Split by the normal type of lines
                    ...lines,
                ];
            case "latex":
                return [
                    // First, try to split along Latex sections
                    "\n\\chapter{", "\n\\section{", "\n\\subsection{", "\n\\subsubsection{",
                    // Now split by environments
                    "\n\\begin{enumerate}",
                    "\n\\begin{itemize}",
                    "\n\\begin{description}",
                    "\n\\begin{list}",
                    "\n\\begin{quote}",
                    "\n\\begin{quotation}",
                    "\n\\begin{verse}",
                    "\n\\begin{verbatim}",
                    // Now split by math environments
                    "\n\\begin{align}", "$$", "$",
                    // Now split by the normal type of lines
                    ...lines,
                ];
            case "html":
                return [
                    // First, try to split along HTML tags
                    "<body>", "<div>", "<p>", "<br>", "<li>",
                    "<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>",
                    "<span>", "<table>", "<tr>", "<td>", "<th>", "<ul>", "<ol>",
                    "<header>", "<footer>", "<nav>",
                    // Head
                    "<head>", "<style>", "<script>", "<meta>", "<title>",
                    // Normal type of lines (no newline separators for HTML)
                    " ",
                    "",
                ];
            case "sol":
                return [
                    // Split along compiler information definitions
                    "\npragma ", "\nusing ",
                    // Split along contract definitions
                    "\ncontract ", "\ninterface ", "\nlibrary ",
                    // Split along method definitions
                    "\nconstructor ", "\ntype ", "\nfunction ", "\nevent ",
                    "\nmodifier ", "\nerror ", "\nstruct ", "\nenum ",
                    // Split along control flow statements
                    "\nif ", "\nfor ", "\nwhile ", "\ndo while ", "\nassembly ",
                    // Split by the normal type of lines
                    ...lines,
                ];
            default:
                // Split by the normal type of lines
                return [...lines];
        }
    }
}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import axios, { AxiosRequestConfig } from "axios";
|
|
2
|
+
import * as cheerio from "cheerio";
|
|
3
|
+
import { TextFetcher } from './types';
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
/**
 * Media types (the part of Content-Type before any `;` parameters) that the
 * fetcher accepts; responses with any other type are rejected.
 */
const ALLOWED_CONTENT_TYPES = [
    "text/html",
    "application/json",
    "application/xml",
    "application/javascript",
    "text/plain",
];
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
/**
 * Browser-like request headers sent with every fetch. The "Host" and
 * "Alt-Used" placeholders are overwritten per request with the target host.
 */
const DEFAULT_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "Alt-Used": "LEAVE-THIS-KEY-SET-BY-TOOL",
    "Connection": "keep-alive",
    "Host": "LEAVE-THIS-KEY-SET-BY-TOOL",
    "Referer": "https://www.google.com/",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "cross-site",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
};
|
|
29
|
+
|
|
30
|
+
/**
 * Settings for a `WebFetcher`.
 */
export interface WebFetcherConfig {
    /**
     * Extra request headers, merged over the fetcher's default headers.
     */
    headers?: Record<string, string>;

    /**
     * Additional axios request options, spread into each GET request.
     */
    requestConfig?: AxiosRequestConfig;

    /**
     * When `true`, "text/html" responses are converted to plain text
     * instead of being returned as raw markup.
     */
    htmlToText: boolean;

    /**
     * When `true`, text extraction is limited to elements inside `<body>`.
     */
    summarizeHtml: boolean;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Fetches a web resource over HTTP and returns its body as text, optionally
 * reducing HTML pages to their visible text with markdown-style links.
 */
export class WebFetcher implements TextFetcher {
    private readonly _config: WebFetcherConfig;

    /**
     * Creates a new fetcher.
     * @param config Optional settings; defaults to `htmlToText: true` and
     * `summarizeHtml: false`.
     */
    public constructor(config?: Partial<WebFetcherConfig>) {
        this._config = Object.assign({
            htmlToText: true,
            summarizeHtml: false,
        } as WebFetcherConfig, config);
    }

    /**
     * Downloads `uri` and returns its content as a string.
     * @param uri Absolute URL to fetch.
     * @returns Extracted text for HTML pages when `htmlToText` is enabled,
     * otherwise the response body.
     * @throws Error if the server answers with a status >= 400 or a content
     * type that is missing or not in the allowed list.
     */
    public async fetch(uri: string): Promise<string> {
        const { data, contentType } = await this.fetchPage(uri);
        if (contentType === "text/html" && this._config.htmlToText) {
            return this.extractText(data, uri, this._config.summarizeHtml);
        }

        return data;
    }

    /**
     * Flattens an HTML document into a single line of text.
     * @param html Raw markup.
     * @param baseUrl URL used to resolve relative link targets.
     * @param summarize When `true`, only descendants of `<body>` are visited.
     * @returns Space-separated text; anchors are rendered as `[text](href)`.
     */
    private extractText(html: string, baseUrl: string, summarize: boolean): string {
        // Parse all elements including <noscript> tags
        const $ = cheerio.load(html, { scriptingEnabled: true });

        // For a summary only look inside <body>; otherwise visit every element.
        const scope = summarize ? 'body ' : '*';
        let text = '';
        $(`${scope}:not(style):not(script):not(svg)`).each((_index, elem: any) => {
            const $el = $(elem);

            // Use only the element's own text; children are visited separately,
            // so including their text here would duplicate it.
            let content = $el.clone().children().remove().end().text().trim();

            // Print links in markdown format
            let href = $el.attr("href");
            if ($el.prop("tagName")?.toLowerCase() === "a" && href) {
                if (!href.startsWith("http")) {
                    // Try resolving the link against the page URL
                    try {
                        href = new URL(href, baseUrl).toString();
                    } catch {
                        // Not resolvable: deliberately leave the href as written
                    }
                }

                // Append the alt text of an image inside the link, if any
                const altText = $el.find("img[alt]").attr("alt")?.trim();
                if (altText) {
                    content += ` ${altText}`;
                }

                text += ` [${content}](${href})`;
            }
            // otherwise just print the content
            else if (content !== "") {
                text += ` ${content}`;
            }
        });

        // Collapse newlines into spaces
        return text.trim().replace(/\n+/g, ' ');
    }

    /**
     * Performs the HTTP GET and validates the response.
     * @param baseUrl Absolute URL to request.
     * @returns The body as a string and the normalized media type (lower
     * case, without parameters such as `charset`).
     * @throws Error on an HTTP status >= 400 or a disallowed/missing content type.
     */
    private async fetchPage(baseUrl: string): Promise<{data: string; contentType: string;}> {
        // Accept every status so failures can be reported with our own error.
        const httpClient = axios.create({
            validateStatus: () => true,
        });

        // Clone headers to avoid mutating the original
        const headers: Record<string, string> = Object.assign({}, DEFAULT_HEADERS, this._config.headers);

        // Use `host` rather than `hostname` so a non-default port is kept.
        const host = new URL(baseUrl).host;
        headers['Host'] = host;
        headers['Alt-Used'] = host;

        // Fetch page and check for errors
        const response = await httpClient.get(baseUrl, {
            headers,
            ...this._config.requestConfig,
        });
        if (response.status >= 400) {
            throw new Error(`Site returned an HTTP status of ${response.status}`);
        }

        // Check for valid content type. The header may be absent, and media
        // types are case-insensitive, so normalize before comparing.
        const rawContentType = response.headers['content-type'];
        const mediaType = typeof rawContentType === 'string'
            ? rawContentType.split(';')[0].trim().toLowerCase()
            : '';
        if (!mediaType || !ALLOWED_CONTENT_TYPES.includes(mediaType)) {
            throw new Error(`Site returned an invalid content type of ${rawContentType}`);
        }

        // axios may hand back an already-parsed object (e.g. for JSON bodies);
        // serialize it so the declared string contract holds.
        const data = typeof response.data === 'string' ? response.data : JSON.stringify(response.data);

        return { data, contentType: mediaType };
    }
}
|
package/src/index.ts
CHANGED
|
@@ -1,2 +1,10 @@
|
|
|
1
|
+
export * from './GPT3Tokenizer';
|
|
1
2
|
export * from './ItemSelector';
|
|
2
3
|
export * from './LocalIndex';
|
|
4
|
+
export * from './LocalDocument';
|
|
5
|
+
export * from './LocalDocumentIndex';
|
|
6
|
+
export * from './LocalDocumentResult';
|
|
7
|
+
export * from './OpenAIEmbeddings';
|
|
8
|
+
export * from './TextSplitter';
|
|
9
|
+
export * from './types';
|
|
10
|
+
export * from './WebFetcher';
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
const colorizer = require('json-colorizer');
|
|
2
|
+
|
|
3
|
+
/**
 * Helpers that wrap console text in ANSI escape sequences.
 * @private
 */
export class Colorize {
    /**
     * Prefixes `text` with the escape codes for "cursor up one line" and
     * "erase line", so printing it overwrites the previous line.
     * @param text Replacement text.
     * @returns The prefixed text.
     */
    public static replaceLine(text: string): string {
        return '\x1b[A\x1b[2K' + text;
    }

    /**
     * Formats an error in bold red.
     * @param error Error object (its `message` is used) or a message string.
     * @returns The colorized message.
     */
    public static error(error: Error|string): string {
        const message = typeof error === 'string' ? error : error.message;
        return `\x1b[31;1m${message}\x1b[0m`;
    }

    /**
     * Formats a value for display.
     * @param output Value to format. Strings are green, non-null objects are
     * rendered as colorized JSON, everything else is blue.
     * @param quote Text placed on both sides of a string value.
     * @param units Suffix appended to numeric values only.
     * @returns The colorized text.
     */
    public static output(output: unknown, quote: string = '', units: string = ''): string {
        if (typeof output === 'string') {
            return `\x1b[32m${quote}${output}${quote}\x1b[0m`;
        }

        if (typeof output === 'object' && output !== null) {
            return colorizer(output, {
                pretty: true,
                colors: {
                    BRACE: 'white',
                    BRACKET: 'white',
                    COLON: 'white',
                    COMMA: 'white',
                    STRING_KEY: 'white',
                    STRING_LITERAL: 'green',
                    NUMBER_LITERAL: 'blue',
                    BOOLEAN_LITERAL: 'blue',
                    NULL_LITERAL: 'blue'
                }
            });
        }

        if (typeof output === 'number') {
            return `\x1b[34m${output}${units}\x1b[0m`;
        }

        // booleans, null, undefined, etc.
        return `\x1b[34m${output}\x1b[0m`;
    }

    /**
     * Formats a progress message; currently returned unchanged.
     * @param message Message text.
     * @returns `message` as is.
     */
    public static progress(message: string): string {
        return message;
    }

    /**
     * Formats a success message in bold green.
     * @param message Message text.
     * @returns The colorized message.
     */
    public static success(message: string): string {
        return `\x1b[32;1m${message}\x1b[0m`;
    }

    /**
     * Formats a title in bold magenta.
     * @param title Title text.
     * @returns The colorized title.
     */
    public static title(title: string): string {
        return `\x1b[35;1m${title}\x1b[0m`;
    }

    /**
     * Formats a `field: value` pair; string values are wrapped in double quotes.
     * @param field Field label (not colorized).
     * @param value Value to format via `output()`.
     * @param units Suffix appended to numeric values.
     * @returns The formatted pair.
     */
    public static value(field: string, value: unknown, units: string = ''): string {
        return `${field}: ${Colorize.output(value, '"', units)}`;
    }

    /**
     * Formats a warning in yellow.
     * @param warning Warning text.
     * @returns The colorized warning.
     */
    public static warning(warning: string): string {
        return `\x1b[33m${warning}\x1b[0m`;
    }
}
|