vectra 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/LocalDocumentIndex.d.ts +5 -2
- package/lib/LocalDocumentIndex.d.ts.map +1 -1
- package/lib/LocalDocumentIndex.js +20 -12
- package/lib/LocalDocumentIndex.js.map +1 -1
- package/lib/OpenAIEmbeddings.d.ts +1 -0
- package/lib/OpenAIEmbeddings.d.ts.map +1 -1
- package/lib/OpenAIEmbeddings.js +3 -1
- package/lib/OpenAIEmbeddings.js.map +1 -1
- package/lib/TextSplitter.d.ts +2 -0
- package/lib/TextSplitter.d.ts.map +1 -1
- package/lib/TextSplitter.js +101 -49
- package/lib/TextSplitter.js.map +1 -1
- package/lib/WebFetcher.d.ts +6 -4
- package/lib/WebFetcher.d.ts.map +1 -1
- package/lib/WebFetcher.js +132 -52
- package/lib/WebFetcher.js.map +1 -1
- package/lib/types.d.ts +8 -1
- package/lib/types.d.ts.map +1 -1
- package/lib/vectra-cli.js +8 -8
- package/lib/vectra-cli.js.map +1 -1
- package/package.json +3 -1
- package/src/LocalDocumentIndex.ts +20 -13
- package/src/OpenAIEmbeddings.ts +4 -1
- package/src/TextSplitter.ts +104 -49
- package/src/WebFetcher.ts +159 -58
- package/src/types.ts +6 -1
- package/src/vectra-cli.ts +8 -8
package/src/TextSplitter.ts
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import { GPT3Tokenizer } from "./GPT3Tokenizer";
|
|
2
2
|
import { TextChunk, Tokenizer } from "./types";
|
|
3
3
|
|
|
4
|
+
const ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
|
|
5
|
+
|
|
4
6
|
export interface TextSplitterConfig {
|
|
5
7
|
separators: string[];
|
|
6
8
|
keepSeparators: boolean;
|
|
@@ -15,7 +17,6 @@ export class TextSplitter {
|
|
|
15
17
|
|
|
16
18
|
public constructor(config?: Partial<TextSplitterConfig>) {
|
|
17
19
|
this._config = Object.assign({
|
|
18
|
-
separators: ["\n\n", "\n", " ", ""],
|
|
19
20
|
keepSeparators: false,
|
|
20
21
|
chunkSize: 400,
|
|
21
22
|
chunkOverlap: 40,
|
|
@@ -71,10 +72,22 @@ export class TextSplitter {
|
|
|
71
72
|
|
|
72
73
|
private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
|
|
73
74
|
const chunks: TextChunk[] = [];
|
|
74
|
-
if (text.length > 0
|
|
75
|
-
|
|
75
|
+
if (text.length > 0) {
|
|
76
|
+
// Split text into parts
|
|
77
|
+
let parts: string[];
|
|
78
|
+
let separator = '';
|
|
76
79
|
const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
|
|
77
|
-
|
|
80
|
+
if (separators.length > 0) {
|
|
81
|
+
// Split by separator
|
|
82
|
+
separator = separators[0];
|
|
83
|
+
parts = text.split(separator);
|
|
84
|
+
} else {
|
|
85
|
+
// Cut text in half
|
|
86
|
+
const half = Math.floor(text.length / 2);
|
|
87
|
+
parts = [text.substring(0, half), text.substring(half)];
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
// Iterate over parts
|
|
78
91
|
for (let i = 0; i < parts.length; i++) {
|
|
79
92
|
const lastChunk = (i === parts.length - 1);
|
|
80
93
|
|
|
@@ -85,30 +98,82 @@ export class TextSplitter {
|
|
|
85
98
|
chunk += separator;
|
|
86
99
|
}
|
|
87
100
|
|
|
88
|
-
//
|
|
89
|
-
|
|
90
|
-
|
|
101
|
+
// Ensure chunk contains text
|
|
102
|
+
if (!this.containsAlphanumeric(chunk)) {
|
|
103
|
+
continue;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
// Optimization to avoid encoding really large chunks
|
|
107
|
+
if (chunk.length / 6 > this._config.chunkSize) {
|
|
91
108
|
// Break the text into smaller chunks
|
|
92
109
|
const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
|
|
93
110
|
chunks.push(...subChunks);
|
|
94
111
|
} else {
|
|
95
|
-
//
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
112
|
+
// Encode chunk text
|
|
113
|
+
const tokens = this._config.tokenizer.encode(chunk);
|
|
114
|
+
if (tokens.length > this._config.chunkSize) {
|
|
115
|
+
// Break the text into smaller chunks
|
|
116
|
+
const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
|
|
117
|
+
chunks.push(...subChunks);
|
|
118
|
+
} else {
|
|
119
|
+
// Append chunk to output
|
|
120
|
+
chunks.push({
|
|
121
|
+
text: chunk,
|
|
122
|
+
tokens: tokens,
|
|
123
|
+
startPos: startPos,
|
|
124
|
+
endPos: endPos,
|
|
125
|
+
startOverlap: [],
|
|
126
|
+
endOverlap: [],
|
|
127
|
+
});
|
|
128
|
+
}
|
|
129
|
+
|
|
104
130
|
}
|
|
105
131
|
|
|
132
|
+
|
|
106
133
|
// Update startPos
|
|
107
134
|
startPos = endPos + 1;
|
|
108
135
|
}
|
|
109
136
|
}
|
|
110
137
|
|
|
111
|
-
return chunks;
|
|
138
|
+
return this.combineChunks(chunks);
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/**
 * Greedily merges neighbouring chunks into larger ones.
 *
 * Chunks are visited in order. A chunk is appended to the chunk currently
 * being built while their combined token count does not exceed
 * `chunkSize`; otherwise the current chunk is emitted and the visited
 * chunk starts a new one.
 *
 * The first chunk of every merged run is extended in place: its `text`,
 * `tokens` and `endPos` are updated to cover the fragments appended to it.
 *
 * @param chunks Chunks to merge, in document order.
 * @returns The merged chunks, in the same order.
 */
private combineChunks(chunks: TextChunk[]): TextChunk[] {
    const combinedChunks: TextChunk[] = [];

    // When separators are not kept, merged fragments are joined with a single space.
    // NOTE(review): the joining space is added to `text` only; no token is added for it.
    const separator = this._config.keepSeparators ? '' : ' ';

    let currentChunk: TextChunk | undefined;
    for (const chunk of chunks) {
        if (currentChunk === undefined) {
            currentChunk = chunk;
            continue;
        }

        const length = currentChunk.tokens.length + chunk.tokens.length;
        if (length > this._config.chunkSize) {
            // Current chunk is full: emit it and start over with this one
            combinedChunks.push(currentChunk);
            currentChunk = chunk;
        } else {
            currentChunk.text += separator + chunk.text;
            currentChunk.tokens.push(...chunk.tokens);
            // Keep the reported span in sync with the merged text
            currentChunk.endPos = chunk.endPos;
        }
    }

    // Flush the chunk still being built
    if (currentChunk !== undefined) {
        combinedChunks.push(currentChunk);
    }

    return combinedChunks;
}
|
|
169
|
+
|
|
170
|
+
/**
 * Reports whether `text` contains at least one letter or digit.
 *
 * ASCII letters and digits are matched against `ALPHANUMERIC_CHARS` first.
 * Any other character is then tested against the Unicode letter (`\p{L}`)
 * and number (`\p{N}`) categories, so text written in non-Latin scripts
 * also counts as text.
 *
 * @param text Text to inspect.
 * @returns `true` if a letter or digit was found, otherwise `false`.
 */
private containsAlphanumeric(text: string): boolean {
    const unicodeAlphanumeric = /[\p{L}\p{N}]/u;
    for (const ch of text) {
        // Cheap ASCII lookup first, Unicode category test as the fallback
        if (ALPHANUMERIC_CHARS.includes(ch) || unicodeAlphanumeric.test(ch)) {
            return true;
        }
    }
    return false;
}
|
|
113
178
|
|
|
114
179
|
private getSeparators(docType?: string): string[] {
|
|
@@ -131,8 +196,7 @@ export class TextSplitter {
|
|
|
131
196
|
// Split by the normal type of lines
|
|
132
197
|
"\n\n",
|
|
133
198
|
"\n",
|
|
134
|
-
" "
|
|
135
|
-
"",
|
|
199
|
+
" "
|
|
136
200
|
];
|
|
137
201
|
case "go":
|
|
138
202
|
return [
|
|
@@ -149,8 +213,7 @@ export class TextSplitter {
|
|
|
149
213
|
// Split by the normal type of lines
|
|
150
214
|
"\n\n",
|
|
151
215
|
"\n",
|
|
152
|
-
" "
|
|
153
|
-
"",
|
|
216
|
+
" "
|
|
154
217
|
];
|
|
155
218
|
case "java":
|
|
156
219
|
case "c#":
|
|
@@ -176,8 +239,7 @@ export class TextSplitter {
|
|
|
176
239
|
// Split by the normal type of lines
|
|
177
240
|
"\n\n",
|
|
178
241
|
"\n",
|
|
179
|
-
" "
|
|
180
|
-
"",
|
|
242
|
+
" "
|
|
181
243
|
];
|
|
182
244
|
case "js":
|
|
183
245
|
case "jsx":
|
|
@@ -201,8 +263,7 @@ export class TextSplitter {
|
|
|
201
263
|
// Split by the normal type of lines
|
|
202
264
|
"\n\n",
|
|
203
265
|
"\n",
|
|
204
|
-
" "
|
|
205
|
-
"",
|
|
266
|
+
" "
|
|
206
267
|
];
|
|
207
268
|
case "php":
|
|
208
269
|
return [
|
|
@@ -220,8 +281,7 @@ export class TextSplitter {
|
|
|
220
281
|
// Split by the normal type of lines
|
|
221
282
|
"\n\n",
|
|
222
283
|
"\n",
|
|
223
|
-
" "
|
|
224
|
-
"",
|
|
284
|
+
" "
|
|
225
285
|
];
|
|
226
286
|
case "proto":
|
|
227
287
|
return [
|
|
@@ -240,8 +300,7 @@ export class TextSplitter {
|
|
|
240
300
|
// Split by the normal type of lines
|
|
241
301
|
"\n\n",
|
|
242
302
|
"\n",
|
|
243
|
-
" "
|
|
244
|
-
"",
|
|
303
|
+
" "
|
|
245
304
|
];
|
|
246
305
|
case "python":
|
|
247
306
|
case "py":
|
|
@@ -253,8 +312,7 @@ export class TextSplitter {
|
|
|
253
312
|
// Now split by the normal type of lines
|
|
254
313
|
"\n\n",
|
|
255
314
|
"\n",
|
|
256
|
-
" "
|
|
257
|
-
"",
|
|
315
|
+
" "
|
|
258
316
|
];
|
|
259
317
|
case "rst":
|
|
260
318
|
return [
|
|
@@ -267,8 +325,7 @@ export class TextSplitter {
|
|
|
267
325
|
// Split by the normal type of lines
|
|
268
326
|
"\n\n",
|
|
269
327
|
"\n",
|
|
270
|
-
" "
|
|
271
|
-
"",
|
|
328
|
+
" "
|
|
272
329
|
];
|
|
273
330
|
case "ruby":
|
|
274
331
|
return [
|
|
@@ -286,8 +343,7 @@ export class TextSplitter {
|
|
|
286
343
|
// Split by the normal type of lines
|
|
287
344
|
"\n\n",
|
|
288
345
|
"\n",
|
|
289
|
-
" "
|
|
290
|
-
"",
|
|
346
|
+
" "
|
|
291
347
|
];
|
|
292
348
|
case "rust":
|
|
293
349
|
return [
|
|
@@ -305,8 +361,7 @@ export class TextSplitter {
|
|
|
305
361
|
// Split by the normal type of lines
|
|
306
362
|
"\n\n",
|
|
307
363
|
"\n",
|
|
308
|
-
" "
|
|
309
|
-
"",
|
|
364
|
+
" "
|
|
310
365
|
];
|
|
311
366
|
case "scala":
|
|
312
367
|
return [
|
|
@@ -326,8 +381,7 @@ export class TextSplitter {
|
|
|
326
381
|
// Split by the normal type of lines
|
|
327
382
|
"\n\n",
|
|
328
383
|
"\n",
|
|
329
|
-
" "
|
|
330
|
-
"",
|
|
384
|
+
" "
|
|
331
385
|
];
|
|
332
386
|
case "swift":
|
|
333
387
|
return [
|
|
@@ -347,9 +401,9 @@ export class TextSplitter {
|
|
|
347
401
|
// Split by the normal type of lines
|
|
348
402
|
"\n\n",
|
|
349
403
|
"\n",
|
|
350
|
-
" "
|
|
351
|
-
"",
|
|
404
|
+
" "
|
|
352
405
|
];
|
|
406
|
+
case "md":
|
|
353
407
|
case "markdown":
|
|
354
408
|
return [
|
|
355
409
|
// First, try to split along Markdown headings (starting with level 2)
|
|
@@ -369,10 +423,14 @@ export class TextSplitter {
|
|
|
369
423
|
"\n\n___\n\n",
|
|
370
424
|
// Note that this splitter doesn't handle horizontal lines defined
|
|
371
425
|
// by *three or more* of ***, ---, or ___, but this is not handled
|
|
426
|
+
// Github tables
|
|
427
|
+
"<table>",
|
|
428
|
+
// "<tr>",
|
|
429
|
+
// "<td>",
|
|
430
|
+
// "<td ",
|
|
372
431
|
"\n\n",
|
|
373
432
|
"\n",
|
|
374
|
-
" "
|
|
375
|
-
"",
|
|
433
|
+
" "
|
|
376
434
|
];
|
|
377
435
|
case "latex":
|
|
378
436
|
return [
|
|
@@ -400,8 +458,7 @@ export class TextSplitter {
|
|
|
400
458
|
// Now split by the normal type of lines
|
|
401
459
|
"\n\n",
|
|
402
460
|
"\n",
|
|
403
|
-
" "
|
|
404
|
-
"",
|
|
461
|
+
" "
|
|
405
462
|
];
|
|
406
463
|
case "html":
|
|
407
464
|
return [
|
|
@@ -434,8 +491,7 @@ export class TextSplitter {
|
|
|
434
491
|
"<meta>",
|
|
435
492
|
"<title>",
|
|
436
493
|
// Normal type of lines
|
|
437
|
-
" "
|
|
438
|
-
"",
|
|
494
|
+
" "
|
|
439
495
|
];
|
|
440
496
|
case "sol":
|
|
441
497
|
return [
|
|
@@ -464,8 +520,7 @@ export class TextSplitter {
|
|
|
464
520
|
// Split by the normal type of lines
|
|
465
521
|
"\n\n",
|
|
466
522
|
"\n",
|
|
467
|
-
" "
|
|
468
|
-
"",
|
|
523
|
+
" "
|
|
469
524
|
];
|
|
470
525
|
default:
|
|
471
526
|
return [
|
package/src/WebFetcher.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import axios, { AxiosRequestConfig } from "axios";
|
|
2
|
-
import * as cheerio from "cheerio";
|
|
3
2
|
import { TextFetcher } from './types';
|
|
3
|
+
import * as cheerio from 'cheerio';
|
|
4
|
+
import TurndownService from 'turndown';
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
const ALLOWED_CONTENT_TYPES = [
|
|
@@ -30,7 +31,7 @@ const DEFAULT_HEADERS = {
|
|
|
30
31
|
export interface WebFetcherConfig {
|
|
31
32
|
headers?: Record<string,string>;
|
|
32
33
|
requestConfig?: AxiosRequestConfig;
|
|
33
|
-
|
|
34
|
+
htmlToMarkdown: boolean;
|
|
34
35
|
summarizeHtml: boolean;
|
|
35
36
|
}
|
|
36
37
|
|
|
@@ -39,62 +40,12 @@ export class WebFetcher implements TextFetcher {
|
|
|
39
40
|
|
|
40
41
|
public constructor(config?: Partial<WebFetcherConfig>) {
|
|
41
42
|
this._config = Object.assign({
|
|
42
|
-
|
|
43
|
+
htmlToMarkdown: true,
|
|
43
44
|
summarizeHtml: false,
|
|
44
45
|
} as WebFetcherConfig, config);
|
|
45
46
|
}
|
|
46
47
|
|
|
47
|
-
public async fetch(uri: string): Promise<string> {
|
|
48
|
-
const {data, contentType} = await this.fetchPage(uri);
|
|
49
|
-
if (contentType === "text/html" && this._config.htmlToText) {
|
|
50
|
-
return this.extractText(data, uri, this._config.summarizeHtml);
|
|
51
|
-
} else {
|
|
52
|
-
return data;
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
private extractText(html: string, baseUrl: string, summarize: boolean): string {
|
|
57
|
-
// Parse all elements including <noscript> tags
|
|
58
|
-
const $ = cheerio.load(html, { scriptingEnabled: true });
|
|
59
|
-
|
|
60
|
-
// If we want a summary, just get use the <body/>
|
|
61
|
-
let text = '';
|
|
62
|
-
$(`${summarize ? 'body ' : '*'}:not(style):not(script):not(svg)`).each((i, elem: any) => {
|
|
63
|
-
// Remove any children to avoid duplicate text
|
|
64
|
-
let content = $(elem).clone().children().remove().end().text().trim();
|
|
65
|
-
const $el = $(elem);
|
|
66
|
-
|
|
67
|
-
// Print links in markdown format
|
|
68
|
-
let href = $el.attr("href");
|
|
69
|
-
if ($el.prop("tagName")?.toLowerCase() === "a" && href) {
|
|
70
|
-
if (!href.startsWith("http")) {
|
|
71
|
-
// Try converting to a relevant link
|
|
72
|
-
try {
|
|
73
|
-
href = new URL(href, baseUrl).toString();
|
|
74
|
-
} catch {
|
|
75
|
-
// Leave as is
|
|
76
|
-
}
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
// If the link has content, use that as the text
|
|
80
|
-
const altText = $el.find("img[alt]").attr("alt")?.trim();
|
|
81
|
-
if (altText) {
|
|
82
|
-
content += ` ${altText}`;
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
text += ` [${content}](${href})`;
|
|
86
|
-
}
|
|
87
|
-
// otherwise just print the content
|
|
88
|
-
else if (content !== "") {
|
|
89
|
-
text += ` ${content}`;
|
|
90
|
-
}
|
|
91
|
-
});
|
|
92
|
-
|
|
93
|
-
// Remove newlines
|
|
94
|
-
return text.trim().replace(/\n+/g, ' ');
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
private async fetchPage(baseUrl: string): Promise<{data: string; contentType: string;}> {
|
|
48
|
+
public async fetch(uri: string): Promise<{ text: string; docType: string|undefined; }> {
|
|
98
49
|
const httpClient = axios.create({
|
|
99
50
|
validateStatus: () => true,
|
|
100
51
|
});
|
|
@@ -103,12 +54,12 @@ export class WebFetcher implements TextFetcher {
|
|
|
103
54
|
const headers = Object.assign({}, DEFAULT_HEADERS, this._config.headers)
|
|
104
55
|
|
|
105
56
|
// get hostname from url
|
|
106
|
-
const host = new URL(
|
|
57
|
+
const host = new URL(uri).hostname;
|
|
107
58
|
headers['Host'] = host;
|
|
108
59
|
headers['Alt-Used'] = host;
|
|
109
60
|
|
|
110
61
|
// Fetch page and check for errors
|
|
111
|
-
const response = await httpClient.get(
|
|
62
|
+
const response = await httpClient.get(uri, {
|
|
112
63
|
headers,
|
|
113
64
|
...this._config.requestConfig,
|
|
114
65
|
});
|
|
@@ -123,6 +74,156 @@ export class WebFetcher implements TextFetcher {
|
|
|
123
74
|
throw new Error(`Site returned an invalid content type of ${contentType}`);
|
|
124
75
|
}
|
|
125
76
|
|
|
126
|
-
|
|
77
|
+
// Convert content type to doc type
|
|
78
|
+
const docType = contentTypeArray[0] != 'text/plain' ? contentTypeArray[0].split('/')[1] : undefined;
|
|
79
|
+
if (docType == 'html' && this._config.htmlToMarkdown) {
|
|
80
|
+
const text = this.htmlToMarkdown(response.data, uri);
|
|
81
|
+
return {text, docType: 'md'};
|
|
82
|
+
} else {
|
|
83
|
+
const text = response.data;
|
|
84
|
+
return {text, docType};
|
|
85
|
+
}
|
|
127
86
|
}
|
|
128
|
-
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
/**
 * Converts an HTML document to markdown.
 *
 * `<script>` elements are removed, every `<a href>` that does not start
 * with "http" is resolved against `baseUrl`, and the inner HTML of
 * `<body>` is converted with Turndown (with the table rules from
 * `convertTables` registered).
 *
 * If the markdown starts with a run of more than 64 characters before its
 * first space or newline, that leading run is dropped.
 *
 * @param html HTML source of the page.
 * @param baseUrl URL used to resolve relative links.
 * @returns The markdown text.
 */
private htmlToMarkdown(html: string, baseUrl: string): string {
    // Parse HTML
    const $ = cheerio.load(html, { scriptingEnabled: true });

    // Remove scripts and convert relative links to absolute
    $('script').remove();
    $('a').each((_index, elem) => {
        const $el = $(elem);
        const href = $el.attr("href");
        if (href && !href.startsWith("http")) {
            // Try converting to an absolute link
            try {
                $el.attr("href", new URL(href, baseUrl).toString());
            } catch {
                // Leave as is
            }
        }
    });

    // Convert to markdown
    const body = $('body').html() ?? '';
    const turndownService = new TurndownService({
        hr: '\n\n---\n\n',
    });
    convertTables(turndownService);
    const md = turndownService.turndown(body);

    // Remove any overly long header text. search() yields the first space or
    // newline, or -1 only when neither exists, so a missing newline no longer
    // masks a space that is present (and vice versa).
    const contentStart = md.search(/[\n ]/);
    return contentStart > 64 ? md.slice(contentStart) : md;
}
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
/**
 * Registers Turndown rules that render HTML tables as pipe-delimited
 * markdown tables.
 *
 * Four rules are added:
 * - `tableCell`: formats each `<th>`/`<td>` through `cell()`.
 * - `tableRow`: puts each `<tr>` on its own line and, for rows that
 *   `isHeadingRow()` accepts, appends a delimiter row honouring each
 *   cell's `align` attribute.
 * - `table`: removes blank lines inside the table and pads it with blank
 *   lines on both sides.
 * - `tableSection`: passes `<thead>`/`<tbody>`/`<tfoot>` content through.
 *
 * @param turndownService Service instance the rules are registered on.
 */
function convertTables(turndownService: TurndownService): void {
    // Delimiter cells for the supported `align` values. A Map is used so
    // that unexpected attribute values can never resolve to inherited
    // object properties.
    const alignBorders = new Map<string, string>([
        ['left', ':--'],
        ['right', '--:'],
        ['center', ':-:'],
    ]);

    turndownService.addRule('tableCell', {
        filter: ['th', 'td'],
        replacement: function (content, node) {
            return cell(content, node);
        }
    });

    turndownService.addRule('tableRow', {
        filter: 'tr',
        replacement: function (content, node) {
            let borderCells = '';

            if (isHeadingRow(node)) {
                // Build the delimiter row, one cell per child of the heading row
                for (let i = 0; i < node.childNodes.length; i++) {
                    const align: string = (
                        node.childNodes[i].getAttribute('align') || ''
                    ).toLowerCase();
                    const border = alignBorders.get(align) ?? '---';

                    borderCells += cell(border, node.childNodes[i]);
                }
            }
            return '\n' + content + (borderCells ? '\n' + borderCells : '');
        }
    });

    turndownService.addRule('table', {
        filter: ['table'],
        replacement: function (content) {
            // Ensure there are no blank lines. A regex with the global flag
            // collapses every run of newlines; a string pattern would only
            // replace the first occurrence.
            content = content.replace(/\n{2,}/g, '\n');
            return '\n\n' + content + '\n\n';
        }
    });

    turndownService.addRule('tableSection', {
        filter: ['thead', 'tbody', 'tfoot'],
        replacement: function (content) {
            return content;
        }
    });
}
|
|
172
|
+
|
|
173
|
+
const indexOf = Array.prototype.indexOf
|
|
174
|
+
const every = Array.prototype.every
|
|
175
|
+
|
|
176
|
+
/**
 * Decides whether a table row should be treated as the table's heading row.
 *
 * A row qualifies when its parent is a THEAD, or when all of the following
 * hold: it is the first child of its parent, the parent is either the TABLE
 * itself or the first TBODY (see `isFirstTbody`), and every child of the
 * row is a TH.
 *
 * @param tr The row node to examine.
 * @returns `true` when the row is a heading row.
 */
function isHeadingRow(tr: any) {
    const parent = tr.parentNode;

    // Rows inside a THEAD are always headings
    if (parent.nodeName === 'THEAD') {
        return true;
    }

    // Otherwise only the leading row of the table / first body can qualify
    if (parent.firstChild !== tr) {
        return false;
    }
    if (parent.nodeName !== 'TABLE' && !isFirstTbody(parent)) {
        return false;
    }

    // ...and only when it consists purely of header cells
    return every.call(tr.childNodes, (child) => child.nodeName === 'TH');
}
|
|
192
|
+
|
|
193
|
+
/**
 * Checks whether `element` is the first TBODY of its table.
 *
 * A TBODY counts as first when it has no previous sibling, or when its
 * previous sibling is a THEAD whose text content is empty or whitespace.
 *
 * @param element The node to examine.
 * @returns `true` when the node is the first TBODY.
 */
function isFirstTbody(element: any) {
    const prev = element.previousSibling;

    if (element.nodeName !== 'TBODY') {
        return false;
    }
    if (!prev) {
        return true;
    }

    // A preceding THEAD is tolerated only when it is blank
    return prev.nodeName === 'THEAD' && /^\s*$/i.test(prev.textContent);
}
|
|
205
|
+
|
|
206
|
+
/**
 * Formats one table cell as a fragment of a markdown table row.
 *
 * The first child of the row is prefixed with "| ", every other cell with
 * a single space; all cells are suffixed with " |". The result is passed
 * through `cleanContent`.
 *
 * @param content Markdown content of the cell.
 * @param node The cell node; its position among its parent's children
 *             determines the prefix.
 * @returns The formatted cell text.
 */
function cell(content: string, node: any): string {
    const position = indexOf.call(node.parentNode.childNodes, node);
    const lead = position === 0 ? '| ' : ' ';
    return cleanContent(lead + content + ' |');
}
|
|
214
|
+
|
|
215
|
+
// Runs of the whitespace characters that cleanContent() collapses:
// newline, carriage return, tab, form feed, vertical tab, no-break space,
// line separator, paragraph separator and the plain space.
const CELL_WHITESPACE_RUN = /[\n\r\t\f\v\u00a0\u2028\u2029 ]+/g;

/**
 * Collapses every run of whitespace in `content` into a single space.
 *
 * Only the characters listed in `CELL_WHITESPACE_RUN` are treated as
 * whitespace; any other character is copied through unchanged. Leading and
 * trailing runs are collapsed to one space as well, not removed.
 *
 * @param content Text to normalise.
 * @returns The text with whitespace runs collapsed.
 */
function cleanContent(content: string): string {
    return content.replace(CELL_WHITESPACE_RUN, ' ');
}
|
package/src/types.ts
CHANGED
|
@@ -4,6 +4,11 @@
|
|
|
4
4
|
* An AI model that can be used to create embeddings.
|
|
5
5
|
*/
|
|
6
6
|
export interface EmbeddingsModel {
|
|
7
|
+
/**
|
|
8
|
+
* Maximum number of tokens
|
|
9
|
+
*/
|
|
10
|
+
readonly maxTokens: number;
|
|
11
|
+
|
|
7
12
|
/**
|
|
8
13
|
* Creates embeddings for the given inputs.
|
|
9
14
|
* @param inputs Text inputs to create embeddings for.
|
|
@@ -51,7 +56,7 @@ export interface TextChunk {
|
|
|
51
56
|
}
|
|
52
57
|
|
|
53
58
|
export interface TextFetcher {
|
|
54
|
-
fetch(uri: string): Promise<string>;
|
|
59
|
+
fetch(uri: string): Promise<{ text: string; docType: string|undefined; }>;
|
|
55
60
|
}
|
|
56
61
|
|
|
57
62
|
export interface IndexStats {
|
package/src/vectra-cli.ts
CHANGED
|
@@ -22,7 +22,7 @@ export async function run() {
|
|
|
22
22
|
const index = new LocalDocumentIndex({ folderPath });
|
|
23
23
|
await index.deleteIndex();
|
|
24
24
|
})
|
|
25
|
-
.command('add
|
|
25
|
+
.command('add <index>', `adds one or more web pages to an index`, (yargs) => {
|
|
26
26
|
return yargs
|
|
27
27
|
.option('keys', {
|
|
28
28
|
alias: 'k',
|
|
@@ -81,9 +81,9 @@ export async function run() {
|
|
|
81
81
|
for (const uri of uris) {
|
|
82
82
|
try {
|
|
83
83
|
console.log(Colorize.progress(`fetching ${uri}`));
|
|
84
|
-
const
|
|
84
|
+
const { text, docType } = await fetcher.fetch(uri);
|
|
85
85
|
console.log(Colorize.replaceLine(Colorize.progress(`indexing ${uri}`)));
|
|
86
|
-
await index.upsertDocument(uri,
|
|
86
|
+
await index.upsertDocument(uri, text, docType);
|
|
87
87
|
console.log(Colorize.replaceLine(Colorize.success(`added ${uri}`)));
|
|
88
88
|
} catch (err: unknown) {
|
|
89
89
|
console.log(Colorize.replaceLine(Colorize.error(`Error adding: ${uri}\n${(err as Error).message}`)));
|
|
@@ -142,25 +142,25 @@ export async function run() {
|
|
|
142
142
|
.option('document-count', {
|
|
143
143
|
alias: 'dc',
|
|
144
144
|
describe: 'max number of documents to return (defaults to 10)',
|
|
145
|
-
type: '
|
|
145
|
+
type: 'number',
|
|
146
146
|
default: 10
|
|
147
147
|
})
|
|
148
148
|
.option('chunk-count', {
|
|
149
149
|
alias: 'cc',
|
|
150
150
|
describe: 'max number of chunks to return (defaults to 50)',
|
|
151
|
-
type: '
|
|
151
|
+
type: 'number',
|
|
152
152
|
default: 50
|
|
153
153
|
})
|
|
154
154
|
.option('section-count', {
|
|
155
155
|
alias: 'sc',
|
|
156
156
|
describe: 'max number of document sections to render (defaults to 1)',
|
|
157
|
-
type: '
|
|
157
|
+
type: 'number',
|
|
158
158
|
default: 1
|
|
159
159
|
})
|
|
160
160
|
.option('tokens', {
|
|
161
161
|
alias: 't',
|
|
162
162
|
describe: 'max number of tokens to render for each document section (defaults to 2000)',
|
|
163
|
-
type: '
|
|
163
|
+
type: 'number',
|
|
164
164
|
default: 2000
|
|
165
165
|
})
|
|
166
166
|
.option('format', {
|
|
@@ -200,7 +200,7 @@ export async function run() {
|
|
|
200
200
|
const sections = await result.renderSections(args.tokens, args.sectionCount);
|
|
201
201
|
for (let i = 0; i < sections.length; i++) {
|
|
202
202
|
const section = sections[i];
|
|
203
|
-
console.log(Colorize.title(args.sectionCount
|
|
203
|
+
console.log(Colorize.title(args.sectionCount == 1 ? 'Section' : `Section ${i + 1}`));
|
|
204
204
|
console.log(Colorize.value('score', section.score));
|
|
205
205
|
console.log(Colorize.value('tokens', section.tokenCount));
|
|
206
206
|
console.log(Colorize.output(section.text));
|