vectra 0.2.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/LocalDocumentIndex.d.ts +5 -2
- package/lib/LocalDocumentIndex.d.ts.map +1 -1
- package/lib/LocalDocumentIndex.js +14 -8
- package/lib/LocalDocumentIndex.js.map +1 -1
- package/lib/OpenAIEmbeddings.d.ts.map +1 -1
- package/lib/OpenAIEmbeddings.js +1 -0
- package/lib/OpenAIEmbeddings.js.map +1 -1
- package/lib/TextSplitter.d.ts +2 -0
- package/lib/TextSplitter.d.ts.map +1 -1
- package/lib/TextSplitter.js +99 -52
- package/lib/TextSplitter.js.map +1 -1
- package/lib/WebFetcher.d.ts +6 -4
- package/lib/WebFetcher.d.ts.map +1 -1
- package/lib/WebFetcher.js +132 -52
- package/lib/WebFetcher.js.map +1 -1
- package/lib/types.d.ts +4 -1
- package/lib/types.d.ts.map +1 -1
- package/lib/vectra-cli.js +7 -7
- package/lib/vectra-cli.js.map +1 -1
- package/package.json +3 -1
- package/src/FileFetcher.ts +31 -0
- package/src/LocalDocumentIndex.ts +14 -8
- package/src/LocalIndex.ts +17 -5
- package/src/OpenAIEmbeddings.ts +4 -2
- package/src/TextSplitter.ts +101 -52
- package/src/WebFetcher.ts +159 -59
- package/src/index.ts +1 -0
- package/src/types.ts +1 -1
- package/src/vectra-cli.ts +18 -13
package/src/TextSplitter.ts
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import { GPT3Tokenizer } from "./GPT3Tokenizer";
|
|
2
2
|
import { TextChunk, Tokenizer } from "./types";
|
|
3
3
|
|
|
4
|
+
const ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
|
|
5
|
+
|
|
4
6
|
export interface TextSplitterConfig {
|
|
5
7
|
separators: string[];
|
|
6
8
|
keepSeparators: boolean;
|
|
@@ -15,7 +17,6 @@ export class TextSplitter {
|
|
|
15
17
|
|
|
16
18
|
public constructor(config?: Partial<TextSplitterConfig>) {
|
|
17
19
|
this._config = Object.assign({
|
|
18
|
-
separators: ["\n\n", "\n", " ", ""],
|
|
19
20
|
keepSeparators: false,
|
|
20
21
|
chunkSize: 400,
|
|
21
22
|
chunkOverlap: 40,
|
|
@@ -71,10 +72,22 @@ export class TextSplitter {
|
|
|
71
72
|
|
|
72
73
|
private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
|
|
73
74
|
const chunks: TextChunk[] = [];
|
|
74
|
-
if (text.length > 0
|
|
75
|
-
|
|
75
|
+
if (text.length > 0) {
|
|
76
|
+
// Split text into parts
|
|
77
|
+
let parts: string[];
|
|
78
|
+
let separator = '';
|
|
76
79
|
const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
|
|
77
|
-
|
|
80
|
+
if (separators.length > 0) {
|
|
81
|
+
// Split by separator
|
|
82
|
+
separator = separators[0];
|
|
83
|
+
parts = text.split(separator);
|
|
84
|
+
} else {
|
|
85
|
+
// Cut text in half
|
|
86
|
+
const half = Math.floor(text.length / 2);
|
|
87
|
+
parts = [text.substring(0, half), text.substring(half)];
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
// Iterate over parts
|
|
78
91
|
for (let i = 0; i < parts.length; i++) {
|
|
79
92
|
const lastChunk = (i === parts.length - 1);
|
|
80
93
|
|
|
@@ -85,36 +98,82 @@ export class TextSplitter {
|
|
|
85
98
|
chunk += separator;
|
|
86
99
|
}
|
|
87
100
|
|
|
88
|
-
//
|
|
89
|
-
|
|
90
|
-
if (trimmed.length === 0 || trimmed == '\n') {
|
|
101
|
+
// Ensure chunk contains text
|
|
102
|
+
if (!this.containsAlphanumeric(chunk)) {
|
|
91
103
|
continue;
|
|
92
104
|
}
|
|
93
105
|
|
|
94
|
-
//
|
|
95
|
-
|
|
96
|
-
if (tokens.length > this._config.chunkSize) {
|
|
106
|
+
// Optimization to avoid encoding really large chunks
|
|
107
|
+
if (chunk.length / 6 > this._config.chunkSize) {
|
|
97
108
|
// Break the text into smaller chunks
|
|
98
109
|
const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
|
|
99
110
|
chunks.push(...subChunks);
|
|
100
111
|
} else {
|
|
101
|
-
//
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
112
|
+
// Encode chunk text
|
|
113
|
+
const tokens = this._config.tokenizer.encode(chunk);
|
|
114
|
+
if (tokens.length > this._config.chunkSize) {
|
|
115
|
+
// Break the text into smaller chunks
|
|
116
|
+
const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
|
|
117
|
+
chunks.push(...subChunks);
|
|
118
|
+
} else {
|
|
119
|
+
// Append chunk to output
|
|
120
|
+
chunks.push({
|
|
121
|
+
text: chunk,
|
|
122
|
+
tokens: tokens,
|
|
123
|
+
startPos: startPos,
|
|
124
|
+
endPos: endPos,
|
|
125
|
+
startOverlap: [],
|
|
126
|
+
endOverlap: [],
|
|
127
|
+
});
|
|
128
|
+
}
|
|
129
|
+
|
|
110
130
|
}
|
|
111
131
|
|
|
132
|
+
|
|
112
133
|
// Update startPos
|
|
113
134
|
startPos = endPos + 1;
|
|
114
135
|
}
|
|
115
136
|
}
|
|
116
137
|
|
|
117
|
-
return chunks;
|
|
138
|
+
return this.combineChunks(chunks);
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/**
 * Greedily merges neighbouring chunks into larger ones without exceeding the
 * configured `chunkSize` (measured in tokens).
 *
 * Chunks are visited in the order given. A chunk is folded into the one
 * currently being built while their combined token count stays within
 * `chunkSize`; otherwise the current chunk is emitted and the visited chunk
 * starts a new group.
 *
 * Notes:
 * - The first chunk of each group is extended in place (`text`, `tokens` and
 *   `endPos`), so elements of `chunks` may be modified.
 * - The joining separator is added to `text` only; `tokens` is the plain
 *   concatenation of the parts' tokens.
 *
 * @param chunks Chunks to merge.
 * @returns The merged chunks, in the same relative order.
 */
private combineChunks(chunks: TextChunk[]): TextChunk[] {
    const combinedChunks: TextChunk[] = [];
    // Parts are joined directly when separators are kept, otherwise with a single space.
    const separator = this._config.keepSeparators ? '' : ' ';
    let currentChunk: TextChunk | undefined;
    for (const chunk of chunks) {
        if (currentChunk && currentChunk.tokens.length + chunk.tokens.length <= this._config.chunkSize) {
            // Still fits: fold this chunk into the one being built.
            currentChunk.text += separator + chunk.text;
            currentChunk.tokens.push(...chunk.tokens);
            // Keep the reported span in sync with the merged text.
            currentChunk.endPos = chunk.endPos;
        } else {
            // Either nothing is being built yet or the budget would be exceeded.
            if (currentChunk) {
                combinedChunks.push(currentChunk);
            }
            currentChunk = chunk;
        }
    }
    if (currentChunk) {
        combinedChunks.push(currentChunk);
    }
    return combinedChunks;
}
|
|
169
|
+
|
|
170
|
+
/**
 * Reports whether `text` contains at least one letter or digit.
 *
 * ASCII letters and digits are checked first against `ALPHANUMERIC_CHARS`.
 * If none is found, the text is checked for letters or digits of any other
 * script (accented Latin, Cyrillic, CJK, ...), so that text written
 * entirely in non-ASCII characters is not mistaken for empty content.
 *
 * @param text Text to inspect.
 * @returns `true` if a letter or digit is present, otherwise `false`.
 */
private containsAlphanumeric(text: string): boolean {
    // Fast path: plain ASCII letters and digits.
    for (let i = 0; i < text.length; i++) {
        if (ALPHANUMERIC_CHARS.includes(text[i])) {
            return true;
        }
    }

    // Fallback: any Unicode letter (\p{L}) or number (\p{N}). The pattern is
    // built from a string so it does not depend on the compile target.
    return new RegExp('[\\p{L}\\p{N}]', 'u').test(text);
}
|
|
119
178
|
|
|
120
179
|
private getSeparators(docType?: string): string[] {
|
|
@@ -137,8 +196,7 @@ export class TextSplitter {
|
|
|
137
196
|
// Split by the normal type of lines
|
|
138
197
|
"\n\n",
|
|
139
198
|
"\n",
|
|
140
|
-
" "
|
|
141
|
-
"",
|
|
199
|
+
" "
|
|
142
200
|
];
|
|
143
201
|
case "go":
|
|
144
202
|
return [
|
|
@@ -155,8 +213,7 @@ export class TextSplitter {
|
|
|
155
213
|
// Split by the normal type of lines
|
|
156
214
|
"\n\n",
|
|
157
215
|
"\n",
|
|
158
|
-
" "
|
|
159
|
-
"",
|
|
216
|
+
" "
|
|
160
217
|
];
|
|
161
218
|
case "java":
|
|
162
219
|
case "c#":
|
|
@@ -182,8 +239,7 @@ export class TextSplitter {
|
|
|
182
239
|
// Split by the normal type of lines
|
|
183
240
|
"\n\n",
|
|
184
241
|
"\n",
|
|
185
|
-
" "
|
|
186
|
-
"",
|
|
242
|
+
" "
|
|
187
243
|
];
|
|
188
244
|
case "js":
|
|
189
245
|
case "jsx":
|
|
@@ -207,8 +263,7 @@ export class TextSplitter {
|
|
|
207
263
|
// Split by the normal type of lines
|
|
208
264
|
"\n\n",
|
|
209
265
|
"\n",
|
|
210
|
-
" "
|
|
211
|
-
"",
|
|
266
|
+
" "
|
|
212
267
|
];
|
|
213
268
|
case "php":
|
|
214
269
|
return [
|
|
@@ -226,8 +281,7 @@ export class TextSplitter {
|
|
|
226
281
|
// Split by the normal type of lines
|
|
227
282
|
"\n\n",
|
|
228
283
|
"\n",
|
|
229
|
-
" "
|
|
230
|
-
"",
|
|
284
|
+
" "
|
|
231
285
|
];
|
|
232
286
|
case "proto":
|
|
233
287
|
return [
|
|
@@ -246,8 +300,7 @@ export class TextSplitter {
|
|
|
246
300
|
// Split by the normal type of lines
|
|
247
301
|
"\n\n",
|
|
248
302
|
"\n",
|
|
249
|
-
" "
|
|
250
|
-
"",
|
|
303
|
+
" "
|
|
251
304
|
];
|
|
252
305
|
case "python":
|
|
253
306
|
case "py":
|
|
@@ -259,8 +312,7 @@ export class TextSplitter {
|
|
|
259
312
|
// Now split by the normal type of lines
|
|
260
313
|
"\n\n",
|
|
261
314
|
"\n",
|
|
262
|
-
" "
|
|
263
|
-
"",
|
|
315
|
+
" "
|
|
264
316
|
];
|
|
265
317
|
case "rst":
|
|
266
318
|
return [
|
|
@@ -273,8 +325,7 @@ export class TextSplitter {
|
|
|
273
325
|
// Split by the normal type of lines
|
|
274
326
|
"\n\n",
|
|
275
327
|
"\n",
|
|
276
|
-
" "
|
|
277
|
-
"",
|
|
328
|
+
" "
|
|
278
329
|
];
|
|
279
330
|
case "ruby":
|
|
280
331
|
return [
|
|
@@ -292,8 +343,7 @@ export class TextSplitter {
|
|
|
292
343
|
// Split by the normal type of lines
|
|
293
344
|
"\n\n",
|
|
294
345
|
"\n",
|
|
295
|
-
" "
|
|
296
|
-
"",
|
|
346
|
+
" "
|
|
297
347
|
];
|
|
298
348
|
case "rust":
|
|
299
349
|
return [
|
|
@@ -311,8 +361,7 @@ export class TextSplitter {
|
|
|
311
361
|
// Split by the normal type of lines
|
|
312
362
|
"\n\n",
|
|
313
363
|
"\n",
|
|
314
|
-
" "
|
|
315
|
-
"",
|
|
364
|
+
" "
|
|
316
365
|
];
|
|
317
366
|
case "scala":
|
|
318
367
|
return [
|
|
@@ -332,8 +381,7 @@ export class TextSplitter {
|
|
|
332
381
|
// Split by the normal type of lines
|
|
333
382
|
"\n\n",
|
|
334
383
|
"\n",
|
|
335
|
-
" "
|
|
336
|
-
"",
|
|
384
|
+
" "
|
|
337
385
|
];
|
|
338
386
|
case "swift":
|
|
339
387
|
return [
|
|
@@ -353,9 +401,9 @@ export class TextSplitter {
|
|
|
353
401
|
// Split by the normal type of lines
|
|
354
402
|
"\n\n",
|
|
355
403
|
"\n",
|
|
356
|
-
" "
|
|
357
|
-
"",
|
|
404
|
+
" "
|
|
358
405
|
];
|
|
406
|
+
case "md":
|
|
359
407
|
case "markdown":
|
|
360
408
|
return [
|
|
361
409
|
// First, try to split along Markdown headings (starting with level 2)
|
|
@@ -375,10 +423,14 @@ export class TextSplitter {
|
|
|
375
423
|
"\n\n___\n\n",
|
|
376
424
|
// Note that this splitter doesn't handle horizontal lines defined
|
|
377
425
|
// by *three or more* of ***, ---, or ___, but this is not handled
|
|
426
|
+
// Github tables
|
|
427
|
+
"<table>",
|
|
428
|
+
// "<tr>",
|
|
429
|
+
// "<td>",
|
|
430
|
+
// "<td ",
|
|
378
431
|
"\n\n",
|
|
379
432
|
"\n",
|
|
380
|
-
" "
|
|
381
|
-
"",
|
|
433
|
+
" "
|
|
382
434
|
];
|
|
383
435
|
case "latex":
|
|
384
436
|
return [
|
|
@@ -406,8 +458,7 @@ export class TextSplitter {
|
|
|
406
458
|
// Now split by the normal type of lines
|
|
407
459
|
"\n\n",
|
|
408
460
|
"\n",
|
|
409
|
-
" "
|
|
410
|
-
"",
|
|
461
|
+
" "
|
|
411
462
|
];
|
|
412
463
|
case "html":
|
|
413
464
|
return [
|
|
@@ -440,8 +491,7 @@ export class TextSplitter {
|
|
|
440
491
|
"<meta>",
|
|
441
492
|
"<title>",
|
|
442
493
|
// Normal type of lines
|
|
443
|
-
" "
|
|
444
|
-
"",
|
|
494
|
+
" "
|
|
445
495
|
];
|
|
446
496
|
case "sol":
|
|
447
497
|
return [
|
|
@@ -470,8 +520,7 @@ export class TextSplitter {
|
|
|
470
520
|
// Split by the normal type of lines
|
|
471
521
|
"\n\n",
|
|
472
522
|
"\n",
|
|
473
|
-
" "
|
|
474
|
-
"",
|
|
523
|
+
" "
|
|
475
524
|
];
|
|
476
525
|
default:
|
|
477
526
|
return [
|
package/src/WebFetcher.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import axios, { AxiosRequestConfig } from "axios";
|
|
2
|
-
import * as cheerio from "cheerio";
|
|
3
2
|
import { TextFetcher } from './types';
|
|
4
|
-
|
|
3
|
+
import * as cheerio from 'cheerio';
|
|
4
|
+
import TurndownService from 'turndown';
|
|
5
5
|
|
|
6
6
|
const ALLOWED_CONTENT_TYPES = [
|
|
7
7
|
"text/html",
|
|
@@ -30,7 +30,7 @@ const DEFAULT_HEADERS = {
|
|
|
30
30
|
export interface WebFetcherConfig {
|
|
31
31
|
headers?: Record<string,string>;
|
|
32
32
|
requestConfig?: AxiosRequestConfig;
|
|
33
|
-
|
|
33
|
+
htmlToMarkdown: boolean;
|
|
34
34
|
summarizeHtml: boolean;
|
|
35
35
|
}
|
|
36
36
|
|
|
@@ -39,62 +39,12 @@ export class WebFetcher implements TextFetcher {
|
|
|
39
39
|
|
|
40
40
|
public constructor(config?: Partial<WebFetcherConfig>) {
|
|
41
41
|
this._config = Object.assign({
|
|
42
|
-
|
|
42
|
+
htmlToMarkdown: true,
|
|
43
43
|
summarizeHtml: false,
|
|
44
44
|
} as WebFetcherConfig, config);
|
|
45
45
|
}
|
|
46
46
|
|
|
47
|
-
public async fetch(uri: string): Promise<
|
|
48
|
-
const {data, contentType} = await this.fetchPage(uri);
|
|
49
|
-
if (contentType === "text/html" && this._config.htmlToText) {
|
|
50
|
-
return this.extractText(data, uri, this._config.summarizeHtml);
|
|
51
|
-
} else {
|
|
52
|
-
return data;
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
private extractText(html: string, baseUrl: string, summarize: boolean): string {
|
|
57
|
-
// Parse all elements including <noscript> tags
|
|
58
|
-
const $ = cheerio.load(html, { scriptingEnabled: true });
|
|
59
|
-
|
|
60
|
-
// If we want a summary, just get use the <body/>
|
|
61
|
-
let text = '';
|
|
62
|
-
$(`${summarize ? 'body ' : '*'}:not(style):not(script):not(svg)`).each((i, elem: any) => {
|
|
63
|
-
// Remove any children to avoid duplicate text
|
|
64
|
-
let content = $(elem).clone().children().remove().end().text().trim();
|
|
65
|
-
const $el = $(elem);
|
|
66
|
-
|
|
67
|
-
// Print links in markdown format
|
|
68
|
-
let href = $el.attr("href");
|
|
69
|
-
if ($el.prop("tagName")?.toLowerCase() === "a" && href) {
|
|
70
|
-
if (!href.startsWith("http")) {
|
|
71
|
-
// Try converting to a relevant link
|
|
72
|
-
try {
|
|
73
|
-
href = new URL(href, baseUrl).toString();
|
|
74
|
-
} catch {
|
|
75
|
-
// Leave as is
|
|
76
|
-
}
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
// If the link has content, use that as the text
|
|
80
|
-
const altText = $el.find("img[alt]").attr("alt")?.trim();
|
|
81
|
-
if (altText) {
|
|
82
|
-
content += ` ${altText}`;
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
text += ` [${content}](${href})`;
|
|
86
|
-
}
|
|
87
|
-
// otherwise just print the content
|
|
88
|
-
else if (content !== "") {
|
|
89
|
-
text += ` ${content}`;
|
|
90
|
-
}
|
|
91
|
-
});
|
|
92
|
-
|
|
93
|
-
// Remove newlines
|
|
94
|
-
return text.trim().replace(/\n+/g, ' ');
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
private async fetchPage(baseUrl: string): Promise<{data: string; contentType: string;}> {
|
|
47
|
+
public async fetch(uri: string, onDocument: (uri: string, text: string, docType?: string) => Promise<boolean>): Promise<boolean> {
|
|
98
48
|
const httpClient = axios.create({
|
|
99
49
|
validateStatus: () => true,
|
|
100
50
|
});
|
|
@@ -103,12 +53,12 @@ export class WebFetcher implements TextFetcher {
|
|
|
103
53
|
const headers = Object.assign({}, DEFAULT_HEADERS, this._config.headers)
|
|
104
54
|
|
|
105
55
|
// get hostname from url
|
|
106
|
-
const host = new URL(
|
|
56
|
+
const host = new URL(uri).hostname;
|
|
107
57
|
headers['Host'] = host;
|
|
108
58
|
headers['Alt-Used'] = host;
|
|
109
59
|
|
|
110
60
|
// Fetch page and check for errors
|
|
111
|
-
const response = await httpClient.get(
|
|
61
|
+
const response = await httpClient.get(uri, {
|
|
112
62
|
headers,
|
|
113
63
|
...this._config.requestConfig,
|
|
114
64
|
});
|
|
@@ -123,6 +73,156 @@ export class WebFetcher implements TextFetcher {
|
|
|
123
73
|
throw new Error(`Site returned an invalid content type of ${contentType}`);
|
|
124
74
|
}
|
|
125
75
|
|
|
126
|
-
|
|
76
|
+
// Convert content type to doc type
|
|
77
|
+
const docType = contentTypeArray[0] != 'text/plain' ? contentTypeArray[0].split('/')[1] : undefined;
|
|
78
|
+
if (docType == 'html' && this._config.htmlToMarkdown) {
|
|
79
|
+
const text = this.htmlToMarkdown(response.data, uri);
|
|
80
|
+
return await onDocument(uri, text, 'md');
|
|
81
|
+
} else {
|
|
82
|
+
const text = response.data;
|
|
83
|
+
return await onDocument(uri, text, docType);
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
/**
 * Converts an HTML page to markdown.
 *
 * Steps:
 * 1. Remove all `<script>` elements.
 * 2. Rewrite anchor `href` values that do not start with "http" as absolute
 *    URLs resolved against `baseUrl` (links that cannot be resolved are left
 *    untouched).
 * 3. Convert the `<body>` contents with turndown, using the table rules
 *    registered by `convertTables`.
 * 4. If the first space or newline occurs more than 64 characters into the
 *    result, drop everything before it.
 *
 * @param html Raw HTML of the page.
 * @param baseUrl URL the page was fetched from; used to resolve relative links.
 * @returns The markdown text.
 */
private htmlToMarkdown(html: string, baseUrl: string): string {
    const $ = cheerio.load(html, { scriptingEnabled: true });

    // Remove scripts and convert relative links to absolute
    $('script').remove();
    $('a').each((i, elem) => {
        const $el = $(elem);
        const href = $el.attr("href");
        if (href && !href.startsWith("http")) {
            // Try converting to an absolute link
            try {
                $el.attr("href", new URL(href, baseUrl).toString());
            } catch {
                // Leave as is
            }
        }
    });

    // Convert to markdown
    const body = $('body').html() ?? '';
    const turndownService = new TurndownService({
        hr: '\n\n---\n\n',
    });
    convertTables(turndownService);
    const md = turndownService.turndown(body);

    // Remove any overly long header text. indexOf() yields -1 for a missing
    // character, so only real positions may take part in the minimum;
    // otherwise one missing character would disable the trim entirely.
    const breaks = [md.indexOf('\n'), md.indexOf(' ')].filter((pos) => pos >= 0);
    const contentStart = breaks.length > 0 ? Math.min(...breaks) : -1;
    return contentStart > 64 ? md.slice(contentStart) : md;
}
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
/**
 * Registers turndown rules that render HTML tables as pipe-delimited
 * markdown tables.
 *
 * Rules added:
 * - `tableCell`: formats each `th`/`td` through `cell()`.
 * - `tableRow`: emits the row on its own line and, for heading rows, a
 *   following delimiter row that honours each cell's `align` attribute.
 * - `table`: surrounds the table with blank lines and removes blank lines
 *   inside it.
 * - `tableSection`: passes `thead`/`tbody`/`tfoot` content through unchanged.
 *
 * @param turndownService The turndown instance to extend.
 */
function convertTables(turndownService: TurndownService): void {
    turndownService.addRule('tableCell', {
        filter: ['th', 'td'],
        replacement: function (content, node) {
            return cell(content, node)
        }
    });

    turndownService.addRule('tableRow', {
        filter: 'tr',
        replacement: function (content, node) {
            let borderCells = ''
            const alignMap: Record<string, string> = { left: ':--', right: '--:', center: ':-:' }

            if (isHeadingRow(node)) {
                // Build the delimiter row that follows the header
                for (let i = 0; i < node.childNodes.length; i++) {
                    let border = '---'
                    const align: string = (
                        node.childNodes[i].getAttribute('align') || ''
                    ).toLowerCase()

                    // Unknown alignment values fall back to the plain border
                    if (align) border = alignMap[align] || border

                    borderCells += cell(border, node.childNodes[i])
                }
            }
            return '\n' + content + (borderCells ? '\n' + borderCells : '')
        }
    });

    turndownService.addRule('table', {
        filter: ['table'],
        replacement: function (content) {
            // Ensure there are no blank lines. A string pattern would only
            // replace the first occurrence, so use a global regex.
            content = content.replace(/\n{2,}/g, '\n')
            return '\n\n' + content + '\n\n'
        }
    });

    turndownService.addRule('tableSection', {
        filter: ['thead', 'tbody', 'tfoot'],
        replacement: function (content) {
            return content
        }
    });
}
|
|
171
|
+
|
|
172
|
+
// Array helpers borrowed so they can be applied, via .call(), to array-like
// child node lists.
const indexOf = Array.prototype.indexOf
const every = Array.prototype.every

/**
 * Decides whether a table row should be treated as the heading row.
 *
 * A `tr` is a heading row when:
 * - its parent is a THEAD, or
 * - it is the first child of the TABLE or of the first TBODY (possibly
 *   following a blank THEAD) and every one of its cells is a TH.
 */
function isHeadingRow(tr: any) {
    const parent = tr.parentNode

    // Anything inside a THEAD counts as a heading
    if (parent.nodeName === 'THEAD') {
        return true
    }

    // Otherwise the row has to be the very first child of its parent...
    if (parent.firstChild !== tr) {
        return false
    }

    // ...and that parent has to be the TABLE itself or its first TBODY
    if (parent.nodeName !== 'TABLE' && !isFirstTbody(parent)) {
        return false
    }

    // Finally, every cell must be a TH
    return every.call(tr.childNodes, function (n) { return n.nodeName === 'TH' })
}
|
|
191
|
+
|
|
192
|
+
/**
 * Reports whether `element` is a TBODY that is either the first node of its
 * parent or directly preceded by a THEAD whose text content is blank.
 */
function isFirstTbody(element: any) {
    if (element.nodeName !== 'TBODY') {
        return false
    }

    const prev = element.previousSibling
    if (!prev) {
        // Nothing precedes this TBODY
        return true
    }

    // A THEAD containing only whitespace does not count as a real header
    return prev.nodeName === 'THEAD' && /^\s*$/i.test(prev.textContent)
}
|
|
204
|
+
|
|
205
|
+
/**
 * Formats one table cell for a markdown row.
 *
 * The first cell of a row opens with "| ", later cells with a single space.
 * Every cell is closed with " |", and the result is passed through
 * `cleanContent`.
 *
 * @param content Markdown content of the cell.
 * @param node The cell's node; its position among its parent's child nodes
 *             decides the prefix.
 * @returns The formatted cell text.
 */
function cell(content: string, node: any): string {
    const isFirstCell = indexOf.call(node.parentNode.childNodes, node) === 0
    const lead = isFirstCell ? '| ' : ' '
    return cleanContent(lead + content + ' |');
}
|
|
213
|
+
|
|
214
|
+
/**
 * Flattens whitespace so the text fits on a single table line.
 *
 * Each run of newlines, carriage returns, tabs, form feeds, vertical tabs,
 * non-breaking spaces, line/paragraph separators and spaces is replaced by
 * exactly one space. All other characters are copied unchanged.
 *
 * @param content Text to normalise.
 * @returns The text with each whitespace run collapsed to one space.
 */
function cleanContent(content: string): string {
    return content.replace(/[\n\r\t\f\v\u00a0\u2028\u2029 ]+/g, ' ');
}
|
package/src/index.ts
CHANGED
package/src/types.ts
CHANGED
|
@@ -56,7 +56,7 @@ export interface TextChunk {
|
|
|
56
56
|
}
|
|
57
57
|
|
|
58
58
|
export interface TextFetcher {
|
|
59
|
-
fetch(uri: string): Promise<
|
|
59
|
+
fetch(uri: string, onDocument: (uri: string, text: string, docType?: string) => Promise<boolean>): Promise<boolean>;
|
|
60
60
|
}
|
|
61
61
|
|
|
62
62
|
export interface IndexStats {
|
package/src/vectra-cli.ts
CHANGED
|
@@ -5,6 +5,7 @@ import { LocalDocumentIndex } from "./LocalDocumentIndex";
|
|
|
5
5
|
import { WebFetcher } from './WebFetcher';
|
|
6
6
|
import { OpenAIEmbeddings } from './OpenAIEmbeddings';
|
|
7
7
|
import { Colorize } from './internals';
|
|
8
|
+
import { FileFetcher } from './FileFetcher';
|
|
8
9
|
|
|
9
10
|
export async function run() {
|
|
10
11
|
// prettier-ignore
|
|
@@ -77,16 +78,20 @@ export async function run() {
|
|
|
77
78
|
const uris = await getItemList(args.uri as string[], args.list as string, 'web page');
|
|
78
79
|
|
|
79
80
|
// Fetch web pages
|
|
80
|
-
const
|
|
81
|
-
|
|
81
|
+
const fileFetcher = new FileFetcher();
|
|
82
|
+
const webFetcher = new WebFetcher();
|
|
83
|
+
for (const path of uris) {
|
|
82
84
|
try {
|
|
83
|
-
console.log(Colorize.progress(`fetching ${
|
|
84
|
-
const
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
85
|
+
console.log(Colorize.progress(`fetching ${path}`));
|
|
86
|
+
const fetcher = path.startsWith('http') ? webFetcher : fileFetcher;
|
|
87
|
+
await fetcher.fetch(path, async (uri, text, docType) => {
|
|
88
|
+
console.log(Colorize.replaceLine(Colorize.progress(`indexing ${uri}`)));
|
|
89
|
+
await index.upsertDocument(uri, text, docType);
|
|
90
|
+
console.log(Colorize.replaceLine(Colorize.success(`added ${uri}`)));
|
|
91
|
+
return true;
|
|
92
|
+
});
|
|
88
93
|
} catch (err: unknown) {
|
|
89
|
-
console.log(Colorize.replaceLine(Colorize.error(`Error adding: ${
|
|
94
|
+
console.log(Colorize.replaceLine(Colorize.error(`Error adding: ${path}\n${(err as Error).message}`)));
|
|
90
95
|
}
|
|
91
96
|
}
|
|
92
97
|
})
|
|
@@ -142,25 +147,25 @@ export async function run() {
|
|
|
142
147
|
.option('document-count', {
|
|
143
148
|
alias: 'dc',
|
|
144
149
|
describe: 'max number of documents to return (defaults to 10)',
|
|
145
|
-
type: '
|
|
150
|
+
type: 'number',
|
|
146
151
|
default: 10
|
|
147
152
|
})
|
|
148
153
|
.option('chunk-count', {
|
|
149
154
|
alias: 'cc',
|
|
150
155
|
describe: 'max number of chunks to return (defaults to 50)',
|
|
151
|
-
type: '
|
|
156
|
+
type: 'number',
|
|
152
157
|
default: 50
|
|
153
158
|
})
|
|
154
159
|
.option('section-count', {
|
|
155
160
|
alias: 'sc',
|
|
156
161
|
describe: 'max number of document sections to render (defaults to 1)',
|
|
157
|
-
type: '
|
|
162
|
+
type: 'number',
|
|
158
163
|
default: 1
|
|
159
164
|
})
|
|
160
165
|
.option('tokens', {
|
|
161
166
|
alias: 't',
|
|
162
167
|
describe: 'max number of tokens to render for each document section (defaults to 2000)',
|
|
163
|
-
type: '
|
|
168
|
+
type: 'number',
|
|
164
169
|
default: 2000
|
|
165
170
|
})
|
|
166
171
|
.option('format', {
|
|
@@ -200,7 +205,7 @@ export async function run() {
|
|
|
200
205
|
const sections = await result.renderSections(args.tokens, args.sectionCount);
|
|
201
206
|
for (let i = 0; i < sections.length; i++) {
|
|
202
207
|
const section = sections[i];
|
|
203
|
-
console.log(Colorize.title(args.sectionCount
|
|
208
|
+
console.log(Colorize.title(args.sectionCount == 1 ? 'Section' : `Section ${i + 1}`));
|
|
204
209
|
console.log(Colorize.value('score', section.score));
|
|
205
210
|
console.log(Colorize.value('tokens', section.tokenCount));
|
|
206
211
|
console.log(Colorize.output(section.text));
|