vectra 0.2.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,8 @@
1
1
  import { GPT3Tokenizer } from "./GPT3Tokenizer";
2
2
  import { TextChunk, Tokenizer } from "./types";
3
3
 
4
+ const ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
5
+
4
6
  export interface TextSplitterConfig {
5
7
  separators: string[];
6
8
  keepSeparators: boolean;
@@ -15,7 +17,6 @@ export class TextSplitter {
15
17
 
16
18
  public constructor(config?: Partial<TextSplitterConfig>) {
17
19
  this._config = Object.assign({
18
- separators: ["\n\n", "\n", " ", ""],
19
20
  keepSeparators: false,
20
21
  chunkSize: 400,
21
22
  chunkOverlap: 40,
@@ -71,10 +72,22 @@ export class TextSplitter {
71
72
 
72
73
  private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
73
74
  const chunks: TextChunk[] = [];
74
- if (text.length > 0 && separators.length > 0) {
75
- const separator = separators[0];
75
+ if (text.length > 0) {
76
+ // Split text into parts
77
+ let parts: string[];
78
+ let separator = '';
76
79
  const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
77
- const parts = text.split(separator);
80
+ if (separators.length > 0) {
81
+ // Split by separator
82
+ separator = separators[0];
83
+ parts = text.split(separator);
84
+ } else {
85
+ // Cut text in half
86
+ const half = Math.floor(text.length / 2);
87
+ parts = [text.substring(0, half), text.substring(half)];
88
+ }
89
+
90
+ // Iterate over parts
78
91
  for (let i = 0; i < parts.length; i++) {
79
92
  const lastChunk = (i === parts.length - 1);
80
93
 
@@ -85,36 +98,82 @@ export class TextSplitter {
85
98
  chunk += separator;
86
99
  }
87
100
 
88
- // Check for empty chunk
89
- const trimmed = chunk.trim();
90
- if (trimmed.length === 0 || trimmed == '\n') {
101
+ // Ensure chunk contains text
102
+ if (!this.containsAlphanumeric(chunk)) {
91
103
  continue;
92
104
  }
93
105
 
94
- // Encode chunk text
95
- const tokens = this._config.tokenizer.encode(chunk);
96
- if (tokens.length > this._config.chunkSize) {
106
+ // Optimization to avoid encoding really large chunks
107
+ if (chunk.length / 6 > this._config.chunkSize) {
97
108
  // Break the text into smaller chunks
98
109
  const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
99
110
  chunks.push(...subChunks);
100
111
  } else {
101
- // Append chunk to output
102
- chunks.push({
103
- text: chunk,
104
- tokens: tokens,
105
- startPos: startPos,
106
- endPos: endPos,
107
- startOverlap: [],
108
- endOverlap: [],
109
- });
112
+ // Encode chunk text
113
+ const tokens = this._config.tokenizer.encode(chunk);
114
+ if (tokens.length > this._config.chunkSize) {
115
+ // Break the text into smaller chunks
116
+ const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
117
+ chunks.push(...subChunks);
118
+ } else {
119
+ // Append chunk to output
120
+ chunks.push({
121
+ text: chunk,
122
+ tokens: tokens,
123
+ startPos: startPos,
124
+ endPos: endPos,
125
+ startOverlap: [],
126
+ endOverlap: [],
127
+ });
128
+ }
129
+
110
130
  }
111
131
 
132
+
112
133
  // Update startPos
113
134
  startPos = endPos + 1;
114
135
  }
115
136
  }
116
137
 
117
- return chunks;
138
+ return this.combineChunks(chunks);
139
+ }
140
+
141
/**
 * Greedily merges adjacent chunks so that each returned chunk carries as many
 * tokens as possible without exceeding `chunkSize`.
 *
 * Chunks are merged in place into the first chunk of each group: its `text`,
 * `tokens` and `endPos` are extended, so the objects in `chunks` are mutated.
 * Texts are joined with a single space unless `keepSeparators` is set, in
 * which case they are concatenated directly. The joining space is not
 * encoded; the merged `tokens` array is simply the parts' tokens in order.
 *
 * @param chunks Chunks in document order.
 * @returns The merged chunks, in the same order.
 */
private combineChunks(chunks: TextChunk[]): TextChunk[] {
    const combinedChunks: TextChunk[] = [];
    const separator = this._config.keepSeparators ? '' : ' ';
    let currentChunk: TextChunk | undefined;
    for (const chunk of chunks) {
        if (!currentChunk) {
            currentChunk = chunk;
            continue;
        }

        if (currentChunk.tokens.length + chunk.tokens.length > this._config.chunkSize) {
            // Adding this chunk would overflow the budget, so start a new group.
            combinedChunks.push(currentChunk);
            currentChunk = chunk;
        } else {
            currentChunk.text += separator + chunk.text;
            currentChunk.tokens.push(...chunk.tokens);
            // Keep the span in sync with the merged text; without this the
            // merged chunk would still report the end of its first part only.
            currentChunk.endPos = chunk.endPos;
        }
    }

    if (currentChunk) {
        combinedChunks.push(currentChunk);
    }
    return combinedChunks;
}
169
+
170
/**
 * Returns true when `text` contains at least one letter or digit.
 *
 * ASCII letters and digits are checked first against ALPHANUMERIC_CHARS. If
 * none are found, the text is tested for any Unicode letter or number, so
 * text written in non-Latin scripts is not mistaken for an empty chunk.
 *
 * @param text Text to inspect.
 * @returns False for empty, whitespace-only or punctuation-only text.
 */
private containsAlphanumeric(text: string): boolean {
    // Fast path: plain ASCII letters and digits.
    for (let i = 0; i < text.length; i++) {
        if (ALPHANUMERIC_CHARS.includes(text[i])) {
            return true;
        }
    }

    // Fallback: letters (\p{L}) and numbers (\p{N}) from any script.
    return /[\p{L}\p{N}]/u.test(text);
}
119
178
 
120
179
  private getSeparators(docType?: string): string[] {
@@ -137,8 +196,7 @@ export class TextSplitter {
137
196
  // Split by the normal type of lines
138
197
  "\n\n",
139
198
  "\n",
140
- " ",
141
- "",
199
+ " "
142
200
  ];
143
201
  case "go":
144
202
  return [
@@ -155,8 +213,7 @@ export class TextSplitter {
155
213
  // Split by the normal type of lines
156
214
  "\n\n",
157
215
  "\n",
158
- " ",
159
- "",
216
+ " "
160
217
  ];
161
218
  case "java":
162
219
  case "c#":
@@ -182,8 +239,7 @@ export class TextSplitter {
182
239
  // Split by the normal type of lines
183
240
  "\n\n",
184
241
  "\n",
185
- " ",
186
- "",
242
+ " "
187
243
  ];
188
244
  case "js":
189
245
  case "jsx":
@@ -207,8 +263,7 @@ export class TextSplitter {
207
263
  // Split by the normal type of lines
208
264
  "\n\n",
209
265
  "\n",
210
- " ",
211
- "",
266
+ " "
212
267
  ];
213
268
  case "php":
214
269
  return [
@@ -226,8 +281,7 @@ export class TextSplitter {
226
281
  // Split by the normal type of lines
227
282
  "\n\n",
228
283
  "\n",
229
- " ",
230
- "",
284
+ " "
231
285
  ];
232
286
  case "proto":
233
287
  return [
@@ -246,8 +300,7 @@ export class TextSplitter {
246
300
  // Split by the normal type of lines
247
301
  "\n\n",
248
302
  "\n",
249
- " ",
250
- "",
303
+ " "
251
304
  ];
252
305
  case "python":
253
306
  case "py":
@@ -259,8 +312,7 @@ export class TextSplitter {
259
312
  // Now split by the normal type of lines
260
313
  "\n\n",
261
314
  "\n",
262
- " ",
263
- "",
315
+ " "
264
316
  ];
265
317
  case "rst":
266
318
  return [
@@ -273,8 +325,7 @@ export class TextSplitter {
273
325
  // Split by the normal type of lines
274
326
  "\n\n",
275
327
  "\n",
276
- " ",
277
- "",
328
+ " "
278
329
  ];
279
330
  case "ruby":
280
331
  return [
@@ -292,8 +343,7 @@ export class TextSplitter {
292
343
  // Split by the normal type of lines
293
344
  "\n\n",
294
345
  "\n",
295
- " ",
296
- "",
346
+ " "
297
347
  ];
298
348
  case "rust":
299
349
  return [
@@ -311,8 +361,7 @@ export class TextSplitter {
311
361
  // Split by the normal type of lines
312
362
  "\n\n",
313
363
  "\n",
314
- " ",
315
- "",
364
+ " "
316
365
  ];
317
366
  case "scala":
318
367
  return [
@@ -332,8 +381,7 @@ export class TextSplitter {
332
381
  // Split by the normal type of lines
333
382
  "\n\n",
334
383
  "\n",
335
- " ",
336
- "",
384
+ " "
337
385
  ];
338
386
  case "swift":
339
387
  return [
@@ -353,9 +401,9 @@ export class TextSplitter {
353
401
  // Split by the normal type of lines
354
402
  "\n\n",
355
403
  "\n",
356
- " ",
357
- "",
404
+ " "
358
405
  ];
406
+ case "md":
359
407
  case "markdown":
360
408
  return [
361
409
  // First, try to split along Markdown headings (starting with level 2)
@@ -375,10 +423,14 @@ export class TextSplitter {
375
423
  "\n\n___\n\n",
376
424
  // Note that this splitter doesn't handle horizontal lines defined
377
425
  // by *three or more* of ***, ---, or ___, but this is not handled
426
+ // Github tables
427
+ "<table>",
428
+ // "<tr>",
429
+ // "<td>",
430
+ // "<td ",
378
431
  "\n\n",
379
432
  "\n",
380
- " ",
381
- "",
433
+ " "
382
434
  ];
383
435
  case "latex":
384
436
  return [
@@ -406,8 +458,7 @@ export class TextSplitter {
406
458
  // Now split by the normal type of lines
407
459
  "\n\n",
408
460
  "\n",
409
- " ",
410
- "",
461
+ " "
411
462
  ];
412
463
  case "html":
413
464
  return [
@@ -440,8 +491,7 @@ export class TextSplitter {
440
491
  "<meta>",
441
492
  "<title>",
442
493
  // Normal type of lines
443
- " ",
444
- "",
494
+ " "
445
495
  ];
446
496
  case "sol":
447
497
  return [
@@ -470,8 +520,7 @@ export class TextSplitter {
470
520
  // Split by the normal type of lines
471
521
  "\n\n",
472
522
  "\n",
473
- " ",
474
- "",
523
+ " "
475
524
  ];
476
525
  default:
477
526
  return [
package/src/WebFetcher.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import axios, { AxiosRequestConfig } from "axios";
2
- import * as cheerio from "cheerio";
3
2
  import { TextFetcher } from './types';
4
-
3
+ import * as cheerio from 'cheerio';
4
+ import TurndownService from 'turndown';
5
5
 
6
6
  const ALLOWED_CONTENT_TYPES = [
7
7
  "text/html",
@@ -30,7 +30,7 @@ const DEFAULT_HEADERS = {
30
30
  export interface WebFetcherConfig {
31
31
  headers?: Record<string,string>;
32
32
  requestConfig?: AxiosRequestConfig;
33
- htmlToText: boolean;
33
+ htmlToMarkdown: boolean;
34
34
  summarizeHtml: boolean;
35
35
  }
36
36
 
@@ -39,62 +39,12 @@ export class WebFetcher implements TextFetcher {
39
39
 
40
40
  public constructor(config?: Partial<WebFetcherConfig>) {
41
41
  this._config = Object.assign({
42
- htmlToText: true,
42
+ htmlToMarkdown: true,
43
43
  summarizeHtml: false,
44
44
  } as WebFetcherConfig, config);
45
45
  }
46
46
 
47
- public async fetch(uri: string): Promise<string> {
48
- const {data, contentType} = await this.fetchPage(uri);
49
- if (contentType === "text/html" && this._config.htmlToText) {
50
- return this.extractText(data, uri, this._config.summarizeHtml);
51
- } else {
52
- return data;
53
- }
54
- }
55
-
56
- private extractText(html: string, baseUrl: string, summarize: boolean): string {
57
- // Parse all elements including <noscript> tags
58
- const $ = cheerio.load(html, { scriptingEnabled: true });
59
-
60
- // If we want a summary, just get use the <body/>
61
- let text = '';
62
- $(`${summarize ? 'body ' : '*'}:not(style):not(script):not(svg)`).each((i, elem: any) => {
63
- // Remove any children to avoid duplicate text
64
- let content = $(elem).clone().children().remove().end().text().trim();
65
- const $el = $(elem);
66
-
67
- // Print links in markdown format
68
- let href = $el.attr("href");
69
- if ($el.prop("tagName")?.toLowerCase() === "a" && href) {
70
- if (!href.startsWith("http")) {
71
- // Try converting to a relevant link
72
- try {
73
- href = new URL(href, baseUrl).toString();
74
- } catch {
75
- // Leave as is
76
- }
77
- }
78
-
79
- // If the link has content, use that as the text
80
- const altText = $el.find("img[alt]").attr("alt")?.trim();
81
- if (altText) {
82
- content += ` ${altText}`;
83
- }
84
-
85
- text += ` [${content}](${href})`;
86
- }
87
- // otherwise just print the content
88
- else if (content !== "") {
89
- text += ` ${content}`;
90
- }
91
- });
92
-
93
- // Remove newlines
94
- return text.trim().replace(/\n+/g, ' ');
95
- }
96
-
97
- private async fetchPage(baseUrl: string): Promise<{data: string; contentType: string;}> {
47
+ public async fetch(uri: string, onDocument: (uri: string, text: string, docType?: string) => Promise<boolean>): Promise<boolean> {
98
48
  const httpClient = axios.create({
99
49
  validateStatus: () => true,
100
50
  });
@@ -103,12 +53,12 @@ export class WebFetcher implements TextFetcher {
103
53
  const headers = Object.assign({}, DEFAULT_HEADERS, this._config.headers)
104
54
 
105
55
  // get hostname from url
106
- const host = new URL(baseUrl).hostname;
56
+ const host = new URL(uri).hostname;
107
57
  headers['Host'] = host;
108
58
  headers['Alt-Used'] = host;
109
59
 
110
60
  // Fetch page and check for errors
111
- const response = await httpClient.get(baseUrl, {
61
+ const response = await httpClient.get(uri, {
112
62
  headers,
113
63
  ...this._config.requestConfig,
114
64
  });
@@ -123,6 +73,156 @@ export class WebFetcher implements TextFetcher {
123
73
  throw new Error(`Site returned an invalid content type of ${contentType}`);
124
74
  }
125
75
 
126
- return {data: response.data, contentType: contentTypeArray[0]};
76
+ // Convert content type to doc type
77
+ const docType = contentTypeArray[0] != 'text/plain' ? contentTypeArray[0].split('/')[1] : undefined;
78
+ if (docType == 'html' && this._config.htmlToMarkdown) {
79
+ const text = this.htmlToMarkdown(response.data, uri);
80
+ return await onDocument(uri, text, 'md');
81
+ } else {
82
+ const text = response.data;
83
+ return await onDocument(uri, text, docType);
84
+ }
85
+ }
86
+
87
+
88
/**
 * Converts an HTML page to markdown.
 *
 * `<script>` elements are removed, and every `<a href>` that does not start
 * with "http" is resolved against `baseUrl` (hrefs that cannot be resolved
 * are left untouched). Only the contents of `<body>` are converted. Tables
 * are rendered through the rules registered by `convertTables`.
 *
 * @param html Raw HTML of the page.
 * @param baseUrl URL the page was fetched from, used to resolve relative links.
 * @returns The markdown text. If the first whitespace character sits more than
 * 64 characters in, everything before it is dropped.
 */
private htmlToMarkdown(html: string, baseUrl: string): string {
    // Parse the HTML; scriptingEnabled is forwarded to cheerio's parser options.
    const $ = cheerio.load(html, { scriptingEnabled: true });

    // Remove scripts and convert relative links to absolute
    $('script').remove();
    $('a').each((i, elem) => {
        const $el = $(elem);
        const href = $el.attr("href");
        if (href && !href.startsWith("http")) {
            // Try converting to an absolute link
            try {
                $el.attr("href", new URL(href, baseUrl).toString());
            } catch {
                // Leave as is
            }
        }
    });

    // Convert to markdown
    const body = $('body').html() ?? '';
    const turndownService = new TurndownService({
        hr: '\n\n---\n\n',
    });
    convertTables(turndownService);
    const md = turndownService.turndown(body);

    // Remove any overly long header text. Only delimiters that actually occur
    // are considered: indexOf() returns -1 for a missing one, and taking
    // Math.min over both would then always yield -1 and disable the trim.
    const breaks = [md.indexOf('\n'), md.indexOf(' ')].filter((pos) => pos >= 0);
    const contentStart = breaks.length > 0 ? Math.min(...breaks) : -1;
    return contentStart > 64 ? md.slice(contentStart) : md;
}
123
+ }
124
+
125
/**
 * Registers rules on `turndownService` that render HTML tables as
 * pipe-delimited markdown tables.
 *
 * Four rules are added: `tableCell` (th/td), `tableRow` (tr), `table` and
 * `tableSection` (thead/tbody/tfoot).
 *
 * @param turndownService Service instance to extend; it is mutated.
 */
function convertTables(turndownService: TurndownService): void {
    // Separator cell written under a heading cell, keyed by its `align` attribute.
    const alignMap = new Map<string, string>([
        ['left', ':--'],
        ['right', '--:'],
        ['center', ':-:'],
    ]);

    turndownService.addRule('tableCell', {
        filter: ['th', 'td'],
        replacement: (content, node) => cell(content, node),
    });

    turndownService.addRule('tableRow', {
        filter: 'tr',
        replacement: (content, node) => {
            let borderCells = '';

            // A heading row is followed by a separator row with one cell per column.
            if (isHeadingRow(node)) {
                for (let i = 0; i < node.childNodes.length; i++) {
                    const child: any = node.childNodes[i];
                    // Skip anything that is not an element (e.g. a stray text
                    // node); only elements can carry an `align` attribute.
                    if (typeof child.getAttribute !== 'function') {
                        continue;
                    }
                    const align: string = (child.getAttribute('align') || '').toLowerCase();
                    borderCells += cell(alignMap.get(align) ?? '---', child);
                }
            }
            return '\n' + content + (borderCells ? '\n' + borderCells : '');
        },
    });

    turndownService.addRule('table', {
        filter: ['table'],
        replacement: (content) => {
            // Ensure there are no blank lines: collapse every run of newlines,
            // not just the first occurrence.
            return '\n\n' + content.replace(/\n{2,}/g, '\n') + '\n\n';
        },
    });

    turndownService.addRule('tableSection', {
        filter: ['thead', 'tbody', 'tfoot'],
        // Sections add no markup of their own; rows are emitted as-is.
        replacement: (content) => content,
    });
}
171
+
172
+ const indexOf = Array.prototype.indexOf
173
+ const every = Array.prototype.every
174
+
175
+ // A tr is a heading row if:
176
+ // - the parent is a THEAD
177
+ // - or if it's the first child of the TABLE or the first TBODY (possibly
178
+ // following a blank THEAD)
179
+ // - and every cell is a TH
180
/**
 * Reports whether a table row should be rendered as the heading row.
 *
 * A row qualifies when its parent is a THEAD, or when it is the first child
 * of the TABLE (or of the first TBODY) and every one of its child nodes is a
 * TH.
 *
 * @param tr The row node to test.
 * @returns True for a heading row; false otherwise, including for a row that
 * has no parent.
 */
function isHeadingRow(tr: any): boolean {
    const parentNode = tr.parentNode;
    if (!parentNode) {
        // A detached row cannot be positioned within a table.
        return false;
    }
    if (parentNode.nodeName === 'THEAD') {
        return true;
    }
    return (
        parentNode.firstChild === tr &&
        (parentNode.nodeName === 'TABLE' || isFirstTbody(parentNode)) &&
        every.call(tr.childNodes, (n: any) => n.nodeName === 'TH')
    );
}
191
+
192
/**
 * Reports whether `element` is a TBODY that is effectively the first section
 * of its table: it has no previous sibling, or its previous sibling is a
 * THEAD whose text content is empty or whitespace only.
 *
 * @param element Node to test; only `nodeName`, `previousSibling` and the
 * sibling's `nodeName`/`textContent` are read.
 * @returns True when the element is such a TBODY.
 */
function isFirstTbody(element: any): boolean {
    if (element.nodeName !== 'TBODY') {
        return false;
    }

    const previousSibling = element.previousSibling;
    if (!previousSibling) {
        return true;
    }

    // A blank THEAD in front of the TBODY does not count as content.
    return previousSibling.nodeName === 'THEAD' && /^\s*$/.test(previousSibling.textContent);
}
204
+
205
/**
 * Formats one table cell for a markdown table row.
 *
 * The cell that is the first child node of its parent is prefixed with "| ";
 * every other cell is prefixed with a single space. All cells end with " |".
 * The result is passed through `cleanContent`.
 *
 * @param content Markdown content of the cell.
 * @param node The cell's node; its position among `node.parentNode.childNodes`
 * selects the prefix.
 * @returns The formatted cell text.
 */
function cell(content: string, node: any): string {
    const index = indexOf.call(node.parentNode.childNodes, node);
    const prefix = index === 0 ? '| ' : ' ';
    return cleanContent(prefix + content + ' |');
}
213
+
214
/**
 * Collapses every run of whitespace in `content` into a single space.
 *
 * The characters treated as whitespace are: newline, carriage return, tab,
 * form feed, vertical tab, no-break space (U+00A0), line separator (U+2028),
 * paragraph separator (U+2029) and the ordinary space. Leading and trailing
 * runs are collapsed to one space as well, not removed.
 *
 * @param content Text to normalise.
 * @returns The text with each whitespace run replaced by one space.
 */
function cleanContent(content: string): string {
    return content.replace(/[\n\r\t\f\v\u00a0\u2028\u2029 ]+/g, ' ');
}
package/src/index.ts CHANGED
@@ -1,3 +1,4 @@
1
+ export * from './FileFetcher';
1
2
  export * from './GPT3Tokenizer';
2
3
  export * from './ItemSelector';
3
4
  export * from './LocalIndex';
package/src/types.ts CHANGED
@@ -56,7 +56,7 @@ export interface TextChunk {
56
56
  }
57
57
 
58
58
  export interface TextFetcher {
59
- fetch(uri: string): Promise<string>;
59
+ fetch(uri: string, onDocument: (uri: string, text: string, docType?: string) => Promise<boolean>): Promise<boolean>;
60
60
  }
61
61
 
62
62
  export interface IndexStats {
package/src/vectra-cli.ts CHANGED
@@ -5,6 +5,7 @@ import { LocalDocumentIndex } from "./LocalDocumentIndex";
5
5
  import { WebFetcher } from './WebFetcher';
6
6
  import { OpenAIEmbeddings } from './OpenAIEmbeddings';
7
7
  import { Colorize } from './internals';
8
+ import { FileFetcher } from './FileFetcher';
8
9
 
9
10
  export async function run() {
10
11
  // prettier-ignore
@@ -77,16 +78,20 @@ export async function run() {
77
78
  const uris = await getItemList(args.uri as string[], args.list as string, 'web page');
78
79
 
79
80
  // Fetch web pages
80
- const fetcher = new WebFetcher();
81
- for (const uri of uris) {
81
+ const fileFetcher = new FileFetcher();
82
+ const webFetcher = new WebFetcher();
83
+ for (const path of uris) {
82
84
  try {
83
- console.log(Colorize.progress(`fetching ${uri}`));
84
- const content = await fetcher.fetch(uri);
85
- console.log(Colorize.replaceLine(Colorize.progress(`indexing ${uri}`)));
86
- await index.upsertDocument(uri, content);
87
- console.log(Colorize.replaceLine(Colorize.success(`added ${uri}`)));
85
+ console.log(Colorize.progress(`fetching ${path}`));
86
+ const fetcher = path.startsWith('http') ? webFetcher : fileFetcher;
87
+ await fetcher.fetch(path, async (uri, text, docType) => {
88
+ console.log(Colorize.replaceLine(Colorize.progress(`indexing ${uri}`)));
89
+ await index.upsertDocument(uri, text, docType);
90
+ console.log(Colorize.replaceLine(Colorize.success(`added ${uri}`)));
91
+ return true;
92
+ });
88
93
  } catch (err: unknown) {
89
- console.log(Colorize.replaceLine(Colorize.error(`Error adding: ${uri}\n${(err as Error).message}`)));
94
+ console.log(Colorize.replaceLine(Colorize.error(`Error adding: ${path}\n${(err as Error).message}`)));
90
95
  }
91
96
  }
92
97
  })
@@ -142,25 +147,25 @@ export async function run() {
142
147
  .option('document-count', {
143
148
  alias: 'dc',
144
149
  describe: 'max number of documents to return (defaults to 10)',
145
- type: 'count',
150
+ type: 'number',
146
151
  default: 10
147
152
  })
148
153
  .option('chunk-count', {
149
154
  alias: 'cc',
150
155
  describe: 'max number of chunks to return (defaults to 50)',
151
- type: 'count',
156
+ type: 'number',
152
157
  default: 50
153
158
  })
154
159
  .option('section-count', {
155
160
  alias: 'sc',
156
161
  describe: 'max number of document sections to render (defaults to 1)',
157
- type: 'count',
162
+ type: 'number',
158
163
  default: 1
159
164
  })
160
165
  .option('tokens', {
161
166
  alias: 't',
162
167
  describe: 'max number of tokens to render for each document section (defaults to 2000)',
163
- type: 'count',
168
+ type: 'number',
164
169
  default: 2000
165
170
  })
166
171
  .option('format', {
@@ -200,7 +205,7 @@ export async function run() {
200
205
  const sections = await result.renderSections(args.tokens, args.sectionCount);
201
206
  for (let i = 0; i < sections.length; i++) {
202
207
  const section = sections[i];
203
- console.log(Colorize.title(args.sectionCount > 1 ? 'Section' : `Section ${1}`));
208
+ console.log(Colorize.title(args.sectionCount == 1 ? 'Section' : `Section ${i + 1}`));
204
209
  console.log(Colorize.value('score', section.score));
205
210
  console.log(Colorize.value('tokens', section.tokenCount));
206
211
  console.log(Colorize.output(section.text));