vectra 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,8 @@
1
1
  import { GPT3Tokenizer } from "./GPT3Tokenizer";
2
2
  import { TextChunk, Tokenizer } from "./types";
3
3
 
4
+ const ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
5
+
4
6
  export interface TextSplitterConfig {
5
7
  separators: string[];
6
8
  keepSeparators: boolean;
@@ -15,7 +17,6 @@ export class TextSplitter {
15
17
 
16
18
  public constructor(config?: Partial<TextSplitterConfig>) {
17
19
  this._config = Object.assign({
18
- separators: ["\n\n", "\n", " ", ""],
19
20
  keepSeparators: false,
20
21
  chunkSize: 400,
21
22
  chunkOverlap: 40,
@@ -71,10 +72,22 @@ export class TextSplitter {
71
72
 
72
73
  private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
73
74
  const chunks: TextChunk[] = [];
74
- if (text.length > 0 && separators.length > 0) {
75
- const separator = separators[0];
75
+ if (text.length > 0) {
76
+ // Split text into parts
77
+ let parts: string[];
78
+ let separator = '';
76
79
  const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
77
- const parts = text.split(separator);
80
+ if (separators.length > 0) {
81
+ // Split by separator
82
+ separator = separators[0];
83
+ parts = text.split(separator);
84
+ } else {
85
+ // Cut text in half
86
+ const half = Math.floor(text.length / 2);
87
+ parts = [text.substring(0, half), text.substring(half)];
88
+ }
89
+
90
+ // Iterate over parts
78
91
  for (let i = 0; i < parts.length; i++) {
79
92
  const lastChunk = (i === parts.length - 1);
80
93
 
@@ -85,30 +98,82 @@ export class TextSplitter {
85
98
  chunk += separator;
86
99
  }
87
100
 
88
- // Encode chunk text
89
- const tokens = this._config.tokenizer.encode(chunk);
90
- if (tokens.length > this._config.chunkSize) {
101
+ // Ensure chunk contains text
102
+ if (!this.containsAlphanumeric(chunk)) {
103
+ continue;
104
+ }
105
+
106
+ // Optimization to avoid encoding really large chunks
107
+ if (chunk.length / 6 > this._config.chunkSize) {
91
108
  // Break the text into smaller chunks
92
109
  const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
93
110
  chunks.push(...subChunks);
94
111
  } else {
95
- // Append chunk to output
96
- chunks.push({
97
- text: chunk,
98
- tokens: tokens,
99
- startPos: startPos,
100
- endPos: endPos,
101
- startOverlap: [],
102
- endOverlap: [],
103
- });
112
+ // Encode chunk text
113
+ const tokens = this._config.tokenizer.encode(chunk);
114
+ if (tokens.length > this._config.chunkSize) {
115
+ // Break the text into smaller chunks
116
+ const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
117
+ chunks.push(...subChunks);
118
+ } else {
119
+ // Append chunk to output
120
+ chunks.push({
121
+ text: chunk,
122
+ tokens: tokens,
123
+ startPos: startPos,
124
+ endPos: endPos,
125
+ startOverlap: [],
126
+ endOverlap: [],
127
+ });
128
+ }
129
+
104
130
  }
105
131
 
132
+
106
133
  // Update startPos
107
134
  startPos = endPos + 1;
108
135
  }
109
136
  }
110
137
 
111
- return chunks;
138
+ return this.combineChunks(chunks);
139
+ }
140
+
141
/**
 * Greedily merges adjacent chunks while their combined token count stays
 * within the configured `chunkSize`.
 *
 * Chunks are visited in order. A chunk is appended to the chunk currently
 * being built when the sum of both token counts does not exceed
 * `chunkSize`; otherwise the current chunk is emitted and the visited
 * chunk starts a new one. Merged text is joined with a single space unless
 * `keepSeparators` is set, in which case the pieces are concatenated as-is.
 *
 * Note: the chunk objects passed in are extended in place (text, tokens and
 * endPos of the first chunk of each merged run are updated). The joining
 * space is not re-tokenized; the merged token list is the plain
 * concatenation of the parts' tokens.
 *
 * @param chunks Ordered chunks to merge.
 * @returns The merged chunks, in the same order.
 */
private combineChunks(chunks: TextChunk[]): TextChunk[] {
    const combinedChunks: TextChunk[] = [];
    const separator = this._config.keepSeparators ? '' : ' ';
    let currentChunk: TextChunk | undefined;

    for (const chunk of chunks) {
        if (!currentChunk) {
            // First chunk of a new run
            currentChunk = chunk;
            continue;
        }

        const length = currentChunk.tokens.length + chunk.tokens.length;
        if (length > this._config.chunkSize) {
            // Current run is full: emit it and start a new run
            combinedChunks.push(currentChunk);
            currentChunk = chunk;
        } else {
            currentChunk.text += separator + chunk.text;
            currentChunk.tokens.push(...chunk.tokens);
            // Keep the reported span in sync with the merged text
            currentChunk.endPos = chunk.endPos;
        }
    }

    if (currentChunk) {
        combinedChunks.push(currentChunk);
    }
    return combinedChunks;
}
169
+
170
/**
 * Reports whether `text` contains at least one letter or digit.
 *
 * ASCII letters and digits are checked first via `ALPHANUMERIC_CHARS`;
 * if none is found, the text is tested against the Unicode letter and
 * number categories so that non-Latin content (e.g. CJK or Cyrillic) is
 * still recognized as text instead of being treated as empty.
 *
 * @param text Text to inspect.
 * @returns `true` if any letter or digit is present, otherwise `false`.
 */
private containsAlphanumeric(text: string): boolean {
    // Fast path: plain ASCII letters and digits
    for (let i = 0; i < text.length; i++) {
        if (ALPHANUMERIC_CHARS.includes(text[i])) {
            return true;
        }
    }

    // Fallback: any Unicode letter or number
    return /[\p{L}\p{N}]/u.test(text);
}
113
178
 
114
179
  private getSeparators(docType?: string): string[] {
@@ -131,8 +196,7 @@ export class TextSplitter {
131
196
  // Split by the normal type of lines
132
197
  "\n\n",
133
198
  "\n",
134
- " ",
135
- "",
199
+ " "
136
200
  ];
137
201
  case "go":
138
202
  return [
@@ -149,8 +213,7 @@ export class TextSplitter {
149
213
  // Split by the normal type of lines
150
214
  "\n\n",
151
215
  "\n",
152
- " ",
153
- "",
216
+ " "
154
217
  ];
155
218
  case "java":
156
219
  case "c#":
@@ -176,8 +239,7 @@ export class TextSplitter {
176
239
  // Split by the normal type of lines
177
240
  "\n\n",
178
241
  "\n",
179
- " ",
180
- "",
242
+ " "
181
243
  ];
182
244
  case "js":
183
245
  case "jsx":
@@ -201,8 +263,7 @@ export class TextSplitter {
201
263
  // Split by the normal type of lines
202
264
  "\n\n",
203
265
  "\n",
204
- " ",
205
- "",
266
+ " "
206
267
  ];
207
268
  case "php":
208
269
  return [
@@ -220,8 +281,7 @@ export class TextSplitter {
220
281
  // Split by the normal type of lines
221
282
  "\n\n",
222
283
  "\n",
223
- " ",
224
- "",
284
+ " "
225
285
  ];
226
286
  case "proto":
227
287
  return [
@@ -240,8 +300,7 @@ export class TextSplitter {
240
300
  // Split by the normal type of lines
241
301
  "\n\n",
242
302
  "\n",
243
- " ",
244
- "",
303
+ " "
245
304
  ];
246
305
  case "python":
247
306
  case "py":
@@ -253,8 +312,7 @@ export class TextSplitter {
253
312
  // Now split by the normal type of lines
254
313
  "\n\n",
255
314
  "\n",
256
- " ",
257
- "",
315
+ " "
258
316
  ];
259
317
  case "rst":
260
318
  return [
@@ -267,8 +325,7 @@ export class TextSplitter {
267
325
  // Split by the normal type of lines
268
326
  "\n\n",
269
327
  "\n",
270
- " ",
271
- "",
328
+ " "
272
329
  ];
273
330
  case "ruby":
274
331
  return [
@@ -286,8 +343,7 @@ export class TextSplitter {
286
343
  // Split by the normal type of lines
287
344
  "\n\n",
288
345
  "\n",
289
- " ",
290
- "",
346
+ " "
291
347
  ];
292
348
  case "rust":
293
349
  return [
@@ -305,8 +361,7 @@ export class TextSplitter {
305
361
  // Split by the normal type of lines
306
362
  "\n\n",
307
363
  "\n",
308
- " ",
309
- "",
364
+ " "
310
365
  ];
311
366
  case "scala":
312
367
  return [
@@ -326,8 +381,7 @@ export class TextSplitter {
326
381
  // Split by the normal type of lines
327
382
  "\n\n",
328
383
  "\n",
329
- " ",
330
- "",
384
+ " "
331
385
  ];
332
386
  case "swift":
333
387
  return [
@@ -347,9 +401,9 @@ export class TextSplitter {
347
401
  // Split by the normal type of lines
348
402
  "\n\n",
349
403
  "\n",
350
- " ",
351
- "",
404
+ " "
352
405
  ];
406
+ case "md":
353
407
  case "markdown":
354
408
  return [
355
409
  // First, try to split along Markdown headings (starting with level 2)
@@ -369,10 +423,14 @@ export class TextSplitter {
369
423
  "\n\n___\n\n",
370
424
  // Note that this splitter doesn't handle horizontal lines defined
371
425
  // by *three or more* of ***, ---, or ___, but this is not handled
426
+ // Github tables
427
+ "<table>",
428
+ // "<tr>",
429
+ // "<td>",
430
+ // "<td ",
372
431
  "\n\n",
373
432
  "\n",
374
- " ",
375
- "",
433
+ " "
376
434
  ];
377
435
  case "latex":
378
436
  return [
@@ -400,8 +458,7 @@ export class TextSplitter {
400
458
  // Now split by the normal type of lines
401
459
  "\n\n",
402
460
  "\n",
403
- " ",
404
- "",
461
+ " "
405
462
  ];
406
463
  case "html":
407
464
  return [
@@ -434,8 +491,7 @@ export class TextSplitter {
434
491
  "<meta>",
435
492
  "<title>",
436
493
  // Normal type of lines
437
- " ",
438
- "",
494
+ " "
439
495
  ];
440
496
  case "sol":
441
497
  return [
@@ -464,8 +520,7 @@ export class TextSplitter {
464
520
  // Split by the normal type of lines
465
521
  "\n\n",
466
522
  "\n",
467
- " ",
468
- "",
523
+ " "
469
524
  ];
470
525
  default:
471
526
  return [
package/src/WebFetcher.ts CHANGED
@@ -1,6 +1,7 @@
1
1
  import axios, { AxiosRequestConfig } from "axios";
2
- import * as cheerio from "cheerio";
3
2
  import { TextFetcher } from './types';
3
+ import * as cheerio from 'cheerio';
4
+ import TurndownService from 'turndown';
4
5
 
5
6
 
6
7
  const ALLOWED_CONTENT_TYPES = [
@@ -30,7 +31,7 @@ const DEFAULT_HEADERS = {
30
31
  export interface WebFetcherConfig {
31
32
  headers?: Record<string,string>;
32
33
  requestConfig?: AxiosRequestConfig;
33
- htmlToText: boolean;
34
+ htmlToMarkdown: boolean;
34
35
  summarizeHtml: boolean;
35
36
  }
36
37
 
@@ -39,62 +40,12 @@ export class WebFetcher implements TextFetcher {
39
40
 
40
41
  public constructor(config?: Partial<WebFetcherConfig>) {
41
42
  this._config = Object.assign({
42
- htmlToText: true,
43
+ htmlToMarkdown: true,
43
44
  summarizeHtml: false,
44
45
  } as WebFetcherConfig, config);
45
46
  }
46
47
 
47
- public async fetch(uri: string): Promise<string> {
48
- const {data, contentType} = await this.fetchPage(uri);
49
- if (contentType === "text/html" && this._config.htmlToText) {
50
- return this.extractText(data, uri, this._config.summarizeHtml);
51
- } else {
52
- return data;
53
- }
54
- }
55
-
56
- private extractText(html: string, baseUrl: string, summarize: boolean): string {
57
- // Parse all elements including <noscript> tags
58
- const $ = cheerio.load(html, { scriptingEnabled: true });
59
-
60
- // If we want a summary, just get use the <body/>
61
- let text = '';
62
- $(`${summarize ? 'body ' : '*'}:not(style):not(script):not(svg)`).each((i, elem: any) => {
63
- // Remove any children to avoid duplicate text
64
- let content = $(elem).clone().children().remove().end().text().trim();
65
- const $el = $(elem);
66
-
67
- // Print links in markdown format
68
- let href = $el.attr("href");
69
- if ($el.prop("tagName")?.toLowerCase() === "a" && href) {
70
- if (!href.startsWith("http")) {
71
- // Try converting to a relevant link
72
- try {
73
- href = new URL(href, baseUrl).toString();
74
- } catch {
75
- // Leave as is
76
- }
77
- }
78
-
79
- // If the link has content, use that as the text
80
- const altText = $el.find("img[alt]").attr("alt")?.trim();
81
- if (altText) {
82
- content += ` ${altText}`;
83
- }
84
-
85
- text += ` [${content}](${href})`;
86
- }
87
- // otherwise just print the content
88
- else if (content !== "") {
89
- text += ` ${content}`;
90
- }
91
- });
92
-
93
- // Remove newlines
94
- return text.trim().replace(/\n+/g, ' ');
95
- }
96
-
97
- private async fetchPage(baseUrl: string): Promise<{data: string; contentType: string;}> {
48
+ public async fetch(uri: string): Promise<{ text: string; docType: string|undefined; }> {
98
49
  const httpClient = axios.create({
99
50
  validateStatus: () => true,
100
51
  });
@@ -103,12 +54,12 @@ export class WebFetcher implements TextFetcher {
103
54
  const headers = Object.assign({}, DEFAULT_HEADERS, this._config.headers)
104
55
 
105
56
  // get hostname from url
106
- const host = new URL(baseUrl).hostname;
57
+ const host = new URL(uri).hostname;
107
58
  headers['Host'] = host;
108
59
  headers['Alt-Used'] = host;
109
60
 
110
61
  // Fetch page and check for errors
111
- const response = await httpClient.get(baseUrl, {
62
+ const response = await httpClient.get(uri, {
112
63
  headers,
113
64
  ...this._config.requestConfig,
114
65
  });
@@ -123,6 +74,156 @@ export class WebFetcher implements TextFetcher {
123
74
  throw new Error(`Site returned an invalid content type of ${contentType}`);
124
75
  }
125
76
 
126
- return {data: response.data, contentType: contentTypeArray[0]};
77
+ // Convert content type to doc type
78
+ const docType = contentTypeArray[0] != 'text/plain' ? contentTypeArray[0].split('/')[1] : undefined;
79
+ if (docType == 'html' && this._config.htmlToMarkdown) {
80
+ const text = this.htmlToMarkdown(response.data, uri);
81
+ return {text, docType: 'md'};
82
+ } else {
83
+ const text = response.data;
84
+ return {text, docType};
85
+ }
127
86
  }
128
- }
87
+
88
+
89
/**
 * Converts an HTML document to markdown.
 *
 * Script elements are removed, anchors whose `href` does not start with
 * "http" are resolved against `baseUrl`, and the contents of `<body>` are
 * converted with Turndown (with the table rules from `convertTables`
 * registered). If the resulting markdown starts with a run of more than
 * 64 characters without a space or newline, that leading run is dropped.
 *
 * @param html Raw HTML to convert.
 * @param baseUrl URL used to resolve relative links.
 * @returns The markdown text.
 */
private htmlToMarkdown(html: string, baseUrl: string): string {
    // Parse HTML
    const $ = cheerio.load(html, { scriptingEnabled: true });

    // Remove scripts and convert relative links to absolute
    $('script').remove();
    $('a').each((i, elem) => {
        const $el = $(elem);
        const href = $el.attr("href");
        if (href && !href.startsWith("http")) {
            // Try converting to an absolute link
            try {
                $el.attr("href", new URL(href, baseUrl).toString());
            } catch {
                // Leave as is
            }
        }
    });

    // Convert to markdown
    const body = $('body').html() ?? '';
    const turndownService = new TurndownService({
        hr: '\n\n---\n\n',
    });
    convertTables(turndownService);
    const md = turndownService.turndown(body);

    // Remove any overly long header text. indexOf() returns -1 when the
    // character is absent, so only positions that were actually found are
    // considered when locating the first whitespace break.
    const breaks = [md.indexOf('\n'), md.indexOf(' ')].filter((pos) => pos >= 0);
    const contentStart = breaks.length > 0 ? Math.min(...breaks) : -1;
    return contentStart > 64 ? md.slice(contentStart) : md;
}
124
+ }
125
+
126
/**
 * Registers Turndown rules that render HTML tables as pipe-delimited
 * markdown tables.
 *
 * Four rules are added: `tableCell` (th/td), `tableRow` (tr), `table`, and
 * `tableSection` (thead/tbody/tfoot).
 *
 * @param turndownService Service instance the rules are added to.
 */
function convertTables(turndownService: TurndownService): void {
    // Cells: `cell` adds the pipe delimiters and flattens whitespace
    turndownService.addRule('tableCell', {
        filter: ['th', 'td'],
        replacement: function (content, node) {
            return cell(content, node);
        }
    });

    // Rows: a heading row is followed by a border row carrying alignment
    turndownService.addRule('tableRow', {
        filter: 'tr',
        replacement: function (content, node) {
            const alignMap: any = { left: ':--', right: '--:', center: ':-:' };
            let borderCells = '';

            if (isHeadingRow(node)) {
                for (let i = 0; i < node.childNodes.length; i++) {
                    let border = '---';
                    const align: string = (
                        node.childNodes[i].getAttribute('align') || ''
                    ).toLowerCase();

                    // Unknown align values fall back to the plain border
                    if (align) border = alignMap[align] || border;

                    borderCells += cell(border, node.childNodes[i]);
                }
            }
            return '\n' + content + (borderCells ? '\n' + borderCells : '');
        }
    });

    turndownService.addRule('table', {
        filter: ['table'],
        replacement: function (content, node) {
            // Ensure there are no blank lines. A global pattern is required:
            // replace() with a string pattern only replaces the first match.
            content = content.replace(/\n{2,}/g, '\n');
            return '\n\n' + content + '\n\n';
        }
    });

    // Sections are transparent: their rows are emitted unchanged
    turndownService.addRule('tableSection', {
        filter: ['thead', 'tbody', 'tfoot'],
        replacement: function (content) {
            return content;
        }
    });
}
172
+
173
// Array helpers borrowed so they can be applied to array-like node lists.
const indexOf = Array.prototype.indexOf;
const every = Array.prototype.every;

/**
 * Decides whether a table row should be treated as the heading row.
 *
 * A row is a heading row when its parent is a THEAD, or when it is the
 * first child of a TABLE (or of the first TBODY, see `isFirstTbody`) and
 * every one of its child nodes is a TH.
 *
 * @param tr Row node to test.
 * @returns `true` if the row is a heading row.
 */
function isHeadingRow(tr: any) {
    const parent = tr.parentNode;
    if (parent.nodeName === 'THEAD') {
        return true;
    }
    if (parent.firstChild !== tr) {
        return false;
    }
    if (parent.nodeName !== 'TABLE' && !isFirstTbody(parent)) {
        return false;
    }
    return every.call(tr.childNodes, (child: any) => child.nodeName === 'TH');
}

/**
 * Decides whether an element is the first TBODY of its table.
 *
 * That is the case when the element is a TBODY and either has no previous
 * sibling, or its previous sibling is a THEAD whose text content is empty
 * or whitespace only.
 *
 * @param element Node to test.
 * @returns `true` if the element is the first TBODY.
 */
function isFirstTbody(element: any) {
    if (element.nodeName !== 'TBODY') {
        return false;
    }
    const before = element.previousSibling;
    if (!before) {
        return true;
    }
    return before.nodeName === 'THEAD' && /^\s*$/i.test(before.textContent);
}
205
+
206
/**
 * Formats one markdown table cell.
 *
 * The first cell of a row is prefixed with "| " and every other cell with a
 * single space; all cells end with " |". The result is passed through
 * `cleanContent` before being returned.
 *
 * @param content Cell text.
 * @param node Cell node; its position among its parent's child nodes
 * determines the prefix.
 * @returns The formatted cell text.
 */
function cell(content: string, node: any): string {
    const position = indexOf.call(node.parentNode.childNodes, node);
    const prefix = position === 0 ? '| ' : ' ';
    return cleanContent(`${prefix}${content} |`);
}
214
+
215
/**
 * Collapses every run of whitespace in `content` into a single space.
 *
 * The characters treated as whitespace are newline, carriage return, tab,
 * form feed, vertical tab, no-break space (U+00A0), line separator
 * (U+2028), paragraph separator (U+2029) and the regular space. Leading and
 * trailing runs are collapsed to one space as well; they are not trimmed.
 *
 * @param content Text to normalize.
 * @returns The text with whitespace runs collapsed.
 */
function cleanContent(content: string): string {
    return content.replace(/[\n\r\t\f\v\u00a0\u2028\u2029 ]+/g, ' ');
}
package/src/types.ts CHANGED
@@ -4,6 +4,11 @@
4
4
  * An AI model that can be used to create embeddings.
5
5
  */
6
6
  export interface EmbeddingsModel {
7
+ /**
8
+ * Maximum number of tokens
9
+ */
10
+ readonly maxTokens: number;
11
+
7
12
  /**
8
13
  * Creates embeddings for the given inputs.
9
14
  * @param inputs Text inputs to create embeddings for.
@@ -51,7 +56,7 @@ export interface TextChunk {
51
56
  }
52
57
 
53
58
  export interface TextFetcher {
54
- fetch(uri: string): Promise<string>;
59
+ fetch(uri: string): Promise<{ text: string; docType: string|undefined; }>;
55
60
  }
56
61
 
57
62
  export interface IndexStats {
package/src/vectra-cli.ts CHANGED
@@ -22,7 +22,7 @@ export async function run() {
22
22
  const index = new LocalDocumentIndex({ folderPath });
23
23
  await index.deleteIndex();
24
24
  })
25
- .command('add-web <index>', `adds one or more web pages to an index`, (yargs) => {
25
+ .command('add <index>', `adds one or more web pages to an index`, (yargs) => {
26
26
  return yargs
27
27
  .option('keys', {
28
28
  alias: 'k',
@@ -81,9 +81,9 @@ export async function run() {
81
81
  for (const uri of uris) {
82
82
  try {
83
83
  console.log(Colorize.progress(`fetching ${uri}`));
84
- const content = await fetcher.fetch(uri);
84
+ const { text, docType } = await fetcher.fetch(uri);
85
85
  console.log(Colorize.replaceLine(Colorize.progress(`indexing ${uri}`)));
86
- await index.upsertDocument(uri, content);
86
+ await index.upsertDocument(uri, text, docType);
87
87
  console.log(Colorize.replaceLine(Colorize.success(`added ${uri}`)));
88
88
  } catch (err: unknown) {
89
89
  console.log(Colorize.replaceLine(Colorize.error(`Error adding: ${uri}\n${(err as Error).message}`)));
@@ -142,25 +142,25 @@ export async function run() {
142
142
  .option('document-count', {
143
143
  alias: 'dc',
144
144
  describe: 'max number of documents to return (defaults to 10)',
145
- type: 'count',
145
+ type: 'number',
146
146
  default: 10
147
147
  })
148
148
  .option('chunk-count', {
149
149
  alias: 'cc',
150
150
  describe: 'max number of chunks to return (defaults to 50)',
151
- type: 'count',
151
+ type: 'number',
152
152
  default: 50
153
153
  })
154
154
  .option('section-count', {
155
155
  alias: 'sc',
156
156
  describe: 'max number of document sections to render (defaults to 1)',
157
- type: 'count',
157
+ type: 'number',
158
158
  default: 1
159
159
  })
160
160
  .option('tokens', {
161
161
  alias: 't',
162
162
  describe: 'max number of tokens to render for each document section (defaults to 2000)',
163
- type: 'count',
163
+ type: 'number',
164
164
  default: 2000
165
165
  })
166
166
  .option('format', {
@@ -200,7 +200,7 @@ export async function run() {
200
200
  const sections = await result.renderSections(args.tokens, args.sectionCount);
201
201
  for (let i = 0; i < sections.length; i++) {
202
202
  const section = sections[i];
203
- console.log(Colorize.title(args.sectionCount > 1 ? 'Section' : `Section ${1}`));
203
+ console.log(Colorize.title(args.sectionCount == 1 ? 'Section' : `Section ${i + 1}`));
204
204
  console.log(Colorize.value('score', section.score));
205
205
  console.log(Colorize.value('tokens', section.tokenCount));
206
206
  console.log(Colorize.output(section.text));