vectra 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,8 @@
1
1
  import { GPT3Tokenizer } from "./GPT3Tokenizer";
2
2
  import { TextChunk, Tokenizer } from "./types";
3
3
 
4
+ const ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
5
+
4
6
  export interface TextSplitterConfig {
5
7
  separators: string[];
6
8
  keepSeparators: boolean;
@@ -15,7 +17,6 @@ export class TextSplitter {
15
17
 
16
18
  public constructor(config?: Partial<TextSplitterConfig>) {
17
19
  this._config = Object.assign({
18
- separators: ["\n\n", "\n", " ", ""],
19
20
  keepSeparators: false,
20
21
  chunkSize: 400,
21
22
  chunkOverlap: 40,
@@ -71,10 +72,22 @@ export class TextSplitter {
71
72
 
72
73
  private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
73
74
  const chunks: TextChunk[] = [];
74
- if (text.length > 0 && separators.length > 0) {
75
- const separator = separators[0];
75
+ if (text.length > 0) {
76
+ // Split text into parts
77
+ let parts: string[];
78
+ let separator = '';
76
79
  const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
77
- const parts = text.split(separator);
80
+ if (separators.length > 0) {
81
+ // Split by separator
82
+ separator = separators[0];
83
+ parts = text.split(separator);
84
+ } else {
85
+ // Cut text in half
86
+ const half = Math.floor(text.length / 2);
87
+ parts = [text.substring(0, half), text.substring(half)];
88
+ }
89
+
90
+ // Iterate over parts
78
91
  for (let i = 0; i < parts.length; i++) {
79
92
  const lastChunk = (i === parts.length - 1);
80
93
 
@@ -85,36 +98,82 @@ export class TextSplitter {
85
98
  chunk += separator;
86
99
  }
87
100
 
88
- // Check for empty chunk
89
- const trimmed = chunk.trim();
90
- if (trimmed.length === 0 || trimmed == '\n') {
101
+ // Ensure chunk contains text
102
+ if (!this.containsAlphanumeric(chunk)) {
91
103
  continue;
92
104
  }
93
105
 
94
- // Encode chunk text
95
- const tokens = this._config.tokenizer.encode(chunk);
96
- if (tokens.length > this._config.chunkSize) {
106
+ // Optimization to avoid encoding really large chunks
107
+ if (chunk.length / 6 > this._config.chunkSize) {
97
108
  // Break the text into smaller chunks
98
109
  const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
99
110
  chunks.push(...subChunks);
100
111
  } else {
101
- // Append chunk to output
102
- chunks.push({
103
- text: chunk,
104
- tokens: tokens,
105
- startPos: startPos,
106
- endPos: endPos,
107
- startOverlap: [],
108
- endOverlap: [],
109
- });
112
+ // Encode chunk text
113
+ const tokens = this._config.tokenizer.encode(chunk);
114
+ if (tokens.length > this._config.chunkSize) {
115
+ // Break the text into smaller chunks
116
+ const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
117
+ chunks.push(...subChunks);
118
+ } else {
119
+ // Append chunk to output
120
+ chunks.push({
121
+ text: chunk,
122
+ tokens: tokens,
123
+ startPos: startPos,
124
+ endPos: endPos,
125
+ startOverlap: [],
126
+ endOverlap: [],
127
+ });
128
+ }
129
+
110
130
  }
111
131
 
132
+
112
133
  // Update startPos
113
134
  startPos = endPos + 1;
114
135
  }
115
136
  }
116
137
 
117
- return chunks;
138
+ return this.combineChunks(chunks);
139
+ }
140
+
141
/**
 * Greedily merges consecutive chunks so that each resulting chunk is as
 * large as possible without exceeding the configured `chunkSize`.
 *
 * Chunks are merged in place: the first chunk of every run is mutated to
 * absorb the text and tokens of the chunks that follow it.
 *
 * @param chunks Chunks in document order.
 * @returns The merged chunks, in the same order.
 */
private combineChunks(chunks: TextChunk[]): TextChunk[] {
    const combinedChunks: TextChunk[] = [];

    // Glue placed between merged texts. When separators are kept they are
    // already part of each chunk's text, so nothing extra is inserted.
    const separator = this._config.keepSeparators ? '' : ' ';

    let currentChunk: TextChunk | undefined;
    for (const chunk of chunks) {
        if (currentChunk === undefined) {
            currentChunk = chunk;
            continue;
        }

        const length = currentChunk.tokens.length + chunk.tokens.length;
        if (length > this._config.chunkSize) {
            // Adding this chunk would overflow the budget; start a new run.
            combinedChunks.push(currentChunk);
            currentChunk = chunk;
        } else {
            // NOTE(review): tokens are concatenated rather than re-encoded, so
            // an inserted ' ' separator is not reflected in the token list.
            currentChunk.text += separator + chunk.text;
            currentChunk.tokens.push(...chunk.tokens);
            // Keep the reported span in sync with the merged text; without
            // this the merged chunk would still end at the first part's endPos.
            currentChunk.endPos = chunk.endPos;
        }
    }

    if (currentChunk !== undefined) {
        combinedChunks.push(currentChunk);
    }
    return combinedChunks;
}
169
+
170
/**
 * Reports whether `text` contains at least one letter or digit.
 *
 * Unicode-aware: letters and numbers from any script count, so text written
 * in non-Latin alphabets is not mistaken for punctuation/whitespace-only
 * content (which the caller discards).
 *
 * @param text Text to inspect.
 * @returns `true` if any letter or number is present, otherwise `false`.
 */
private containsAlphanumeric(text: string): boolean {
    // \p{L} = any letter, \p{N} = any number; a superset of [A-Za-z0-9].
    return /[\p{L}\p{N}]/u.test(text);
}
119
178
 
120
179
  private getSeparators(docType?: string): string[] {
@@ -137,8 +196,7 @@ export class TextSplitter {
137
196
  // Split by the normal type of lines
138
197
  "\n\n",
139
198
  "\n",
140
- " ",
141
- "",
199
+ " "
142
200
  ];
143
201
  case "go":
144
202
  return [
@@ -155,8 +213,7 @@ export class TextSplitter {
155
213
  // Split by the normal type of lines
156
214
  "\n\n",
157
215
  "\n",
158
- " ",
159
- "",
216
+ " "
160
217
  ];
161
218
  case "java":
162
219
  case "c#":
@@ -182,8 +239,7 @@ export class TextSplitter {
182
239
  // Split by the normal type of lines
183
240
  "\n\n",
184
241
  "\n",
185
- " ",
186
- "",
242
+ " "
187
243
  ];
188
244
  case "js":
189
245
  case "jsx":
@@ -207,8 +263,7 @@ export class TextSplitter {
207
263
  // Split by the normal type of lines
208
264
  "\n\n",
209
265
  "\n",
210
- " ",
211
- "",
266
+ " "
212
267
  ];
213
268
  case "php":
214
269
  return [
@@ -226,8 +281,7 @@ export class TextSplitter {
226
281
  // Split by the normal type of lines
227
282
  "\n\n",
228
283
  "\n",
229
- " ",
230
- "",
284
+ " "
231
285
  ];
232
286
  case "proto":
233
287
  return [
@@ -246,8 +300,7 @@ export class TextSplitter {
246
300
  // Split by the normal type of lines
247
301
  "\n\n",
248
302
  "\n",
249
- " ",
250
- "",
303
+ " "
251
304
  ];
252
305
  case "python":
253
306
  case "py":
@@ -259,8 +312,7 @@ export class TextSplitter {
259
312
  // Now split by the normal type of lines
260
313
  "\n\n",
261
314
  "\n",
262
- " ",
263
- "",
315
+ " "
264
316
  ];
265
317
  case "rst":
266
318
  return [
@@ -273,8 +325,7 @@ export class TextSplitter {
273
325
  // Split by the normal type of lines
274
326
  "\n\n",
275
327
  "\n",
276
- " ",
277
- "",
328
+ " "
278
329
  ];
279
330
  case "ruby":
280
331
  return [
@@ -292,8 +343,7 @@ export class TextSplitter {
292
343
  // Split by the normal type of lines
293
344
  "\n\n",
294
345
  "\n",
295
- " ",
296
- "",
346
+ " "
297
347
  ];
298
348
  case "rust":
299
349
  return [
@@ -311,8 +361,7 @@ export class TextSplitter {
311
361
  // Split by the normal type of lines
312
362
  "\n\n",
313
363
  "\n",
314
- " ",
315
- "",
364
+ " "
316
365
  ];
317
366
  case "scala":
318
367
  return [
@@ -332,8 +381,7 @@ export class TextSplitter {
332
381
  // Split by the normal type of lines
333
382
  "\n\n",
334
383
  "\n",
335
- " ",
336
- "",
384
+ " "
337
385
  ];
338
386
  case "swift":
339
387
  return [
@@ -353,9 +401,9 @@ export class TextSplitter {
353
401
  // Split by the normal type of lines
354
402
  "\n\n",
355
403
  "\n",
356
- " ",
357
- "",
404
+ " "
358
405
  ];
406
+ case "md":
359
407
  case "markdown":
360
408
  return [
361
409
  // First, try to split along Markdown headings (starting with level 2)
@@ -375,10 +423,14 @@ export class TextSplitter {
375
423
  "\n\n___\n\n",
376
424
  // Note that this splitter doesn't handle horizontal lines defined
377
425
  // by *three or more* of ***, ---, or ___, but this is not handled
426
+ // Github tables
427
+ "<table>",
428
+ // "<tr>",
429
+ // "<td>",
430
+ // "<td ",
378
431
  "\n\n",
379
432
  "\n",
380
- " ",
381
- "",
433
+ " "
382
434
  ];
383
435
  case "latex":
384
436
  return [
@@ -406,8 +458,7 @@ export class TextSplitter {
406
458
  // Now split by the normal type of lines
407
459
  "\n\n",
408
460
  "\n",
409
- " ",
410
- "",
461
+ " "
411
462
  ];
412
463
  case "html":
413
464
  return [
@@ -440,8 +491,7 @@ export class TextSplitter {
440
491
  "<meta>",
441
492
  "<title>",
442
493
  // Normal type of lines
443
- " ",
444
- "",
494
+ " "
445
495
  ];
446
496
  case "sol":
447
497
  return [
@@ -470,8 +520,7 @@ export class TextSplitter {
470
520
  // Split by the normal type of lines
471
521
  "\n\n",
472
522
  "\n",
473
- " ",
474
- "",
523
+ " "
475
524
  ];
476
525
  default:
477
526
  return [
package/src/WebFetcher.ts CHANGED
@@ -1,6 +1,7 @@
1
1
  import axios, { AxiosRequestConfig } from "axios";
2
- import * as cheerio from "cheerio";
3
2
  import { TextFetcher } from './types';
3
+ import * as cheerio from 'cheerio';
4
+ import TurndownService from 'turndown';
4
5
 
5
6
 
6
7
  const ALLOWED_CONTENT_TYPES = [
@@ -30,7 +31,7 @@ const DEFAULT_HEADERS = {
30
31
  export interface WebFetcherConfig {
31
32
  headers?: Record<string,string>;
32
33
  requestConfig?: AxiosRequestConfig;
33
- htmlToText: boolean;
34
+ htmlToMarkdown: boolean;
34
35
  summarizeHtml: boolean;
35
36
  }
36
37
 
@@ -39,62 +40,12 @@ export class WebFetcher implements TextFetcher {
39
40
 
40
41
  public constructor(config?: Partial<WebFetcherConfig>) {
41
42
  this._config = Object.assign({
42
- htmlToText: true,
43
+ htmlToMarkdown: true,
43
44
  summarizeHtml: false,
44
45
  } as WebFetcherConfig, config);
45
46
  }
46
47
 
47
- public async fetch(uri: string): Promise<string> {
48
- const {data, contentType} = await this.fetchPage(uri);
49
- if (contentType === "text/html" && this._config.htmlToText) {
50
- return this.extractText(data, uri, this._config.summarizeHtml);
51
- } else {
52
- return data;
53
- }
54
- }
55
-
56
- private extractText(html: string, baseUrl: string, summarize: boolean): string {
57
- // Parse all elements including <noscript> tags
58
- const $ = cheerio.load(html, { scriptingEnabled: true });
59
-
60
- // If we want a summary, just get use the <body/>
61
- let text = '';
62
- $(`${summarize ? 'body ' : '*'}:not(style):not(script):not(svg)`).each((i, elem: any) => {
63
- // Remove any children to avoid duplicate text
64
- let content = $(elem).clone().children().remove().end().text().trim();
65
- const $el = $(elem);
66
-
67
- // Print links in markdown format
68
- let href = $el.attr("href");
69
- if ($el.prop("tagName")?.toLowerCase() === "a" && href) {
70
- if (!href.startsWith("http")) {
71
- // Try converting to a relevant link
72
- try {
73
- href = new URL(href, baseUrl).toString();
74
- } catch {
75
- // Leave as is
76
- }
77
- }
78
-
79
- // If the link has content, use that as the text
80
- const altText = $el.find("img[alt]").attr("alt")?.trim();
81
- if (altText) {
82
- content += ` ${altText}`;
83
- }
84
-
85
- text += ` [${content}](${href})`;
86
- }
87
- // otherwise just print the content
88
- else if (content !== "") {
89
- text += ` ${content}`;
90
- }
91
- });
92
-
93
- // Remove newlines
94
- return text.trim().replace(/\n+/g, ' ');
95
- }
96
-
97
- private async fetchPage(baseUrl: string): Promise<{data: string; contentType: string;}> {
48
+ public async fetch(uri: string): Promise<{ text: string; docType: string|undefined; }> {
98
49
  const httpClient = axios.create({
99
50
  validateStatus: () => true,
100
51
  });
@@ -103,12 +54,12 @@ export class WebFetcher implements TextFetcher {
103
54
  const headers = Object.assign({}, DEFAULT_HEADERS, this._config.headers)
104
55
 
105
56
  // get hostname from url
106
- const host = new URL(baseUrl).hostname;
57
+ const host = new URL(uri).hostname;
107
58
  headers['Host'] = host;
108
59
  headers['Alt-Used'] = host;
109
60
 
110
61
  // Fetch page and check for errors
111
- const response = await httpClient.get(baseUrl, {
62
+ const response = await httpClient.get(uri, {
112
63
  headers,
113
64
  ...this._config.requestConfig,
114
65
  });
@@ -123,6 +74,156 @@ export class WebFetcher implements TextFetcher {
123
74
  throw new Error(`Site returned an invalid content type of ${contentType}`);
124
75
  }
125
76
 
126
- return {data: response.data, contentType: contentTypeArray[0]};
77
+ // Convert content type to doc type
78
+ const docType = contentTypeArray[0] != 'text/plain' ? contentTypeArray[0].split('/')[1] : undefined;
79
+ if (docType == 'html' && this._config.htmlToMarkdown) {
80
+ const text = this.htmlToMarkdown(response.data, uri);
81
+ return {text, docType: 'md'};
82
+ } else {
83
+ const text = response.data;
84
+ return {text, docType};
85
+ }
127
86
  }
128
- }
87
+
88
+
89
/**
 * Converts an HTML document to markdown.
 *
 * Script elements are removed, anchor hrefs that are not already absolute
 * http(s) URLs are resolved against `baseUrl`, and the `<body>` is then
 * converted with Turndown (with table rules registered by `convertTables`).
 *
 * @param html Raw HTML of the page.
 * @param baseUrl URL the page was fetched from; used to resolve relative links.
 * @returns Markdown for the page body.
 */
private htmlToMarkdown(html: string, baseUrl: string): string {
    const $ = cheerio.load(html, { scriptingEnabled: true });

    // Remove scripts and convert relative links to absolute
    $('script').remove();
    $('a').each((_i, elem) => {
        const $el = $(elem);
        const href = $el.attr('href');
        // Match the scheme explicitly: a plain "starts with http" test would
        // treat relative paths such as "http-guide.html" as already absolute.
        if (href && !/^https?:\/\//i.test(href)) {
            try {
                $el.attr('href', new URL(href, baseUrl).toString());
            } catch {
                // Not resolvable against the base URL; leave the link as is.
            }
        }
    });

    // Convert to markdown
    const body = $('body').html() ?? '';
    const turndownService = new TurndownService({
        hr: '\n\n---\n\n',
    });
    convertTables(turndownService);
    const md = turndownService.turndown(body);

    // Remove any overly long header text: when the first space or newline
    // only appears after 64 characters, drop the leading run before it.
    // search() returns the first position of either character, and -1 only
    // when neither is present (Math.min over two indexOf results collapses
    // to -1 as soon as one of them is missing).
    const contentStart = md.search(/[\n ]/);
    if (contentStart > 64) {
        return md.slice(contentStart);
    }
    return md;
}
124
+ }
125
+
126
/**
 * Registers Turndown rules that render HTML tables as pipe-delimited
 * markdown tables.
 *
 * Four rules are added: `tableCell` (th/td), `tableRow` (tr, which also
 * emits the `---` border line after a heading row), `table`, and
 * `tableSection` (thead/tbody/tfoot, passed through unchanged).
 *
 * @param turndownService Turndown instance to add the rules to.
 */
function convertTables(turndownService: TurndownService): void {
    // A Map (rather than an object literal) so that attribute values such as
    // "constructor" cannot resolve to inherited Object.prototype members.
    const alignBorders = new Map<string, string>([
        ['left', ':--'],
        ['right', '--:'],
        ['center', ':-:'],
    ]);

    turndownService.addRule('tableCell', {
        filter: ['th', 'td'],
        replacement: function (content, node) {
            return cell(content, node);
        }
    });

    turndownService.addRule('tableRow', {
        filter: 'tr',
        replacement: function (content, node) {
            let borderCells = '';

            // Only the heading row is followed by the border/alignment line.
            if (isHeadingRow(node)) {
                for (let i = 0; i < node.childNodes.length; i++) {
                    const child: any = node.childNodes[i];
                    const align: string = (child.getAttribute('align') || '').toLowerCase();
                    const border = alignBorders.get(align) ?? '---';
                    borderCells += cell(border, child);
                }
            }
            return '\n' + content + (borderCells ? '\n' + borderCells : '');
        }
    });

    turndownService.addRule('table', {
        filter: ['table'],
        replacement: function (content) {
            // Ensure there are no blank lines. A string pattern would only
            // replace the first occurrence, so use a global regex.
            const compact = content.replace(/\n{2,}/g, '\n');
            return '\n\n' + compact + '\n\n';
        }
    });

    turndownService.addRule('tableSection', {
        filter: ['thead', 'tbody', 'tfoot'],
        replacement: function (content) {
            return content;
        }
    });
}
172
+
173
+ const indexOf = Array.prototype.indexOf
174
+ const every = Array.prototype.every
175
+
176
/**
 * Decides whether a table row is the table's heading row.
 *
 * A `tr` is a heading row if:
 * - its parent is a THEAD, or
 * - it is the first child of the TABLE or of the first TBODY (possibly
 *   following a blank THEAD), and every one of its child nodes is a TH.
 *
 * @param tr Row node to test.
 */
function isHeadingRow(tr: any) {
    const parent = tr.parentNode;

    if (parent.nodeName === 'THEAD') {
        return true;
    }
    if (parent.firstChild !== tr) {
        return false;
    }
    if (parent.nodeName !== 'TABLE' && !isFirstTbody(parent)) {
        return false;
    }
    return every.call(tr.childNodes, (n: any) => n.nodeName === 'TH');
}
192
+
193
/**
 * Tells whether `element` is a TBODY that is either the first node among its
 * siblings or is directly preceded by a THEAD whose text content is blank.
 *
 * @param element Node to test.
 */
function isFirstTbody(element: any) {
    if (element.nodeName !== 'TBODY') {
        return false;
    }

    const previous = element.previousSibling;
    if (!previous) {
        return true;
    }

    // A THEAD containing only whitespace does not count as a real header.
    return previous.nodeName === 'THEAD' && /^\s*$/i.test(previous.textContent);
}
205
+
206
/**
 * Renders one markdown table cell.
 *
 * The first node of its parent gets a leading `| `; every other cell gets a
 * single leading space. All cells end with ` |`, and the result is passed
 * through `cleanContent`.
 *
 * @param content Cell text.
 * @param node Node the cell was produced from; its position among the
 *             parent's child nodes selects the prefix.
 * @returns The formatted cell.
 */
function cell(content: string, node: any): string {
    const isFirst = indexOf.call(node.parentNode.childNodes, node) === 0;
    const prefix = isFirst ? '| ' : ' ';
    return cleanContent(prefix + content + ' |');
}
214
+
215
/**
 * Collapses every run of whitespace in `content` into a single space.
 *
 * The characters treated as whitespace are: newline, carriage return, tab,
 * form feed, vertical tab, no-break space (U+00A0), line separator (U+2028),
 * paragraph separator (U+2029) and the ordinary space. Leading and trailing
 * runs are collapsed too, not trimmed.
 *
 * @param content Text to normalise.
 * @returns `content` with each whitespace run replaced by one space.
 */
function cleanContent(content: string): string {
    // Explicit character list (not \s) so exactly the characters above match.
    return content.replace(/[\n\r\t\f\v\u00a0\u2028\u2029 ]+/g, ' ');
}
package/src/types.ts CHANGED
@@ -56,7 +56,7 @@ export interface TextChunk {
56
56
  }
57
57
 
58
58
  export interface TextFetcher {
59
- fetch(uri: string): Promise<string>;
59
+ fetch(uri: string): Promise<{ text: string; docType: string|undefined; }>;
60
60
  }
61
61
 
62
62
  export interface IndexStats {
package/src/vectra-cli.ts CHANGED
@@ -81,9 +81,9 @@ export async function run() {
81
81
  for (const uri of uris) {
82
82
  try {
83
83
  console.log(Colorize.progress(`fetching ${uri}`));
84
- const content = await fetcher.fetch(uri);
84
+ const { text, docType } = await fetcher.fetch(uri);
85
85
  console.log(Colorize.replaceLine(Colorize.progress(`indexing ${uri}`)));
86
- await index.upsertDocument(uri, content);
86
+ await index.upsertDocument(uri, text, docType);
87
87
  console.log(Colorize.replaceLine(Colorize.success(`added ${uri}`)));
88
88
  } catch (err: unknown) {
89
89
  console.log(Colorize.replaceLine(Colorize.error(`Error adding: ${uri}\n${(err as Error).message}`)));
@@ -142,25 +142,25 @@ export async function run() {
142
142
  .option('document-count', {
143
143
  alias: 'dc',
144
144
  describe: 'max number of documents to return (defaults to 10)',
145
- type: 'count',
145
+ type: 'number',
146
146
  default: 10
147
147
  })
148
148
  .option('chunk-count', {
149
149
  alias: 'cc',
150
150
  describe: 'max number of chunks to return (defaults to 50)',
151
- type: 'count',
151
+ type: 'number',
152
152
  default: 50
153
153
  })
154
154
  .option('section-count', {
155
155
  alias: 'sc',
156
156
  describe: 'max number of document sections to render (defaults to 1)',
157
- type: 'count',
157
+ type: 'number',
158
158
  default: 1
159
159
  })
160
160
  .option('tokens', {
161
161
  alias: 't',
162
162
  describe: 'max number of tokens to render for each document section (defaults to 2000)',
163
- type: 'count',
163
+ type: 'number',
164
164
  default: 2000
165
165
  })
166
166
  .option('format', {
@@ -200,7 +200,7 @@ export async function run() {
200
200
  const sections = await result.renderSections(args.tokens, args.sectionCount);
201
201
  for (let i = 0; i < sections.length; i++) {
202
202
  const section = sections[i];
203
- console.log(Colorize.title(args.sectionCount > 1 ? 'Section' : `Section ${1}`));
203
+ console.log(Colorize.title(args.sectionCount == 1 ? 'Section' : `Section ${i + 1}`));
204
204
  console.log(Colorize.value('score', section.score));
205
205
  console.log(Colorize.value('tokens', section.tokenCount));
206
206
  console.log(Colorize.output(section.text));