npm - vectra - Versions diffs - 0.2.1 → 0.3.0 - Mend

vectra 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

package/lib/LocalDocumentIndex.d.ts +5 -2
package/lib/LocalDocumentIndex.d.ts.map +1 -1
package/lib/LocalDocumentIndex.js +20 -12
package/lib/LocalDocumentIndex.js.map +1 -1
package/lib/OpenAIEmbeddings.d.ts +1 -0
package/lib/OpenAIEmbeddings.d.ts.map +1 -1
package/lib/OpenAIEmbeddings.js +3 -1
package/lib/OpenAIEmbeddings.js.map +1 -1
package/lib/TextSplitter.d.ts +2 -0
package/lib/TextSplitter.d.ts.map +1 -1
package/lib/TextSplitter.js +101 -49
package/lib/TextSplitter.js.map +1 -1
package/lib/WebFetcher.d.ts +6 -4
package/lib/WebFetcher.d.ts.map +1 -1
package/lib/WebFetcher.js +132 -52
package/lib/WebFetcher.js.map +1 -1
package/lib/types.d.ts +8 -1
package/lib/types.d.ts.map +1 -1
package/lib/vectra-cli.js +8 -8
package/lib/vectra-cli.js.map +1 -1
package/package.json +3 -1
package/src/LocalDocumentIndex.ts +20 -13
package/src/OpenAIEmbeddings.ts +4 -1
package/src/TextSplitter.ts +104 -49
package/src/WebFetcher.ts +159 -58
package/src/types.ts +6 -1
package/src/vectra-cli.ts +8 -8

package/src/TextSplitter.ts CHANGED

@@ -1,6 +1,8 @@
 import { GPT3Tokenizer } from "./GPT3Tokenizer";
 import { TextChunk, Tokenizer } from "./types";
+const ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
 export interface TextSplitterConfig {
     separators: string[];
     keepSeparators: boolean;
@@ -15,7 +17,6 @@ export class TextSplitter {
     public constructor(config?: Partial<TextSplitterConfig>) {
         this._config = Object.assign({
-            separators: ["\n\n", "\n", " ", ""],
             keepSeparators: false,
             chunkSize: 400,
             chunkOverlap: 40,
@@ -71,10 +72,22 @@ export class TextSplitter {
     private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
         const chunks: TextChunk[] = [];
-        if (text.length > 0 && separators.length > 0) {
-            const separator = separators[0];
+        if (text.length > 0) {
+            // Split text into parts
+            let parts: string[];
+            let separator = '';
             const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
-            const parts = text.split(separator);
+            if (separators.length > 0) {
+                // Split by separator
+                separator = separators[0];
+                parts = text.split(separator);
+            } else {
+                // Cut text in half
+                const half = Math.floor(text.length / 2);
+                parts = [text.substring(0, half), text.substring(half)];
+            }
+            // Iterate over parts
             for (let i = 0; i < parts.length; i++) {
                 const lastChunk = (i === parts.length - 1);
@@ -85,30 +98,82 @@ export class TextSplitter {
                     chunk += separator;
                 }
-                // Encode chunk text
-                const tokens = this._config.tokenizer.encode(chunk);
-                if (tokens.length > this._config.chunkSize) {
+                // Ensure chunk contains text
+                if (!this.containsAlphanumeric(chunk)) {
+                    continue;
+                }
+                // Optimization to avoid encoding really large chunks
+                if (chunk.length / 6 > this._config.chunkSize) {
                     // Break the text into smaller chunks
                     const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
                     chunks.push(...subChunks);
                 } else {
-                    // Append chunk to output
-                    chunks.push({
-                        text: chunk,
-                        tokens: tokens,
-                        startPos: startPos,
-                        endPos: endPos,
-                        startOverlap: [],
-                        endOverlap: [],
-                    });
+                    // Encode chunk text
+                    const tokens = this._config.tokenizer.encode(chunk);
+                    if (tokens.length > this._config.chunkSize) {
+                        // Break the text into smaller chunks
+                        const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
+                        chunks.push(...subChunks);
+                    } else {
+                        // Append chunk to output
+                        chunks.push({
+                            text: chunk,
+                            tokens: tokens,
+                            startPos: startPos,
+                            endPos: endPos,
+                            startOverlap: [],
+                            endOverlap: [],
+                        });
+                    }
                 }
                 // Update startPos
                 startPos = endPos + 1;
             }
         }
-        return chunks;
+        return this.combineChunks(chunks);
+    }
+    private combineChunks(chunks: TextChunk[]): TextChunk[] {
+        const combinedChunks: TextChunk[] = [];
+        let currentChunk: TextChunk|undefined;
+        let currentLength = 0;
+        const separator = this._config.keepSeparators ? '' : ' ';
+        for (let i = 0; i < chunks.length; i++) {
+            const chunk = chunks[i];
+            if (currentChunk) {
+                const length = currentChunk.tokens.length + chunk.tokens.length;
+                if (length > this._config.chunkSize) {
+                    combinedChunks.push(currentChunk);
+                    currentChunk = chunk;
+                    currentLength = chunk.tokens.length;
+                } else {
+                    currentChunk.text += separator + chunk.text;
+                    currentChunk.tokens.push(...chunk.tokens);
+                    currentLength += chunk.tokens.length;
+                }
+            } else {
+                currentChunk = chunk;
+                currentLength = chunk.tokens.length;
+            }
+        }
+        if (currentChunk) {
+            combinedChunks.push(currentChunk);
+        }
+        return combinedChunks;
+    }
+    private containsAlphanumeric(text: string): boolean {
+        for (let i = 0; i < text.length; i++) {
+            if (ALPHANUMERIC_CHARS.includes(text[i])) {
+                return true;
+            }
+        }
+        return false;
     }
     private getSeparators(docType?: string): string[] {
@@ -131,8 +196,7 @@ export class TextSplitter {
                     // Split by the normal type of lines
                     "\n\n",
                     "\n",
-                    " ",
-                    "",
+                    " "
                 ];
             case "go":
                 return [
@@ -149,8 +213,7 @@ export class TextSplitter {
                     // Split by the normal type of lines
                     "\n\n",
                     "\n",
-                    " ",
-                    "",
+                    " "
                 ];
             case "java":
             case "c#":
@@ -176,8 +239,7 @@ export class TextSplitter {
                     // Split by the normal type of lines
                     "\n\n",
                     "\n",
-                    " ",
-                    "",
+                    " "
                 ];
             case "js":
             case "jsx":
@@ -201,8 +263,7 @@ export class TextSplitter {
                     // Split by the normal type of lines
                     "\n\n",
                     "\n",
-                    " ",
-                    "",
+                    " "
                 ];
             case "php":
                 return [
@@ -220,8 +281,7 @@ export class TextSplitter {
                     // Split by the normal type of lines
                     "\n\n",
                     "\n",
-                    " ",
-                    "",
+                    " "
                 ];
             case "proto":
                 return [
@@ -240,8 +300,7 @@ export class TextSplitter {
                     // Split by the normal type of lines
                     "\n\n",
                     "\n",
-                    " ",
-                    "",
+                    " "
                 ];
             case "python":
             case "py":
@@ -253,8 +312,7 @@ export class TextSplitter {
                     // Now split by the normal type of lines
                     "\n\n",
                     "\n",
-                    " ",
-                    "",
+                    " "
                 ];
             case "rst":
                 return [
@@ -267,8 +325,7 @@ export class TextSplitter {
                     // Split by the normal type of lines
                     "\n\n",
                     "\n",
-                    " ",
-                    "",
+                    " "
                 ];
             case "ruby":
                 return [
@@ -286,8 +343,7 @@ export class TextSplitter {
                     // Split by the normal type of lines
                     "\n\n",
                     "\n",
-                    " ",
-                    "",
+                    " "
                 ];
             case "rust":
                 return [
@@ -305,8 +361,7 @@ export class TextSplitter {
                     // Split by the normal type of lines
                     "\n\n",
                     "\n",
-                    " ",
-                    "",
+                    " "
                 ];
             case "scala":
                 return [
@@ -326,8 +381,7 @@ export class TextSplitter {
                     // Split by the normal type of lines
                     "\n\n",
                     "\n",
-                    " ",
-                    "",
+                    " "
                 ];
             case "swift":
                 return [
@@ -347,9 +401,9 @@ export class TextSplitter {
                     // Split by the normal type of lines
                     "\n\n",
                     "\n",
-                    " ",
-                    "",
+                    " "
                 ];
+            case "md":
             case "markdown":
                 return [
                     // First, try to split along Markdown headings (starting with level 2)
@@ -369,10 +423,14 @@ export class TextSplitter {
                     "\n\n___\n\n",
                     // Note that this splitter doesn't handle horizontal lines defined
                     // by *three or more* of ***, ---, or ___, but this is not handled
+                    // Github tables
+                    "<table>",
+                    // "<tr>",
+                    // "<td>",
+                    // "<td ",
                     "\n\n",
                     "\n",
-                    " ",
-                    "",
+                    " "
                 ];
             case "latex":
                 return [
@@ -400,8 +458,7 @@ export class TextSplitter {
                     // Now split by the normal type of lines
                     "\n\n",
                     "\n",
-                    " ",
-                    "",
+                    " "
                 ];
             case "html":
                 return [
@@ -434,8 +491,7 @@ export class TextSplitter {
                     "<meta>",
                     "<title>",
                     // Normal type of lines
-                    " ",
-                    "",
+                    " "
                 ];
             case "sol":
                 return [
@@ -464,8 +520,7 @@ export class TextSplitter {
                     // Split by the normal type of lines
                     "\n\n",
                     "\n",
-                    " ",
-                    "",
+                    " "
                 ];
             default:
                 return [

package/src/WebFetcher.ts CHANGED

@@ -1,6 +1,7 @@
 import axios, { AxiosRequestConfig } from "axios";
-import * as cheerio from "cheerio";
 import { TextFetcher } from './types';
+import * as cheerio from 'cheerio';
+import TurndownService  from 'turndown';
 const ALLOWED_CONTENT_TYPES = [
@@ -30,7 +31,7 @@ const DEFAULT_HEADERS = {
 export interface WebFetcherConfig {
     headers?: Record<string,string>;
     requestConfig?: AxiosRequestConfig;
-    htmlToText: boolean;
+    htmlToMarkdown: boolean;
     summarizeHtml: boolean;
 }
@@ -39,62 +40,12 @@ export class WebFetcher implements TextFetcher {
     public constructor(config?: Partial<WebFetcherConfig>) {
         this._config = Object.assign({
-            htmlToText: true,
+            htmlToMarkdown: true,
             summarizeHtml: false,
         } as WebFetcherConfig, config);
     }
-    public async fetch(uri: string): Promise<string> {
-        const {data, contentType} = await this.fetchPage(uri);
-        if (contentType === "text/html" && this._config.htmlToText) {
-            return this.extractText(data, uri, this._config.summarizeHtml);
-        } else {
-            return data;
-        }
-    }
-    private extractText(html: string, baseUrl: string, summarize: boolean): string {
-        // Parse all elements including <noscript> tags
-        const $ = cheerio.load(html, { scriptingEnabled: true });
-        // If we want a summary, just get use the <body/>
-        let text = '';
-        $(`${summarize ? 'body ' : '*'}:not(style):not(script):not(svg)`).each((i, elem: any) => {
-            // Remove any children to avoid duplicate text
-            let content = $(elem).clone().children().remove().end().text().trim();
-            const $el = $(elem);
-            // Print links in markdown format
-            let href = $el.attr("href");
-            if ($el.prop("tagName")?.toLowerCase() === "a" && href) {
-                if (!href.startsWith("http")) {
-                    // Try converting to a relevant link
-                    try {
-                        href = new URL(href, baseUrl).toString();
-                    } catch {
-                        // Leave as is
-                    }
-                }
-                // If the link has content, use that as the text
-                const altText = $el.find("img[alt]").attr("alt")?.trim();
-                if (altText) {
-                    content += ` ${altText}`;
-                }
-                text += ` [${content}](${href})`;
-            }
-            // otherwise just print the content
-            else if (content !== "") {
-                text += ` ${content}`;
-            }
-        });
-        // Remove newlines
-        return text.trim().replace(/\n+/g, ' ');
-    }
-    private async fetchPage(baseUrl: string): Promise<{data: string; contentType: string;}> {
+    public async fetch(uri: string): Promise<{ text: string; docType: string|undefined; }> {
         const httpClient = axios.create({
             validateStatus: () => true,
         });
@@ -103,12 +54,12 @@ export class WebFetcher implements TextFetcher {
         const headers = Object.assign({}, DEFAULT_HEADERS, this._config.headers)
         // get hostname from url
-        const host = new URL(baseUrl).hostname;
+        const host = new URL(uri).hostname;
         headers['Host'] = host;
         headers['Alt-Used'] = host;
         // Fetch page and check for errors
-        const response = await httpClient.get(baseUrl, {
+        const response = await httpClient.get(uri, {
             headers,
             ...this._config.requestConfig,
         });
@@ -123,6 +74,156 @@ export class WebFetcher implements TextFetcher {
             throw new Error(`Site returned an invalid content type of ${contentType}`);
         }
-        return {data: response.data, contentType: contentTypeArray[0]};
+        // Convert content type to doc type
+        const docType = contentTypeArray[0] != 'text/plain' ? contentTypeArray[0].split('/')[1] : undefined;
+        if (docType == 'html' && this._config.htmlToMarkdown) {
+            const text = this.htmlToMarkdown(response.data, uri);
+            return {text, docType: 'md'};
+        } else {
+            const text = response.data;
+            return {text, docType};
+        }
     }
-}
+    private htmlToMarkdown(html: string, baseUrl: string): string {
+        // Parse HTML and remove scripts
+        const $ = cheerio.load(html, { scriptingEnabled: true });
+        // Remove scripts and convert relative links to absolute
+        $('script').remove();
+        $('a').each((i, elem) => {
+            const $el = $(elem);
+            const href = $el.attr("href");
+            if (href && !href.startsWith("http")) {
+                // Try converting to an absolute link
+                try {
+                    $el.attr("href", new URL(href, baseUrl).toString());
+                } catch {
+                    // Leave as is
+                }
+            }
+        });
+        // Convert to markdown
+        const body = $('body').html() ?? '';
+        const turndownService = new TurndownService({
+            hr: '\n\n---\n\n',
+        });
+        convertTables(turndownService);
+        const md = turndownService.turndown(body);
+        // Remove any overly long header text
+        const contentStart = Math.min(md.indexOf('\n'), md.indexOf(' '));
+        if (contentStart > 64) {
+            return md.slice(contentStart);
+        } else {
+            return md;
+        }
+    }
+}
+function convertTables(turndownService: TurndownService): void {
+    turndownService.addRule('tableCell', {
+        filter: ['th', 'td'],
+        replacement: function (content, node) {
+            return cell(content, node)
+        }
+    });
+    turndownService.addRule('tableRow', {
+        filter: 'tr',
+        replacement: function (content, node) {
+            var borderCells = ''
+            var alignMap: any = { left: ':--', right: '--:', center: ':-:' }
+            if (isHeadingRow(node)) {
+                for (var i = 0; i < node.childNodes.length; i++) {
+                    var border = '---'
+                    var align: string = (
+                        node.childNodes[i].getAttribute('align') || ''
+                    ).toLowerCase()
+                    if (align) border = alignMap[align] || border
+                    borderCells += cell(border, node.childNodes[i])
+                }
+            }
+            return '\n' + content + (borderCells ? '\n' + borderCells : '')
+        }
+    });
+    turndownService.addRule('table', {
+        filter: ['table'],
+        replacement: function (content, node) {
+            // Ensure there are no blank lines
+            content = content.replace('\n\n', '\n')
+            return '\n\n' + content + '\n\n'
+        }
+    });
+    turndownService.addRule('tableSection', {
+        filter: ['thead', 'tbody', 'tfoot'],
+        replacement: function (content) {
+            return content
+        }
+    });
+}
+const indexOf = Array.prototype.indexOf
+const every = Array.prototype.every
+// A tr is a heading row if:
+// - the parent is a THEAD
+// - or if its the first child of the TABLE or the first TBODY (possibly
+//   following a blank THEAD)
+// - and every cell is a TH
+function isHeadingRow(tr: any) {
+    var parentNode = tr.parentNode
+    return (
+        parentNode.nodeName === 'THEAD' ||
+        (
+            parentNode.firstChild === tr &&
+            (parentNode.nodeName === 'TABLE' || isFirstTbody(parentNode)) &&
+            every.call(tr.childNodes, function (n) { return n.nodeName === 'TH' })
+        )
+    )
+}
+function isFirstTbody(element: any) {
+    var previousSibling = element.previousSibling
+    return (
+        element.nodeName === 'TBODY' && (
+            !previousSibling ||
+            (
+                previousSibling.nodeName === 'THEAD' &&
+                /^\s*$/i.test(previousSibling.textContent)
+            )
+        )
+    )
+}
+function cell(content: string, node: any): string {
+    var index = indexOf.call(node.parentNode.childNodes, node)
+    var prefix = ' '
+    if (index === 0) {
+        prefix = '| '
+    }
+    return cleanContent(prefix + content + ' |');
+}
+function cleanContent(content: string): string {
+    let output = '';
+    const chars = ['\n', '\r', '\t', '\f', '\v', '\u00a0', '\u2028', '\u2029', ' '];
+    for (let i = 0; i < content.length; i++) {
+        if (chars.includes(content[i])) {
+            if (output[output.length - 1] != ' ') {
+                output += ' ';
+            }
+            continue;
+        } else {
+            output += content[i];
+        }
+    }
+    return output;
+}

package/src/types.ts CHANGED

@@ -4,6 +4,11 @@
  * An AI model that can be used to create embeddings.
  */
 export interface EmbeddingsModel {
+    /**
+     * Maximum number of tokens
+     */
+    readonly maxTokens: number;
     /**
      * Creates embeddings for the given inputs.
      * @param inputs Text inputs to create embeddings for.
@@ -51,7 +56,7 @@ export interface TextChunk {
 }
 export interface TextFetcher {
-    fetch(uri: string): Promise<string>;
+    fetch(uri: string): Promise<{ text: string; docType: string|undefined; }>;
 }
 export interface IndexStats {

package/src/vectra-cli.ts CHANGED

@@ -22,7 +22,7 @@ export async function run() {
             const index = new LocalDocumentIndex({ folderPath });
             await index.deleteIndex();
         })
-        .command('add-web <index>', `adds one or more web pages to an index`, (yargs) => {
+        .command('add <index>', `adds one or more web pages to an index`, (yargs) => {
             return yargs
                 .option('keys', {
                     alias: 'k',
@@ -81,9 +81,9 @@ export async function run() {
             for (const uri of uris) {
                 try {
                     console.log(Colorize.progress(`fetching ${uri}`));
-                    const content =  await fetcher.fetch(uri);
+                    const { text, docType } =  await fetcher.fetch(uri);
                     console.log(Colorize.replaceLine(Colorize.progress(`indexing ${uri}`)));
-                    await index.upsertDocument(uri, content);
+                    await index.upsertDocument(uri, text, docType);
                     console.log(Colorize.replaceLine(Colorize.success(`added ${uri}`)));
                 } catch (err: unknown) {
                     console.log(Colorize.replaceLine(Colorize.error(`Error adding: ${uri}\n${(err as Error).message}`)));
@@ -142,25 +142,25 @@ export async function run() {
                 .option('document-count', {
                     alias: 'dc',
                     describe: 'max number of documents to return (defaults to 10)',
-                    type: 'count',
+                    type: 'number',
                     default: 10
                 })
                 .option('chunk-count', {
                     alias: 'cc',
                     describe: 'max number of chunks to return (defaults to 50)',
-                    type: 'count',
+                    type: 'number',
                     default: 50
                 })
                 .option('section-count', {
                     alias: 'sc',
                     describe: 'max number of document sections to render (defaults to 1)',
-                    type: 'count',
+                    type: 'number',
                     default: 1
                 })
                 .option('tokens', {
                     alias: 't',
                     describe: 'max number of tokens to render for each document section (defaults to 2000)',
-                    type: 'count',
+                    type: 'number',
                     default: 2000
                 })
                 .option('format', {
@@ -200,7 +200,7 @@ export async function run() {
                     const sections = await result.renderSections(args.tokens, args.sectionCount);
                     for (let i = 0; i < sections.length; i++) {
                         const section = sections[i];
-                        console.log(Colorize.title(args.sectionCount > 1 ? 'Section' : `Section ${1}`));
+                        console.log(Colorize.title(args.sectionCount == 1 ? 'Section' : `Section ${i + 1}`));
                         console.log(Colorize.value('score', section.score));
                         console.log(Colorize.value('tokens', section.tokenCount));
                         console.log(Colorize.output(section.text));