npm - vectra - Versions diffs - 0.7.6 → 0.10.0 - Mend

vectra 0.7.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (75) hide show

package/README.md +10 -10
package/package.json +7 -6
package/src/ItemSelector.ts +7 -1
package/src/LocalDocumentIndex.ts +10 -4
package/src/LocalDocumentResult.ts +70 -22
package/src/LocalIndex.ts +77 -10
package/src/TextSplitter.ts +10 -15
package/src/internals/Colorize.ts +3 -3
package/src/internals/wink-bm25-text-search.d.ts +4 -0
package/src/types.ts +2 -1
package/src/vectra-cli.ts +13 -2
package/lib/FileFetcher.d.ts +0 -5
package/lib/FileFetcher.d.ts.map +0 -1
package/lib/FileFetcher.js +0 -69
package/lib/FileFetcher.js.map +0 -1
package/lib/GPT3Tokenizer.d.ts +0 -9
package/lib/GPT3Tokenizer.d.ts.map +0 -1
package/lib/GPT3Tokenizer.js +0 -17
package/lib/GPT3Tokenizer.js.map +0 -1
package/lib/ItemSelector.d.ts +0 -41
package/lib/ItemSelector.d.ts.map +0 -1
package/lib/ItemSelector.js +0 -162
package/lib/ItemSelector.js.map +0 -1
package/lib/LocalDocument.d.ts +0 -54
package/lib/LocalDocument.d.ts.map +0 -1
package/lib/LocalDocument.js +0 -146
package/lib/LocalDocument.js.map +0 -1
package/lib/LocalDocumentIndex.d.ts +0 -128
package/lib/LocalDocumentIndex.d.ts.map +0 -1
package/lib/LocalDocumentIndex.js +0 -446
package/lib/LocalDocumentIndex.js.map +0 -1
package/lib/LocalDocumentResult.d.ts +0 -45
package/lib/LocalDocumentResult.d.ts.map +0 -1
package/lib/LocalDocumentResult.js +0 -282
package/lib/LocalDocumentResult.js.map +0 -1
package/lib/LocalIndex.d.ts +0 -136
package/lib/LocalIndex.d.ts.map +0 -1
package/lib/LocalIndex.js +0 -413
package/lib/LocalIndex.js.map +0 -1
package/lib/OpenAIEmbeddings.d.ts +0 -126
package/lib/OpenAIEmbeddings.d.ts.map +0 -1
package/lib/OpenAIEmbeddings.js +0 -174
package/lib/OpenAIEmbeddings.js.map +0 -1
package/lib/TextSplitter.d.ts +0 -20
package/lib/TextSplitter.d.ts.map +0 -1
package/lib/TextSplitter.js +0 -543
package/lib/TextSplitter.js.map +0 -1
package/lib/WebFetcher.d.ts +0 -15
package/lib/WebFetcher.d.ts.map +0 -1
package/lib/WebFetcher.js +0 -224
package/lib/WebFetcher.js.map +0 -1
package/lib/index.d.ts +0 -12
package/lib/index.d.ts.map +0 -1
package/lib/index.js +0 -28
package/lib/index.js.map +0 -1
package/lib/internals/Colorize.d.ts +0 -14
package/lib/internals/Colorize.d.ts.map +0 -1
package/lib/internals/Colorize.js +0 -64
package/lib/internals/Colorize.js.map +0 -1
package/lib/internals/index.d.ts +0 -3
package/lib/internals/index.d.ts.map +0 -1
package/lib/internals/index.js +0 -19
package/lib/internals/index.js.map +0 -1
package/lib/internals/types.d.ts +0 -43
package/lib/internals/types.d.ts.map +0 -1
package/lib/internals/types.js +0 -3
package/lib/internals/types.js.map +0 -1
package/lib/types.d.ts +0 -145
package/lib/types.d.ts.map +0 -1
package/lib/types.js +0 -3
package/lib/types.js.map +0 -1
package/lib/vectra-cli.d.ts +0 -2
package/lib/vectra-cli.d.ts.map +0 -1
package/lib/vectra-cli.js +0 -303
package/lib/vectra-cli.js.map +0 -1

package/README.md CHANGED Viewed

@@ -1,5 +1,6 @@
 # Vectra
-Vectra is a local vector database for Node.js with features similar to [Pinecone](https://www.pinecone.io/) or [Qdrant](https://qdrant.tech/) but built using local files. Each Vectra index is a folder on disk. There's an `index.json` file in the folder that contains all the vectors for the index along with any indexed metadata.  When you create an index you can specify which metadata properties to index and only those fields will be stored in the `index.json` file. All of the other metadata for an item will be stored on disk in a separate file keyed by a GUID.
+Vectra is a local vector database for Node.js with features similar to [Pinecone](https://www.pinecone.io/) or [Qdrant](https://qdrant.tech/) but built using local files. Each Vectra index is a folder on disk. There's an `index.json` file in the folder that contains all the vectors for the index along with any indexed metadata. When you create an index you can specify which metadata properties to index and only those fields will be stored in the `index.json` file. All of the other metadata for an item will be stored on disk in a separate file keyed by a GUID.
 When querying Vectra you'll be able to use the same subset of [Mongo DB query operators](https://www.mongodb.com/docs/manual/reference/operator/query/) that Pinecone supports and the results will be returned sorted by similarity. Every item in the index will first be filtered by metadata and then ranked for similarity. Even though every item is evaluated, it's all in memory so it should be nearly instantaneous. Likely 1ms - 2ms for even a rather large index. Smaller indexes should be <1ms.
@@ -8,9 +9,10 @@ Keep in mind that your entire Vectra index is loaded into memory so it's not wel
 Pinecone style namespaces aren't directly supported but you could easily mimic them by creating a separate Vectra index (and folder) for each namespace.
 ## Other Language Bindings
 This repo contains the TypeScript/JavaScript binding for Vectra but other language bindings are being created. Since Vectra is file based, any language binding can be used to read or write a Vectra index. That means you can build a Vectra index using JS and then read it using Python.
-- [vectra-py](https://github.com/BMS-geodev/vectra-py) - Python version of Vectra.
+-   [vectra-py](https://github.com/BMS-geodev/vectra-py) - Python version of Vectra.
 ## Installation
@@ -31,7 +33,7 @@ const index = new LocalIndex(path.join(__dirname, '..', 'index'));
 Next, from inside an async function, create your index:
 ```typescript
-if (!await index.isIndexCreated()) {
+if (!(await index.isIndexCreated())) {
     await index.createIndex();
 }
 ```
@@ -39,26 +41,24 @@ if (!await index.isIndexCreated()) {
 Add some items to your index:
 ```typescript
-import { OpenAIApi, Configuration } from 'openai';
+import { OpenAI } from 'openai';
-const configuration = new Configuration({
+const openai = new OpenAI({
     apiKey: `<YOUR_KEY>`,
 });
-const api = new OpenAIApi(configuration);
 async function getVector(text: string) {
-    const response = await api.createEmbedding({
+    const response = await openai.embeddings.create({
         'model': 'text-embedding-ada-002',
         'input': text,
     });
-    return response.data.data[0].embedding;
+    return response.data[0].embedding;
 }
 async function addItem(text: string) {
     await index.insertItem({
         vector: await getVector(text),
-        metadata: { text }
+        metadata: { text },
     });
 }

package/package.json CHANGED Viewed

@@ -2,7 +2,7 @@
     "name": "vectra",
     "author": "Steven Ickman",
     "description": "A vector database that uses the local file system for storage.",
-    "version": "0.7.6",
+    "version": "0.10.0",
     "license": "MIT",
     "keywords": [
         "gpt"
@@ -35,14 +35,14 @@
         "openai": "^3.2.1",
         "turndown": "^7.1.2",
         "uuid": "^9.0.0",
+        "wink-nlp": "^2.3.2",
         "yargs": "^17.7.2"
     },
-    "resolutions": {
-    },
+    "resolutions": {},
     "devDependencies": {
-        "@types/node": "^14.14.31",
-        "@types/mocha": "^8.2.0",
         "@types/assert": "^1.5.3",
+        "@types/mocha": "^8.2.0",
+        "@types/node": "^14.14.31",
         "@types/turndown": "^5.0.1",
         "@types/uuid": "9.0.1",
         "@types/yargs": "17.0.24",
@@ -50,7 +50,8 @@
         "nyc": "^15.1.0",
         "shx": "^0.3.2",
         "ts-mocha": "10.0.0",
-        "typescript": "^4.2.3"
+        "typescript": "^4.2.3",
+        "wink-bm25-text-search": "^3.1.2"
     },
     "scripts": {
         "build": "tsc -b",

package/src/ItemSelector.ts CHANGED Viewed

@@ -149,7 +149,13 @@ export class ItemSelector {
                     }
                     break;
                 case '$nin':
-                    if (typeof value == 'boolean' || filter[key]!.includes(value)) {
+                    if (typeof value == 'boolean') {
+                        return false;
+                    }
+                    else if (typeof value == 'string' && filter[key]!.includes(value)) {
+                        return false;
+                    }
+                    else if (filter[key]!.some(val => typeof val == 'string' && val.includes(value as string))) {
                         return false;
                     }
                     break;

package/src/LocalDocumentIndex.ts CHANGED Viewed

@@ -30,6 +30,12 @@ export interface DocumentQueryOptions {
      * Optional. Filter to apply to the document metadata.
      */
     filter?: MetadataFilter;
+    /**
+     * Optional. Turn on bm25 keyword search to perform hybrid search - semantic + keyword
+     */
+    isBm25?: boolean;
 }
 /**
@@ -60,7 +66,7 @@ export interface LocalDocumentIndexConfig {
 /**
  * Represents a local index of documents stored on disk.
  */
-export class LocalDocumentIndex extends LocalIndex {
+export class LocalDocumentIndex extends LocalIndex<DocumentChunkMetadata> {
     private readonly _embeddings?: EmbeddingsModel;
     private readonly _tokenizer: Tokenizer;
     private readonly _chunkingConfig?: TextSplitterConfig;
@@ -158,7 +164,7 @@ export class LocalDocumentIndex extends LocalIndex {
         await this.beginUpdate();
         try {
             // Get list of chunks for document
-            const chunks = await this.listItemsByMetadata<DocumentChunkMetadata>({ documentId });
+            const chunks = await this.listItemsByMetadata({ documentId });
             // Delete chunks
             for (const chunk of chunks) {
@@ -326,7 +332,7 @@ export class LocalDocumentIndex extends LocalIndex {
     public async listDocuments(): Promise<LocalDocumentResult[]> {
         // Sort chunks by document ID
         const docs: { [documentId: string]: QueryResult<DocumentChunkMetadata>[]; } = {};
-        const chunks = await this.listItems<DocumentChunkMetadata>();
+        const chunks = await this.listItems();
         chunks.forEach(chunk => {
             const metadata = chunk.metadata;
             if (docs[metadata.documentId] == undefined) {
@@ -378,7 +384,7 @@ export class LocalDocumentIndex extends LocalIndex {
         }
         // Query index for chunks
-        const results = await this.queryItems<DocumentChunkMetadata>(embeddings.output![0], options.maxChunks!, options.filter);
+        const results = await this.queryItems(embeddings.output![0], query, options.maxChunks!, options.filter, options.isBm25);
         // Group chunks by document
         const documentChunks: { [documentId: string]: QueryResult<DocumentChunkMetadata>[]; } = {};

package/src/LocalDocumentResult.ts CHANGED Viewed

@@ -66,7 +66,8 @@ export class LocalDocumentResult extends LocalDocument {
                     startPos: startPos + offset,
                     endPos: startPos + offset + chunkLength - 1,
                     score: chunk.score,
-                    tokenCount: chunkLength
+                    tokenCount: chunkLength,
+                    isBm25: false
                 });
                 offset += chunkLength;
             }
@@ -103,7 +104,8 @@ export class LocalDocumentResult extends LocalDocument {
             return {
                 text: text,
                 tokenCount: section.tokenCount,
-                score: section.score
+                score: section.score,
+                isBm25: false,
             };
         });
     }
@@ -127,7 +129,8 @@ export class LocalDocumentResult extends LocalDocument {
             return [{
                 text,
                 tokenCount: length,
-                score: 1.0
+                score: 1.0,
+                isBm25: false,
             }];
         }
@@ -148,7 +151,8 @@ export class LocalDocumentResult extends LocalDocument {
                 startPos,
                 endPos,
                 score: chunk.score,
-                tokenCount: this._tokenizer.encode(chunkText).length
+                tokenCount: this._tokenizer.encode(chunkText).length,
+                isBm25: Boolean(chunk.item.metadata.isBm25),
             };
         }).filter(chunk => chunk.tokenCount <= maxTokens).sort((a, b) => a.startPos - b.startPos);
@@ -163,36 +167,63 @@ export class LocalDocumentResult extends LocalDocument {
             return [{
                 text: this._tokenizer.decode(tokens.slice(0, maxTokens)),
                 tokenCount: maxTokens,
-                score: topChunk.score
+                score: topChunk.score,
+                isBm25: false,
             }];
         }
-        // Generate sections
+        // Generate semantic sections
         const sections: Section[] = [];
         for (let i = 0; i < chunks.length; i++) {
             const chunk = chunks[i];
             let section = sections[sections.length - 1];
-            if (!section || section.tokenCount + chunk.tokenCount > maxTokens) {
-                section = {
-                    chunks: [],
-                    score: 0,
-                    tokenCount: 0
-                };
-                sections.push(section);
+            if (!chunk.isBm25) {
+                if (!section || section.tokenCount + chunk.tokenCount > maxTokens) {
+                    section = {
+                        chunks: [],
+                        score: 0,
+                        tokenCount: 0
+                    };
+                    sections.push(section);
+                }
+                section.chunks.push(chunk);
+                section.score += chunk.score;
+                section.tokenCount += chunk.tokenCount;
             }
-            section.chunks.push(chunk);
-            section.score += chunk.score;
-            section.tokenCount += chunk.tokenCount;
         }
+        // Generate bm25 sections
+        const bm25Sections: Section[] = [];
+        for (let i = 0; i < chunks.length; i++) {
+            const chunk = chunks[i];
+            let section = bm25Sections[bm25Sections.length - 1];
+            if (chunk.isBm25) {
+                if (!section || section.tokenCount + chunk.tokenCount > maxTokens) {
+                    section = {
+                        chunks: [],
+                        score: 0,
+                        tokenCount: 0
+                    };
+                    bm25Sections.push(section);
+                }
+                section.chunks.push(chunk);
+                section.score += chunk.score;
+                section.tokenCount += chunk.tokenCount;
+            }
+        }
         // Normalize section scores
         sections.forEach(section => section.score /= section.chunks.length);
+        bm25Sections.forEach(section => section.score /= section.chunks.length);
         // Sort sections by score and limit to maxSections
         sections.sort((a, b) => b.score - a.score);
+        bm25Sections.sort((a, b) => b.score - a.score);
         if (sections.length > maxSections) {
             sections.splice(maxSections, sections.length - maxSections);
         }
+        if (bm25Sections.length > maxSections) {
+            bm25Sections.splice(maxSections, bm25Sections.length - maxSections);
+        }
         // Combine adjacent chunks of text
         sections.forEach(section => {
@@ -216,7 +247,8 @@ export class LocalDocumentResult extends LocalDocument {
                 startPos: -1,
                 endPos: -1,
                 score: 0,
-                tokenCount: this._tokenizer.encode('\n\n...\n\n').length
+                tokenCount: this._tokenizer.encode('\n\n...\n\n').length,
+                isBm25: false,
             };
             sections.forEach(section => {
                 // Insert connectors between chunks
@@ -242,7 +274,8 @@ export class LocalDocumentResult extends LocalDocument {
                             startPos: sectionStart - beforeBudget,
                             endPos: sectionStart - 1,
                             score: 0,
-                            tokenCount: beforeBudget
+                            tokenCount: beforeBudget,
+                            isBm25: false,
                         };
                         section.chunks.unshift(chunk);
                         section.tokenCount += chunk.tokenCount;
@@ -258,7 +291,8 @@ export class LocalDocumentResult extends LocalDocument {
                             startPos: sectionEnd + 1,
                             endPos: sectionEnd + afterBudget,
                             score: 0,
-                            tokenCount: afterBudget
+                            tokenCount: afterBudget,
+                            isBm25: false,
                         };
                         section.chunks.push(chunk);
                         section.tokenCount += chunk.tokenCount;
@@ -268,16 +302,29 @@ export class LocalDocumentResult extends LocalDocument {
             });
         }
-        // Return final rendered sections
-        return sections.map(section => {
+        const semanticDocTextSections = sections.map(section => {
+            let text = '';
+            section.chunks.forEach(chunk => text += chunk.text);
+            return {
+                text: text,
+                tokenCount: section.tokenCount,
+                score: section.score,
+                isBm25: false,
+            };
+        });
+        const bm25DocTextSections = bm25Sections.map(section => {
             let text = '';
             section.chunks.forEach(chunk => text += chunk.text);
             return {
                 text: text,
                 tokenCount: section.tokenCount,
-                score: section.score
+                score: section.score,
+                isBm25: true,
             };
         });
+        // Return final rendered sections
+        return [...semanticDocTextSections, ...bm25DocTextSections];
     }
     private encodeBeforeText(text: string, budget: number): number[] {
@@ -300,6 +347,7 @@ interface SectionChunk {
     endPos: number;
     score: number;
     tokenCount: number;
+    isBm25: boolean;
 }
 interface Section {

package/src/LocalIndex.ts CHANGED Viewed

@@ -3,7 +3,11 @@ import * as path from 'path';
 import { v4 } from 'uuid';
 import { ItemSelector } from './ItemSelector';
 import { IndexItem, IndexStats, MetadataFilter, MetadataTypes, QueryResult } from './types';
+import { LocalDocument } from './LocalDocument';
+import { LocalDocumentIndex } from './LocalDocumentIndex';
+import bm25 from 'wink-bm25-text-search';
+import winkNLP from 'wink-nlp';
+import model from 'wink-eng-lite-web-model';
 export interface CreateIndexConfig {
     version: number;
     deleteIfExists?: boolean;
@@ -18,12 +22,14 @@ export interface CreateIndexConfig {
  * This class is used to create, update, and query a local vector index.
  * Each index is a folder on disk containing an index.json file and an optional set of metadata files.
  */
-export class LocalIndex {
+export class LocalIndex<TMetadata extends Record<string,MetadataTypes> = Record<string,MetadataTypes>>{
     private readonly _folderPath: string;
     private readonly _indexName: string;
     private _data?: IndexData;
     private _update?: IndexData;
+    //member fields for BM25
+    private _bm25Engine: any;
     /**
      * Creates a new instance of LocalIndex.
@@ -177,7 +183,7 @@ export class LocalIndex {
      * @param id ID of the item to retrieve.
      * @returns Item or undefined if not found.
      */
-    public async getItem<TMetadata = Record<string,MetadataTypes>>(id: string): Promise<IndexItem<TMetadata> | undefined> {
+    public async getItem<TItemMetadata extends TMetadata = TMetadata>(id: string): Promise<IndexItem<TItemMetadata> | undefined> {
         await this.loadIndexData();
         return this._data!.items.find(i => i.id === id) as any | undefined;
     }
@@ -190,7 +196,7 @@ export class LocalIndex {
      * @param item Item to insert.
      * @returns Inserted item.
      */
-    public async insertItem<TMetadata = Record<string,MetadataTypes>>(item: Partial<IndexItem<TMetadata>>): Promise<IndexItem<TMetadata>> {
+    public async insertItem<TItemMetadata extends TMetadata = TMetadata>(item: Partial<IndexItem<TItemMetadata>>): Promise<IndexItem<TItemMetadata>> {
         if (this._update) {
             return await this.addItemToUpdate(item, true) as any;
         } else {
@@ -220,7 +226,7 @@ export class LocalIndex {
      * array is returned so no modifications should be made to the array.
      * @returns Array of all items in the index.
      */
-    public async listItems<TMetadata = Record<string,MetadataTypes>>(): Promise<IndexItem<TMetadata>[]> {
+    public async listItems<TItemMetadata extends TMetadata = TMetadata>(): Promise<IndexItem<TItemMetadata>[]> {
         await this.loadIndexData();
         return this._data!.items.slice() as any;
     }
@@ -232,7 +238,7 @@ export class LocalIndex {
      * @param filter Filter to apply.
      * @returns Array of items matching the filter.
      */
-    public async listItemsByMetadata<TMetadata = Record<string,MetadataTypes>>(filter: MetadataFilter): Promise<IndexItem<TMetadata>[]> {
+    public async listItemsByMetadata<TItemMetadata extends TMetadata = TMetadata>(filter: MetadataFilter): Promise<IndexItem<TItemMetadata>[]> {
         await this.loadIndexData();
         return this._data!.items.filter(i => ItemSelector.select(i.metadata, filter)) as any;
     }
@@ -247,7 +253,7 @@ export class LocalIndex {
      * @param filter Optional. Filter to apply.
      * @returns Similar items to the vector that matche the supplied filter.
      */
-    public async queryItems<TMetadata = Record<string,MetadataTypes>>(vector: number[], topK: number, filter?: MetadataFilter): Promise<QueryResult<TMetadata>[]> {
+    public async queryItems<TItemMetadata extends TMetadata = TMetadata>(vector: number[], query: string, topK: number, filter?: MetadataFilter, isBm25?: boolean): Promise<QueryResult<TItemMetadata>[]> {
         await this.loadIndexData();
         // Filter items
@@ -269,7 +275,7 @@ export class LocalIndex {
         distances.sort((a, b) => b.distance - a.distance);
         // Find top k
-        const top: QueryResult<TMetadata>[] = distances.slice(0, topK).map(d => {
+        const top: QueryResult<TItemMetadata>[] = distances.slice(0, topK).map(d => {
             return {
                 item: Object.assign({}, items[d.index]) as any,
                 score: d.distance
@@ -285,6 +291,36 @@ export class LocalIndex {
             }
         }
+        //Perform bm25 search only if enabled. Avoid duplicate chunks, which are already selected during semantic search.
+        if (isBm25) {
+            const itemSet = new Set();
+            for (const item of top) itemSet.add(item.item.id);
+            this.setupbm25();
+            let currDoc;
+            let currDocTxt;
+            for (let i = 0; i < items.length; i++) {
+                if (!itemSet.has(items[i].id)) {
+                    const item = items[i];
+                    currDoc = new LocalDocument((this as unknown) as LocalDocumentIndex, item.metadata.documentId.toString(), '');
+                    currDocTxt = await currDoc.loadText();
+                    const startPos = item.metadata.startPos;
+                    const endPos = item.metadata.endPos;
+                    const chunkText = currDocTxt.substring(Number(startPos), Number(endPos) + 1);
+                    this._bm25Engine.addDoc({body: chunkText}, i);
+                }
+            }
+            this._bm25Engine.consolidate();
+            var results = await this.bm25Search(query, items, topK);
+            results.forEach((res: any) => {
+                top.push({
+                    item: Object.assign({}, {...items[res[0]], metadata: {...items[res[0]].metadata, isBm25: true}}) as any,
+                    score: res[1]
+                });
+            });
+        }
         return top;
     }
@@ -296,7 +332,7 @@ export class LocalIndex {
      * @param item Item to insert or replace.
      * @returns Upserted item.
      */
-    public async upsertItem<TMetadata = Record<string,MetadataTypes>>(item: Partial<IndexItem<TMetadata>>): Promise<IndexItem<TMetadata>> {
+    public async upsertItem<TItemMetadata extends TMetadata = TMetadata>(item: Partial<IndexItem<TItemMetadata>>): Promise<IndexItem<TItemMetadata>> {
         if (this._update) {
             return await this.addItemToUpdate(item, false) as any;
         } else {
@@ -350,7 +386,7 @@ export class LocalIndex {
             }
             // Save remaining metadata to disk
-            metadataFile = `${v4}.json`;
+            metadataFile = `${v4()}.json`;
             const metadataPath = path.join(this._folderPath, metadataFile);
             await fs.writeFile(metadataPath, JSON.stringify(item.metadata));
         } else if (item.metadata) {
@@ -385,6 +421,37 @@ export class LocalIndex {
             return newItem;
         }
     }
+    private async setupbm25(): Promise<any> {
+        this._bm25Engine = bm25();
+        const nlp = winkNLP( model );
+        const its = nlp.its;
+        const prepTask = function ( text: string ) {
+            const tokens: any[] = [];
+            nlp.readDoc(text)
+                .tokens()
+                // Use only words ignoring punctuations etc and from them remove stop words
+                .filter( (t: any) => ( t.out(its.type) === 'word' && !t.out(its.stopWordFlag) ) )
+                // Handle negation and extract stem of the word
+                .each( (t: any) => tokens.push( (t.out(its.negationFlag)) ? '!' + t.out(its.stem) : t.out(its.stem) ) );
+            return tokens;
+        };
+        this._bm25Engine.defineConfig( { fldWeights: { body: 1 } } );
+        // Step II: Define PrepTasks pipe.
+        this._bm25Engine.definePrepTasks( [ prepTask ] );
+    }
+    private async bm25Search(searchQuery: string, items: any, topK: number): Promise<any> {
+        var query = searchQuery;
+        // `results` is an array of [ doc-id, score ], sorted by score
+        var results = this._bm25Engine.search( query );
+        return results.slice(0, topK);
+    }
 }
 interface IndexData {

package/src/TextSplitter.ts CHANGED Viewed

@@ -178,23 +178,18 @@ export class TextSplitter {
     }
     private splitBySpaces(text: string): string[] {
+        // Split text by tokens and return parts
         const parts: string[] = [];
-        const words = text.split(' ');
-        if (words.length > 0) {
-            let part = words[0];
-            for (let i = 1; i < words.length; i++) {
-                const nextWord = words[i];
-                if (this._config.tokenizer.encode(part + ' ' + nextWord).length <= this._config.chunkSize) {
-                    part += ' ' + nextWord;
-                } else {
-                    parts.push(part);
-                    part = nextWord;
-                }
+        let tokens = this._config.tokenizer.encode(text);
+        do {
+            if (tokens.length <= this._config.chunkSize) {
+                parts.push(this._config.tokenizer.decode(tokens));
+                break;
+            } else {
+                const span = tokens.splice(0, this._config.chunkSize);
+                parts.push(this._config.tokenizer.decode(span));
             }
-            parts.push(part);
-        } else {
-            parts.push(text);
-        }
+        } while (true);
         return parts;
     }

package/src/internals/Colorize.ts CHANGED Viewed

@@ -16,9 +16,9 @@ export class Colorize {
         }
     }
-    public static output(output: object | string, quote: string = '', units: string = ''): string {
+    public static output(output: object | string, isBm25: boolean = false, quote: string = '', units: string = ''): string {
         if (typeof output === 'string') {
-            return `\x1b[32m${quote}${output}${quote}\x1b[0m`;
+            return isBm25 ? `\x1b[34m${quote}${output}${quote}\x1b[0m` : `\x1b[32m${quote}${output}${quote}\x1b[0m`;
         } else if (typeof output === 'object' && output !== null) {
             return colorizer(output, {
                 pretty: true,
@@ -54,7 +54,7 @@ export class Colorize {
     }
     public static value(field: string, value: any, units: string = ''): string {
-        return `${field}: ${Colorize.output(value, '"', units)}`;
+        return `${field}: ${Colorize.output(value, false, '"', units)}`;
     }
     public static warning(warning: string): string {

package/src/internals/wink-bm25-text-search.d.ts ADDED Viewed

@@ -0,0 +1,4 @@
+declare module 'wink-bm25-text-search' {
+    const bm25: any;
+    export default bm25;
+  }

package/src/types.ts CHANGED Viewed

@@ -24,7 +24,7 @@ export interface EmbeddingsModel {
  * `error` - An error occurred while creating the embeddings.
  * `rate_limited` - The request was rate limited.
  */
-export type EmbeddingsResponseStatus = 'success' | 'error' | 'rate_limited';
+export type EmbeddingsResponseStatus = 'success' | 'error' | 'rate_limited' | 'cancelled';
 /**
  * Response returned by a `EmbeddingsClient`.
@@ -172,4 +172,5 @@ export interface DocumentTextSection {
     text: string;
     tokenCount: number;
     score: number;
+    isBm25: boolean;
 }

package/src/vectra-cli.ts CHANGED Viewed

@@ -191,6 +191,12 @@ export async function run() {
                     type: 'boolean',
                     default: true
                 })
+                .option('bm25', {
+                    alias: 'b',
+                    describe: 'Use Okapi-bm25 keyword search alogrithm to perform hybrid search - semantic + keyword. Displayed in blue during search.',
+                    type: 'boolean',
+                    default: false
+                })
                 .demandOption(['keys']);
         }, async (args) => {
             console.log(Colorize.title('Querying Index'));
@@ -217,6 +223,7 @@ export async function run() {
             const results = await index.queryDocuments(query, {
                 maxDocuments: args.documentCount,
                 maxChunks: args.chunkCount,
+                isBm25: args.bm25 as boolean,
             });
             // Render results
@@ -226,12 +233,15 @@ export async function run() {
                 console.log(Colorize.value('chunks', result.chunks.length));
                 if (args.format == 'sections') {
                     const sections = await result.renderSections(args.tokens, args.sectionCount, args.overlap);
+                    console.log(sections.length);
                     for (let i = 0; i < sections.length; i++) {
                         const section = sections[i];
+                        const isBm25 = sections[i].isBm25;
+                        console.log(isBm25);
                         console.log(Colorize.title(args.sectionCount == 1 ? 'Section' : `Section ${i + 1}`));
                         console.log(Colorize.value('score', section.score));
                         console.log(Colorize.value('tokens', section.tokenCount));
-                        console.log(Colorize.output(section.text));
+                        console.log(Colorize.output(section.text, isBm25));
                     }
                 } else if (args.format == 'chunks') {
                     const text = await result.loadText();
@@ -239,11 +249,12 @@ export async function run() {
                         const chunk = result.chunks[i];
                         const startPos = chunk.item.metadata.startPos;
                         const endPos = chunk.item.metadata.endPos;
+                        const isBm25 = Boolean(chunk.item.metadata.isBm25);
                         console.log(Colorize.title(`Chunk ${i + 1}`));
                         console.log(Colorize.value('score', chunk.score));
                         console.log(Colorize.value('startPos', startPos));
                         console.log(Colorize.value('endPos', endPos));
-                        console.log(Colorize.output(text.substring(startPos, endPos + 1)));
+                        console.log(Colorize.output(text.substring(startPos, endPos + 1), isBm25));
                     }
                 }
             }

package/lib/FileFetcher.d.ts DELETED Viewed

@@ -1,5 +0,0 @@
-import { TextFetcher } from './types';
-export declare class FileFetcher implements TextFetcher {
-    fetch(uri: string, onDocument: (uri: string, text: string, docType?: string | undefined) => Promise<boolean>): Promise<boolean>;
-}
-//# sourceMappingURL=FileFetcher.d.ts.map

package/lib/FileFetcher.d.ts.map DELETED Viewed

	@@ -1 +0,0 @@
1	- {"version":3,"file":"FileFetcher.d.ts","sourceRoot":"","sources":["../src/FileFetcher.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AAItC,qBAAa,WAAY,YAAW,WAAW;IAC9B,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,MAAM,GAAG,SAAS,KAAK,OAAO,CAAC,OAAO,CAAC,GAAG,OAAO,CAAC,OAAO,CAAC;CAyB/I"}