vectra 0.7.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/README.md +10 -10
  2. package/package.json +7 -6
  3. package/src/ItemSelector.ts +7 -1
  4. package/src/LocalDocumentIndex.ts +10 -4
  5. package/src/LocalDocumentResult.ts +70 -22
  6. package/src/LocalIndex.ts +77 -10
  7. package/src/TextSplitter.ts +10 -15
  8. package/src/internals/Colorize.ts +3 -3
  9. package/src/internals/wink-bm25-text-search.d.ts +4 -0
  10. package/src/types.ts +2 -1
  11. package/src/vectra-cli.ts +13 -2
  12. package/lib/FileFetcher.d.ts +0 -5
  13. package/lib/FileFetcher.d.ts.map +0 -1
  14. package/lib/FileFetcher.js +0 -69
  15. package/lib/FileFetcher.js.map +0 -1
  16. package/lib/GPT3Tokenizer.d.ts +0 -9
  17. package/lib/GPT3Tokenizer.d.ts.map +0 -1
  18. package/lib/GPT3Tokenizer.js +0 -17
  19. package/lib/GPT3Tokenizer.js.map +0 -1
  20. package/lib/ItemSelector.d.ts +0 -41
  21. package/lib/ItemSelector.d.ts.map +0 -1
  22. package/lib/ItemSelector.js +0 -162
  23. package/lib/ItemSelector.js.map +0 -1
  24. package/lib/LocalDocument.d.ts +0 -54
  25. package/lib/LocalDocument.d.ts.map +0 -1
  26. package/lib/LocalDocument.js +0 -146
  27. package/lib/LocalDocument.js.map +0 -1
  28. package/lib/LocalDocumentIndex.d.ts +0 -128
  29. package/lib/LocalDocumentIndex.d.ts.map +0 -1
  30. package/lib/LocalDocumentIndex.js +0 -446
  31. package/lib/LocalDocumentIndex.js.map +0 -1
  32. package/lib/LocalDocumentResult.d.ts +0 -45
  33. package/lib/LocalDocumentResult.d.ts.map +0 -1
  34. package/lib/LocalDocumentResult.js +0 -282
  35. package/lib/LocalDocumentResult.js.map +0 -1
  36. package/lib/LocalIndex.d.ts +0 -136
  37. package/lib/LocalIndex.d.ts.map +0 -1
  38. package/lib/LocalIndex.js +0 -413
  39. package/lib/LocalIndex.js.map +0 -1
  40. package/lib/OpenAIEmbeddings.d.ts +0 -126
  41. package/lib/OpenAIEmbeddings.d.ts.map +0 -1
  42. package/lib/OpenAIEmbeddings.js +0 -174
  43. package/lib/OpenAIEmbeddings.js.map +0 -1
  44. package/lib/TextSplitter.d.ts +0 -20
  45. package/lib/TextSplitter.d.ts.map +0 -1
  46. package/lib/TextSplitter.js +0 -543
  47. package/lib/TextSplitter.js.map +0 -1
  48. package/lib/WebFetcher.d.ts +0 -15
  49. package/lib/WebFetcher.d.ts.map +0 -1
  50. package/lib/WebFetcher.js +0 -224
  51. package/lib/WebFetcher.js.map +0 -1
  52. package/lib/index.d.ts +0 -12
  53. package/lib/index.d.ts.map +0 -1
  54. package/lib/index.js +0 -28
  55. package/lib/index.js.map +0 -1
  56. package/lib/internals/Colorize.d.ts +0 -14
  57. package/lib/internals/Colorize.d.ts.map +0 -1
  58. package/lib/internals/Colorize.js +0 -64
  59. package/lib/internals/Colorize.js.map +0 -1
  60. package/lib/internals/index.d.ts +0 -3
  61. package/lib/internals/index.d.ts.map +0 -1
  62. package/lib/internals/index.js +0 -19
  63. package/lib/internals/index.js.map +0 -1
  64. package/lib/internals/types.d.ts +0 -43
  65. package/lib/internals/types.d.ts.map +0 -1
  66. package/lib/internals/types.js +0 -3
  67. package/lib/internals/types.js.map +0 -1
  68. package/lib/types.d.ts +0 -145
  69. package/lib/types.d.ts.map +0 -1
  70. package/lib/types.js +0 -3
  71. package/lib/types.js.map +0 -1
  72. package/lib/vectra-cli.d.ts +0 -2
  73. package/lib/vectra-cli.d.ts.map +0 -1
  74. package/lib/vectra-cli.js +0 -303
  75. package/lib/vectra-cli.js.map +0 -1
package/README.md CHANGED
@@ -1,5 +1,6 @@
1
1
  # Vectra
2
- Vectra is a local vector database for Node.js with features similar to [Pinecone](https://www.pinecone.io/) or [Qdrant](https://qdrant.tech/) but built using local files. Each Vectra index is a folder on disk. There's an `index.json` file in the folder that contains all the vectors for the index along with any indexed metadata. When you create an index you can specify which metadata properties to index and only those fields will be stored in the `index.json` file. All of the other metadata for an item will be stored on disk in a separate file keyed by a GUID.
2
+
3
+ Vectra is a local vector database for Node.js with features similar to [Pinecone](https://www.pinecone.io/) or [Qdrant](https://qdrant.tech/) but built using local files. Each Vectra index is a folder on disk. There's an `index.json` file in the folder that contains all the vectors for the index along with any indexed metadata. When you create an index you can specify which metadata properties to index and only those fields will be stored in the `index.json` file. All of the other metadata for an item will be stored on disk in a separate file keyed by a GUID.
3
4
 
4
5
  When querying Vectra you'll be able to use the same subset of [Mongo DB query operators](https://www.mongodb.com/docs/manual/reference/operator/query/) that Pinecone supports and the results will be returned sorted by similarity. Every item in the index will first be filtered by metadata and then ranked for similarity. Even though every item is evaluated, it's all in memory so it should be nearly instantaneous. Likely 1ms - 2ms for even a rather large index. Smaller indexes should be <1ms.
5
6
 
@@ -8,9 +9,10 @@ Keep in mind that your entire Vectra index is loaded into memory so it's not wel
8
9
  Pinecone style namespaces aren't directly supported but you could easily mimic them by creating a separate Vectra index (and folder) for each namespace.
9
10
 
10
11
  ## Other Language Bindings
12
+
11
13
  This repo contains the TypeScript/JavaScript binding for Vectra but other language bindings are being created. Since Vectra is file based, any language binding can be used to read or write a Vectra index. That means you can build a Vectra index using JS and then read it using Python.
12
14
 
13
- - [vectra-py](https://github.com/BMS-geodev/vectra-py) - Python version of Vectra.
15
+ - [vectra-py](https://github.com/BMS-geodev/vectra-py) - Python version of Vectra.
14
16
 
15
17
  ## Installation
16
18
 
@@ -31,7 +33,7 @@ const index = new LocalIndex(path.join(__dirname, '..', 'index'));
31
33
  Next, from inside an async function, create your index:
32
34
 
33
35
  ```typescript
34
- if (!await index.isIndexCreated()) {
36
+ if (!(await index.isIndexCreated())) {
35
37
  await index.createIndex();
36
38
  }
37
39
  ```
@@ -39,26 +41,24 @@ if (!await index.isIndexCreated()) {
39
41
  Add some items to your index:
40
42
 
41
43
  ```typescript
42
- import { OpenAIApi, Configuration } from 'openai';
44
+ import { OpenAI } from 'openai';
43
45
 
44
- const configuration = new Configuration({
46
+ const openai = new OpenAI({
45
47
  apiKey: `<YOUR_KEY>`,
46
48
  });
47
49
 
48
- const api = new OpenAIApi(configuration);
49
-
50
50
  async function getVector(text: string) {
51
- const response = await api.createEmbedding({
51
+ const response = await openai.embeddings.create({
52
52
  'model': 'text-embedding-ada-002',
53
53
  'input': text,
54
54
  });
55
- return response.data.data[0].embedding;
55
+ return response.data[0].embedding;
56
56
  }
57
57
 
58
58
  async function addItem(text: string) {
59
59
  await index.insertItem({
60
60
  vector: await getVector(text),
61
- metadata: { text }
61
+ metadata: { text },
62
62
  });
63
63
  }
64
64
 
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "vectra",
3
3
  "author": "Steven Ickman",
4
4
  "description": "A vector database that uses the local file system for storage.",
5
- "version": "0.7.6",
5
+ "version": "0.10.0",
6
6
  "license": "MIT",
7
7
  "keywords": [
8
8
  "gpt"
@@ -35,14 +35,14 @@
35
35
  "openai": "^3.2.1",
36
36
  "turndown": "^7.1.2",
37
37
  "uuid": "^9.0.0",
38
+ "wink-nlp": "^2.3.2",
38
39
  "yargs": "^17.7.2"
39
40
  },
40
- "resolutions": {
41
- },
41
+ "resolutions": {},
42
42
  "devDependencies": {
43
- "@types/node": "^14.14.31",
44
- "@types/mocha": "^8.2.0",
45
43
  "@types/assert": "^1.5.3",
44
+ "@types/mocha": "^8.2.0",
45
+ "@types/node": "^14.14.31",
46
46
  "@types/turndown": "^5.0.1",
47
47
  "@types/uuid": "9.0.1",
48
48
  "@types/yargs": "17.0.24",
@@ -50,7 +50,8 @@
50
50
  "nyc": "^15.1.0",
51
51
  "shx": "^0.3.2",
52
52
  "ts-mocha": "10.0.0",
53
- "typescript": "^4.2.3"
53
+ "typescript": "^4.2.3",
54
+ "wink-bm25-text-search": "^3.1.2"
54
55
  },
55
56
  "scripts": {
56
57
  "build": "tsc -b",
@@ -149,7 +149,13 @@ export class ItemSelector {
149
149
  }
150
150
  break;
151
151
  case '$nin':
152
- if (typeof value == 'boolean' || filter[key]!.includes(value)) {
152
+ if (typeof value == 'boolean') {
153
+ return false;
154
+ }
155
+ else if (typeof value == 'string' && filter[key]!.includes(value)) {
156
+ return false;
157
+ }
158
+ else if (filter[key]!.some(val => typeof val == 'string' && val.includes(value as string))) {
153
159
  return false;
154
160
  }
155
161
  break;
@@ -30,6 +30,12 @@ export interface DocumentQueryOptions {
30
30
  * Optional. Filter to apply to the document metadata.
31
31
  */
32
32
  filter?: MetadataFilter;
33
+
34
+ /**
35
+ * Optional. Turn on bm25 keyword search to perform hybrid search - semantic + keyword
36
+ */
37
+ isBm25?: boolean;
38
+
33
39
  }
34
40
 
35
41
  /**
@@ -60,7 +66,7 @@ export interface LocalDocumentIndexConfig {
60
66
  /**
61
67
  * Represents a local index of documents stored on disk.
62
68
  */
63
- export class LocalDocumentIndex extends LocalIndex {
69
+ export class LocalDocumentIndex extends LocalIndex<DocumentChunkMetadata> {
64
70
  private readonly _embeddings?: EmbeddingsModel;
65
71
  private readonly _tokenizer: Tokenizer;
66
72
  private readonly _chunkingConfig?: TextSplitterConfig;
@@ -158,7 +164,7 @@ export class LocalDocumentIndex extends LocalIndex {
158
164
  await this.beginUpdate();
159
165
  try {
160
166
  // Get list of chunks for document
161
- const chunks = await this.listItemsByMetadata<DocumentChunkMetadata>({ documentId });
167
+ const chunks = await this.listItemsByMetadata({ documentId });
162
168
 
163
169
  // Delete chunks
164
170
  for (const chunk of chunks) {
@@ -326,7 +332,7 @@ export class LocalDocumentIndex extends LocalIndex {
326
332
  public async listDocuments(): Promise<LocalDocumentResult[]> {
327
333
  // Sort chunks by document ID
328
334
  const docs: { [documentId: string]: QueryResult<DocumentChunkMetadata>[]; } = {};
329
- const chunks = await this.listItems<DocumentChunkMetadata>();
335
+ const chunks = await this.listItems();
330
336
  chunks.forEach(chunk => {
331
337
  const metadata = chunk.metadata;
332
338
  if (docs[metadata.documentId] == undefined) {
@@ -378,7 +384,7 @@ export class LocalDocumentIndex extends LocalIndex {
378
384
  }
379
385
 
380
386
  // Query index for chunks
381
- const results = await this.queryItems<DocumentChunkMetadata>(embeddings.output![0], options.maxChunks!, options.filter);
387
+ const results = await this.queryItems(embeddings.output![0], query, options.maxChunks!, options.filter, options.isBm25);
382
388
 
383
389
  // Group chunks by document
384
390
  const documentChunks: { [documentId: string]: QueryResult<DocumentChunkMetadata>[]; } = {};
@@ -66,7 +66,8 @@ export class LocalDocumentResult extends LocalDocument {
66
66
  startPos: startPos + offset,
67
67
  endPos: startPos + offset + chunkLength - 1,
68
68
  score: chunk.score,
69
- tokenCount: chunkLength
69
+ tokenCount: chunkLength,
70
+ isBm25: false
70
71
  });
71
72
  offset += chunkLength;
72
73
  }
@@ -103,7 +104,8 @@ export class LocalDocumentResult extends LocalDocument {
103
104
  return {
104
105
  text: text,
105
106
  tokenCount: section.tokenCount,
106
- score: section.score
107
+ score: section.score,
108
+ isBm25: false,
107
109
  };
108
110
  });
109
111
  }
@@ -127,7 +129,8 @@ export class LocalDocumentResult extends LocalDocument {
127
129
  return [{
128
130
  text,
129
131
  tokenCount: length,
130
- score: 1.0
132
+ score: 1.0,
133
+ isBm25: false,
131
134
  }];
132
135
  }
133
136
 
@@ -148,7 +151,8 @@ export class LocalDocumentResult extends LocalDocument {
148
151
  startPos,
149
152
  endPos,
150
153
  score: chunk.score,
151
- tokenCount: this._tokenizer.encode(chunkText).length
154
+ tokenCount: this._tokenizer.encode(chunkText).length,
155
+ isBm25: Boolean(chunk.item.metadata.isBm25),
152
156
  };
153
157
  }).filter(chunk => chunk.tokenCount <= maxTokens).sort((a, b) => a.startPos - b.startPos);
154
158
 
@@ -163,36 +167,63 @@ export class LocalDocumentResult extends LocalDocument {
163
167
  return [{
164
168
  text: this._tokenizer.decode(tokens.slice(0, maxTokens)),
165
169
  tokenCount: maxTokens,
166
- score: topChunk.score
170
+ score: topChunk.score,
171
+ isBm25: false,
167
172
  }];
168
173
  }
169
174
 
170
- // Generate sections
175
+ // Generate semantic sections
171
176
  const sections: Section[] = [];
172
177
  for (let i = 0; i < chunks.length; i++) {
173
178
  const chunk = chunks[i];
174
179
  let section = sections[sections.length - 1];
175
- if (!section || section.tokenCount + chunk.tokenCount > maxTokens) {
176
- section = {
177
- chunks: [],
178
- score: 0,
179
- tokenCount: 0
180
- };
181
- sections.push(section);
180
+ if (!chunk.isBm25) {
181
+ if (!section || section.tokenCount + chunk.tokenCount > maxTokens) {
182
+ section = {
183
+ chunks: [],
184
+ score: 0,
185
+ tokenCount: 0
186
+ };
187
+ sections.push(section);
188
+ }
189
+ section.chunks.push(chunk);
190
+ section.score += chunk.score;
191
+ section.tokenCount += chunk.tokenCount;
182
192
  }
183
- section.chunks.push(chunk);
184
- section.score += chunk.score;
185
- section.tokenCount += chunk.tokenCount;
186
193
  }
187
194
 
195
+ // Generate bm25 sections
196
+ const bm25Sections: Section[] = [];
197
+ for (let i = 0; i < chunks.length; i++) {
198
+ const chunk = chunks[i];
199
+ let section = bm25Sections[bm25Sections.length - 1];
200
+ if (chunk.isBm25) {
201
+ if (!section || section.tokenCount + chunk.tokenCount > maxTokens) {
202
+ section = {
203
+ chunks: [],
204
+ score: 0,
205
+ tokenCount: 0
206
+ };
207
+ bm25Sections.push(section);
208
+ }
209
+ section.chunks.push(chunk);
210
+ section.score += chunk.score;
211
+ section.tokenCount += chunk.tokenCount;
212
+ }
213
+ }
188
214
  // Normalize section scores
189
215
  sections.forEach(section => section.score /= section.chunks.length);
216
+ bm25Sections.forEach(section => section.score /= section.chunks.length);
190
217
 
191
218
  // Sort sections by score and limit to maxSections
192
219
  sections.sort((a, b) => b.score - a.score);
220
+ bm25Sections.sort((a, b) => b.score - a.score);
193
221
  if (sections.length > maxSections) {
194
222
  sections.splice(maxSections, sections.length - maxSections);
195
223
  }
224
+ if (bm25Sections.length > maxSections) {
225
+ bm25Sections.splice(maxSections, bm25Sections.length - maxSections);
226
+ }
196
227
 
197
228
  // Combine adjacent chunks of text
198
229
  sections.forEach(section => {
@@ -216,7 +247,8 @@ export class LocalDocumentResult extends LocalDocument {
216
247
  startPos: -1,
217
248
  endPos: -1,
218
249
  score: 0,
219
- tokenCount: this._tokenizer.encode('\n\n...\n\n').length
250
+ tokenCount: this._tokenizer.encode('\n\n...\n\n').length,
251
+ isBm25: false,
220
252
  };
221
253
  sections.forEach(section => {
222
254
  // Insert connectors between chunks
@@ -242,7 +274,8 @@ export class LocalDocumentResult extends LocalDocument {
242
274
  startPos: sectionStart - beforeBudget,
243
275
  endPos: sectionStart - 1,
244
276
  score: 0,
245
- tokenCount: beforeBudget
277
+ tokenCount: beforeBudget,
278
+ isBm25: false,
246
279
  };
247
280
  section.chunks.unshift(chunk);
248
281
  section.tokenCount += chunk.tokenCount;
@@ -258,7 +291,8 @@ export class LocalDocumentResult extends LocalDocument {
258
291
  startPos: sectionEnd + 1,
259
292
  endPos: sectionEnd + afterBudget,
260
293
  score: 0,
261
- tokenCount: afterBudget
294
+ tokenCount: afterBudget,
295
+ isBm25: false,
262
296
  };
263
297
  section.chunks.push(chunk);
264
298
  section.tokenCount += chunk.tokenCount;
@@ -268,16 +302,29 @@ export class LocalDocumentResult extends LocalDocument {
268
302
  });
269
303
  }
270
304
 
271
- // Return final rendered sections
272
- return sections.map(section => {
305
+ const semanticDocTextSections = sections.map(section => {
306
+ let text = '';
307
+ section.chunks.forEach(chunk => text += chunk.text);
308
+ return {
309
+ text: text,
310
+ tokenCount: section.tokenCount,
311
+ score: section.score,
312
+ isBm25: false,
313
+ };
314
+ });
315
+ const bm25DocTextSections = bm25Sections.map(section => {
273
316
  let text = '';
274
317
  section.chunks.forEach(chunk => text += chunk.text);
275
318
  return {
276
319
  text: text,
277
320
  tokenCount: section.tokenCount,
278
- score: section.score
321
+ score: section.score,
322
+ isBm25: true,
279
323
  };
280
324
  });
325
+
326
+ // Return final rendered sections
327
+ return [...semanticDocTextSections, ...bm25DocTextSections];
281
328
  }
282
329
 
283
330
  private encodeBeforeText(text: string, budget: number): number[] {
@@ -300,6 +347,7 @@ interface SectionChunk {
300
347
  endPos: number;
301
348
  score: number;
302
349
  tokenCount: number;
350
+ isBm25: boolean;
303
351
  }
304
352
 
305
353
  interface Section {
package/src/LocalIndex.ts CHANGED
@@ -3,7 +3,11 @@ import * as path from 'path';
3
3
  import { v4 } from 'uuid';
4
4
  import { ItemSelector } from './ItemSelector';
5
5
  import { IndexItem, IndexStats, MetadataFilter, MetadataTypes, QueryResult } from './types';
6
-
6
+ import { LocalDocument } from './LocalDocument';
7
+ import { LocalDocumentIndex } from './LocalDocumentIndex';
8
+ import bm25 from 'wink-bm25-text-search';
9
+ import winkNLP from 'wink-nlp';
10
+ import model from 'wink-eng-lite-web-model';
7
11
  export interface CreateIndexConfig {
8
12
  version: number;
9
13
  deleteIfExists?: boolean;
@@ -18,12 +22,14 @@ export interface CreateIndexConfig {
18
22
  * This class is used to create, update, and query a local vector index.
19
23
  * Each index is a folder on disk containing an index.json file and an optional set of metadata files.
20
24
  */
21
- export class LocalIndex {
25
+ export class LocalIndex<TMetadata extends Record<string,MetadataTypes> = Record<string,MetadataTypes>>{
22
26
  private readonly _folderPath: string;
23
27
  private readonly _indexName: string;
24
28
 
25
29
  private _data?: IndexData;
26
30
  private _update?: IndexData;
31
+ //member fields for BM25
32
+ private _bm25Engine: any;
27
33
 
28
34
  /**
29
35
  * Creates a new instance of LocalIndex.
@@ -177,7 +183,7 @@ export class LocalIndex {
177
183
  * @param id ID of the item to retrieve.
178
184
  * @returns Item or undefined if not found.
179
185
  */
180
- public async getItem<TMetadata = Record<string,MetadataTypes>>(id: string): Promise<IndexItem<TMetadata> | undefined> {
186
+ public async getItem<TItemMetadata extends TMetadata = TMetadata>(id: string): Promise<IndexItem<TItemMetadata> | undefined> {
181
187
  await this.loadIndexData();
182
188
  return this._data!.items.find(i => i.id === id) as any | undefined;
183
189
  }
@@ -190,7 +196,7 @@ export class LocalIndex {
190
196
  * @param item Item to insert.
191
197
  * @returns Inserted item.
192
198
  */
193
- public async insertItem<TMetadata = Record<string,MetadataTypes>>(item: Partial<IndexItem<TMetadata>>): Promise<IndexItem<TMetadata>> {
199
+ public async insertItem<TItemMetadata extends TMetadata = TMetadata>(item: Partial<IndexItem<TItemMetadata>>): Promise<IndexItem<TItemMetadata>> {
194
200
  if (this._update) {
195
201
  return await this.addItemToUpdate(item, true) as any;
196
202
  } else {
@@ -220,7 +226,7 @@ export class LocalIndex {
220
226
  * array is returned so no modifications should be made to the array.
221
227
  * @returns Array of all items in the index.
222
228
  */
223
- public async listItems<TMetadata = Record<string,MetadataTypes>>(): Promise<IndexItem<TMetadata>[]> {
229
+ public async listItems<TItemMetadata extends TMetadata = TMetadata>(): Promise<IndexItem<TItemMetadata>[]> {
224
230
  await this.loadIndexData();
225
231
  return this._data!.items.slice() as any;
226
232
  }
@@ -232,7 +238,7 @@ export class LocalIndex {
232
238
  * @param filter Filter to apply.
233
239
  * @returns Array of items matching the filter.
234
240
  */
235
- public async listItemsByMetadata<TMetadata = Record<string,MetadataTypes>>(filter: MetadataFilter): Promise<IndexItem<TMetadata>[]> {
241
+ public async listItemsByMetadata<TItemMetadata extends TMetadata = TMetadata>(filter: MetadataFilter): Promise<IndexItem<TItemMetadata>[]> {
236
242
  await this.loadIndexData();
237
243
  return this._data!.items.filter(i => ItemSelector.select(i.metadata, filter)) as any;
238
244
  }
@@ -247,7 +253,7 @@ export class LocalIndex {
247
253
  * @param filter Optional. Filter to apply.
248
254
  * @returns Similar items to the vector that match the supplied filter.
249
255
  */
250
- public async queryItems<TMetadata = Record<string,MetadataTypes>>(vector: number[], topK: number, filter?: MetadataFilter): Promise<QueryResult<TMetadata>[]> {
256
+ public async queryItems<TItemMetadata extends TMetadata = TMetadata>(vector: number[], query: string, topK: number, filter?: MetadataFilter, isBm25?: boolean): Promise<QueryResult<TItemMetadata>[]> {
251
257
  await this.loadIndexData();
252
258
 
253
259
  // Filter items
@@ -269,7 +275,7 @@ export class LocalIndex {
269
275
  distances.sort((a, b) => b.distance - a.distance);
270
276
 
271
277
  // Find top k
272
- const top: QueryResult<TMetadata>[] = distances.slice(0, topK).map(d => {
278
+ const top: QueryResult<TItemMetadata>[] = distances.slice(0, topK).map(d => {
273
279
  return {
274
280
  item: Object.assign({}, items[d.index]) as any,
275
281
  score: d.distance
@@ -285,6 +291,36 @@ export class LocalIndex {
285
291
  }
286
292
  }
287
293
 
294
+ // Perform bm25 search only if enabled. Avoid duplicate chunks, which are already selected during semantic search.
295
+ if (isBm25) {
296
+ const itemSet = new Set();
297
+ for (const item of top) itemSet.add(item.item.id);
298
+
299
+ this.setupbm25();
300
+
301
+ let currDoc;
302
+ let currDocTxt;
303
+ for (let i = 0; i < items.length; i++) {
304
+ if (!itemSet.has(items[i].id)) {
305
+ const item = items[i];
306
+ currDoc = new LocalDocument((this as unknown) as LocalDocumentIndex, item.metadata.documentId.toString(), '');
307
+ currDocTxt = await currDoc.loadText();
308
+ const startPos = item.metadata.startPos;
309
+ const endPos = item.metadata.endPos;
310
+ const chunkText = currDocTxt.substring(Number(startPos), Number(endPos) + 1);
311
+ this._bm25Engine.addDoc({body: chunkText}, i);
312
+ }
313
+ }
314
+ this._bm25Engine.consolidate();
315
+ var results = await this.bm25Search(query, items, topK);
316
+ results.forEach((res: any) => {
317
+ top.push({
318
+ item: Object.assign({}, {...items[res[0]], metadata: {...items[res[0]].metadata, isBm25: true}}) as any,
319
+ score: res[1]
320
+ });
321
+ });
322
+
323
+ }
288
324
  return top;
289
325
  }
290
326
 
@@ -296,7 +332,7 @@ export class LocalIndex {
296
332
  * @param item Item to insert or replace.
297
333
  * @returns Upserted item.
298
334
  */
299
- public async upsertItem<TMetadata = Record<string,MetadataTypes>>(item: Partial<IndexItem<TMetadata>>): Promise<IndexItem<TMetadata>> {
335
+ public async upsertItem<TItemMetadata extends TMetadata = TMetadata>(item: Partial<IndexItem<TItemMetadata>>): Promise<IndexItem<TItemMetadata>> {
300
336
  if (this._update) {
301
337
  return await this.addItemToUpdate(item, false) as any;
302
338
  } else {
@@ -350,7 +386,7 @@ export class LocalIndex {
350
386
  }
351
387
 
352
388
  // Save remaining metadata to disk
353
- metadataFile = `${v4}.json`;
389
+ metadataFile = `${v4()}.json`;
354
390
  const metadataPath = path.join(this._folderPath, metadataFile);
355
391
  await fs.writeFile(metadataPath, JSON.stringify(item.metadata));
356
392
  } else if (item.metadata) {
@@ -385,6 +421,37 @@ export class LocalIndex {
385
421
  return newItem;
386
422
  }
387
423
  }
424
+
425
+ private async setupbm25(): Promise<any> {
426
+ this._bm25Engine = bm25();
427
+ const nlp = winkNLP( model );
428
+ const its = nlp.its;
429
+
430
+ const prepTask = function ( text: string ) {
431
+ const tokens: any[] = [];
432
+ nlp.readDoc(text)
433
+ .tokens()
434
+ // Use only words ignoring punctuations etc and from them remove stop words
435
+ .filter( (t: any) => ( t.out(its.type) === 'word' && !t.out(its.stopWordFlag) ) )
436
+ // Handle negation and extract stem of the word
437
+ .each( (t: any) => tokens.push( (t.out(its.negationFlag)) ? '!' + t.out(its.stem) : t.out(its.stem) ) );
438
+
439
+ return tokens;
440
+ };
441
+
442
+ this._bm25Engine.defineConfig( { fldWeights: { body: 1 } } );
443
+ // Step II: Define PrepTasks pipe.
444
+ this._bm25Engine.definePrepTasks( [ prepTask ] );
445
+ }
446
+
447
+ private async bm25Search(searchQuery: string, items: any, topK: number): Promise<any> {
448
+ var query = searchQuery;
449
+ // `results` is an array of [ doc-id, score ], sorted by score
450
+ var results = this._bm25Engine.search( query );
451
+
452
+ return results.slice(0, topK);
453
+ }
454
+
388
455
  }
389
456
 
390
457
  interface IndexData {
@@ -178,23 +178,18 @@ export class TextSplitter {
178
178
  }
179
179
 
180
180
  private splitBySpaces(text: string): string[] {
181
+ // Split text by tokens and return parts
181
182
  const parts: string[] = [];
182
- const words = text.split(' ');
183
- if (words.length > 0) {
184
- let part = words[0];
185
- for (let i = 1; i < words.length; i++) {
186
- const nextWord = words[i];
187
- if (this._config.tokenizer.encode(part + ' ' + nextWord).length <= this._config.chunkSize) {
188
- part += ' ' + nextWord;
189
- } else {
190
- parts.push(part);
191
- part = nextWord;
192
- }
183
+ let tokens = this._config.tokenizer.encode(text);
184
+ do {
185
+ if (tokens.length <= this._config.chunkSize) {
186
+ parts.push(this._config.tokenizer.decode(tokens));
187
+ break;
188
+ } else {
189
+ const span = tokens.splice(0, this._config.chunkSize);
190
+ parts.push(this._config.tokenizer.decode(span));
193
191
  }
194
- parts.push(part);
195
- } else {
196
- parts.push(text);
197
- }
192
+ } while (true);
198
193
 
199
194
  return parts;
200
195
  }
@@ -16,9 +16,9 @@ export class Colorize {
16
16
  }
17
17
  }
18
18
 
19
- public static output(output: object | string, quote: string = '', units: string = ''): string {
19
+ public static output(output: object | string, isBm25: boolean = false, quote: string = '', units: string = ''): string {
20
20
  if (typeof output === 'string') {
21
- return `\x1b[32m${quote}${output}${quote}\x1b[0m`;
21
+ return isBm25 ? `\x1b[34m${quote}${output}${quote}\x1b[0m` : `\x1b[32m${quote}${output}${quote}\x1b[0m`;
22
22
  } else if (typeof output === 'object' && output !== null) {
23
23
  return colorizer(output, {
24
24
  pretty: true,
@@ -54,7 +54,7 @@ export class Colorize {
54
54
  }
55
55
 
56
56
  public static value(field: string, value: any, units: string = ''): string {
57
- return `${field}: ${Colorize.output(value, '"', units)}`;
57
+ return `${field}: ${Colorize.output(value, false, '"', units)}`;
58
58
  }
59
59
 
60
60
  public static warning(warning: string): string {
@@ -0,0 +1,4 @@
1
+ declare module 'wink-bm25-text-search' {
2
+ const bm25: any;
3
+ export default bm25;
4
+ }
package/src/types.ts CHANGED
@@ -24,7 +24,7 @@ export interface EmbeddingsModel {
24
24
  * `error` - An error occurred while creating the embeddings.
25
25
  * `rate_limited` - The request was rate limited.
26
26
  */
27
- export type EmbeddingsResponseStatus = 'success' | 'error' | 'rate_limited';
27
+ export type EmbeddingsResponseStatus = 'success' | 'error' | 'rate_limited' | 'cancelled';
28
28
 
29
29
  /**
30
30
  * Response returned by a `EmbeddingsClient`.
@@ -172,4 +172,5 @@ export interface DocumentTextSection {
172
172
  text: string;
173
173
  tokenCount: number;
174
174
  score: number;
175
+ isBm25: boolean;
175
176
  }
package/src/vectra-cli.ts CHANGED
@@ -191,6 +191,12 @@ export async function run() {
191
191
  type: 'boolean',
192
192
  default: true
193
193
  })
194
+ .option('bm25', {
195
+ alias: 'b',
196
+ describe: 'Use Okapi-bm25 keyword search alogrithm to perform hybrid search - semantic + keyword. Displayed in blue during search.',
197
+ type: 'boolean',
198
+ default: false
199
+ })
194
200
  .demandOption(['keys']);
195
201
  }, async (args) => {
196
202
  console.log(Colorize.title('Querying Index'));
@@ -217,6 +223,7 @@ export async function run() {
217
223
  const results = await index.queryDocuments(query, {
218
224
  maxDocuments: args.documentCount,
219
225
  maxChunks: args.chunkCount,
226
+ isBm25: args.bm25 as boolean,
220
227
  });
221
228
 
222
229
  // Render results
@@ -226,12 +233,15 @@ export async function run() {
226
233
  console.log(Colorize.value('chunks', result.chunks.length));
227
234
  if (args.format == 'sections') {
228
235
  const sections = await result.renderSections(args.tokens, args.sectionCount, args.overlap);
236
+ console.log(sections.length);
229
237
  for (let i = 0; i < sections.length; i++) {
230
238
  const section = sections[i];
239
+ const isBm25 = sections[i].isBm25;
240
+ console.log(isBm25);
231
241
  console.log(Colorize.title(args.sectionCount == 1 ? 'Section' : `Section ${i + 1}`));
232
242
  console.log(Colorize.value('score', section.score));
233
243
  console.log(Colorize.value('tokens', section.tokenCount));
234
- console.log(Colorize.output(section.text));
244
+ console.log(Colorize.output(section.text, isBm25));
235
245
  }
236
246
  } else if (args.format == 'chunks') {
237
247
  const text = await result.loadText();
@@ -239,11 +249,12 @@ export async function run() {
239
249
  const chunk = result.chunks[i];
240
250
  const startPos = chunk.item.metadata.startPos;
241
251
  const endPos = chunk.item.metadata.endPos;
252
+ const isBm25 = Boolean(chunk.item.metadata.isBm25);
242
253
  console.log(Colorize.title(`Chunk ${i + 1}`));
243
254
  console.log(Colorize.value('score', chunk.score));
244
255
  console.log(Colorize.value('startPos', startPos));
245
256
  console.log(Colorize.value('endPos', endPos));
246
- console.log(Colorize.output(text.substring(startPos, endPos + 1)));
257
+ console.log(Colorize.output(text.substring(startPos, endPos + 1), isBm25));
247
258
  }
248
259
  }
249
260
  }
@@ -1,5 +0,0 @@
1
- import { TextFetcher } from './types';
2
- export declare class FileFetcher implements TextFetcher {
3
- fetch(uri: string, onDocument: (uri: string, text: string, docType?: string | undefined) => Promise<boolean>): Promise<boolean>;
4
- }
5
- //# sourceMappingURL=FileFetcher.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"FileFetcher.d.ts","sourceRoot":"","sources":["../src/FileFetcher.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AAItC,qBAAa,WAAY,YAAW,WAAW;IAC9B,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,MAAM,GAAG,SAAS,KAAK,OAAO,CAAC,OAAO,CAAC,GAAG,OAAO,CAAC,OAAO,CAAC;CAyB/I"}