vectra 0.7.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -10
- package/package.json +7 -6
- package/src/ItemSelector.ts +7 -1
- package/src/LocalDocumentIndex.ts +10 -4
- package/src/LocalDocumentResult.ts +70 -22
- package/src/LocalIndex.ts +77 -10
- package/src/TextSplitter.ts +10 -15
- package/src/internals/Colorize.ts +3 -3
- package/src/internals/wink-bm25-text-search.d.ts +4 -0
- package/src/types.ts +2 -1
- package/src/vectra-cli.ts +13 -2
- package/lib/FileFetcher.d.ts +0 -5
- package/lib/FileFetcher.d.ts.map +0 -1
- package/lib/FileFetcher.js +0 -69
- package/lib/FileFetcher.js.map +0 -1
- package/lib/GPT3Tokenizer.d.ts +0 -9
- package/lib/GPT3Tokenizer.d.ts.map +0 -1
- package/lib/GPT3Tokenizer.js +0 -17
- package/lib/GPT3Tokenizer.js.map +0 -1
- package/lib/ItemSelector.d.ts +0 -41
- package/lib/ItemSelector.d.ts.map +0 -1
- package/lib/ItemSelector.js +0 -162
- package/lib/ItemSelector.js.map +0 -1
- package/lib/LocalDocument.d.ts +0 -54
- package/lib/LocalDocument.d.ts.map +0 -1
- package/lib/LocalDocument.js +0 -146
- package/lib/LocalDocument.js.map +0 -1
- package/lib/LocalDocumentIndex.d.ts +0 -128
- package/lib/LocalDocumentIndex.d.ts.map +0 -1
- package/lib/LocalDocumentIndex.js +0 -446
- package/lib/LocalDocumentIndex.js.map +0 -1
- package/lib/LocalDocumentResult.d.ts +0 -45
- package/lib/LocalDocumentResult.d.ts.map +0 -1
- package/lib/LocalDocumentResult.js +0 -282
- package/lib/LocalDocumentResult.js.map +0 -1
- package/lib/LocalIndex.d.ts +0 -136
- package/lib/LocalIndex.d.ts.map +0 -1
- package/lib/LocalIndex.js +0 -413
- package/lib/LocalIndex.js.map +0 -1
- package/lib/OpenAIEmbeddings.d.ts +0 -126
- package/lib/OpenAIEmbeddings.d.ts.map +0 -1
- package/lib/OpenAIEmbeddings.js +0 -174
- package/lib/OpenAIEmbeddings.js.map +0 -1
- package/lib/TextSplitter.d.ts +0 -20
- package/lib/TextSplitter.d.ts.map +0 -1
- package/lib/TextSplitter.js +0 -543
- package/lib/TextSplitter.js.map +0 -1
- package/lib/WebFetcher.d.ts +0 -15
- package/lib/WebFetcher.d.ts.map +0 -1
- package/lib/WebFetcher.js +0 -224
- package/lib/WebFetcher.js.map +0 -1
- package/lib/index.d.ts +0 -12
- package/lib/index.d.ts.map +0 -1
- package/lib/index.js +0 -28
- package/lib/index.js.map +0 -1
- package/lib/internals/Colorize.d.ts +0 -14
- package/lib/internals/Colorize.d.ts.map +0 -1
- package/lib/internals/Colorize.js +0 -64
- package/lib/internals/Colorize.js.map +0 -1
- package/lib/internals/index.d.ts +0 -3
- package/lib/internals/index.d.ts.map +0 -1
- package/lib/internals/index.js +0 -19
- package/lib/internals/index.js.map +0 -1
- package/lib/internals/types.d.ts +0 -43
- package/lib/internals/types.d.ts.map +0 -1
- package/lib/internals/types.js +0 -3
- package/lib/internals/types.js.map +0 -1
- package/lib/types.d.ts +0 -145
- package/lib/types.d.ts.map +0 -1
- package/lib/types.js +0 -3
- package/lib/types.js.map +0 -1
- package/lib/vectra-cli.d.ts +0 -2
- package/lib/vectra-cli.d.ts.map +0 -1
- package/lib/vectra-cli.js +0 -303
- package/lib/vectra-cli.js.map +0 -1
package/README.md
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# Vectra
|
|
2
|
-
|
|
2
|
+
|
|
3
|
+
Vectra is a local vector database for Node.js with features similar to [Pinecone](https://www.pinecone.io/) or [Qdrant](https://qdrant.tech/) but built using local files. Each Vectra index is a folder on disk. There's an `index.json` file in the folder that contains all the vectors for the index along with any indexed metadata. When you create an index you can specify which metadata properties to index and only those fields will be stored in the `index.json` file. All of the other metadata for an item will be stored on disk in a separate file keyed by a GUID.
|
|
3
4
|
|
|
4
5
|
When querying Vectra you'll be able to use the same subset of [MongoDB query operators](https://www.mongodb.com/docs/manual/reference/operator/query/) that Pinecone supports and the results will be returned sorted by similarity. Every item in the index will first be filtered by metadata and then ranked for similarity. Even though every item is evaluated, it's all in memory so it should be nearly instantaneous. Likely 1ms - 2ms for even a rather large index. Smaller indexes should be <1ms.
|
|
5
6
|
|
|
@@ -8,9 +9,10 @@ Keep in mind that your entire Vectra index is loaded into memory so it's not wel
|
|
|
8
9
|
Pinecone style namespaces aren't directly supported but you could easily mimic them by creating a separate Vectra index (and folder) for each namespace.
|
|
9
10
|
|
|
10
11
|
## Other Language Bindings
|
|
12
|
+
|
|
11
13
|
This repo contains the TypeScript/JavaScript binding for Vectra but other language bindings are being created. Since Vectra is file based, any language binding can be used to read or write a Vectra index. That means you can build a Vectra index using JS and then read it using Python.
|
|
12
14
|
|
|
13
|
-
-
|
|
15
|
+
- [vectra-py](https://github.com/BMS-geodev/vectra-py) - Python version of Vectra.
|
|
14
16
|
|
|
15
17
|
## Installation
|
|
16
18
|
|
|
@@ -31,7 +33,7 @@ const index = new LocalIndex(path.join(__dirname, '..', 'index'));
|
|
|
31
33
|
Next, from inside an async function, create your index:
|
|
32
34
|
|
|
33
35
|
```typescript
|
|
34
|
-
if (!await index.isIndexCreated()) {
|
|
36
|
+
if (!(await index.isIndexCreated())) {
|
|
35
37
|
await index.createIndex();
|
|
36
38
|
}
|
|
37
39
|
```
|
|
@@ -39,26 +41,24 @@ if (!await index.isIndexCreated()) {
|
|
|
39
41
|
Add some items to your index:
|
|
40
42
|
|
|
41
43
|
```typescript
|
|
42
|
-
import {
|
|
44
|
+
import { OpenAI } from 'openai';
|
|
43
45
|
|
|
44
|
-
const
|
|
46
|
+
const openai = new OpenAI({
|
|
45
47
|
apiKey: `<YOUR_KEY>`,
|
|
46
48
|
});
|
|
47
49
|
|
|
48
|
-
const api = new OpenAIApi(configuration);
|
|
49
|
-
|
|
50
50
|
async function getVector(text: string) {
|
|
51
|
-
const response = await
|
|
51
|
+
const response = await openai.embeddings.create({
|
|
52
52
|
'model': 'text-embedding-ada-002',
|
|
53
53
|
'input': text,
|
|
54
54
|
});
|
|
55
|
-
return response.data
|
|
55
|
+
return response.data[0].embedding;
|
|
56
56
|
}
|
|
57
57
|
|
|
58
58
|
async function addItem(text: string) {
|
|
59
59
|
await index.insertItem({
|
|
60
60
|
vector: await getVector(text),
|
|
61
|
-
metadata: { text }
|
|
61
|
+
metadata: { text },
|
|
62
62
|
});
|
|
63
63
|
}
|
|
64
64
|
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "vectra",
|
|
3
3
|
"author": "Steven Ickman",
|
|
4
4
|
"description": "A vector database that uses the local file system for storage.",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.10.0",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"keywords": [
|
|
8
8
|
"gpt"
|
|
@@ -35,14 +35,14 @@
|
|
|
35
35
|
"openai": "^3.2.1",
|
|
36
36
|
"turndown": "^7.1.2",
|
|
37
37
|
"uuid": "^9.0.0",
|
|
38
|
+
"wink-nlp": "^2.3.2",
|
|
38
39
|
"yargs": "^17.7.2"
|
|
39
40
|
},
|
|
40
|
-
"resolutions": {
|
|
41
|
-
},
|
|
41
|
+
"resolutions": {},
|
|
42
42
|
"devDependencies": {
|
|
43
|
-
"@types/node": "^14.14.31",
|
|
44
|
-
"@types/mocha": "^8.2.0",
|
|
45
43
|
"@types/assert": "^1.5.3",
|
|
44
|
+
"@types/mocha": "^8.2.0",
|
|
45
|
+
"@types/node": "^14.14.31",
|
|
46
46
|
"@types/turndown": "^5.0.1",
|
|
47
47
|
"@types/uuid": "9.0.1",
|
|
48
48
|
"@types/yargs": "17.0.24",
|
|
@@ -50,7 +50,8 @@
|
|
|
50
50
|
"nyc": "^15.1.0",
|
|
51
51
|
"shx": "^0.3.2",
|
|
52
52
|
"ts-mocha": "10.0.0",
|
|
53
|
-
"typescript": "^4.2.3"
|
|
53
|
+
"typescript": "^4.2.3",
|
|
54
|
+
"wink-bm25-text-search": "^3.1.2"
|
|
54
55
|
},
|
|
55
56
|
"scripts": {
|
|
56
57
|
"build": "tsc -b",
|
package/src/ItemSelector.ts
CHANGED
|
@@ -149,7 +149,13 @@ export class ItemSelector {
|
|
|
149
149
|
}
|
|
150
150
|
break;
|
|
151
151
|
case '$nin':
|
|
152
|
-
if (typeof value == 'boolean'
|
|
152
|
+
if (typeof value == 'boolean') {
|
|
153
|
+
return false;
|
|
154
|
+
}
|
|
155
|
+
else if (typeof value == 'string' && filter[key]!.includes(value)) {
|
|
156
|
+
return false;
|
|
157
|
+
}
|
|
158
|
+
else if (filter[key]!.some(val => typeof val == 'string' && val.includes(value as string))) {
|
|
153
159
|
return false;
|
|
154
160
|
}
|
|
155
161
|
break;
|
|
@@ -30,6 +30,12 @@ export interface DocumentQueryOptions {
|
|
|
30
30
|
* Optional. Filter to apply to the document metadata.
|
|
31
31
|
*/
|
|
32
32
|
filter?: MetadataFilter;
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* Optional. Turn on bm25 keyword search to perform hybrid search - semantic + keyword
|
|
36
|
+
*/
|
|
37
|
+
isBm25?: boolean;
|
|
38
|
+
|
|
33
39
|
}
|
|
34
40
|
|
|
35
41
|
/**
|
|
@@ -60,7 +66,7 @@ export interface LocalDocumentIndexConfig {
|
|
|
60
66
|
/**
|
|
61
67
|
* Represents a local index of documents stored on disk.
|
|
62
68
|
*/
|
|
63
|
-
export class LocalDocumentIndex extends LocalIndex {
|
|
69
|
+
export class LocalDocumentIndex extends LocalIndex<DocumentChunkMetadata> {
|
|
64
70
|
private readonly _embeddings?: EmbeddingsModel;
|
|
65
71
|
private readonly _tokenizer: Tokenizer;
|
|
66
72
|
private readonly _chunkingConfig?: TextSplitterConfig;
|
|
@@ -158,7 +164,7 @@ export class LocalDocumentIndex extends LocalIndex {
|
|
|
158
164
|
await this.beginUpdate();
|
|
159
165
|
try {
|
|
160
166
|
// Get list of chunks for document
|
|
161
|
-
const chunks = await this.listItemsByMetadata
|
|
167
|
+
const chunks = await this.listItemsByMetadata({ documentId });
|
|
162
168
|
|
|
163
169
|
// Delete chunks
|
|
164
170
|
for (const chunk of chunks) {
|
|
@@ -326,7 +332,7 @@ export class LocalDocumentIndex extends LocalIndex {
|
|
|
326
332
|
public async listDocuments(): Promise<LocalDocumentResult[]> {
|
|
327
333
|
// Sort chunks by document ID
|
|
328
334
|
const docs: { [documentId: string]: QueryResult<DocumentChunkMetadata>[]; } = {};
|
|
329
|
-
const chunks = await this.listItems
|
|
335
|
+
const chunks = await this.listItems();
|
|
330
336
|
chunks.forEach(chunk => {
|
|
331
337
|
const metadata = chunk.metadata;
|
|
332
338
|
if (docs[metadata.documentId] == undefined) {
|
|
@@ -378,7 +384,7 @@ export class LocalDocumentIndex extends LocalIndex {
|
|
|
378
384
|
}
|
|
379
385
|
|
|
380
386
|
// Query index for chunks
|
|
381
|
-
const results = await this.queryItems
|
|
387
|
+
const results = await this.queryItems(embeddings.output![0], query, options.maxChunks!, options.filter, options.isBm25);
|
|
382
388
|
|
|
383
389
|
// Group chunks by document
|
|
384
390
|
const documentChunks: { [documentId: string]: QueryResult<DocumentChunkMetadata>[]; } = {};
|
|
@@ -66,7 +66,8 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
66
66
|
startPos: startPos + offset,
|
|
67
67
|
endPos: startPos + offset + chunkLength - 1,
|
|
68
68
|
score: chunk.score,
|
|
69
|
-
tokenCount: chunkLength
|
|
69
|
+
tokenCount: chunkLength,
|
|
70
|
+
isBm25: false
|
|
70
71
|
});
|
|
71
72
|
offset += chunkLength;
|
|
72
73
|
}
|
|
@@ -103,7 +104,8 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
103
104
|
return {
|
|
104
105
|
text: text,
|
|
105
106
|
tokenCount: section.tokenCount,
|
|
106
|
-
score: section.score
|
|
107
|
+
score: section.score,
|
|
108
|
+
isBm25: false,
|
|
107
109
|
};
|
|
108
110
|
});
|
|
109
111
|
}
|
|
@@ -127,7 +129,8 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
127
129
|
return [{
|
|
128
130
|
text,
|
|
129
131
|
tokenCount: length,
|
|
130
|
-
score: 1.0
|
|
132
|
+
score: 1.0,
|
|
133
|
+
isBm25: false,
|
|
131
134
|
}];
|
|
132
135
|
}
|
|
133
136
|
|
|
@@ -148,7 +151,8 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
148
151
|
startPos,
|
|
149
152
|
endPos,
|
|
150
153
|
score: chunk.score,
|
|
151
|
-
tokenCount: this._tokenizer.encode(chunkText).length
|
|
154
|
+
tokenCount: this._tokenizer.encode(chunkText).length,
|
|
155
|
+
isBm25: Boolean(chunk.item.metadata.isBm25),
|
|
152
156
|
};
|
|
153
157
|
}).filter(chunk => chunk.tokenCount <= maxTokens).sort((a, b) => a.startPos - b.startPos);
|
|
154
158
|
|
|
@@ -163,36 +167,63 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
163
167
|
return [{
|
|
164
168
|
text: this._tokenizer.decode(tokens.slice(0, maxTokens)),
|
|
165
169
|
tokenCount: maxTokens,
|
|
166
|
-
score: topChunk.score
|
|
170
|
+
score: topChunk.score,
|
|
171
|
+
isBm25: false,
|
|
167
172
|
}];
|
|
168
173
|
}
|
|
169
174
|
|
|
170
|
-
// Generate sections
|
|
175
|
+
// Generate semantic sections
|
|
171
176
|
const sections: Section[] = [];
|
|
172
177
|
for (let i = 0; i < chunks.length; i++) {
|
|
173
178
|
const chunk = chunks[i];
|
|
174
179
|
let section = sections[sections.length - 1];
|
|
175
|
-
if (!
|
|
176
|
-
section
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
180
|
+
if (!chunk.isBm25) {
|
|
181
|
+
if (!section || section.tokenCount + chunk.tokenCount > maxTokens) {
|
|
182
|
+
section = {
|
|
183
|
+
chunks: [],
|
|
184
|
+
score: 0,
|
|
185
|
+
tokenCount: 0
|
|
186
|
+
};
|
|
187
|
+
sections.push(section);
|
|
188
|
+
}
|
|
189
|
+
section.chunks.push(chunk);
|
|
190
|
+
section.score += chunk.score;
|
|
191
|
+
section.tokenCount += chunk.tokenCount;
|
|
182
192
|
}
|
|
183
|
-
section.chunks.push(chunk);
|
|
184
|
-
section.score += chunk.score;
|
|
185
|
-
section.tokenCount += chunk.tokenCount;
|
|
186
193
|
}
|
|
187
194
|
|
|
195
|
+
// Generate bm25 sections
|
|
196
|
+
const bm25Sections: Section[] = [];
|
|
197
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
198
|
+
const chunk = chunks[i];
|
|
199
|
+
let section = bm25Sections[bm25Sections.length - 1];
|
|
200
|
+
if (chunk.isBm25) {
|
|
201
|
+
if (!section || section.tokenCount + chunk.tokenCount > maxTokens) {
|
|
202
|
+
section = {
|
|
203
|
+
chunks: [],
|
|
204
|
+
score: 0,
|
|
205
|
+
tokenCount: 0
|
|
206
|
+
};
|
|
207
|
+
bm25Sections.push(section);
|
|
208
|
+
}
|
|
209
|
+
section.chunks.push(chunk);
|
|
210
|
+
section.score += chunk.score;
|
|
211
|
+
section.tokenCount += chunk.tokenCount;
|
|
212
|
+
}
|
|
213
|
+
}
|
|
188
214
|
// Normalize section scores
|
|
189
215
|
sections.forEach(section => section.score /= section.chunks.length);
|
|
216
|
+
bm25Sections.forEach(section => section.score /= section.chunks.length);
|
|
190
217
|
|
|
191
218
|
// Sort sections by score and limit to maxSections
|
|
192
219
|
sections.sort((a, b) => b.score - a.score);
|
|
220
|
+
bm25Sections.sort((a, b) => b.score - a.score);
|
|
193
221
|
if (sections.length > maxSections) {
|
|
194
222
|
sections.splice(maxSections, sections.length - maxSections);
|
|
195
223
|
}
|
|
224
|
+
if (bm25Sections.length > maxSections) {
|
|
225
|
+
bm25Sections.splice(maxSections, bm25Sections.length - maxSections);
|
|
226
|
+
}
|
|
196
227
|
|
|
197
228
|
// Combine adjacent chunks of text
|
|
198
229
|
sections.forEach(section => {
|
|
@@ -216,7 +247,8 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
216
247
|
startPos: -1,
|
|
217
248
|
endPos: -1,
|
|
218
249
|
score: 0,
|
|
219
|
-
tokenCount: this._tokenizer.encode('\n\n...\n\n').length
|
|
250
|
+
tokenCount: this._tokenizer.encode('\n\n...\n\n').length,
|
|
251
|
+
isBm25: false,
|
|
220
252
|
};
|
|
221
253
|
sections.forEach(section => {
|
|
222
254
|
// Insert connectors between chunks
|
|
@@ -242,7 +274,8 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
242
274
|
startPos: sectionStart - beforeBudget,
|
|
243
275
|
endPos: sectionStart - 1,
|
|
244
276
|
score: 0,
|
|
245
|
-
tokenCount: beforeBudget
|
|
277
|
+
tokenCount: beforeBudget,
|
|
278
|
+
isBm25: false,
|
|
246
279
|
};
|
|
247
280
|
section.chunks.unshift(chunk);
|
|
248
281
|
section.tokenCount += chunk.tokenCount;
|
|
@@ -258,7 +291,8 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
258
291
|
startPos: sectionEnd + 1,
|
|
259
292
|
endPos: sectionEnd + afterBudget,
|
|
260
293
|
score: 0,
|
|
261
|
-
tokenCount: afterBudget
|
|
294
|
+
tokenCount: afterBudget,
|
|
295
|
+
isBm25: false,
|
|
262
296
|
};
|
|
263
297
|
section.chunks.push(chunk);
|
|
264
298
|
section.tokenCount += chunk.tokenCount;
|
|
@@ -268,16 +302,29 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
268
302
|
});
|
|
269
303
|
}
|
|
270
304
|
|
|
271
|
-
|
|
272
|
-
|
|
305
|
+
const semanticDocTextSections = sections.map(section => {
|
|
306
|
+
let text = '';
|
|
307
|
+
section.chunks.forEach(chunk => text += chunk.text);
|
|
308
|
+
return {
|
|
309
|
+
text: text,
|
|
310
|
+
tokenCount: section.tokenCount,
|
|
311
|
+
score: section.score,
|
|
312
|
+
isBm25: false,
|
|
313
|
+
};
|
|
314
|
+
});
|
|
315
|
+
const bm25DocTextSections = bm25Sections.map(section => {
|
|
273
316
|
let text = '';
|
|
274
317
|
section.chunks.forEach(chunk => text += chunk.text);
|
|
275
318
|
return {
|
|
276
319
|
text: text,
|
|
277
320
|
tokenCount: section.tokenCount,
|
|
278
|
-
score: section.score
|
|
321
|
+
score: section.score,
|
|
322
|
+
isBm25: true,
|
|
279
323
|
};
|
|
280
324
|
});
|
|
325
|
+
|
|
326
|
+
// Return final rendered sections
|
|
327
|
+
return [...semanticDocTextSections, ...bm25DocTextSections];
|
|
281
328
|
}
|
|
282
329
|
|
|
283
330
|
private encodeBeforeText(text: string, budget: number): number[] {
|
|
@@ -300,6 +347,7 @@ interface SectionChunk {
|
|
|
300
347
|
endPos: number;
|
|
301
348
|
score: number;
|
|
302
349
|
tokenCount: number;
|
|
350
|
+
isBm25: boolean;
|
|
303
351
|
}
|
|
304
352
|
|
|
305
353
|
interface Section {
|
package/src/LocalIndex.ts
CHANGED
|
@@ -3,7 +3,11 @@ import * as path from 'path';
|
|
|
3
3
|
import { v4 } from 'uuid';
|
|
4
4
|
import { ItemSelector } from './ItemSelector';
|
|
5
5
|
import { IndexItem, IndexStats, MetadataFilter, MetadataTypes, QueryResult } from './types';
|
|
6
|
-
|
|
6
|
+
import { LocalDocument } from './LocalDocument';
|
|
7
|
+
import { LocalDocumentIndex } from './LocalDocumentIndex';
|
|
8
|
+
import bm25 from 'wink-bm25-text-search';
|
|
9
|
+
import winkNLP from 'wink-nlp';
|
|
10
|
+
import model from 'wink-eng-lite-web-model';
|
|
7
11
|
export interface CreateIndexConfig {
|
|
8
12
|
version: number;
|
|
9
13
|
deleteIfExists?: boolean;
|
|
@@ -18,12 +22,14 @@ export interface CreateIndexConfig {
|
|
|
18
22
|
* This class is used to create, update, and query a local vector index.
|
|
19
23
|
* Each index is a folder on disk containing an index.json file and an optional set of metadata files.
|
|
20
24
|
*/
|
|
21
|
-
export class LocalIndex {
|
|
25
|
+
export class LocalIndex<TMetadata extends Record<string,MetadataTypes> = Record<string,MetadataTypes>>{
|
|
22
26
|
private readonly _folderPath: string;
|
|
23
27
|
private readonly _indexName: string;
|
|
24
28
|
|
|
25
29
|
private _data?: IndexData;
|
|
26
30
|
private _update?: IndexData;
|
|
31
|
+
//member fields for BM25
|
|
32
|
+
private _bm25Engine: any;
|
|
27
33
|
|
|
28
34
|
/**
|
|
29
35
|
* Creates a new instance of LocalIndex.
|
|
@@ -177,7 +183,7 @@ export class LocalIndex {
|
|
|
177
183
|
* @param id ID of the item to retrieve.
|
|
178
184
|
* @returns Item or undefined if not found.
|
|
179
185
|
*/
|
|
180
|
-
public async getItem<TMetadata =
|
|
186
|
+
public async getItem<TItemMetadata extends TMetadata = TMetadata>(id: string): Promise<IndexItem<TItemMetadata> | undefined> {
|
|
181
187
|
await this.loadIndexData();
|
|
182
188
|
return this._data!.items.find(i => i.id === id) as any | undefined;
|
|
183
189
|
}
|
|
@@ -190,7 +196,7 @@ export class LocalIndex {
|
|
|
190
196
|
* @param item Item to insert.
|
|
191
197
|
* @returns Inserted item.
|
|
192
198
|
*/
|
|
193
|
-
public async insertItem<TMetadata =
|
|
199
|
+
public async insertItem<TItemMetadata extends TMetadata = TMetadata>(item: Partial<IndexItem<TItemMetadata>>): Promise<IndexItem<TItemMetadata>> {
|
|
194
200
|
if (this._update) {
|
|
195
201
|
return await this.addItemToUpdate(item, true) as any;
|
|
196
202
|
} else {
|
|
@@ -220,7 +226,7 @@ export class LocalIndex {
|
|
|
220
226
|
* array is returned so no modifications should be made to the array.
|
|
221
227
|
* @returns Array of all items in the index.
|
|
222
228
|
*/
|
|
223
|
-
public async listItems<TMetadata =
|
|
229
|
+
public async listItems<TItemMetadata extends TMetadata = TMetadata>(): Promise<IndexItem<TItemMetadata>[]> {
|
|
224
230
|
await this.loadIndexData();
|
|
225
231
|
return this._data!.items.slice() as any;
|
|
226
232
|
}
|
|
@@ -232,7 +238,7 @@ export class LocalIndex {
|
|
|
232
238
|
* @param filter Filter to apply.
|
|
233
239
|
* @returns Array of items matching the filter.
|
|
234
240
|
*/
|
|
235
|
-
public async listItemsByMetadata<TMetadata =
|
|
241
|
+
public async listItemsByMetadata<TItemMetadata extends TMetadata = TMetadata>(filter: MetadataFilter): Promise<IndexItem<TItemMetadata>[]> {
|
|
236
242
|
await this.loadIndexData();
|
|
237
243
|
return this._data!.items.filter(i => ItemSelector.select(i.metadata, filter)) as any;
|
|
238
244
|
}
|
|
@@ -247,7 +253,7 @@ export class LocalIndex {
|
|
|
247
253
|
* @param filter Optional. Filter to apply.
|
|
248
254
|
* @returns Similar items to the vector that matche the supplied filter.
|
|
249
255
|
*/
|
|
250
|
-
public async queryItems<TMetadata =
|
|
256
|
+
public async queryItems<TItemMetadata extends TMetadata = TMetadata>(vector: number[], query: string, topK: number, filter?: MetadataFilter, isBm25?: boolean): Promise<QueryResult<TItemMetadata>[]> {
|
|
251
257
|
await this.loadIndexData();
|
|
252
258
|
|
|
253
259
|
// Filter items
|
|
@@ -269,7 +275,7 @@ export class LocalIndex {
|
|
|
269
275
|
distances.sort((a, b) => b.distance - a.distance);
|
|
270
276
|
|
|
271
277
|
// Find top k
|
|
272
|
-
const top: QueryResult<
|
|
278
|
+
const top: QueryResult<TItemMetadata>[] = distances.slice(0, topK).map(d => {
|
|
273
279
|
return {
|
|
274
280
|
item: Object.assign({}, items[d.index]) as any,
|
|
275
281
|
score: d.distance
|
|
@@ -285,6 +291,36 @@ export class LocalIndex {
|
|
|
285
291
|
}
|
|
286
292
|
}
|
|
287
293
|
|
|
294
|
+
//Peform bm25 search only if enabled. Avoid duplicate chunks, which are already selected during semantic search.
|
|
295
|
+
if (isBm25) {
|
|
296
|
+
const itemSet = new Set();
|
|
297
|
+
for (const item of top) itemSet.add(item.item.id);
|
|
298
|
+
|
|
299
|
+
this.setupbm25();
|
|
300
|
+
|
|
301
|
+
let currDoc;
|
|
302
|
+
let currDocTxt;
|
|
303
|
+
for (let i = 0; i < items.length; i++) {
|
|
304
|
+
if (!itemSet.has(items[i].id)) {
|
|
305
|
+
const item = items[i];
|
|
306
|
+
currDoc = new LocalDocument((this as unknown) as LocalDocumentIndex, item.metadata.documentId.toString(), '');
|
|
307
|
+
currDocTxt = await currDoc.loadText();
|
|
308
|
+
const startPos = item.metadata.startPos;
|
|
309
|
+
const endPos = item.metadata.endPos;
|
|
310
|
+
const chunkText = currDocTxt.substring(Number(startPos), Number(endPos) + 1);
|
|
311
|
+
this._bm25Engine.addDoc({body: chunkText}, i);
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
this._bm25Engine.consolidate();
|
|
315
|
+
var results = await this.bm25Search(query, items, topK);
|
|
316
|
+
results.forEach((res: any) => {
|
|
317
|
+
top.push({
|
|
318
|
+
item: Object.assign({}, {...items[res[0]], metadata: {...items[res[0]].metadata, isBm25: true}}) as any,
|
|
319
|
+
score: res[1]
|
|
320
|
+
});
|
|
321
|
+
});
|
|
322
|
+
|
|
323
|
+
}
|
|
288
324
|
return top;
|
|
289
325
|
}
|
|
290
326
|
|
|
@@ -296,7 +332,7 @@ export class LocalIndex {
|
|
|
296
332
|
* @param item Item to insert or replace.
|
|
297
333
|
* @returns Upserted item.
|
|
298
334
|
*/
|
|
299
|
-
public async upsertItem<TMetadata =
|
|
335
|
+
public async upsertItem<TItemMetadata extends TMetadata = TMetadata>(item: Partial<IndexItem<TItemMetadata>>): Promise<IndexItem<TItemMetadata>> {
|
|
300
336
|
if (this._update) {
|
|
301
337
|
return await this.addItemToUpdate(item, false) as any;
|
|
302
338
|
} else {
|
|
@@ -350,7 +386,7 @@ export class LocalIndex {
|
|
|
350
386
|
}
|
|
351
387
|
|
|
352
388
|
// Save remaining metadata to disk
|
|
353
|
-
metadataFile = `${v4}.json`;
|
|
389
|
+
metadataFile = `${v4()}.json`;
|
|
354
390
|
const metadataPath = path.join(this._folderPath, metadataFile);
|
|
355
391
|
await fs.writeFile(metadataPath, JSON.stringify(item.metadata));
|
|
356
392
|
} else if (item.metadata) {
|
|
@@ -385,6 +421,37 @@ export class LocalIndex {
|
|
|
385
421
|
return newItem;
|
|
386
422
|
}
|
|
387
423
|
}
|
|
424
|
+
|
|
425
|
+
private async setupbm25(): Promise<any> {
|
|
426
|
+
this._bm25Engine = bm25();
|
|
427
|
+
const nlp = winkNLP( model );
|
|
428
|
+
const its = nlp.its;
|
|
429
|
+
|
|
430
|
+
const prepTask = function ( text: string ) {
|
|
431
|
+
const tokens: any[] = [];
|
|
432
|
+
nlp.readDoc(text)
|
|
433
|
+
.tokens()
|
|
434
|
+
// Use only words ignoring punctuations etc and from them remove stop words
|
|
435
|
+
.filter( (t: any) => ( t.out(its.type) === 'word' && !t.out(its.stopWordFlag) ) )
|
|
436
|
+
// Handle negation and extract stem of the word
|
|
437
|
+
.each( (t: any) => tokens.push( (t.out(its.negationFlag)) ? '!' + t.out(its.stem) : t.out(its.stem) ) );
|
|
438
|
+
|
|
439
|
+
return tokens;
|
|
440
|
+
};
|
|
441
|
+
|
|
442
|
+
this._bm25Engine.defineConfig( { fldWeights: { body: 1 } } );
|
|
443
|
+
// Step II: Define PrepTasks pipe.
|
|
444
|
+
this._bm25Engine.definePrepTasks( [ prepTask ] );
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
private async bm25Search(searchQuery: string, items: any, topK: number): Promise<any> {
|
|
448
|
+
var query = searchQuery;
|
|
449
|
+
// `results` is an array of [ doc-id, score ], sorted by score
|
|
450
|
+
var results = this._bm25Engine.search( query );
|
|
451
|
+
|
|
452
|
+
return results.slice(0, topK);
|
|
453
|
+
}
|
|
454
|
+
|
|
388
455
|
}
|
|
389
456
|
|
|
390
457
|
interface IndexData {
|
package/src/TextSplitter.ts
CHANGED
|
@@ -178,23 +178,18 @@ export class TextSplitter {
|
|
|
178
178
|
}
|
|
179
179
|
|
|
180
180
|
private splitBySpaces(text: string): string[] {
|
|
181
|
+
// Split text by tokens and return parts
|
|
181
182
|
const parts: string[] = [];
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
parts.push(part);
|
|
191
|
-
part = nextWord;
|
|
192
|
-
}
|
|
183
|
+
let tokens = this._config.tokenizer.encode(text);
|
|
184
|
+
do {
|
|
185
|
+
if (tokens.length <= this._config.chunkSize) {
|
|
186
|
+
parts.push(this._config.tokenizer.decode(tokens));
|
|
187
|
+
break;
|
|
188
|
+
} else {
|
|
189
|
+
const span = tokens.splice(0, this._config.chunkSize);
|
|
190
|
+
parts.push(this._config.tokenizer.decode(span));
|
|
193
191
|
}
|
|
194
|
-
|
|
195
|
-
} else {
|
|
196
|
-
parts.push(text);
|
|
197
|
-
}
|
|
192
|
+
} while (true);
|
|
198
193
|
|
|
199
194
|
return parts;
|
|
200
195
|
}
|
|
@@ -16,9 +16,9 @@ export class Colorize {
|
|
|
16
16
|
}
|
|
17
17
|
}
|
|
18
18
|
|
|
19
|
-
public static output(output: object | string, quote: string = '', units: string = ''): string {
|
|
19
|
+
public static output(output: object | string, isBm25: boolean = false, quote: string = '', units: string = ''): string {
|
|
20
20
|
if (typeof output === 'string') {
|
|
21
|
-
return `\x1b[32m${quote}${output}${quote}\x1b[0m`;
|
|
21
|
+
return isBm25 ? `\x1b[34m${quote}${output}${quote}\x1b[0m` : `\x1b[32m${quote}${output}${quote}\x1b[0m`;
|
|
22
22
|
} else if (typeof output === 'object' && output !== null) {
|
|
23
23
|
return colorizer(output, {
|
|
24
24
|
pretty: true,
|
|
@@ -54,7 +54,7 @@ export class Colorize {
|
|
|
54
54
|
}
|
|
55
55
|
|
|
56
56
|
public static value(field: string, value: any, units: string = ''): string {
|
|
57
|
-
return `${field}: ${Colorize.output(value, '"', units)}`;
|
|
57
|
+
return `${field}: ${Colorize.output(value, false, '"', units)}`;
|
|
58
58
|
}
|
|
59
59
|
|
|
60
60
|
public static warning(warning: string): string {
|
package/src/types.ts
CHANGED
|
@@ -24,7 +24,7 @@ export interface EmbeddingsModel {
|
|
|
24
24
|
* `error` - An error occurred while creating the embeddings.
|
|
25
25
|
* `rate_limited` - The request was rate limited.
|
|
26
26
|
*/
|
|
27
|
-
export type EmbeddingsResponseStatus = 'success' | 'error' | 'rate_limited';
|
|
27
|
+
export type EmbeddingsResponseStatus = 'success' | 'error' | 'rate_limited' | 'cancelled';
|
|
28
28
|
|
|
29
29
|
/**
|
|
30
30
|
* Response returned by a `EmbeddingsClient`.
|
|
@@ -172,4 +172,5 @@ export interface DocumentTextSection {
|
|
|
172
172
|
text: string;
|
|
173
173
|
tokenCount: number;
|
|
174
174
|
score: number;
|
|
175
|
+
isBm25: boolean;
|
|
175
176
|
}
|
package/src/vectra-cli.ts
CHANGED
|
@@ -191,6 +191,12 @@ export async function run() {
|
|
|
191
191
|
type: 'boolean',
|
|
192
192
|
default: true
|
|
193
193
|
})
|
|
194
|
+
.option('bm25', {
|
|
195
|
+
alias: 'b',
|
|
196
|
+
describe: 'Use Okapi-bm25 keyword search alogrithm to perform hybrid search - semantic + keyword. Displayed in blue during search.',
|
|
197
|
+
type: 'boolean',
|
|
198
|
+
default: false
|
|
199
|
+
})
|
|
194
200
|
.demandOption(['keys']);
|
|
195
201
|
}, async (args) => {
|
|
196
202
|
console.log(Colorize.title('Querying Index'));
|
|
@@ -217,6 +223,7 @@ export async function run() {
|
|
|
217
223
|
const results = await index.queryDocuments(query, {
|
|
218
224
|
maxDocuments: args.documentCount,
|
|
219
225
|
maxChunks: args.chunkCount,
|
|
226
|
+
isBm25: args.bm25 as boolean,
|
|
220
227
|
});
|
|
221
228
|
|
|
222
229
|
// Render results
|
|
@@ -226,12 +233,15 @@ export async function run() {
|
|
|
226
233
|
console.log(Colorize.value('chunks', result.chunks.length));
|
|
227
234
|
if (args.format == 'sections') {
|
|
228
235
|
const sections = await result.renderSections(args.tokens, args.sectionCount, args.overlap);
|
|
236
|
+
console.log(sections.length);
|
|
229
237
|
for (let i = 0; i < sections.length; i++) {
|
|
230
238
|
const section = sections[i];
|
|
239
|
+
const isBm25 = sections[i].isBm25;
|
|
240
|
+
console.log(isBm25);
|
|
231
241
|
console.log(Colorize.title(args.sectionCount == 1 ? 'Section' : `Section ${i + 1}`));
|
|
232
242
|
console.log(Colorize.value('score', section.score));
|
|
233
243
|
console.log(Colorize.value('tokens', section.tokenCount));
|
|
234
|
-
console.log(Colorize.output(section.text));
|
|
244
|
+
console.log(Colorize.output(section.text, isBm25));
|
|
235
245
|
}
|
|
236
246
|
} else if (args.format == 'chunks') {
|
|
237
247
|
const text = await result.loadText();
|
|
@@ -239,11 +249,12 @@ export async function run() {
|
|
|
239
249
|
const chunk = result.chunks[i];
|
|
240
250
|
const startPos = chunk.item.metadata.startPos;
|
|
241
251
|
const endPos = chunk.item.metadata.endPos;
|
|
252
|
+
const isBm25 = Boolean(chunk.item.metadata.isBm25);
|
|
242
253
|
console.log(Colorize.title(`Chunk ${i + 1}`));
|
|
243
254
|
console.log(Colorize.value('score', chunk.score));
|
|
244
255
|
console.log(Colorize.value('startPos', startPos));
|
|
245
256
|
console.log(Colorize.value('endPos', endPos));
|
|
246
|
-
console.log(Colorize.output(text.substring(startPos, endPos + 1)));
|
|
257
|
+
console.log(Colorize.output(text.substring(startPos, endPos + 1), isBm25));
|
|
247
258
|
}
|
|
248
259
|
}
|
|
249
260
|
}
|
package/lib/FileFetcher.d.ts
DELETED
|
@@ -1,5 +0,0 @@
|
|
|
1
|
-
import { TextFetcher } from './types';
|
|
2
|
-
export declare class FileFetcher implements TextFetcher {
|
|
3
|
-
fetch(uri: string, onDocument: (uri: string, text: string, docType?: string | undefined) => Promise<boolean>): Promise<boolean>;
|
|
4
|
-
}
|
|
5
|
-
//# sourceMappingURL=FileFetcher.d.ts.map
|
package/lib/FileFetcher.d.ts.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"FileFetcher.d.ts","sourceRoot":"","sources":["../src/FileFetcher.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AAItC,qBAAa,WAAY,YAAW,WAAW;IAC9B,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,MAAM,GAAG,SAAS,KAAK,OAAO,CAAC,OAAO,CAAC,GAAG,OAAO,CAAC,OAAO,CAAC;CAyB/I"}
|