vectra 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -10
- package/package.json +7 -6
- package/src/ItemSelector.ts +7 -1
- package/src/LocalDocumentIndex.ts +7 -1
- package/src/LocalDocumentResult.ts +70 -22
- package/src/LocalIndex.ts +70 -3
- package/src/internals/Colorize.ts +3 -3
- package/src/internals/wink-bm25-text-search.d.ts +4 -0
- package/src/types.ts +1 -0
- package/src/vectra-cli.ts +13 -2
- package/lib/FileFetcher.d.ts +0 -5
- package/lib/FileFetcher.d.ts.map +0 -1
- package/lib/FileFetcher.js +0 -69
- package/lib/FileFetcher.js.map +0 -1
- package/lib/GPT3Tokenizer.d.ts +0 -9
- package/lib/GPT3Tokenizer.d.ts.map +0 -1
- package/lib/GPT3Tokenizer.js +0 -17
- package/lib/GPT3Tokenizer.js.map +0 -1
- package/lib/ItemSelector.d.ts +0 -41
- package/lib/ItemSelector.d.ts.map +0 -1
- package/lib/ItemSelector.js +0 -162
- package/lib/ItemSelector.js.map +0 -1
- package/lib/LocalDocument.d.ts +0 -54
- package/lib/LocalDocument.d.ts.map +0 -1
- package/lib/LocalDocument.js +0 -146
- package/lib/LocalDocument.js.map +0 -1
- package/lib/LocalDocumentIndex.d.ts +0 -128
- package/lib/LocalDocumentIndex.d.ts.map +0 -1
- package/lib/LocalDocumentIndex.js +0 -446
- package/lib/LocalDocumentIndex.js.map +0 -1
- package/lib/LocalDocumentResult.d.ts +0 -45
- package/lib/LocalDocumentResult.d.ts.map +0 -1
- package/lib/LocalDocumentResult.js +0 -282
- package/lib/LocalDocumentResult.js.map +0 -1
- package/lib/LocalIndex.d.ts +0 -136
- package/lib/LocalIndex.d.ts.map +0 -1
- package/lib/LocalIndex.js +0 -413
- package/lib/LocalIndex.js.map +0 -1
- package/lib/OpenAIEmbeddings.d.ts +0 -126
- package/lib/OpenAIEmbeddings.d.ts.map +0 -1
- package/lib/OpenAIEmbeddings.js +0 -174
- package/lib/OpenAIEmbeddings.js.map +0 -1
- package/lib/TextSplitter.d.ts +0 -20
- package/lib/TextSplitter.d.ts.map +0 -1
- package/lib/TextSplitter.js +0 -537
- package/lib/TextSplitter.js.map +0 -1
- package/lib/WebFetcher.d.ts +0 -15
- package/lib/WebFetcher.d.ts.map +0 -1
- package/lib/WebFetcher.js +0 -224
- package/lib/WebFetcher.js.map +0 -1
- package/lib/index.d.ts +0 -12
- package/lib/index.d.ts.map +0 -1
- package/lib/index.js +0 -28
- package/lib/index.js.map +0 -1
- package/lib/internals/Colorize.d.ts +0 -14
- package/lib/internals/Colorize.d.ts.map +0 -1
- package/lib/internals/Colorize.js +0 -64
- package/lib/internals/Colorize.js.map +0 -1
- package/lib/internals/index.d.ts +0 -3
- package/lib/internals/index.d.ts.map +0 -1
- package/lib/internals/index.js +0 -19
- package/lib/internals/index.js.map +0 -1
- package/lib/internals/types.d.ts +0 -43
- package/lib/internals/types.d.ts.map +0 -1
- package/lib/internals/types.js +0 -3
- package/lib/internals/types.js.map +0 -1
- package/lib/types.d.ts +0 -145
- package/lib/types.d.ts.map +0 -1
- package/lib/types.js +0 -3
- package/lib/types.js.map +0 -1
- package/lib/vectra-cli.d.ts +0 -2
- package/lib/vectra-cli.d.ts.map +0 -1
- package/lib/vectra-cli.js +0 -303
- package/lib/vectra-cli.js.map +0 -1
package/README.md
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# Vectra
|
|
2
|
-
|
|
2
|
+
|
|
3
|
+
Vectra is a local vector database for Node.js with features similar to [Pinecone](https://www.pinecone.io/) or [Qdrant](https://qdrant.tech/) but built using local files. Each Vectra index is a folder on disk. There's an `index.json` file in the folder that contains all the vectors for the index along with any indexed metadata. When you create an index you can specify which metadata properties to index and only those fields will be stored in the `index.json` file. All of the other metadata for an item will be stored on disk in a separate file keyed by a GUID.
|
|
3
4
|
|
|
4
5
|
When querying Vectra you'll be able to use the same subset of [Mongo DB query operators](https://www.mongodb.com/docs/manual/reference/operator/query/) that Pinecone supports and the results will be returned sorted by similarity. Every item in the index will first be filtered by metadata and then ranked for similarity. Even though every item is evaluated, it's all in memory so it should be nearly instantaneous. Likely 1ms - 2ms for even a rather large index. Smaller indexes should be <1ms.
|
|
5
6
|
|
|
@@ -8,9 +9,10 @@ Keep in mind that your entire Vectra index is loaded into memory so it's not wel
|
|
|
8
9
|
Pinecone style namespaces aren't directly supported but you could easily mimic them by creating a separate Vectra index (and folder) for each namespace.
|
|
9
10
|
|
|
10
11
|
## Other Language Bindings
|
|
12
|
+
|
|
11
13
|
This repo contains the TypeScript/JavaScript binding for Vectra but other language bindings are being created. Since Vectra is file based, any language binding can be used to read or write a Vectra index. That means you can build a Vectra index using JS and then read it using Python.
|
|
12
14
|
|
|
13
|
-
-
|
|
15
|
+
- [vectra-py](https://github.com/BMS-geodev/vectra-py) - Python version of Vectra.
|
|
14
16
|
|
|
15
17
|
## Installation
|
|
16
18
|
|
|
@@ -31,7 +33,7 @@ const index = new LocalIndex(path.join(__dirname, '..', 'index'));
|
|
|
31
33
|
Next, from inside an async function, create your index:
|
|
32
34
|
|
|
33
35
|
```typescript
|
|
34
|
-
if (!await index.isIndexCreated()) {
|
|
36
|
+
if (!(await index.isIndexCreated())) {
|
|
35
37
|
await index.createIndex();
|
|
36
38
|
}
|
|
37
39
|
```
|
|
@@ -39,26 +41,24 @@ if (!await index.isIndexCreated()) {
|
|
|
39
41
|
Add some items to your index:
|
|
40
42
|
|
|
41
43
|
```typescript
|
|
42
|
-
import {
|
|
44
|
+
import { OpenAI } from 'openai';
|
|
43
45
|
|
|
44
|
-
const
|
|
46
|
+
const openai = new OpenAI({
|
|
45
47
|
apiKey: `<YOUR_KEY>`,
|
|
46
48
|
});
|
|
47
49
|
|
|
48
|
-
const api = new OpenAIApi(configuration);
|
|
49
|
-
|
|
50
50
|
async function getVector(text: string) {
|
|
51
|
-
const response = await
|
|
51
|
+
const response = await openai.embeddings.create({
|
|
52
52
|
'model': 'text-embedding-ada-002',
|
|
53
53
|
'input': text,
|
|
54
54
|
});
|
|
55
|
-
return response.data
|
|
55
|
+
return response.data[0].embedding;
|
|
56
56
|
}
|
|
57
57
|
|
|
58
58
|
async function addItem(text: string) {
|
|
59
59
|
await index.insertItem({
|
|
60
60
|
vector: await getVector(text),
|
|
61
|
-
metadata: { text }
|
|
61
|
+
metadata: { text },
|
|
62
62
|
});
|
|
63
63
|
}
|
|
64
64
|
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "vectra",
|
|
3
3
|
"author": "Steven Ickman",
|
|
4
4
|
"description": "A vector database that uses the local file system for storage.",
|
|
5
|
-
"version": "0.9.0",
|
|
5
|
+
"version": "0.10.0",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"keywords": [
|
|
8
8
|
"gpt"
|
|
@@ -35,14 +35,14 @@
|
|
|
35
35
|
"openai": "^3.2.1",
|
|
36
36
|
"turndown": "^7.1.2",
|
|
37
37
|
"uuid": "^9.0.0",
|
|
38
|
+
"wink-nlp": "^2.3.2",
|
|
38
39
|
"yargs": "^17.7.2"
|
|
39
40
|
},
|
|
40
|
-
"resolutions": {
|
|
41
|
-
},
|
|
41
|
+
"resolutions": {},
|
|
42
42
|
"devDependencies": {
|
|
43
|
-
"@types/node": "^14.14.31",
|
|
44
|
-
"@types/mocha": "^8.2.0",
|
|
45
43
|
"@types/assert": "^1.5.3",
|
|
44
|
+
"@types/mocha": "^8.2.0",
|
|
45
|
+
"@types/node": "^14.14.31",
|
|
46
46
|
"@types/turndown": "^5.0.1",
|
|
47
47
|
"@types/uuid": "9.0.1",
|
|
48
48
|
"@types/yargs": "17.0.24",
|
|
@@ -50,7 +50,8 @@
|
|
|
50
50
|
"nyc": "^15.1.0",
|
|
51
51
|
"shx": "^0.3.2",
|
|
52
52
|
"ts-mocha": "10.0.0",
|
|
53
|
-
"typescript": "^4.2.3"
|
|
53
|
+
"typescript": "^4.2.3",
|
|
54
|
+
"wink-bm25-text-search": "^3.1.2"
|
|
54
55
|
},
|
|
55
56
|
"scripts": {
|
|
56
57
|
"build": "tsc -b",
|
package/src/ItemSelector.ts
CHANGED
|
@@ -149,7 +149,13 @@ export class ItemSelector {
|
|
|
149
149
|
}
|
|
150
150
|
break;
|
|
151
151
|
case '$nin':
|
|
152
|
-
if (typeof value == 'boolean'
|
|
152
|
+
if (typeof value == 'boolean') {
|
|
153
|
+
return false;
|
|
154
|
+
}
|
|
155
|
+
else if (typeof value == 'string' && filter[key]!.includes(value)) {
|
|
156
|
+
return false;
|
|
157
|
+
}
|
|
158
|
+
else if (filter[key]!.some(val => typeof val == 'string' && val.includes(value as string))) {
|
|
153
159
|
return false;
|
|
154
160
|
}
|
|
155
161
|
break;
|
|
@@ -30,6 +30,12 @@ export interface DocumentQueryOptions {
|
|
|
30
30
|
* Optional. Filter to apply to the document metadata.
|
|
31
31
|
*/
|
|
32
32
|
filter?: MetadataFilter;
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* Optional. Turn on bm25 keyword search to perform hybrid search - semantic + keyword
|
|
36
|
+
*/
|
|
37
|
+
isBm25?: boolean;
|
|
38
|
+
|
|
33
39
|
}
|
|
34
40
|
|
|
35
41
|
/**
|
|
@@ -378,7 +384,7 @@ export class LocalDocumentIndex extends LocalIndex<DocumentChunkMetadata> {
|
|
|
378
384
|
}
|
|
379
385
|
|
|
380
386
|
// Query index for chunks
|
|
381
|
-
const results = await this.queryItems(embeddings.output![0], options.maxChunks!, options.filter);
|
|
387
|
+
const results = await this.queryItems(embeddings.output![0], query, options.maxChunks!, options.filter, options.isBm25);
|
|
382
388
|
|
|
383
389
|
// Group chunks by document
|
|
384
390
|
const documentChunks: { [documentId: string]: QueryResult<DocumentChunkMetadata>[]; } = {};
|
|
@@ -66,7 +66,8 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
66
66
|
startPos: startPos + offset,
|
|
67
67
|
endPos: startPos + offset + chunkLength - 1,
|
|
68
68
|
score: chunk.score,
|
|
69
|
-
tokenCount: chunkLength
|
|
69
|
+
tokenCount: chunkLength,
|
|
70
|
+
isBm25: false
|
|
70
71
|
});
|
|
71
72
|
offset += chunkLength;
|
|
72
73
|
}
|
|
@@ -103,7 +104,8 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
103
104
|
return {
|
|
104
105
|
text: text,
|
|
105
106
|
tokenCount: section.tokenCount,
|
|
106
|
-
score: section.score
|
|
107
|
+
score: section.score,
|
|
108
|
+
isBm25: false,
|
|
107
109
|
};
|
|
108
110
|
});
|
|
109
111
|
}
|
|
@@ -127,7 +129,8 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
127
129
|
return [{
|
|
128
130
|
text,
|
|
129
131
|
tokenCount: length,
|
|
130
|
-
score: 1.0
|
|
132
|
+
score: 1.0,
|
|
133
|
+
isBm25: false,
|
|
131
134
|
}];
|
|
132
135
|
}
|
|
133
136
|
|
|
@@ -148,7 +151,8 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
148
151
|
startPos,
|
|
149
152
|
endPos,
|
|
150
153
|
score: chunk.score,
|
|
151
|
-
tokenCount: this._tokenizer.encode(chunkText).length
|
|
154
|
+
tokenCount: this._tokenizer.encode(chunkText).length,
|
|
155
|
+
isBm25: Boolean(chunk.item.metadata.isBm25),
|
|
152
156
|
};
|
|
153
157
|
}).filter(chunk => chunk.tokenCount <= maxTokens).sort((a, b) => a.startPos - b.startPos);
|
|
154
158
|
|
|
@@ -163,36 +167,63 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
163
167
|
return [{
|
|
164
168
|
text: this._tokenizer.decode(tokens.slice(0, maxTokens)),
|
|
165
169
|
tokenCount: maxTokens,
|
|
166
|
-
score: topChunk.score
|
|
170
|
+
score: topChunk.score,
|
|
171
|
+
isBm25: false,
|
|
167
172
|
}];
|
|
168
173
|
}
|
|
169
174
|
|
|
170
|
-
// Generate sections
|
|
175
|
+
// Generate semantic sections
|
|
171
176
|
const sections: Section[] = [];
|
|
172
177
|
for (let i = 0; i < chunks.length; i++) {
|
|
173
178
|
const chunk = chunks[i];
|
|
174
179
|
let section = sections[sections.length - 1];
|
|
175
|
-
if (!
|
|
176
|
-
section
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
180
|
+
if (!chunk.isBm25) {
|
|
181
|
+
if (!section || section.tokenCount + chunk.tokenCount > maxTokens) {
|
|
182
|
+
section = {
|
|
183
|
+
chunks: [],
|
|
184
|
+
score: 0,
|
|
185
|
+
tokenCount: 0
|
|
186
|
+
};
|
|
187
|
+
sections.push(section);
|
|
188
|
+
}
|
|
189
|
+
section.chunks.push(chunk);
|
|
190
|
+
section.score += chunk.score;
|
|
191
|
+
section.tokenCount += chunk.tokenCount;
|
|
182
192
|
}
|
|
183
|
-
section.chunks.push(chunk);
|
|
184
|
-
section.score += chunk.score;
|
|
185
|
-
section.tokenCount += chunk.tokenCount;
|
|
186
193
|
}
|
|
187
194
|
|
|
195
|
+
// Generate bm25 sections
|
|
196
|
+
const bm25Sections: Section[] = [];
|
|
197
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
198
|
+
const chunk = chunks[i];
|
|
199
|
+
let section = bm25Sections[bm25Sections.length - 1];
|
|
200
|
+
if (chunk.isBm25) {
|
|
201
|
+
if (!section || section.tokenCount + chunk.tokenCount > maxTokens) {
|
|
202
|
+
section = {
|
|
203
|
+
chunks: [],
|
|
204
|
+
score: 0,
|
|
205
|
+
tokenCount: 0
|
|
206
|
+
};
|
|
207
|
+
bm25Sections.push(section);
|
|
208
|
+
}
|
|
209
|
+
section.chunks.push(chunk);
|
|
210
|
+
section.score += chunk.score;
|
|
211
|
+
section.tokenCount += chunk.tokenCount;
|
|
212
|
+
}
|
|
213
|
+
}
|
|
188
214
|
// Normalize section scores
|
|
189
215
|
sections.forEach(section => section.score /= section.chunks.length);
|
|
216
|
+
bm25Sections.forEach(section => section.score /= section.chunks.length);
|
|
190
217
|
|
|
191
218
|
// Sort sections by score and limit to maxSections
|
|
192
219
|
sections.sort((a, b) => b.score - a.score);
|
|
220
|
+
bm25Sections.sort((a, b) => b.score - a.score);
|
|
193
221
|
if (sections.length > maxSections) {
|
|
194
222
|
sections.splice(maxSections, sections.length - maxSections);
|
|
195
223
|
}
|
|
224
|
+
if (bm25Sections.length > maxSections) {
|
|
225
|
+
bm25Sections.splice(maxSections, bm25Sections.length - maxSections);
|
|
226
|
+
}
|
|
196
227
|
|
|
197
228
|
// Combine adjacent chunks of text
|
|
198
229
|
sections.forEach(section => {
|
|
@@ -216,7 +247,8 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
216
247
|
startPos: -1,
|
|
217
248
|
endPos: -1,
|
|
218
249
|
score: 0,
|
|
219
|
-
tokenCount: this._tokenizer.encode('\n\n...\n\n').length
|
|
250
|
+
tokenCount: this._tokenizer.encode('\n\n...\n\n').length,
|
|
251
|
+
isBm25: false,
|
|
220
252
|
};
|
|
221
253
|
sections.forEach(section => {
|
|
222
254
|
// Insert connectors between chunks
|
|
@@ -242,7 +274,8 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
242
274
|
startPos: sectionStart - beforeBudget,
|
|
243
275
|
endPos: sectionStart - 1,
|
|
244
276
|
score: 0,
|
|
245
|
-
tokenCount: beforeBudget
|
|
277
|
+
tokenCount: beforeBudget,
|
|
278
|
+
isBm25: false,
|
|
246
279
|
};
|
|
247
280
|
section.chunks.unshift(chunk);
|
|
248
281
|
section.tokenCount += chunk.tokenCount;
|
|
@@ -258,7 +291,8 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
258
291
|
startPos: sectionEnd + 1,
|
|
259
292
|
endPos: sectionEnd + afterBudget,
|
|
260
293
|
score: 0,
|
|
261
|
-
tokenCount: afterBudget
|
|
294
|
+
tokenCount: afterBudget,
|
|
295
|
+
isBm25: false,
|
|
262
296
|
};
|
|
263
297
|
section.chunks.push(chunk);
|
|
264
298
|
section.tokenCount += chunk.tokenCount;
|
|
@@ -268,16 +302,29 @@ export class LocalDocumentResult extends LocalDocument {
|
|
|
268
302
|
});
|
|
269
303
|
}
|
|
270
304
|
|
|
271
|
-
|
|
272
|
-
|
|
305
|
+
const semanticDocTextSections = sections.map(section => {
|
|
306
|
+
let text = '';
|
|
307
|
+
section.chunks.forEach(chunk => text += chunk.text);
|
|
308
|
+
return {
|
|
309
|
+
text: text,
|
|
310
|
+
tokenCount: section.tokenCount,
|
|
311
|
+
score: section.score,
|
|
312
|
+
isBm25: false,
|
|
313
|
+
};
|
|
314
|
+
});
|
|
315
|
+
const bm25DocTextSections = bm25Sections.map(section => {
|
|
273
316
|
let text = '';
|
|
274
317
|
section.chunks.forEach(chunk => text += chunk.text);
|
|
275
318
|
return {
|
|
276
319
|
text: text,
|
|
277
320
|
tokenCount: section.tokenCount,
|
|
278
|
-
score: section.score
|
|
321
|
+
score: section.score,
|
|
322
|
+
isBm25: true,
|
|
279
323
|
};
|
|
280
324
|
});
|
|
325
|
+
|
|
326
|
+
// Return final rendered sections
|
|
327
|
+
return [...semanticDocTextSections, ...bm25DocTextSections];
|
|
281
328
|
}
|
|
282
329
|
|
|
283
330
|
private encodeBeforeText(text: string, budget: number): number[] {
|
|
@@ -300,6 +347,7 @@ interface SectionChunk {
|
|
|
300
347
|
endPos: number;
|
|
301
348
|
score: number;
|
|
302
349
|
tokenCount: number;
|
|
350
|
+
isBm25: boolean;
|
|
303
351
|
}
|
|
304
352
|
|
|
305
353
|
interface Section {
|
package/src/LocalIndex.ts
CHANGED
|
@@ -3,7 +3,11 @@ import * as path from 'path';
|
|
|
3
3
|
import { v4 } from 'uuid';
|
|
4
4
|
import { ItemSelector } from './ItemSelector';
|
|
5
5
|
import { IndexItem, IndexStats, MetadataFilter, MetadataTypes, QueryResult } from './types';
|
|
6
|
-
|
|
6
|
+
import { LocalDocument } from './LocalDocument';
|
|
7
|
+
import { LocalDocumentIndex } from './LocalDocumentIndex';
|
|
8
|
+
import bm25 from 'wink-bm25-text-search';
|
|
9
|
+
import winkNLP from 'wink-nlp';
|
|
10
|
+
import model from 'wink-eng-lite-web-model';
|
|
7
11
|
export interface CreateIndexConfig {
|
|
8
12
|
version: number;
|
|
9
13
|
deleteIfExists?: boolean;
|
|
@@ -24,6 +28,8 @@ export class LocalIndex<TMetadata extends Record<string,MetadataTypes> = Record<
|
|
|
24
28
|
|
|
25
29
|
private _data?: IndexData;
|
|
26
30
|
private _update?: IndexData;
|
|
31
|
+
//member fields for BM25
|
|
32
|
+
private _bm25Engine: any;
|
|
27
33
|
|
|
28
34
|
/**
|
|
29
35
|
* Creates a new instance of LocalIndex.
|
|
@@ -247,7 +253,7 @@ export class LocalIndex<TMetadata extends Record<string,MetadataTypes> = Record<
|
|
|
247
253
|
* @param filter Optional. Filter to apply.
|
|
248
254
|
* @returns Similar items to the vector that match the supplied filter.
|
|
249
255
|
*/
|
|
250
|
-
public async queryItems<TItemMetadata extends TMetadata = TMetadata>(vector: number[], topK: number, filter?: MetadataFilter): Promise<QueryResult<TItemMetadata>[]> {
|
|
256
|
+
public async queryItems<TItemMetadata extends TMetadata = TMetadata>(vector: number[], query: string, topK: number, filter?: MetadataFilter, isBm25?: boolean): Promise<QueryResult<TItemMetadata>[]> {
|
|
251
257
|
await this.loadIndexData();
|
|
252
258
|
|
|
253
259
|
// Filter items
|
|
@@ -285,6 +291,36 @@ export class LocalIndex<TMetadata extends Record<string,MetadataTypes> = Record<
|
|
|
285
291
|
}
|
|
286
292
|
}
|
|
287
293
|
|
|
294
|
+
//Perform bm25 search only if enabled. Avoid duplicate chunks, which are already selected during semantic search.
|
|
295
|
+
if (isBm25) {
|
|
296
|
+
const itemSet = new Set();
|
|
297
|
+
for (const item of top) itemSet.add(item.item.id);
|
|
298
|
+
|
|
299
|
+
this.setupbm25();
|
|
300
|
+
|
|
301
|
+
let currDoc;
|
|
302
|
+
let currDocTxt;
|
|
303
|
+
for (let i = 0; i < items.length; i++) {
|
|
304
|
+
if (!itemSet.has(items[i].id)) {
|
|
305
|
+
const item = items[i];
|
|
306
|
+
currDoc = new LocalDocument((this as unknown) as LocalDocumentIndex, item.metadata.documentId.toString(), '');
|
|
307
|
+
currDocTxt = await currDoc.loadText();
|
|
308
|
+
const startPos = item.metadata.startPos;
|
|
309
|
+
const endPos = item.metadata.endPos;
|
|
310
|
+
const chunkText = currDocTxt.substring(Number(startPos), Number(endPos) + 1);
|
|
311
|
+
this._bm25Engine.addDoc({body: chunkText}, i);
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
this._bm25Engine.consolidate();
|
|
315
|
+
var results = await this.bm25Search(query, items, topK);
|
|
316
|
+
results.forEach((res: any) => {
|
|
317
|
+
top.push({
|
|
318
|
+
item: Object.assign({}, {...items[res[0]], metadata: {...items[res[0]].metadata, isBm25: true}}) as any,
|
|
319
|
+
score: res[1]
|
|
320
|
+
});
|
|
321
|
+
});
|
|
322
|
+
|
|
323
|
+
}
|
|
288
324
|
return top;
|
|
289
325
|
}
|
|
290
326
|
|
|
@@ -350,7 +386,7 @@ export class LocalIndex<TMetadata extends Record<string,MetadataTypes> = Record<
|
|
|
350
386
|
}
|
|
351
387
|
|
|
352
388
|
// Save remaining metadata to disk
|
|
353
|
-
metadataFile = `${v4}.json`;
|
|
389
|
+
metadataFile = `${v4()}.json`;
|
|
354
390
|
const metadataPath = path.join(this._folderPath, metadataFile);
|
|
355
391
|
await fs.writeFile(metadataPath, JSON.stringify(item.metadata));
|
|
356
392
|
} else if (item.metadata) {
|
|
@@ -385,6 +421,37 @@ export class LocalIndex<TMetadata extends Record<string,MetadataTypes> = Record<
|
|
|
385
421
|
return newItem;
|
|
386
422
|
}
|
|
387
423
|
}
|
|
424
|
+
|
|
425
|
+
private async setupbm25(): Promise<any> {
|
|
426
|
+
this._bm25Engine = bm25();
|
|
427
|
+
const nlp = winkNLP( model );
|
|
428
|
+
const its = nlp.its;
|
|
429
|
+
|
|
430
|
+
const prepTask = function ( text: string ) {
|
|
431
|
+
const tokens: any[] = [];
|
|
432
|
+
nlp.readDoc(text)
|
|
433
|
+
.tokens()
|
|
434
|
+
// Use only words ignoring punctuations etc and from them remove stop words
|
|
435
|
+
.filter( (t: any) => ( t.out(its.type) === 'word' && !t.out(its.stopWordFlag) ) )
|
|
436
|
+
// Handle negation and extract stem of the word
|
|
437
|
+
.each( (t: any) => tokens.push( (t.out(its.negationFlag)) ? '!' + t.out(its.stem) : t.out(its.stem) ) );
|
|
438
|
+
|
|
439
|
+
return tokens;
|
|
440
|
+
};
|
|
441
|
+
|
|
442
|
+
this._bm25Engine.defineConfig( { fldWeights: { body: 1 } } );
|
|
443
|
+
// Step II: Define PrepTasks pipe.
|
|
444
|
+
this._bm25Engine.definePrepTasks( [ prepTask ] );
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
private async bm25Search(searchQuery: string, items: any, topK: number): Promise<any> {
|
|
448
|
+
var query = searchQuery;
|
|
449
|
+
// `results` is an array of [ doc-id, score ], sorted by score
|
|
450
|
+
var results = this._bm25Engine.search( query );
|
|
451
|
+
|
|
452
|
+
return results.slice(0, topK);
|
|
453
|
+
}
|
|
454
|
+
|
|
388
455
|
}
|
|
389
456
|
|
|
390
457
|
interface IndexData {
|
|
@@ -16,9 +16,9 @@ export class Colorize {
|
|
|
16
16
|
}
|
|
17
17
|
}
|
|
18
18
|
|
|
19
|
-
public static output(output: object | string, quote: string = '', units: string = ''): string {
|
|
19
|
+
public static output(output: object | string, isBm25: boolean = false, quote: string = '', units: string = ''): string {
|
|
20
20
|
if (typeof output === 'string') {
|
|
21
|
-
return `\x1b[32m${quote}${output}${quote}\x1b[0m`;
|
|
21
|
+
return isBm25 ? `\x1b[34m${quote}${output}${quote}\x1b[0m` : `\x1b[32m${quote}${output}${quote}\x1b[0m`;
|
|
22
22
|
} else if (typeof output === 'object' && output !== null) {
|
|
23
23
|
return colorizer(output, {
|
|
24
24
|
pretty: true,
|
|
@@ -54,7 +54,7 @@ export class Colorize {
|
|
|
54
54
|
}
|
|
55
55
|
|
|
56
56
|
public static value(field: string, value: any, units: string = ''): string {
|
|
57
|
-
return `${field}: ${Colorize.output(value, '"', units)}`;
|
|
57
|
+
return `${field}: ${Colorize.output(value, false, '"', units)}`;
|
|
58
58
|
}
|
|
59
59
|
|
|
60
60
|
public static warning(warning: string): string {
|
package/src/types.ts
CHANGED
package/src/vectra-cli.ts
CHANGED
|
@@ -191,6 +191,12 @@ export async function run() {
|
|
|
191
191
|
type: 'boolean',
|
|
192
192
|
default: true
|
|
193
193
|
})
|
|
194
|
+
.option('bm25', {
|
|
195
|
+
alias: 'b',
|
|
196
|
+
describe: 'Use Okapi-bm25 keyword search alogrithm to perform hybrid search - semantic + keyword. Displayed in blue during search.',
|
|
197
|
+
type: 'boolean',
|
|
198
|
+
default: false
|
|
199
|
+
})
|
|
194
200
|
.demandOption(['keys']);
|
|
195
201
|
}, async (args) => {
|
|
196
202
|
console.log(Colorize.title('Querying Index'));
|
|
@@ -217,6 +223,7 @@ export async function run() {
|
|
|
217
223
|
const results = await index.queryDocuments(query, {
|
|
218
224
|
maxDocuments: args.documentCount,
|
|
219
225
|
maxChunks: args.chunkCount,
|
|
226
|
+
isBm25: args.bm25 as boolean,
|
|
220
227
|
});
|
|
221
228
|
|
|
222
229
|
// Render results
|
|
@@ -226,12 +233,15 @@ export async function run() {
|
|
|
226
233
|
console.log(Colorize.value('chunks', result.chunks.length));
|
|
227
234
|
if (args.format == 'sections') {
|
|
228
235
|
const sections = await result.renderSections(args.tokens, args.sectionCount, args.overlap);
|
|
236
|
+
console.log(sections.length);
|
|
229
237
|
for (let i = 0; i < sections.length; i++) {
|
|
230
238
|
const section = sections[i];
|
|
239
|
+
const isBm25 = sections[i].isBm25;
|
|
240
|
+
console.log(isBm25);
|
|
231
241
|
console.log(Colorize.title(args.sectionCount == 1 ? 'Section' : `Section ${i + 1}`));
|
|
232
242
|
console.log(Colorize.value('score', section.score));
|
|
233
243
|
console.log(Colorize.value('tokens', section.tokenCount));
|
|
234
|
-
console.log(Colorize.output(section.text));
|
|
244
|
+
console.log(Colorize.output(section.text, isBm25));
|
|
235
245
|
}
|
|
236
246
|
} else if (args.format == 'chunks') {
|
|
237
247
|
const text = await result.loadText();
|
|
@@ -239,11 +249,12 @@ export async function run() {
|
|
|
239
249
|
const chunk = result.chunks[i];
|
|
240
250
|
const startPos = chunk.item.metadata.startPos;
|
|
241
251
|
const endPos = chunk.item.metadata.endPos;
|
|
252
|
+
const isBm25 = Boolean(chunk.item.metadata.isBm25);
|
|
242
253
|
console.log(Colorize.title(`Chunk ${i + 1}`));
|
|
243
254
|
console.log(Colorize.value('score', chunk.score));
|
|
244
255
|
console.log(Colorize.value('startPos', startPos));
|
|
245
256
|
console.log(Colorize.value('endPos', endPos));
|
|
246
|
-
console.log(Colorize.output(text.substring(startPos, endPos + 1)));
|
|
257
|
+
console.log(Colorize.output(text.substring(startPos, endPos + 1), isBm25));
|
|
247
258
|
}
|
|
248
259
|
}
|
|
249
260
|
}
|
package/lib/FileFetcher.d.ts
DELETED
|
@@ -1,5 +0,0 @@
|
|
|
1
|
-
import { TextFetcher } from './types';
|
|
2
|
-
export declare class FileFetcher implements TextFetcher {
|
|
3
|
-
fetch(uri: string, onDocument: (uri: string, text: string, docType?: string | undefined) => Promise<boolean>): Promise<boolean>;
|
|
4
|
-
}
|
|
5
|
-
//# sourceMappingURL=FileFetcher.d.ts.map
|
package/lib/FileFetcher.d.ts.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"FileFetcher.d.ts","sourceRoot":"","sources":["../src/FileFetcher.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AAItC,qBAAa,WAAY,YAAW,WAAW;IAC9B,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,MAAM,GAAG,SAAS,KAAK,OAAO,CAAC,OAAO,CAAC,GAAG,OAAO,CAAC,OAAO,CAAC;CAyB/I"}
|
package/lib/FileFetcher.js
DELETED
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
-
if (k2 === undefined) k2 = k;
|
|
4
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
-
}
|
|
8
|
-
Object.defineProperty(o, k2, desc);
|
|
9
|
-
}) : (function(o, m, k, k2) {
|
|
10
|
-
if (k2 === undefined) k2 = k;
|
|
11
|
-
o[k2] = m[k];
|
|
12
|
-
}));
|
|
13
|
-
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
-
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
-
}) : function(o, v) {
|
|
16
|
-
o["default"] = v;
|
|
17
|
-
});
|
|
18
|
-
var __importStar = (this && this.__importStar) || function (mod) {
|
|
19
|
-
if (mod && mod.__esModule) return mod;
|
|
20
|
-
var result = {};
|
|
21
|
-
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
|
|
22
|
-
__setModuleDefault(result, mod);
|
|
23
|
-
return result;
|
|
24
|
-
};
|
|
25
|
-
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
26
|
-
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
27
|
-
return new (P || (P = Promise))(function (resolve, reject) {
|
|
28
|
-
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
29
|
-
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
30
|
-
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
31
|
-
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
32
|
-
});
|
|
33
|
-
};
|
|
34
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
35
|
-
exports.FileFetcher = void 0;
|
|
36
|
-
const fs = __importStar(require("fs/promises"));
|
|
37
|
-
const path = __importStar(require("path"));
|
|
38
|
-
class FileFetcher {
|
|
39
|
-
fetch(uri, onDocument) {
|
|
40
|
-
return __awaiter(this, void 0, void 0, function* () {
|
|
41
|
-
// Does path exist and is it a directory?
|
|
42
|
-
let isDirectory;
|
|
43
|
-
try {
|
|
44
|
-
const stat = yield fs.stat(uri);
|
|
45
|
-
isDirectory = stat.isDirectory();
|
|
46
|
-
}
|
|
47
|
-
catch (_a) {
|
|
48
|
-
return true;
|
|
49
|
-
}
|
|
50
|
-
// If directory, read all files and recurse
|
|
51
|
-
if (isDirectory) {
|
|
52
|
-
const files = yield fs.readdir(uri);
|
|
53
|
-
for (const file of files) {
|
|
54
|
-
const filePath = path.join(uri, file);
|
|
55
|
-
yield this.fetch(filePath, onDocument);
|
|
56
|
-
}
|
|
57
|
-
return true;
|
|
58
|
-
}
|
|
59
|
-
else {
|
|
60
|
-
// Read file and call onDocument
|
|
61
|
-
const text = yield fs.readFile(uri, 'utf8');
|
|
62
|
-
const parts = uri.split('.');
|
|
63
|
-
return yield onDocument(uri, text, parts.length > 0 ? parts[parts.length - 1].toLowerCase() : undefined);
|
|
64
|
-
}
|
|
65
|
-
});
|
|
66
|
-
}
|
|
67
|
-
}
|
|
68
|
-
exports.FileFetcher = FileFetcher;
|
|
69
|
-
//# sourceMappingURL=FileFetcher.js.map
|
package/lib/FileFetcher.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"FileFetcher.js","sourceRoot":"","sources":["../src/FileFetcher.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AACA,gDAAkC;AAClC,2CAA6B;AAE7B,MAAa,WAAW;IACP,KAAK,CAAC,GAAW,EAAE,UAAyF;;YACrH,yCAAyC;YACzC,IAAI,WAAoB,CAAC;YACzB,IAAI;gBACA,MAAM,IAAI,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBAChC,WAAW,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;aACpC;YAAC,WAAM;gBACJ,OAAO,IAAI,CAAC;aACf;YAED,2CAA2C;YAC3C,IAAI,WAAW,EAAE;gBACb,MAAM,KAAK,GAAG,MAAM,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;gBACpC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE;oBACtB,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;oBACtC,MAAM,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;iBAC1C;gBACD,OAAO,IAAI,CAAC;aACf;iBAAM;gBACH,gCAAgC;gBAChC,MAAM,IAAI,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC;gBAC5C,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;gBAC7B,OAAO,MAAM,UAAU,CAAC,GAAG,EAAE,IAAI,EAAE,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;aAC5G;QACL,CAAC;KAAA;CACJ;AA1BD,kCA0BC"}
|
package/lib/GPT3Tokenizer.d.ts
DELETED
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
import { Tokenizer } from "./types";
|
|
2
|
-
/**
|
|
3
|
-
* Tokenizer that uses GPT-3's encoder.
|
|
4
|
-
*/
|
|
5
|
-
export declare class GPT3Tokenizer implements Tokenizer {
|
|
6
|
-
decode(tokens: number[]): string;
|
|
7
|
-
encode(text: string): number[];
|
|
8
|
-
}
|
|
9
|
-
//# sourceMappingURL=GPT3Tokenizer.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"GPT3Tokenizer.d.ts","sourceRoot":"","sources":["../src/GPT3Tokenizer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAGpC;;GAEG;AACH,qBAAa,aAAc,YAAW,SAAS;IACpC,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM;IAIhC,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE;CAGxC"}
|
package/lib/GPT3Tokenizer.js
DELETED
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.GPT3Tokenizer = void 0;
|
|
4
|
-
const gpt_3_encoder_1 = require("gpt-3-encoder");
|
|
5
|
-
/**
|
|
6
|
-
* Tokenizer that uses GPT-3's encoder.
|
|
7
|
-
*/
|
|
8
|
-
class GPT3Tokenizer {
|
|
9
|
-
decode(tokens) {
|
|
10
|
-
return (0, gpt_3_encoder_1.decode)(tokens);
|
|
11
|
-
}
|
|
12
|
-
encode(text) {
|
|
13
|
-
return (0, gpt_3_encoder_1.encode)(text);
|
|
14
|
-
}
|
|
15
|
-
}
|
|
16
|
-
exports.GPT3Tokenizer = GPT3Tokenizer;
|
|
17
|
-
//# sourceMappingURL=GPT3Tokenizer.js.map
|
package/lib/GPT3Tokenizer.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"GPT3Tokenizer.js","sourceRoot":"","sources":["../src/GPT3Tokenizer.ts"],"names":[],"mappings":";;;AACA,iDAA+C;AAE/C;;GAEG;AACH,MAAa,aAAa;IACf,MAAM,CAAC,MAAgB;QAC1B,OAAO,IAAA,sBAAM,EAAC,MAAM,CAAC,CAAC;IAC1B,CAAC;IAEM,MAAM,CAAC,IAAY;QACtB,OAAO,IAAA,sBAAM,EAAC,IAAI,CAAC,CAAC;IACxB,CAAC;CACJ;AARD,sCAQC"}
|