vectra 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/bin/vectra.js +3 -0
- package/lib/GPT3Tokenizer.d.ts +9 -0
- package/lib/GPT3Tokenizer.d.ts.map +1 -0
- package/lib/GPT3Tokenizer.js +17 -0
- package/lib/GPT3Tokenizer.js.map +1 -0
- package/lib/ItemSelector.d.ts +41 -0
- package/lib/ItemSelector.d.ts.map +1 -0
- package/lib/ItemSelector.js +156 -0
- package/lib/ItemSelector.js.map +1 -0
- package/lib/LocalDocument.d.ts +16 -0
- package/lib/LocalDocument.d.ts.map +1 -0
- package/lib/LocalDocument.js +99 -0
- package/lib/LocalDocument.js.map +1 -0
- package/lib/LocalDocumentIndex.d.ts +48 -0
- package/lib/LocalDocumentIndex.d.ts.map +1 -0
- package/lib/LocalDocumentIndex.js +367 -0
- package/lib/LocalDocumentIndex.js.map +1 -0
- package/lib/LocalDocumentResult.d.ts +12 -0
- package/lib/LocalDocumentResult.d.ts.map +1 -0
- package/lib/LocalDocumentResult.js +186 -0
- package/lib/LocalDocumentResult.js.map +1 -0
- package/lib/LocalIndex.d.ts +130 -0
- package/lib/LocalIndex.d.ts.map +1 -0
- package/lib/LocalIndex.js +405 -0
- package/lib/LocalIndex.js.map +1 -0
- package/lib/OpenAIEmbeddings.d.ts +98 -0
- package/lib/OpenAIEmbeddings.d.ts.map +1 -0
- package/lib/OpenAIEmbeddings.js +139 -0
- package/lib/OpenAIEmbeddings.js.map +1 -0
- package/lib/TextSplitter.d.ts +17 -0
- package/lib/TextSplitter.d.ts.map +1 -0
- package/lib/TextSplitter.js +460 -0
- package/lib/TextSplitter.js.map +1 -0
- package/lib/WebFetcher.d.ts +16 -0
- package/lib/WebFetcher.d.ts.map +1 -0
- package/lib/WebFetcher.js +144 -0
- package/lib/WebFetcher.js.map +1 -0
- package/lib/index.d.ts +11 -0
- package/lib/index.d.ts.map +1 -0
- package/lib/index.js +27 -0
- package/lib/index.js.map +1 -0
- package/lib/internals/Colorize.d.ts +14 -0
- package/lib/internals/Colorize.d.ts.map +1 -0
- package/lib/internals/Colorize.js +64 -0
- package/lib/internals/Colorize.js.map +1 -0
- package/lib/internals/index.d.ts +3 -0
- package/lib/internals/index.d.ts.map +1 -0
- package/lib/internals/index.js +19 -0
- package/lib/internals/index.js.map +1 -0
- package/lib/internals/types.d.ts +42 -0
- package/lib/internals/types.d.ts.map +1 -0
- package/lib/internals/types.js +3 -0
- package/lib/internals/types.js.map +1 -0
- package/lib/types.d.ts +133 -0
- package/lib/types.d.ts.map +1 -0
- package/lib/types.js +3 -0
- package/lib/types.js.map +1 -0
- package/lib/vectra-cli.d.ts +2 -0
- package/lib/vectra-cli.d.ts.map +1 -0
- package/lib/vectra-cli.js +276 -0
- package/lib/vectra-cli.js.map +1 -0
- package/package.json +21 -3
- package/src/GPT3Tokenizer.ts +15 -0
- package/src/ItemSelector.ts +9 -9
- package/src/LocalDocument.ts +70 -0
- package/src/LocalDocumentIndex.ts +355 -0
- package/src/LocalDocumentResult.ts +206 -0
- package/src/LocalIndex.ts +12 -78
- package/src/OpenAIEmbeddings.ts +205 -0
- package/src/TextSplitter.ts +480 -0
- package/src/WebFetcher.ts +128 -0
- package/src/index.ts +8 -0
- package/src/internals/Colorize.ts +64 -0
- package/src/internals/index.ts +2 -0
- package/src/internals/types.ts +46 -0
- package/src/types.ts +160 -0
- package/src/vectra-cli.ts +238 -0
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
import * as fs from 'fs/promises';
|
|
2
|
+
import * as path from 'path';
|
|
3
|
+
import { v4 } from 'uuid';
|
|
4
|
+
import { GPT3Tokenizer } from "./GPT3Tokenizer";
|
|
5
|
+
import { CreateIndexConfig, LocalIndex } from "./LocalIndex";
|
|
6
|
+
import { TextSplitter, TextSplitterConfig } from "./TextSplitter";
|
|
7
|
+
import { MetadataFilter, EmbeddingsModel, Tokenizer, MetadataTypes, EmbeddingsResponse, QueryResult, DocumentChunkMetadata, DocumentCatalogStats } from "./types";
|
|
8
|
+
import { LocalDocumentResult } from './LocalDocumentResult';
|
|
9
|
+
import { LocalDocument } from './LocalDocument';
|
|
10
|
+
|
|
11
|
+
// Maximum number of chunk texts passed to the embeddings model in a single createEmbeddings() call
// when a document is upserted; longer documents are embedded in several batches of this size.
const EMBEDDINGS_BATCH_SIZE = 500;
|
|
12
|
+
|
|
13
|
+
/**
 * Options accepted by `LocalDocumentIndex.queryDocuments()`.
 */
export interface DocumentQueryOptions {
    /**
     * Maximum number of documents to return. `queryDocuments()` uses 10 when omitted.
     */
    maxDocuments?: number;

    /**
     * Maximum number of chunks to retrieve from the index before they are grouped by document.
     * `queryDocuments()` uses 50 when omitted.
     */
    maxChunks?: number;

    /**
     * Optional metadata filter forwarded to the underlying item query.
     */
    filter?: MetadataFilter;
}
|
|
18
|
+
|
|
19
|
+
/**
 * Configuration passed to the `LocalDocumentIndex` constructor.
 */
export interface LocalDocumentIndexConfig {
    /**
     * Folder that holds the index; forwarded to the `LocalIndex` base constructor.
     */
    folderPath: string;

    /**
     * Model used to create embeddings. Optional at construction time, but
     * `upsertDocument()` and `queryDocuments()` throw when it is not set.
     */
    embeddings?: EmbeddingsModel;

    /**
     * Tokenizer to use. When omitted, the tokenizer from `chunkingConfig` is used,
     * and failing that a new `GPT3Tokenizer`.
     */
    tokenizer?: Tokenizer;

    /**
     * Overrides for the text splitter. The constructor starts from
     * `keepSeparators: true`, `chunkSize: 512`, `chunkOverlap: 0` and applies these on top.
     */
    chunkingConfig?: Partial<TextSplitterConfig>;
}
|
|
25
|
+
|
|
26
|
+
/**
 * A `LocalIndex` that stores whole documents.
 * @remarks
 * Each document is split into chunks, every chunk is embedded and inserted into the index as an
 * item, and a `catalog.json` file in the index folder maps document URIs to document IDs.
 * The document text is saved as `<documentId>.txt` and optional metadata as `<documentId>.json`.
 */
export class LocalDocumentIndex extends LocalIndex {
    private readonly _embeddings?: EmbeddingsModel;
    private readonly _tokenizer: Tokenizer;
    private readonly _chunkingConfig?: TextSplitterConfig;
    private _catalog?: DocumentCatalog;
    private _newCatalog?: DocumentCatalog;

    /**
     * Creates a new document index instance.
     * @param config Folder path plus optional embeddings model, tokenizer and chunking overrides.
     */
    public constructor(config: LocalDocumentIndexConfig) {
        super(config.folderPath);
        this._embeddings = config.embeddings;
        this._chunkingConfig = Object.assign({
            keepSeparators: true,
            chunkSize: 512,
            chunkOverlap: 0,
        } as TextSplitterConfig, config.chunkingConfig);

        // Tokenizer precedence: explicit config, then the chunking config, then a GPT-3 tokenizer.
        // The splitter is always given the same tokenizer the index uses.
        this._tokenizer = config.tokenizer ?? this._chunkingConfig.tokenizer ?? new GPT3Tokenizer();
        this._chunkingConfig.tokenizer = this._tokenizer;
    }

    /**
     * Returns true if the document catalog exists.
     */
    public async isCatalogCreated(): Promise<boolean> {
        try {
            await fs.access(path.join(this.folderPath, 'catalog.json'));
            return true;
        } catch (err: unknown) {
            return false;
        }
    }

    /**
     * Looks up the ID of a document.
     * @param uri URI the document was added under.
     * @returns The document ID, or `undefined` if the URI is not in the catalog.
     */
    public async getDocumentId(uri: string): Promise<string | undefined> {
        await this.loadIndexData();
        return this._catalog?.uriToId[uri];
    }

    /**
     * Looks up the URI of a document.
     * @param documentId ID of the document.
     * @returns The document URI, or `undefined` if the ID is not in the catalog.
     */
    public async getDocumentUri(documentId: string): Promise<string | undefined> {
        await this.loadIndexData();
        return this._catalog?.idToUri[documentId];
    }

    /**
     * Creates the index and then loads it, which also creates the catalog file when missing.
     * @param config Optional index creation settings forwarded to `LocalIndex.createIndex()`.
     */
    public async createIndex(config?: CreateIndexConfig): Promise<void> {
        await super.createIndex(config);
        await this.loadIndexData();
    }

    /**
     * Removes a document, its chunks and its files from the index.
     * @remarks
     * Does nothing if the URI is not in the catalog. The chunks and catalog entry are removed in
     * their own update; the text and metadata files are deleted after that update is committed.
     * @param uri URI of the document to delete.
     * @throws Error if the index update fails or the document's text file cannot be removed.
     */
    public async deleteDocument(uri: string): Promise<void> {
        // Lookup document ID
        const documentId = await this.getDocumentId(uri);
        if (documentId == undefined) {
            return;
        }

        // Delete document chunks from index and remove from catalog
        await this.beginUpdate();
        try {
            // Get list of chunks for document
            const chunks = await this.listItemsByMetadata<DocumentChunkMetadata>({ documentId });

            // Delete chunks
            for (const chunk of chunks) {
                await this.deleteItem(chunk.id);
            }

            // Remove entry from catalog
            delete this._newCatalog!.uriToId[uri];
            delete this._newCatalog!.idToUri[documentId];
            this._newCatalog!.count--;

            // Commit changes
            await this.endUpdate();
        } catch (err: unknown) {
            // Cancel update and raise error
            this.cancelUpdate();
            throw new Error(`Error deleting document "${uri}": ${String(err)}`);
        }

        // Delete text file from disk
        try {
            await fs.unlink(path.join(this.folderPath, `${documentId}.txt`));
        } catch (err: unknown) {
            throw new Error(`Error removing text file for document "${uri}" from disk: ${String(err)}`);
        }

        // Delete metadata file from disk
        try {
            await fs.unlink(path.join(this.folderPath, `${documentId}.json`));
        } catch (err: unknown) {
            // Ignore error: the metadata file is only written when metadata was supplied.
        }
    }

    /**
     * Returns catalog and index statistics.
     * @returns Catalog version, document count, chunk (item) count and metadata config.
     * @throws Error if the catalog could not be loaded.
     */
    public async getCatalogStats(): Promise<DocumentCatalogStats> {
        const stats = await this.getIndexStats();

        // Make sure the catalog is in memory instead of relying on getIndexStats() to load it.
        await this.loadIndexData();
        const catalog = this._catalog;
        if (!catalog) {
            throw new Error(`Document catalog not loaded.`);
        }

        return {
            version: catalog.version,
            documents: catalog.count,
            chunks: stats.items,
            metadata_config: stats.metadata_config
        };
    }

    /**
     * Adds a document to the catalog.
     * @remarks
     * The text is split into chunks and embedded first. Only once all embeddings are available is
     * an existing document with the same uri deleted and the new chunks inserted, so a failed
     * embeddings call leaves the previous version of the document intact. The insert runs in its
     * own `beginUpdate()`/`endUpdate()` pair.
     * @param uri URI identifying the document; the text after the last `.` is used as the splitter's `docType`.
     * @param text Full text of the document.
     * @param metadata Optional metadata copied onto every chunk and saved as `<documentId>.json`.
     * @returns Inserted document
     * @throws Error if no embeddings model is configured, embeddings cannot be generated, or the update fails.
     */
    public async upsertDocument(uri: string, text: string, metadata?: Record<string, MetadataTypes>): Promise<LocalDocument> {
        // Ensure embeddings configured
        const model = this._embeddings;
        if (!model) {
            throw new Error(`Embeddings model not configured.`);
        }

        // Check for existing document ID
        const existingId = await this.getDocumentId(uri);

        // Populate docType based on extension
        const config = Object.assign({}, this._chunkingConfig);
        const pos = uri.lastIndexOf('.');
        if (pos >= 0) {
            config.docType = uri.substring(pos + 1).toLowerCase();
        }

        // Split text into chunks
        const splitter = new TextSplitter(config);
        const chunks = splitter.split(text);

        // Generate embeddings for chunks, one batch at a time
        const embeddings: number[][] = [];
        for (let start = 0; start < chunks.length; start += EMBEDDINGS_BATCH_SIZE) {
            const batch = chunks.slice(start, start + EMBEDDINGS_BATCH_SIZE).map(chunk => chunk.text);

            let response: EmbeddingsResponse;
            try {
                response = await model.createEmbeddings(batch);
            } catch (err: unknown) {
                throw new Error(`Error generating embeddings: ${String(err)}`);
            }

            // Check for error
            if (response.status != 'success') {
                throw new Error(`Error generating embeddings: ${response.message}`);
            }

            // Add embeddings to output
            for (const embedding of response.output ?? []) {
                embeddings.push(embedding);
            }
        }

        // Every chunk needs a vector; refuse to index items with a missing embedding.
        if (embeddings.length !== chunks.length) {
            throw new Error(`Error generating embeddings: expected ${chunks.length} embeddings but received ${embeddings.length}`);
        }

        // Replace the existing document (keeping its ID) or generate a new document ID
        let documentId: string;
        if (existingId != undefined) {
            await this.deleteDocument(uri);
            documentId = existingId;
        } else {
            documentId = v4();
        }

        // Add document chunks to index
        await this.beginUpdate();
        try {
            // Add chunks to index
            for (let i = 0; i < chunks.length; i++) {
                const chunk = chunks[i];

                // Reserved fields are written last so caller metadata cannot overwrite them;
                // deleteDocument() and queryDocuments() rely on documentId being correct.
                const chunkMetadata: DocumentChunkMetadata = {
                    ...metadata,
                    documentId,
                    startPos: chunk.startPos,
                    endPos: chunk.endPos,
                };
                await this.insertItem({
                    id: v4(),
                    metadata: chunkMetadata,
                    vector: embeddings[i],
                });
            }

            // Save metadata file to disk
            if (metadata != undefined) {
                await fs.writeFile(path.join(this.folderPath, `${documentId}.json`), JSON.stringify(metadata));
            }

            // Save text file to disk
            await fs.writeFile(path.join(this.folderPath, `${documentId}.txt`), text);

            // Add entry to catalog
            this._newCatalog!.uriToId[uri] = documentId;
            this._newCatalog!.idToUri[documentId] = uri;
            this._newCatalog!.count++;

            // Commit changes
            await this.endUpdate();
        } catch (err: unknown) {
            // Cancel update and raise error
            this.cancelUpdate();
            throw new Error(`Error adding document "${uri}": ${String(err)}`);
        }

        // Return document
        return new LocalDocument(this.folderPath, documentId, uri);
    }

    /**
     * Finds the documents whose chunks best match a query.
     * @remarks
     * The query is embedded, the index is searched for up to `maxChunks` chunks, the chunks are
     * grouped by document, and the documents are returned sorted by descending score.
     * @param query Text to search for.
     * @param options Optional limits and metadata filter; defaults are 10 documents and 50 chunks.
     * @returns Up to `maxDocuments` document results.
     * @throws Error if no embeddings model is configured or the query cannot be embedded.
     */
    public async queryDocuments(query: string, options?: DocumentQueryOptions): Promise<LocalDocumentResult[]> {
        // Ensure embeddings configured
        const model = this._embeddings;
        if (!model) {
            throw new Error(`Embeddings model not configured.`);
        }

        // Resolve options; `??` also applies the default when a field is explicitly undefined
        const maxDocuments = options?.maxDocuments ?? 10;
        const maxChunks = options?.maxChunks ?? 50;

        // Generate embeddings for query
        let response: EmbeddingsResponse;
        try {
            response = await model.createEmbeddings(query);
        } catch (err: unknown) {
            throw new Error(`Error generating embeddings for query: ${String(err)}`);
        }

        // Check for error
        if (response.status != 'success') {
            throw new Error(`Error generating embeddings for query: ${response.message}`);
        }
        const queryVector = response.output?.[0];
        if (queryVector === undefined) {
            throw new Error(`Error generating embeddings for query: no embedding returned`);
        }

        // Query index for chunks
        const results = await this.queryItems<DocumentChunkMetadata>(queryVector, maxChunks, options?.filter);

        // Group chunks by document (a Map keeps the order in which documents were first seen)
        const documentChunks = new Map<string, QueryResult<DocumentChunkMetadata>[]>();
        for (const result of results) {
            const documentId = result.item.metadata.documentId;
            const group = documentChunks.get(documentId);
            if (group === undefined) {
                documentChunks.set(documentId, [result]);
            } else {
                group.push(result);
            }
        }

        // Create a document result for each document
        const documentResults: LocalDocumentResult[] = [];
        for (const [documentId, chunks] of documentChunks) {
            // NOTE(review): the uri is undefined if a chunk references a document missing from the catalog.
            const uri = await this.getDocumentUri(documentId) as string;
            documentResults.push(new LocalDocumentResult(this.folderPath, documentId, uri, chunks, this._tokenizer));
        }

        // Sort document results by score and return top results
        return documentResults.sort((a, b) => b.score - a.score).slice(0, maxDocuments);
    }

    // Overrides

    /**
     * Begins an update and takes a working copy of the catalog.
     * @remarks
     * The `uriToId` and `idToUri` maps are copied as well, so edits made during the update do not
     * touch the committed catalog and `cancelUpdate()` really discards them.
     */
    public async beginUpdate(): Promise<void> {
        await super.beginUpdate();

        const current = this._catalog;
        const working: DocumentCatalog = Object.assign({}, current);
        if (current) {
            working.uriToId = { ...current.uriToId };
            working.idToUri = { ...current.idToUri };
        }
        this._newCatalog = working;
    }

    /**
     * Cancels the current update and discards the working copy of the catalog.
     */
    public cancelUpdate(): void {
        super.cancelUpdate();
        this._newCatalog = undefined;
    }

    /**
     * Commits the current update and then saves the working catalog to `catalog.json`.
     * @throws Error if the catalog file cannot be written.
     */
    public async endUpdate(): Promise<void> {
        await super.endUpdate();

        try {
            // Save catalog
            await fs.writeFile(path.join(this.folderPath, 'catalog.json'), JSON.stringify(this._newCatalog));
            this._catalog = this._newCatalog;
            this._newCatalog = undefined;
        } catch (err: unknown) {
            throw new Error(`Error saving document catalog: ${String(err)}`);
        }
    }

    /**
     * Ensures that the index and the document catalog have been loaded into memory.
     * @remarks
     * An empty catalog is created on disk if `catalog.json` does not exist yet.
     * @throws Error if a new catalog cannot be written.
     */
    protected async loadIndexData(): Promise<void> {
        await super.loadIndexData();

        if (this._catalog) {
            return;
        }

        const catalogPath = path.join(this.folderPath, 'catalog.json');
        if (await this.isCatalogCreated()) {
            // Load catalog
            const buffer = await fs.readFile(catalogPath);
            this._catalog = JSON.parse(buffer.toString());
        } else {
            try {
                // Initialize catalog
                this._catalog = {
                    version: 1,
                    count: 0,
                    uriToId: {},
                    idToUri: {},
                };
                await fs.writeFile(catalogPath, JSON.stringify(this._catalog));
            } catch (err: unknown) {
                throw new Error(`Error creating document catalog: ${String(err)}`);
            }
        }
    }
}
|
|
349
|
+
|
|
350
|
+
/**
 * Shape of the `catalog.json` file that maps document URIs to document IDs and back.
 */
interface DocumentCatalog {
    /** Catalog format version; new catalogs are created with version 1. */
    version: number;

    /** Number of documents in the catalog; incremented on upsert and decremented on delete. */
    count: number;

    /** Lookup from document URI to document ID. */
    uriToId: { [uri: string]: string; };

    /** Lookup from document ID to document URI. */
    idToUri: { [id: string]: string; };
}
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
import { LocalDocument } from "./LocalDocument";
|
|
2
|
+
import { QueryResult, DocumentChunkMetadata, Tokenizer, DocumentTextSection } from "./types";
|
|
3
|
+
|
|
4
|
+
/**
 * A document returned by a query, together with the chunks of it that matched.
 */
export class LocalDocumentResult extends LocalDocument {
    private readonly _chunks: QueryResult<DocumentChunkMetadata>[];
    private readonly _tokenizer: Tokenizer;
    private readonly _score: number;

    /**
     * Creates a new document result.
     * @param folderPath Folder of the index the document belongs to.
     * @param id ID of the document.
     * @param uri URI of the document.
     * @param chunks Matching chunks of the document.
     * @param tokenizer Tokenizer used to measure and truncate text when rendering sections.
     */
    public constructor(folderPath: string, id: string, uri: string, chunks: QueryResult<DocumentChunkMetadata>[], tokenizer: Tokenizer) {
        super(folderPath, id, uri);
        this._chunks = chunks;
        this._tokenizer = tokenizer;

        // Compute average score; an empty chunk list scores 0 rather than NaN (0 / 0)
        const total = chunks.reduce((sum, chunk) => sum + chunk.score, 0);
        this._score = chunks.length > 0 ? total / chunks.length : 0;
    }

    /**
     * Chunks of the document that matched the query.
     */
    public get chunks(): QueryResult<DocumentChunkMetadata>[] {
        return this._chunks;
    }

    /**
     * Average score of the matching chunks (0 when there are none).
     */
    public get score(): number {
        return this._score;
    }

    /**
     * Renders the matching parts of the document as text sections that fit a token budget.
     * @remarks
     * - If the whole document fits in `maxTokens` it is returned as a single section with score 1.0.
     * - If every matching chunk is longer than `maxTokens`, the best-scoring chunk is truncated to
     *   `maxTokens` tokens and returned on its own.
     * - Otherwise chunks are arranged in document order and packed into sections of at most
     *   `maxTokens` tokens, counting the `...` connector placed between non-adjacent chunks.
     *   Sections are scored by the average of their chunk scores, the best `maxSections` are kept,
     *   and each is padded with surrounding document text while budget remains.
     * @param maxTokens Maximum number of tokens per section.
     * @param maxSections Maximum number of sections to return.
     * @returns Rendered sections, best score first; empty if the document does not fit and has no chunks.
     */
    public async renderSections(maxTokens: number, maxSections: number): Promise<DocumentTextSection[]> {
        // Load text from disk
        const text = await this.loadText();

        // First check to see if the entire document fits within maxTokens
        const tokens = this._tokenizer.encode(text);
        if (tokens.length <= maxTokens) {
            return [{
                text,
                tokenCount: tokens.length,
                score: 1.0
            }];
        }

        // Nothing matched, so there is nothing to build sections from
        if (this._chunks.length === 0) {
            return [];
        }

        // Materialize each chunk's text, drop any chunk that's longer than maxTokens and
        // sort by startPos to arrange chunks in document order.
        const chunks: SectionChunk[] = this._chunks.map((chunk): SectionChunk => {
            const startPos = chunk.item.metadata.startPos;
            const endPos = chunk.item.metadata.endPos;
            const chunkText = text.substring(startPos, endPos + 1); // endPos is inclusive
            return {
                text: chunkText,
                startPos,
                endPos,
                score: chunk.score,
                tokenCount: this._tokenizer.encode(chunkText).length
            };
        }).filter(chunk => chunk.tokenCount <= maxTokens).sort((a, b) => a.startPos - b.startPos);

        // Check for no chunks
        if (chunks.length === 0) {
            // Take the best-scoring chunk and return a subset of its text
            const topChunk = this._chunks.reduce((best, chunk) => chunk.score > best.score ? chunk : best);
            const startPos = topChunk.item.metadata.startPos;
            const endPos = topChunk.item.metadata.endPos;
            const truncated = this._tokenizer.encode(text.substring(startPos, endPos + 1)).slice(0, maxTokens);
            return [{
                text: this._tokenizer.decode(truncated),
                tokenCount: truncated.length,
                score: topChunk.score
            }];
        }

        // Connector placed between chunks of a section that are not adjacent in the document
        const connectorText = '\n\n...\n\n';
        const connector: SectionChunk = {
            text: connectorText,
            startPos: -1,
            endPos: -1,
            score: 0,
            tokenCount: this._tokenizer.encode(connectorText).length
        };

        // Generate sections by combining chunks until maxTokens is reached for each section.
        // The connector needed before a non-adjacent chunk is charged up front so that a
        // finished section never exceeds maxTokens.
        const sections: Section[] = [];
        for (const chunk of chunks) {
            let section: Section | undefined = sections[sections.length - 1];
            let gapTokens = 0;
            if (section !== undefined) {
                const previous = section.chunks[section.chunks.length - 1];
                if (previous.endPos + 1 !== chunk.startPos) {
                    gapTokens = connector.tokenCount;
                }
            }
            if (section === undefined || section.tokenCount + gapTokens + chunk.tokenCount > maxTokens) {
                section = { chunks: [], score: 0, tokenCount: 0 };
                sections.push(section);
                gapTokens = 0;
            }
            section.chunks.push(chunk);
            section.score += chunk.score;
            section.tokenCount += gapTokens + chunk.tokenCount;
        }

        // Normalize section scores (every section holds at least one chunk)
        for (const section of sections) {
            section.score /= section.chunks.length;
        }

        // Sort sections by score and limit to maxSections
        sections.sort((a, b) => b.score - a.score);
        const selected = sections.slice(0, Math.max(0, maxSections));

        for (const section of selected) {
            // Combine adjacent chunks of text
            for (let i = 0; i < section.chunks.length - 1; i++) {
                const chunk = section.chunks[i];
                const nextChunk = section.chunks[i + 1];
                if (chunk.endPos + 1 === nextChunk.startPos) {
                    chunk.text += nextChunk.text;
                    chunk.endPos = nextChunk.endPos;
                    chunk.tokenCount += nextChunk.tokenCount;
                    section.chunks.splice(i + 1, 1);
                    i--; // re-check the merged chunk against its new neighbour
                }
            }

            // Insert connectors between the remaining chunks (their tokens are already counted)
            const joined: SectionChunk[] = [];
            section.chunks.forEach((chunk, index) => {
                if (index > 0) {
                    joined.push(connector);
                }
                joined.push(chunk);
            });
            section.chunks = joined;

            // Add text to the beginning and end of the section until maxTokens is reached.
            // NOTE(review): 40 looks like the smallest spare budget considered worth padding - confirm.
            let budget = maxTokens - section.tokenCount;
            if (budget > 40) {
                const sectionStart = section.chunks[0].startPos;
                const sectionEnd = section.chunks[section.chunks.length - 1].endPos;
                const hasTextAfter = sectionEnd < text.length - 1;

                if (sectionStart > 0) {
                    // Spend half the budget before the section when text also follows it, otherwise all of it
                    const beforeTokens = this._tokenizer.encode(text.substring(0, sectionStart));
                    const beforeBudget = Math.min(beforeTokens.length, hasTextAfter ? Math.ceil(budget / 2) : budget);
                    if (beforeBudget > 0) { // slice(-0) would return every token
                        const beforeText = this._tokenizer.decode(beforeTokens.slice(-beforeBudget));
                        section.chunks.unshift({
                            text: beforeText,
                            // Character offsets derived from the decoded text (approximate)
                            startPos: Math.max(0, sectionStart - beforeText.length),
                            endPos: sectionStart - 1,
                            score: 0,
                            tokenCount: beforeBudget
                        });
                        section.tokenCount += beforeBudget;
                        budget -= beforeBudget;
                    }
                }

                if (hasTextAfter) {
                    const afterTokens = this._tokenizer.encode(text.substring(sectionEnd + 1));
                    const afterBudget = Math.min(afterTokens.length, budget);
                    if (afterBudget > 0) {
                        const afterText = this._tokenizer.decode(afterTokens.slice(0, afterBudget));
                        section.chunks.push({
                            text: afterText,
                            startPos: sectionEnd + 1,
                            // Character offset derived from the decoded text (approximate)
                            endPos: sectionEnd + afterText.length,
                            score: 0,
                            tokenCount: afterBudget
                        });
                        section.tokenCount += afterBudget;
                        budget -= afterBudget;
                    }
                }
            }
        }

        // Return final rendered sections
        return selected.map(section => ({
            text: section.chunks.map(chunk => chunk.text).join(''),
            tokenCount: section.tokenCount,
            score: section.score
        }));
    }
}
|
|
192
|
+
|
|
193
|
+
/**
 * A piece of document text used while rendering sections.
 */
interface SectionChunk {
    /** Text of the chunk. */
    text: string;

    /** Character offset of the first character in the document (-1 for the connector chunk). */
    startPos: number;

    /** Character offset of the last character in the document, inclusive (-1 for the connector chunk). */
    endPos: number;

    /** Query score of the chunk; 0 for connector and padding chunks. */
    score: number;

    /** Number of tokens in `text`. */
    tokenCount: number;
}
|
|
200
|
+
|
|
201
|
+
/**
 * A group of chunks that is rendered as one block of text.
 */
interface Section {
    /** Chunks that make up the section, in output order. */
    chunks: SectionChunk[];

    /** Score of the section: the chunk scores are summed while building and then averaged. */
    score: number;

    /** Running total of the tokens in the section. */
    tokenCount: number;
}
|
|
206
|
+
|
package/src/LocalIndex.ts
CHANGED
|
@@ -2,6 +2,7 @@ import * as fs from 'fs/promises';
|
|
|
2
2
|
import * as path from 'path';
|
|
3
3
|
import { v4 } from 'uuid';
|
|
4
4
|
import { ItemSelector } from './ItemSelector';
|
|
5
|
+
import { IndexItem, IndexStats, MetadataFilter, MetadataTypes, QueryResult } from './types';
|
|
5
6
|
|
|
6
7
|
export interface CreateIndexConfig {
|
|
7
8
|
version: number;
|
|
@@ -11,83 +12,6 @@ export interface CreateIndexConfig {
|
|
|
11
12
|
};
|
|
12
13
|
}
|
|
13
14
|
|
|
14
|
-
export interface IndexStats {
|
|
15
|
-
version: number;
|
|
16
|
-
metadata_config: {
|
|
17
|
-
indexed?: string[];
|
|
18
|
-
};
|
|
19
|
-
items: number;
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
export interface IndexItem<TMetadata = Record<string,MetadataTypes>> {
|
|
23
|
-
id: string;
|
|
24
|
-
metadata: TMetadata;
|
|
25
|
-
vector: number[];
|
|
26
|
-
norm: number;
|
|
27
|
-
metadataFile?: string;
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
export interface QueryResult<TMetadata = Record<string,MetadataTypes>> {
|
|
31
|
-
item: IndexItem<TMetadata>;
|
|
32
|
-
score: number;
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
export interface MetadataFilter {
|
|
36
|
-
[key: string]: MetadataTypes|MetadataFilter|(number|string)[]|MetadataFilter[];
|
|
37
|
-
|
|
38
|
-
/**
|
|
39
|
-
* Equal to (number, string, boolean)
|
|
40
|
-
*/
|
|
41
|
-
'$eq': number|string|boolean;
|
|
42
|
-
|
|
43
|
-
/**
|
|
44
|
-
* Not equal to (number, string, boolean)
|
|
45
|
-
*/
|
|
46
|
-
'$ne': number|string|boolean;
|
|
47
|
-
|
|
48
|
-
/**
|
|
49
|
-
* Greater than (number)
|
|
50
|
-
*/
|
|
51
|
-
'$gt': number;
|
|
52
|
-
|
|
53
|
-
/**
|
|
54
|
-
* Greater than or equal to (number)
|
|
55
|
-
*/
|
|
56
|
-
'$gte': number;
|
|
57
|
-
|
|
58
|
-
/**
|
|
59
|
-
* Less than (number)
|
|
60
|
-
*/
|
|
61
|
-
'$lt': number;
|
|
62
|
-
|
|
63
|
-
/**
|
|
64
|
-
* Less than or equal to (number)
|
|
65
|
-
*/
|
|
66
|
-
'$lte': number;
|
|
67
|
-
|
|
68
|
-
/**
|
|
69
|
-
* In array (string or number)
|
|
70
|
-
*/
|
|
71
|
-
'$in': (number|string)[];
|
|
72
|
-
|
|
73
|
-
/**
|
|
74
|
-
* Not in array (string or number)
|
|
75
|
-
*/
|
|
76
|
-
'$nin': (number|string)[];
|
|
77
|
-
|
|
78
|
-
/**
|
|
79
|
-
* AND (MetadataFilter[])
|
|
80
|
-
*/
|
|
81
|
-
'$and': MetadataFilter[];
|
|
82
|
-
|
|
83
|
-
/**
|
|
84
|
-
* OR (MetadataFilter[])
|
|
85
|
-
*/
|
|
86
|
-
'$or': MetadataFilter[];
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
export type MetadataTypes = number|string|boolean;
|
|
90
|
-
|
|
91
15
|
/**
|
|
92
16
|
* Local vector index instance.
|
|
93
17
|
* @remarks
|
|
@@ -107,6 +31,13 @@ export class LocalIndex {
|
|
|
107
31
|
this._folderPath = folderPath;
|
|
108
32
|
}
|
|
109
33
|
|
|
34
|
+
/**
|
|
35
|
+
* Path to the index folder.
|
|
36
|
+
*/
|
|
37
|
+
public get folderPath(): string {
|
|
38
|
+
return this._folderPath;
|
|
39
|
+
}
|
|
40
|
+
|
|
110
41
|
/**
|
|
111
42
|
* Begins an update to the index.
|
|
112
43
|
* @remarks
|
|
@@ -364,7 +295,10 @@ export class LocalIndex {
|
|
|
364
295
|
}
|
|
365
296
|
}
|
|
366
297
|
|
|
367
|
-
|
|
298
|
+
/**
|
|
299
|
+
* Ensures that the index has been loaded into memory.
|
|
300
|
+
*/
|
|
301
|
+
protected async loadIndexData(): Promise<void> {
|
|
368
302
|
if (this._data) {
|
|
369
303
|
return;
|
|
370
304
|
}
|