vectra 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/README.md +3 -3
  2. package/bin/vectra.js +3 -0
  3. package/lib/GPT3Tokenizer.d.ts +9 -0
  4. package/lib/GPT3Tokenizer.d.ts.map +1 -0
  5. package/lib/GPT3Tokenizer.js +17 -0
  6. package/lib/GPT3Tokenizer.js.map +1 -0
  7. package/lib/ItemSelector.d.ts +41 -0
  8. package/lib/ItemSelector.d.ts.map +1 -0
  9. package/lib/ItemSelector.js +156 -0
  10. package/lib/ItemSelector.js.map +1 -0
  11. package/lib/LocalDocument.d.ts +16 -0
  12. package/lib/LocalDocument.d.ts.map +1 -0
  13. package/lib/LocalDocument.js +99 -0
  14. package/lib/LocalDocument.js.map +1 -0
  15. package/lib/LocalDocumentIndex.d.ts +48 -0
  16. package/lib/LocalDocumentIndex.d.ts.map +1 -0
  17. package/lib/LocalDocumentIndex.js +367 -0
  18. package/lib/LocalDocumentIndex.js.map +1 -0
  19. package/lib/LocalDocumentResult.d.ts +12 -0
  20. package/lib/LocalDocumentResult.d.ts.map +1 -0
  21. package/lib/LocalDocumentResult.js +186 -0
  22. package/lib/LocalDocumentResult.js.map +1 -0
  23. package/lib/LocalIndex.d.ts +130 -0
  24. package/lib/LocalIndex.d.ts.map +1 -0
  25. package/lib/LocalIndex.js +405 -0
  26. package/lib/LocalIndex.js.map +1 -0
  27. package/lib/OpenAIEmbeddings.d.ts +98 -0
  28. package/lib/OpenAIEmbeddings.d.ts.map +1 -0
  29. package/lib/OpenAIEmbeddings.js +139 -0
  30. package/lib/OpenAIEmbeddings.js.map +1 -0
  31. package/lib/TextSplitter.d.ts +17 -0
  32. package/lib/TextSplitter.d.ts.map +1 -0
  33. package/lib/TextSplitter.js +460 -0
  34. package/lib/TextSplitter.js.map +1 -0
  35. package/lib/WebFetcher.d.ts +16 -0
  36. package/lib/WebFetcher.d.ts.map +1 -0
  37. package/lib/WebFetcher.js +144 -0
  38. package/lib/WebFetcher.js.map +1 -0
  39. package/lib/index.d.ts +11 -0
  40. package/lib/index.d.ts.map +1 -0
  41. package/lib/index.js +27 -0
  42. package/lib/index.js.map +1 -0
  43. package/lib/internals/Colorize.d.ts +14 -0
  44. package/lib/internals/Colorize.d.ts.map +1 -0
  45. package/lib/internals/Colorize.js +64 -0
  46. package/lib/internals/Colorize.js.map +1 -0
  47. package/lib/internals/index.d.ts +3 -0
  48. package/lib/internals/index.d.ts.map +1 -0
  49. package/lib/internals/index.js +19 -0
  50. package/lib/internals/index.js.map +1 -0
  51. package/lib/internals/types.d.ts +42 -0
  52. package/lib/internals/types.d.ts.map +1 -0
  53. package/lib/internals/types.js +3 -0
  54. package/lib/internals/types.js.map +1 -0
  55. package/lib/types.d.ts +133 -0
  56. package/lib/types.d.ts.map +1 -0
  57. package/lib/types.js +3 -0
  58. package/lib/types.js.map +1 -0
  59. package/lib/vectra-cli.d.ts +2 -0
  60. package/lib/vectra-cli.d.ts.map +1 -0
  61. package/lib/vectra-cli.js +276 -0
  62. package/lib/vectra-cli.js.map +1 -0
  63. package/package.json +21 -3
  64. package/src/GPT3Tokenizer.ts +15 -0
  65. package/src/ItemSelector.ts +9 -9
  66. package/src/LocalDocument.ts +70 -0
  67. package/src/LocalDocumentIndex.ts +355 -0
  68. package/src/LocalDocumentResult.ts +206 -0
  69. package/src/LocalIndex.ts +12 -78
  70. package/src/OpenAIEmbeddings.ts +205 -0
  71. package/src/TextSplitter.ts +480 -0
  72. package/src/WebFetcher.ts +128 -0
  73. package/src/index.ts +8 -0
  74. package/src/internals/Colorize.ts +64 -0
  75. package/src/internals/index.ts +2 -0
  76. package/src/internals/types.ts +46 -0
  77. package/src/types.ts +160 -0
  78. package/src/vectra-cli.ts +238 -0
@@ -0,0 +1,355 @@
1
+ import * as fs from 'fs/promises';
2
+ import * as path from 'path';
3
+ import { v4 } from 'uuid';
4
+ import { GPT3Tokenizer } from "./GPT3Tokenizer";
5
+ import { CreateIndexConfig, LocalIndex } from "./LocalIndex";
6
+ import { TextSplitter, TextSplitterConfig } from "./TextSplitter";
7
+ import { MetadataFilter, EmbeddingsModel, Tokenizer, MetadataTypes, EmbeddingsResponse, QueryResult, DocumentChunkMetadata, DocumentCatalogStats } from "./types";
8
+ import { LocalDocumentResult } from './LocalDocumentResult';
9
+ import { LocalDocument } from './LocalDocument';
10
+
11
/**
 * Maximum number of chunk texts sent to the embeddings model in a single
 * `createEmbeddings()` call when a document is upserted.
 */
const EMBEDDINGS_BATCH_SIZE = 500;
12
+
13
/**
 * Options accepted by `LocalDocumentIndex.queryDocuments()`.
 */
export interface DocumentQueryOptions {
    /**
     * Maximum number of documents to return. `queryDocuments()` falls back to 10 when omitted.
     */
    maxDocuments?: number;

    /**
     * Maximum number of chunks to retrieve from the index before they are grouped into
     * documents. `queryDocuments()` falls back to 50 when omitted.
     */
    maxChunks?: number;

    /**
     * Optional metadata filter passed through to the underlying chunk query.
     */
    filter?: MetadataFilter;
}
18
+
19
/**
 * Settings used to construct a `LocalDocumentIndex`.
 */
export interface LocalDocumentIndexConfig {
    /**
     * Folder that holds the index, the document catalog and the per-document files.
     */
    folderPath: string;

    /**
     * Model used to generate embeddings. Optional at construction time, but
     * `upsertDocument()` and `queryDocuments()` throw when it is missing.
     */
    embeddings?: EmbeddingsModel;

    /**
     * Tokenizer to use. When omitted, the tokenizer from `chunkingConfig` is used and, failing
     * that, a new `GPT3Tokenizer`.
     */
    tokenizer?: Tokenizer;

    /**
     * Overrides for the text splitter. Unspecified values default to `keepSeparators: true`,
     * `chunkSize: 512` and `chunkOverlap: 0`.
     */
    chunkingConfig?: Partial<TextSplitterConfig>;
}
25
+
26
/**
 * A `LocalIndex` that stores whole documents.
 * @remarks
 * Each document is split into chunks, an embedding is generated for every chunk, and the chunks
 * are inserted into the index tagged with the ID of the document they belong to. Alongside the
 * index the folder contains:
 * - `catalog.json`: maps document URIs to document IDs and back, plus a document count.
 * - `<documentId>.txt`: the full text of each document.
 * - `<documentId>.json`: the caller supplied metadata of each document (only when provided).
 */
export class LocalDocumentIndex extends LocalIndex {
    private readonly _embeddings?: EmbeddingsModel;
    private readonly _tokenizer: Tokenizer;
    private readonly _chunkingConfig?: TextSplitterConfig;

    // Last committed catalog (loaded lazily by loadIndexData()).
    private _catalog?: DocumentCatalog;

    // Working copy of the catalog that exists only while an update is in progress.
    private _newCatalog?: DocumentCatalog;

    /**
     * Creates a new `LocalDocumentIndex` instance.
     * @param config Folder path plus optional embeddings model, tokenizer and chunking overrides.
     */
    public constructor(config: LocalDocumentIndexConfig) {
        super(config.folderPath);
        this._embeddings = config.embeddings;
        this._chunkingConfig = Object.assign({
            keepSeparators: true,
            chunkSize: 512,
            chunkOverlap: 0,
        } as TextSplitterConfig, config.chunkingConfig);

        // An explicit tokenizer wins over one supplied via the chunking config. Whichever is
        // chosen is also handed to the splitter so chunk sizes are measured consistently.
        this._tokenizer = config.tokenizer ?? this._chunkingConfig.tokenizer ?? new GPT3Tokenizer();
        this._chunkingConfig.tokenizer = this._tokenizer;
    }

    /**
     * Returns true if the document catalog exists.
     */
    public async isCatalogCreated(): Promise<boolean> {
        try {
            await fs.access(path.join(this.folderPath, 'catalog.json'));
            return true;
        } catch (err: unknown) {
            return false;
        }
    }

    /**
     * Looks up the ID of a document.
     * @param uri URI the document was added under.
     * @returns The document ID, or `undefined` if the URI is not in the catalog.
     */
    public async getDocumentId(uri: string): Promise<string | undefined> {
        await this.loadIndexData();
        return this._catalog?.uriToId[uri];
    }

    /**
     * Looks up the URI of a document.
     * @param documentId ID of the document.
     * @returns The document URI, or `undefined` if the ID is not in the catalog.
     */
    public async getDocumentUri(documentId: string): Promise<string | undefined> {
        await this.loadIndexData();
        return this._catalog?.idToUri[documentId];
    }

    /**
     * Creates the index and then loads it, which also creates the document catalog when it
     * does not exist yet.
     * @param config Optional index configuration forwarded to `LocalIndex.createIndex()`.
     */
    public async createIndex(config?: CreateIndexConfig): Promise<void> {
        await super.createIndex(config);
        await this.loadIndexData();
    }

    /**
     * Deletes a document, its chunks and its files on disk.
     * @remarks
     * Does nothing when the URI is not in the catalog.
     * @param uri URI of the document to delete.
     * @throws Error if the index update fails or the document's text file cannot be removed.
     */
    public async deleteDocument(uri: string): Promise<void> {
        // Lookup document ID
        const documentId = await this.getDocumentId(uri);
        if (documentId == undefined) {
            return;
        }

        // Delete document chunks from index and remove from catalog
        await this.beginUpdate();
        try {
            // Get list of chunks for document
            const chunks = await this.listItemsByMetadata<DocumentChunkMetadata>({ documentId });

            // Delete chunks
            for (const chunk of chunks) {
                await this.deleteItem(chunk.id);
            }

            // Remove entry from catalog
            delete this._newCatalog!.uriToId[uri];
            delete this._newCatalog!.idToUri[documentId];
            this._newCatalog!.count--;

            // Commit changes
            await this.endUpdate();
        } catch (err: unknown) {
            // Cancel update and raise error
            this.cancelUpdate();
            throw new Error(`Error deleting document "${uri}": ${String(err)}`);
        }

        // Delete text file from disk
        try {
            await fs.unlink(path.join(this.folderPath, `${documentId}.txt`));
        } catch (err: unknown) {
            throw new Error(`Error removing text file for document "${uri}" from disk: ${String(err)}`);
        }

        // Delete metadata file from disk. The file only exists when the document was added
        // with metadata, so a failure here is deliberately ignored.
        try {
            await fs.unlink(path.join(this.folderPath, `${documentId}.json`));
        } catch (err: unknown) {
            // Ignore error
        }
    }

    /**
     * Returns statistics about the catalog: its version, the number of documents and chunks,
     * and the metadata configuration of the underlying index.
     */
    public async getCatalogStats(): Promise<DocumentCatalogStats> {
        // Load explicitly so the catalog is guaranteed to be available below.
        await this.loadIndexData();
        const stats = await this.getIndexStats();
        return {
            version: this._catalog!.version,
            documents: this._catalog!.count,
            chunks: stats.items,
            metadata_config: stats.metadata_config
        };
    }

    /**
     * Adds a document to the catalog.
     * @remarks
     * If a document with the same uri already exists, it will be replaced and keeps its
     * document ID. Embeddings for the new text are generated before the existing document is
     * removed, so a failure while generating embeddings leaves the existing document intact.
     * @param uri URI identifying the document. The text after the last `.` is used as the
     * document type for chunking.
     * @param text Full text of the document.
     * @param metadata Optional metadata saved with the document and copied onto every chunk.
     * The `documentId`, `startPos` and `endPos` chunk fields always take precedence over keys
     * of the same name in `metadata`.
     * @returns Inserted document
     * @throws Error if no embeddings model is configured, embeddings cannot be generated, or
     * the index update fails.
     */
    public async upsertDocument(uri: string, text: string, metadata?: Record<string, MetadataTypes>): Promise<LocalDocument> {
        // Ensure embeddings configured
        const embeddingsModel = this._embeddings;
        if (!embeddingsModel) {
            throw new Error(`Embeddings model not configured.`);
        }

        // Populate docType based on extension
        const config = Object.assign({}, this._chunkingConfig);
        const pos = uri.lastIndexOf('.');
        if (pos >= 0) {
            config.docType = uri.substring(pos + 1).toLowerCase();
        }

        // Split text into chunks
        const splitter = new TextSplitter(config);
        const chunks = splitter.split(text);

        // Generate embeddings for chunks, one batch at a time
        const embeddings: number[][] = [];
        for (let start = 0; start < chunks.length; start += EMBEDDINGS_BATCH_SIZE) {
            const batch = chunks.slice(start, start + EMBEDDINGS_BATCH_SIZE).map(chunk => chunk.text);

            let response: EmbeddingsResponse;
            try {
                response = await embeddingsModel.createEmbeddings(batch);
            } catch (err: unknown) {
                throw new Error(`Error generating embeddings: ${String(err)}`);
            }

            // Check for error
            if (response.status != 'success') {
                throw new Error(`Error generating embeddings: ${response.message}`);
            }

            // Add embeddings to output
            for (const embedding of response.output ?? []) {
                embeddings.push(embedding);
            }
        }

        // Every chunk needs a vector; refuse to index chunks without one.
        if (embeddings.length !== chunks.length) {
            throw new Error(`Error generating embeddings: expected ${chunks.length} embeddings but received ${embeddings.length}.`);
        }

        // Only now that the new content is fully prepared, replace any existing document.
        let documentId = await this.getDocumentId(uri);
        if (documentId != undefined) {
            // Delete existing document
            await this.deleteDocument(uri);
        } else {
            // Generate new document ID
            documentId = v4();
        }

        // Add document chunks to index
        await this.beginUpdate();
        try {
            // Add chunks to index
            for (let i = 0; i < chunks.length; i++) {
                const chunk = chunks[i];

                // System fields are applied last so caller metadata can never clobber the
                // link between a chunk and its document.
                const chunkMetadata: DocumentChunkMetadata = Object.assign({}, metadata, {
                    documentId,
                    startPos: chunk.startPos,
                    endPos: chunk.endPos,
                });
                await this.insertItem({
                    id: v4(),
                    metadata: chunkMetadata,
                    vector: embeddings[i],
                });
            }

            // Save metadata file to disk
            if (metadata != undefined) {
                await fs.writeFile(path.join(this.folderPath, `${documentId}.json`), JSON.stringify(metadata));
            }

            // Save text file to disk
            await fs.writeFile(path.join(this.folderPath, `${documentId}.txt`), text);

            // Add entry to catalog
            this._newCatalog!.uriToId[uri] = documentId;
            this._newCatalog!.idToUri[documentId] = uri;
            this._newCatalog!.count++;

            // Commit changes
            await this.endUpdate();
        } catch (err: unknown) {
            // Cancel update and raise error
            this.cancelUpdate();
            throw new Error(`Error adding document "${uri}": ${String(err)}`);
        }

        // Return document
        return new LocalDocument(this.folderPath, documentId, uri);
    }

    /**
     * Finds the documents that best match a query.
     * @remarks
     * The query is embedded, the closest chunks are retrieved from the index and grouped by
     * document, and the documents are returned ordered by descending score.
     * @param query Text to search for.
     * @param options Optional limits and metadata filter. Defaults to at most 10 documents
     * built from at most 50 chunks.
     * @returns Matching documents, best match first.
     * @throws Error if no embeddings model is configured or the query cannot be embedded.
     */
    public async queryDocuments(query: string, options?: DocumentQueryOptions): Promise<LocalDocumentResult[]> {
        // Ensure embeddings configured
        if (!this._embeddings) {
            throw new Error(`Embeddings model not configured.`);
        }

        // Resolve options. `??` is used so that an explicit `undefined` still gets the default.
        const maxDocuments = options?.maxDocuments ?? 10;
        const maxChunks = options?.maxChunks ?? 50;

        // Generate embeddings for query
        let embeddings: EmbeddingsResponse;
        try {
            embeddings = await this._embeddings.createEmbeddings(query);
        } catch (err: unknown) {
            throw new Error(`Error generating embeddings for query: ${String(err)}`);
        }

        // Check for error
        if (embeddings.status != 'success') {
            throw new Error(`Error generating embeddings for query: ${embeddings.message}`);
        }
        const queryVector = embeddings.output?.[0];
        if (queryVector == undefined) {
            throw new Error(`Error generating embeddings for query: no embedding was returned.`);
        }

        // Query index for chunks
        const results = await this.queryItems<DocumentChunkMetadata>(queryVector, maxChunks, options?.filter);

        // Group chunks by document
        const documentChunks: { [documentId: string]: QueryResult<DocumentChunkMetadata>[]; } = {};
        for (const result of results) {
            const documentId = result.item.metadata.documentId;
            if (documentChunks[documentId] == undefined) {
                documentChunks[documentId] = [];
            }
            documentChunks[documentId].push(result);
        }

        // Create a document result for each document
        const documentResults: LocalDocumentResult[] = [];
        for (const documentId of Object.keys(documentChunks)) {
            const uri = await this.getDocumentUri(documentId) as string;
            documentResults.push(new LocalDocumentResult(this.folderPath, documentId, uri, documentChunks[documentId], this._tokenizer));
        }

        // Sort document results by score and return top results
        return documentResults.sort((a, b) => b.score - a.score).slice(0, maxDocuments);
    }

    // Overrides

    /**
     * Begins an update and creates a working copy of the catalog.
     * @remarks
     * The URI/ID maps are copied as well, so changes made during the update never touch the
     * committed catalog and are fully discarded by `cancelUpdate()`.
     */
    public async beginUpdate(): Promise<void> {
        await super.beginUpdate();
        const draft: DocumentCatalog = Object.assign({}, this._catalog);
        draft.uriToId = Object.assign({}, draft.uriToId);
        draft.idToUri = Object.assign({}, draft.idToUri);
        this._newCatalog = draft;
    }

    /**
     * Cancels the current update and discards the working copy of the catalog.
     */
    public cancelUpdate(): void {
        super.cancelUpdate();
        this._newCatalog = undefined;
    }

    /**
     * Commits the current update and then saves the working copy of the catalog to
     * `catalog.json`, making it the committed catalog.
     * @throws Error if the catalog cannot be written.
     */
    public async endUpdate(): Promise<void> {
        await super.endUpdate();

        try {
            // Save catalog
            await fs.writeFile(path.join(this.folderPath, 'catalog.json'), JSON.stringify(this._newCatalog));
            this._catalog = this._newCatalog;
            this._newCatalog = undefined;
        } catch (err: unknown) {
            throw new Error(`Error saving document catalog: ${String(err)}`);
        }
    }

    /**
     * Ensures that the index and the document catalog have been loaded into memory. An empty
     * catalog is created on disk when none exists yet.
     * @throws Error if a new catalog cannot be written.
     */
    protected async loadIndexData(): Promise<void> {
        await super.loadIndexData();

        if (this._catalog) {
            return;
        }

        const catalogPath = path.join(this.folderPath, 'catalog.json');
        if (await this.isCatalogCreated()) {
            // Load catalog
            const buffer = await fs.readFile(catalogPath);
            this._catalog = JSON.parse(buffer.toString());
        } else {
            try {
                // Initialize catalog. It is only cached once it has been written successfully
                // so a failed write is retried on the next call.
                const catalog: DocumentCatalog = {
                    version: 1,
                    count: 0,
                    uriToId: {},
                    idToUri: {},
                };
                await fs.writeFile(catalogPath, JSON.stringify(catalog));
                this._catalog = catalog;
            } catch (err: unknown) {
                throw new Error(`Error creating document catalog: ${String(err)}`);
            }
        }
    }
}
349
+
350
/**
 * Shape of the `catalog.json` file that tracks the documents stored in the index.
 */
interface DocumentCatalog {
    /** Catalog format version (new catalogs are created with version 1). */
    version: number;

    /** Number of documents currently in the catalog. */
    count: number;

    /** Lookup from document URI to document ID. */
    uriToId: Record<string, string>;

    /** Lookup from document ID to document URI. */
    idToUri: Record<string, string>;
}
@@ -0,0 +1,206 @@
1
+ import { LocalDocument } from "./LocalDocument";
2
+ import { QueryResult, DocumentChunkMetadata, Tokenizer, DocumentTextSection } from "./types";
3
+
4
/**
 * A document returned from a query, together with the chunks of the document that matched.
 */
export class LocalDocumentResult extends LocalDocument {
    private readonly _chunks: QueryResult<DocumentChunkMetadata>[];
    private readonly _tokenizer: Tokenizer;
    private readonly _score: number;

    /**
     * Creates a new `LocalDocumentResult` instance.
     * @param folderPath Folder containing the document's files.
     * @param id ID of the document.
     * @param uri URI of the document.
     * @param chunks Chunks of the document that matched the query.
     * @param tokenizer Tokenizer used to measure and truncate text when rendering sections.
     */
    public constructor(folderPath: string, id: string, uri: string, chunks: QueryResult<DocumentChunkMetadata>[], tokenizer: Tokenizer) {
        super(folderPath, id, uri);
        this._chunks = chunks;
        this._tokenizer = tokenizer;

        // Compute average score. An empty chunk list scores 0 instead of NaN (0 / 0), which
        // would otherwise break score based sorting.
        let total = 0;
        for (const chunk of chunks) {
            total += chunk.score;
        }
        this._score = chunks.length > 0 ? total / chunks.length : 0;
    }

    /**
     * Chunks of the document that matched the query.
     */
    public get chunks(): QueryResult<DocumentChunkMetadata>[] {
        return this._chunks;
    }

    /**
     * Average score of the matched chunks (0 when there are none).
     */
    public get score(): number {
        return this._score;
    }

    /**
     * Renders the matched parts of the document as sections of text that each fit within a
     * token budget.
     * @remarks
     * - If the whole document is shorter than `maxTokens` it is returned as a single section
     *   with a score of 1.0.
     * - Otherwise the matched chunks are arranged in document order and packed into sections
     *   of at most `maxTokens` tokens, including the `...` connectors placed between
     *   non-adjacent chunks. The best scoring `maxSections` sections are kept.
     * - Sections with more than 40 tokens of budget left are padded with the text that
     *   surrounds them in the document.
     * - If every matched chunk is longer than `maxTokens`, the first matched chunk is returned
     *   truncated to `maxTokens` tokens.
     * @param maxTokens Maximum number of tokens per section.
     * @param maxSections Maximum number of sections to return.
     * @returns Rendered sections, best scoring first. Empty when the document does not fit in
     * `maxTokens` and there are no matched chunks to render.
     */
    public async renderSections(maxTokens: number, maxSections: number): Promise<DocumentTextSection[]> {
        const tokenizer = this._tokenizer;

        // Load text from disk
        const text = await this.loadText();

        // First check to see if the entire document is less than maxTokens
        const documentTokens = tokenizer.encode(text);
        if (documentTokens.length < maxTokens) {
            return [{
                text,
                tokenCount: documentTokens.length,
                score: 1.0
            }];
        }

        // Without matched chunks there is nothing to build sections from
        if (this._chunks.length === 0) {
            return [];
        }

        // Resolve the text of each chunk, drop chunks that can never fit, and arrange the
        // remaining chunks in document order.
        const chunks: SectionChunk[] = this._chunks.map(chunk => {
            const { startPos, endPos } = chunk.item.metadata;
            const chunkText = text.substring(startPos, endPos + 1);
            return {
                text: chunkText,
                startPos,
                endPos,
                score: chunk.score,
                tokenCount: tokenizer.encode(chunkText).length
            };
        }).filter(chunk => chunk.tokenCount <= maxTokens).sort((a, b) => a.startPos - b.startPos);

        // Check for no chunks
        if (chunks.length === 0) {
            // Take the top chunk and return a subset of its text
            const topChunk = this._chunks[0];
            const { startPos, endPos } = topChunk.item.metadata;
            const truncated = tokenizer.encode(text.substring(startPos, endPos + 1)).slice(0, maxTokens);
            return [{
                text: tokenizer.decode(truncated),
                tokenCount: truncated.length,
                score: topChunk.score
            }];
        }

        // Connector rendered between chunks of a section that are not adjacent in the document
        const connectorText = '\n\n...\n\n';
        const connector: SectionChunk = {
            text: connectorText,
            startPos: -1,
            endPos: -1,
            score: 0,
            tokenCount: tokenizer.encode(connectorText).length
        };

        // Generate sections. The cost of the connector needed in front of a non-adjacent chunk
        // is charged here so that a finished section never exceeds maxTokens.
        const sections: Section[] = [{ chunks: [], score: 0, tokenCount: 0 }];
        for (const chunk of chunks) {
            let section = sections[sections.length - 1];
            const previous = section.chunks[section.chunks.length - 1];
            let connectorCost = previous !== undefined && previous.endPos + 1 !== chunk.startPos ? connector.tokenCount : 0;
            if (section.tokenCount + connectorCost + chunk.tokenCount > maxTokens) {
                // Start a new section; its first chunk needs no connector
                section = { chunks: [], score: 0, tokenCount: 0 };
                sections.push(section);
                connectorCost = 0;
            }
            section.chunks.push(chunk);
            section.score += chunk.score;
            section.tokenCount += connectorCost + chunk.tokenCount;
        }

        // Normalize section scores (every section holds at least one chunk)
        for (const section of sections) {
            section.score /= section.chunks.length;
        }

        // Sort sections by score and limit to maxSections
        sections.sort((a, b) => b.score - a.score);
        if (sections.length > maxSections) {
            sections.splice(maxSections, sections.length - maxSections);
        }

        for (const section of sections) {
            // Combine adjacent chunks of text
            let index = 0;
            while (index < section.chunks.length - 1) {
                const chunk = section.chunks[index];
                const nextChunk = section.chunks[index + 1];
                if (chunk.endPos + 1 === nextChunk.startPos) {
                    chunk.text += nextChunk.text;
                    chunk.endPos = nextChunk.endPos;
                    chunk.tokenCount += nextChunk.tokenCount;
                    section.chunks.splice(index + 1, 1);
                } else {
                    index++;
                }
            }

            // Insert connectors between the remaining (non-adjacent) chunks. Their tokens were
            // already counted while the section was being generated.
            const joined: SectionChunk[] = [];
            section.chunks.forEach((chunk, position) => {
                if (position > 0) {
                    joined.push(connector);
                }
                joined.push(chunk);
            });
            section.chunks = joined;

            // Add text from before and after the section until maxTokens is reached
            let budget = maxTokens - section.tokenCount;
            if (budget > 40) {
                const sectionStart = section.chunks[0].startPos;
                const sectionEnd = section.chunks[section.chunks.length - 1].endPos;
                const hasTextAfter = sectionEnd < text.length - 1;

                if (sectionStart > 0) {
                    // Share the budget with the trailing context when there is any
                    const beforeTokens = tokenizer.encode(text.substring(0, sectionStart));
                    const beforeBudget = Math.min(beforeTokens.length, hasTextAfter ? Math.ceil(budget / 2) : budget);
                    if (beforeBudget > 0) { // slice(-0) would return every token
                        const beforeText = tokenizer.decode(beforeTokens.slice(-beforeBudget));
                        section.chunks.unshift({
                            text: beforeText,
                            startPos: Math.max(0, sectionStart - beforeText.length),
                            endPos: sectionStart - 1,
                            score: 0,
                            tokenCount: beforeBudget
                        });
                        section.tokenCount += beforeBudget;
                        budget -= beforeBudget;
                    }
                }

                if (hasTextAfter) {
                    const afterTokens = tokenizer.encode(text.substring(sectionEnd + 1));
                    const afterBudget = Math.min(afterTokens.length, budget);
                    if (afterBudget > 0) {
                        const afterText = tokenizer.decode(afterTokens.slice(0, afterBudget));
                        section.chunks.push({
                            text: afterText,
                            startPos: sectionEnd + 1,
                            endPos: sectionEnd + afterText.length,
                            score: 0,
                            tokenCount: afterBudget
                        });
                        section.tokenCount += afterBudget;
                        budget -= afterBudget;
                    }
                }
            }
        }

        // Return final rendered sections
        return sections.map(section => ({
            text: section.chunks.map(chunk => chunk.text).join(''),
            tokenCount: section.tokenCount,
            score: section.score
        }));
    }
}
192
+
193
/**
 * A span of document text used while assembling a rendered section.
 */
interface SectionChunk {
    /** Text of the span. */
    text: string;

    /** Position of the span's first character in the document (-1 for connectors). */
    startPos: number;

    /** Position of the span's last character in the document (-1 for connectors). */
    endPos: number;

    /** Query score of the span (0 for connectors and added context). */
    score: number;

    /** Number of tokens in `text`. */
    tokenCount: number;
}
200
+
201
/**
 * A group of chunks that is rendered as a single block of text.
 */
interface Section {
    /** Chunks that make up the section, in the order they are rendered. */
    chunks: SectionChunk[];

    /** Score of the section: the sum of its chunk scores while it is built, then their average. */
    score: number;

    /** Total number of tokens accounted to the section. */
    tokenCount: number;
}
206
+
package/src/LocalIndex.ts CHANGED
@@ -2,6 +2,7 @@ import * as fs from 'fs/promises';
2
2
  import * as path from 'path';
3
3
  import { v4 } from 'uuid';
4
4
  import { ItemSelector } from './ItemSelector';
5
+ import { IndexItem, IndexStats, MetadataFilter, MetadataTypes, QueryResult } from './types';
5
6
 
6
7
  export interface CreateIndexConfig {
7
8
  version: number;
@@ -11,83 +12,6 @@ export interface CreateIndexConfig {
11
12
  };
12
13
  }
13
14
 
14
- export interface IndexStats {
15
- version: number;
16
- metadata_config: {
17
- indexed?: string[];
18
- };
19
- items: number;
20
- }
21
-
22
- export interface IndexItem<TMetadata = Record<string,MetadataTypes>> {
23
- id: string;
24
- metadata: TMetadata;
25
- vector: number[];
26
- norm: number;
27
- metadataFile?: string;
28
- }
29
-
30
- export interface QueryResult<TMetadata = Record<string,MetadataTypes>> {
31
- item: IndexItem<TMetadata>;
32
- score: number;
33
- }
34
-
35
- export interface MetadataFilter {
36
- [key: string]: MetadataTypes|MetadataFilter|(number|string)[]|MetadataFilter[];
37
-
38
- /**
39
- * Equal to (number, string, boolean)
40
- */
41
- '$eq': number|string|boolean;
42
-
43
- /**
44
- * Not equal to (number, string, boolean)
45
- */
46
- '$ne': number|string|boolean;
47
-
48
- /**
49
- * Greater than (number)
50
- */
51
- '$gt': number;
52
-
53
- /**
54
- * Greater than or equal to (number)
55
- */
56
- '$gte': number;
57
-
58
- /**
59
- * Less than (number)
60
- */
61
- '$lt': number;
62
-
63
- /**
64
- * Less than or equal to (number)
65
- */
66
- '$lte': number;
67
-
68
- /**
69
- * In array (string or number)
70
- */
71
- '$in': (number|string)[];
72
-
73
- /**
74
- * Not in array (string or number)
75
- */
76
- '$nin': (number|string)[];
77
-
78
- /**
79
- * AND (MetadataFilter[])
80
- */
81
- '$and': MetadataFilter[];
82
-
83
- /**
84
- * OR (MetadataFilter[])
85
- */
86
- '$or': MetadataFilter[];
87
- }
88
-
89
- export type MetadataTypes = number|string|boolean;
90
-
91
15
  /**
92
16
  * Local vector index instance.
93
17
  * @remarks
@@ -107,6 +31,13 @@ export class LocalIndex {
107
31
  this._folderPath = folderPath;
108
32
  }
109
33
 
34
/**
 * Folder on disk that this index was constructed with.
 */
public get folderPath(): string {
    return this._folderPath;
}
40
+
110
41
  /**
111
42
  * Begins an update to the index.
112
43
  * @remarks
@@ -364,7 +295,10 @@ export class LocalIndex {
364
295
  }
365
296
  }
366
297
 
367
- private async loadIndexData(): Promise<void> {
298
+ /**
299
+ * Ensures that the index has been loaded into memory.
300
+ */
301
+ protected async loadIndexData(): Promise<void> {
368
302
  if (this._data) {
369
303
  return;
370
304
  }