vectra 0.5.5 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,19 +8,58 @@ import { MetadataFilter, EmbeddingsModel, Tokenizer, MetadataTypes, EmbeddingsRe
8
8
  import { LocalDocumentResult } from './LocalDocumentResult';
9
9
  import { LocalDocument } from './LocalDocument';
10
10
 
11
+ /**
12
+ * Options for querying documents in the index.
13
+ */
11
14
  export interface DocumentQueryOptions {
15
+ /**
16
+ * Optional. Maximum number of documents to return.
17
+ * @remarks
18
+ * Default is 10.
19
+ */
12
20
  maxDocuments?: number;
21
+
22
+ /**
23
+ * Maximum number of chunks to return per document.
24
+ * @remarks
25
+ * Default is 50.
26
+ */
13
27
  maxChunks?: number;
28
+
29
+ /**
30
+ * Optional. Filter to apply to the document metadata.
31
+ */
14
32
  filter?: MetadataFilter;
15
33
  }
16
34
 
35
+ /**
36
+ * Configuration settings for a local document index.
37
+ */
17
38
  export interface LocalDocumentIndexConfig {
39
+ /**
40
+ * Folder path where the index is stored.
41
+ */
18
42
  folderPath: string;
43
+
44
+ /**
45
+ * Optional. Embeddings model to use for generating document embeddings.
46
+ */
19
47
  embeddings?: EmbeddingsModel;
48
+
49
+ /**
50
+ * Optional. Tokenizer to use for splitting text into tokens.
51
+ */
20
52
  tokenizer?: Tokenizer;
53
+
54
+ /**
55
+ * Optional. Configuration settings for splitting text into chunks.
56
+ */
21
57
  chunkingConfig?: Partial<TextSplitterConfig>;
22
58
  }
23
59
 
60
+ /**
61
+ * Represents a local index of documents stored on disk.
62
+ */
24
63
  export class LocalDocumentIndex extends LocalIndex {
25
64
  private readonly _embeddings?: EmbeddingsModel;
26
65
  private readonly _tokenizer: Tokenizer;
@@ -28,7 +67,10 @@ export class LocalDocumentIndex extends LocalIndex {
28
67
  private _catalog?: DocumentCatalog;
29
68
  private _newCatalog?: DocumentCatalog;
30
69
 
31
-
70
+ /**
71
+ * Creates a new `LocalDocumentIndex` instance.
72
+ * @param config Configuration settings for the document index.
73
+ */
32
74
  public constructor(config: LocalDocumentIndexConfig) {
33
75
  super(config.folderPath);
34
76
  this._embeddings = config.embeddings;
@@ -41,6 +83,20 @@ export class LocalDocumentIndex extends LocalIndex {
41
83
  this._chunkingConfig.tokenizer = this._tokenizer;
42
84
  }
43
85
 
86
+ /**
87
+ * Returns the embeddings model used by the index (if configured).
88
+ */
89
+ public get embeddings(): EmbeddingsModel | undefined {
90
+ return this._embeddings;
91
+ }
92
+
93
+ /**
94
+ * Returns the tokenizer used by the index.
95
+ */
96
+ public get tokenizer(): Tokenizer {
97
+ return this._tokenizer;
98
+ }
99
+
44
100
  /**
45
101
  * Returns true if the document catalog exists.
46
102
  */
@@ -53,21 +109,44 @@ export class LocalDocumentIndex extends LocalIndex {
53
109
  }
54
110
  }
55
111
 
112
+ /**
113
+ * Returns the document ID for the given URI.
114
+ * @param uri URI of the document to lookup.
115
+ * @returns Document ID or undefined if not found.
116
+ */
56
117
  public async getDocumentId(uri: string): Promise<string | undefined> {
57
118
  await this.loadIndexData();
58
119
  return this._catalog?.uriToId[uri];
59
120
  }
60
121
 
122
+ /**
123
+ * Returns the document URI for the given ID.
124
+ * @param documentId ID of the document to lookup.
125
+ * @returns Document URI or undefined if not found.
126
+ */
61
127
  public async getDocumentUri(documentId: string): Promise<string | undefined> {
62
128
  await this.loadIndexData();
63
129
  return this._catalog?.idToUri[documentId];
64
130
  }
65
131
 
66
- public async createIndex(config?: CreateIndexConfig): Promise<void> {
67
- await super.createIndex(config);
68
- await this.loadIndexData();
132
+ /**
133
+ * Loads the document catalog from disk and returns its stats.
134
+ * @returns Catalog stats.
135
+ */
136
+ public async getCatalogStats(): Promise<DocumentCatalogStats> {
137
+ const stats = await this.getIndexStats()
138
+ return {
139
+ version: this._catalog!.version,
140
+ documents: this._catalog!.count,
141
+ chunks: stats.items,
142
+ metadata_config: stats.metadata_config
143
+ };
69
144
  }
70
145
 
146
+ /**
147
+ * Deletes a document from the index.
148
+ * @param uri URI of the document to delete.
149
+ */
71
150
  public async deleteDocument(uri: string): Promise<void> {
72
151
  // Lookup document ID
73
152
  const documentId = await this.getDocumentId(uri);
@@ -114,16 +193,6 @@ export class LocalDocumentIndex extends LocalIndex {
114
193
  }
115
194
  }
116
195
 
117
- public async getCatalogStats(): Promise<DocumentCatalogStats> {
118
- const stats = await this.getIndexStats()
119
- return {
120
- version: this._catalog!.version,
121
- documents: this._catalog!.count,
122
- chunks: stats.items,
123
- metadata_config: stats.metadata_config
124
- };
125
- }
126
-
127
196
  /**
128
197
  * Adds a document to the catalog.
129
198
  * @remarks
@@ -245,10 +314,44 @@ export class LocalDocumentIndex extends LocalIndex {
245
314
  }
246
315
 
247
316
  // Return document
248
- return new LocalDocument(this.folderPath, documentId, uri);
317
+ return new LocalDocument(this, documentId, uri);
249
318
  }
319
+
320
+ /**
321
+ * Returns all documents in the index.
322
+ * @remarks
323
+ * Each document will contain all of the document's indexed chunks.
324
+ * @returns Array of documents.
325
+ */
326
+ public async listDocuments(): Promise<LocalDocumentResult[]> {
327
+ // Sort chunks by document ID
328
+ const docs: { [documentId: string]: QueryResult<DocumentChunkMetadata>[]; } = {};
329
+ const chunks = await this.listItems<DocumentChunkMetadata>();
330
+ chunks.forEach(chunk => {
331
+ const metadata = chunk.metadata;
332
+ if (docs[metadata.documentId] == undefined) {
333
+ docs[metadata.documentId] = [];
334
+ }
335
+ docs[metadata.documentId].push({ item: chunk, score: 1.0 });
336
+ });
337
+
338
+ // Create document results
339
+ const results: LocalDocumentResult[] = [];
340
+ for (const documentId in docs) {
341
+ const uri = await this.getDocumentUri(documentId) as string;
342
+ const documentResult = new LocalDocumentResult(this, documentId, uri, docs[documentId], this._tokenizer);
343
+ results.push(documentResult);
344
+ }
250
345
 
346
+ return results;
347
+ }
251
348
 
349
+ /**
350
+ * Queries the index for documents similar to the given query.
351
+ * @param query Text to query for.
352
+ * @param options Optional. Query options.
353
+ * @returns Array of document results.
354
+ */
252
355
  public async queryDocuments(query: string, options?: DocumentQueryOptions): Promise<LocalDocumentResult[]> {
253
356
  // Ensure embeddings configured
254
357
  if (!this._embeddings) {
@@ -292,7 +395,7 @@ export class LocalDocumentIndex extends LocalIndex {
292
395
  for (const documentId in documentChunks) {
293
396
  const chunks = documentChunks[documentId];
294
397
  const uri = await this.getDocumentUri(documentId) as string;
295
- const documentResult = new LocalDocumentResult(this.folderPath, documentId, uri, chunks, this._tokenizer);
398
+ const documentResult = new LocalDocumentResult(this, documentId, uri, chunks, this._tokenizer);
296
399
  documentResults.push(documentResult);
297
400
  }
298
401
 
@@ -312,6 +415,11 @@ export class LocalDocumentIndex extends LocalIndex {
312
415
  this._newCatalog = undefined;
313
416
  }
314
417
 
418
+ public async createIndex(config?: CreateIndexConfig): Promise<void> {
419
+ await super.createIndex(config);
420
+ await this.loadIndexData();
421
+ }
422
+
315
423
  public async endUpdate(): Promise<void> {
316
424
  await super.endUpdate();
317
425
 
@@ -1,13 +1,21 @@
1
1
  import { LocalDocument } from "./LocalDocument";
2
+ import { LocalDocumentIndex } from "./LocalDocumentIndex";
2
3
  import { QueryResult, DocumentChunkMetadata, Tokenizer, DocumentTextSection } from "./types";
3
4
 
5
+ /**
6
+ * Represents a search result for a document stored on disk.
7
+ */
4
8
  export class LocalDocumentResult extends LocalDocument {
5
9
  private readonly _chunks: QueryResult<DocumentChunkMetadata>[];
6
10
  private readonly _tokenizer: Tokenizer;
7
11
  private readonly _score: number;
8
12
 
9
- public constructor(folderPath: string, id: string, uri: string, chunks: QueryResult<DocumentChunkMetadata>[], tokenizer: Tokenizer) {
10
- super(folderPath, id, uri);
13
+ /**
14
+ * @private
15
+ * Internal constructor for `LocalDocumentResult` instances.
16
+ */
17
+ public constructor(index: LocalDocumentIndex, id: string, uri: string, chunks: QueryResult<DocumentChunkMetadata>[], tokenizer: Tokenizer) {
18
+ super(index, id, uri);
11
19
  this._chunks = chunks;
12
20
  this._tokenizer = tokenizer;
13
21
 
@@ -17,30 +25,112 @@ export class LocalDocumentResult extends LocalDocument {
17
25
  this._score = score / this._chunks.length;
18
26
  }
19
27
 
28
+ /**
29
+ * Returns the chunks of the document that matched the query.
30
+ */
20
31
  public get chunks(): QueryResult<DocumentChunkMetadata>[] {
21
32
  return this._chunks;
22
33
  }
23
34
 
35
+ /**
36
+ * Returns the average score of the document result.
37
+ */
24
38
  public get score(): number {
25
39
  return this._score;
26
40
  }
27
41
 
28
- public async renderSections(maxTokens: number, maxSections: number, overlappingChunks = true): Promise<DocumentTextSection[]> {
42
+ /**
43
+ * Renders all of the result's chunks as spans of text (sections).
44
+ * @remarks
45
+ * The returned sections will be sorted by document order and limited to maxTokens in length.
46
+ * @param maxTokens Maximum number of tokens per section.
47
+ * @returns Array of rendered text sections.
48
+ */
49
+ public async renderAllSections(maxTokens: number): Promise<DocumentTextSection[]> {
29
50
  // Load text from disk
30
51
  const text = await this.loadText();
31
52
 
32
- // First check to see if the entire document is less than maxTokens
33
- if (text.length <= (maxTokens * 8)) {
34
- const tokens = this._tokenizer.encode(text);
35
- if (tokens.length < maxTokens) {
36
- return [{
37
- text,
38
- tokenCount: tokens.length,
39
- score: 1.0
40
- }];
53
+ // Add chunks to a temp array and split any chunks that are longer than maxTokens.
54
+ const chunks: SectionChunk[] = [];
55
+ for (let i = 0; i < this._chunks.length; i++) {
56
+ const chunk = this._chunks[i];
57
+ const startPos = chunk.item.metadata.startPos;
58
+ const endPos = chunk.item.metadata.endPos;
59
+ const chunkText = text.substring(startPos, endPos + 1);
60
+ const tokens = this._tokenizer.encode(chunkText);
61
+ let offset = 0;
62
+ while (offset < tokens.length) {
63
+ const chunkLength = Math.min(maxTokens, tokens.length - offset);
64
+ chunks.push({
65
+ text: this._tokenizer.decode(tokens.slice(offset, offset + chunkLength)),
66
+ startPos: startPos + offset,
67
+ endPos: startPos + offset + chunkLength - 1,
68
+ score: chunk.score,
69
+ tokenCount: chunkLength
70
+ });
71
+ offset += chunkLength;
41
72
  }
42
73
  }
43
74
 
75
+ // Sort chunks by startPos
76
+ const sorted = chunks.sort((a, b) => a.startPos - b.startPos);
77
+
78
+ // Generate sections
79
+ const sections: Section[] = [];
80
+ for (let i = 0; i < sorted.length; i++) {
81
+ const chunk = sorted[i];
82
+ let section = sections[sections.length - 1];
83
+ if (!section || section.tokenCount + chunk.tokenCount > maxTokens) {
84
+ section = {
85
+ chunks: [],
86
+ score: 0,
87
+ tokenCount: 0
88
+ };
89
+ sections.push(section);
90
+ }
91
+ section.chunks.push(chunk);
92
+ section.score += chunk.score;
93
+ section.tokenCount += chunk.tokenCount;
94
+ }
95
+
96
+ // Normalize section scores
97
+ sections.forEach(section => section.score /= section.chunks.length);
98
+
99
+ // Return final rendered sections
100
+ return sections.map(section => {
101
+ let text = '';
102
+ section.chunks.forEach(chunk => text += chunk.text);
103
+ return {
104
+ text: text,
105
+ tokenCount: section.tokenCount,
106
+ score: section.score
107
+ };
108
+ });
109
+ }
110
+
111
+ /**
112
+ * Renders the top spans of text (sections) of the document based on the query result.
113
+ * @remarks
114
+ * The returned sections will be sorted by relevance and limited to the top `maxSections`.
115
+ * @param maxTokens Maximum number of tokens per section.
116
+ * @param maxSections Maximum number of sections to return.
117
+ * @param overlappingChunks Optional. If true, overlapping chunks of text will be added to each section until the maxTokens is reached.
118
+ * @returns Array of rendered text sections.
119
+ */
120
+ public async renderSections(maxTokens: number, maxSections: number, overlappingChunks = true): Promise<DocumentTextSection[]> {
121
+ // Load text from disk
122
+ const text = await this.loadText();
123
+
124
+ // First check to see if the entire document is shorter than maxTokens
125
+ const length = await this.getLength();
126
+ if (length <= maxTokens) {
127
+ return [{
128
+ text,
129
+ tokenCount: length,
130
+ score: 1.0
131
+ }];
132
+ }
133
+
44
134
  // Otherwise, we need to split the document into sections
45
135
  // - Add each chunk to a temp array and filter out any chunk that's longer than maxTokens.
46
136
  // - Sort the array by startPos to arrange chunks in document order.
@@ -78,24 +168,21 @@ export class LocalDocumentResult extends LocalDocument {
78
168
  }
79
169
 
80
170
  // Generate sections
81
- const sections: Section[] = [{
82
- chunks: [],
83
- score: 0,
84
- tokenCount: 0
85
- }];
171
+ const sections: Section[] = [];
86
172
  for (let i = 0; i < chunks.length; i++) {
87
173
  const chunk = chunks[i];
88
174
  let section = sections[sections.length - 1];
89
- if (section.tokenCount + chunk.tokenCount > maxTokens) {
90
- sections.push({
175
+ if (!section || section.tokenCount + chunk.tokenCount > maxTokens) {
176
+ section = {
91
177
  chunks: [],
92
178
  score: 0,
93
179
  tokenCount: 0
94
- });
180
+ };
181
+ sections.push(section);
95
182
  }
96
- sections[sections.length - 1].chunks.push(chunk);
97
- sections[sections.length - 1].score += chunk.score;
98
- sections[sections.length - 1].tokenCount += chunk.tokenCount;
183
+ section.chunks.push(chunk);
184
+ section.score += chunk.score;
185
+ section.tokenCount += chunk.tokenCount;
99
186
  }
100
187
 
101
188
  // Normalize section scores
package/src/LocalIndex.ts CHANGED
@@ -27,8 +27,8 @@ export class LocalIndex {
27
27
 
28
28
  /**
29
29
  * Creates a new instance of LocalIndex.
30
- * @param folderPath - Path to the index folder
31
- * @param indexName - Optional name of the index file. Defaults to index.json
30
+ * @param folderPath Path to the index folder.
31
+ * @param indexName Optional name of the index file. Defaults to index.json.
32
32
  */
33
33
  public constructor(folderPath: string, indexName?: string) {
34
34
  this._folderPath = folderPath;
@@ -76,7 +76,7 @@ export class LocalIndex {
76
76
  * Creates a new index.
77
77
  * @remarks
78
78
  * This method creates a new folder on disk containing an index.json file.
79
- * @param config - Index configuration
79
+ * @param config Index configuration.
80
80
  */
81
81
  public async createIndex(config: CreateIndexConfig = {version: 1}): Promise<void> {
82
82
  // Delete if exists
@@ -121,7 +121,7 @@ export class LocalIndex {
121
121
 
122
122
  /**
123
123
  * Deletes an item from the index.
124
- * @param id - Item id
124
+ * @param id ID of item to delete.
125
125
  */
126
126
  public async deleteItem(id: string): Promise<void> {
127
127
  if (this._update) {
@@ -161,7 +161,7 @@ export class LocalIndex {
161
161
 
162
162
  /**
163
163
  * Loads an index from disk and returns its stats.
164
- * @returns Index stats
164
+ * @returns Index stats.
165
165
  */
166
166
  public async getIndexStats(): Promise<IndexStats> {
167
167
  await this.loadIndexData();
@@ -174,8 +174,8 @@ export class LocalIndex {
174
174
 
175
175
  /**
176
176
  * Returns an item from the index given its ID.
177
- * @param id Item id
178
- * @returns Item or undefined if not found
177
+ * @param id ID of the item to retrieve.
178
+ * @returns Item or undefined if not found.
179
179
  */
180
180
  public async getItem<TMetadata = Record<string,MetadataTypes>>(id: string): Promise<IndexItem<TMetadata> | undefined> {
181
181
  await this.loadIndexData();
@@ -187,8 +187,8 @@ export class LocalIndex {
187
187
  * @remarks
188
188
  * A new update is started if one is not already in progress. If an item with the same ID
189
189
  * already exists, an error will be thrown.
190
- * @param item Item to insert
191
- * @returns Inserted item
190
+ * @param item Item to insert.
191
+ * @returns Inserted item.
192
192
  */
193
193
  public async insertItem<TMetadata = Record<string,MetadataTypes>>(item: Partial<IndexItem<TMetadata>>): Promise<IndexItem<TMetadata>> {
194
194
  if (this._update) {
@@ -218,7 +218,7 @@ export class LocalIndex {
218
218
  * @remarks
219
219
  * This method loads the index into memory and returns all its items. A copy of the items
220
220
  * array is returned so no modifications should be made to the array.
221
- * @returns All items in the index
221
+ * @returns Array of all items in the index.
222
222
  */
223
223
  public async listItems<TMetadata = Record<string,MetadataTypes>>(): Promise<IndexItem<TMetadata>[]> {
224
224
  await this.loadIndexData();
@@ -229,8 +229,8 @@ export class LocalIndex {
229
229
  * Returns all items in the index matching the filter.
230
230
  * @remarks
231
231
  * This method loads the index into memory and returns all its items matching the filter.
232
- * @param filter Filter to apply
233
- * @returns Items matching the filter
232
+ * @param filter Filter to apply.
233
+ * @returns Array of items matching the filter.
234
234
  */
235
235
  public async listItemsByMetadata<TMetadata = Record<string,MetadataTypes>>(filter: MetadataFilter): Promise<IndexItem<TMetadata>[]> {
236
236
  await this.loadIndexData();
@@ -242,10 +242,10 @@ export class LocalIndex {
242
242
  * @remarks
243
243
  * This method loads the index into memory and returns the top k items that are most similar.
244
244
  * An optional filter can be applied to the metadata of the items.
245
- * @param vector Vector to query against
246
- * @param topK Number of items to return
247
- * @param filter Optional filter to apply
248
- * @returns Similar items to the vector that matches the filter
245
+ * @param vector Vector to query against.
246
+ * @param topK Number of items to return.
247
+ * @param filter Optional. Filter to apply.
248
+ * @returns Items similar to the vector that match the supplied filter.
249
249
  */
250
250
  public async queryItems<TMetadata = Record<string,MetadataTypes>>(vector: number[], topK: number, filter?: MetadataFilter): Promise<QueryResult<TMetadata>[]> {
251
251
  await this.loadIndexData();
@@ -293,8 +293,8 @@ export class LocalIndex {
293
293
  * @remarks
294
294
  * A new update is started if one is not already in progress. If an item with the same ID
295
295
  * already exists, it will be replaced.
296
- * @param item Item to insert or replace
297
- * @returns Upserted item
296
+ * @param item Item to insert or replace.
297
+ * @returns Upserted item.
298
298
  */
299
299
  public async upsertItem<TMetadata = Record<string,MetadataTypes>>(item: Partial<IndexItem<TMetadata>>): Promise<IndexItem<TMetadata>> {
300
300
  if (this._update) {
@@ -153,6 +153,7 @@ export class TextSplitter {
153
153
  currentLength = chunk.tokens.length;
154
154
  } else {
155
155
  currentChunk.text += separator + chunk.text;
156
+ currentChunk.endPos = chunk.endPos;
156
157
  currentChunk.tokens.push(...chunk.tokens);
157
158
  currentLength += chunk.tokens.length;
158
159
  }