vectra 0.12.1 → 0.12.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. package/LICENSE +1 -1
  2. package/README.draft.md +499 -0
  3. package/README.draft.outline.md +160 -0
  4. package/README.research.md +2159 -0
  5. package/lib/FileFetcher.d.ts +5 -0
  6. package/lib/FileFetcher.d.ts.map +1 -0
  7. package/lib/FileFetcher.js +79 -0
  8. package/lib/FileFetcher.js.map +1 -0
  9. package/lib/GPT3Tokenizer.d.ts +9 -0
  10. package/lib/GPT3Tokenizer.d.ts.map +1 -0
  11. package/lib/GPT3Tokenizer.js +17 -0
  12. package/lib/GPT3Tokenizer.js.map +1 -0
  13. package/lib/ItemSelector.d.ts +41 -0
  14. package/lib/ItemSelector.d.ts.map +1 -0
  15. package/lib/ItemSelector.js +168 -0
  16. package/lib/ItemSelector.js.map +1 -0
  17. package/lib/LocalDocument.d.ts +54 -0
  18. package/lib/LocalDocument.d.ts.map +1 -0
  19. package/lib/LocalDocument.js +156 -0
  20. package/lib/LocalDocument.js.map +1 -0
  21. package/lib/LocalDocumentIndex.d.ts +132 -0
  22. package/lib/LocalDocumentIndex.d.ts.map +1 -0
  23. package/lib/LocalDocumentIndex.js +456 -0
  24. package/lib/LocalDocumentIndex.js.map +1 -0
  25. package/lib/LocalDocumentResult.d.ts +45 -0
  26. package/lib/LocalDocumentResult.d.ts.map +1 -0
  27. package/lib/LocalDocumentResult.js +328 -0
  28. package/lib/LocalDocumentResult.js.map +1 -0
  29. package/lib/LocalIndex.d.ts +150 -0
  30. package/lib/LocalIndex.d.ts.map +1 -0
  31. package/lib/LocalIndex.js +515 -0
  32. package/lib/LocalIndex.js.map +1 -0
  33. package/lib/LocalIndex.spec.d.ts +2 -0
  34. package/lib/LocalIndex.spec.js +218 -7
  35. package/lib/LocalIndex.spec.js.map +1 -1
  36. package/lib/OpenAIEmbeddings.d.ts +126 -0
  37. package/lib/OpenAIEmbeddings.d.ts.map +1 -0
  38. package/lib/OpenAIEmbeddings.js +174 -0
  39. package/lib/OpenAIEmbeddings.js.map +1 -0
  40. package/lib/TextSplitter.d.ts +19 -0
  41. package/lib/TextSplitter.d.ts.map +1 -0
  42. package/lib/TextSplitter.js +457 -0
  43. package/lib/TextSplitter.js.map +1 -0
  44. package/lib/TextSplitter.spec.d.ts +2 -0
  45. package/lib/TextSplitter.spec.d.ts.map +1 -0
  46. package/lib/TextSplitter.spec.js +109 -0
  47. package/lib/TextSplitter.spec.js.map +1 -0
  48. package/lib/WebFetcher.d.ts +15 -0
  49. package/lib/WebFetcher.d.ts.map +1 -0
  50. package/lib/WebFetcher.js +234 -0
  51. package/lib/WebFetcher.js.map +1 -0
  52. package/lib/index.d.ts +12 -0
  53. package/lib/index.d.ts.map +1 -0
  54. package/lib/index.js +28 -0
  55. package/lib/index.js.map +1 -0
  56. package/lib/internals/Colorize.d.ts +14 -0
  57. package/lib/internals/Colorize.d.ts.map +1 -0
  58. package/lib/internals/Colorize.js +64 -0
  59. package/lib/internals/Colorize.js.map +1 -0
  60. package/lib/internals/index.d.ts +3 -0
  61. package/lib/internals/index.d.ts.map +1 -0
  62. package/lib/internals/index.js +19 -0
  63. package/lib/internals/index.js.map +1 -0
  64. package/lib/internals/types.d.ts +43 -0
  65. package/lib/internals/types.d.ts.map +1 -0
  66. package/lib/internals/types.js +3 -0
  67. package/lib/internals/types.js.map +1 -0
  68. package/lib/types.d.ts +146 -0
  69. package/lib/types.d.ts.map +1 -0
  70. package/lib/types.js +3 -0
  71. package/lib/types.js.map +1 -0
  72. package/lib/vectra-cli.d.ts +2 -0
  73. package/lib/vectra-cli.d.ts.map +1 -0
  74. package/lib/vectra-cli.js +323 -0
  75. package/lib/vectra-cli.js.map +1 -0
  76. package/package.json +5 -3
  77. package/src/GPT3Tokenizer.ts +1 -1
  78. package/src/LocalIndex.spec.ts +265 -8
  79. package/src/LocalIndex.ts +1 -0
  80. package/src/TextSplitter.spec.ts +87 -0
  81. package/src/TextSplitter.ts +459 -531
@@ -0,0 +1,2159 @@
1
+ MEMORY:
2
+ SOURCE: src\index.ts
3
+ DETAILS: export * from './FileFetcher';
4
+ export * from './GPT3Tokenizer';
5
+ export * from './ItemSelector';
6
+ export * from './LocalIndex';
7
+ export * from './LocalDocument';
8
+ export * from './LocalDocumentIndex';
9
+ export * from './LocalDocumentResult';
10
+ export * from './OpenAIEmbeddings';
11
+ export * from './TextSplitter';
12
+ export * from './types';
13
+ export * from './WebFetcher';
14
+
15
+ MEMORY:
16
+ SOURCE: src\LocalIndex.spec.ts
17
+ DETAILS: import assert from 'node:assert'
18
+ import sinon from 'sinon'
19
+ import { LocalIndex } from './LocalIndex'
20
+ import { IndexItem } from './types'
21
+ import fs from 'fs/promises'
22
+ import path from 'path'
23
+
24
+ describe('LocalIndex', () => {
25
+ const testIndexDir = path.join(__dirname, 'test_index');
26
+
27
+ const basicIndexItems: Partial<IndexItem>[] = [
28
+ { id: '1', vector: [1, 2, 3] },
29
+ { id: '2', vector: [2, 3, 4] },
30
+ { id: '3', vector: [3, 4, 5] }
31
+ ];
32
+
33
+
34
+ beforeEach(async () => {
35
+ await fs.rm(testIndexDir, { recursive: true, force: true });
36
+ });
37
+
38
+ afterEach(async () => {
39
+ await fs.rm(testIndexDir, { recursive: true, force: true });
40
+ sinon.restore();
41
+ });
42
+
43
+ it('should create a new index', async () => {
44
+ const index = new LocalIndex(testIndexDir);
45
+ await index.createIndex();
46
+ const created = await index.isIndexCreated();
47
+ assert.equal(created, true);
48
+ assert.equal(index.folderPath, testIndexDir);
49
+ });
50
+
51
+ it('blocks concurrent operations when lock is held', async () => {
52
+ const index = new LocalIndex(testIndexDir);
53
+ await index.createIndex();
54
+ await index.beginUpdate(); // grab lock for a big update!
55
+ await assert.rejects(async () => {
56
+ await index.beginUpdate(); // try to grab lock again. should fail!
57
+ }, new Error('Update already in progress'))
58
+ })
59
+
60
+ describe('createIndex', () => {
61
+ it('checks for existing index on creation', async () => {
62
+ const index = new LocalIndex(testIndexDir);
63
+ await index.createIndex(); // create first index.json
64
+
65
+ // create without deleteIfExists. Will reject
66
+ await assert.rejects(async () => {
67
+ await index.createIndex()
68
+ }, new Error('Index already exists'))
69
+
70
+ // create with deleteIfExists. Should remove old data
71
+ await index.insertItem({id:'1', vector: [1,2,3]})
72
+ const lengthBefore = (await index.listItems()).length
73
+ assert.equal(lengthBefore, 1)
74
+ await index.createIndex({deleteIfExists: true, version: 2, metadata_config: {}})
75
+ const lengthAfter = (await index.listItems()).length
76
+ assert.equal(lengthAfter, 0)
77
+ })
78
+
79
+ it('delete index if file creation fails', async () => {
80
+ const index = new LocalIndex(testIndexDir);
81
+ sinon.stub(fs, 'writeFile').rejects(new Error('fs error'))
82
+
83
+ await assert.rejects(async () => {
84
+ await index.createIndex();
85
+ }, new Error('Error creating index'))
86
+
87
+ await assert.rejects(async () => {
88
+ await index.listItems();
89
+ })
90
+ })
91
+ })
92
+
93
+ describe('deleteItem', () => {
94
+ it('does nothing when id not found', async () => {
95
+ const index = new LocalIndex(testIndexDir);
96
+ await index.createIndex();
97
+ await index.beginUpdate();
98
+ await index.insertItem(basicIndexItems[0])
99
+ await index.insertItem(basicIndexItems[1])
100
+ await index.insertItem(basicIndexItems[2])
101
+ await index.endUpdate();
102
+
103
+ await assert.doesNotReject(async () => {
104
+ await index.deleteItem('dne');
105
+ })
106
+ assert.equal((await index.listItems()).length, 3)
107
+ })
108
+
109
+ it('leaves existing empty index when last el deleted', async () => {
110
+ const index = new LocalIndex(testIndexDir);
111
+ await index.createIndex();
112
+ await index.insertItem(basicIndexItems[0]);
113
+
114
+ await index.deleteItem(basicIndexItems[0].id ?? '');
115
+ assert.equal(await index.isIndexCreated(), true);
116
+ assert.equal((await index.listItems()).length, 0);
117
+ })
118
+
119
+ it('removes elements from any position', async () => {
120
+ const index = new LocalIndex(testIndexDir);
121
+ await index.createIndex();
122
+ await index.batchInsertItems([
123
+ {id: '1', vector: []},
124
+ {id: '2', vector: []},
125
+ {id: '3', vector: []},
126
+ {id: '4', vector: []},
127
+ {id: '5', vector: []},
128
+ ]);
129
+
130
+ await index.beginUpdate();
131
+ await index.deleteItem('1');
132
+ await index.deleteItem('3');
133
+ await index.deleteItem('5');
134
+ await index.endUpdate();
135
+
136
+ assert.deepStrictEqual(await index.listItems(), [{id: '2', vector: [], metadata: {}, norm: 0}, {id: '4', vector: [], metadata: {}, norm: 0}])
137
+ })
138
+ })
139
+
140
+ describe('endUpdate', () => {
141
+ it('throws an error if no update has begun', async () => {
142
+ const index = new LocalIndex(testIndexDir);
143
+
144
+ await assert.rejects(async () => {
145
+ await index.endUpdate();
146
+ }, new Error('No update in progress'));
147
+ })
148
+
149
+ it('throws an error if the index could not be saved', async () => {
150
+ const index = new LocalIndex(testIndexDir, 'index.json');
151
+ await index.createIndex();
152
+ await index.beginUpdate();
153
+
154
+ sinon.stub(fs, 'writeFile').rejects(new Error('fs error'))
155
+
156
+ await assert.rejects(async () => {
157
+ await index.endUpdate();
158
+ }, new Error('Error saving index: Error: fs error'))
159
+ })
160
+ })
161
+
162
+ describe('getIndexStats', () => {
163
+ it('reports empty index correctly', async () => {
164
+ const index = new LocalIndex(testIndexDir);
165
+ await index.createIndex();
166
+
167
+ assert.deepStrictEqual(await index.getIndexStats(), {
168
+ version: 1,
169
+ metadata_config: {},
170
+ items: 0
171
+ })
172
+ })
173
+
174
+ it('correctly reports non-empty index stats', async () => {
175
+ const index = new LocalIndex(testIndexDir)
176
+ await index.createIndex({version: 1, metadata_config: {indexed: []}})
177
+ await index.batchInsertItems(basicIndexItems);
178
+
179
+ assert.deepStrictEqual(await index.getIndexStats(), {
180
+ version: 1,
181
+ metadata_config: {indexed: []},
182
+ items: 3
183
+ })
184
+ })
185
+ })
186
+
187
+ describe('getItem', () => {
188
+ it('returns undefined when item not found', async () => {
189
+ const index = new LocalIndex(testIndexDir);
190
+ await index.createIndex();
191
+
192
+ assert.equal(await index.getItem('1'), undefined)
193
+ })
194
+
195
+ it('returns requested item', async () => {
196
+ const index = new LocalIndex(testIndexDir);
197
+ await index.createIndex();
198
+ await index.batchInsertItems(basicIndexItems);
199
+
200
+ const item2 = await index.getItem('2');
201
+ assert.equal(item2?.id, basicIndexItems[1].id)
202
+ assert.equal(item2?.vector, basicIndexItems[1].vector)
203
+ assert.equal((await index.listItems()).length, 3)
204
+ })
205
+ })
206
+
207
+ describe('batchInsertItems', () => {
208
+ it('should insert provided items', async () => {
209
+ const index = new LocalIndex(testIndexDir);
210
+ await index.createIndex();
211
+
212
+ const newItems = await index.batchInsertItems(basicIndexItems);
213
+
214
+ assert.equal(newItems.length, 3);
215
+
216
+ const retrievedItems = await index.listItems();
217
+ assert.equal(retrievedItems.length, 3);
218
+ });
219
+
220
+ it('on id collision - cancel batch insert & bubble up error', async () => {
221
+ const index = new LocalIndex(testIndexDir);
222
+ await index.createIndex();
223
+
224
+ await index.insertItem({ id: '2', vector: [9, 9, 9] });
225
+
226
+ // ensures insert error is bubbled up to batchIndexItems caller
227
+ await assert.rejects(
228
+ async () => {
229
+ await index.batchInsertItems(basicIndexItems);
230
+ },
231
+ {
232
+ name: 'Error',
233
+ message: 'Item with id 2 already exists'
234
+ }
235
+ );
236
+
237
+ // ensures no partial update is applied
238
+ const storedItems = await index.listItems();
239
+ assert.equal(storedItems.length, 1);
240
+ });
241
+ });
242
+
243
+ describe('listItemsByMetadata', () => {
244
+ it('returns items matching metadata filter', async () => {
245
+ const index = new LocalIndex(testIndexDir);
246
+ await index.createIndex();
247
+ await index.batchInsertItems([
248
+ {id: '1', vector: [], metadata: {category: 'food'}},
249
+ {id: '2', vector: [], metadata: {category: 'food'}},
250
+ {id: '3', vector: [], metadata: {category: 'electronics'}},
251
+ {id: '4', vector: [], metadata: {category: 'drink'}},
252
+ {id: '5', vector: [], metadata: {category: 'food'}},
253
+ ]);
254
+
255
+ const foodItems = await index.listItemsByMetadata({category: {'$eq': 'food'}})
256
+ assert.deepStrictEqual(foodItems.map((item) => item.id), ["1", "2", "5"])
257
+ const drinkItems = await index.listItemsByMetadata({category: {'$eq': 'drink'}})
258
+ assert.deepStrictEqual(drinkItems.map((item) => item.id), ["4"])
259
+ const clothingItems = await index.listItemsByMetadata({category: {'$eq': 'clothes'}})
260
+ assert.deepStrictEqual(clothingItems, [])
261
+ })
262
+
263
+ it('returns nothing when no items in index', async () => {
264
+ const index = new LocalIndex(testIndexDir);
265
+ await index.createIndex();
266
+
267
+ const items = await index.listItemsByMetadata({});
268
+ assert.deepStrictEqual(items, []);
269
+ })
270
+ });
271
+
272
+ describe("queryItems", () => {
273
+ it("returns empty array on empty index search", async () => {
274
+ const index = new LocalIndex(testIndexDir);
275
+ await index.createIndex();
276
+
277
+ const result = await index.queryItems([1, 2, 3], "", 10);
278
+ assert.deepStrictEqual(result, []);
279
+ });
280
+
281
+ it("returns bad match when no better match exists", async () => {
282
+ const index = new LocalIndex(testIndexDir);
283
+ await index.createIndex();
284
+ await index.insertItem({ id: "1", vector: [0.9, 0, 0, 0, 0] });
285
+
286
+ const result = await index.queryItems([0, 0, 0, 0, 0.1], "", 1);
287
+ assert.equal(result[0]?.score, 0);
288
+ assert.equal(result[0]?.item.id, "1");
289
+ });
290
+
291
+ it("returns all vectors when fewer than topK exist", async () => {
292
+ const index = new LocalIndex(testIndexDir);
293
+ await index.createIndex();
294
+ await index.batchInsertItems(basicIndexItems);
295
+
296
+ const result = await index.queryItems([0, 0, 1], "", 10);
297
+ assert.equal(result.length, 3);
298
+ assert.deepStrictEqual(
299
+ result.map(({ item }) => item.id),
300
+ basicIndexItems.map((item) => item.id),
301
+ );
302
+ });
303
+
304
+ it("filters by metadata when filter provided", async () => {
305
+ const index = new LocalIndex(testIndexDir);
306
+ await index.createIndex();
307
+ await index.batchInsertItems([
308
+ { id: "1", vector: [1, 0, 0], metadata: { category: "food" } },
309
+ { id: "2", vector: [0, 0, 1], metadata: { category: "drink" } },
310
+ ]);
311
+
312
+ const bestGeneralMatch = await index.queryItems([1, 0, 0], "", 1);
313
+ const bestDrinkMatch = await index.queryItems([1, 0, 0], "", 1, {
314
+ category: { $eq: "drink" },
315
+ });
316
+
317
+ assert.equal(bestGeneralMatch[0].item.id, "1");
318
+ assert.equal(bestDrinkMatch[0].item.id, "2");
319
+ });
320
+
321
+ it("reads item metadata file when provided", async () => {
322
+ const index = new LocalIndex(testIndexDir);
323
+ await index.createIndex({version: 1, metadata_config: {indexed: ['category']}});
324
+ await index.batchInsertItems([
325
+ { id: "1", vector: [1, 0, 0] },
326
+ { id: "2", vector: [0, 0, 1], metadata: {category: 'drink'} },
327
+ ]);
328
+
329
+ sinon
330
+ .stub(fs, "readFile")
331
+ .resolves(JSON.stringify({ category: "drink" }));
332
+
333
+ const bestDrinkMatch = await index.queryItems([1, 0, 0], "", 2, {category: {'$eq': 'drink'}});
334
+
335
+ assert.notEqual(bestDrinkMatch[0].item.metadataFile, undefined);
336
+ assert.equal(bestDrinkMatch[0].item.id, "2");
337
+ });
338
+ });
339
+ });
340
+
341
+ MEMORY:
342
+ SOURCE: src\OpenAIEmbeddings.ts
343
+ DETAILS: import axios, { AxiosInstance, AxiosResponse, AxiosRequestConfig } from 'axios';
344
+ import { EmbeddingsModel, EmbeddingsResponse } from "./types";
345
+ import { CreateEmbeddingRequest, CreateEmbeddingResponse, OpenAICreateEmbeddingRequest } from "./internals";
346
+ import { Colorize } from "./internals";
347
+
348
+ export interface BaseOpenAIEmbeddingsOptions {
349
+ /**
350
+ * Optional. Number of embedding dimensions to return.
351
+ */
352
+ dimensions?: number;
353
+
354
+ /**
355
+ * Optional. Whether to log requests to the console.
356
+ * @remarks
357
+ * This is useful for debugging prompts and defaults to `false`.
358
+ */
359
+ logRequests?: boolean;
360
+
361
+ /**
362
+ * Optional. Maximum number of tokens that can be sent to the embedding model.
363
+ */
364
+ maxTokens?: number;
365
+
366
+ /**
367
+ * Optional. Retry policy to use when calling the OpenAI API.
368
+ * @remarks
369
+ * The default retry policy is `[2000, 5000]` which means that the first retry will be after
370
+ * 2 seconds and the second retry will be after 5 seconds.
371
+ */
372
+ retryPolicy?: number[];
373
+
374
+ /**
375
+ * Optional. Request options to use when calling the OpenAI API.
376
+ */
377
+ requestConfig?: AxiosRequestConfig;
378
+ }
379
+
380
+
381
+ /**
382
+ * Options for configuring an `OpenAIEmbeddings` to generate embeddings using an OSS hosted model.
383
+ */
384
+ export interface OSSEmbeddingsOptions extends BaseOpenAIEmbeddingsOptions {
385
+ /**
386
+ * Model to use for completion.
387
+ */
388
+ ossModel: string;
389
+
390
+ /**
391
+ * Optional. Endpoint to use when calling the OpenAI API.
392
+ * @remarks
393
+ * For Azure OpenAI this is the deployment endpoint.
394
+ */
395
+ ossEndpoint: string;
396
+ }
397
+
398
+ /**
399
+ * Options for configuring an `OpenAIEmbeddings` to generate embeddings using an OpenAI hosted model.
400
+ */
401
+ export interface OpenAIEmbeddingsOptions extends BaseOpenAIEmbeddingsOptions {
402
+ /**
403
+ * API key to use when calling the OpenAI API.
404
+ * @remarks
405
+ * A new API key can be created at https://platform.openai.com/account/api-keys.
406
+ */
407
+ apiKey: string;
408
+
409
+ /**
410
+ * Model to use for completion.
411
+ * @remarks
412
+ * For Azure OpenAI this is the name of the deployment to use.
413
+ */
414
+ model: string;
415
+
416
+ /**
417
+ * Optional. Organization to use when calling the OpenAI API.
418
+ */
419
+ organization?: string;
420
+
421
+ /**
422
+ * Optional. Endpoint to use when calling the OpenAI API.
423
+ */
424
+ endpoint?: string;
425
+ }
426
+
427
+ /**
428
+ * Options for configuring an `OpenAIEmbeddings` to generate embeddings using an Azure OpenAI hosted model.
429
+ */
430
+ export interface AzureOpenAIEmbeddingsOptions extends BaseOpenAIEmbeddingsOptions {
431
+ /**
432
+ * API key to use when making requests to Azure OpenAI.
433
+ */
434
+ azureApiKey: string;
435
+
436
+ /**
437
+ * Deployment endpoint to use.
438
+ */
439
+ azureEndpoint: string;
440
+
441
+ /**
442
+ * Name of the Azure OpenAI deployment (model) to use.
443
+ */
444
+ azureDeployment: string;
445
+
446
+ /**
447
+ * Optional. Version of the API being called. Defaults to `2023-05-15`.
448
+ */
449
+ azureApiVersion?: string;
450
+ }
451
+
452
+ /**
453
+ * A `PromptCompletionModel` for calling OpenAI and Azure OpenAI hosted models.
454
+ * @remarks
455
+ */
456
+ export class OpenAIEmbeddings implements EmbeddingsModel {
457
+ private readonly _httpClient: AxiosInstance;
458
+ private readonly _clientType: ClientType;
459
+
460
+ private readonly UserAgent = 'AlphaWave';
461
+
462
+ public readonly maxTokens;
463
+
464
+ /**
465
+ * Options the client was configured with.
466
+ */
467
+ public readonly options: OSSEmbeddingsOptions|OpenAIEmbeddingsOptions|AzureOpenAIEmbeddingsOptions;
468
+
469
+ /**
470
+ * Creates a new `OpenAIClient` instance.
471
+ * @param options Options for configuring an `OpenAIClient`.
472
+ */
473
+ public constructor(options: OSSEmbeddingsOptions|OpenAIEmbeddingsOptions|AzureOpenAIEmbeddingsOptions) {
474
+ this.maxTokens = options.maxTokens ?? 500;
475
+
476
+ // Check for azure config
477
+ if ((options as AzureOpenAIEmbeddingsOptions).azureApiKey) {
478
+ this._clientType = ClientType.AzureOpenAI;
479
+ this.options = Object.assign({
480
+ retryPolicy: [2000, 5000],
481
+ azureApiVersion: '2023-05-15',
482
+ }, options) as AzureOpenAIEmbeddingsOptions;
483
+
484
+ // Cleanup and validate endpoint
485
+ let endpoint = this.options.azureEndpoint.trim();
486
+ if (endpoint.endsWith('/')) {
487
+ endpoint = endpoint.substring(0, endpoint.length - 1);
488
+ }
489
+
490
+ if (!endpoint.toLowerCase().startsWith('https://')) {
491
+ throw new Error(`Client created with an invalid endpoint of '${endpoint}'. The endpoint must be a valid HTTPS url.`);
492
+ }
493
+
494
+ this.options.azureEndpoint = endpoint;
495
+ } else if ((options as OSSEmbeddingsOptions).ossModel) {
496
+ this._clientType = ClientType.OSS;
497
+ this.options = Object.assign({
498
+ retryPolicy: [2000, 5000]
499
+ }, options) as OSSEmbeddingsOptions;
500
+ } else {
501
+ this._clientType = ClientType.OpenAI;
502
+ this.options = Object.assign({
503
+ retryPolicy: [2000, 5000]
504
+ }, options) as OpenAIEmbeddingsOptions;
505
+ }
506
+
507
+ // Create client
508
+ this._httpClient = axios.create({
509
+ validateStatus: (status) => status < 400 || status == 429
510
+ });
511
+ }
512
+
513
+ /**
514
+ * Creates embeddings for the given inputs using the OpenAI API.
515
+ * @param model Name of the model to use (or deployment for Azure).
516
+ * @param inputs Text inputs to create embeddings for.
517
+ * @returns A `EmbeddingsResponse` with a status and the generated embeddings or a message when an error occurs.
518
+ */
519
+ public async createEmbeddings(inputs: string | string[]): Promise<EmbeddingsResponse> {
520
+ if (this.options.logRequests) {
521
+ console.log(Colorize.title('EMBEDDINGS REQUEST:'));
522
+ console.log(Colorize.output(inputs));
523
+ }
524
+
525
+ const startTime = Date.now();
526
+ const response = await this.createEmbeddingRequest({
527
+ input: inputs,
528
+ });
529
+
530
+ if (this.options.logRequests) {
531
+ console.log(Colorize.title('RESPONSE:'));
532
+ console.log(Colorize.value('status', response.status));
533
+ console.log(Colorize.value('duration', Date.now() - startTime, 'ms'));
534
+ console.log(Colorize.output(response.data));
535
+ }
536
+
537
+
538
+ // Process response
539
+ if (response.status < 300) {
540
+ return { status: 'success', output: response.data.data.sort((a, b) => a.index - b.index).map((item) => item.embedding) };
541
+ } else if (response.status == 429) {
542
+ return { status: 'rate_limited', message: `The embeddings API returned a rate limit error.` }
543
+ } else {
544
+ return { status: 'error', message: `The embeddings API returned an error status of ${response.status}: ${response.statusText}` };
545
+ }
546
+ }
547
+
548
+ /**
549
+ * @private
550
+ */
551
+ protected createEmbeddingRequest(request: CreateEmbeddingRequest): Promise<AxiosResponse<CreateEmbeddingResponse>> {
552
+ if (this.options.dimensions) {
553
+ request.dimensions = this.options.dimensions;
554
+ }
555
+ if (this._clientType == ClientType.AzureOpenAI) {
556
+ const options = this.options as AzureOpenAIEmbeddingsOptions;
557
+ const url = `${options.azureEndpoint}/openai/deployments/${options.azureDeployment}/embeddings?api-version=${options.azureApiVersion!}`;
558
+ return this.post(url, request);
559
+ } else if (this._clientType == ClientType.OSS) {
560
+ const options = this.options as OSSEmbeddingsOptions;
561
+ const url = `${options.ossEndpoint}/v1/embeddings`;
562
+ (request as OpenAICreateEmbeddingRequest).model = options.ossModel;
563
+ return this.post(url, request);
564
+ } else {
565
+ const options = this.options as OpenAIEmbeddingsOptions;
566
+ const url = `${options.endpoint ?? 'https://api.openai.com'}/v1/embeddings`;
567
+ (request as OpenAICreateEmbeddingRequest).model = options.model;
568
+ return this.post(url, request);
569
+ }
570
+ }
571
+
572
+ /**
573
+ * @private
574
+ */
575
+ protected async post<TData>(url: string, body: object, retryCount = 0): Promise<AxiosResponse<TData>> {
576
+ // Initialize request config
577
+ const requestConfig: AxiosRequestConfig = Object.assign({}, this.options.requestConfig);
578
+
579
+ // Initialize request headers
580
+ if (!requestConfig.headers) {
581
+ requestConfig.headers = {};
582
+ }
583
+ if (!requestConfig.headers['Content-Type']) {
584
+ requestConfig.headers['Content-Type'] = 'application/json';
585
+ }
586
+ if (!requestConfig.headers['User-Agent']) {
587
+ requestConfig.headers['User-Agent'] = this.UserAgent;
588
+ }
589
+ if (this._clientType == ClientType.AzureOpenAI) {
590
+ const options = this.options as AzureOpenAIEmbeddingsOptions;
591
+ requestConfig.headers['api-key'] = options.azureApiKey;
592
+ } else if (this._clientType == ClientType.OpenAI) {
593
+ const options = this.options as OpenAIEmbeddingsOptions;
594
+ requestConfig.headers['Authorization'] = `Bearer ${options.apiKey}`;
595
+ if (options.organization) {
596
+ requestConfig.headers['OpenAI-Organization'] = options.organization;
597
+ }
598
+ }
599
+
600
+ // Send request
601
+ const response = await this._httpClient.post(url, body, requestConfig);
602
+
603
+ // Check for rate limit error
604
+ if (response.status == 429 && Array.isArray(this.options.retryPolicy) && retryCount < this.options.retryPolicy.length) {
605
+ const delay = this.options.retryPolicy[retryCount];
606
+ await new Promise((resolve) => setTimeout(resolve, delay));
607
+ return this.post(url, body, retryCount + 1);
608
+ } else {
609
+ return response;
610
+ }
611
+ }
612
+ }
613
+
614
+ enum ClientType {
615
+ OpenAI,
616
+ AzureOpenAI,
617
+ OSS
618
+ }
619
+
620
+ MEMORY:
621
+ SOURCE: src\ItemSelector.ts
622
+ DETAILS: import { MetadataFilter, MetadataTypes } from './types';
623
+
624
+ export class ItemSelector {
625
+ /**
626
+ * Returns the similarity between two vectors using the cosine similarity.
627
+ * @param vector1 Vector 1
628
+ * @param vector2 Vector 2
629
+ * @returns Similarity between the two vectors
630
+ */
631
+ public static cosineSimilarity(vector1: number[], vector2: number[]) {
632
+ // Return the quotient of the dot product and the product of the norms
633
+ return this.dotProduct(vector1, vector2) / (this.normalize(vector1) * this.normalize(vector2));
634
+ }
635
+
636
+ /**
637
+ * Normalizes a vector.
638
+ * @remarks
639
+ * The norm of a vector is the square root of the sum of the squares of the elements.
640
+ * The LocalIndex pre-normalizes all vectors to improve performance.
641
+ * @param vector Vector to normalize
642
+ * @returns Normalized vector
643
+ */
644
+ public static normalize(vector: number[]) {
645
+ // Initialize a variable to store the sum of the squares
646
+ let sum = 0;
647
+ // Loop through the elements of the array
648
+ for (let i = 0; i < vector.length; i++) {
649
+ // Square the element and add it to the sum
650
+ sum += vector[i] * vector[i];
651
+ }
652
+ // Return the square root of the sum
653
+ return Math.sqrt(sum);
654
+ }
655
+
656
+ /**
657
+ * Returns the similarity between two vectors using cosine similarity.
658
+ * @remarks
659
+ * The LocalIndex pre-normalizes all vectors to improve performance.
660
+ * This method uses the pre-calculated norms to improve performance.
661
+ * @param vector1 Vector 1
662
+ * @param norm1 Norm of vector 1
663
+ * @param vector2 Vector 2
664
+ * @param norm2 Norm of vector 2
665
+ * @returns Similarity between the two vectors
666
+ */
667
+ public static normalizedCosineSimilarity(vector1: number[], norm1: number, vector2: number[], norm2: number) {
668
+ // Return the quotient of the dot product and the product of the norms
669
+ return this.dotProduct(vector1, vector2) / (norm1 * norm2);
670
+ }
671
+
672
+ /**
673
+ * Applies a filter to the metadata of an item.
674
+ * @param metadata Metadata of the item
675
+ * @param filter Filter to apply
676
+ * @returns True if the item matches the filter, false otherwise
677
+ */
678
+ public static select(metadata: Record<string, MetadataTypes>, filter: MetadataFilter): boolean {
679
+ if (filter === undefined || filter === null) {
680
+ return true;
681
+ }
682
+
683
+ for (const key in filter) {
684
+ switch (key) {
685
+ case '$and':
686
+ if (!filter[key]!.every((f: MetadataFilter) => this.select(metadata, f))) {
687
+ return false;
688
+ }
689
+ break;
690
+ case '$or':
691
+ if (!filter[key]!.some((f: MetadataFilter) => this.select(metadata, f))) {
692
+ return false;
693
+ }
694
+ break;
695
+ default:
696
+ const value = filter[key];
697
+ if (value === undefined || value === null) {
698
+ return false;
699
+ } else if (typeof value == 'object') {
700
+ if (!this.metadataFilter(metadata[key], value as MetadataFilter)) {
701
+ return false;
702
+ }
703
+ } else {
704
+ if (metadata[key] !== value) {
705
+ return false;
706
+ }
707
+ }
708
+ break;
709
+ }
710
+ }
711
+ return true;
712
+ }
713
+
714
+ private static dotProduct(arr1: number[], arr2: number[]) {
715
+ // Initialize a variable to store the sum of the products
716
+ let sum = 0;
717
+ // Loop through the elements of the arrays
718
+ for (let i = 0; i < arr1.length; i++) {
719
+ // Multiply the corresponding elements and add them to the sum
720
+ sum += arr1[i] * arr2[i];
721
+ }
722
+ // Return the sum
723
+ return sum;
724
+ }
725
+
726
+ private static metadataFilter(value: MetadataTypes, filter: MetadataFilter): boolean {
727
+ if (value === undefined || value === null) {
728
+ return false;
729
+ }
730
+
731
+ for (const key in filter) {
732
+ switch (key) {
733
+ case '$eq':
734
+ if (value !== filter[key]) {
735
+ return false;
736
+ }
737
+ break;
738
+ case '$ne':
739
+ if (value === filter[key]) {
740
+ return false;
741
+ }
742
+ break;
743
+ case '$gt':
744
+ if (typeof value != 'number' || value <= filter[key]!) {
745
+ return false;
746
+ }
747
+ break;
748
+ case '$gte':
749
+ if (typeof value != 'number' || value < filter[key]!) {
750
+ return false;
751
+ }
752
+ break;
753
+ case '$lt':
754
+ if (typeof value != 'number' || value >= filter[key]!) {
755
+ return false;
756
+ }
757
+ break;
758
+ case '$lte':
759
+ if (typeof value != 'number' || value > filter[key]!) {
760
+ return false;
761
+ }
762
+ break;
763
+ case '$in':
764
+ if (typeof value == 'boolean') {
765
+ return false;
766
+ } else if(typeof value == 'string' && !filter[key]!.includes(value)){
767
+ return false
768
+ } else if(!filter[key]!.some(val => typeof val == 'string' && val.includes(value as string))){
769
+ return false
770
+ }
771
+ break;
772
+ case '$nin':
773
+ if (typeof value == 'boolean') {
774
+ return false;
775
+ }
776
+ else if (typeof value == 'string' && filter[key]!.includes(value)) {
777
+ return false;
778
+ }
779
+ else if (filter[key]!.some(val => typeof val == 'string' && val.includes(value as string))) {
780
+ return false;
781
+ }
782
+ break;
783
+ default:
784
+ return value === filter[key];
785
+ }
786
+ }
787
+ return true;
788
+ }
789
+ }
790
+
791
+ MEMORY:
792
+ SOURCE: src\TextSplitter.ts
793
+ DETAILS: import { GPT3Tokenizer } from "./GPT3Tokenizer";
794
+ import { TextChunk, Tokenizer } from "./types";
795
+ const ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
796
+ export interface TextSplitterConfig {
797
+ separators: string[];
798
+ keepSeparators: boolean;
799
+ chunkSize: number;
800
+ chunkOverlap: number;
801
+ tokenizer: Tokenizer;
802
+ docType?: string;
803
+ export class TextSplitter {
804
+ private readonly _config: TextSplitterConfig;
805
+ public constructor(config?: Partial<TextSplitterConfig>) {
806
+ this._config = Object.assign({
807
+ keepSeparators: false,
808
+ chunkSize: 400,
809
+ chunkOverlap: 40,
810
+ } as TextSplitterConfig, config);
811
+ // Create a default tokenizer if none is provided
812
+ if (!this._config.tokenizer) {
813
+ this._config.tokenizer = new GPT3Tokenizer();
814
+ // Use default separators if none are provided
815
+ if (!this._config.separators || this._config.separators.length === 0) {
816
+ this._config.separators = this.getSeparators(this._config.docType);
817
+ // Validate the config settings
818
+ if (this._config.chunkSize < 1) {
819
+ throw new Error("chunkSize must be >= 1");
820
+ } else if (this._config.chunkOverlap < 0) {
821
+ throw new Error("chunkOverlap must be >= 0");
822
+ } else if (this._config.chunkOverlap > this._config.chunkSize) {
823
+ throw new Error("chunkOverlap must be <= chunkSize");
824
+ public split(text: string): TextChunk[] { // splits text into chunks, then attaches overlap tokens from neighbouring chunks
824
+ // Get basic chunks (recursiveSplit creates them with empty startOverlap/endOverlap arrays)
825
+ const chunks = this.recursiveSplit(text, this._config.separators, 0);
826
+ const that = this;
827
+ function getOverlapTokens(tokens?: number[]): number[] { // returns a copy of the first chunkOverlap tokens, or [] when tokens is undefined
828
+ if (tokens != undefined) {
829
+ const len = tokens.length > that._config.chunkOverlap ? that._config.chunkOverlap : tokens.length;
830
+ return tokens.slice(0, len);
831
+ } else {
832
+ return [];
833
+ // Add overlap tokens to the start and end of each chunk: the tail of the previous chunk and the head of the next one
834
+ if (this._config.chunkOverlap > 0) {
835
+ for (let i = 0; i < chunks.length; i++) { // start at 0 so the first chunk also receives its endOverlap
836
+ const previousChunk = i > 0 ? chunks[i - 1] : undefined;
837
+ const chunk = chunks[i];
838
+ const nextChunk = i < chunks.length - 1 ? chunks[i + 1] : undefined;
839
+ chunk.startOverlap = getOverlapTokens(previousChunk?.tokens.slice().reverse()).reverse(); // copy before reversing: Array.prototype.reverse() mutates in place and must not reorder the previous chunk's own tokens
840
+ chunk.endOverlap = getOverlapTokens(nextChunk?.tokens);
841
+ return chunks;
843
+ private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
844
+ const chunks: TextChunk[] = [];
845
+ if (text.length > 0) {
846
+ // Split text into parts
847
+ let parts: string[];
848
+ let separator = '';
849
+ const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
850
+ if (separators.length > 0) {
851
+ // Split by separator
852
+ separator = separators[0];
853
+ parts = separator == ' ' ? this.splitBySpaces(text) : text.split(separator);
854
+ } else {
855
+ // Cut text in half
856
+ const half = Math.floor(text.length / 2);
857
+ parts = [text.substring(0, half), text.substring(half)];
858
+ // Iterate over parts
859
+ for (let i = 0; i < parts.length; i++) {
860
+ const lastChunk = (i === parts.length - 1);
861
+ // Get chunk text and endPos
862
+ let chunk = parts[i];
863
+ const endPos = (startPos + (chunk.length - 1)) + (lastChunk ? 0 : separator.length);
864
+ if (this._config.keepSeparators && !lastChunk) {
865
+ chunk += separator;
866
+ // Ensure chunk contains text
867
+ if (!this.containsAlphanumeric(chunk)) {
868
+ continue;
869
+ // Optimization to avoid encoding really large chunks
870
+ if (chunk.length / 6 > this._config.chunkSize) {
871
+ // Break the text into smaller chunks
872
+ const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
873
+ chunks.push(...subChunks);
874
+ } else {
875
+ // Encode chunk text
876
+ const tokens = this._config.tokenizer.encode(chunk);
877
+ if (tokens.length > this._config.chunkSize) {
878
+ // Break the text into smaller chunks
879
+ const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
880
+ chunks.push(...subChunks);
881
+ } else {
882
+ // Append chunk to output
883
+ chunks.push({
884
+ text: chunk,
885
+ tokens: tokens,
886
+ startPos: startPos,
887
+ endPos: endPos,
888
+ startOverlap: [],
889
+ endOverlap: [],
890
+ });
891
+ // Update startPos
892
+ startPos = endPos + 1;
893
+ return this.combineChunks(chunks);
894
+ private combineChunks(chunks: TextChunk[]): TextChunk[] {
895
+ const combinedChunks: TextChunk[] = [];
896
+ let currentChunk: TextChunk|undefined;
897
+ let currentLength = 0;
898
+ const separator = this._config.keepSeparators ? '' : ' ';
899
+ for (let i = 0; i < chunks.length; i++) {
900
+ const chunk = chunks[i];
901
+ if (currentChunk) {
902
+ const length = currentChunk.tokens.length + chunk.tokens.length;
903
+ if (length > this._config.chunkSize) {
904
+ combinedChunks.push(currentChunk);
905
+ currentChunk = chunk;
906
+ currentLength = chunk.tokens.length;
907
+ } else {
908
+ currentChunk.text += separator + chunk.text;
909
+ currentChunk.endPos = chunk.endPos;
910
+ currentChunk.tokens.push(...chunk.tokens);
911
+ currentLength += chunk.tokens.length;
912
+ }
913
+ } else {
914
+ currentChunk = chunk;
915
+ currentLength = chunk.tokens.length;
916
+ if (currentChunk) {
917
+ combinedChunks.push(currentChunk);
918
+ return combinedChunks;
919
+ private containsAlphanumeric(text: string): boolean { // true when text holds at least one letter or digit, in any script
920
+ const letterOrDigit = /[\p{L}\p{N}]/u; // Unicode-aware so non-Latin text (e.g. Cyrillic, CJK) still counts as content
921
+ if (letterOrDigit.test(text)) {
922
+ return true;
923
+ }
924
+ // No letter or digit anywhere: the text is only whitespace, punctuation or symbols
925
+ return false;
926
+ }
927
+ private splitBySpaces(text: string): string[] {
928
+ // Split text by tokens and return parts
929
+ const parts: string[] = [];
930
+ let tokens = this._config.tokenizer.encode(text);
931
+ do {
932
+ if (tokens.length <= this._config.chunkSize) {
933
+ parts.push(this._config.tokenizer.decode(tokens));
934
+ break;
935
+ } else {
936
+ const span = tokens.splice(0, this._config.chunkSize);
937
+ parts.push(this._config.tokenizer.decode(span));
938
+ } while (true);
939
+ return parts;
940
+ }
941
+ private getSeparators(docType?: string): string[] {
942
+ switch (docType ?? '') {
943
+ case "cpp":
944
+ return [
945
+ // Split along class definitions
946
+ "\nclass ",
947
+ // Split along function definitions
948
+ "\nvoid ",
949
+ "\nint ",
950
+ "\nfloat ",
951
+ "\ndouble ",
952
+ // Split along control flow statements
953
+ "\nif ",
954
+ "\nfor ",
955
+ "\nwhile ",
956
+ "\nswitch ",
957
+ "\ncase ",
958
+ // Split by the normal type of lines
959
+ "\n\n",
960
+ "\n",
961
+ ];
962
+ case "go":
963
+ return [
964
+ // Split along function definitions
965
+ "\nfunc ",
966
+ "\nvar ",
967
+ "\nconst ",
968
+ "\ntype ",
969
+ // Split along control flow statements
970
+ "\nif ",
971
+ "\nfor ",
972
+ "\nswitch ",
973
+ "\ncase ",
974
+ // Split by the normal type of lines
975
+ "\n\n",
976
+ "\n",
977
+ ];
978
+ case "java":
979
+ case "c#":
980
+ case "csharp":
981
+ case "cs":
982
+ case "ts":
983
+ case "tsx":
984
+ case "typescript":
985
+ return [
986
+ // split along regions
987
+ "// LLM-REGION",
988
+ "/* LLM-REGION",
989
+ "/** LLM-REGION",
990
+ // Split along class definitions
991
+ "\nclass ",
992
+ // Split along method definitions
993
+ "\npublic ",
994
+ "\nprotected ",
995
+ "\nprivate ",
996
+ "\nstatic ",
997
+ // Split along control flow statements
998
+ "\nif ",
999
+ "\nfor ",
1000
+ "\nwhile ",
1001
+ "\nswitch ",
1002
+ "\ncase ",
1003
+ // Split by the normal type of lines
1004
+ "\n\n",
1005
+ "\n",
1006
+ " "
1007
+ ];
1008
+ case "js":
1009
+ case "jsx":
1010
+ case "javascript":
1011
+ return [
1012
+ // split along regions
1013
+ "// LLM-REGION",
1014
+ "/* LLM-REGION",
1015
+ "/** LLM-REGION",
1016
+ // Split along class definitions
1017
+ "\nclass ",
1018
+ // Split along function definitions
1019
+ "\nfunction ",
1020
+ "\nconst ",
1021
+ "\nlet ",
1022
+ "\nvar ",
1023
+ "\nclass ",
1024
+ // Split along control flow statements
1025
+ "\nif ",
1026
+ "\nfor ",
1027
+ "\nwhile ",
1028
+ "\nswitch ",
1029
+ "\ncase ",
1030
+ "\ndefault ",
1031
+ // Split by the normal type of lines
1032
+ "\n\n",
1033
+ "\n",
1034
+ ];
1035
+ case "php":
1036
+ return [
1037
+ // Split along function definitions
1038
+ "\nfunction ",
1039
+ // Split along class definitions
1040
+ "\nclass ",
1041
+ // Split along control flow statements
1042
+ "\nif ",
1043
+ "\nforeach ",
1044
+ "\nwhile ",
1045
+ "\ndo ",
1046
+ "\nswitch ",
1047
+ "\ncase ",
1048
+ // Split by the normal type of lines
1049
+ "\n\n",
1050
+ "\n",
1051
+ ];
1052
+ case "proto":
1053
+ return [
1054
+ // Split along message definitions
1055
+ "\nmessage ",
1056
+ // Split along service definitions
1057
+ "\nservice ",
1058
+ // Split along enum definitions
1059
+ "\nenum ",
1060
+ // Split along option definitions
1061
+ "\noption ",
1062
+ // Split along import statements
1063
+ "\nimport ",
1064
+ // Split along syntax declarations
1065
+ "\nsyntax ",
1066
+ // Split by the normal type of lines
1067
+ "\n\n",
1068
+ "\n",
1069
+ ];
1070
+ case "python":
1071
+ case "py":
1072
+ return [
1073
+ // First, try to split along class definitions
1074
+ "\nclass ",
1075
+ "\ndef ",
1076
+ "\n\tdef ",
1077
+ // Now split by the normal type of lines
1078
+ "\n\n",
1079
+ "\n",
1080
+ ];
1081
+ case "rst":
1082
+ return [
1083
+ // Split along section titles
1084
+ "\n===\n",
1085
+ "\n---\n",
1086
+ "\n***\n",
1087
+ // Split along directive markers
1088
+ "\n.. ",
1089
+ // Split by the normal type of lines
1090
+ "\n\n",
1091
+ "\n",
1092
+ ];
1093
+ case "ruby":
1094
+ return [
1095
+ // Split along method definitions
1096
+ "\ndef ",
1097
+ "\nclass ",
1098
+ // Split along control flow statements
1099
+ "\nif ",
1100
+ "\nunless ",
1101
+ "\nwhile ",
1102
+ "\nfor ",
1103
+ "\ndo ",
1104
+ "\nbegin ",
1105
+ "\nrescue ",
1106
+ // Split by the normal type of lines
1107
+ "\n\n",
1108
+ "\n",
1109
+ ];
1110
+ case "rust":
1111
+ return [
1112
+ // Split along function definitions
1113
+ "\nfn ",
1114
+ "\nconst ",
1115
+ "\nlet ",
1116
+ // Split along control flow statements
1117
+ "\nif ",
1118
+ "\nwhile ",
1119
+ "\nfor ",
1120
+ "\nloop ",
1121
+ "\nmatch ",
1122
+ "\nconst ",
1123
+ // Split by the normal type of lines
1124
+ "\n\n",
1125
+ "\n",
1126
+ ];
1127
+ case "scala":
1128
+ return [
1129
+ // Split along class definitions
1130
+ "\nclass ",
1131
+ "\nobject ",
1132
+ // Split along method definitions
1133
+ "\ndef ",
1134
+ "\nval ",
1135
+ "\nvar ",
1136
+ // Split along control flow statements
1137
+ "\nif ",
1138
+ "\nfor ",
1139
+ "\nwhile ",
1140
+ "\nmatch ",
1141
+ "\ncase ",
1142
+ // Split by the normal type of lines
1143
+ "\n\n",
1144
+ "\n",
1145
+ ];
1146
+ case "swift":
1147
+ return [
1148
+ // Split along function definitions
1149
+ "\nfunc ",
1150
+ // Split along class definitions
1151
+ "\nclass ",
1152
+ "\nstruct ",
1153
+ "\nenum ",
1154
+ // Split along control flow statements
1155
+ "\nif ",
1156
+ "\nfor ",
1157
+ "\nwhile ",
1158
+ "\ndo ",
1159
+ "\nswitch ",
1160
+ "\ncase ",
1161
+ // Split by the normal type of lines
1162
+ "\n\n",
1163
+ "\n",
1164
+ ];
1165
+ case "md":
1166
+ case "markdown":
1167
+ return [
1168
+ // First, try to split along Markdown headings (starting with level 2)
1169
+ "\n## ",
1170
+ "\n### ",
1171
+ "\n#### ",
1172
+ "\n##### ",
1173
+ "\n###### ",
1174
+ // Note the alternative syntax for headings (below) is not handled here
1175
+ // Heading level 2
1176
+ // End of code block
1177
+ "```\n\n",
1178
+ // Horizontal lines
1179
+ "\n\n***\n\n",
1180
+ "\n\n---\n\n",
1181
+ "\n\n___\n\n",
1182
+ // Note that this splitter doesn't handle horizontal lines defined
1183
+ // by *three or more* of ***, ---, or ___, but this is not handled
1184
+ // Github tables
1185
+ "<table>",
1186
+ // "<tr>",
1187
+ // "<td>",
1188
+ // "<td ",
1189
+ "\n\n",
1190
+ "\n",
1191
+ ];
1192
+ case "latex":
1193
+ return [
1194
+ // First, try to split along Latex sections
1195
+ "\n\\chapter{",
1196
+ "\n\\section{",
1197
+ "\n\\subsection{",
1198
+ "\n\\subsubsection{",
1199
+ // Now split by environments
1200
+ "\n\\begin{enumerate}",
1201
+ "\n\\begin{itemize}",
1202
+ "\n\\begin{description}",
1203
+ "\n\\begin{list}",
1204
+ "\n\\begin{quote}",
1205
+ "\n\\begin{quotation}",
1206
+ "\n\\begin{verse}",
1207
+ "\n\\begin{verbatim}",
1208
+ // Now split by math environments
1209
+ "\n\\begin{align}",
1210
+ // Now split by the normal type of lines
1211
+ "\n\n",
1212
+ "\n",
1213
+ ];
1214
+ case "html":
1215
+ return [
1216
+ // First, try to split along HTML tags
1217
+ "<body>",
1218
+ "<div>",
1219
+ "<p>",
1220
+ "<br>",
1221
+ "<li>",
1222
+ "<h1>",
1223
+ "<h2>",
1224
+ "<h3>",
1225
+ "<h4>",
1226
+ "<h5>",
1227
+ "<h6>",
1228
+ "<span>",
1229
+ "<table>",
1230
+ "<tr>",
1231
+ "<td>",
1232
+ "<th>",
1233
+ "<ul>",
1234
+ "<ol>",
1235
+ "<header>",
1236
+ "<footer>",
1237
+ "<nav>",
1238
+ // Head
1239
+ "<head>",
1240
+ "<style>",
1241
+ "<script>",
1242
+ "<meta>",
1243
+ "<title>",
1244
+ // Normal type of lines
1245
+ "\n\n",
1246
+ "\n",
1247
+ ];
1248
+ case "sol":
1249
+ return [
1250
+ // Split along compiler informations definitions
1251
+ "\npragma ",
1252
+ "\nusing ",
1253
+ // Split along contract definitions
1254
+ "\ncontract ",
1255
+ "\ninterface ",
1256
+ "\nlibrary ",
1257
+ // Split along method definitions
1258
+ "\nconstructor ",
1259
+ "\ntype ",
1260
+ "\nfunction ",
1261
+ "\nevent ",
1262
+ "\nmodifier ",
1263
+ "\nerror ",
1264
+ "\nstruct ",
1265
+ "\nenum ",
1266
+ // Split along control flow statements
1267
+ "\nif ",
1268
+ "\nfor ",
1269
+ "\nwhile ",
1270
+ "\ndo while ",
1271
+ "\nassembly ",
1272
+ // Split by the normal type of lines
1273
+ "\n\n",
1274
+ "\n",
1275
+ ];
1276
+ default:
1277
+ return [
1278
+ // Split by the normal type of lines
1279
+ "\n\n",
1280
+ "\n",
1281
+ ];
1282
+ }
1283
+ }
1284
+ }
1285
+
1286
+ MEMORY:
1287
+ SOURCE: src\LocalDocument.ts
1288
+ DETAILS: import * as fs from 'fs/promises';
1289
+ import * as path from 'path';
1290
+ import { MetadataTypes } from './types';
1291
+ import { LocalDocumentIndex } from './LocalDocumentIndex';
1292
+
1293
+ /**
1294
+ * Represents an indexed document stored on disk.
1295
+ */
1296
+ export class LocalDocument {
1297
+ private readonly _index: LocalDocumentIndex;
1298
+ private readonly _id: string;
1299
+ private readonly _uri: string;
1300
+ private _metadata: Record<string,MetadataTypes>|undefined; // cache filled on the first successful loadMetadata() call
1301
+ private _text: string|undefined; // cache filled on the first successful loadText() call
1302
+
1303
+ /**
1304
+ * Creates a new `LocalDocument` instance.
1305
+ * @param index Parent index that contains the document.
1306
+ * @param id ID of the document.
1307
+ * @param uri URI of the document.
1308
+ */
1309
+ public constructor(index: LocalDocumentIndex, id: string, uri: string) {
1310
+ this._index = index;
1311
+ this._id = id;
1312
+ this._uri = uri;
1313
+ }
1314
+
1315
+ /**
1316
+ * Returns the folder path where the document is stored (the parent index's `folderPath`).
1317
+ */
1318
+ public get folderPath(): string {
1319
+ return this._index.folderPath;
1320
+ }
1321
+
1322
+ /**
1323
+ * Returns the ID of the document.
1324
+ */
1325
+ public get id(): string {
1326
+ return this._id;
1327
+ }
1328
+
1329
+ /**
1330
+ * Returns the URI of the document.
1331
+ */
1332
+ public get uri(): string {
1333
+ return this._uri;
1334
+ }
1335
+
1336
+ /**
1337
+ * Returns the length of the document in tokens.
1338
+ * @remarks
1339
+ * This value will be estimated for documents longer than 40,000 characters.
1340
+ * @returns Length of the document in tokens.
1341
+ */
1342
+ public async getLength(): Promise<number> {
1343
+ const text = await this.loadText();
1344
+ if (text.length <= 40000) {
1345
+ return this._index.tokenizer.encode(text).length; // exact count from the index's tokenizer
1346
+ } else {
1347
+ return Math.ceil(text.length / 4); // estimate only: one token per 4 characters, rounded up
1348
+ }
1349
+ }
1350
+
1351
+ /**
1352
+ * Determines if the document has additional metadata stored on disk (a `<id>.json` file in the folder).
1353
+ * @returns True if the document has metadata; otherwise, false.
1354
+ */
1355
+ public async hasMetadata(): Promise<boolean> {
1356
+ try {
1357
+ await fs.access(path.join(this.folderPath, `${this.id}.json`));
1358
+ return true;
1359
+ } catch (err: unknown) {
1360
+ return false; // any access failure, not just a missing file, is reported as "no metadata"
1361
+ }
1362
+ }
1363
+
1364
+ /**
1365
+ * Loads the metadata for the document from disk; the parsed result is cached for later calls.
1366
+ * @returns Metadata for the document. Throws an `Error` if the `<id>.json` file cannot be read or parsed.
1367
+ */
1368
+ public async loadMetadata(): Promise<Record<string,MetadataTypes>> {
1369
+ if (this._metadata == undefined) {
1370
+ let json: string;
1371
+ try {
1372
+ json = (await fs.readFile(path.join(this.folderPath, `${this.id}.json`))).toString();
1373
+ } catch (err: unknown) {
1374
+ throw new Error(`Error reading metadata for document "${this.uri}": ${(err as any).toString()}`);
1375
+ }
1376
+
1377
+ try {
1378
+ this._metadata = JSON.parse(json); // parsed value is trusted as-is; its shape is not validated
1379
+ } catch (err: unknown) {
1380
+ throw new Error(`Error parsing metadata for document "${this.uri}": ${(err as any).toString()}`);
1381
+ }
1382
+ }
1383
+
1384
+ return this._metadata!;
1385
+ }
1386
+
1387
+ /**
1388
+ * Loads the text for the document from disk; the text is cached for later calls.
1389
+ * @returns Text for the document. Throws an `Error` if the `<id>.txt` file cannot be read.
1390
+ */
1391
+ public async loadText(): Promise<string> {
1392
+ if (this._text == undefined) {
1393
+ try {
1394
+ this._text = (await fs.readFile(path.join(this.folderPath, `${this.id}.txt`))).toString();
1395
+ } catch (err: unknown) {
1396
+ throw new Error(`Error reading text file for document "${this.uri}": ${(err as any).toString()}`);
1397
+ }
1398
+ }
1399
+
1400
+ return this._text;
1401
+ }
1402
+ }
1403
+
1404
+ MEMORY:
1405
+ SOURCE: src\LocalDocumentResult.ts
1406
+ DETAILS: import { LocalDocument } from "./LocalDocument";
1407
+ import { LocalDocumentIndex } from "./LocalDocumentIndex";
1408
+ import { QueryResult, DocumentChunkMetadata, Tokenizer, DocumentTextSection } from "./types";
1409
+ /**
1410
+ * Represents a search result for a document stored on disk.
1411
+ */
1412
+ export class LocalDocumentResult extends LocalDocument {
1413
+ private readonly _chunks: QueryResult<DocumentChunkMetadata>[];
1414
+ private readonly _tokenizer: Tokenizer;
1415
+ private readonly _score: number;
1416
+
1417
+ /**
1418
+ * @private
1419
+ * Internal constructor for `LocalDocumentResult` instances.
1420
+ */
1421
+ public constructor(index: LocalDocumentIndex, id: string, uri: string, chunks: QueryResult<DocumentChunkMetadata>[], tokenizer: Tokenizer) {
1421
+ super(index, id, uri);
1422
+ this._chunks = chunks; // kept by reference, not copied
1423
+ this._tokenizer = tokenizer;
1424
+ // Compute average score (0 when there are no chunks, instead of the NaN that 0 / 0 would produce)
1425
+ let score = 0;
1426
+ this._chunks.forEach(chunk => score += chunk.score);
1427
+ this._score = this._chunks.length > 0 ? score / this._chunks.length : 0;
1428
+ }
1430
+
1431
+ /**
1432
+ * Returns the chunks of the document that matched the query.
1433
+ */
1434
+ public get chunks(): QueryResult<DocumentChunkMetadata>[] {
1434
+ return this._chunks; // the stored array itself, not a copy
1435
+ }
1437
+
1438
+ /**
1439
+ * Returns the average score of the document result.
1440
+ */
1441
+ public get score(): number {
1441
+ return this._score; // mean of the chunk scores, computed once in the constructor
1442
+ }
1444
+
1445
+ /**
1446
+ * Renders all of the results chunks as spans of text (sections.)
1447
+ * @remarks
1448
+ * The returned sections will be sorted by document order and limited to maxTokens in length.
1449
+ * @param maxTokens Maximum number of tokens per section.
1450
+ * @returns Array of rendered text sections.
1451
+ */
1452
+ public async renderAllSections(maxTokens: number): Promise<DocumentTextSection[]> {
1453
+ // Load text from disk
1454
+ const text = await this.loadText();
1455
+ // Add chunks to a temp array and split any chunks that are longer than maxTokens.
1456
+ const chunks: SectionChunk[] = [];
1457
+ for (let i = 0; i < this._chunks.length; i++) {
1458
+ const chunk = this._chunks[i];
1459
+ const startPos = chunk.item.metadata.startPos;
1460
+ const endPos = chunk.item.metadata.endPos;
1461
+ const chunkText = text.substring(startPos, endPos + 1); // endPos is inclusive, hence the + 1
1462
+ const tokens = this._tokenizer.encode(chunkText);
1463
+ let offset = 0; // index into tokens, not into text
1464
+ while (offset < tokens.length) { // NOTE(review): never advances if maxTokens is 0 (chunkLength would be 0) - confirm callers pass >= 1
1465
+ const chunkLength = Math.min(maxTokens, tokens.length - offset);
1466
+ chunks.push({
1467
+ text: this._tokenizer.decode(tokens.slice(offset, offset + chunkLength)),
1468
+ startPos: startPos + offset, // NOTE(review): adds a token offset to a character position; only used as a sort key below - confirm intended
1469
+ endPos: startPos + offset + chunkLength - 1,
1470
+ score: chunk.score, // every piece inherits the score of the chunk it was cut from
1471
+ tokenCount: chunkLength,
1472
+ isBm25: false
1473
+ });
1474
+ offset += chunkLength;
1475
+ }
1476
+ }
1477
+ // Sort chunks by startPos
1478
+ const sorted = chunks.sort((a, b) => a.startPos - b.startPos); // sorts the local array in place; sorted and chunks are the same array
1479
+ // Generate sections
1480
+ const sections: Section[] = [];
1481
+ for (let i = 0; i < sorted.length; i++) {
1482
+ const chunk = sorted[i];
1483
+ let section = sections[sections.length - 1];
1484
+ if (!section || section.tokenCount + chunk.tokenCount > maxTokens) { // open a new section when none exists or this chunk would exceed the budget
1485
+ section = {
1486
+ chunks: [],
1487
+ score: 0,
1488
+ tokenCount: 0
1489
+ };
1490
+ sections.push(section);
1491
+ }
1492
+ section.chunks.push(chunk);
1493
+ section.score += chunk.score; // running sum; turned into an average below
1494
+ section.tokenCount += chunk.tokenCount;
1495
+ }
1496
+ // Normalize section scores
1497
+ sections.forEach(section => section.score /= section.chunks.length);
1498
+ // Return final rendered sections
1499
+ return sections.map(section => {
1500
+ let text = '';
1501
+ section.chunks.forEach(chunk => text += chunk.text); // pieces are concatenated with no separator
1502
+ return {
1503
+ text: text,
1504
+ tokenCount: section.tokenCount,
1505
+ score: section.score,
1506
+ isBm25: false,
1507
+ };
1508
+ });
1509
+ }
1510
+
1511
+ /**
1512
+ * Renders the top spans of text (sections) of the document based on the query result.
1513
+ * @remarks
1514
+ * The returned sections will be sorted by relevance and limited to the top `maxSections`.
1515
+ * @param maxTokens Maximum number of tokens per section.
1516
+ * @param maxSections Maximum number of sections to return.
1517
+ * @param overlappingChunks Optional. If true, overlapping chunks of text will be added to each section until the maxTokens is reached.
1518
+ * @returns Array of rendered text sections.
1519
+ */
1520
+ public async renderSections(maxTokens: number, maxSections: number, overlappingChunks = true): Promise<DocumentTextSection[]> {
1521
+ // Load text from disk
1522
+ const text = await this.loadText();
1523
+ // First check to see if the entire document is shorter than maxTokens
1524
+ const length = await this.getLength();
1525
+ if (length <= maxTokens) {
1526
+ return [{
1527
+ text,
1528
+ tokenCount: length,
1529
+ score: 1.0,
1530
+ isBm25: false,
1531
+ }];
1532
+ }
1533
+ // Otherwise, we need to split the document into sections
1534
+ // - Add each chunk to a temp array and filter out any chunk that's longer then maxTokens.
1535
+ // - Sort the array by startPos to arrange chunks in document order.
1536
+ // - Generate a new array of sections by combining chunks until the maxTokens is reached for each section.
1537
+ // - Generate an aggregate score for each section by averaging the score of each chunk in the section.
1538
+ // - Sort the sections by score and limit to maxSections.
1539
+ // - For each remaining section combine adjacent chunks of text.
1540
+ // - Dynamically add overlapping chunks of text to each section until the maxTokens is reached.
1541
+ const chunks: SectionChunk[] = this._chunks.map(chunk => {
1542
+ const startPos = chunk.item.metadata.startPos;
1543
+ const endPos = chunk.item.metadata.endPos;
1544
+ const chunkText = text.substring(startPos, endPos + 1);
1545
+ return {
1546
+ text: chunkText,
1547
+ startPos,
1548
+ endPos,
1549
+ score: chunk.score,
1550
+ tokenCount: this._tokenizer.encode(chunkText).length,
1551
+ isBm25: Boolean(chunk.item.metadata.isBm25),
1552
+ };
1553
+ }).filter(chunk => chunk.tokenCount <= maxTokens).sort((a, b) => a.startPos - b.startPos);
1554
+ // Check for no chunks
1555
+ if (chunks.length === 0) {
1556
+ // Take the top chunk and return a subset of its text
1557
+ const topChunk = this._chunks[0];
1558
+ const startPos = topChunk.item.metadata.startPos;
1559
+ const endPos = topChunk.item.metadata.endPos;
1560
+ const chunkText = text.substring(startPos, endPos + 1);
1561
+ const tokens = this._tokenizer.encode(chunkText);
1562
+ return [{
1563
+ text: this._tokenizer.decode(tokens.slice(0, maxTokens)),
1564
+ tokenCount: maxTokens,
1565
+ score: topChunk.score,
1566
+ isBm25: false,
1567
+ }];
1568
+ }
1569
+ // Generate semantic sections
1570
+ const sections: Section[] = [];
1571
+ for (let i = 0; i < chunks.length; i++) {
1572
+ const chunk = chunks[i];
1573
+ let section = sections[sections.length - 1];
1574
+ if (!chunk.isBm25) {
1575
+ if (!section || section.tokenCount + chunk.tokenCount > maxTokens) {
1576
+ section = {
1577
+ chunks: [],
1578
+ score: 0,
1579
+ tokenCount: 0
1580
+ };
1581
+ sections.push(section);
1582
+ }
1583
+ section.chunks.push(chunk);
1584
+ section.score += chunk.score;
1585
+ section.tokenCount += chunk.tokenCount;
1586
+ }
1587
+ }
1588
+ // Generate bm25 sections
1589
+ const bm25Sections: Section[] = [];
1590
+ for (let i = 0; i < chunks.length; i++) {
1591
+ const chunk = chunks[i];
1592
+ let section = bm25Sections[bm25Sections.length -
1593
+
1594
+ MEMORY:
1595
+ SOURCE: README.md
1596
+ DETAILS: # Vectra
1597
+
1598
+ Vectra is a local vector database for Node.js with features similar to [Pinecone](https://www.pinecone.io/) or [Qdrant](https://qdrant.tech/) but built using local files. Each Vectra index is a folder on disk. There's an `index.json` file in the folder that contains all the vectors for the index along with any indexed metadata. When you create an index you can specify which metadata properties to index and only those fields will be stored in the `index.json` file. All of the other metadata for an item will be stored on disk in a separate file keyed by a GUID.
1599
+
1600
+ When querying Vectra you'll be able to use the same subset of [Mongo DB query operators](https://www.mongodb.com/docs/manual/reference/operator/query/) that Pinecone supports and the results will be returned sorted by similarity. Every item in the index will first be filtered by metadata and then ranked for similarity. Even though every item is evaluated, it's all in memory so it should be nearly instantaneous. Likely 1ms - 2ms for even a rather large index. Smaller indexes should be <1ms.
1601
+
1602
+ Keep in mind that your entire Vectra index is loaded into memory so it's not well suited for scenarios like long term chat bot memory. Use a real vector DB for that. Vectra is intended to be used in scenarios where you have a small corpus of mostly static data that you'd like to include in your prompt. Infinite few shot examples would be a great use case for Vectra or even just a single document you want to ask questions over.
1603
+
1604
+ Pinecone style namespaces aren't directly supported but you could easily mimic them by creating a separate Vectra index (and folder) for each namespace.
1605
+
1606
+ ## Other Language Bindings
1607
+
1608
+ This repo contains the TypeScript/JavaScript binding for Vectra but other language bindings are being created. Since Vectra is file based, any language binding can be used to read or write a Vectra index. That means you can build a Vectra index using JS and then read it using Python.
1609
+
1610
+ - [vectra-py](https://github.com/BMS-geodev/vectra-py) - Python version of Vectra.
1611
+
1612
+ ## Installation
1613
+
1614
+ ```
1615
+ $ npm install vectra
1616
+ ```
1617
+
1618
+ ## Usage
1619
+
1620
+ First create an instance of `LocalIndex` with the path to the folder where you want your items stored:
1621
+
1622
+ ```typescript
1623
+ import { LocalIndex } from 'vectra';
1624
+
1625
+ const index = new LocalIndex(path.join(__dirname, '..', 'index'));
1626
+ ```
1627
+
1628
+ Next, from inside an async function, create your index:
1629
+
1630
+ ```typescript
1631
+ if (!(await index.isIndexCreated())) {
1632
+ await index.createIndex();
1633
+ }
1634
+ ```
1635
+
1636
+ Add some items to your index:
1637
+
1638
+ ```typescript
1639
+ import { OpenAI } from 'openai';
1640
+
1641
+ const openai = new OpenAI({
1642
+ apiKey: `<YOUR_KEY>`,
1643
+ });
1644
+
1645
+ async function getVector(text: string) {
1646
+ const response = await openai.embeddings.create({
1647
+ 'model': 'text-embedding-ada-002',
1648
+ 'input': text,
1649
+ });
1650
+ return response.data[0].embedding;
1651
+ }
1652
+
1653
+ async function addItem(text: string) {
1654
+ await index.insertItem({
1655
+ vector: await getVector(text),
1656
+ metadata: { text },
1657
+ });
1658
+ }
1659
+
1660
+ // Add items
1661
+ await addItem('apple');
1662
+ await addItem('oranges');
1663
+ await addItem('red');
1664
+ await addItem('blue');
1665
+ ```
1666
+
1667
+ Then query for items:
1668
+
1669
+ ```typescript
1670
+ async function query(text: string) {
1671
+ const vector = await getVector(text);
1672
+ const results = await index.queryItems(vector, 3);
1673
+ if (results.length > 0) {
1674
+ for (const result of results) {
1675
+ console.log(`[${result.score}] ${result.item.metadata.text}`);
1676
+ }
1677
+ } else {
1678
+ console.log(`No results found.`);
1679
+ }
1680
+ }
1681
+
1682
+ await query('green');
1683
+ /*
1684
+ [0.9036569942401076] blue
1685
+ [0.8758153664568566] red
1686
+ [0.8323828606103998] apple
1687
+ */
1688
+
1689
+ await query('banana');
1690
+ /*
1691
+ [0.9033128691220631] apple
1692
+ [0.8493374123092652] oranges
1693
+ [0.8415324469533297] blue
1694
+ */
1695
+ ```
1696
+
1697
+
1698
+ MEMORY:
1699
+ SOURCE: package.json
1700
+ DETAILS: {
1701
+ "name": "vectra",
1702
+ "author": "Steven Ickman",
1703
+ "description": "A vector database that uses the local file system for storage.",
1704
+ "version": "0.12.2",
1705
+ "license": "MIT",
1706
+ "keywords": [
1707
+ "gpt"
1708
+ ],
1709
+ "bugs": {
1710
+ "url": "https://github.com/Stevenic/vectra/issues"
1711
+ },
1712
+ "repository": {
1713
+ "type": "git",
1714
+ "url": "https://github.com/Stevenic/vectra.git"
1715
+ },
1716
+ "main": "./lib/index.js",
1717
+ "types": "./lib/index.d.ts",
1718
+ "bin": {
1719
+ "vectra": "./bin/vectra.js"
1720
+ },
1721
+ "engines": {
1722
+ "node": ">=20.x"
1723
+ },
1724
+ "typesVersions": {
1725
+ "<3.9": {
1726
+ "*": [
1727
+ "_ts3.4/*"
1728
+ ]
1729
+ }
1730
+ },
1731
+ "dependencies": {
1732
+ "axios": "^1.9.0",
1733
+ "cheerio": "^1.0.0",
1734
+ "dotenv": "^16.5.0",
1735
+ "gpt-tokenizer": "^3.4.0",
1736
+ "json-colorizer": "^3.0.1",
1737
+ "openai": "^4.97.0",
1738
+ "turndown": "^7.2.0",
1739
+ "uuid": "^11.1.0",
1740
+ "wink-bm25-text-search": "^3.1.2",
1741
+ "wink-nlp": "^2.3.2",
1742
+ "yargs": "^17.7.2"
1743
+ },
1744
+ "resolutions": {},
1745
+ "devDependencies": {
1746
+ "@types/assert": "^1.5.11",
1747
+ "@types/mocha": "^10.0.10",
1748
+ "@types/node": "^22.15.11",
1749
+ "@types/sinon": "^21.0.0",
1750
+ "@types/turndown": "^5.0.5",
1751
+ "@types/uuid": "10.0.0",
1752
+ "@types/yargs": "17.0.33",
1753
+ "mocha": "11.2.2",
1754
+ "npm-run-all": "^4.1.5",
1755
+ "nyc": "^17.1.0",
1756
+ "rimraf": "^5.0.1",
1757
+ "shx": "^0.4.0",
1758
+ "sinon": "^21.0.1",
1759
+ "ts-mocha": "11.1.0",
1760
+ "ts-node": "^10.9.1",
1761
+ "typescript": "^5.8.3",
1762
+ "wink-bm25-text-search": "^3.1.2"
1763
+ },
1764
+ "scripts": {
1765
+ "build": "tsc -b",
1766
+ "build-docs": "typedoc --theme markdown --entryPoint botbuilder-m365 --excludePrivate --includeDeclarations --ignoreCompilerErrors --module amd --out ..\\..\\doc\\botbuilder-ai .\\lib\\index.d.ts --hideGenerator --name \"Bot Builder SDK - AI\" --readme none",
1767
+ "build:rollup": "yarn clean && yarn build && api-extractor run --verbose --local",
1768
+ "clean": "rimraf _ts3.4 lib tsconfig.tsbuildinfo node_modules",
1769
+ "depcheck": "depcheck --config ../../.depcheckrc",
1770
+ "lint": "eslint **/src/**/*.{j,t}s{,x} --fix --no-error-on-unmatched-pattern",
1771
+ "test": "npm-run-all build test:mocha",
1772
+ "test:mocha": "nyc ts-mocha src/**/*.spec.ts --timeout 10000",
1773
+ "test:compat": "api-extractor run --verbose"
1774
+ },
1775
+ "files": [
1776
+ "_ts3.4",
1777
+ "lib",
1778
+ "src"
1779
+ ],
1780
+ "packageManager": "yarn@1.22.22+sha512.a6b2f7906b721bba3d67d4aff083df04dad64c399707841b7acf00f6b133b7ac24255f2652fa22ae3534329dc6180534e98d17432037ff6fd140556e2bb3137e"
1781
+ }
1782
+
1783
+
1784
+ MEMORY:
1785
+ SOURCE: src\vectra-cli.ts
1786
+ DETAILS: import * as fs from 'fs/promises';
1787
+ import yargs from "yargs/yargs";
1788
+ import { hideBin } from "yargs/helpers";
1789
+ import { LocalDocumentIndex } from "./LocalDocumentIndex";
1790
+ import { WebFetcher } from './WebFetcher';
1791
+ import { AzureOpenAIEmbeddingsOptions, OSSEmbeddingsOptions, OpenAIEmbeddings, OpenAIEmbeddingsOptions } from './OpenAIEmbeddings';
1792
+ import { Colorize } from './internals';
1793
+ import { FileFetcher } from './FileFetcher';
1794
+ export async function run() {
1795
+ // prettier-ignore
1796
+ const args = await yargs(hideBin(process.argv))
1797
+ .scriptName('vectra')
1798
+ .command('create <index>', `create a new local index`, {}, async (args) => {
1799
+ const folderPath = args.index as string;
1800
+ const index = new LocalDocumentIndex({ folderPath });
1801
+ console.log(Colorize.output(`creating index at ${folderPath}`));
1802
+ await index.createIndex({ version: 1, deleteIfExists: true });
1803
+ .command('delete <index>', `delete an existing local index`, {}, async (args) => {
1804
+ const folderPath = args.index as string;
1805
+ console.log(Colorize.output(`deleting index at ${folderPath}`));
1806
+ const index = new LocalDocumentIndex({ folderPath });
1807
+ await index.deleteIndex();
1808
+ .command('add <index>', `adds one or more web pages to an index`, (yargs) => {
1809
+ return yargs
1810
+ .option('keys', {
1811
+ alias: 'k',
1812
+ describe: 'path of a JSON file containing the model keys to use for generating embeddings',
1813
+ type: 'string'
1814
+ .option('uri', {
1815
+ alias: 'u',
1816
+ array: true,
1817
+ describe: 'http/https link to a web page to add',
1818
+ type: 'string'
1819
+ .option('list', {
1820
+ alias: 'l',
1821
+ describe: 'path to a file containing a list of web pages to add',
1822
+ type: 'string'
1823
+ .option('cookie', {
1824
+ alias: 'c',
1825
+ describe: 'optional cookies to add to web fetch requests',
1826
+ type: 'string'
1827
+ .option('chunk-size', {
1828
+ alias: 'cs',
1829
+ describe: 'size of the generated chunks in tokens (defaults to 512)',
1830
+ type: 'number',
1831
+ default: 512
1832
+ .check((argv) => {
1833
+ if (Array.isArray(argv.uri) && argv.uri.length > 0) {
1834
+ return true;
1835
+ } else if (typeof argv.list == 'string' && argv.list.trim().length > 0) {
1836
+ return true;
1837
+ } else {
1838
+ throw new Error(`you must specify either one or more "--uri <link>" for the pages to add or a "--list <file path>" for a file containing the list of pages to add.`);
1839
+ .demandOption(['keys']);
1840
+ }, async (args) => {
1841
+ console.log(Colorize.title('Adding Web Pages to Index'));
1842
+ // Get embedding options
1843
+ const options: OpenAIEmbeddingsOptions|AzureOpenAIEmbeddingsOptions|OSSEmbeddingsOptions = JSON.parse(await fs.readFile(args.keys as string, 'utf-8'));
1844
+ if ((options as OpenAIEmbeddingsOptions).apiKey && !(options as OpenAIEmbeddingsOptions).model) {
1845
+ (options as OpenAIEmbeddingsOptions).model = 'text-embedding-ada-002';
1846
+ (options as OpenAIEmbeddingsOptions).maxTokens = 8000;
1847
+ // Create embeddings
1848
+ const embeddings = new OpenAIEmbeddings(options);
1849
+ // Initialize index
1850
+ const folderPath = args.index as string;
1851
+ const index = new LocalDocumentIndex({
1852
+ folderPath,
1853
+ embeddings,
1854
+ chunkingConfig: {
1855
+ chunkSize: args.chunkSize
1856
+ // Get list of url's
1857
+ const uris = await getItemList(args.uri as string[], args.list as string, 'web page');
1858
+ // Fetch documents
1859
+ const fileFetcher = new FileFetcher();
1860
+ const webFetcher = args.cookie ? new WebFetcher({ headers: { "cookie": args.cookie }}) : new WebFetcher();
1861
+ for (const path of uris) {
1862
+ try {
1863
+ console.log(Colorize.progress(`fetching ${path}`));
1864
+ const fetcher = path.startsWith('http') ? webFetcher : fileFetcher;
1865
+ await fetcher.fetch(path, async (uri, text, docType) => {
1866
+ console.log(Colorize.replaceLine(Colorize.progress(`indexing ${uri}`)));
1867
+ await index.upsertDocument(uri, text, docType);
1868
+ console.log(Colorize.replaceLine(Colorize.success(`added ${uri}`)));
1869
+ return true;
1870
+ } catch (err: unknown) {
1871
+ console.log(Colorize.replaceLine(Colorize.error(`Error adding: ${path}\n${(err as Error).message}`)));
1872
+ .command('remove <index>', `removes one or more documents from an index`, (yargs) => {
1873
+ return yargs
1874
+ .option('uri', {
1875
+ alias: 'u',
1876
+ array: true,
1877
+ describe: 'uri of a document to remove',
1878
+ type: 'string'
1879
+ .option('list', {
1880
+ alias: 'l',
1881
+ describe: 'path to a file containing a list of documents to remove',
1882
+ type: 'string'
1883
+ .check((argv) => {
1884
+ if (Array.isArray(argv.uri) && argv.uri.length > 0) {
1885
+ return true;
1886
+ } else if (typeof argv.list == 'string' && argv.list.trim().length > 0) {
1887
+ return true;
1888
+ } else {
1889
+ throw new Error(`you must specify either one or more "--uri <link>" for the pages to add or a "--list <file path>" for a file containing the list of pages to add.`);
1890
+ }, async (args) => {
1891
+ // Initialize index
1892
+ const folderPath = args.index as string;
1893
+ const index = new LocalDocumentIndex({ folderPath });
1894
+ // Get list of uri's
1895
+ const uris = await getItemList(args.uri as string[], args.list as string, 'document');
1896
+ // Remove documents
1897
+ for (const uri of uris) {
1898
+ console.log(`removing ${uri}`);
1899
+ await index.deleteDocument(uri);
1900
+ .command('stats <index>', `prints the stats for a local index`, {}, async (args) => {
1901
+ const folderPath = args.index as string;
1902
+ const index = new LocalDocumentIndex({ folderPath });
1903
+ const stats = await index.getCatalogStats();
1904
+ console.log(Colorize.title('Index Stats'));
1905
+ console.log(Colorize.output(stats));
1906
+ .command('query <index> <query>', `queries a local index`, (yargs) => {
1907
+ return yargs
1908
+ .option('keys', {
1909
+ alias: 'k',
1910
+ describe: 'path of a JSON file containing the model keys to use for generating embeddings'
1911
+ .option('document-count', {
1912
+ alias: 'dc',
1913
+ describe: 'max number of documents to return (defaults to 10)',
1914
+ type: 'number',
1915
+ default: 10
1916
+ .option('chunk-count', {
1917
+ alias: 'cc',
1918
+ describe: 'max number of chunks to return (defaults to 50)',
1919
+ type: 'number',
1920
+ default: 50
1921
+ .option('section-count', {
1922
+ alias: 'sc',
1923
+ describe: 'max number of document sections to render (defaults to 1)',
1924
+ type: 'number',
1925
+ default: 1
1926
+ .option('tokens', {
1927
+ alias: 't',
1928
+ describe: 'max number of tokens to render for each document section (defaults to 2000)',
1929
+ type: 'number',
1930
+ default: 2000
1931
+ .option('format', {
1932
+ alias: 'f',
1933
+ describe: `format of the rendered results. Defaults to 'sections'`,
1934
+ choices: ['sections', 'stats', 'chunks'],
1935
+ default: 'sections'
1936
+ .option('overlap', {
1937
+ alias: 'o',
1938
+ describe: `whether to add overlapping chunks to sections.`,
1939
+ type: 'boolean',
1940
+ default: true
1941
+ .option('bm25', {
1942
+ alias: 'b',
1943
+ describe: 'Use Okapi-bm25 keyword search alogrithm to perform hybrid search - semantic + keyword. Displayed in blue during search.',
1944
+ type: 'boolean',
1945
+ default: false
1946
+ .demandOption(['keys']);
1947
+ }, async (args) => {
1948
+ console.log(Colorize.title('Querying Index'));
1949
+ // Get embedding options
1950
+ const options: OpenAIEmbeddingsOptions|AzureOpenAIEmbeddingsOptions|OSSEmbeddingsOptions = JSON.parse(await fs.readFile(args.keys as string, 'utf-8'));
1951
+ if ((options as OpenAIEmbeddingsOptions).apiKey && !(options as OpenAIEmbeddingsOptions).model) {
1952
+ (options as OpenAIEmbeddingsOptions).model = 'text-embedding-ada-002';
1953
+ (options as OpenAIEmbeddingsOptions).maxTokens = 8000;
1954
+ // Create embeddings
1955
+ const embeddings = new OpenAIEmbeddings(options);
1956
+ // Initialize index
1957
+ const folderPath = args.index as string;
1958
+ const index = new LocalDocumentIndex({
1959
+ folderPath,
1960
+ embeddings
1961
+ // Query index
1962
+ const query = args.query as string;
1963
+ const results = await index.queryDocuments(query, {
1964
+ maxDocuments: args.documentCount,
1965
+ maxChunks: args.chunkCount,
1966
+ isBm25: args.bm25 as boolean,
1967
+ // Render results
1968
+ for (const result of results) {
1969
+ console.log(Colorize.output(result.uri));
1970
+ console.log(Colorize.value('score', result.score));
1971
+ console.log(Colorize.value('chunks', result.chunks.length));
1972
+ if (args.format == 'sections') {
1973
+ const sections = await result.renderSections(args.tokens, args.sectionCount, args.overlap);
1974
+ console.log(sections.length);
1975
+ for (let i = 0; i < sections.length; i++) {
1976
+ const section = sections[i];
1977
+ const isBm25 = sections[i].isBm25;
1978
+ console.log(isBm25);
1979
+ console.log(Colorize.title(args.sectionCount == 1 ? 'Section' : `Section ${i + 1}`));
1980
+ console.log(Colorize.value('score', section.score));
1981
+ console.log(Colorize.value('tokens', section.tokenCount));
1982
+ console.log(Colorize.output(section.text, isBm25));
1983
+ } else if (args.format == 'chunks') {
1984
+ const text = await result.loadText();
1985
+ for (let i = 0; i < result.chunks.length; i++) {
1986
+ const chunk = result.chunks[i];
1987
+ const startPos = chunk.item.metadata.startPos;
1988
+ const endPos = chunk.item.metadata.endPos;
1989
+ const isBm25 = Boolean(chunk.item.metadata.isBm25);
1990
+ console.log(Colorize.title(`Chunk ${i + 1}`));
1991
+ console.log(Colorize.value('score', chunk.score));
1992
+ console.log(Colorize.value('startPos', startPos));
1993
+ console.log(Colorize.value('endPos', endPos));
1994
+ console.log(Colorize.output(text.substring(startPos, endPos + 1), isBm25));
1995
+ .help()
1996
+ .demandCommand()
1997
+ .parseAsync();
1998
+ async function getItemList(items: string[], listFile: string, uriType: string): Promise<string[]> {
1999
+ if (Array.isArray(items) && items.length > 0) {
2000
+ return items;
2001
+ } else if (typeof listFile == 'string' && listFile.trim().length > 0) {
2002
+ const list = await fs.readFile(listFile, 'utf-8');
2003
+ return list.split('\n').map((item) => item.trim()).filter((item) => item.length > 0);
2004
+ } else {
2005
+ throw new Error(`you must specify either one or more "--uri <${uriType}>" for the items or a "--list <file path>" for a file containing the items.`)
2006
+
2007
+
2008
+ MEMORY:
2009
+ SOURCE: CONTRIBUTING.md
2010
+ DETAILS: # Contribution Guidelines
2011
+
2012
+ ## 1. Introduction
2013
+
2014
+ Thank you for your interest in contributing to Vectra! This project is an open-source local vector database for Node.js, licensed under the MIT License. These guidelines are intended to help you understand how to contribute effectively, maintain code quality, and foster a welcoming and productive community. Please read them carefully before making contributions.
2015
+
2016
+ ## 2. Code of Conduct
2017
+
2018
+ All contributors are expected to adhere to our [Code of Conduct](CODE_OF_CONDUCT.md). Please read it to understand the standards of behavior expected in this community.
2019
+
2020
+ ## 3. How to Contribute
2021
+
2022
+ ### Reporting Bugs
2023
+
2024
+ - If you find a bug, please [open an issue](https://github.com/Stevenic/vectra/issues) and provide as much detail as possible, including steps to reproduce, expected behavior, and your environment (Node.js version, OS, etc.).
2025
+
2026
+ ### Suggesting Enhancements
2027
+
2028
+ - To suggest a new feature or enhancement, [open an issue](https://github.com/Stevenic/vectra/issues) and describe your idea clearly. Include your use case and any relevant examples.
2029
+
2030
+ ### Submitting Pull Requests
2031
+
2032
+ - Fork the repository and create your branch from `main`.
2033
+ - Group your changes into logical, self-contained commits.
2034
+ - Ensure your code follows the project’s coding standards and passes all tests.
2035
+ - Submit a pull request (PR) with a clear description of your changes and reference any related issues.
2036
+
2037
+ ## 4. Development Setup
2038
+
2039
+ ### Prerequisites
2040
+
2041
+ - **Node.js**: Version 20.x or higher is required.
2042
+ - **Package Manager**: [Yarn](https://classic.yarnpkg.com/en/docs/install/) is recommended (see `packageManager` in `package.json`).
2043
+
2044
+ ### Installation Steps
2045
+
2046
+ 1. **Clone the repository:**
2047
+ ```sh
2048
+ git clone https://github.com/Stevenic/vectra.git
2049
+ cd vectra
2050
+ ```
2051
+
2052
+ 2. **Install dependencies:**
2053
+ ```sh
2054
+ yarn install
2055
+ ```
2056
+
2057
+ ### Running Tests and Linting
2058
+
2059
+ - **Build the project:**
2060
+ ```sh
2061
+ yarn build
2062
+ ```
2063
+
2064
+ - **Run tests:**
2065
+ ```sh
2066
+ yarn test
2067
+ ```
2068
+
2069
+ - **Run linter and auto-fix issues:**
2070
+ ```sh
2071
+ yarn lint
2072
+ ```
2073
+
2074
+ ## 5. Coding Standards
2075
+
2076
+ ### Code Style and Formatting
2077
+
2078
+ - Use a consistent code style, as enforced by the linter (`yarn lint`).
2079
+ - Prefer TypeScript for all source files.
2080
+ - Follow the existing file and folder structure in the `src/` directory.
2081
+
2082
+ ### Commit Message Guidelines
2083
+
2084
+ - Write clear, concise commit messages.
2085
+ - Use the present tense (“Add feature” not “Added feature”).
2086
+ - Reference issues or PRs when relevant (e.g., `Fix #123: Correct vector normalization`).
2087
+
2088
+ ### File and Folder Structure
2089
+
2090
+ - Place all source code in the `src/` directory.
2091
+ - Place tests alongside the source files they cover, in files named `*.spec.ts`.
2092
+ - Keep documentation and configuration files in the project root or as specified by existing structure.
2093
+
2094
+ ## 6. Pull Request Process
2095
+
2096
+ ### Branching Model
2097
+
2098
+ - Create a feature or fix branch from `main` (e.g., `feature/add-metadata-filter` or `fix/vector-similarity-bug`).
2099
+ - Keep your branch focused on a single topic or issue.
2100
+
2101
+ ### How to Submit a Pull Request
2102
+
2103
+ 1. Push your branch to your forked repository.
2104
+ 2. Open a pull request (PR) against the `main` branch of the upstream repository.
2105
+ 3. Provide a clear and descriptive title and summary for your PR.
2106
+ 4. Reference any related issues by number (e.g., `Closes #45`).
2107
+
2108
+ ### Review Process
2109
+
2110
+ - All PRs will be reviewed by maintainers or other contributors.
2111
+ - Address any requested changes and update your PR as needed.
2112
+ - PRs must pass all tests and linting checks before being merged.
2113
+ - Once approved, a maintainer will merge your PR.
2114
+
2115
+ ## 7. Testing
2116
+
2117
+ ### How to Run Tests
2118
+
2119
+ - To run the full test suite, use:
2120
+ ```sh
2121
+ yarn test
2122
+ ```
2123
+ - This will build the project and run all tests using Mocha and NYC for coverage.
2124
+
2125
+ ### Writing New Tests
2126
+
2127
+ - Add new tests for any new features or bug fixes.
2128
+ - Place test files alongside the relevant source files under the `src/` directory, using a `.spec.ts` suffix, so that the `test:mocha` script (`src/**/*.spec.ts`) picks them up.
2129
+ - Use [Mocha](https://mochajs.org/) and [Sinon](https://sinonjs.org/) for writing and mocking in tests.
2130
+ - Ensure all tests pass before submitting a pull request.
2131
+
2132
+ ## 8. Documentation
2133
+
2134
+ ### Updating/Adding Documentation
2135
+
2136
+ - Update the `README.md` or other relevant documentation files when you add features or make changes.
2137
+ - Ensure that usage examples and API references are clear and accurate.
2138
+
2139
+ ### Generating API Docs
2140
+
2141
+ - API documentation can be generated using [TypeDoc](https://typedoc.org/).
2142
+ - To generate docs, run:
2143
+ ```sh
2144
+ yarn build-docs
2145
+ ```
2146
+ - Generated documentation will be output as specified in the `build-docs` script in `package.json`.
2147
+
2148
+ ## 9. License
2149
+
2150
+ By contributing to this project, you agree that your contributions will be licensed under the [MIT License](LICENSE). Please ensure that you have the right to submit your code and that it does not violate any third-party licenses or agreements.
2151
+
2152
+ ## 10. Contact
2153
+
2154
+ If you have questions, need help, or want to discuss ideas, please open an issue on [GitHub](https://github.com/Stevenic/vectra/issues). For sensitive matters, you may contact the maintainer at ickman@gmail.com.
2155
+
2156
+ ## 11. Acknowledgements
2157
+
2158
+ - Vectra is inspired by other vector databases such as [Pinecone](https://www.pinecone.io/) and [Qdrant](https://qdrant.tech/).
2159
+ - Portions of this project and its documentation may reuse or adapt content and tools from the open-source community. See individual files for additional attributions where applicable.