vectra 0.12.2 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +92 -100
- package/bin/vectra.js +3 -0
- package/lib/BrowserWebFetcher.d.ts +75 -0
- package/lib/BrowserWebFetcher.d.ts.map +1 -0
- package/lib/BrowserWebFetcher.js +290 -0
- package/lib/BrowserWebFetcher.js.map +1 -0
- package/lib/FileFetcher.d.ts +5 -0
- package/lib/FileFetcher.d.ts.map +1 -0
- package/lib/FileFetcher.js +89 -0
- package/lib/FileFetcher.js.map +1 -0
- package/lib/FileFetcher.spec.d.ts +2 -0
- package/lib/FileFetcher.spec.d.ts.map +1 -0
- package/lib/FileFetcher.spec.js +244 -0
- package/lib/FileFetcher.spec.js.map +1 -0
- package/lib/FolderWatcher.d.ts +91 -0
- package/lib/FolderWatcher.d.ts.map +1 -0
- package/lib/FolderWatcher.js +304 -0
- package/lib/FolderWatcher.js.map +1 -0
- package/lib/FolderWatcher.spec.d.ts +2 -0
- package/lib/FolderWatcher.spec.d.ts.map +1 -0
- package/lib/FolderWatcher.spec.js +308 -0
- package/lib/FolderWatcher.spec.js.map +1 -0
- package/lib/GPT3Tokenizer.d.ts +9 -0
- package/lib/GPT3Tokenizer.spec.d.ts +2 -0
- package/lib/GPT3Tokenizer.spec.d.ts.map +1 -0
- package/lib/GPT3Tokenizer.spec.js +45 -0
- package/lib/GPT3Tokenizer.spec.js.map +1 -0
- package/lib/ItemSelector.d.ts +41 -0
- package/lib/ItemSelector.d.ts.map +1 -0
- package/lib/ItemSelector.js +179 -0
- package/lib/ItemSelector.js.map +1 -0
- package/lib/ItemSelector.spec.d.ts +2 -0
- package/lib/ItemSelector.spec.d.ts.map +1 -0
- package/lib/ItemSelector.spec.js +204 -0
- package/lib/ItemSelector.spec.js.map +1 -0
- package/lib/LocalDocument.d.ts +54 -0
- package/lib/LocalDocument.d.ts.map +1 -1
- package/lib/LocalDocument.js +116 -0
- package/lib/LocalDocument.js.map +1 -0
- package/lib/LocalDocument.spec.d.ts +2 -0
- package/lib/LocalDocument.spec.d.ts.map +1 -0
- package/lib/LocalDocument.spec.js +214 -0
- package/lib/LocalDocument.spec.js.map +1 -0
- package/lib/LocalDocumentIndex.d.ts +152 -0
- package/lib/LocalDocumentIndex.d.ts.map +1 -1
- package/lib/LocalDocumentIndex.js +420 -0
- package/lib/LocalDocumentIndex.js.map +1 -0
- package/lib/LocalDocumentIndex.spec.d.ts +2 -0
- package/lib/LocalDocumentIndex.spec.d.ts.map +1 -0
- package/lib/LocalDocumentIndex.spec.js +494 -0
- package/lib/LocalDocumentIndex.spec.js.map +1 -0
- package/lib/LocalDocumentResult.d.ts +66 -0
- package/lib/LocalDocumentResult.d.ts.map +1 -1
- package/lib/LocalDocumentResult.js +376 -0
- package/lib/LocalDocumentResult.js.map +1 -0
- package/lib/LocalDocumentResult.spec.d.ts +2 -0
- package/lib/LocalDocumentResult.spec.d.ts.map +1 -0
- package/lib/LocalDocumentResult.spec.js +373 -0
- package/lib/LocalDocumentResult.spec.js.map +1 -0
- package/lib/LocalEmbeddings.d.ts +59 -0
- package/lib/LocalEmbeddings.d.ts.map +1 -0
- package/lib/LocalEmbeddings.js +101 -0
- package/lib/LocalEmbeddings.js.map +1 -0
- package/lib/LocalEmbeddings.spec.d.ts +2 -0
- package/lib/LocalEmbeddings.spec.d.ts.map +1 -0
- package/lib/LocalEmbeddings.spec.js +155 -0
- package/lib/LocalEmbeddings.spec.js.map +1 -0
- package/lib/LocalIndex.d.ts +159 -0
- package/lib/LocalIndex.d.ts.map +1 -1
- package/lib/LocalIndex.js +519 -0
- package/lib/LocalIndex.js.map +1 -0
- package/lib/LocalIndex.spec.d.ts +2 -0
- package/lib/LocalIndex.spec.js +611 -9
- package/lib/LocalIndex.spec.js.map +1 -1
- package/lib/OpenAIEmbeddings.d.ts +124 -0
- package/lib/OpenAIEmbeddings.d.ts.map +1 -0
- package/lib/OpenAIEmbeddings.js +166 -0
- package/lib/OpenAIEmbeddings.js.map +1 -0
- package/lib/OpenAIEmbeddings.spec.d.ts +2 -0
- package/lib/OpenAIEmbeddings.spec.d.ts.map +1 -0
- package/lib/OpenAIEmbeddings.spec.js +298 -0
- package/lib/OpenAIEmbeddings.spec.js.map +1 -0
- package/lib/TextSplitter.d.ts +21 -0
- package/lib/TextSplitter.d.ts.map +1 -1
- package/lib/TextSplitter.js +500 -0
- package/lib/TextSplitter.js.map +1 -0
- package/lib/TextSplitter.spec.d.ts +2 -0
- package/lib/TextSplitter.spec.d.ts.map +1 -0
- package/lib/TextSplitter.spec.js +337 -0
- package/lib/TextSplitter.spec.js.map +1 -0
- package/lib/TransformersEmbeddings.d.ts +121 -0
- package/lib/TransformersEmbeddings.d.ts.map +1 -0
- package/lib/TransformersEmbeddings.js +176 -0
- package/lib/TransformersEmbeddings.js.map +1 -0
- package/lib/TransformersEmbeddings.spec.d.ts +2 -0
- package/lib/TransformersEmbeddings.spec.d.ts.map +1 -0
- package/lib/TransformersEmbeddings.spec.js +198 -0
- package/lib/TransformersEmbeddings.spec.js.map +1 -0
- package/lib/TransformersTokenizer.d.ts +33 -0
- package/lib/TransformersTokenizer.d.ts.map +1 -0
- package/lib/TransformersTokenizer.js +44 -0
- package/lib/TransformersTokenizer.js.map +1 -0
- package/lib/TransformersTokenizer.spec.d.ts +2 -0
- package/lib/TransformersTokenizer.spec.d.ts.map +1 -0
- package/lib/TransformersTokenizer.spec.js +112 -0
- package/lib/TransformersTokenizer.spec.js.map +1 -0
- package/lib/WebFetcher.d.ts +14 -0
- package/lib/WebFetcher.d.ts.map +1 -0
- package/lib/WebFetcher.js +238 -0
- package/lib/WebFetcher.js.map +1 -0
- package/lib/WebFetcher.spec.d.ts +2 -0
- package/lib/WebFetcher.spec.d.ts.map +1 -0
- package/lib/WebFetcher.spec.js +263 -0
- package/lib/WebFetcher.spec.js.map +1 -0
- package/lib/browser.d.ts +30 -0
- package/lib/browser.d.ts.map +1 -0
- package/lib/browser.js +52 -0
- package/lib/browser.js.map +1 -0
- package/lib/codecs/IndexCodec.d.ts +37 -0
- package/lib/codecs/IndexCodec.d.ts.map +1 -0
- package/lib/codecs/IndexCodec.js +3 -0
- package/lib/codecs/IndexCodec.js.map +1 -0
- package/lib/codecs/JsonCodec.d.ts +19 -0
- package/lib/codecs/JsonCodec.d.ts.map +1 -0
- package/lib/codecs/JsonCodec.js +35 -0
- package/lib/codecs/JsonCodec.js.map +1 -0
- package/lib/codecs/JsonCodec.spec.d.ts +2 -0
- package/lib/codecs/JsonCodec.spec.d.ts.map +1 -0
- package/lib/codecs/JsonCodec.spec.js +66 -0
- package/lib/codecs/JsonCodec.spec.js.map +1 -0
- package/lib/codecs/LocalIndex.protobuf.spec.d.ts +2 -0
- package/lib/codecs/LocalIndex.protobuf.spec.d.ts.map +1 -0
- package/lib/codecs/LocalIndex.protobuf.spec.js +108 -0
- package/lib/codecs/LocalIndex.protobuf.spec.js.map +1 -0
- package/lib/codecs/ProtobufCodec.d.ts +20 -0
- package/lib/codecs/ProtobufCodec.d.ts.map +1 -0
- package/lib/codecs/ProtobufCodec.js +225 -0
- package/lib/codecs/ProtobufCodec.js.map +1 -0
- package/lib/codecs/ProtobufCodec.spec.d.ts +2 -0
- package/lib/codecs/ProtobufCodec.spec.d.ts.map +1 -0
- package/lib/codecs/ProtobufCodec.spec.js +155 -0
- package/lib/codecs/ProtobufCodec.spec.js.map +1 -0
- package/lib/codecs/index.d.ts +5 -0
- package/lib/codecs/index.d.ts.map +1 -0
- package/lib/codecs/index.js +21 -0
- package/lib/codecs/index.js.map +1 -0
- package/lib/codecs/migrateIndex.d.ts +24 -0
- package/lib/codecs/migrateIndex.d.ts.map +1 -0
- package/lib/codecs/migrateIndex.js +119 -0
- package/lib/codecs/migrateIndex.js.map +1 -0
- package/lib/codecs/migrateIndex.spec.d.ts +2 -0
- package/lib/codecs/migrateIndex.spec.d.ts.map +1 -0
- package/lib/codecs/migrateIndex.spec.js +151 -0
- package/lib/codecs/migrateIndex.spec.js.map +1 -0
- package/lib/codecs/schemas/index.proto +34 -0
- package/lib/index.d.ts +20 -0
- package/lib/index.d.ts.map +1 -1
- package/lib/index.js +36 -0
- package/lib/index.js.map +1 -0
- package/lib/internals/Colorize.d.ts +14 -0
- package/lib/internals/Colorize.d.ts.map +1 -0
- package/lib/internals/Colorize.js +69 -0
- package/lib/internals/Colorize.js.map +1 -0
- package/lib/internals/index.d.ts +3 -0
- package/lib/internals/index.d.ts.map +1 -0
- package/lib/internals/index.js +19 -0
- package/lib/internals/index.js.map +1 -0
- package/lib/internals/types.d.ts +43 -0
- package/lib/internals/types.d.ts.map +1 -0
- package/lib/internals/types.js +3 -0
- package/lib/internals/types.js.map +1 -0
- package/lib/server/IndexManager.d.ts +78 -0
- package/lib/server/IndexManager.d.ts.map +1 -0
- package/lib/server/IndexManager.js +259 -0
- package/lib/server/IndexManager.js.map +1 -0
- package/lib/server/VectraServer.d.ts +40 -0
- package/lib/server/VectraServer.d.ts.map +1 -0
- package/lib/server/VectraServer.js +151 -0
- package/lib/server/VectraServer.js.map +1 -0
- package/lib/server/VectraServer.spec.d.ts +2 -0
- package/lib/server/VectraServer.spec.d.ts.map +1 -0
- package/lib/server/VectraServer.spec.js +322 -0
- package/lib/server/VectraServer.spec.js.map +1 -0
- package/lib/server/handlers/documentHandlers.d.ts +15 -0
- package/lib/server/handlers/documentHandlers.d.ts.map +1 -0
- package/lib/server/handlers/documentHandlers.js +95 -0
- package/lib/server/handlers/documentHandlers.js.map +1 -0
- package/lib/server/handlers/helpers.d.ts +23 -0
- package/lib/server/handlers/helpers.d.ts.map +1 -0
- package/lib/server/handlers/helpers.js +138 -0
- package/lib/server/handlers/helpers.js.map +1 -0
- package/lib/server/handlers/index.d.ts +8 -0
- package/lib/server/handlers/index.d.ts.map +1 -0
- package/lib/server/handlers/index.js +22 -0
- package/lib/server/handlers/index.js.map +1 -0
- package/lib/server/handlers/indexHandlers.d.ts +14 -0
- package/lib/server/handlers/indexHandlers.d.ts.map +1 -0
- package/lib/server/handlers/indexHandlers.js +85 -0
- package/lib/server/handlers/indexHandlers.js.map +1 -0
- package/lib/server/handlers/itemHandlers.d.ts +34 -0
- package/lib/server/handlers/itemHandlers.d.ts.map +1 -0
- package/lib/server/handlers/itemHandlers.js +166 -0
- package/lib/server/handlers/itemHandlers.js.map +1 -0
- package/lib/server/handlers/lifecycleHandlers.d.ts +11 -0
- package/lib/server/handlers/lifecycleHandlers.d.ts.map +1 -0
- package/lib/server/handlers/lifecycleHandlers.js +31 -0
- package/lib/server/handlers/lifecycleHandlers.js.map +1 -0
- package/lib/server/handlers/queryHandlers.d.ts +27 -0
- package/lib/server/handlers/queryHandlers.d.ts.map +1 -0
- package/lib/server/handlers/queryHandlers.js +135 -0
- package/lib/server/handlers/queryHandlers.js.map +1 -0
- package/lib/server/handlers/statsHandlers.d.ts +17 -0
- package/lib/server/handlers/statsHandlers.d.ts.map +1 -0
- package/lib/server/handlers/statsHandlers.js +81 -0
- package/lib/server/handlers/statsHandlers.js.map +1 -0
- package/lib/server/index.d.ts +4 -0
- package/lib/server/index.d.ts.map +1 -0
- package/lib/server/index.js +23 -0
- package/lib/server/index.js.map +1 -0
- package/lib/storage/FileStorage.d.ts +92 -0
- package/lib/storage/FileStorage.d.ts.map +1 -0
- package/lib/storage/FileStorage.js +3 -0
- package/lib/storage/FileStorage.js.map +1 -0
- package/lib/storage/FileStorageUtilities.d.ts +36 -0
- package/lib/storage/FileStorageUtilities.d.ts.map +1 -0
- package/lib/storage/FileStorageUtilities.js +91 -0
- package/lib/storage/FileStorageUtilities.js.map +1 -0
- package/lib/storage/FileStorageUtilities.spec.d.ts +2 -0
- package/lib/storage/FileStorageUtilities.spec.d.ts.map +1 -0
- package/lib/storage/FileStorageUtilities.spec.js +98 -0
- package/lib/storage/FileStorageUtilities.spec.js.map +1 -0
- package/lib/storage/FileType.d.ts +29 -0
- package/lib/storage/FileType.d.ts.map +1 -0
- package/lib/storage/FileType.js +38 -0
- package/lib/storage/FileType.js.map +1 -0
- package/lib/storage/IndexedDBStorage.d.ts +47 -0
- package/lib/storage/IndexedDBStorage.d.ts.map +1 -0
- package/lib/storage/IndexedDBStorage.js +347 -0
- package/lib/storage/IndexedDBStorage.js.map +1 -0
- package/lib/storage/LocalFileStorage.browser.d.ts +19 -0
- package/lib/storage/LocalFileStorage.browser.d.ts.map +1 -0
- package/lib/storage/LocalFileStorage.browser.js +43 -0
- package/lib/storage/LocalFileStorage.browser.js.map +1 -0
- package/lib/storage/LocalFileStorage.d.ts +23 -0
- package/lib/storage/LocalFileStorage.d.ts.map +1 -0
- package/lib/storage/LocalFileStorage.js +152 -0
- package/lib/storage/LocalFileStorage.js.map +1 -0
- package/lib/storage/LocalFileStorage.spec.d.ts +2 -0
- package/lib/storage/LocalFileStorage.spec.d.ts.map +1 -0
- package/lib/storage/LocalFileStorage.spec.js +249 -0
- package/lib/storage/LocalFileStorage.spec.js.map +1 -0
- package/lib/storage/VirtualFileStorage.d.ts +18 -0
- package/lib/storage/VirtualFileStorage.d.ts.map +1 -0
- package/lib/storage/VirtualFileStorage.js +178 -0
- package/lib/storage/VirtualFileStorage.js.map +1 -0
- package/lib/storage/VirtualFileStorage.spec.d.ts +2 -0
- package/lib/storage/VirtualFileStorage.spec.d.ts.map +1 -0
- package/lib/storage/VirtualFileStorage.spec.js +302 -0
- package/lib/storage/VirtualFileStorage.spec.js.map +1 -0
- package/lib/storage/index.d.ts +6 -0
- package/lib/storage/index.d.ts.map +1 -0
- package/lib/storage/index.js +22 -0
- package/lib/storage/index.js.map +1 -0
- package/lib/templates/templates/csharp/README.md +48 -0
- package/lib/templates/templates/csharp/VectraClient.cs +234 -0
- package/lib/templates/templates/go/README.md +71 -0
- package/lib/templates/templates/go/vectra_client.go +322 -0
- package/lib/templates/templates/java/README.md +81 -0
- package/lib/templates/templates/java/VectraClient.java +232 -0
- package/lib/templates/templates/python/README.md +37 -0
- package/lib/templates/templates/python/vectra_client.py +279 -0
- package/lib/templates/templates/rust/Cargo.toml +14 -0
- package/lib/templates/templates/rust/README.md +39 -0
- package/lib/templates/templates/rust/build.rs +4 -0
- package/lib/templates/templates/rust/lib.rs +284 -0
- package/lib/templates/templates/typescript/README.md +96 -0
- package/lib/templates/templates/typescript/VectraClient.ts +374 -0
- package/lib/templates/typescript/VectraClient.d.ts +114 -0
- package/lib/templates/typescript/VectraClient.d.ts.map +1 -0
- package/lib/templates/typescript/VectraClient.js +328 -0
- package/lib/templates/typescript/VectraClient.js.map +1 -0
- package/lib/types.d.ts +153 -0
- package/lib/types.d.ts.map +1 -0
- package/lib/types.js +3 -0
- package/lib/types.js.map +1 -0
- package/lib/utils/index.d.ts +2 -0
- package/lib/utils/index.d.ts.map +1 -0
- package/lib/utils/index.js +18 -0
- package/lib/utils/index.js.map +1 -0
- package/lib/utils/pathUtils.d.ts +40 -0
- package/lib/utils/pathUtils.d.ts.map +1 -0
- package/lib/utils/pathUtils.js +98 -0
- package/lib/utils/pathUtils.js.map +1 -0
- package/lib/vectra-cli.d.ts +2 -0
- package/lib/vectra-cli.d.ts.map +1 -1
- package/lib/vectra-cli.generate.spec.d.ts +2 -0
- package/lib/vectra-cli.generate.spec.d.ts.map +1 -0
- package/lib/vectra-cli.generate.spec.js +112 -0
- package/lib/vectra-cli.generate.spec.js.map +1 -0
- package/lib/vectra-cli.js +760 -0
- package/lib/vectra-cli.js.map +1 -0
- package/lib/vectra-cli.spec.d.ts +1 -0
- package/lib/vectra-cli.spec.d.ts.map +1 -0
- package/lib/vectra-cli.spec.js +2 -0
- package/lib/vectra-cli.spec.js.map +1 -0
- package/package.json +91 -16
- package/proto/vectra_service.proto +276 -0
- package/src/BrowserWebFetcher.ts +345 -0
- package/src/FileFetcher.spec.ts +234 -0
- package/src/FileFetcher.ts +37 -25
- package/src/FolderWatcher.spec.ts +288 -0
- package/src/FolderWatcher.ts +304 -0
- package/src/GPT3Tokenizer.spec.ts +50 -0
- package/src/ItemSelector.spec.ts +252 -0
- package/src/ItemSelector.ts +163 -150
- package/src/LocalDocument.spec.ts +211 -0
- package/src/LocalDocument.ts +88 -94
- package/src/LocalDocumentIndex.spec.ts +481 -0
- package/src/LocalDocumentIndex.ts +39 -40
- package/src/LocalDocumentResult.spec.ts +373 -0
- package/src/LocalDocumentResult.ts +489 -319
- package/src/LocalEmbeddings.spec.ts +138 -0
- package/src/LocalEmbeddings.ts +120 -0
- package/src/LocalIndex.spec.ts +808 -66
- package/src/LocalIndex.ts +479 -429
- package/src/OpenAIEmbeddings.spec.ts +354 -0
- package/src/OpenAIEmbeddings.ts +26 -27
- package/src/TextSplitter.spec.ts +342 -0
- package/src/TextSplitter.ts +517 -532
- package/src/TransformersEmbeddings.spec.ts +188 -0
- package/src/TransformersEmbeddings.ts +232 -0
- package/src/TransformersTokenizer.spec.ts +143 -0
- package/src/TransformersTokenizer.ts +45 -0
- package/src/WebFetcher.spec.ts +288 -0
- package/src/WebFetcher.ts +184 -186
- package/src/browser.ts +69 -0
- package/src/codecs/IndexCodec.ts +40 -0
- package/src/codecs/JsonCodec.spec.ts +70 -0
- package/src/codecs/JsonCodec.ts +37 -0
- package/src/codecs/LocalIndex.protobuf.spec.ts +115 -0
- package/src/codecs/ProtobufCodec.spec.ts +166 -0
- package/src/codecs/ProtobufCodec.ts +193 -0
- package/src/codecs/index.ts +4 -0
- package/src/codecs/migrateIndex.spec.ts +176 -0
- package/src/codecs/migrateIndex.ts +125 -0
- package/src/codecs/schemas/index.proto +34 -0
- package/src/index.ts +9 -1
- package/src/internals/Colorize.ts +19 -16
- package/src/server/IndexManager.ts +243 -0
- package/src/server/VectraServer.spec.ts +303 -0
- package/src/server/VectraServer.ts +156 -0
- package/src/server/handlers/documentHandlers.ts +59 -0
- package/src/server/handlers/helpers.ts +93 -0
- package/src/server/handlers/index.ts +7 -0
- package/src/server/handlers/indexHandlers.ts +44 -0
- package/src/server/handlers/itemHandlers.ts +140 -0
- package/src/server/handlers/lifecycleHandlers.ts +26 -0
- package/src/server/handlers/queryHandlers.ts +96 -0
- package/src/server/handlers/statsHandlers.ts +38 -0
- package/src/server/index.ts +3 -0
- package/src/storage/FileStorage.ts +105 -0
- package/src/storage/FileStorageUtilities.spec.ts +106 -0
- package/src/storage/FileStorageUtilities.ts +77 -0
- package/src/storage/FileType.ts +61 -0
- package/src/storage/IndexedDBStorage.ts +365 -0
- package/src/storage/LocalFileStorage.browser.ts +52 -0
- package/src/storage/LocalFileStorage.spec.ts +292 -0
- package/src/storage/LocalFileStorage.ts +98 -0
- package/src/storage/VirtualFileStorage.spec.ts +307 -0
- package/src/storage/VirtualFileStorage.ts +169 -0
- package/src/storage/index.ts +5 -0
- package/src/templates/csharp/README.md +48 -0
- package/src/templates/csharp/VectraClient.cs +234 -0
- package/src/templates/go/README.md +71 -0
- package/src/templates/go/vectra_client.go +322 -0
- package/src/templates/java/README.md +81 -0
- package/src/templates/java/VectraClient.java +232 -0
- package/src/templates/python/README.md +37 -0
- package/src/templates/python/vectra_client.py +279 -0
- package/src/templates/rust/Cargo.toml +14 -0
- package/src/templates/rust/README.md +39 -0
- package/src/templates/rust/build.rs +4 -0
- package/src/templates/rust/lib.rs +284 -0
- package/src/templates/typescript/README.md +96 -0
- package/src/templates/typescript/VectraClient.ts +374 -0
- package/src/types.ts +131 -123
- package/src/utils/index.ts +1 -0
- package/src/utils/pathUtils.ts +106 -0
- package/src/vectra-cli.generate.spec.ts +72 -0
- package/src/vectra-cli.spec.ts +0 -0
- package/src/vectra-cli.ts +687 -246
package/src/LocalIndex.ts
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
import
|
|
2
|
-
import * as path from 'path';
|
|
1
|
+
import { pathUtils as path } from './utils/pathUtils';
|
|
3
2
|
import { v4 } from 'uuid';
|
|
4
3
|
import { ItemSelector } from './ItemSelector';
|
|
5
4
|
import { IndexItem, IndexStats, MetadataFilter, MetadataTypes, QueryResult } from './types';
|
|
@@ -8,12 +7,15 @@ import { LocalDocumentIndex } from './LocalDocumentIndex';
|
|
|
8
7
|
import bm25 from 'wink-bm25-text-search';
|
|
9
8
|
import winkNLP from 'wink-nlp';
|
|
10
9
|
import model from 'wink-eng-lite-web-model';
|
|
10
|
+
import { FileStorage, LocalFileStorage } from './storage';
|
|
11
|
+
import { IndexCodec, JsonCodec } from './codecs';
|
|
12
|
+
|
|
11
13
|
export interface CreateIndexConfig {
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
14
|
+
version: number;
|
|
15
|
+
deleteIfExists?: boolean;
|
|
16
|
+
metadata_config?: {
|
|
17
|
+
indexed?: string[];
|
|
18
|
+
};
|
|
17
19
|
}
|
|
18
20
|
|
|
19
21
|
/**
|
|
@@ -22,468 +24,516 @@ export interface CreateIndexConfig {
|
|
|
22
24
|
* This class is used to create, update, and query a local vector index.
|
|
23
25
|
* Each index is a folder on disk containing an index.json file and an optional set of metadata files.
|
|
24
26
|
*/
|
|
25
|
-
export class LocalIndex<TMetadata extends Record<string,MetadataTypes> = Record<string,MetadataTypes>>{
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
27
|
+
export class LocalIndex<TMetadata extends Record<string, MetadataTypes> = Record<string, MetadataTypes>> {
|
|
28
|
+
private readonly _folderPath: string;
|
|
29
|
+
private readonly _indexName: string = 'index.json';
|
|
30
|
+
private readonly _storage: FileStorage;
|
|
31
|
+
private readonly _codec: IndexCodec;
|
|
32
|
+
private _data?: IndexData;
|
|
33
|
+
private _update?: IndexData;
|
|
34
|
+
|
|
35
|
+
// member fields for BM25
|
|
36
|
+
private _bm25Engine: any;
|
|
37
|
+
private readonly _bm25Factory: () => any;
|
|
38
|
+
private readonly _docReader: (docId: string) => Promise<string>;
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* Creates a new instance of LocalIndex.
|
|
42
|
+
* @param folderPath Path to the index folder.
|
|
43
|
+
* @param indexName Optional index file name. Defaults to 'index' + codec.extension.
|
|
44
|
+
* @param storage Optional file storage instance. Defaults to LocalFileStorage.
|
|
45
|
+
* @param codec Optional codec for serialization. Defaults to JsonCodec.
|
|
46
|
+
* @param options Optional constructor options for dependency injection.
|
|
47
|
+
*/
|
|
48
|
+
public constructor(
|
|
49
|
+
folderPath: string,
|
|
50
|
+
indexName?: string,
|
|
51
|
+
storage?: FileStorage,
|
|
52
|
+
codec?: IndexCodec,
|
|
53
|
+
options?: {
|
|
54
|
+
bm25Factory?: () => any;
|
|
55
|
+
docReader?: (docId: string) => Promise<string>;
|
|
49
56
|
}
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
57
|
+
) {
|
|
58
|
+
this._folderPath = folderPath;
|
|
59
|
+
this._codec = codec || new JsonCodec();
|
|
60
|
+
if (indexName) {
|
|
61
|
+
this._indexName = indexName;
|
|
62
|
+
} else {
|
|
63
|
+
this._indexName = `index${this._codec.extension}`;
|
|
56
64
|
}
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
throw new Error('Index already exists');
|
|
94
|
-
}
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
try {
|
|
98
|
-
// Create folder for index
|
|
99
|
-
await fs.mkdir(this._folderPath, { recursive: true });
|
|
100
|
-
|
|
101
|
-
// Initialize index.json file
|
|
102
|
-
this._data = {
|
|
103
|
-
version: config.version,
|
|
104
|
-
metadata_config: config.metadata_config ?? {},
|
|
105
|
-
items: []
|
|
106
|
-
};
|
|
107
|
-
|
|
108
|
-
await fs.writeFile(path.join(this._folderPath, this._indexName), JSON.stringify(this._data));
|
|
109
|
-
} catch (err: unknown) {
|
|
110
|
-
await this.deleteIndex();
|
|
111
|
-
throw new Error('Error creating index');
|
|
112
|
-
}
|
|
65
|
+
this._storage = storage || new LocalFileStorage();
|
|
66
|
+
this._bm25Factory = options?.bm25Factory || (() => bm25());
|
|
67
|
+
this._docReader = options?.docReader || (async (docId: string) => {
|
|
68
|
+
const doc = new LocalDocument((this as unknown) as LocalDocumentIndex, docId, '');
|
|
69
|
+
return await doc.loadText();
|
|
70
|
+
});
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
/** Path to the index folder. */
|
|
74
|
+
public get folderPath(): string {
|
|
75
|
+
return this._folderPath;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/** Name of the index file. */
|
|
79
|
+
public get indexName(): string {
|
|
80
|
+
return this._indexName;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
/** Storage provider used to store the index. */
|
|
84
|
+
public get storage(): FileStorage {
|
|
85
|
+
return this._storage;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
/** Codec used for serialization. */
|
|
89
|
+
public get codec(): IndexCodec {
|
|
90
|
+
return this._codec;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
/**
|
|
94
|
+
* Begins an update to the index.
|
|
95
|
+
* @remarks
|
|
96
|
+
* This method loads the index into memory and prepares it for updates.
|
|
97
|
+
*/
|
|
98
|
+
public async beginUpdate(): Promise<void> {
|
|
99
|
+
if (this._update) {
|
|
100
|
+
throw new Error('Update already in progress');
|
|
113
101
|
}
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
102
|
+
await this.loadIndexData();
|
|
103
|
+
this._update = structuredClone(this._data);
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
/**
|
|
107
|
+
* Cancels an update to the index.
|
|
108
|
+
* @remarks
|
|
109
|
+
* This method discards any changes made to the index since the update began.
|
|
110
|
+
*/
|
|
111
|
+
public cancelUpdate(): void {
|
|
112
|
+
this._update = undefined;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
/**
|
|
116
|
+
* Creates a new index.
|
|
117
|
+
* @remarks
|
|
118
|
+
* This method creates a new folder on disk containing an index.json file.
|
|
119
|
+
* @param config Index configuration.
|
|
120
|
+
*/
|
|
121
|
+
public async createIndex(config: CreateIndexConfig = { version: 1 }): Promise<void> {
|
|
122
|
+
// Delete if exists
|
|
123
|
+
if (await this.isIndexCreated()) {
|
|
124
|
+
if (config.deleteIfExists) {
|
|
125
|
+
await this.deleteIndex();
|
|
126
|
+
} else {
|
|
127
|
+
throw new Error('Index already exists');
|
|
128
|
+
}
|
|
126
129
|
}
|
|
127
130
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
this._update!.items.splice(index, 1);
|
|
143
|
-
}
|
|
144
|
-
await this.endUpdate();
|
|
145
|
-
}
|
|
131
|
+
try {
|
|
132
|
+
// Create folder for index
|
|
133
|
+
await this.storage.createFolder(this._folderPath);
|
|
134
|
+
|
|
135
|
+
// Initialize index.json file
|
|
136
|
+
this._data = {
|
|
137
|
+
version: config.version,
|
|
138
|
+
metadata_config: config.metadata_config ?? {},
|
|
139
|
+
items: []
|
|
140
|
+
};
|
|
141
|
+
await this.storage.upsertFile(path.join(this._folderPath, this._indexName), this._codec.serializeIndex(this._data));
|
|
142
|
+
} catch {
|
|
143
|
+
await this.deleteIndex();
|
|
144
|
+
throw new Error('Error creating index');
|
|
146
145
|
}
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
/**
|
|
149
|
+
* Deletes the index.
|
|
150
|
+
* @remarks
|
|
151
|
+
* This method deletes the index folder from disk.
|
|
152
|
+
*/
|
|
153
|
+
public async deleteIndex(): Promise<void> {
|
|
154
|
+
this._data = undefined;
|
|
155
|
+
return await this.storage.deleteFolder(this._folderPath);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
/**
|
|
159
|
+
* Deletes an item from the index.
|
|
160
|
+
* @param id ID of item to delete.
|
|
161
|
+
*/
|
|
162
|
+
public async deleteItem(id: string): Promise<void> {
|
|
163
|
+
if (this._update) {
|
|
164
|
+
const index = this._update.items.findIndex(i => i.id === id);
|
|
165
|
+
if (index >= 0) {
|
|
166
|
+
this._update.items.splice(index, 1);
|
|
167
|
+
}
|
|
168
|
+
} else {
|
|
169
|
+
await this.beginUpdate();
|
|
170
|
+
const index = this._update!.items.findIndex(i => i.id === id);
|
|
171
|
+
if (index >= 0) {
|
|
172
|
+
this._update!.items.splice(index, 1);
|
|
173
|
+
}
|
|
174
|
+
await this.endUpdate();
|
|
166
175
|
}
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
items: this._data!.items.length
|
|
178
|
-
};
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
/**
|
|
179
|
+
* Ends an update to the index.
|
|
180
|
+
* @remarks
|
|
181
|
+
* This method saves the index to disk.
|
|
182
|
+
*/
|
|
183
|
+
public async endUpdate(): Promise<void> {
|
|
184
|
+
if (!this._update) {
|
|
185
|
+
throw new Error('No update in progress');
|
|
179
186
|
}
|
|
180
187
|
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
return this._data!.items.find(i => i.id === id) as any | undefined;
|
|
188
|
+
try {
|
|
189
|
+
// Save index
|
|
190
|
+
await this.storage.upsertFile(path.join(this._folderPath, this._indexName), this._codec.serializeIndex(this._update));
|
|
191
|
+
this._data = this._update;
|
|
192
|
+
this._update = undefined;
|
|
193
|
+
} catch (err: unknown) {
|
|
194
|
+
throw new Error(`Error saving index: ${(err as any).toString()}`);
|
|
189
195
|
}
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
/**
|
|
199
|
+
* Loads an index from disk and returns its stats.
|
|
200
|
+
* @returns Index stats.
|
|
201
|
+
*/
|
|
202
|
+
public async getIndexStats(): Promise<IndexStats> {
|
|
203
|
+
await this.loadIndexData();
|
|
204
|
+
return {
|
|
205
|
+
version: this._data!.version,
|
|
206
|
+
metadata_config: this._data!.metadata_config,
|
|
207
|
+
items: this._data!.items.length
|
|
208
|
+
};
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
/**
|
|
212
|
+
* Returns an item from the index given its ID.
|
|
213
|
+
* @param id ID of the item to retrieve.
|
|
214
|
+
* @returns Item or undefined if not found.
|
|
215
|
+
*/
|
|
216
|
+
public async getItem<TItemMetadata extends TMetadata = TMetadata>(id: string): Promise<IndexItem<TItemMetadata> | undefined> {
|
|
217
|
+
await this.loadIndexData();
|
|
218
|
+
return this._data!.items.find(i => i.id === id) as any | undefined;
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
/**
|
|
222
|
+
* Adds an item to the index.
|
|
223
|
+
* @remarks
|
|
224
|
+
* A new update is started if one is not already in progress. If an item with the same ID
|
|
225
|
+
* already exists, an error will be thrown.
|
|
226
|
+
* @param item Item to insert.
|
|
227
|
+
* @returns Inserted item.
|
|
228
|
+
*/
|
|
229
|
+
public async insertItem<TItemMetadata extends TMetadata = TMetadata>(item: Partial<IndexItem<TItemMetadata>>): Promise<IndexItem<TItemMetadata>> {
|
|
230
|
+
if (this._update) {
|
|
231
|
+
return await this.addItemToUpdate(item, true) as any;
|
|
232
|
+
} else {
|
|
233
|
+
await this.beginUpdate();
|
|
234
|
+
const newItem = await this.addItemToUpdate(item, true);
|
|
235
|
+
await this.endUpdate();
|
|
236
|
+
return newItem as any;
|
|
208
237
|
}
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
/**
 * Inserts several items into the index as one all-or-nothing update.
 * @remarks
 * The method starts its own update, so no other update may be in progress. Items are added
 * one after another; if any insert (or the final commit) fails, the whole update is
 * cancelled and the error is rethrown, so the local index never receives a partial batch.
 * @param items Items to insert.
 * @returns Inserted items.
 */
public async batchInsertItems<TItemMetadata extends TMetadata = TMetadata>(items: Partial<IndexItem<TItemMetadata>>[]): Promise<IndexItem[]> {
    await this.beginUpdate();

    const inserted: IndexItem[] = [];
    try {
        for (let n = 0; n < items.length; n++) {
            inserted.push(await this.addItemToUpdate(items[n], true));
        }
        await this.endUpdate();
    } catch (err) {
        // Drop the pending update so nothing from this batch is applied, then let the
        // original error propagate to the caller.
        await this.cancelUpdate();
        throw err;
    }
    return inserted;
}
|
|
265
|
+
|
|
266
|
+
/** Reports whether the index file is present at `<folderPath>/<indexName>` in storage. */
public async isIndexCreated(): Promise<boolean> {
    const indexPath = path.join(this._folderPath, this._indexName);
    const exists = await this.storage.pathExists(indexPath);
    return exists;
}
|
|
270
|
+
|
|
271
|
+
/**
 * Returns all items in the index.
 * @remarks
 * This method loads the index into memory and returns all of its items. The returned array
 * is a shallow copy: adding to or removing from it does not affect the index, but the item
 * objects themselves are shared with the index and should not be modified.
 * @returns Array of all items in the index.
 */
public async listItems<TItemMetadata extends TMetadata = TMetadata>(): Promise<IndexItem<TItemMetadata>[]> {
    await this.loadIndexData();
    const allItems = this._data!.items;
    return allItems.slice(0) as any;
}
|
|
282
|
+
|
|
283
|
+
/**
 * Returns the items whose metadata satisfies a filter.
 * @remarks
 * This method loads the index into memory and tests each item's metadata against the
 * filter using `ItemSelector.select`.
 * @param filter Filter to apply.
 * @returns Array of items matching the filter.
 */
public async listItemsByMetadata<TItemMetadata extends TMetadata = TMetadata>(filter: MetadataFilter): Promise<IndexItem<TItemMetadata>[]> {
    await this.loadIndexData();

    const matches: IndexItem[] = [];
    for (const entry of this._data!.items) {
        if (ItemSelector.select(entry.metadata, filter)) {
            matches.push(entry);
        }
    }
    return matches as any;
}
|
|
294
|
+
|
|
295
|
+
/**
 * Finds the top k items in the index that are most similar to the vector.
 * @remarks
 * This method loads the index into memory, optionally narrows the items with a metadata
 * filter, scores every remaining item against `vector` and returns the `topK` highest
 * scoring items. For returned items that reference an external metadata file, that file is
 * read and its contents replace the item's metadata.
 *
 * When `isBm25` is true, up to `topK` BM25 keyword matches for `query` are appended after
 * the semantic results. Items already selected semantically are not offered to the BM25
 * engine, and the metadata of every BM25 result is tagged with `isBm25: true`. BM25 results
 * carry the metadata stored in the index; external metadata files are not loaded for them.
 * @param vector Vector to query against.
 * @param query Query string (used when isBm25=true).
 * @param topK Number of items to return.
 * @param filter Optional. Filter to apply.
 * @param isBm25 Optional. If true, append BM25 keyword results to semantic results.
 * @returns Similar items to the vector that match the supplied filter.
 */
public async queryItems<TItemMetadata extends TMetadata = TMetadata>(
    vector: number[],
    query: string,
    topK: number,
    filter?: MetadataFilter,
    isBm25?: boolean
): Promise<QueryResult<TItemMetadata>[]> {
    await this.loadIndexData();

    // Filter items
    let items = this._data!.items;
    if (filter) {
        items = items.filter(i => ItemSelector.select(i.metadata, filter));
    }

    // Calculate distances
    const norm = ItemSelector.normalize(vector);
    const distances: { index: number; distance: number }[] = [];
    for (let i = 0; i < items.length; i++) {
        const item = items[i];
        const distance = ItemSelector.normalizedCosineSimilarity(vector, norm, item.vector, item.norm);
        distances.push({ index: i, distance: distance });
    }

    // Sort by distance DESCENDING (highest similarity first)
    distances.sort((a, b) => b.distance - a.distance);

    // Find top k. Each result gets its own shallow copy of the item so that swapping in
    // external metadata below does not touch the item held by the index.
    const top: QueryResult<TItemMetadata>[] = distances.slice(0, topK).map(d => {
        return {
            item: { ...items[d.index] } as any,
            score: d.distance
        };
    });

    // Load external metadata
    for (const item of top) {
        if (item.item.metadataFile) {
            const metadataPath = path.join(this._folderPath, item.item.metadataFile);
            const metadataBuffer = await this.storage.readFile(metadataPath);
            item.item.metadata = this._codec.deserializeMetadata(metadataBuffer) as any;
        }
    }

    // Perform bm25 search only if enabled. Avoid duplicate chunks that are already selected during semantic search.
    if (isBm25) {
        const itemSet = new Set<string>();
        for (const r of top) itemSet.add(r.item.id);

        // Set up BM25 engine
        await this.setupBm25();

        // Text of each document read so far, keyed by document id. Many chunks usually
        // belong to the same document, so this avoids re-reading it once per chunk.
        // Assumes the document reader returns the same text for the same id within one
        // query - TODO confirm against the reader implementation.
        const docTextById = new Map<string, string>();

        // Add docs if we have necessary metadata; guard everything to avoid crashes
        for (let i = 0; i < items.length; i++) {
            if (itemSet.has(items[i].id)) {
                continue;
            }
            const item = items[i] as any;
            const md = item.metadata || {};
            if (md.documentId == null || md.startPos == null || md.endPos == null) {
                continue;
            }
            try {
                const documentId = String(md.documentId);
                let currDocTxt = docTextById.get(documentId);
                if (currDocTxt === undefined) {
                    const loaded: string = await this._docReader(documentId);
                    docTextById.set(documentId, loaded);
                    currDocTxt = loaded;
                }
                const startPos = Number(md.startPos);
                const endPos = Number(md.endPos);
                // endPos is treated as inclusive, hence the +1 for substring's exclusive end.
                const chunkText = currDocTxt.substring(startPos, endPos + 1);
                // The item's position in `items` doubles as the BM25 document id.
                this._bm25Engine.addDoc?.({ body: chunkText }, i);
            } catch {
                // Ignore load or engine errors for BM25 doc prep
            }
        }

        // NOTE(review): unlike the doc prep above, consolidate() and the search below are
        // not guarded, so an engine error here rejects the whole query - confirm that this
        // is intended.
        this._bm25Engine.consolidate?.();

        const results: any[] = await this.bm25Search(query, items, topK);

        results.forEach((res: any) => {
            // Support both [index, score] tuples and { item, score } objects
            if (Array.isArray(res)) {
                const idx = res[0];
                const score = res[1];
                if (items[idx]) {
                    top.push({
                        item: { ...items[idx], metadata: { ...items[idx].metadata, isBm25: true } } as any,
                        score
                    });
                }
            } else if (res && typeof res === 'object' && 'item' in res && 'score' in res) {
                const objItem = { ...(res.item || {}), metadata: { ...(res.item?.metadata || {}), isBm25: true } } as any;
                top.push({ item: objItem, score: res.score });
            }
        });
    }

    return top;
}
|
|
402
|
+
|
|
403
|
+
/**
 * Adds or replaces an item in the index.
 * @remarks
 * If an update is already in progress the item is added to it, and ending or cancelling
 * that update stays the caller's responsibility. Otherwise a one-off update is started
 * for this single upsert; should the upsert or the commit fail, that update is cancelled
 * again so a failed call does not leave a half-open update behind. If an item with the
 * same ID already exists, it will be replaced.
 * @param item Item to insert or replace.
 * @returns Upserted item.
 */
public async upsertItem<TItemMetadata extends TMetadata = TMetadata>(item: Partial<IndexItem<TItemMetadata>>): Promise<IndexItem<TItemMetadata>> {
    // Caller-managed update: just queue the item.
    if (this._update) {
        return await this.addItemToUpdate(item, false) as any;
    }

    // Self-managed update: begin, add, commit - and roll back on any failure.
    await this.beginUpdate();
    try {
        const newItem = await this.addItemToUpdate(item, false);
        await this.endUpdate();
        return newItem as any;
    } catch (e) {
        // Without this, `this._update` would stay set and later upserts would silently
        // join the abandoned update instead of being written out.
        await this.cancelUpdate();
        throw e;
    }
}
|
|
371
421
|
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
if (!await this.isIndexCreated()) {
|
|
381
|
-
throw new Error('Index does not exist');
|
|
382
|
-
}
|
|
383
|
-
|
|
384
|
-
const data = await fs.readFile(path.join(this._folderPath, this.indexName));
|
|
385
|
-
this._data = JSON.parse(data.toString());
|
|
422
|
+
/**
 * Loads the index from storage the first time it is needed; once `_data` is populated,
 * later calls return immediately.
 * @throws Error if the index file does not exist.
 */
protected async loadIndexData(): Promise<void> {
    if (!this._data) {
        const created = await this.isIndexCreated();
        if (!created) {
            throw new Error('Index does not exist');
        }

        const indexPath = path.join(this._folderPath, this.indexName);
        const raw = await this.storage.readFile(indexPath);
        this._data = this._codec.deserializeIndex(raw);
    }
}
|
|
413
434
|
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
metadata = item.metadata;
|
|
420
|
-
}
|
|
435
|
+
/**
 * Builds an index item from `item` and adds it to the update in progress.
 * @remarks
 * Callers must have started an update first; `this._update` is dereferenced unconditionally.
 * A missing id is replaced with a generated one. When the index declares indexed metadata
 * keys, only those keys are kept on the item, and if the supplied metadata contains any
 * other key the full metadata is written to a separate file referenced by `metadataFile`.
 * @param item Item to add; `vector` is required.
 * @param unique If true, an existing item with the same id is an error. If false, an
 * existing item with the same id is updated in place.
 * @returns The item now stored in the update (the existing object when one was replaced).
 * @throws Error if no vector is supplied, or if `unique` is true and the id is taken.
 */
private async addItemToUpdate(item: Partial<IndexItem<any>>, unique: boolean): Promise<IndexItem> {
    // Ensure vector is provided
    if (!item.vector) {
        throw new Error('Vector is required');
    }

    // Ensure unique
    const id = item.id ?? v4();
    if (unique && this._update!.items.some(i => i.id === id)) {
        throw new Error(`Item with id ${id} already exists`);
    }

    // Check for indexed metadata
    let metadata: Record<string, any> = {};
    let metadataFile: string | undefined;
    const indexedKeys = this._update!.metadata_config.indexed ?? [];
    if (indexedKeys.length > 0 && item.metadata) {
        // Copy only indexed metadata. hasOwnProperty (rather than a truthiness test) keeps
        // indexed values such as 0, '' or false.
        const indexedOnly: Record<string, any> = {};
        for (const key of indexedKeys) {
            if (Object.prototype.hasOwnProperty.call(item.metadata, key)) {
                indexedOnly[key] = (item.metadata as any)[key];
            }
        }

        // Determine if there are any non-indexed keys
        const hasNonIndexed = Object.keys(item.metadata).some(k => !indexedKeys.includes(k));

        // Always store only indexed keys in the index
        metadata = indexedOnly;

        // Write full metadata externally only if there are non-indexed keys present
        if (hasNonIndexed) {
            metadataFile = `${v4()}${this._codec.extension}`;
            const metadataPath = path.join(this._folderPath, metadataFile);
            await this.storage.upsertFile(metadataPath, this._codec.serializeMetadata(item.metadata as Record<string, MetadataTypes>));
        }
    } else if (item.metadata) {
        metadata = item.metadata;
    }

    // Create new item
    const newItem: IndexItem = {
        id: id,
        metadata: metadata,
        vector: item.vector,
        norm: ItemSelector.normalize(item.vector)
    };
    if (metadataFile) {
        newItem.metadataFile = metadataFile;
    }

    // Add item to index
    if (!unique) {
        const existing = this._update!.items.find(i => i.id === id);
        if (existing) {
            existing.metadata = newItem.metadata;
            existing.vector = newItem.vector;
            // The norm is derived from the vector and is what queries score against, so it
            // must be replaced together with the vector.
            existing.norm = newItem.norm;
            // NOTE(review): a metadata file previously referenced by this item is not
            // removed here; only the reference is replaced.
            existing.metadataFile = newItem.metadataFile;
            return existing;
        }
    }

    this._update!.items.push(newItem);
    return newItem;
}
|
|
507
|
+
|
|
508
|
+
/**
 * Creates a fresh BM25 engine via the configured factory and configures it with a single
 * `body` field and a tokenizing prep task.
 */
private async setupBm25(): Promise<any> {
    this._bm25Engine = this._bm25Factory();

    const nlp = winkNLP(model);
    const its = nlp.its;

    // Keep only real words that are not stop words (drops punctuation etc.).
    const isContentWord = (t: any) => (t.out(its.type) === 'word' && !t.out(its.stopWordFlag));
    // Reduce a token to its stem; negated tokens get a '!' prefix.
    const toTerm = (t: any) => (t.out(its.negationFlag)) ? '!' + t.out(its.stem) : t.out(its.stem);

    // Turns raw text into the list of terms handed to the BM25 engine.
    const prepTask = (text: string) => {
        const tokens: any[] = [];
        nlp.readDoc(text)
            .tokens()
            .filter(isContentWord)
            .each((t: any) => tokens.push(toTerm(t)));
        return tokens;
    };

    // Only the `body` field is indexed, with weight 1.
    this._bm25Engine.defineConfig({ fldWeights: { body: 1 } });
    // Register the tokenizer as the engine's only prep task.
    this._bm25Engine.definePrepTasks([prepTask]);
}
|
|
526
|
+
|
|
527
|
+
/**
 * Runs a keyword search on the current BM25 engine and keeps at most `topK` results.
 * @param searchQuery Text to search for.
 * @param _items Unused.
 * @param topK Maximum number of results to return.
 * @returns The first `topK` entries of the engine's search result.
 */
private async bm25Search(searchQuery: string, _items: any, topK: number): Promise<any> {
    return this._bm25Engine.search(searchQuery).slice(0, topK);
}
|
|
481
531
|
}
|
|
482
532
|
|
|
483
533
|
/** In-memory shape of an index as produced by the codec's `deserializeIndex`. */
interface IndexData {
    version: number;
    /** `indexed` lists the metadata keys that are kept on the items themselves. */
    metadata_config: { indexed?: string[] };
    /** All items in the index. */
    items: IndexItem[];
}
|