vectra 0.12.2 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +92 -100
- package/bin/vectra.js +3 -0
- package/lib/BrowserWebFetcher.d.ts +75 -0
- package/lib/BrowserWebFetcher.d.ts.map +1 -0
- package/lib/BrowserWebFetcher.js +290 -0
- package/lib/BrowserWebFetcher.js.map +1 -0
- package/lib/FileFetcher.d.ts +5 -0
- package/lib/FileFetcher.d.ts.map +1 -0
- package/lib/FileFetcher.js +89 -0
- package/lib/FileFetcher.js.map +1 -0
- package/lib/FileFetcher.spec.d.ts +2 -0
- package/lib/FileFetcher.spec.d.ts.map +1 -0
- package/lib/FileFetcher.spec.js +244 -0
- package/lib/FileFetcher.spec.js.map +1 -0
- package/lib/FolderWatcher.d.ts +91 -0
- package/lib/FolderWatcher.d.ts.map +1 -0
- package/lib/FolderWatcher.js +304 -0
- package/lib/FolderWatcher.js.map +1 -0
- package/lib/FolderWatcher.spec.d.ts +2 -0
- package/lib/FolderWatcher.spec.d.ts.map +1 -0
- package/lib/FolderWatcher.spec.js +308 -0
- package/lib/FolderWatcher.spec.js.map +1 -0
- package/lib/GPT3Tokenizer.d.ts +9 -0
- package/lib/GPT3Tokenizer.spec.d.ts +2 -0
- package/lib/GPT3Tokenizer.spec.d.ts.map +1 -0
- package/lib/GPT3Tokenizer.spec.js +45 -0
- package/lib/GPT3Tokenizer.spec.js.map +1 -0
- package/lib/ItemSelector.d.ts +41 -0
- package/lib/ItemSelector.d.ts.map +1 -0
- package/lib/ItemSelector.js +179 -0
- package/lib/ItemSelector.js.map +1 -0
- package/lib/ItemSelector.spec.d.ts +2 -0
- package/lib/ItemSelector.spec.d.ts.map +1 -0
- package/lib/ItemSelector.spec.js +204 -0
- package/lib/ItemSelector.spec.js.map +1 -0
- package/lib/LocalDocument.d.ts +54 -0
- package/lib/LocalDocument.d.ts.map +1 -1
- package/lib/LocalDocument.js +116 -0
- package/lib/LocalDocument.js.map +1 -0
- package/lib/LocalDocument.spec.d.ts +2 -0
- package/lib/LocalDocument.spec.d.ts.map +1 -0
- package/lib/LocalDocument.spec.js +214 -0
- package/lib/LocalDocument.spec.js.map +1 -0
- package/lib/LocalDocumentIndex.d.ts +152 -0
- package/lib/LocalDocumentIndex.d.ts.map +1 -1
- package/lib/LocalDocumentIndex.js +420 -0
- package/lib/LocalDocumentIndex.js.map +1 -0
- package/lib/LocalDocumentIndex.spec.d.ts +2 -0
- package/lib/LocalDocumentIndex.spec.d.ts.map +1 -0
- package/lib/LocalDocumentIndex.spec.js +494 -0
- package/lib/LocalDocumentIndex.spec.js.map +1 -0
- package/lib/LocalDocumentResult.d.ts +66 -0
- package/lib/LocalDocumentResult.d.ts.map +1 -1
- package/lib/LocalDocumentResult.js +376 -0
- package/lib/LocalDocumentResult.js.map +1 -0
- package/lib/LocalDocumentResult.spec.d.ts +2 -0
- package/lib/LocalDocumentResult.spec.d.ts.map +1 -0
- package/lib/LocalDocumentResult.spec.js +373 -0
- package/lib/LocalDocumentResult.spec.js.map +1 -0
- package/lib/LocalEmbeddings.d.ts +59 -0
- package/lib/LocalEmbeddings.d.ts.map +1 -0
- package/lib/LocalEmbeddings.js +101 -0
- package/lib/LocalEmbeddings.js.map +1 -0
- package/lib/LocalEmbeddings.spec.d.ts +2 -0
- package/lib/LocalEmbeddings.spec.d.ts.map +1 -0
- package/lib/LocalEmbeddings.spec.js +155 -0
- package/lib/LocalEmbeddings.spec.js.map +1 -0
- package/lib/LocalIndex.d.ts +159 -0
- package/lib/LocalIndex.d.ts.map +1 -1
- package/lib/LocalIndex.js +519 -0
- package/lib/LocalIndex.js.map +1 -0
- package/lib/LocalIndex.spec.d.ts +2 -0
- package/lib/LocalIndex.spec.js +611 -9
- package/lib/LocalIndex.spec.js.map +1 -1
- package/lib/OpenAIEmbeddings.d.ts +124 -0
- package/lib/OpenAIEmbeddings.d.ts.map +1 -0
- package/lib/OpenAIEmbeddings.js +166 -0
- package/lib/OpenAIEmbeddings.js.map +1 -0
- package/lib/OpenAIEmbeddings.spec.d.ts +2 -0
- package/lib/OpenAIEmbeddings.spec.d.ts.map +1 -0
- package/lib/OpenAIEmbeddings.spec.js +298 -0
- package/lib/OpenAIEmbeddings.spec.js.map +1 -0
- package/lib/TextSplitter.d.ts +21 -0
- package/lib/TextSplitter.d.ts.map +1 -1
- package/lib/TextSplitter.js +500 -0
- package/lib/TextSplitter.js.map +1 -0
- package/lib/TextSplitter.spec.d.ts +2 -0
- package/lib/TextSplitter.spec.d.ts.map +1 -0
- package/lib/TextSplitter.spec.js +337 -0
- package/lib/TextSplitter.spec.js.map +1 -0
- package/lib/TransformersEmbeddings.d.ts +121 -0
- package/lib/TransformersEmbeddings.d.ts.map +1 -0
- package/lib/TransformersEmbeddings.js +176 -0
- package/lib/TransformersEmbeddings.js.map +1 -0
- package/lib/TransformersEmbeddings.spec.d.ts +2 -0
- package/lib/TransformersEmbeddings.spec.d.ts.map +1 -0
- package/lib/TransformersEmbeddings.spec.js +198 -0
- package/lib/TransformersEmbeddings.spec.js.map +1 -0
- package/lib/TransformersTokenizer.d.ts +33 -0
- package/lib/TransformersTokenizer.d.ts.map +1 -0
- package/lib/TransformersTokenizer.js +44 -0
- package/lib/TransformersTokenizer.js.map +1 -0
- package/lib/TransformersTokenizer.spec.d.ts +2 -0
- package/lib/TransformersTokenizer.spec.d.ts.map +1 -0
- package/lib/TransformersTokenizer.spec.js +112 -0
- package/lib/TransformersTokenizer.spec.js.map +1 -0
- package/lib/WebFetcher.d.ts +14 -0
- package/lib/WebFetcher.d.ts.map +1 -0
- package/lib/WebFetcher.js +238 -0
- package/lib/WebFetcher.js.map +1 -0
- package/lib/WebFetcher.spec.d.ts +2 -0
- package/lib/WebFetcher.spec.d.ts.map +1 -0
- package/lib/WebFetcher.spec.js +263 -0
- package/lib/WebFetcher.spec.js.map +1 -0
- package/lib/browser.d.ts +30 -0
- package/lib/browser.d.ts.map +1 -0
- package/lib/browser.js +52 -0
- package/lib/browser.js.map +1 -0
- package/lib/codecs/IndexCodec.d.ts +37 -0
- package/lib/codecs/IndexCodec.d.ts.map +1 -0
- package/lib/codecs/IndexCodec.js +3 -0
- package/lib/codecs/IndexCodec.js.map +1 -0
- package/lib/codecs/JsonCodec.d.ts +19 -0
- package/lib/codecs/JsonCodec.d.ts.map +1 -0
- package/lib/codecs/JsonCodec.js +35 -0
- package/lib/codecs/JsonCodec.js.map +1 -0
- package/lib/codecs/JsonCodec.spec.d.ts +2 -0
- package/lib/codecs/JsonCodec.spec.d.ts.map +1 -0
- package/lib/codecs/JsonCodec.spec.js +66 -0
- package/lib/codecs/JsonCodec.spec.js.map +1 -0
- package/lib/codecs/LocalIndex.protobuf.spec.d.ts +2 -0
- package/lib/codecs/LocalIndex.protobuf.spec.d.ts.map +1 -0
- package/lib/codecs/LocalIndex.protobuf.spec.js +108 -0
- package/lib/codecs/LocalIndex.protobuf.spec.js.map +1 -0
- package/lib/codecs/ProtobufCodec.d.ts +20 -0
- package/lib/codecs/ProtobufCodec.d.ts.map +1 -0
- package/lib/codecs/ProtobufCodec.js +225 -0
- package/lib/codecs/ProtobufCodec.js.map +1 -0
- package/lib/codecs/ProtobufCodec.spec.d.ts +2 -0
- package/lib/codecs/ProtobufCodec.spec.d.ts.map +1 -0
- package/lib/codecs/ProtobufCodec.spec.js +155 -0
- package/lib/codecs/ProtobufCodec.spec.js.map +1 -0
- package/lib/codecs/index.d.ts +5 -0
- package/lib/codecs/index.d.ts.map +1 -0
- package/lib/codecs/index.js +21 -0
- package/lib/codecs/index.js.map +1 -0
- package/lib/codecs/migrateIndex.d.ts +24 -0
- package/lib/codecs/migrateIndex.d.ts.map +1 -0
- package/lib/codecs/migrateIndex.js +119 -0
- package/lib/codecs/migrateIndex.js.map +1 -0
- package/lib/codecs/migrateIndex.spec.d.ts +2 -0
- package/lib/codecs/migrateIndex.spec.d.ts.map +1 -0
- package/lib/codecs/migrateIndex.spec.js +151 -0
- package/lib/codecs/migrateIndex.spec.js.map +1 -0
- package/lib/codecs/schemas/index.proto +34 -0
- package/lib/index.d.ts +20 -0
- package/lib/index.d.ts.map +1 -1
- package/lib/index.js +36 -0
- package/lib/index.js.map +1 -0
- package/lib/internals/Colorize.d.ts +14 -0
- package/lib/internals/Colorize.d.ts.map +1 -0
- package/lib/internals/Colorize.js +69 -0
- package/lib/internals/Colorize.js.map +1 -0
- package/lib/internals/index.d.ts +3 -0
- package/lib/internals/index.d.ts.map +1 -0
- package/lib/internals/index.js +19 -0
- package/lib/internals/index.js.map +1 -0
- package/lib/internals/types.d.ts +43 -0
- package/lib/internals/types.d.ts.map +1 -0
- package/lib/internals/types.js +3 -0
- package/lib/internals/types.js.map +1 -0
- package/lib/server/IndexManager.d.ts +78 -0
- package/lib/server/IndexManager.d.ts.map +1 -0
- package/lib/server/IndexManager.js +259 -0
- package/lib/server/IndexManager.js.map +1 -0
- package/lib/server/VectraServer.d.ts +40 -0
- package/lib/server/VectraServer.d.ts.map +1 -0
- package/lib/server/VectraServer.js +151 -0
- package/lib/server/VectraServer.js.map +1 -0
- package/lib/server/VectraServer.spec.d.ts +2 -0
- package/lib/server/VectraServer.spec.d.ts.map +1 -0
- package/lib/server/VectraServer.spec.js +322 -0
- package/lib/server/VectraServer.spec.js.map +1 -0
- package/lib/server/handlers/documentHandlers.d.ts +15 -0
- package/lib/server/handlers/documentHandlers.d.ts.map +1 -0
- package/lib/server/handlers/documentHandlers.js +95 -0
- package/lib/server/handlers/documentHandlers.js.map +1 -0
- package/lib/server/handlers/helpers.d.ts +23 -0
- package/lib/server/handlers/helpers.d.ts.map +1 -0
- package/lib/server/handlers/helpers.js +138 -0
- package/lib/server/handlers/helpers.js.map +1 -0
- package/lib/server/handlers/index.d.ts +8 -0
- package/lib/server/handlers/index.d.ts.map +1 -0
- package/lib/server/handlers/index.js +22 -0
- package/lib/server/handlers/index.js.map +1 -0
- package/lib/server/handlers/indexHandlers.d.ts +14 -0
- package/lib/server/handlers/indexHandlers.d.ts.map +1 -0
- package/lib/server/handlers/indexHandlers.js +85 -0
- package/lib/server/handlers/indexHandlers.js.map +1 -0
- package/lib/server/handlers/itemHandlers.d.ts +34 -0
- package/lib/server/handlers/itemHandlers.d.ts.map +1 -0
- package/lib/server/handlers/itemHandlers.js +166 -0
- package/lib/server/handlers/itemHandlers.js.map +1 -0
- package/lib/server/handlers/lifecycleHandlers.d.ts +11 -0
- package/lib/server/handlers/lifecycleHandlers.d.ts.map +1 -0
- package/lib/server/handlers/lifecycleHandlers.js +31 -0
- package/lib/server/handlers/lifecycleHandlers.js.map +1 -0
- package/lib/server/handlers/queryHandlers.d.ts +27 -0
- package/lib/server/handlers/queryHandlers.d.ts.map +1 -0
- package/lib/server/handlers/queryHandlers.js +135 -0
- package/lib/server/handlers/queryHandlers.js.map +1 -0
- package/lib/server/handlers/statsHandlers.d.ts +17 -0
- package/lib/server/handlers/statsHandlers.d.ts.map +1 -0
- package/lib/server/handlers/statsHandlers.js +81 -0
- package/lib/server/handlers/statsHandlers.js.map +1 -0
- package/lib/server/index.d.ts +4 -0
- package/lib/server/index.d.ts.map +1 -0
- package/lib/server/index.js +23 -0
- package/lib/server/index.js.map +1 -0
- package/lib/storage/FileStorage.d.ts +92 -0
- package/lib/storage/FileStorage.d.ts.map +1 -0
- package/lib/storage/FileStorage.js +3 -0
- package/lib/storage/FileStorage.js.map +1 -0
- package/lib/storage/FileStorageUtilities.d.ts +36 -0
- package/lib/storage/FileStorageUtilities.d.ts.map +1 -0
- package/lib/storage/FileStorageUtilities.js +91 -0
- package/lib/storage/FileStorageUtilities.js.map +1 -0
- package/lib/storage/FileStorageUtilities.spec.d.ts +2 -0
- package/lib/storage/FileStorageUtilities.spec.d.ts.map +1 -0
- package/lib/storage/FileStorageUtilities.spec.js +98 -0
- package/lib/storage/FileStorageUtilities.spec.js.map +1 -0
- package/lib/storage/FileType.d.ts +29 -0
- package/lib/storage/FileType.d.ts.map +1 -0
- package/lib/storage/FileType.js +38 -0
- package/lib/storage/FileType.js.map +1 -0
- package/lib/storage/IndexedDBStorage.d.ts +47 -0
- package/lib/storage/IndexedDBStorage.d.ts.map +1 -0
- package/lib/storage/IndexedDBStorage.js +347 -0
- package/lib/storage/IndexedDBStorage.js.map +1 -0
- package/lib/storage/LocalFileStorage.browser.d.ts +19 -0
- package/lib/storage/LocalFileStorage.browser.d.ts.map +1 -0
- package/lib/storage/LocalFileStorage.browser.js +43 -0
- package/lib/storage/LocalFileStorage.browser.js.map +1 -0
- package/lib/storage/LocalFileStorage.d.ts +23 -0
- package/lib/storage/LocalFileStorage.d.ts.map +1 -0
- package/lib/storage/LocalFileStorage.js +152 -0
- package/lib/storage/LocalFileStorage.js.map +1 -0
- package/lib/storage/LocalFileStorage.spec.d.ts +2 -0
- package/lib/storage/LocalFileStorage.spec.d.ts.map +1 -0
- package/lib/storage/LocalFileStorage.spec.js +249 -0
- package/lib/storage/LocalFileStorage.spec.js.map +1 -0
- package/lib/storage/VirtualFileStorage.d.ts +18 -0
- package/lib/storage/VirtualFileStorage.d.ts.map +1 -0
- package/lib/storage/VirtualFileStorage.js +178 -0
- package/lib/storage/VirtualFileStorage.js.map +1 -0
- package/lib/storage/VirtualFileStorage.spec.d.ts +2 -0
- package/lib/storage/VirtualFileStorage.spec.d.ts.map +1 -0
- package/lib/storage/VirtualFileStorage.spec.js +302 -0
- package/lib/storage/VirtualFileStorage.spec.js.map +1 -0
- package/lib/storage/index.d.ts +6 -0
- package/lib/storage/index.d.ts.map +1 -0
- package/lib/storage/index.js +22 -0
- package/lib/storage/index.js.map +1 -0
- package/lib/templates/templates/csharp/README.md +48 -0
- package/lib/templates/templates/csharp/VectraClient.cs +234 -0
- package/lib/templates/templates/go/README.md +71 -0
- package/lib/templates/templates/go/vectra_client.go +322 -0
- package/lib/templates/templates/java/README.md +81 -0
- package/lib/templates/templates/java/VectraClient.java +232 -0
- package/lib/templates/templates/python/README.md +37 -0
- package/lib/templates/templates/python/vectra_client.py +279 -0
- package/lib/templates/templates/rust/Cargo.toml +14 -0
- package/lib/templates/templates/rust/README.md +39 -0
- package/lib/templates/templates/rust/build.rs +4 -0
- package/lib/templates/templates/rust/lib.rs +284 -0
- package/lib/templates/templates/typescript/README.md +96 -0
- package/lib/templates/templates/typescript/VectraClient.ts +374 -0
- package/lib/templates/typescript/VectraClient.d.ts +114 -0
- package/lib/templates/typescript/VectraClient.d.ts.map +1 -0
- package/lib/templates/typescript/VectraClient.js +328 -0
- package/lib/templates/typescript/VectraClient.js.map +1 -0
- package/lib/types.d.ts +153 -0
- package/lib/types.d.ts.map +1 -0
- package/lib/types.js +3 -0
- package/lib/types.js.map +1 -0
- package/lib/utils/index.d.ts +2 -0
- package/lib/utils/index.d.ts.map +1 -0
- package/lib/utils/index.js +18 -0
- package/lib/utils/index.js.map +1 -0
- package/lib/utils/pathUtils.d.ts +40 -0
- package/lib/utils/pathUtils.d.ts.map +1 -0
- package/lib/utils/pathUtils.js +98 -0
- package/lib/utils/pathUtils.js.map +1 -0
- package/lib/vectra-cli.d.ts +2 -0
- package/lib/vectra-cli.d.ts.map +1 -1
- package/lib/vectra-cli.generate.spec.d.ts +2 -0
- package/lib/vectra-cli.generate.spec.d.ts.map +1 -0
- package/lib/vectra-cli.generate.spec.js +112 -0
- package/lib/vectra-cli.generate.spec.js.map +1 -0
- package/lib/vectra-cli.js +760 -0
- package/lib/vectra-cli.js.map +1 -0
- package/lib/vectra-cli.spec.d.ts +1 -0
- package/lib/vectra-cli.spec.d.ts.map +1 -0
- package/lib/vectra-cli.spec.js +2 -0
- package/lib/vectra-cli.spec.js.map +1 -0
- package/package.json +91 -16
- package/proto/vectra_service.proto +276 -0
- package/src/BrowserWebFetcher.ts +345 -0
- package/src/FileFetcher.spec.ts +234 -0
- package/src/FileFetcher.ts +37 -25
- package/src/FolderWatcher.spec.ts +288 -0
- package/src/FolderWatcher.ts +304 -0
- package/src/GPT3Tokenizer.spec.ts +50 -0
- package/src/ItemSelector.spec.ts +252 -0
- package/src/ItemSelector.ts +163 -150
- package/src/LocalDocument.spec.ts +211 -0
- package/src/LocalDocument.ts +88 -94
- package/src/LocalDocumentIndex.spec.ts +481 -0
- package/src/LocalDocumentIndex.ts +39 -40
- package/src/LocalDocumentResult.spec.ts +373 -0
- package/src/LocalDocumentResult.ts +489 -319
- package/src/LocalEmbeddings.spec.ts +138 -0
- package/src/LocalEmbeddings.ts +120 -0
- package/src/LocalIndex.spec.ts +808 -66
- package/src/LocalIndex.ts +479 -429
- package/src/OpenAIEmbeddings.spec.ts +354 -0
- package/src/OpenAIEmbeddings.ts +26 -27
- package/src/TextSplitter.spec.ts +342 -0
- package/src/TextSplitter.ts +517 -532
- package/src/TransformersEmbeddings.spec.ts +188 -0
- package/src/TransformersEmbeddings.ts +232 -0
- package/src/TransformersTokenizer.spec.ts +143 -0
- package/src/TransformersTokenizer.ts +45 -0
- package/src/WebFetcher.spec.ts +288 -0
- package/src/WebFetcher.ts +184 -186
- package/src/browser.ts +69 -0
- package/src/codecs/IndexCodec.ts +40 -0
- package/src/codecs/JsonCodec.spec.ts +70 -0
- package/src/codecs/JsonCodec.ts +37 -0
- package/src/codecs/LocalIndex.protobuf.spec.ts +115 -0
- package/src/codecs/ProtobufCodec.spec.ts +166 -0
- package/src/codecs/ProtobufCodec.ts +193 -0
- package/src/codecs/index.ts +4 -0
- package/src/codecs/migrateIndex.spec.ts +176 -0
- package/src/codecs/migrateIndex.ts +125 -0
- package/src/codecs/schemas/index.proto +34 -0
- package/src/index.ts +9 -1
- package/src/internals/Colorize.ts +19 -16
- package/src/server/IndexManager.ts +243 -0
- package/src/server/VectraServer.spec.ts +303 -0
- package/src/server/VectraServer.ts +156 -0
- package/src/server/handlers/documentHandlers.ts +59 -0
- package/src/server/handlers/helpers.ts +93 -0
- package/src/server/handlers/index.ts +7 -0
- package/src/server/handlers/indexHandlers.ts +44 -0
- package/src/server/handlers/itemHandlers.ts +140 -0
- package/src/server/handlers/lifecycleHandlers.ts +26 -0
- package/src/server/handlers/queryHandlers.ts +96 -0
- package/src/server/handlers/statsHandlers.ts +38 -0
- package/src/server/index.ts +3 -0
- package/src/storage/FileStorage.ts +105 -0
- package/src/storage/FileStorageUtilities.spec.ts +106 -0
- package/src/storage/FileStorageUtilities.ts +77 -0
- package/src/storage/FileType.ts +61 -0
- package/src/storage/IndexedDBStorage.ts +365 -0
- package/src/storage/LocalFileStorage.browser.ts +52 -0
- package/src/storage/LocalFileStorage.spec.ts +292 -0
- package/src/storage/LocalFileStorage.ts +98 -0
- package/src/storage/VirtualFileStorage.spec.ts +307 -0
- package/src/storage/VirtualFileStorage.ts +169 -0
- package/src/storage/index.ts +5 -0
- package/src/templates/csharp/README.md +48 -0
- package/src/templates/csharp/VectraClient.cs +234 -0
- package/src/templates/go/README.md +71 -0
- package/src/templates/go/vectra_client.go +322 -0
- package/src/templates/java/README.md +81 -0
- package/src/templates/java/VectraClient.java +232 -0
- package/src/templates/python/README.md +37 -0
- package/src/templates/python/vectra_client.py +279 -0
- package/src/templates/rust/Cargo.toml +14 -0
- package/src/templates/rust/README.md +39 -0
- package/src/templates/rust/build.rs +4 -0
- package/src/templates/rust/lib.rs +284 -0
- package/src/templates/typescript/README.md +96 -0
- package/src/templates/typescript/VectraClient.ts +374 -0
- package/src/types.ts +131 -123
- package/src/utils/index.ts +1 -0
- package/src/utils/pathUtils.ts +106 -0
- package/src/vectra-cli.generate.spec.ts +72 -0
- package/src/vectra-cli.spec.ts +0 -0
- package/src/vectra-cli.ts +687 -246
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
import { strict as assert } from 'node:assert';
|
|
2
|
+
import { describe, it, beforeEach, afterEach } from 'mocha';
|
|
3
|
+
import sinon from 'sinon';
|
|
4
|
+
import { EmbeddingsModel } from './types';
|
|
5
|
+
import * as transformersModule from '@huggingface/transformers';
|
|
6
|
+
|
|
7
|
+
/**
 * Unit tests for TransformersEmbeddings.
 *
 * The `pipeline` export of @huggingface/transformers is replaced with a sinon stub that
 * resolves to a fake feature-extraction pipeline, so the tests exercise the wrapper logic
 * only. The fake returns the fixed vector [0.1, 0.2, 0.3, 0.4] for every input.
 */
describe('TransformersEmbeddings', () => {
    let TransformersEmbeddings: any;
    let mockExtractor: sinon.SinonStub;
    let mockTokenizer: any;
    let sandbox: sinon.SinonSandbox;
    let pipelineStub: sinon.SinonStub;

    beforeEach(async () => {
        sandbox = sinon.createSandbox();

        // Fake tokenizer: `__call__` backs the callable form, `decode` is a plain stub.
        mockTokenizer = {
            __call__: sandbox.stub().returns({
                input_ids: { data: BigInt64Array.from([BigInt(1), BigInt(2), BigInt(3)]) }
            }),
            decode: sandbox.stub().returns('decoded text')
        };
        // Wrap it in a function so it can be invoked directly while still exposing the stubs above.
        const callableTokenizer = Object.assign(
            (...args: any[]) => mockTokenizer.__call__(...args),
            mockTokenizer
        );

        // Fake feature-extraction pipeline: one 4-dimensional row per input, flattened into `data`.
        mockExtractor = sandbox.stub().callsFake(async (inputs: string | string[]) => {
            const inputArray = Array.isArray(inputs) ? inputs : [inputs];
            const batchSize = inputArray.length;
            const embeddingDim = 4;

            const data = new Float32Array(batchSize * embeddingDim);
            for (let i = 0; i < batchSize; i++) {
                data[i * embeddingDim] = 0.1;
                data[i * embeddingDim + 1] = 0.2;
                data[i * embeddingDim + 2] = 0.3;
                data[i * embeddingDim + 3] = 0.4;
            }

            return {
                data: data,
                dims: [batchSize, embeddingDim]
            };
        });

        // Attach tokenizer to the mock extractor so pipeline result has .tokenizer
        (mockExtractor as any).tokenizer = callableTokenizer;

        // Stub the pipeline function from @huggingface/transformers
        pipelineStub = sandbox.stub(transformersModule, 'pipeline' as any).resolves(mockExtractor);

        // Load the class under test once the stub is installed. A repeated dynamic import is
        // served from the module cache, so this does not re-evaluate the module.
        const mod = await import('./TransformersEmbeddings');
        TransformersEmbeddings = mod.TransformersEmbeddings;
    });

    afterEach(() => {
        // Restores the real `pipeline` export and drops all stubs created in beforeEach.
        sandbox.restore();
    });

    describe('create()', () => {
        it('creates instance with default options', async () => {
            const embeddings = await TransformersEmbeddings.create();

            assert.equal(embeddings.maxTokens, 512, 'default maxTokens should be 512');
            assert.equal(embeddings.model, 'Xenova/all-MiniLM-L6-v2', 'default model should be all-MiniLM-L6-v2');

            // Verify pipeline was called with correct arguments
            assert.ok(pipelineStub.calledOnce, 'pipeline should be called once');
            assert.equal(pipelineStub.firstCall.args[0], 'feature-extraction');
            assert.equal(pipelineStub.firstCall.args[1], 'Xenova/all-MiniLM-L6-v2');
        });

        it('creates instance with custom options', async () => {
            const embeddings = await TransformersEmbeddings.create({
                model: 'Xenova/bge-small-en-v1.5',
                maxTokens: 256,
                device: 'cpu',
                normalize: false,
                pooling: 'cls'
            });

            assert.equal(embeddings.maxTokens, 256);
            assert.equal(embeddings.model, 'Xenova/bge-small-en-v1.5');

            // The custom settings must also reach the pipeline loader, not just the instance.
            assert.ok(pipelineStub.calledOnce, 'pipeline should be called once');
            const [task, model, pipelineOptions] = pipelineStub.firstCall.args;
            assert.equal(task, 'feature-extraction');
            assert.equal(model, 'Xenova/bge-small-en-v1.5', 'custom model should be passed to pipeline');
            assert.equal(pipelineOptions.device, 'cpu', 'custom device should be passed to pipeline');
            assert.equal(pipelineOptions.dtype, 'fp32', 'dtype should fall back to its default');
        });

        it('implements EmbeddingsModel interface', async () => {
            const embeddings: EmbeddingsModel = await TransformersEmbeddings.create();

            assert.equal(typeof embeddings.maxTokens, 'number');
            assert.equal(typeof embeddings.createEmbeddings, 'function');
        });
    });

    describe('createEmbeddings()', () => {
        it('generates embeddings for single string', async () => {
            const embeddings = await TransformersEmbeddings.create();
            const result = await embeddings.createEmbeddings('hello world');

            assert.equal(result.status, 'success');
            assert.ok(result.output, 'output should be defined');
            assert.equal(result.output!.length, 1, 'should have one embedding');
            assert.equal(result.output![0].length, 4, 'embedding should have 4 dimensions');
            // Compare with a tolerance: the fake stores values in a Float32Array.
            const expected = [0.1, 0.2, 0.3, 0.4];
            result.output![0].forEach((val: number, i: number) => {
                assert.ok(Math.abs(val - expected[i]) < 0.001, `value ${val} should be close to ${expected[i]}`);
            });
            assert.equal(result.model, 'Xenova/all-MiniLM-L6-v2');
        });

        it('generates embeddings for string array', async () => {
            const embeddings = await TransformersEmbeddings.create();
            const result = await embeddings.createEmbeddings(['hello', 'world']);

            assert.equal(result.status, 'success');
            assert.ok(result.output, 'output should be defined');
            assert.equal(result.output!.length, 2, 'should have two embeddings');

            // The whole array is expected to go to the extractor in a single call.
            assert.equal(mockExtractor.callCount, 1);
            assert.deepEqual(mockExtractor.firstCall.args[0], ['hello', 'world']);
        });

        it('passes pooling and normalize options to extractor', async () => {
            const embeddings = await TransformersEmbeddings.create({
                pooling: 'cls',
                normalize: false
            });
            await embeddings.createEmbeddings('test');

            assert.ok(mockExtractor.calledOnce);
            const options = mockExtractor.firstCall.args[1];
            assert.equal(options.pooling, 'cls');
            assert.equal(options.normalize, false);
        });

        it('returns error status on failure', async () => {
            // Overrides the default fake so the extractor call rejects.
            mockExtractor.rejects(new Error('Model inference failed'));

            const embeddings = await TransformersEmbeddings.create();
            const result = await embeddings.createEmbeddings('test');

            assert.equal(result.status, 'error');
            assert.ok(result.message?.includes('Model inference failed'));
        });

        it('handles empty string input', async () => {
            const embeddings = await TransformersEmbeddings.create();
            const result = await embeddings.createEmbeddings('');

            assert.equal(result.status, 'success');
            assert.ok(result.output);
            assert.equal(result.output!.length, 1);
        });

        it('handles empty array input', async () => {
            const embeddings = await TransformersEmbeddings.create();
            const result = await embeddings.createEmbeddings([]);

            assert.equal(result.status, 'success');
            assert.ok(result.output);
            assert.equal(result.output!.length, 0);
        });
    });

    describe('getTokenizer()', () => {
        it('returns a TransformersTokenizer instance', async () => {
            const embeddings = await TransformersEmbeddings.create();
            const tokenizer = embeddings.getTokenizer();

            assert.ok(tokenizer, 'tokenizer should be defined');
            assert.equal(typeof tokenizer.encode, 'function');
            assert.equal(typeof tokenizer.decode, 'function');
        });

        it('returns consistent tokenizer across calls', async () => {
            const embeddings = await TransformersEmbeddings.create();
            const tokenizer1 = embeddings.getTokenizer();
            const tokenizer2 = embeddings.getTokenizer();

            assert.ok(tokenizer1);
            assert.ok(tokenizer2);

            // Actually compare the two results. This holds whether getTokenizer() caches one
            // instance or builds a new wrapper on each call.
            assert.strictEqual(
                Object.getPrototypeOf(tokenizer1),
                Object.getPrototypeOf(tokenizer2),
                'both calls should return the same kind of tokenizer'
            );
            assert.equal(typeof tokenizer2.encode, 'function');
            assert.equal(typeof tokenizer2.decode, 'function');
        });
    });
});
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
import { EmbeddingsModel, EmbeddingsResponse } from "./types";
|
|
2
|
+
import { TransformersTokenizer } from "./TransformersTokenizer";
|
|
3
|
+
import { FeatureExtractionPipeline, PreTrainedTokenizer } from "@huggingface/transformers";
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
const DEFAULT_MODEL = 'Xenova/all-MiniLM-L6-v2';
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Type definition for the Transformers.js library.
|
|
10
|
+
* Used for dynamic import and type safety.
|
|
11
|
+
*/
|
|
12
|
+
type TransformersLibrary = typeof import('@huggingface/transformers');
|
|
13
|
+
|
|
14
|
+
/**
 * Settings accepted by `TransformersEmbeddings.create()`.
 * Every field is optional; omitted fields fall back to the documented default.
 */
export interface TransformersEmbeddingsOptions {
    /**
     * Name or path of the embedding model to load.
     * @remarks
     * Frequently used models:
     * - 'Xenova/all-MiniLM-L6-v2' (384 dimensions, fast, good quality)
     * - 'Xenova/bge-small-en-v1.5' (384 dimensions, better quality)
     * - 'Xenova/bge-base-en-v1.5' (768 dimensions, best quality)
     * @default 'Xenova/all-MiniLM-L6-v2'
     */
    model?: string;

    /**
     * Upper bound on the number of tokens sent to the embedding model.
     * @remarks
     * LocalDocumentIndex uses this value when batching.
     * 512 tokens is what most small models support.
     * @default 512
     */
    maxTokens?: number;

    /**
     * Where inference runs.
     * @remarks
     * - 'auto': pick the best available device automatically
     * - 'gpu': GPU (WebGPU in the browser, CUDA in Node.js if available)
     * - 'cpu': CPU (most compatible)
     * - 'wasm': WebAssembly
     * @default 'auto'
     */
    device?: 'auto' | 'gpu' | 'cpu' | 'wasm';

    /**
     * Numeric format of the model weights.
     * @remarks
     * - 'fp32': full precision (best quality, largest size)
     * - 'fp16': half precision (good quality, smaller)
     * - 'q8': 8-bit quantization (good quality, smaller)
     * - 'q4': 4-bit quantization (fastest, smallest, lower quality)
     * @default 'fp32'
     */
    dtype?: 'fp32' | 'fp16' | 'q8' | 'q4';

    /**
     * When true, embeddings are normalized to unit length.
     * @default true
     */
    normalize?: boolean;

    /**
     * How per-token embeddings are pooled.
     * @remarks
     * - 'mean': mean pooling (recommended)
     * - 'cls': use the [CLS] token embedding
     * @default 'mean'
     */
    pooling?: 'mean' | 'cls';

    /**
     * Invoked with progress reports while the model is downloaded/loaded.
     */
    progressCallback?: (progress: { status: string; progress?: number; file?: string }) => void;
}
|
|
80
|
+
|
|
81
|
+
/**
|
|
82
|
+
* An embeddings model using Transformers.js for local, offline inference.
|
|
83
|
+
* @remarks
|
|
84
|
+
* Requires @huggingface/transformers as a peer dependency.
|
|
85
|
+
* Use the static `create()` method to instantiate.
|
|
86
|
+
*
|
|
87
|
+
* @example
|
|
88
|
+
* ```typescript
|
|
89
|
+
* const embeddings = await TransformersEmbeddings.create({
|
|
90
|
+
* model: 'Xenova/all-MiniLM-L6-v2'
|
|
91
|
+
* });
|
|
92
|
+
*
|
|
93
|
+
* const index = new LocalDocumentIndex({
|
|
94
|
+
* folderPath: 'my-index',
|
|
95
|
+
* embeddings: embeddings,
|
|
96
|
+
* tokenizer: embeddings.getTokenizer()
|
|
97
|
+
* });
|
|
98
|
+
* ```
|
|
99
|
+
*/
|
|
100
|
+
export class TransformersEmbeddings implements EmbeddingsModel {
|
|
101
|
+
private readonly _extractor: FeatureExtractionPipeline;
|
|
102
|
+
private readonly _tokenizer: PreTrainedTokenizer;
|
|
103
|
+
private readonly _options: Required<Omit<TransformersEmbeddingsOptions, 'progressCallback'>> & Pick<TransformersEmbeddingsOptions, 'progressCallback'>;
|
|
104
|
+
|
|
105
|
+
public readonly maxTokens: number;
|
|
106
|
+
|
|
107
|
+
/**
 * Stores an already-loaded pipeline, its tokenizer and the fully resolved options.
 * Declared private: obtain instances through TransformersEmbeddings.create().
 */
private constructor(
    extractor: FeatureExtractionPipeline,
    tokenizer: PreTrainedTokenizer,
    options: Required<Omit<TransformersEmbeddingsOptions, 'progressCallback'>> & Pick<TransformersEmbeddingsOptions, 'progressCallback'>
) {
    // Public token limit mirrors the resolved option value.
    this.maxTokens = options.maxTokens;
    this._options = options;
    this._tokenizer = tokenizer;
    this._extractor = extractor;
}
|
|
120
|
+
|
|
121
|
+
/**
 * Creates a new TransformersEmbeddings instance.
 *
 * Loads @huggingface/transformers through a dynamic import, builds a
 * 'feature-extraction' pipeline for the selected model and wraps it together
 * with the pipeline's tokenizer.
 *
 * @param options Configuration options. Omitted fields default to model
 * 'Xenova/all-MiniLM-L6-v2', maxTokens 512, device 'auto', dtype 'fp32',
 * normalize true and pooling 'mean'.
 * @returns Promise resolving to initialized TransformersEmbeddings instance.
 * @throws Error if @huggingface/transformers cannot be imported. The underlying
 * import failure is attached to the thrown error as `cause`.
 */
public static async create(options?: TransformersEmbeddingsOptions): Promise<TransformersEmbeddings> {
    // Dynamically import to allow optional dependency
    let transformers: TransformersLibrary;

    try {
        transformers = await import('@huggingface/transformers');
    } catch (e) {
        const error = new Error(
            'TransformersEmbeddings requires @huggingface/transformers. ' +
            'Install it with: npm install @huggingface/transformers'
        );
        // The import can fail for reasons other than a missing package, so keep the
        // original failure reachable instead of discarding it. Defined as a
        // non-enumerable property, like the built-in `cause`.
        Object.defineProperty(error, 'cause', {
            value: e,
            writable: true,
            enumerable: false,
            configurable: true
        });
        throw error;
    }

    const { pipeline } = transformers;

    // Apply defaults
    const opts = {
        model: options?.model ?? DEFAULT_MODEL,
        maxTokens: options?.maxTokens ?? 512,
        device: options?.device ?? 'auto',
        dtype: options?.dtype ?? 'fp32',
        normalize: options?.normalize ?? true,
        pooling: options?.pooling ?? 'mean',
        progressCallback: options?.progressCallback
    };

    // Build pipeline options
    const pipelineOptions: any = {
        device: opts.device,
        dtype: opts.dtype
    };

    // Only set the callback key when a callback was supplied; note the snake_case key name.
    if (opts.progressCallback) {
        pipelineOptions.progress_callback = opts.progressCallback;
    }

    // Load the feature extraction pipeline
    const extractor = await pipeline(
        'feature-extraction',
        opts.model,
        pipelineOptions
    );

    // Reuse the tokenizer that the pipeline already carries (no second load), for use with TextSplitter
    const tokenizer = extractor.tokenizer;

    return new TransformersEmbeddings(extractor, tokenizer, opts);
}
|
|
175
|
+
|
|
176
|
+
/**
|
|
177
|
+
* Returns a tokenizer that uses the same tokenization as this embedding model.
|
|
178
|
+
* @remarks
|
|
179
|
+
* Use this tokenizer with LocalDocumentIndex to ensure text chunking
|
|
180
|
+
* aligns with the embedding model's token boundaries.
|
|
181
|
+
* @returns TransformersTokenizer instance.
|
|
182
|
+
*/
|
|
183
|
+
public getTokenizer(): TransformersTokenizer {
|
|
184
|
+
return new TransformersTokenizer(this._tokenizer);
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
/**
|
|
188
|
+
* Creates embeddings for the given inputs.
|
|
189
|
+
* @param inputs Text inputs to create embeddings for.
|
|
190
|
+
* @returns EmbeddingsResponse with status and generated embeddings.
|
|
191
|
+
*/
|
|
192
|
+
public async createEmbeddings(inputs: string | string[]): Promise<EmbeddingsResponse> {
|
|
193
|
+
try {
|
|
194
|
+
const inputArray = Array.isArray(inputs) ? inputs : [inputs];
|
|
195
|
+
|
|
196
|
+
// Process all inputs in a single batch
|
|
197
|
+
const output = await this._extractor(inputArray, {
|
|
198
|
+
pooling: this._options.pooling,
|
|
199
|
+
normalize: this._options.normalize
|
|
200
|
+
});
|
|
201
|
+
|
|
202
|
+
const [batchSize, embeddingDim] = output.dims;
|
|
203
|
+
const data = output.data as Float32Array;
|
|
204
|
+
|
|
205
|
+
// Slice the flat array into individual embeddings
|
|
206
|
+
const embeddings: number[][] = [];
|
|
207
|
+
for (let i = 0; i < batchSize; i++) {
|
|
208
|
+
const start = i * embeddingDim;
|
|
209
|
+
const end = start + embeddingDim;
|
|
210
|
+
embeddings.push(Array.from(data.slice(start, end)));
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
return {
|
|
214
|
+
status: 'success',
|
|
215
|
+
output: embeddings,
|
|
216
|
+
model: this._options.model
|
|
217
|
+
};
|
|
218
|
+
} catch (error: unknown) {
|
|
219
|
+
return {
|
|
220
|
+
status: 'error',
|
|
221
|
+
message: `Error generating embeddings: ${(error as Error).message}`
|
|
222
|
+
};
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
/**
|
|
227
|
+
* Returns the model name being used.
|
|
228
|
+
*/
|
|
229
|
+
public get model(): string {
|
|
230
|
+
return this._options.model;
|
|
231
|
+
}
|
|
232
|
+
}
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
import { strict as assert } from 'node:assert';
|
|
2
|
+
import { describe, it } from 'mocha';
|
|
3
|
+
import { TransformersTokenizer } from './TransformersTokenizer';
|
|
4
|
+
|
|
5
|
+
describe('TransformersTokenizer', () => {
    /**
     * Converts a plain `{ __call__, decode }` mock into the shape the wrapper
     * consumes: a function that can be invoked directly and that also carries
     * a `decode` method.
     */
    function asCallable(mock: {
        __call__: (...args: any[]) => any;
        decode: (...args: any[]) => string;
    }): any {
        const invoke = (...args: any[]) => mock.__call__(...args);
        return Object.assign(invoke, { decode: mock.decode });
    }

    /**
     * Builds a tiny whitespace tokenizer with a five-entry vocabulary.
     * Unknown words encode to id 100; unknown ids decode to '[UNK]'.
     * Ids 1 and 2 ([CLS]/[SEP]) are dropped on decode when
     * `skip_special_tokens` is set.
     */
    function createMockTokenizer() {
        const entries: [string, number][] = [
            ['hello', 101],
            ['world', 102],
            ['test', 103],
            ['[CLS]', 1],
            ['[SEP]', 2]
        ];
        const wordToId = new Map<string, number>(entries);
        const idToWord = new Map<number, string>();
        for (const [word, id] of wordToId) {
            idToWord.set(id, word);
        }

        return {
            // Mimics the callable tokenizer behavior
            __call__: (text: string) => {
                const ids: number[] = [];
                for (const word of text.toLowerCase().split(/\s+/)) {
                    if (word) {
                        ids.push(wordToId.get(word) ?? 100);
                    }
                }
                return {
                    input_ids: {
                        data: BigInt64Array.from(ids.map(id => BigInt(id)))
                    }
                };
            },
            decode: (tokens: number[], options?: { skip_special_tokens?: boolean }) => {
                const parts: string[] = [];
                for (const token of tokens) {
                    const isSpecial = token === 1 || token === 2;
                    if (options?.skip_special_tokens && isSpecial) {
                        continue;
                    }
                    parts.push(idToWord.get(token) ?? '[UNK]');
                }
                return parts.join(' ');
            }
        };
    }

    it('encodes text to token array using callable tokenizer', () => {
        const tokenizer = new TransformersTokenizer(asCallable(createMockTokenizer()));

        const tokens = tokenizer.encode('hello world');

        assert.ok(Array.isArray(tokens), 'encode should return an array');
        assert.equal(tokens.length, 2, 'should have 2 tokens');
        assert.deepEqual(tokens, [101, 102], 'tokens should match expected values');
    });

    it('handles BigInt64Array conversion correctly', () => {
        const tokenizer = new TransformersTokenizer(asCallable({
            __call__: () => ({
                input_ids: {
                    data: BigInt64Array.from([BigInt(1), BigInt(2), BigInt(3)])
                }
            }),
            decode: () => 'decoded'
        }));

        const tokens = tokenizer.encode('any text');

        assert.deepEqual(tokens, [1, 2, 3], 'should convert BigInt to number');
        for (const t of tokens) {
            assert.equal(typeof t, 'number', 'each token should be a number');
        }
    });

    it('decodes tokens back to text', () => {
        // The mock only returns the clean text when the wrapper asks for
        // special tokens to be skipped.
        const tokenizer = new TransformersTokenizer(asCallable({
            __call__: () => ({ input_ids: { data: BigInt64Array.from([]) } }),
            decode: (tokens: number[], opts?: { skip_special_tokens?: boolean }) =>
                opts?.skip_special_tokens ? 'hello world' : '[CLS] hello world [SEP]'
        }));

        const text = tokenizer.decode([1, 101, 102, 2]);

        assert.equal(text, 'hello world', 'should decode with skip_special_tokens=true');
    });

    it('handles empty input', () => {
        const tokenizer = new TransformersTokenizer(asCallable({
            __call__: () => ({
                input_ids: { data: BigInt64Array.from([]) }
            }),
            decode: () => ''
        }));

        const tokens = tokenizer.encode('');
        assert.deepEqual(tokens, [], 'empty input should return empty array');

        const text = tokenizer.decode([]);
        assert.equal(text, '', 'empty tokens should return empty string');
    });

    it('returns consistent results for same input', () => {
        let callCount = 0;
        const tokenizer = new TransformersTokenizer(asCallable({
            __call__: () => {
                callCount++;
                return {
                    input_ids: { data: BigInt64Array.from([BigInt(101), BigInt(102)]) }
                };
            },
            decode: () => 'hello world'
        }));

        const tokens1 = tokenizer.encode('hello world');
        const tokens2 = tokenizer.encode('hello world');

        assert.deepEqual(tokens1, tokens2, 'encode should be deterministic');
        assert.equal(callCount, 2, 'should call underlying tokenizer each time');
    });
});
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { PreTrainedTokenizer } from "@huggingface/transformers";
|
|
2
|
+
import { Tokenizer } from "./types";
|
|
3
|
+
|
|
4
|
+
/**
 * Adapts a Transformers.js tokenizer to the `Tokenizer` interface.
 * @remarks
 * The wrapper delegates to the tokenizer it is given, so text splitting and
 * embedding generation agree as long as that tokenizer is the one used by
 * the embedding model.
 *
 * Obtain an instance via TransformersEmbeddings.getTokenizer().
 */
export class TransformersTokenizer implements Tokenizer {
    private readonly _tokenizer: PreTrainedTokenizer;

    /**
     * Wraps an existing Transformers.js tokenizer.
     * @param tokenizer The underlying Transformers.js tokenizer to delegate to.
     * @remarks
     * Typically created via TransformersEmbeddings.getTokenizer().
     */
    public constructor(tokenizer: PreTrainedTokenizer) {
        this._tokenizer = tokenizer;
    }

    /**
     * Converts text into numeric token IDs.
     * @param text The text to encode.
     * @returns Token IDs as plain numbers.
     */
    public encode(text: string): number[] {
        const result = this._tokenizer(text);

        // Locate the ID sequence: prefer `input_ids.data`, then `input_ids`
        // itself, and finally the raw result.
        const rawIds = result.input_ids?.data ?? result.input_ids ?? result;

        // IDs may be BigInt values (e.g. from a BigInt64Array); coerce each
        // one to a regular number.
        return Array.from(rawIds, (id: unknown): number => Number(id));
    }

    /**
     * Converts token IDs back into text, omitting special tokens.
     * @param tokens Array of token IDs.
     * @returns Decoded text string.
     */
    public decode(tokens: number[]): string {
        const decodeOptions = { skip_special_tokens: true };
        return this._tokenizer.decode(tokens, decodeOptions);
    }
}
|