vectra 0.12.3 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +92 -100
- package/lib/BrowserWebFetcher.d.ts +75 -0
- package/lib/BrowserWebFetcher.d.ts.map +1 -0
- package/lib/BrowserWebFetcher.js +290 -0
- package/lib/BrowserWebFetcher.js.map +1 -0
- package/lib/FileFetcher.d.ts.map +1 -1
- package/lib/FileFetcher.js +25 -15
- package/lib/FileFetcher.js.map +1 -1
- package/lib/FileFetcher.spec.d.ts +2 -0
- package/lib/FileFetcher.spec.d.ts.map +1 -0
- package/lib/FileFetcher.spec.js +244 -0
- package/lib/FileFetcher.spec.js.map +1 -0
- package/lib/FolderWatcher.d.ts +91 -0
- package/lib/FolderWatcher.d.ts.map +1 -0
- package/lib/FolderWatcher.js +304 -0
- package/lib/FolderWatcher.js.map +1 -0
- package/lib/FolderWatcher.spec.d.ts +2 -0
- package/lib/FolderWatcher.spec.d.ts.map +1 -0
- package/lib/FolderWatcher.spec.js +308 -0
- package/lib/FolderWatcher.spec.js.map +1 -0
- package/lib/GPT3Tokenizer.spec.d.ts +2 -0
- package/lib/GPT3Tokenizer.spec.d.ts.map +1 -0
- package/lib/GPT3Tokenizer.spec.js +45 -0
- package/lib/GPT3Tokenizer.spec.js.map +1 -0
- package/lib/ItemSelector.d.ts.map +1 -1
- package/lib/ItemSelector.js +19 -8
- package/lib/ItemSelector.js.map +1 -1
- package/lib/ItemSelector.spec.d.ts +2 -0
- package/lib/ItemSelector.spec.d.ts.map +1 -0
- package/lib/ItemSelector.spec.js +204 -0
- package/lib/ItemSelector.spec.js.map +1 -0
- package/lib/LocalDocument.d.ts +1 -1
- package/lib/LocalDocument.d.ts.map +1 -1
- package/lib/LocalDocument.js +5 -45
- package/lib/LocalDocument.js.map +1 -1
- package/lib/LocalDocument.spec.d.ts +2 -0
- package/lib/LocalDocument.spec.d.ts.map +1 -0
- package/lib/LocalDocument.spec.js +214 -0
- package/lib/LocalDocument.spec.js.map +1 -0
- package/lib/LocalDocumentIndex.d.ts +20 -0
- package/lib/LocalDocumentIndex.d.ts.map +1 -1
- package/lib/LocalDocumentIndex.js +16 -52
- package/lib/LocalDocumentIndex.js.map +1 -1
- package/lib/LocalDocumentIndex.spec.d.ts +2 -0
- package/lib/LocalDocumentIndex.spec.d.ts.map +1 -0
- package/lib/LocalDocumentIndex.spec.js +494 -0
- package/lib/LocalDocumentIndex.spec.js.map +1 -0
- package/lib/LocalDocumentResult.d.ts +32 -11
- package/lib/LocalDocumentResult.d.ts.map +1 -1
- package/lib/LocalDocumentResult.js +305 -257
- package/lib/LocalDocumentResult.js.map +1 -1
- package/lib/LocalDocumentResult.spec.d.ts +2 -0
- package/lib/LocalDocumentResult.spec.d.ts.map +1 -0
- package/lib/LocalDocumentResult.spec.js +373 -0
- package/lib/LocalDocumentResult.spec.js.map +1 -0
- package/lib/LocalEmbeddings.d.ts +59 -0
- package/lib/LocalEmbeddings.d.ts.map +1 -0
- package/lib/LocalEmbeddings.js +101 -0
- package/lib/LocalEmbeddings.js.map +1 -0
- package/lib/LocalEmbeddings.spec.d.ts +2 -0
- package/lib/LocalEmbeddings.spec.d.ts.map +1 -0
- package/lib/LocalEmbeddings.spec.js +155 -0
- package/lib/LocalEmbeddings.spec.js.map +1 -0
- package/lib/LocalIndex.d.ts +27 -18
- package/lib/LocalIndex.d.ts.map +1 -1
- package/lib/LocalIndex.js +109 -105
- package/lib/LocalIndex.js.map +1 -1
- package/lib/LocalIndex.spec.js +434 -43
- package/lib/LocalIndex.spec.js.map +1 -1
- package/lib/OpenAIEmbeddings.d.ts +4 -6
- package/lib/OpenAIEmbeddings.d.ts.map +1 -1
- package/lib/OpenAIEmbeddings.js +16 -24
- package/lib/OpenAIEmbeddings.js.map +1 -1
- package/lib/OpenAIEmbeddings.spec.d.ts +2 -0
- package/lib/OpenAIEmbeddings.spec.d.ts.map +1 -0
- package/lib/OpenAIEmbeddings.spec.js +298 -0
- package/lib/OpenAIEmbeddings.spec.js.map +1 -0
- package/lib/TextSplitter.d.ts +2 -0
- package/lib/TextSplitter.d.ts.map +1 -1
- package/lib/TextSplitter.js +154 -111
- package/lib/TextSplitter.js.map +1 -1
- package/lib/TextSplitter.spec.js +289 -61
- package/lib/TextSplitter.spec.js.map +1 -1
- package/lib/TransformersEmbeddings.d.ts +121 -0
- package/lib/TransformersEmbeddings.d.ts.map +1 -0
- package/lib/TransformersEmbeddings.js +176 -0
- package/lib/TransformersEmbeddings.js.map +1 -0
- package/lib/TransformersEmbeddings.spec.d.ts +2 -0
- package/lib/TransformersEmbeddings.spec.d.ts.map +1 -0
- package/lib/TransformersEmbeddings.spec.js +198 -0
- package/lib/TransformersEmbeddings.spec.js.map +1 -0
- package/lib/TransformersTokenizer.d.ts +33 -0
- package/lib/TransformersTokenizer.d.ts.map +1 -0
- package/lib/TransformersTokenizer.js +44 -0
- package/lib/TransformersTokenizer.js.map +1 -0
- package/lib/TransformersTokenizer.spec.d.ts +2 -0
- package/lib/TransformersTokenizer.spec.d.ts.map +1 -0
- package/lib/TransformersTokenizer.spec.js +112 -0
- package/lib/TransformersTokenizer.spec.js.map +1 -0
- package/lib/WebFetcher.d.ts +1 -2
- package/lib/WebFetcher.d.ts.map +1 -1
- package/lib/WebFetcher.js +58 -54
- package/lib/WebFetcher.js.map +1 -1
- package/lib/WebFetcher.spec.d.ts +2 -0
- package/lib/WebFetcher.spec.d.ts.map +1 -0
- package/lib/WebFetcher.spec.js +263 -0
- package/lib/WebFetcher.spec.js.map +1 -0
- package/lib/browser.d.ts +30 -0
- package/lib/browser.d.ts.map +1 -0
- package/lib/browser.js +52 -0
- package/lib/browser.js.map +1 -0
- package/lib/codecs/IndexCodec.d.ts +37 -0
- package/lib/codecs/IndexCodec.d.ts.map +1 -0
- package/lib/codecs/IndexCodec.js +3 -0
- package/lib/codecs/IndexCodec.js.map +1 -0
- package/lib/codecs/JsonCodec.d.ts +19 -0
- package/lib/codecs/JsonCodec.d.ts.map +1 -0
- package/lib/codecs/JsonCodec.js +35 -0
- package/lib/codecs/JsonCodec.js.map +1 -0
- package/lib/codecs/JsonCodec.spec.d.ts +2 -0
- package/lib/codecs/JsonCodec.spec.d.ts.map +1 -0
- package/lib/codecs/JsonCodec.spec.js +66 -0
- package/lib/codecs/JsonCodec.spec.js.map +1 -0
- package/lib/codecs/LocalIndex.protobuf.spec.d.ts +2 -0
- package/lib/codecs/LocalIndex.protobuf.spec.d.ts.map +1 -0
- package/lib/codecs/LocalIndex.protobuf.spec.js +108 -0
- package/lib/codecs/LocalIndex.protobuf.spec.js.map +1 -0
- package/lib/codecs/ProtobufCodec.d.ts +20 -0
- package/lib/codecs/ProtobufCodec.d.ts.map +1 -0
- package/lib/codecs/ProtobufCodec.js +225 -0
- package/lib/codecs/ProtobufCodec.js.map +1 -0
- package/lib/codecs/ProtobufCodec.spec.d.ts +2 -0
- package/lib/codecs/ProtobufCodec.spec.d.ts.map +1 -0
- package/lib/codecs/ProtobufCodec.spec.js +155 -0
- package/lib/codecs/ProtobufCodec.spec.js.map +1 -0
- package/lib/codecs/index.d.ts +5 -0
- package/lib/codecs/index.d.ts.map +1 -0
- package/lib/codecs/index.js +21 -0
- package/lib/codecs/index.js.map +1 -0
- package/lib/codecs/migrateIndex.d.ts +24 -0
- package/lib/codecs/migrateIndex.d.ts.map +1 -0
- package/lib/codecs/migrateIndex.js +119 -0
- package/lib/codecs/migrateIndex.js.map +1 -0
- package/lib/codecs/migrateIndex.spec.d.ts +2 -0
- package/lib/codecs/migrateIndex.spec.d.ts.map +1 -0
- package/lib/codecs/migrateIndex.spec.js +151 -0
- package/lib/codecs/migrateIndex.spec.js.map +1 -0
- package/lib/codecs/schemas/index.proto +34 -0
- package/lib/index.d.ts +9 -1
- package/lib/index.d.ts.map +1 -1
- package/lib/index.js +9 -1
- package/lib/index.js.map +1 -1
- package/lib/internals/Colorize.d.ts.map +1 -1
- package/lib/internals/Colorize.js +20 -15
- package/lib/internals/Colorize.js.map +1 -1
- package/lib/server/IndexManager.d.ts +78 -0
- package/lib/server/IndexManager.d.ts.map +1 -0
- package/lib/server/IndexManager.js +259 -0
- package/lib/server/IndexManager.js.map +1 -0
- package/lib/server/VectraServer.d.ts +40 -0
- package/lib/server/VectraServer.d.ts.map +1 -0
- package/lib/server/VectraServer.js +151 -0
- package/lib/server/VectraServer.js.map +1 -0
- package/lib/server/VectraServer.spec.d.ts +2 -0
- package/lib/server/VectraServer.spec.d.ts.map +1 -0
- package/lib/server/VectraServer.spec.js +322 -0
- package/lib/server/VectraServer.spec.js.map +1 -0
- package/lib/server/handlers/documentHandlers.d.ts +15 -0
- package/lib/server/handlers/documentHandlers.d.ts.map +1 -0
- package/lib/server/handlers/documentHandlers.js +95 -0
- package/lib/server/handlers/documentHandlers.js.map +1 -0
- package/lib/server/handlers/helpers.d.ts +23 -0
- package/lib/server/handlers/helpers.d.ts.map +1 -0
- package/lib/server/handlers/helpers.js +138 -0
- package/lib/server/handlers/helpers.js.map +1 -0
- package/lib/server/handlers/index.d.ts +8 -0
- package/lib/server/handlers/index.d.ts.map +1 -0
- package/lib/server/handlers/index.js +22 -0
- package/lib/server/handlers/index.js.map +1 -0
- package/lib/server/handlers/indexHandlers.d.ts +14 -0
- package/lib/server/handlers/indexHandlers.d.ts.map +1 -0
- package/lib/server/handlers/indexHandlers.js +85 -0
- package/lib/server/handlers/indexHandlers.js.map +1 -0
- package/lib/server/handlers/itemHandlers.d.ts +34 -0
- package/lib/server/handlers/itemHandlers.d.ts.map +1 -0
- package/lib/server/handlers/itemHandlers.js +166 -0
- package/lib/server/handlers/itemHandlers.js.map +1 -0
- package/lib/server/handlers/lifecycleHandlers.d.ts +11 -0
- package/lib/server/handlers/lifecycleHandlers.d.ts.map +1 -0
- package/lib/server/handlers/lifecycleHandlers.js +31 -0
- package/lib/server/handlers/lifecycleHandlers.js.map +1 -0
- package/lib/server/handlers/queryHandlers.d.ts +27 -0
- package/lib/server/handlers/queryHandlers.d.ts.map +1 -0
- package/lib/server/handlers/queryHandlers.js +135 -0
- package/lib/server/handlers/queryHandlers.js.map +1 -0
- package/lib/server/handlers/statsHandlers.d.ts +17 -0
- package/lib/server/handlers/statsHandlers.d.ts.map +1 -0
- package/lib/server/handlers/statsHandlers.js +81 -0
- package/lib/server/handlers/statsHandlers.js.map +1 -0
- package/lib/server/index.d.ts +4 -0
- package/lib/server/index.d.ts.map +1 -0
- package/lib/server/index.js +23 -0
- package/lib/server/index.js.map +1 -0
- package/lib/storage/FileStorage.d.ts +92 -0
- package/lib/storage/FileStorage.d.ts.map +1 -0
- package/lib/storage/FileStorage.js +3 -0
- package/lib/storage/FileStorage.js.map +1 -0
- package/lib/storage/FileStorageUtilities.d.ts +36 -0
- package/lib/storage/FileStorageUtilities.d.ts.map +1 -0
- package/lib/storage/FileStorageUtilities.js +91 -0
- package/lib/storage/FileStorageUtilities.js.map +1 -0
- package/lib/storage/FileStorageUtilities.spec.d.ts +2 -0
- package/lib/storage/FileStorageUtilities.spec.d.ts.map +1 -0
- package/lib/storage/FileStorageUtilities.spec.js +98 -0
- package/lib/storage/FileStorageUtilities.spec.js.map +1 -0
- package/lib/storage/FileType.d.ts +29 -0
- package/lib/storage/FileType.d.ts.map +1 -0
- package/lib/storage/FileType.js +38 -0
- package/lib/storage/FileType.js.map +1 -0
- package/lib/storage/IndexedDBStorage.d.ts +47 -0
- package/lib/storage/IndexedDBStorage.d.ts.map +1 -0
- package/lib/storage/IndexedDBStorage.js +347 -0
- package/lib/storage/IndexedDBStorage.js.map +1 -0
- package/lib/storage/LocalFileStorage.browser.d.ts +19 -0
- package/lib/storage/LocalFileStorage.browser.d.ts.map +1 -0
- package/lib/storage/LocalFileStorage.browser.js +43 -0
- package/lib/storage/LocalFileStorage.browser.js.map +1 -0
- package/lib/storage/LocalFileStorage.d.ts +23 -0
- package/lib/storage/LocalFileStorage.d.ts.map +1 -0
- package/lib/storage/LocalFileStorage.js +152 -0
- package/lib/storage/LocalFileStorage.js.map +1 -0
- package/lib/storage/LocalFileStorage.spec.d.ts +2 -0
- package/lib/storage/LocalFileStorage.spec.d.ts.map +1 -0
- package/lib/storage/LocalFileStorage.spec.js +249 -0
- package/lib/storage/LocalFileStorage.spec.js.map +1 -0
- package/lib/storage/VirtualFileStorage.d.ts +18 -0
- package/lib/storage/VirtualFileStorage.d.ts.map +1 -0
- package/lib/storage/VirtualFileStorage.js +178 -0
- package/lib/storage/VirtualFileStorage.js.map +1 -0
- package/lib/storage/VirtualFileStorage.spec.d.ts +2 -0
- package/lib/storage/VirtualFileStorage.spec.d.ts.map +1 -0
- package/lib/storage/VirtualFileStorage.spec.js +302 -0
- package/lib/storage/VirtualFileStorage.spec.js.map +1 -0
- package/lib/storage/index.d.ts +6 -0
- package/lib/storage/index.d.ts.map +1 -0
- package/lib/storage/index.js +22 -0
- package/lib/storage/index.js.map +1 -0
- package/lib/templates/templates/csharp/README.md +48 -0
- package/lib/templates/templates/csharp/VectraClient.cs +234 -0
- package/lib/templates/templates/go/README.md +71 -0
- package/lib/templates/templates/go/vectra_client.go +322 -0
- package/lib/templates/templates/java/README.md +81 -0
- package/lib/templates/templates/java/VectraClient.java +232 -0
- package/lib/templates/templates/python/README.md +37 -0
- package/lib/templates/templates/python/vectra_client.py +279 -0
- package/lib/templates/templates/rust/Cargo.toml +14 -0
- package/lib/templates/templates/rust/README.md +39 -0
- package/lib/templates/templates/rust/build.rs +4 -0
- package/lib/templates/templates/rust/lib.rs +284 -0
- package/lib/templates/templates/typescript/README.md +96 -0
- package/lib/templates/templates/typescript/VectraClient.ts +374 -0
- package/lib/templates/typescript/VectraClient.d.ts +114 -0
- package/lib/templates/typescript/VectraClient.d.ts.map +1 -0
- package/lib/templates/typescript/VectraClient.js +328 -0
- package/lib/templates/typescript/VectraClient.js.map +1 -0
- package/lib/types.d.ts +7 -0
- package/lib/types.d.ts.map +1 -1
- package/lib/utils/index.d.ts +2 -0
- package/lib/utils/index.d.ts.map +1 -0
- package/lib/utils/index.js +18 -0
- package/lib/utils/index.js.map +1 -0
- package/lib/utils/pathUtils.d.ts +40 -0
- package/lib/utils/pathUtils.d.ts.map +1 -0
- package/lib/utils/pathUtils.js +98 -0
- package/lib/utils/pathUtils.js.map +1 -0
- package/lib/vectra-cli.d.ts.map +1 -1
- package/lib/vectra-cli.generate.spec.d.ts +2 -0
- package/lib/vectra-cli.generate.spec.d.ts.map +1 -0
- package/lib/vectra-cli.generate.spec.js +112 -0
- package/lib/vectra-cli.generate.spec.js.map +1 -0
- package/lib/vectra-cli.js +446 -9
- package/lib/vectra-cli.js.map +1 -1
- package/lib/vectra-cli.spec.d.ts +1 -0
- package/lib/vectra-cli.spec.d.ts.map +1 -0
- package/lib/vectra-cli.spec.js +2 -0
- package/lib/vectra-cli.spec.js.map +1 -0
- package/package.json +89 -16
- package/proto/vectra_service.proto +276 -0
- package/src/BrowserWebFetcher.ts +345 -0
- package/src/FileFetcher.spec.ts +234 -0
- package/src/FileFetcher.ts +37 -25
- package/src/FolderWatcher.spec.ts +288 -0
- package/src/FolderWatcher.ts +304 -0
- package/src/GPT3Tokenizer.spec.ts +50 -0
- package/src/ItemSelector.spec.ts +252 -0
- package/src/ItemSelector.ts +163 -150
- package/src/LocalDocument.spec.ts +211 -0
- package/src/LocalDocument.ts +88 -94
- package/src/LocalDocumentIndex.spec.ts +481 -0
- package/src/LocalDocumentIndex.ts +39 -40
- package/src/LocalDocumentResult.spec.ts +373 -0
- package/src/LocalDocumentResult.ts +489 -319
- package/src/LocalEmbeddings.spec.ts +138 -0
- package/src/LocalEmbeddings.ts +120 -0
- package/src/LocalIndex.spec.ts +808 -323
- package/src/LocalIndex.ts +479 -430
- package/src/OpenAIEmbeddings.spec.ts +354 -0
- package/src/OpenAIEmbeddings.ts +26 -27
- package/src/TextSplitter.spec.ts +320 -65
- package/src/TextSplitter.ts +172 -115
- package/src/TransformersEmbeddings.spec.ts +188 -0
- package/src/TransformersEmbeddings.ts +232 -0
- package/src/TransformersTokenizer.spec.ts +143 -0
- package/src/TransformersTokenizer.ts +45 -0
- package/src/WebFetcher.spec.ts +288 -0
- package/src/WebFetcher.ts +184 -186
- package/src/browser.ts +69 -0
- package/src/codecs/IndexCodec.ts +40 -0
- package/src/codecs/JsonCodec.spec.ts +70 -0
- package/src/codecs/JsonCodec.ts +37 -0
- package/src/codecs/LocalIndex.protobuf.spec.ts +115 -0
- package/src/codecs/ProtobufCodec.spec.ts +166 -0
- package/src/codecs/ProtobufCodec.ts +193 -0
- package/src/codecs/index.ts +4 -0
- package/src/codecs/migrateIndex.spec.ts +176 -0
- package/src/codecs/migrateIndex.ts +125 -0
- package/src/codecs/schemas/index.proto +34 -0
- package/src/index.ts +9 -1
- package/src/internals/Colorize.ts +19 -16
- package/src/server/IndexManager.ts +243 -0
- package/src/server/VectraServer.spec.ts +303 -0
- package/src/server/VectraServer.ts +156 -0
- package/src/server/handlers/documentHandlers.ts +59 -0
- package/src/server/handlers/helpers.ts +93 -0
- package/src/server/handlers/index.ts +7 -0
- package/src/server/handlers/indexHandlers.ts +44 -0
- package/src/server/handlers/itemHandlers.ts +140 -0
- package/src/server/handlers/lifecycleHandlers.ts +26 -0
- package/src/server/handlers/queryHandlers.ts +96 -0
- package/src/server/handlers/statsHandlers.ts +38 -0
- package/src/server/index.ts +3 -0
- package/src/storage/FileStorage.ts +105 -0
- package/src/storage/FileStorageUtilities.spec.ts +106 -0
- package/src/storage/FileStorageUtilities.ts +77 -0
- package/src/storage/FileType.ts +61 -0
- package/src/storage/IndexedDBStorage.ts +365 -0
- package/src/storage/LocalFileStorage.browser.ts +52 -0
- package/src/storage/LocalFileStorage.spec.ts +292 -0
- package/src/storage/LocalFileStorage.ts +98 -0
- package/src/storage/VirtualFileStorage.spec.ts +307 -0
- package/src/storage/VirtualFileStorage.ts +169 -0
- package/src/storage/index.ts +5 -0
- package/src/templates/csharp/README.md +48 -0
- package/src/templates/csharp/VectraClient.cs +234 -0
- package/src/templates/go/README.md +71 -0
- package/src/templates/go/vectra_client.go +322 -0
- package/src/templates/java/README.md +81 -0
- package/src/templates/java/VectraClient.java +232 -0
- package/src/templates/python/README.md +37 -0
- package/src/templates/python/vectra_client.py +279 -0
- package/src/templates/rust/Cargo.toml +14 -0
- package/src/templates/rust/README.md +39 -0
- package/src/templates/rust/build.rs +4 -0
- package/src/templates/rust/lib.rs +284 -0
- package/src/templates/typescript/README.md +96 -0
- package/src/templates/typescript/VectraClient.ts +374 -0
- package/src/types.ts +131 -123
- package/src/utils/index.ts +1 -0
- package/src/utils/pathUtils.ts +106 -0
- package/src/vectra-cli.generate.spec.ts +72 -0
- package/src/vectra-cli.spec.ts +0 -0
- package/src/vectra-cli.ts +687 -246
- package/README.draft.md +0 -499
- package/README.draft.outline.md +0 -160
- package/README.research.md +0 -2159
package/src/TextSplitter.ts
CHANGED
|
@@ -1,8 +1,6 @@
|
|
|
1
1
|
import { GPT3Tokenizer } from "./GPT3Tokenizer";
|
|
2
2
|
import { TextChunk, Tokenizer } from "./types";
|
|
3
3
|
|
|
4
|
-
const ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
|
|
5
|
-
|
|
6
4
|
export interface TextSplitterConfig {
|
|
7
5
|
separators: string[];
|
|
8
6
|
keepSeparators: boolean;
|
|
@@ -22,17 +20,14 @@ export class TextSplitter {
|
|
|
22
20
|
chunkOverlap: 40,
|
|
23
21
|
} as TextSplitterConfig, config);
|
|
24
22
|
|
|
25
|
-
// Create a default tokenizer if none is provided
|
|
26
23
|
if (!this._config.tokenizer) {
|
|
27
24
|
this._config.tokenizer = new GPT3Tokenizer();
|
|
28
25
|
}
|
|
29
26
|
|
|
30
|
-
// Use default separators if none are provided
|
|
31
27
|
if (!this._config.separators || this._config.separators.length === 0) {
|
|
32
28
|
this._config.separators = this.getSeparators(this._config.docType);
|
|
33
29
|
}
|
|
34
30
|
|
|
35
|
-
// Validate the config settings
|
|
36
31
|
if (this._config.chunkSize < 1) {
|
|
37
32
|
throw new Error("chunkSize must be >= 1");
|
|
38
33
|
} else if (this._config.chunkOverlap < 0) {
|
|
@@ -43,30 +38,19 @@ export class TextSplitter {
|
|
|
43
38
|
}
|
|
44
39
|
|
|
45
40
|
public split(text: string): TextChunk[] {
|
|
46
|
-
// Get basic chunks
|
|
47
41
|
const chunks = this.recursiveSplit(text, this._config.separators, 0);
|
|
48
42
|
|
|
49
|
-
const that = this;
|
|
50
|
-
function getOverlapTokens(tokens?: number[]): number[] {
|
|
51
|
-
if (tokens != undefined) {
|
|
52
|
-
const len = tokens.length > that._config.chunkOverlap ? that._config.chunkOverlap : tokens.length;
|
|
53
|
-
return tokens.slice(0, len);
|
|
54
|
-
} else {
|
|
55
|
-
return [];
|
|
56
|
-
}
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
// Add overlap tokens and text to the start and end of each chunk
|
|
60
43
|
if (this._config.chunkOverlap > 0) {
|
|
61
|
-
for (let i =
|
|
62
|
-
const
|
|
63
|
-
const
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
44
|
+
for (let i = 0; i < chunks.length - 1; i++) {
|
|
45
|
+
const current = chunks[i];
|
|
46
|
+
const next = chunks[i + 1];
|
|
47
|
+
|
|
48
|
+
const currTokensCopy = current.tokens.slice();
|
|
49
|
+
const trailing = currTokensCopy.reverse().slice(0, this._config.chunkOverlap).reverse();
|
|
50
|
+
next.startOverlap = trailing;
|
|
51
|
+
|
|
52
|
+
const leadLen = Math.min(this._config.chunkOverlap, next.tokens.length);
|
|
53
|
+
current.endOverlap = next.tokens.slice(0, leadLen);
|
|
70
54
|
}
|
|
71
55
|
}
|
|
72
56
|
|
|
@@ -74,132 +58,205 @@ export class TextSplitter {
|
|
|
74
58
|
}
|
|
75
59
|
|
|
76
60
|
private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
|
|
77
|
-
|
|
61
|
+
if (text.length === 0) return [];
|
|
78
62
|
|
|
79
|
-
if (
|
|
80
|
-
|
|
81
|
-
let parts: string[];
|
|
82
|
-
let separator = '';
|
|
63
|
+
if (separators.length > 0) {
|
|
64
|
+
const sep = separators[0];
|
|
83
65
|
const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
|
|
84
66
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
separator = separators[0];
|
|
88
|
-
parts = separator == ' ' ? this.splitBySpaces(text) : text.split(separator);
|
|
89
|
-
} else {
|
|
90
|
-
// Cut text in half
|
|
91
|
-
const half = Math.floor(text.length / 2);
|
|
92
|
-
parts = [text.substring(0, half), text.substring(half)];
|
|
93
|
-
}
|
|
67
|
+
const parts = sep === ' ' ? this.splitBySpaces(text) : text.split(sep);
|
|
68
|
+
const out: TextChunk[] = [];
|
|
94
69
|
|
|
95
|
-
|
|
70
|
+
let pos = startPos;
|
|
96
71
|
for (let i = 0; i < parts.length; i++) {
|
|
97
|
-
const
|
|
98
|
-
|
|
99
|
-
// Get chunk text and endPos
|
|
100
|
-
let chunk = parts[i];
|
|
101
|
-
const endPos = (startPos + (chunk.length - 1)) + (lastChunk ? 0 : separator.length);
|
|
72
|
+
const lastPart = (i === parts.length - 1);
|
|
73
|
+
let piece = parts[i];
|
|
102
74
|
|
|
103
|
-
if (this._config.keepSeparators && !
|
|
104
|
-
|
|
75
|
+
if (this._config.keepSeparators && !lastPart) {
|
|
76
|
+
piece += sep;
|
|
105
77
|
}
|
|
106
78
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
startPos = endPos + 1;
|
|
79
|
+
if (!/\S/.test(piece)) {
|
|
80
|
+
const consumed = parts[i].length + (lastPart ? 0 : sep.length);
|
|
81
|
+
pos += consumed;
|
|
111
82
|
continue;
|
|
112
83
|
}
|
|
113
84
|
|
|
114
|
-
|
|
115
|
-
if (
|
|
116
|
-
|
|
117
|
-
const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
|
|
118
|
-
chunks.push(...subChunks);
|
|
85
|
+
const sub = this.recursiveSplit(piece, nextSeparators, pos);
|
|
86
|
+
if (sub.length > 0) {
|
|
87
|
+
out.push(...sub);
|
|
119
88
|
} else {
|
|
120
|
-
|
|
121
|
-
const tokens = this._config.tokenizer.encode(chunk);
|
|
122
|
-
if (tokens.length > this._config.chunkSize) {
|
|
123
|
-
// Break the text into smaller chunks
|
|
124
|
-
const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
|
|
125
|
-
chunks.push(...subChunks);
|
|
126
|
-
} else {
|
|
127
|
-
// Append chunk to output
|
|
128
|
-
chunks.push({
|
|
129
|
-
text: chunk,
|
|
130
|
-
tokens: tokens,
|
|
131
|
-
startPos: startPos,
|
|
132
|
-
endPos: endPos,
|
|
133
|
-
startOverlap: [],
|
|
134
|
-
endOverlap: [],
|
|
135
|
-
});
|
|
136
|
-
}
|
|
89
|
+
out.push(...this.finalizeToChunks(piece, pos));
|
|
137
90
|
}
|
|
138
91
|
|
|
139
|
-
|
|
140
|
-
|
|
92
|
+
const consumed = parts[i].length + (lastPart ? 0 : sep.length);
|
|
93
|
+
pos += consumed;
|
|
141
94
|
}
|
|
95
|
+
|
|
96
|
+
const joiner =
|
|
97
|
+
this._config.keepSeparators
|
|
98
|
+
? ''
|
|
99
|
+
: (sep !== ' ' && (sep.includes('\n') || sep.includes('\t')) ? ' ' : '');
|
|
100
|
+
|
|
101
|
+
return this.combineChunks(out, joiner);
|
|
142
102
|
}
|
|
143
103
|
|
|
144
|
-
return this.combineChunks(
|
|
104
|
+
return this.combineChunks(this.finalizeToChunks(text, startPos), '');
|
|
145
105
|
}
|
|
146
106
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
107
|
+
// Strip inline punctuation-only runs when keepSeparators=false.
|
|
108
|
+
// Only removes runs that touch non-whitespace on at least one side (inline),
|
|
109
|
+
// preserving standalone lines like '---' or '***' that are separated by whitespace/newlines.
|
|
110
|
+
private stripInlineSeparators(s: string): string {
|
|
111
|
+
if (this._config.keepSeparators || s.length === 0) return s;
|
|
112
|
+
const re = /(-{3,}|\*{3,}|={3,}|_{3,})/g;
|
|
113
|
+
let out = '';
|
|
114
|
+
let lastIndex = 0;
|
|
115
|
+
let m: RegExpExecArray | null;
|
|
116
|
+
while ((m = re.exec(s)) !== null) {
|
|
117
|
+
const start = m.index;
|
|
118
|
+
const end = start + m[0].length;
|
|
119
|
+
const left = start > 0 ? s[start - 1] : undefined;
|
|
120
|
+
const right = end < s.length ? s[end] : undefined;
|
|
121
|
+
const leftNonWS = left !== undefined && !/\s/.test(left);
|
|
122
|
+
const rightNonWS = right !== undefined && !/\s/.test(right);
|
|
123
|
+
// Inline if touching non-whitespace on at least one side
|
|
124
|
+
if (leftNonWS || rightNonWS) {
|
|
125
|
+
out += s.slice(lastIndex, start);
|
|
126
|
+
lastIndex = end; // drop the run
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
out += s.slice(lastIndex);
|
|
130
|
+
return out;
|
|
131
|
+
}
|
|
151
132
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
const
|
|
133
|
+
// Produce one or more chunks under budget.
|
|
134
|
+
private finalizeToChunks(text: string, startPos: number): TextChunk[] {
|
|
135
|
+
const chunks: TextChunk[] = [];
|
|
136
|
+
const tokens = this._config.tokenizer.encode(text);
|
|
137
|
+
|
|
138
|
+
// Token-budget splitting
|
|
139
|
+
if (tokens.length > this._config.chunkSize) {
|
|
140
|
+
let remaining = tokens.slice();
|
|
141
|
+
let pos = startPos;
|
|
142
|
+
|
|
143
|
+
while (remaining.length > 0) {
|
|
144
|
+
const span = remaining.splice(0, this._config.chunkSize);
|
|
145
|
+
const original = this._config.tokenizer.decode(span);
|
|
146
|
+
|
|
147
|
+
const leadingWSMatch = original.match(/^\s+/);
|
|
148
|
+
const leadingWSLen = leadingWSMatch ? leadingWSMatch[0].length : 0;
|
|
149
|
+
|
|
150
|
+
let sliceText = leadingWSLen > 0 ? original.slice(leadingWSLen) : original;
|
|
151
|
+
if (sliceText.length === 0) {
|
|
152
|
+
pos += original.length;
|
|
153
|
+
continue;
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
// Drop inline punctuation-only runs if configured
|
|
157
|
+
const stripped = this.stripInlineSeparators(sliceText);
|
|
158
|
+
|
|
159
|
+
const sliceStart = pos + leadingWSLen;
|
|
160
|
+
const sliceEnd = sliceStart + stripped.length - 1;
|
|
161
|
+
|
|
162
|
+
const spanTokens = this._config.tokenizer.encode(stripped);
|
|
163
|
+
|
|
164
|
+
chunks.push({
|
|
165
|
+
text: stripped,
|
|
166
|
+
tokens: spanTokens,
|
|
167
|
+
startPos: sliceStart,
|
|
168
|
+
endPos: sliceEnd,
|
|
169
|
+
startOverlap: [],
|
|
170
|
+
endOverlap: [],
|
|
171
|
+
});
|
|
172
|
+
|
|
173
|
+
pos += original.length;
|
|
174
|
+
}
|
|
175
|
+
return chunks;
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
// If text fits but is a very long unbroken string with no configured separators, fall back to char windows
|
|
179
|
+
if (text.length > this._config.chunkSize) {
|
|
180
|
+
const hasWhitespace = /\s/.test(text);
|
|
181
|
+
const hasAnyConfiguredSep = (this._config.separators || []).some(s => s && text.includes(s));
|
|
182
|
+
|
|
183
|
+
if (!hasWhitespace && !hasAnyConfiguredSep) {
|
|
184
|
+
let pos = startPos;
|
|
185
|
+
for (let off = 0; off < text.length; off += this._config.chunkSize) {
|
|
186
|
+
const slice = text.slice(off, off + this._config.chunkSize);
|
|
187
|
+
const stripped = this.stripInlineSeparators(slice);
|
|
188
|
+
const sliceTokens = this._config.tokenizer.encode(stripped);
|
|
189
|
+
const sliceStart = pos;
|
|
190
|
+
const sliceEnd = sliceStart + stripped.length - 1;
|
|
191
|
+
chunks.push({
|
|
192
|
+
text: stripped,
|
|
193
|
+
tokens: sliceTokens,
|
|
194
|
+
startPos: sliceStart,
|
|
195
|
+
endPos: sliceEnd,
|
|
196
|
+
startOverlap: [],
|
|
197
|
+
endOverlap: [],
|
|
198
|
+
});
|
|
199
|
+
pos = sliceEnd + 1;
|
|
200
|
+
}
|
|
201
|
+
return chunks;
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
const stripped = this.stripInlineSeparators(text);
|
|
206
|
+
const outTokens = this._config.tokenizer.encode(stripped);
|
|
207
|
+
|
|
208
|
+
chunks.push({
|
|
209
|
+
text: stripped,
|
|
210
|
+
tokens: outTokens,
|
|
211
|
+
startPos,
|
|
212
|
+
endPos: startPos + stripped.length - 1,
|
|
213
|
+
startOverlap: [],
|
|
214
|
+
endOverlap: [],
|
|
215
|
+
});
|
|
216
|
+
return chunks;
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
private combineChunks(chunks: TextChunk[], joiner: string): TextChunk[] {
|
|
220
|
+
const combined: TextChunk[] = [];
|
|
221
|
+
let current: TextChunk | undefined;
|
|
156
222
|
|
|
157
223
|
const isWhitespaceOnly = (t: string) => !/\S/.test(t);
|
|
158
224
|
const isPunctuationOnly = (t: string) => /\S/.test(t) && !/[a-zA-Z0-9]/.test(t);
|
|
159
225
|
|
|
160
226
|
for (let i = 0; i < chunks.length; i++) {
|
|
161
|
-
const
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
currentChunk = chunk;
|
|
165
|
-
currentLength = chunk.tokens.length;
|
|
227
|
+
const next = chunks[i];
|
|
228
|
+
if (!current) {
|
|
229
|
+
current = next;
|
|
166
230
|
continue;
|
|
167
231
|
}
|
|
168
232
|
|
|
169
|
-
//
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
currentChunk = chunk;
|
|
174
|
-
currentLength = chunk.tokens.length;
|
|
233
|
+
// Keep punctuation-only chunks standalone
|
|
234
|
+
if (isPunctuationOnly(current.text) || isPunctuationOnly(next.text)) {
|
|
235
|
+
combined.push(current);
|
|
236
|
+
current = next;
|
|
175
237
|
continue;
|
|
176
238
|
}
|
|
177
239
|
|
|
178
|
-
|
|
179
|
-
const
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
240
|
+
const tokenLength = current.tokens.length + next.tokens.length;
|
|
241
|
+
const textLength = current.text.length + (joiner ? joiner.length : 0) + next.text.length;
|
|
242
|
+
|
|
243
|
+
if (tokenLength > this._config.chunkSize || textLength > this._config.chunkSize) {
|
|
244
|
+
combined.push(current);
|
|
245
|
+
current = next;
|
|
184
246
|
} else {
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
currentChunk.tokens.push(...chunk.tokens);
|
|
190
|
-
currentLength += chunk.tokens.length;
|
|
247
|
+
const sep = (!this._config.keepSeparators && !isWhitespaceOnly(current.text) && !isWhitespaceOnly(next.text)) ? joiner : '';
|
|
248
|
+
current.text += sep + next.text;
|
|
249
|
+
current.endPos = next.endPos;
|
|
250
|
+
current.tokens.push(...next.tokens);
|
|
191
251
|
}
|
|
192
252
|
}
|
|
193
253
|
|
|
194
|
-
if (
|
|
195
|
-
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
return combinedChunks;
|
|
254
|
+
if (current) combined.push(current);
|
|
255
|
+
return combined;
|
|
199
256
|
}
|
|
200
257
|
|
|
258
|
+
// Token-window splitting utility used for the ' ' logical separator
|
|
201
259
|
private splitBySpaces(text: string): string[] {
|
|
202
|
-
// Split text by tokens and return parts
|
|
203
260
|
const parts: string[] = [];
|
|
204
261
|
let tokens = this._config.tokenizer.encode(text);
|
|
205
262
|
|
|
@@ -486,4 +543,4 @@ export class TextSplitter {
|
|
|
486
543
|
];
|
|
487
544
|
}
|
|
488
545
|
}
|
|
489
|
-
}
|
|
546
|
+
}
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
import { strict as assert } from 'node:assert';
|
|
2
|
+
import { describe, it, beforeEach, afterEach } from 'mocha';
|
|
3
|
+
import sinon from 'sinon';
|
|
4
|
+
import { EmbeddingsModel } from './types';
|
|
5
|
+
import * as transformersModule from '@huggingface/transformers';
|
|
6
|
+
|
|
7
|
+
describe('TransformersEmbeddings', () => {
|
|
8
|
+
let TransformersEmbeddings: any;
|
|
9
|
+
let mockExtractor: sinon.SinonStub;
|
|
10
|
+
let mockTokenizer: any;
|
|
11
|
+
let sandbox: sinon.SinonSandbox;
|
|
12
|
+
let pipelineStub: sinon.SinonStub;
|
|
13
|
+
|
|
14
|
+
beforeEach(async () => {
|
|
15
|
+
sandbox = sinon.createSandbox();
|
|
16
|
+
|
|
17
|
+
// Create mock tokenizer
|
|
18
|
+
mockTokenizer = {
|
|
19
|
+
__call__: sandbox.stub().returns({
|
|
20
|
+
input_ids: { data: BigInt64Array.from([BigInt(1), BigInt(2), BigInt(3)]) }
|
|
21
|
+
}),
|
|
22
|
+
decode: sandbox.stub().returns('decoded text')
|
|
23
|
+
};
|
|
24
|
+
// Make it callable
|
|
25
|
+
const callableTokenizer = Object.assign(
|
|
26
|
+
(...args: any[]) => mockTokenizer.__call__(...args),
|
|
27
|
+
mockTokenizer
|
|
28
|
+
);
|
|
29
|
+
|
|
30
|
+
// Create mock extractor (feature extraction pipeline)
|
|
31
|
+
mockExtractor = sandbox.stub().callsFake(async (inputs: string | string[]) => {
|
|
32
|
+
const inputArray = Array.isArray(inputs) ? inputs : [inputs];
|
|
33
|
+
const batchSize = inputArray.length;
|
|
34
|
+
const embeddingDim = 4;
|
|
35
|
+
|
|
36
|
+
const data = new Float32Array(batchSize * embeddingDim);
|
|
37
|
+
for (let i = 0; i < batchSize; i++) {
|
|
38
|
+
data[i * embeddingDim] = 0.1;
|
|
39
|
+
data[i * embeddingDim + 1] = 0.2;
|
|
40
|
+
data[i * embeddingDim + 2] = 0.3;
|
|
41
|
+
data[i * embeddingDim + 3] = 0.4;
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
return {
|
|
45
|
+
data: data,
|
|
46
|
+
dims: [batchSize, embeddingDim]
|
|
47
|
+
};
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
// Attach tokenizer to the mock extractor so pipeline result has .tokenizer
|
|
51
|
+
(mockExtractor as any).tokenizer = callableTokenizer;
|
|
52
|
+
|
|
53
|
+
// Stub the pipeline function from @huggingface/transformers
|
|
54
|
+
pipelineStub = sandbox.stub(transformersModule, 'pipeline' as any).resolves(mockExtractor);
|
|
55
|
+
|
|
56
|
+
// Import TransformersEmbeddings fresh (uses the stubbed pipeline via dynamic import)
|
|
57
|
+
const mod = await import('./TransformersEmbeddings');
|
|
58
|
+
TransformersEmbeddings = mod.TransformersEmbeddings;
|
|
59
|
+
});
|
|
60
|
+
|
|
61
|
+
afterEach(() => {
|
|
62
|
+
sandbox.restore();
|
|
63
|
+
});
|
|
64
|
+
|
|
65
|
+
describe('create()', () => {
|
|
66
|
+
it('creates instance with default options', async () => {
|
|
67
|
+
const embeddings = await TransformersEmbeddings.create();
|
|
68
|
+
|
|
69
|
+
assert.equal(embeddings.maxTokens, 512, 'default maxTokens should be 512');
|
|
70
|
+
assert.equal(embeddings.model, 'Xenova/all-MiniLM-L6-v2', 'default model should be all-MiniLM-L6-v2');
|
|
71
|
+
|
|
72
|
+
// Verify pipeline was called with correct arguments
|
|
73
|
+
assert.ok(pipelineStub.calledOnce, 'pipeline should be called once');
|
|
74
|
+
assert.equal(pipelineStub.firstCall.args[0], 'feature-extraction');
|
|
75
|
+
assert.equal(pipelineStub.firstCall.args[1], 'Xenova/all-MiniLM-L6-v2');
|
|
76
|
+
});
|
|
77
|
+
|
|
78
|
+
it('creates instance with custom options', async () => {
|
|
79
|
+
const embeddings = await TransformersEmbeddings.create({
|
|
80
|
+
model: 'Xenova/bge-small-en-v1.5',
|
|
81
|
+
maxTokens: 256,
|
|
82
|
+
device: 'cpu',
|
|
83
|
+
normalize: false,
|
|
84
|
+
pooling: 'cls'
|
|
85
|
+
});
|
|
86
|
+
|
|
87
|
+
assert.equal(embeddings.maxTokens, 256);
|
|
88
|
+
assert.equal(embeddings.model, 'Xenova/bge-small-en-v1.5');
|
|
89
|
+
});
|
|
90
|
+
|
|
91
|
+
it('implements EmbeddingsModel interface', async () => {
|
|
92
|
+
const embeddings: EmbeddingsModel = await TransformersEmbeddings.create();
|
|
93
|
+
|
|
94
|
+
assert.equal(typeof embeddings.maxTokens, 'number');
|
|
95
|
+
assert.equal(typeof embeddings.createEmbeddings, 'function');
|
|
96
|
+
});
|
|
97
|
+
});
|
|
98
|
+
|
|
99
|
+
describe('createEmbeddings()', () => {
|
|
100
|
+
it('generates embeddings for single string', async () => {
|
|
101
|
+
const embeddings = await TransformersEmbeddings.create();
|
|
102
|
+
const result = await embeddings.createEmbeddings('hello world');
|
|
103
|
+
|
|
104
|
+
assert.equal(result.status, 'success');
|
|
105
|
+
assert.ok(result.output, 'output should be defined');
|
|
106
|
+
assert.equal(result.output!.length, 1, 'should have one embedding');
|
|
107
|
+
assert.equal(result.output![0].length, 4, 'embedding should have 4 dimensions');
|
|
108
|
+
const expected = [0.1, 0.2, 0.3, 0.4];
|
|
109
|
+
result.output![0].forEach((val: number, i: number) => {
|
|
110
|
+
assert.ok(Math.abs(val - expected[i]) < 0.001, `value ${val} should be close to ${expected[i]}`);
|
|
111
|
+
});
|
|
112
|
+
assert.equal(result.model, 'Xenova/all-MiniLM-L6-v2');
|
|
113
|
+
});
|
|
114
|
+
|
|
115
|
+
it('generates embeddings for string array', async () => {
|
|
116
|
+
const embeddings = await TransformersEmbeddings.create();
|
|
117
|
+
const result = await embeddings.createEmbeddings(['hello', 'world']);
|
|
118
|
+
|
|
119
|
+
assert.equal(result.status, 'success');
|
|
120
|
+
assert.ok(result.output, 'output should be defined');
|
|
121
|
+
assert.equal(result.output!.length, 2, 'should have two embeddings');
|
|
122
|
+
|
|
123
|
+
assert.equal(mockExtractor.callCount, 1);
|
|
124
|
+
assert.deepEqual(mockExtractor.firstCall.args[0], ['hello', 'world']);
|
|
125
|
+
});
|
|
126
|
+
|
|
127
|
+
it('passes pooling and normalize options to extractor', async () => {
|
|
128
|
+
const embeddings = await TransformersEmbeddings.create({
|
|
129
|
+
pooling: 'cls',
|
|
130
|
+
normalize: false
|
|
131
|
+
});
|
|
132
|
+
await embeddings.createEmbeddings('test');
|
|
133
|
+
|
|
134
|
+
assert.ok(mockExtractor.calledOnce);
|
|
135
|
+
const options = mockExtractor.firstCall.args[1];
|
|
136
|
+
assert.equal(options.pooling, 'cls');
|
|
137
|
+
assert.equal(options.normalize, false);
|
|
138
|
+
});
|
|
139
|
+
|
|
140
|
+
it('returns error status on failure', async () => {
|
|
141
|
+
mockExtractor.rejects(new Error('Model inference failed'));
|
|
142
|
+
|
|
143
|
+
const embeddings = await TransformersEmbeddings.create();
|
|
144
|
+
const result = await embeddings.createEmbeddings('test');
|
|
145
|
+
|
|
146
|
+
assert.equal(result.status, 'error');
|
|
147
|
+
assert.ok(result.message?.includes('Model inference failed'));
|
|
148
|
+
});
|
|
149
|
+
|
|
150
|
+
it('handles empty string input', async () => {
|
|
151
|
+
const embeddings = await TransformersEmbeddings.create();
|
|
152
|
+
const result = await embeddings.createEmbeddings('');
|
|
153
|
+
|
|
154
|
+
assert.equal(result.status, 'success');
|
|
155
|
+
assert.ok(result.output);
|
|
156
|
+
assert.equal(result.output!.length, 1);
|
|
157
|
+
});
|
|
158
|
+
|
|
159
|
+
it('handles empty array input', async () => {
|
|
160
|
+
const embeddings = await TransformersEmbeddings.create();
|
|
161
|
+
const result = await embeddings.createEmbeddings([]);
|
|
162
|
+
|
|
163
|
+
assert.equal(result.status, 'success');
|
|
164
|
+
assert.ok(result.output);
|
|
165
|
+
assert.equal(result.output!.length, 0);
|
|
166
|
+
});
|
|
167
|
+
});
|
|
168
|
+
|
|
169
|
+
describe('getTokenizer()', () => {
|
|
170
|
+
it('returns a TransformersTokenizer instance', async () => {
|
|
171
|
+
const embeddings = await TransformersEmbeddings.create();
|
|
172
|
+
const tokenizer = embeddings.getTokenizer();
|
|
173
|
+
|
|
174
|
+
assert.ok(tokenizer, 'tokenizer should be defined');
|
|
175
|
+
assert.equal(typeof tokenizer.encode, 'function');
|
|
176
|
+
assert.equal(typeof tokenizer.decode, 'function');
|
|
177
|
+
});
|
|
178
|
+
|
|
179
|
+
it('returns consistent tokenizer across calls', async () => {
|
|
180
|
+
const embeddings = await TransformersEmbeddings.create();
|
|
181
|
+
const tokenizer1 = embeddings.getTokenizer();
|
|
182
|
+
const tokenizer2 = embeddings.getTokenizer();
|
|
183
|
+
|
|
184
|
+
assert.ok(tokenizer1);
|
|
185
|
+
assert.ok(tokenizer2);
|
|
186
|
+
});
|
|
187
|
+
});
|
|
188
|
+
});
|