vectra 0.12.2 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +92 -100
- package/bin/vectra.js +3 -0
- package/lib/BrowserWebFetcher.d.ts +75 -0
- package/lib/BrowserWebFetcher.d.ts.map +1 -0
- package/lib/BrowserWebFetcher.js +290 -0
- package/lib/BrowserWebFetcher.js.map +1 -0
- package/lib/FileFetcher.d.ts +5 -0
- package/lib/FileFetcher.d.ts.map +1 -0
- package/lib/FileFetcher.js +89 -0
- package/lib/FileFetcher.js.map +1 -0
- package/lib/FileFetcher.spec.d.ts +2 -0
- package/lib/FileFetcher.spec.d.ts.map +1 -0
- package/lib/FileFetcher.spec.js +244 -0
- package/lib/FileFetcher.spec.js.map +1 -0
- package/lib/FolderWatcher.d.ts +91 -0
- package/lib/FolderWatcher.d.ts.map +1 -0
- package/lib/FolderWatcher.js +304 -0
- package/lib/FolderWatcher.js.map +1 -0
- package/lib/FolderWatcher.spec.d.ts +2 -0
- package/lib/FolderWatcher.spec.d.ts.map +1 -0
- package/lib/FolderWatcher.spec.js +308 -0
- package/lib/FolderWatcher.spec.js.map +1 -0
- package/lib/GPT3Tokenizer.d.ts +9 -0
- package/lib/GPT3Tokenizer.spec.d.ts +2 -0
- package/lib/GPT3Tokenizer.spec.d.ts.map +1 -0
- package/lib/GPT3Tokenizer.spec.js +45 -0
- package/lib/GPT3Tokenizer.spec.js.map +1 -0
- package/lib/ItemSelector.d.ts +41 -0
- package/lib/ItemSelector.d.ts.map +1 -0
- package/lib/ItemSelector.js +179 -0
- package/lib/ItemSelector.js.map +1 -0
- package/lib/ItemSelector.spec.d.ts +2 -0
- package/lib/ItemSelector.spec.d.ts.map +1 -0
- package/lib/ItemSelector.spec.js +204 -0
- package/lib/ItemSelector.spec.js.map +1 -0
- package/lib/LocalDocument.d.ts +54 -0
- package/lib/LocalDocument.d.ts.map +1 -1
- package/lib/LocalDocument.js +116 -0
- package/lib/LocalDocument.js.map +1 -0
- package/lib/LocalDocument.spec.d.ts +2 -0
- package/lib/LocalDocument.spec.d.ts.map +1 -0
- package/lib/LocalDocument.spec.js +214 -0
- package/lib/LocalDocument.spec.js.map +1 -0
- package/lib/LocalDocumentIndex.d.ts +152 -0
- package/lib/LocalDocumentIndex.d.ts.map +1 -1
- package/lib/LocalDocumentIndex.js +420 -0
- package/lib/LocalDocumentIndex.js.map +1 -0
- package/lib/LocalDocumentIndex.spec.d.ts +2 -0
- package/lib/LocalDocumentIndex.spec.d.ts.map +1 -0
- package/lib/LocalDocumentIndex.spec.js +494 -0
- package/lib/LocalDocumentIndex.spec.js.map +1 -0
- package/lib/LocalDocumentResult.d.ts +66 -0
- package/lib/LocalDocumentResult.d.ts.map +1 -1
- package/lib/LocalDocumentResult.js +376 -0
- package/lib/LocalDocumentResult.js.map +1 -0
- package/lib/LocalDocumentResult.spec.d.ts +2 -0
- package/lib/LocalDocumentResult.spec.d.ts.map +1 -0
- package/lib/LocalDocumentResult.spec.js +373 -0
- package/lib/LocalDocumentResult.spec.js.map +1 -0
- package/lib/LocalEmbeddings.d.ts +59 -0
- package/lib/LocalEmbeddings.d.ts.map +1 -0
- package/lib/LocalEmbeddings.js +101 -0
- package/lib/LocalEmbeddings.js.map +1 -0
- package/lib/LocalEmbeddings.spec.d.ts +2 -0
- package/lib/LocalEmbeddings.spec.d.ts.map +1 -0
- package/lib/LocalEmbeddings.spec.js +155 -0
- package/lib/LocalEmbeddings.spec.js.map +1 -0
- package/lib/LocalIndex.d.ts +159 -0
- package/lib/LocalIndex.d.ts.map +1 -1
- package/lib/LocalIndex.js +519 -0
- package/lib/LocalIndex.js.map +1 -0
- package/lib/LocalIndex.spec.d.ts +2 -0
- package/lib/LocalIndex.spec.js +611 -9
- package/lib/LocalIndex.spec.js.map +1 -1
- package/lib/OpenAIEmbeddings.d.ts +124 -0
- package/lib/OpenAIEmbeddings.d.ts.map +1 -0
- package/lib/OpenAIEmbeddings.js +166 -0
- package/lib/OpenAIEmbeddings.js.map +1 -0
- package/lib/OpenAIEmbeddings.spec.d.ts +2 -0
- package/lib/OpenAIEmbeddings.spec.d.ts.map +1 -0
- package/lib/OpenAIEmbeddings.spec.js +298 -0
- package/lib/OpenAIEmbeddings.spec.js.map +1 -0
- package/lib/TextSplitter.d.ts +21 -0
- package/lib/TextSplitter.d.ts.map +1 -1
- package/lib/TextSplitter.js +500 -0
- package/lib/TextSplitter.js.map +1 -0
- package/lib/TextSplitter.spec.d.ts +2 -0
- package/lib/TextSplitter.spec.d.ts.map +1 -0
- package/lib/TextSplitter.spec.js +337 -0
- package/lib/TextSplitter.spec.js.map +1 -0
- package/lib/TransformersEmbeddings.d.ts +121 -0
- package/lib/TransformersEmbeddings.d.ts.map +1 -0
- package/lib/TransformersEmbeddings.js +176 -0
- package/lib/TransformersEmbeddings.js.map +1 -0
- package/lib/TransformersEmbeddings.spec.d.ts +2 -0
- package/lib/TransformersEmbeddings.spec.d.ts.map +1 -0
- package/lib/TransformersEmbeddings.spec.js +198 -0
- package/lib/TransformersEmbeddings.spec.js.map +1 -0
- package/lib/TransformersTokenizer.d.ts +33 -0
- package/lib/TransformersTokenizer.d.ts.map +1 -0
- package/lib/TransformersTokenizer.js +44 -0
- package/lib/TransformersTokenizer.js.map +1 -0
- package/lib/TransformersTokenizer.spec.d.ts +2 -0
- package/lib/TransformersTokenizer.spec.d.ts.map +1 -0
- package/lib/TransformersTokenizer.spec.js +112 -0
- package/lib/TransformersTokenizer.spec.js.map +1 -0
- package/lib/WebFetcher.d.ts +14 -0
- package/lib/WebFetcher.d.ts.map +1 -0
- package/lib/WebFetcher.js +238 -0
- package/lib/WebFetcher.js.map +1 -0
- package/lib/WebFetcher.spec.d.ts +2 -0
- package/lib/WebFetcher.spec.d.ts.map +1 -0
- package/lib/WebFetcher.spec.js +263 -0
- package/lib/WebFetcher.spec.js.map +1 -0
- package/lib/browser.d.ts +30 -0
- package/lib/browser.d.ts.map +1 -0
- package/lib/browser.js +52 -0
- package/lib/browser.js.map +1 -0
- package/lib/codecs/IndexCodec.d.ts +37 -0
- package/lib/codecs/IndexCodec.d.ts.map +1 -0
- package/lib/codecs/IndexCodec.js +3 -0
- package/lib/codecs/IndexCodec.js.map +1 -0
- package/lib/codecs/JsonCodec.d.ts +19 -0
- package/lib/codecs/JsonCodec.d.ts.map +1 -0
- package/lib/codecs/JsonCodec.js +35 -0
- package/lib/codecs/JsonCodec.js.map +1 -0
- package/lib/codecs/JsonCodec.spec.d.ts +2 -0
- package/lib/codecs/JsonCodec.spec.d.ts.map +1 -0
- package/lib/codecs/JsonCodec.spec.js +66 -0
- package/lib/codecs/JsonCodec.spec.js.map +1 -0
- package/lib/codecs/LocalIndex.protobuf.spec.d.ts +2 -0
- package/lib/codecs/LocalIndex.protobuf.spec.d.ts.map +1 -0
- package/lib/codecs/LocalIndex.protobuf.spec.js +108 -0
- package/lib/codecs/LocalIndex.protobuf.spec.js.map +1 -0
- package/lib/codecs/ProtobufCodec.d.ts +20 -0
- package/lib/codecs/ProtobufCodec.d.ts.map +1 -0
- package/lib/codecs/ProtobufCodec.js +225 -0
- package/lib/codecs/ProtobufCodec.js.map +1 -0
- package/lib/codecs/ProtobufCodec.spec.d.ts +2 -0
- package/lib/codecs/ProtobufCodec.spec.d.ts.map +1 -0
- package/lib/codecs/ProtobufCodec.spec.js +155 -0
- package/lib/codecs/ProtobufCodec.spec.js.map +1 -0
- package/lib/codecs/index.d.ts +5 -0
- package/lib/codecs/index.d.ts.map +1 -0
- package/lib/codecs/index.js +21 -0
- package/lib/codecs/index.js.map +1 -0
- package/lib/codecs/migrateIndex.d.ts +24 -0
- package/lib/codecs/migrateIndex.d.ts.map +1 -0
- package/lib/codecs/migrateIndex.js +119 -0
- package/lib/codecs/migrateIndex.js.map +1 -0
- package/lib/codecs/migrateIndex.spec.d.ts +2 -0
- package/lib/codecs/migrateIndex.spec.d.ts.map +1 -0
- package/lib/codecs/migrateIndex.spec.js +151 -0
- package/lib/codecs/migrateIndex.spec.js.map +1 -0
- package/lib/codecs/schemas/index.proto +34 -0
- package/lib/index.d.ts +20 -0
- package/lib/index.d.ts.map +1 -1
- package/lib/index.js +36 -0
- package/lib/index.js.map +1 -0
- package/lib/internals/Colorize.d.ts +14 -0
- package/lib/internals/Colorize.d.ts.map +1 -0
- package/lib/internals/Colorize.js +69 -0
- package/lib/internals/Colorize.js.map +1 -0
- package/lib/internals/index.d.ts +3 -0
- package/lib/internals/index.d.ts.map +1 -0
- package/lib/internals/index.js +19 -0
- package/lib/internals/index.js.map +1 -0
- package/lib/internals/types.d.ts +43 -0
- package/lib/internals/types.d.ts.map +1 -0
- package/lib/internals/types.js +3 -0
- package/lib/internals/types.js.map +1 -0
- package/lib/server/IndexManager.d.ts +78 -0
- package/lib/server/IndexManager.d.ts.map +1 -0
- package/lib/server/IndexManager.js +259 -0
- package/lib/server/IndexManager.js.map +1 -0
- package/lib/server/VectraServer.d.ts +40 -0
- package/lib/server/VectraServer.d.ts.map +1 -0
- package/lib/server/VectraServer.js +151 -0
- package/lib/server/VectraServer.js.map +1 -0
- package/lib/server/VectraServer.spec.d.ts +2 -0
- package/lib/server/VectraServer.spec.d.ts.map +1 -0
- package/lib/server/VectraServer.spec.js +322 -0
- package/lib/server/VectraServer.spec.js.map +1 -0
- package/lib/server/handlers/documentHandlers.d.ts +15 -0
- package/lib/server/handlers/documentHandlers.d.ts.map +1 -0
- package/lib/server/handlers/documentHandlers.js +95 -0
- package/lib/server/handlers/documentHandlers.js.map +1 -0
- package/lib/server/handlers/helpers.d.ts +23 -0
- package/lib/server/handlers/helpers.d.ts.map +1 -0
- package/lib/server/handlers/helpers.js +138 -0
- package/lib/server/handlers/helpers.js.map +1 -0
- package/lib/server/handlers/index.d.ts +8 -0
- package/lib/server/handlers/index.d.ts.map +1 -0
- package/lib/server/handlers/index.js +22 -0
- package/lib/server/handlers/index.js.map +1 -0
- package/lib/server/handlers/indexHandlers.d.ts +14 -0
- package/lib/server/handlers/indexHandlers.d.ts.map +1 -0
- package/lib/server/handlers/indexHandlers.js +85 -0
- package/lib/server/handlers/indexHandlers.js.map +1 -0
- package/lib/server/handlers/itemHandlers.d.ts +34 -0
- package/lib/server/handlers/itemHandlers.d.ts.map +1 -0
- package/lib/server/handlers/itemHandlers.js +166 -0
- package/lib/server/handlers/itemHandlers.js.map +1 -0
- package/lib/server/handlers/lifecycleHandlers.d.ts +11 -0
- package/lib/server/handlers/lifecycleHandlers.d.ts.map +1 -0
- package/lib/server/handlers/lifecycleHandlers.js +31 -0
- package/lib/server/handlers/lifecycleHandlers.js.map +1 -0
- package/lib/server/handlers/queryHandlers.d.ts +27 -0
- package/lib/server/handlers/queryHandlers.d.ts.map +1 -0
- package/lib/server/handlers/queryHandlers.js +135 -0
- package/lib/server/handlers/queryHandlers.js.map +1 -0
- package/lib/server/handlers/statsHandlers.d.ts +17 -0
- package/lib/server/handlers/statsHandlers.d.ts.map +1 -0
- package/lib/server/handlers/statsHandlers.js +81 -0
- package/lib/server/handlers/statsHandlers.js.map +1 -0
- package/lib/server/index.d.ts +4 -0
- package/lib/server/index.d.ts.map +1 -0
- package/lib/server/index.js +23 -0
- package/lib/server/index.js.map +1 -0
- package/lib/storage/FileStorage.d.ts +92 -0
- package/lib/storage/FileStorage.d.ts.map +1 -0
- package/lib/storage/FileStorage.js +3 -0
- package/lib/storage/FileStorage.js.map +1 -0
- package/lib/storage/FileStorageUtilities.d.ts +36 -0
- package/lib/storage/FileStorageUtilities.d.ts.map +1 -0
- package/lib/storage/FileStorageUtilities.js +91 -0
- package/lib/storage/FileStorageUtilities.js.map +1 -0
- package/lib/storage/FileStorageUtilities.spec.d.ts +2 -0
- package/lib/storage/FileStorageUtilities.spec.d.ts.map +1 -0
- package/lib/storage/FileStorageUtilities.spec.js +98 -0
- package/lib/storage/FileStorageUtilities.spec.js.map +1 -0
- package/lib/storage/FileType.d.ts +29 -0
- package/lib/storage/FileType.d.ts.map +1 -0
- package/lib/storage/FileType.js +38 -0
- package/lib/storage/FileType.js.map +1 -0
- package/lib/storage/IndexedDBStorage.d.ts +47 -0
- package/lib/storage/IndexedDBStorage.d.ts.map +1 -0
- package/lib/storage/IndexedDBStorage.js +347 -0
- package/lib/storage/IndexedDBStorage.js.map +1 -0
- package/lib/storage/LocalFileStorage.browser.d.ts +19 -0
- package/lib/storage/LocalFileStorage.browser.d.ts.map +1 -0
- package/lib/storage/LocalFileStorage.browser.js +43 -0
- package/lib/storage/LocalFileStorage.browser.js.map +1 -0
- package/lib/storage/LocalFileStorage.d.ts +23 -0
- package/lib/storage/LocalFileStorage.d.ts.map +1 -0
- package/lib/storage/LocalFileStorage.js +152 -0
- package/lib/storage/LocalFileStorage.js.map +1 -0
- package/lib/storage/LocalFileStorage.spec.d.ts +2 -0
- package/lib/storage/LocalFileStorage.spec.d.ts.map +1 -0
- package/lib/storage/LocalFileStorage.spec.js +249 -0
- package/lib/storage/LocalFileStorage.spec.js.map +1 -0
- package/lib/storage/VirtualFileStorage.d.ts +18 -0
- package/lib/storage/VirtualFileStorage.d.ts.map +1 -0
- package/lib/storage/VirtualFileStorage.js +178 -0
- package/lib/storage/VirtualFileStorage.js.map +1 -0
- package/lib/storage/VirtualFileStorage.spec.d.ts +2 -0
- package/lib/storage/VirtualFileStorage.spec.d.ts.map +1 -0
- package/lib/storage/VirtualFileStorage.spec.js +302 -0
- package/lib/storage/VirtualFileStorage.spec.js.map +1 -0
- package/lib/storage/index.d.ts +6 -0
- package/lib/storage/index.d.ts.map +1 -0
- package/lib/storage/index.js +22 -0
- package/lib/storage/index.js.map +1 -0
- package/lib/templates/templates/csharp/README.md +48 -0
- package/lib/templates/templates/csharp/VectraClient.cs +234 -0
- package/lib/templates/templates/go/README.md +71 -0
- package/lib/templates/templates/go/vectra_client.go +322 -0
- package/lib/templates/templates/java/README.md +81 -0
- package/lib/templates/templates/java/VectraClient.java +232 -0
- package/lib/templates/templates/python/README.md +37 -0
- package/lib/templates/templates/python/vectra_client.py +279 -0
- package/lib/templates/templates/rust/Cargo.toml +14 -0
- package/lib/templates/templates/rust/README.md +39 -0
- package/lib/templates/templates/rust/build.rs +4 -0
- package/lib/templates/templates/rust/lib.rs +284 -0
- package/lib/templates/templates/typescript/README.md +96 -0
- package/lib/templates/templates/typescript/VectraClient.ts +374 -0
- package/lib/templates/typescript/VectraClient.d.ts +114 -0
- package/lib/templates/typescript/VectraClient.d.ts.map +1 -0
- package/lib/templates/typescript/VectraClient.js +328 -0
- package/lib/templates/typescript/VectraClient.js.map +1 -0
- package/lib/types.d.ts +153 -0
- package/lib/types.d.ts.map +1 -0
- package/lib/types.js +3 -0
- package/lib/types.js.map +1 -0
- package/lib/utils/index.d.ts +2 -0
- package/lib/utils/index.d.ts.map +1 -0
- package/lib/utils/index.js +18 -0
- package/lib/utils/index.js.map +1 -0
- package/lib/utils/pathUtils.d.ts +40 -0
- package/lib/utils/pathUtils.d.ts.map +1 -0
- package/lib/utils/pathUtils.js +98 -0
- package/lib/utils/pathUtils.js.map +1 -0
- package/lib/vectra-cli.d.ts +2 -0
- package/lib/vectra-cli.d.ts.map +1 -1
- package/lib/vectra-cli.generate.spec.d.ts +2 -0
- package/lib/vectra-cli.generate.spec.d.ts.map +1 -0
- package/lib/vectra-cli.generate.spec.js +112 -0
- package/lib/vectra-cli.generate.spec.js.map +1 -0
- package/lib/vectra-cli.js +760 -0
- package/lib/vectra-cli.js.map +1 -0
- package/lib/vectra-cli.spec.d.ts +1 -0
- package/lib/vectra-cli.spec.d.ts.map +1 -0
- package/lib/vectra-cli.spec.js +2 -0
- package/lib/vectra-cli.spec.js.map +1 -0
- package/package.json +91 -16
- package/proto/vectra_service.proto +276 -0
- package/src/BrowserWebFetcher.ts +345 -0
- package/src/FileFetcher.spec.ts +234 -0
- package/src/FileFetcher.ts +37 -25
- package/src/FolderWatcher.spec.ts +288 -0
- package/src/FolderWatcher.ts +304 -0
- package/src/GPT3Tokenizer.spec.ts +50 -0
- package/src/ItemSelector.spec.ts +252 -0
- package/src/ItemSelector.ts +163 -150
- package/src/LocalDocument.spec.ts +211 -0
- package/src/LocalDocument.ts +88 -94
- package/src/LocalDocumentIndex.spec.ts +481 -0
- package/src/LocalDocumentIndex.ts +39 -40
- package/src/LocalDocumentResult.spec.ts +373 -0
- package/src/LocalDocumentResult.ts +489 -319
- package/src/LocalEmbeddings.spec.ts +138 -0
- package/src/LocalEmbeddings.ts +120 -0
- package/src/LocalIndex.spec.ts +808 -66
- package/src/LocalIndex.ts +479 -429
- package/src/OpenAIEmbeddings.spec.ts +354 -0
- package/src/OpenAIEmbeddings.ts +26 -27
- package/src/TextSplitter.spec.ts +342 -0
- package/src/TextSplitter.ts +517 -532
- package/src/TransformersEmbeddings.spec.ts +188 -0
- package/src/TransformersEmbeddings.ts +232 -0
- package/src/TransformersTokenizer.spec.ts +143 -0
- package/src/TransformersTokenizer.ts +45 -0
- package/src/WebFetcher.spec.ts +288 -0
- package/src/WebFetcher.ts +184 -186
- package/src/browser.ts +69 -0
- package/src/codecs/IndexCodec.ts +40 -0
- package/src/codecs/JsonCodec.spec.ts +70 -0
- package/src/codecs/JsonCodec.ts +37 -0
- package/src/codecs/LocalIndex.protobuf.spec.ts +115 -0
- package/src/codecs/ProtobufCodec.spec.ts +166 -0
- package/src/codecs/ProtobufCodec.ts +193 -0
- package/src/codecs/index.ts +4 -0
- package/src/codecs/migrateIndex.spec.ts +176 -0
- package/src/codecs/migrateIndex.ts +125 -0
- package/src/codecs/schemas/index.proto +34 -0
- package/src/index.ts +9 -1
- package/src/internals/Colorize.ts +19 -16
- package/src/server/IndexManager.ts +243 -0
- package/src/server/VectraServer.spec.ts +303 -0
- package/src/server/VectraServer.ts +156 -0
- package/src/server/handlers/documentHandlers.ts +59 -0
- package/src/server/handlers/helpers.ts +93 -0
- package/src/server/handlers/index.ts +7 -0
- package/src/server/handlers/indexHandlers.ts +44 -0
- package/src/server/handlers/itemHandlers.ts +140 -0
- package/src/server/handlers/lifecycleHandlers.ts +26 -0
- package/src/server/handlers/queryHandlers.ts +96 -0
- package/src/server/handlers/statsHandlers.ts +38 -0
- package/src/server/index.ts +3 -0
- package/src/storage/FileStorage.ts +105 -0
- package/src/storage/FileStorageUtilities.spec.ts +106 -0
- package/src/storage/FileStorageUtilities.ts +77 -0
- package/src/storage/FileType.ts +61 -0
- package/src/storage/IndexedDBStorage.ts +365 -0
- package/src/storage/LocalFileStorage.browser.ts +52 -0
- package/src/storage/LocalFileStorage.spec.ts +292 -0
- package/src/storage/LocalFileStorage.ts +98 -0
- package/src/storage/VirtualFileStorage.spec.ts +307 -0
- package/src/storage/VirtualFileStorage.ts +169 -0
- package/src/storage/index.ts +5 -0
- package/src/templates/csharp/README.md +48 -0
- package/src/templates/csharp/VectraClient.cs +234 -0
- package/src/templates/go/README.md +71 -0
- package/src/templates/go/vectra_client.go +322 -0
- package/src/templates/java/README.md +81 -0
- package/src/templates/java/VectraClient.java +232 -0
- package/src/templates/python/README.md +37 -0
- package/src/templates/python/vectra_client.py +279 -0
- package/src/templates/rust/Cargo.toml +14 -0
- package/src/templates/rust/README.md +39 -0
- package/src/templates/rust/build.rs +4 -0
- package/src/templates/rust/lib.rs +284 -0
- package/src/templates/typescript/README.md +96 -0
- package/src/templates/typescript/VectraClient.ts +374 -0
- package/src/types.ts +131 -123
- package/src/utils/index.ts +1 -0
- package/src/utils/pathUtils.ts +106 -0
- package/src/vectra-cli.generate.spec.ts +72 -0
- package/src/vectra-cli.spec.ts +0 -0
- package/src/vectra-cli.ts +687 -246
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
import { describe, it } from 'mocha';
|
|
2
|
+
import * as assert from 'node:assert';
|
|
3
|
+
import { TextSplitter } from './TextSplitter';
|
|
4
|
+
|
|
5
|
+
/**
 * Minimal tokenizer used by these tests: one token per Unicode code point.
 * With it, a chunk's token count equals its number of code points.
 */
const charTokenizer = {
    /**
     * Converts text into its sequence of Unicode code points.
     * @param text Text to encode.
     * @returns One number per code point (astral characters yield a single token).
     */
    encode(text: string): number[] {
        // Array.from iterates by code point, so every `ch` is non-empty.
        return Array.from(text, ch => ch.codePointAt(0)!);
    },
    /**
     * Rebuilds a string from code points produced by `encode`.
     * @param tokens Code points to decode.
     * @returns The decoded string ('' for an empty token list).
     */
    decode(tokens: number[]): string {
        // Spreading a very large array into a single call can exceed the engine's
        // argument limit and throw a RangeError, so decode in bounded batches.
        const BATCH_SIZE = 8192;
        let text = '';
        for (let start = 0; start < tokens.length; start += BATCH_SIZE) {
            text += String.fromCodePoint(...tokens.slice(start, start + BATCH_SIZE));
        }
        return text;
    },
};
|
|
13
|
+
|
|
14
|
+
/**
 * Builds a TextSplitter for a test case.
 *
 * Defaults are chunkSize 16, chunkOverlap 0 and the character-level
 * `charTokenizer`; any field supplied in `opts` overrides the default because
 * it is spread last.
 *
 * @param opts Optional partial TextSplitter configuration.
 * @returns A new TextSplitter instance.
 */
const makeSplitter = (opts?: Partial<ConstructorParameters<typeof TextSplitter>[0]>) =>
    new TextSplitter({ chunkSize: 16, chunkOverlap: 0, tokenizer: charTokenizer as any, ...opts });
|
|
16
|
+
|
|
17
|
+
/**
 * Concatenates the `text` of every chunk, in order.
 *
 * @param chunks Chunk-like objects; only their `text` property is read.
 * @param sep String inserted between consecutive chunk texts (defaults to '').
 * @returns The joined text, or '' when `chunks` is empty.
 */
function joinTexts(chunks: ReadonlyArray<{ readonly text: string }>, sep = ''): string {
    return chunks.map(chunk => chunk.text).join(sep);
}
|
|
20
|
+
|
|
21
|
+
describe('TextSplitter - full coverage suite', () => {
|
|
22
|
+
// Constructor argument validation, plus the simplest split() calls
// (default-constructed splitter and empty input).
describe('constructor validation and defaults', () => {
    it('throws when chunkSize < 1', () => {
        assert.throws(() => new TextSplitter({ chunkSize: 0 } as any), /chunkSize must be >= 1/);
    });

    it('throws when chunkOverlap < 0', () => {
        assert.throws(() => new TextSplitter({ chunkSize: 10, chunkOverlap: -1 } as any), /chunkOverlap must be >= 0/);
    });

    it('throws when chunkOverlap > chunkSize', () => {
        assert.throws(() => new TextSplitter({ chunkSize: 5, chunkOverlap: 6 } as any), /chunkOverlap must be <= chunkSize/);
    });

    it('works with default constructor and simple text', () => {
        // No config at all: only checks that some chunks come back.
        const splitter = new TextSplitter();
        const chunks = splitter.split('Hello world');
        assert.ok(Array.isArray(chunks) && chunks.length > 0);
    });

    it('returns [] for empty input', () => {
        const splitter = makeSplitter();
        const chunks = splitter.split('');
        assert.strictEqual(chunks.length, 0);
    });
});
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
// Whitespace-only input yields no chunks, while punctuation-only lines
// ("---", "***", "====") must still appear in the output.
describe('basic splitting, whitespace handling, and punctuation preservation', () => {
    it('drops pure whitespace-only input', () => {
        const splitter = makeSplitter();
        assert.strictEqual(splitter.split(' \t ').length, 0);
        assert.strictEqual(splitter.split('\n\n').length, 0);
        assert.strictEqual(splitter.split(' \n \n ').length, 0);
    });

    it('keeps leading punctuation-only chunk ("---")', () => {
        const splitter = makeSplitter({ chunkSize: 3 });
        const chunks = splitter.split('---');
        assert.deepStrictEqual(chunks.map(c => c.text), ['---']);
    });

    it('keeps punctuation-only separators (---, ***, ====) at start, middle, end', () => {
        const splitter = makeSplitter({ chunkSize: 4 });
        const text = ['---', 'Hello world', '***', 'Middle', '===='].join('\n');
        const chunks = splitter.split(text);
        const got = joinTexts(chunks, '\n');
        assert.ok(got.includes('---'));
        assert.ok(got.includes('***'));
        assert.ok(got.includes('===='));
    });

    it('preserves frontmatter delimiters (---) with small chunkSize and zero overlap', () => {
        const splitter = makeSplitter({ chunkSize: 12 });
        const md = [
            '---',
            'title: Test',
            'tags: [a, b]',
            '---',
            '# Heading',
            'Body text goes here.',
        ].join('\n');

        const chunks = splitter.split(md);
        const joined = joinTexts(chunks, '\n');
        // Count lines that consist of exactly "---" (opening and closing delimiter).
        const delimiterCount = (joined.match(/^---$/gm) ?? []).length;
        assert.strictEqual(delimiterCount, 2);
    });

    it('keeps trailing punctuation-only chunk', () => {
        const splitter = makeSplitter({ chunkSize: 4 });
        const chunks = splitter.split('Content\n---');
        assert.ok(chunks.some(c => c.text.includes('---')));
    });

    it('still returns alphanumeric chunks normally', () => {
        const splitter = makeSplitter({ chunkSize: 5 });
        const chunks = splitter.split('abcde fghij');
        const joined = joinTexts(chunks, ' ');
        assert.ok(joined.includes('abcde'));
        assert.ok(joined.includes('fghij'));
    });

    it('does not regress with non-zero overlap and preserves punctuation-only chunk', () => {
        const splitter = makeSplitter({ chunkSize: 5, chunkOverlap: 2 });
        const chunks = splitter.split('---\nabcdef');
        assert.ok(chunks.some(c => c.text.includes('---')));
    });

    it('handles multiple punctuation-only separators interleaved with content', () => {
        const splitter = makeSplitter({ chunkSize: 8 });
        const text = ['***', 'A', '---', 'B', '====', 'C'].join('\n');
        const chunks = splitter.split(text);

        const joined = joinTexts(chunks, '\n');
        assert.ok(joined.includes('***'));
        assert.ok(joined.includes('---'));
        assert.ok(joined.includes('===='));
        // Single-letter content lines must survive between the separators.
        assert.ok(joined.includes('\nA\n'));
        assert.ok(joined.includes('\nB\n'));
        assert.ok(joined.includes('\nC'));
    });
});
|
|
124
|
+
|
|
125
|
+
// How the keepSeparators option affects chunk text, and when adjacent pieces
// are merged into one chunk.
describe('keepSeparators vs joiner behavior and merging', () => {
    it('when keepSeparators=false, merges adjacent content with a space joiner across newline separator', () => {
        const splitter = makeSplitter({ keepSeparators: false, chunkSize: 50 });
        const text = 'Hello\nWorld';
        const chunks = splitter.split(text);
        // Should merge into a single chunk with a space instead of newline
        assert.strictEqual(chunks.length, 1);
        assert.strictEqual(chunks[0].text, 'Hello World');
        // startPos/endPos should reflect original indices
        assert.strictEqual(chunks[0].startPos, 0);
        assert.strictEqual(chunks[0].endPos, text.length - 1);
    });

    it('when keepSeparators=true, retains separators inline', () => {
        const splitter = makeSplitter({ keepSeparators: true, chunkSize: 50 });
        const text = 'Hello\nWorld\n';
        const chunks = splitter.split(text);
        const joined = joinTexts(chunks, '');
        // Newlines should be preserved in output
        assert.ok(joined.includes('Hello\n'));
        assert.ok(joined.includes('World\n'));
    });

    it('does not merge when token budget would be exceeded', () => {
        // Very small chunkSize to force non-merge
        const splitter = makeSplitter({ chunkSize: 3, keepSeparators: false });
        const text = 'ABCD\nEFG';
        const chunks = splitter.split(text);
        // Expect multiple chunks due to budget
        assert.ok(chunks.length >= 2);
    });
});
|
|
157
|
+
|
|
158
|
+
// Caller-supplied `separators`, alone and combined with `docType`, under both
// keepSeparators settings.
describe('custom separators and precedence over docType', () => {
    it('custom separators override docType separators (keepSeparators=false)', () => {
        const splitter = makeSplitter({ docType: 'markdown', separators: ['||'], keepSeparators: false });
        const chunks = splitter.split('A||B||C');
        const joined = joinTexts(chunks, '');
        assert.ok(!joined.includes('||'), 'custom separator should be removed');
        assert.ok(joined.includes('ABC'));
    });

    it('custom separators can be preserved with keepSeparators=true', () => {
        const splitter = makeSplitter({ separators: ['||'], keepSeparators: true });
        const chunks = splitter.split('A||B||C');
        const joined = joinTexts(chunks, '');
        assert.ok(joined.includes('A||'));
        assert.ok(joined.includes('B||'));
        assert.ok(joined.endsWith('C'));
    });

    it('inline separators with no surrounding whitespace are preserved/dropped based on config', () => {
        const inline = '---Hello---World---';

        // keepSeparators=true: the whole input must be reconstructible.
        const keep = makeSplitter({ keepSeparators: true, chunkSize: 64 });
        const kept = joinTexts(keep.split(inline), '');
        assert.ok(kept.includes('---Hello---World---'));

        // keepSeparators=false: the "---" runs disappear, the words remain.
        const drop = makeSplitter({ keepSeparators: false, chunkSize: 64 });
        const dropped = joinTexts(drop.split(inline), '');
        assert.ok(!dropped.includes('---'));
        assert.ok(dropped.includes('Hello') && dropped.includes('World'));
    });

    it('multiple consecutive separators do not produce empty chunks', () => {
        const splitter = makeSplitter({ keepSeparators: true, chunkSize: 64 });
        const text = ['---', '***', '===='].join('\n');
        const chunks = splitter.split(text);
        // Checks that at least three chunks exist and each separator shows up in some chunk.
        assert.ok(chunks.length >= 3);
        assert.ok(chunks.some(c => c.text.includes('---')));
        assert.ok(chunks.some(c => c.text.includes('***')));
        assert.ok(chunks.some(c => c.text.includes('====')));
    });
});
|
|
199
|
+
|
|
200
|
+
// Splitting with a single-space separator only, for input that fits in one
// chunk and for input that does not.
describe('splitBySpaces coverage (sep === " ")', () => {
    it('token window path: tokens.length <= chunkSize triggers single part then break', () => {
        const splitter = makeSplitter({ separators: [' '], chunkSize: 100 });
        const text = 'a b c';
        const chunks = splitter.split(text);
        // With large chunkSize and only space separator, it should come out as one chunk
        assert.strictEqual(chunks.length, 1);
        assert.strictEqual(chunks[0].text, text);
    });

    it('token window path: tokens.length > chunkSize triggers multiple parts', () => {
        const splitter = makeSplitter({ separators: [' '], chunkSize: 4 }); // small window
        const text = 'abcdefghij'; // 10 chars -> 3 parts
        const chunks = splitter.split(text);
        assert.ok(chunks.length >= 3, 'should produce multiple chunks from token windows');
        // Concatenating the chunks must give back the input unchanged.
        assert.strictEqual(joinTexts(chunks, ''), text);
    });
});
|
|
218
|
+
|
|
219
|
+
// Input that contains none of the configured separators (or an empty
// separator list).
describe('base case path (no separators) and cutting in half', () => {
    it('with separators: [] uses cut-in-half strategy and merges back under budget', () => {
        const splitter = makeSplitter({ separators: [], chunkSize: 100, keepSeparators: false });
        const text = 'HelloWorld';
        const chunks = splitter.split(text);
        // Expect merged single chunk but with a space joiner introduced by combineChunks
        assert.strictEqual(chunks.length, 1);
        // Whitespace is stripped before comparing, so an inserted joiner is tolerated.
        assert.strictEqual(chunks[0].text.replace(/\s/g, ''), text);
    });

    it('optimization branch: chunk.length/6 > chunkSize triggers deeper recursion', () => {
        const splitter = makeSplitter({ separators: ['\n'], chunkSize: 1 });
        const text = 'x'.repeat(20); // 20/6 > 1 -> triggers optimization
        const chunks = splitter.split(text);
        assert.ok(chunks.length > 1, 'should split into multiple chunks under heavy optimization');
        assert.strictEqual(joinTexts(chunks, ''), text);
    });
});
|
|
237
|
+
|
|
238
|
+
// Checks the startOverlap / endOverlap token arrays attached to each chunk
// when chunkOverlap > 0.
describe('overlap behavior', () => {
    it('adds startOverlap and endOverlap tokens between adjacent chunks', () => {
        const chunkSize = 3;
        const overlap = 2;
        const splitter = makeSplitter({ chunkSize, chunkOverlap: overlap, separators: [], keepSeparators: false });
        const text = 'abcdefghi'; // will be split by halves recursively then sized by budget

        const chunks = splitter.split(text);
        assert.ok(chunks.length >= 2);

        // Starts at 1 because every iteration compares against the previous chunk;
        // the first chunk's own overlaps are therefore not asserted here.
        for (let i = 1; i < chunks.length; i++) {
            const prev = chunks[i - 1];
            const curr = chunks[i];
            const next = chunks[i + 1]; // undefined for the last chunk

            // startOverlap should match last 'overlap' tokens of prev
            const prevTail = prev.tokens.slice(-overlap);
            assert.deepStrictEqual(curr.startOverlap, prevTail.slice(0, curr.startOverlap.length));

            // endOverlap should match first 'overlap' tokens of next (or [])
            if (next) {
                const nextHead = next.tokens.slice(0, overlap);
                assert.deepStrictEqual(curr.endOverlap, nextHead.slice(0, curr.endOverlap.length));
            } else {
                assert.deepStrictEqual(curr.endOverlap, []);
            }
        }
    });

    it('handles maximal overlap (chunkOverlap = chunkSize)', () => {
        const chunkSize = 4;
        const overlap = 4;
        const splitter = makeSplitter({ chunkSize, chunkOverlap: overlap, separators: [], keepSeparators: false });
        const text = 'abcdefgh';
        const chunks = splitter.split(text);
        assert.ok(chunks.length >= 2);
        for (let i = 1; i < chunks.length; i++) {
            const prevTail = chunks[i - 1].tokens.slice(-overlap);
            assert.deepStrictEqual(chunks[i].startOverlap, prevTail.slice(0, chunks[i].startOverlap.length));
        }
    });
});
|
|
280
|
+
|
|
281
|
+
// Smoke test: for each listed docType (and for no docType at all) a splitter
// can be constructed and returns at least one chunk for a short input.
describe('docType-specific separators coverage', () => {
    const docTypes = [
        'cpp',
        'go',
        'typescript', // covers java/c#/cs/ts/tsx/typescript grouped case
        'javascript',
        'php',
        'proto',
        'python',
        'rst',
        'ruby',
        'rust',
        'scala',
        'swift',
        'markdown',
        'latex',
        'html',
        'sol',
        // default/fallback
        '__default__',
    ];

    // One generated test per entry above.
    for (const dt of docTypes) {
        it(`constructs with docType: ${dt}`, () => {
            // '__default__' is a sentinel meaning "pass no docType option".
            const opts = dt === '__default__' ? {} : { docType: dt };
            const splitter = new TextSplitter({ chunkSize: 64, chunkOverlap: 0, tokenizer: charTokenizer as any, ...(opts as any) });
            const chunks = splitter.split('sample');
            assert.ok(chunks.length >= 1);
        });
    }
});
|
|
312
|
+
|
|
313
|
+
// Verifies that chunking is driven by the injected tokenizer's token counts
// rather than by raw character counts.
describe('tokenizer injection behavior', () => {
    it('respects a custom tokenizer that alters token counts', () => {
        // Tokenizer that treats every character as two tokens (simulate heavier tokenization)
        const heavyTokenizer = {
            encode(text: string) {
                const out: number[] = [];
                for (const c of Array.from(text)) {
                    const code = c.codePointAt(0)!;
                    out.push(code, code + 1); // 2 tokens per char
                }
                return out;
            },
            decode(tokens: number[]) {
                // Decode by taking every other token
                const chars: number[] = [];
                for (let i = 0; i < tokens.length; i += 2) {
                    chars.push(tokens[i]);
                }
                return String.fromCodePoint(...chars);
            },
        };

        const splitter = new TextSplitter({ chunkSize: 6, chunkOverlap: 0, tokenizer: heavyTokenizer as any });
        const text = 'abcdef'; // 6 chars -> 12 tokens -> will require splitting
        const chunks = splitter.split(text);
        assert.ok(chunks.length > 1, 'heavier tokenization should force more chunks');
        assert.strictEqual(joinTexts(chunks, ''), text);
    });
});
|
|
342
|
+
});
|