@juspay/neurolink 9.1.1 → 9.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/README.md +106 -37
- package/dist/agent/directTools.d.ts +11 -11
- package/dist/cli/commands/config.d.ts +6 -6
- package/dist/cli/commands/rag.d.ts +19 -0
- package/dist/cli/commands/rag.js +756 -0
- package/dist/cli/factories/commandFactory.js +146 -83
- package/dist/cli/parser.js +4 -1
- package/dist/core/baseProvider.d.ts +43 -30
- package/dist/core/baseProvider.js +98 -138
- package/dist/core/conversationMemoryFactory.d.ts +2 -2
- package/dist/core/conversationMemoryFactory.js +2 -2
- package/dist/core/conversationMemoryInitializer.d.ts +1 -2
- package/dist/core/conversationMemoryInitializer.js +2 -2
- package/dist/core/infrastructure/baseError.d.ts +21 -0
- package/dist/core/infrastructure/baseError.js +22 -0
- package/dist/core/infrastructure/baseFactory.d.ts +21 -0
- package/dist/core/infrastructure/baseFactory.js +54 -0
- package/dist/core/infrastructure/baseRegistry.d.ts +21 -0
- package/dist/core/infrastructure/baseRegistry.js +49 -0
- package/dist/core/infrastructure/index.d.ts +5 -0
- package/dist/core/infrastructure/index.js +5 -0
- package/dist/core/infrastructure/retry.d.ts +7 -0
- package/dist/core/infrastructure/retry.js +20 -0
- package/dist/core/infrastructure/typedEventEmitter.d.ts +8 -0
- package/dist/core/infrastructure/typedEventEmitter.js +23 -0
- package/dist/core/redisConversationMemoryManager.d.ts +1 -6
- package/dist/core/redisConversationMemoryManager.js +7 -19
- package/dist/factories/providerFactory.d.ts +5 -3
- package/dist/factories/providerFactory.js +31 -24
- package/dist/image-gen/ImageGenService.d.ts +143 -0
- package/dist/image-gen/ImageGenService.js +345 -0
- package/dist/image-gen/imageGenTools.d.ts +126 -0
- package/dist/image-gen/imageGenTools.js +304 -0
- package/dist/image-gen/index.d.ts +46 -0
- package/dist/image-gen/index.js +48 -0
- package/dist/image-gen/types.d.ts +237 -0
- package/dist/image-gen/types.js +24 -0
- package/dist/index.d.ts +46 -12
- package/dist/index.js +88 -36
- package/dist/lib/agent/directTools.d.ts +8 -8
- package/dist/lib/core/baseProvider.d.ts +43 -30
- package/dist/lib/core/baseProvider.js +98 -138
- package/dist/lib/core/conversationMemoryFactory.d.ts +2 -2
- package/dist/lib/core/conversationMemoryFactory.js +2 -2
- package/dist/lib/core/conversationMemoryInitializer.d.ts +1 -2
- package/dist/lib/core/conversationMemoryInitializer.js +2 -2
- package/dist/lib/core/infrastructure/baseError.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseError.js +23 -0
- package/dist/lib/core/infrastructure/baseFactory.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseFactory.js +55 -0
- package/dist/lib/core/infrastructure/baseRegistry.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseRegistry.js +50 -0
- package/dist/lib/core/infrastructure/index.d.ts +5 -0
- package/dist/lib/core/infrastructure/index.js +6 -0
- package/dist/lib/core/infrastructure/retry.d.ts +7 -0
- package/dist/lib/core/infrastructure/retry.js +21 -0
- package/dist/lib/core/infrastructure/typedEventEmitter.d.ts +8 -0
- package/dist/lib/core/infrastructure/typedEventEmitter.js +24 -0
- package/dist/lib/core/redisConversationMemoryManager.d.ts +1 -6
- package/dist/lib/core/redisConversationMemoryManager.js +7 -19
- package/dist/lib/factories/providerFactory.d.ts +5 -3
- package/dist/lib/factories/providerFactory.js +31 -24
- package/dist/lib/image-gen/ImageGenService.d.ts +143 -0
- package/dist/lib/image-gen/ImageGenService.js +346 -0
- package/dist/lib/image-gen/imageGenTools.d.ts +126 -0
- package/dist/lib/image-gen/imageGenTools.js +305 -0
- package/dist/lib/image-gen/index.d.ts +46 -0
- package/dist/lib/image-gen/index.js +49 -0
- package/dist/lib/image-gen/types.d.ts +237 -0
- package/dist/lib/image-gen/types.js +25 -0
- package/dist/lib/index.d.ts +46 -12
- package/dist/lib/index.js +88 -36
- package/dist/lib/mcp/index.d.ts +6 -5
- package/dist/lib/mcp/index.js +7 -5
- package/dist/lib/neurolink.d.ts +11 -13
- package/dist/lib/neurolink.js +95 -29
- package/dist/lib/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/lib/processors/base/BaseFileProcessor.js +614 -0
- package/dist/lib/processors/base/index.d.ts +14 -0
- package/dist/lib/processors/base/index.js +20 -0
- package/dist/lib/processors/base/types.d.ts +593 -0
- package/dist/lib/processors/base/types.js +77 -0
- package/dist/lib/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/lib/processors/cli/fileProcessorCli.js +389 -0
- package/dist/lib/processors/cli/index.d.ts +37 -0
- package/dist/lib/processors/cli/index.js +50 -0
- package/dist/lib/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/lib/processors/code/ConfigProcessor.js +401 -0
- package/dist/lib/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/lib/processors/code/SourceCodeProcessor.js +305 -0
- package/dist/lib/processors/code/index.d.ts +44 -0
- package/dist/lib/processors/code/index.js +61 -0
- package/dist/lib/processors/config/fileTypes.d.ts +283 -0
- package/dist/lib/processors/config/fileTypes.js +521 -0
- package/dist/lib/processors/config/index.d.ts +32 -0
- package/dist/lib/processors/config/index.js +93 -0
- package/dist/lib/processors/config/languageMap.d.ts +66 -0
- package/dist/lib/processors/config/languageMap.js +411 -0
- package/dist/lib/processors/config/mimeTypes.d.ts +376 -0
- package/dist/lib/processors/config/mimeTypes.js +339 -0
- package/dist/lib/processors/config/sizeLimits.d.ts +194 -0
- package/dist/lib/processors/config/sizeLimits.js +247 -0
- package/dist/lib/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/lib/processors/data/JsonProcessor.js +204 -0
- package/dist/lib/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/lib/processors/data/XmlProcessor.js +284 -0
- package/dist/lib/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/lib/processors/data/YamlProcessor.js +295 -0
- package/dist/lib/processors/data/index.d.ts +49 -0
- package/dist/lib/processors/data/index.js +77 -0
- package/dist/lib/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/lib/processors/document/ExcelProcessor.js +520 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.js +211 -0
- package/dist/lib/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/lib/processors/document/RtfProcessor.js +362 -0
- package/dist/lib/processors/document/WordProcessor.d.ts +168 -0
- package/dist/lib/processors/document/WordProcessor.js +354 -0
- package/dist/lib/processors/document/index.d.ts +54 -0
- package/dist/lib/processors/document/index.js +91 -0
- package/dist/lib/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/lib/processors/errors/FileErrorCode.js +256 -0
- package/dist/lib/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/lib/processors/errors/errorHelpers.js +379 -0
- package/dist/lib/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/lib/processors/errors/errorSerializer.js +508 -0
- package/dist/lib/processors/errors/index.d.ts +46 -0
- package/dist/lib/processors/errors/index.js +50 -0
- package/dist/lib/processors/index.d.ts +76 -0
- package/dist/lib/processors/index.js +113 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.js +273 -0
- package/dist/lib/processors/integration/index.d.ts +42 -0
- package/dist/lib/processors/integration/index.js +45 -0
- package/dist/lib/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/lib/processors/markup/HtmlProcessor.js +250 -0
- package/dist/lib/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/lib/processors/markup/MarkdownProcessor.js +245 -0
- package/dist/lib/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/lib/processors/markup/SvgProcessor.js +241 -0
- package/dist/lib/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/lib/processors/markup/TextProcessor.js +189 -0
- package/dist/lib/processors/markup/index.d.ts +66 -0
- package/dist/lib/processors/markup/index.js +103 -0
- package/dist/lib/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/lib/processors/registry/ProcessorRegistry.js +609 -0
- package/dist/lib/processors/registry/index.d.ts +12 -0
- package/dist/lib/processors/registry/index.js +17 -0
- package/dist/lib/processors/registry/types.d.ts +53 -0
- package/dist/lib/processors/registry/types.js +11 -0
- package/dist/lib/providers/amazonBedrock.d.ts +15 -2
- package/dist/lib/providers/amazonBedrock.js +65 -8
- package/dist/lib/providers/anthropic.d.ts +3 -3
- package/dist/lib/providers/anthropic.js +10 -7
- package/dist/lib/providers/googleAiStudio.d.ts +5 -5
- package/dist/lib/providers/googleAiStudio.js +10 -7
- package/dist/lib/providers/googleVertex.d.ts +16 -4
- package/dist/lib/providers/googleVertex.js +72 -16
- package/dist/lib/providers/litellm.d.ts +3 -3
- package/dist/lib/providers/litellm.js +10 -10
- package/dist/lib/providers/mistral.d.ts +3 -3
- package/dist/lib/providers/mistral.js +7 -6
- package/dist/lib/providers/ollama.d.ts +3 -4
- package/dist/lib/providers/ollama.js +7 -8
- package/dist/lib/providers/openAI.d.ts +14 -2
- package/dist/lib/providers/openAI.js +60 -6
- package/dist/lib/providers/openRouter.d.ts +2 -2
- package/dist/lib/providers/openRouter.js +10 -6
- package/dist/lib/providers/sagemaker/language-model.d.ts +2 -2
- package/dist/lib/rag/ChunkerFactory.d.ts +91 -0
- package/dist/lib/rag/ChunkerFactory.js +321 -0
- package/dist/lib/rag/ChunkerRegistry.d.ts +91 -0
- package/dist/lib/rag/ChunkerRegistry.js +422 -0
- package/dist/lib/rag/chunkers/BaseChunker.d.ts +53 -0
- package/dist/lib/rag/chunkers/BaseChunker.js +144 -0
- package/dist/lib/rag/chunkers/CharacterChunker.d.ts +18 -0
- package/dist/lib/rag/chunkers/CharacterChunker.js +29 -0
- package/dist/lib/rag/chunkers/HTMLChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/HTMLChunker.js +39 -0
- package/dist/lib/rag/chunkers/JSONChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/JSONChunker.js +69 -0
- package/dist/lib/rag/chunkers/LaTeXChunker.d.ts +15 -0
- package/dist/lib/rag/chunkers/LaTeXChunker.js +64 -0
- package/dist/lib/rag/chunkers/MarkdownChunker.d.ts +15 -0
- package/dist/lib/rag/chunkers/MarkdownChunker.js +103 -0
- package/dist/lib/rag/chunkers/RecursiveChunker.d.ts +27 -0
- package/dist/lib/rag/chunkers/RecursiveChunker.js +140 -0
- package/dist/lib/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
- package/dist/lib/rag/chunkers/SemanticMarkdownChunker.js +139 -0
- package/dist/lib/rag/chunkers/SentenceChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/SentenceChunker.js +67 -0
- package/dist/lib/rag/chunkers/TokenChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/TokenChunker.js +62 -0
- package/dist/lib/rag/chunkers/index.d.ts +15 -0
- package/dist/lib/rag/chunkers/index.js +16 -0
- package/dist/lib/rag/chunking/characterChunker.d.ts +16 -0
- package/dist/lib/rag/chunking/characterChunker.js +143 -0
- package/dist/lib/rag/chunking/chunkerRegistry.d.ts +67 -0
- package/dist/lib/rag/chunking/chunkerRegistry.js +195 -0
- package/dist/lib/rag/chunking/htmlChunker.d.ts +34 -0
- package/dist/lib/rag/chunking/htmlChunker.js +248 -0
- package/dist/lib/rag/chunking/index.d.ts +15 -0
- package/dist/lib/rag/chunking/index.js +18 -0
- package/dist/lib/rag/chunking/jsonChunker.d.ts +20 -0
- package/dist/lib/rag/chunking/jsonChunker.js +282 -0
- package/dist/lib/rag/chunking/latexChunker.d.ts +26 -0
- package/dist/lib/rag/chunking/latexChunker.js +252 -0
- package/dist/lib/rag/chunking/markdownChunker.d.ts +19 -0
- package/dist/lib/rag/chunking/markdownChunker.js +202 -0
- package/dist/lib/rag/chunking/recursiveChunker.d.ts +19 -0
- package/dist/lib/rag/chunking/recursiveChunker.js +149 -0
- package/dist/lib/rag/chunking/semanticChunker.d.ts +41 -0
- package/dist/lib/rag/chunking/semanticChunker.js +307 -0
- package/dist/lib/rag/chunking/sentenceChunker.d.ts +25 -0
- package/dist/lib/rag/chunking/sentenceChunker.js +231 -0
- package/dist/lib/rag/chunking/tokenChunker.d.ts +36 -0
- package/dist/lib/rag/chunking/tokenChunker.js +184 -0
- package/dist/lib/rag/document/MDocument.d.ts +198 -0
- package/dist/lib/rag/document/MDocument.js +393 -0
- package/dist/lib/rag/document/index.d.ts +5 -0
- package/dist/lib/rag/document/index.js +6 -0
- package/dist/lib/rag/document/loaders.d.ts +201 -0
- package/dist/lib/rag/document/loaders.js +501 -0
- package/dist/lib/rag/errors/RAGError.d.ts +244 -0
- package/dist/lib/rag/errors/RAGError.js +275 -0
- package/dist/lib/rag/errors/index.d.ts +6 -0
- package/dist/lib/rag/errors/index.js +7 -0
- package/dist/lib/rag/graphRag/graphRAG.d.ts +115 -0
- package/dist/lib/rag/graphRag/graphRAG.js +385 -0
- package/dist/lib/rag/graphRag/index.d.ts +4 -0
- package/dist/lib/rag/graphRag/index.js +5 -0
- package/dist/lib/rag/index.d.ts +103 -0
- package/dist/lib/rag/index.js +142 -0
- package/dist/lib/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
- package/dist/lib/rag/metadata/MetadataExtractorFactory.js +419 -0
- package/dist/lib/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
- package/dist/lib/rag/metadata/MetadataExtractorRegistry.js +363 -0
- package/dist/lib/rag/metadata/index.d.ts +6 -0
- package/dist/lib/rag/metadata/index.js +10 -0
- package/dist/lib/rag/metadata/metadataExtractor.d.ts +69 -0
- package/dist/lib/rag/metadata/metadataExtractor.js +278 -0
- package/dist/lib/rag/pipeline/RAGPipeline.d.ts +235 -0
- package/dist/lib/rag/pipeline/RAGPipeline.js +402 -0
- package/dist/lib/rag/pipeline/contextAssembly.d.ts +126 -0
- package/dist/lib/rag/pipeline/contextAssembly.js +338 -0
- package/dist/lib/rag/pipeline/index.d.ts +5 -0
- package/dist/lib/rag/pipeline/index.js +6 -0
- package/dist/lib/rag/ragIntegration.d.ts +38 -0
- package/dist/lib/rag/ragIntegration.js +212 -0
- package/dist/lib/rag/reranker/RerankerFactory.d.ts +184 -0
- package/dist/lib/rag/reranker/RerankerFactory.js +431 -0
- package/dist/lib/rag/reranker/RerankerRegistry.d.ts +119 -0
- package/dist/lib/rag/reranker/RerankerRegistry.js +403 -0
- package/dist/lib/rag/reranker/index.d.ts +6 -0
- package/dist/lib/rag/reranker/index.js +10 -0
- package/dist/lib/rag/reranker/reranker.d.ts +71 -0
- package/dist/lib/rag/reranker/reranker.js +278 -0
- package/dist/lib/rag/resilience/CircuitBreaker.d.ts +215 -0
- package/dist/lib/rag/resilience/CircuitBreaker.js +432 -0
- package/dist/lib/rag/resilience/RetryHandler.d.ts +115 -0
- package/dist/lib/rag/resilience/RetryHandler.js +301 -0
- package/dist/lib/rag/resilience/index.d.ts +7 -0
- package/dist/lib/rag/resilience/index.js +8 -0
- package/dist/lib/rag/retrieval/hybridSearch.d.ts +94 -0
- package/dist/lib/rag/retrieval/hybridSearch.js +314 -0
- package/dist/lib/rag/retrieval/index.d.ts +5 -0
- package/dist/lib/rag/retrieval/index.js +6 -0
- package/dist/lib/rag/retrieval/vectorQueryTool.d.ts +93 -0
- package/dist/lib/rag/retrieval/vectorQueryTool.js +290 -0
- package/dist/lib/rag/types.d.ts +768 -0
- package/dist/lib/rag/types.js +9 -0
- package/dist/lib/server/index.d.ts +15 -11
- package/dist/lib/server/index.js +55 -51
- package/dist/lib/server/utils/validation.d.ts +2 -2
- package/dist/lib/types/common.d.ts +0 -1
- package/dist/lib/types/fileTypes.d.ts +1 -1
- package/dist/lib/types/generateTypes.d.ts +42 -8
- package/dist/lib/types/generateTypes.js +1 -1
- package/dist/lib/types/index.d.ts +25 -24
- package/dist/lib/types/index.js +21 -20
- package/dist/lib/types/modelTypes.d.ts +16 -16
- package/dist/lib/types/pptTypes.d.ts +14 -2
- package/dist/lib/types/pptTypes.js +16 -0
- package/dist/lib/types/streamTypes.d.ts +28 -8
- package/dist/lib/types/streamTypes.js +1 -1
- package/dist/lib/utils/async/delay.d.ts +40 -0
- package/dist/lib/utils/async/delay.js +43 -0
- package/dist/lib/utils/async/index.d.ts +23 -0
- package/dist/lib/utils/async/index.js +24 -0
- package/dist/lib/utils/async/retry.d.ts +141 -0
- package/dist/lib/utils/async/retry.js +172 -0
- package/dist/lib/utils/async/withTimeout.d.ts +73 -0
- package/dist/lib/utils/async/withTimeout.js +97 -0
- package/dist/lib/utils/fileDetector.d.ts +7 -1
- package/dist/lib/utils/fileDetector.js +91 -18
- package/dist/lib/utils/json/extract.d.ts +103 -0
- package/dist/lib/utils/json/extract.js +249 -0
- package/dist/lib/utils/json/index.d.ts +36 -0
- package/dist/lib/utils/json/index.js +37 -0
- package/dist/lib/utils/json/safeParse.d.ts +137 -0
- package/dist/lib/utils/json/safeParse.js +191 -0
- package/dist/lib/utils/messageBuilder.d.ts +2 -2
- package/dist/lib/utils/messageBuilder.js +15 -7
- package/dist/lib/utils/modelRouter.d.ts +4 -4
- package/dist/lib/utils/modelRouter.js +4 -4
- package/dist/lib/utils/sanitizers/filename.d.ts +137 -0
- package/dist/lib/utils/sanitizers/filename.js +366 -0
- package/dist/lib/utils/sanitizers/html.d.ts +170 -0
- package/dist/lib/utils/sanitizers/html.js +326 -0
- package/dist/lib/utils/sanitizers/index.d.ts +26 -0
- package/dist/lib/utils/sanitizers/index.js +30 -0
- package/dist/lib/utils/sanitizers/svg.d.ts +81 -0
- package/dist/lib/utils/sanitizers/svg.js +483 -0
- package/dist/mcp/index.d.ts +6 -5
- package/dist/mcp/index.js +7 -5
- package/dist/neurolink.d.ts +11 -13
- package/dist/neurolink.js +95 -29
- package/dist/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/processors/base/BaseFileProcessor.js +613 -0
- package/dist/processors/base/index.d.ts +14 -0
- package/dist/processors/base/index.js +19 -0
- package/dist/processors/base/types.d.ts +593 -0
- package/dist/processors/base/types.js +76 -0
- package/dist/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/processors/cli/fileProcessorCli.js +388 -0
- package/dist/processors/cli/index.d.ts +37 -0
- package/dist/processors/cli/index.js +49 -0
- package/dist/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/processors/code/ConfigProcessor.js +400 -0
- package/dist/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/processors/code/SourceCodeProcessor.js +304 -0
- package/dist/processors/code/index.d.ts +44 -0
- package/dist/processors/code/index.js +60 -0
- package/dist/processors/config/fileTypes.d.ts +283 -0
- package/dist/processors/config/fileTypes.js +520 -0
- package/dist/processors/config/index.d.ts +32 -0
- package/dist/processors/config/index.js +92 -0
- package/dist/processors/config/languageMap.d.ts +66 -0
- package/dist/processors/config/languageMap.js +410 -0
- package/dist/processors/config/mimeTypes.d.ts +376 -0
- package/dist/processors/config/mimeTypes.js +338 -0
- package/dist/processors/config/sizeLimits.d.ts +194 -0
- package/dist/processors/config/sizeLimits.js +246 -0
- package/dist/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/processors/data/JsonProcessor.js +203 -0
- package/dist/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/processors/data/XmlProcessor.js +283 -0
- package/dist/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/processors/data/YamlProcessor.js +294 -0
- package/dist/processors/data/index.d.ts +49 -0
- package/dist/processors/data/index.js +76 -0
- package/dist/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/processors/document/ExcelProcessor.js +519 -0
- package/dist/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/processors/document/OpenDocumentProcessor.js +210 -0
- package/dist/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/processors/document/RtfProcessor.js +361 -0
- package/dist/processors/document/WordProcessor.d.ts +168 -0
- package/dist/processors/document/WordProcessor.js +353 -0
- package/dist/processors/document/index.d.ts +54 -0
- package/dist/processors/document/index.js +90 -0
- package/dist/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/processors/errors/FileErrorCode.js +255 -0
- package/dist/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/processors/errors/errorHelpers.js +378 -0
- package/dist/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/processors/errors/errorSerializer.js +507 -0
- package/dist/processors/errors/index.d.ts +46 -0
- package/dist/processors/errors/index.js +49 -0
- package/dist/processors/index.d.ts +76 -0
- package/dist/processors/index.js +112 -0
- package/dist/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/processors/integration/FileProcessorIntegration.js +272 -0
- package/dist/processors/integration/index.d.ts +42 -0
- package/dist/processors/integration/index.js +44 -0
- package/dist/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/processors/markup/HtmlProcessor.js +249 -0
- package/dist/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/processors/markup/MarkdownProcessor.js +244 -0
- package/dist/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/processors/markup/SvgProcessor.js +240 -0
- package/dist/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/processors/markup/TextProcessor.js +188 -0
- package/dist/processors/markup/index.d.ts +66 -0
- package/dist/processors/markup/index.js +102 -0
- package/dist/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/processors/registry/ProcessorRegistry.js +608 -0
- package/dist/processors/registry/index.d.ts +12 -0
- package/dist/processors/registry/index.js +16 -0
- package/dist/processors/registry/types.d.ts +53 -0
- package/dist/processors/registry/types.js +10 -0
- package/dist/providers/amazonBedrock.d.ts +15 -2
- package/dist/providers/amazonBedrock.js +65 -8
- package/dist/providers/anthropic.d.ts +3 -3
- package/dist/providers/anthropic.js +10 -7
- package/dist/providers/googleAiStudio.d.ts +5 -5
- package/dist/providers/googleAiStudio.js +10 -7
- package/dist/providers/googleVertex.d.ts +16 -4
- package/dist/providers/googleVertex.js +72 -16
- package/dist/providers/litellm.d.ts +3 -3
- package/dist/providers/litellm.js +10 -10
- package/dist/providers/mistral.d.ts +3 -3
- package/dist/providers/mistral.js +7 -6
- package/dist/providers/ollama.d.ts +3 -4
- package/dist/providers/ollama.js +7 -8
- package/dist/providers/openAI.d.ts +14 -2
- package/dist/providers/openAI.js +60 -6
- package/dist/providers/openRouter.d.ts +2 -2
- package/dist/providers/openRouter.js +10 -6
- package/dist/rag/ChunkerFactory.d.ts +91 -0
- package/dist/rag/ChunkerFactory.js +320 -0
- package/dist/rag/ChunkerRegistry.d.ts +91 -0
- package/dist/rag/ChunkerRegistry.js +421 -0
- package/dist/rag/chunkers/BaseChunker.d.ts +53 -0
- package/dist/rag/chunkers/BaseChunker.js +143 -0
- package/dist/rag/chunkers/CharacterChunker.d.ts +18 -0
- package/dist/rag/chunkers/CharacterChunker.js +28 -0
- package/dist/rag/chunkers/HTMLChunker.d.ts +19 -0
- package/dist/rag/chunkers/HTMLChunker.js +38 -0
- package/dist/rag/chunkers/JSONChunker.d.ts +19 -0
- package/dist/rag/chunkers/JSONChunker.js +68 -0
- package/dist/rag/chunkers/LaTeXChunker.d.ts +15 -0
- package/dist/rag/chunkers/LaTeXChunker.js +63 -0
- package/dist/rag/chunkers/MarkdownChunker.d.ts +15 -0
- package/dist/rag/chunkers/MarkdownChunker.js +102 -0
- package/dist/rag/chunkers/RecursiveChunker.d.ts +27 -0
- package/dist/rag/chunkers/RecursiveChunker.js +139 -0
- package/dist/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
- package/dist/rag/chunkers/SemanticMarkdownChunker.js +138 -0
- package/dist/rag/chunkers/SentenceChunker.d.ts +19 -0
- package/dist/rag/chunkers/SentenceChunker.js +66 -0
- package/dist/rag/chunkers/TokenChunker.d.ts +19 -0
- package/dist/rag/chunkers/TokenChunker.js +61 -0
- package/dist/rag/chunkers/index.d.ts +15 -0
- package/dist/rag/chunkers/index.js +15 -0
- package/dist/rag/chunking/characterChunker.d.ts +16 -0
- package/dist/rag/chunking/characterChunker.js +142 -0
- package/dist/rag/chunking/chunkerRegistry.d.ts +67 -0
- package/dist/rag/chunking/chunkerRegistry.js +194 -0
- package/dist/rag/chunking/htmlChunker.d.ts +34 -0
- package/dist/rag/chunking/htmlChunker.js +247 -0
- package/dist/rag/chunking/index.d.ts +15 -0
- package/dist/rag/chunking/index.js +17 -0
- package/dist/rag/chunking/jsonChunker.d.ts +20 -0
- package/dist/rag/chunking/jsonChunker.js +281 -0
- package/dist/rag/chunking/latexChunker.d.ts +26 -0
- package/dist/rag/chunking/latexChunker.js +251 -0
- package/dist/rag/chunking/markdownChunker.d.ts +19 -0
- package/dist/rag/chunking/markdownChunker.js +201 -0
- package/dist/rag/chunking/recursiveChunker.d.ts +19 -0
- package/dist/rag/chunking/recursiveChunker.js +148 -0
- package/dist/rag/chunking/semanticChunker.d.ts +41 -0
- package/dist/rag/chunking/semanticChunker.js +306 -0
- package/dist/rag/chunking/sentenceChunker.d.ts +25 -0
- package/dist/rag/chunking/sentenceChunker.js +230 -0
- package/dist/rag/chunking/tokenChunker.d.ts +36 -0
- package/dist/rag/chunking/tokenChunker.js +183 -0
- package/dist/rag/document/MDocument.d.ts +198 -0
- package/dist/rag/document/MDocument.js +392 -0
- package/dist/rag/document/index.d.ts +5 -0
- package/dist/rag/document/index.js +5 -0
- package/dist/rag/document/loaders.d.ts +201 -0
- package/dist/rag/document/loaders.js +500 -0
- package/dist/rag/errors/RAGError.d.ts +244 -0
- package/dist/rag/errors/RAGError.js +274 -0
- package/dist/rag/errors/index.d.ts +6 -0
- package/dist/rag/errors/index.js +6 -0
- package/dist/rag/graphRag/graphRAG.d.ts +115 -0
- package/dist/rag/graphRag/graphRAG.js +384 -0
- package/dist/rag/graphRag/index.d.ts +4 -0
- package/dist/rag/graphRag/index.js +4 -0
- package/dist/rag/index.d.ts +103 -0
- package/dist/rag/index.js +141 -0
- package/dist/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
- package/dist/rag/metadata/MetadataExtractorFactory.js +418 -0
- package/dist/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
- package/dist/rag/metadata/MetadataExtractorRegistry.js +362 -0
- package/dist/rag/metadata/index.d.ts +6 -0
- package/dist/rag/metadata/index.js +9 -0
- package/dist/rag/metadata/metadataExtractor.d.ts +69 -0
- package/dist/rag/metadata/metadataExtractor.js +277 -0
- package/dist/rag/pipeline/RAGPipeline.d.ts +235 -0
- package/dist/rag/pipeline/RAGPipeline.js +401 -0
- package/dist/rag/pipeline/contextAssembly.d.ts +126 -0
- package/dist/rag/pipeline/contextAssembly.js +337 -0
- package/dist/rag/pipeline/index.d.ts +5 -0
- package/dist/rag/pipeline/index.js +5 -0
- package/dist/rag/ragIntegration.d.ts +38 -0
- package/dist/rag/ragIntegration.js +211 -0
- package/dist/rag/reranker/RerankerFactory.d.ts +184 -0
- package/dist/rag/reranker/RerankerFactory.js +430 -0
- package/dist/rag/reranker/RerankerRegistry.d.ts +119 -0
- package/dist/rag/reranker/RerankerRegistry.js +402 -0
- package/dist/rag/reranker/index.d.ts +6 -0
- package/dist/rag/reranker/index.js +9 -0
- package/dist/rag/reranker/reranker.d.ts +71 -0
- package/dist/rag/reranker/reranker.js +277 -0
- package/dist/rag/resilience/CircuitBreaker.d.ts +215 -0
- package/dist/rag/resilience/CircuitBreaker.js +431 -0
- package/dist/rag/resilience/RetryHandler.d.ts +115 -0
- package/dist/rag/resilience/RetryHandler.js +300 -0
- package/dist/rag/resilience/index.d.ts +7 -0
- package/dist/rag/resilience/index.js +7 -0
- package/dist/rag/retrieval/hybridSearch.d.ts +94 -0
- package/dist/rag/retrieval/hybridSearch.js +313 -0
- package/dist/rag/retrieval/index.d.ts +5 -0
- package/dist/rag/retrieval/index.js +5 -0
- package/dist/rag/retrieval/vectorQueryTool.d.ts +93 -0
- package/dist/rag/retrieval/vectorQueryTool.js +289 -0
- package/dist/rag/types.d.ts +768 -0
- package/dist/rag/types.js +8 -0
- package/dist/server/index.d.ts +15 -11
- package/dist/server/index.js +55 -51
- package/dist/server/utils/validation.d.ts +8 -8
- package/dist/types/common.d.ts +0 -1
- package/dist/types/fileTypes.d.ts +1 -1
- package/dist/types/generateTypes.d.ts +42 -8
- package/dist/types/generateTypes.js +1 -1
- package/dist/types/index.d.ts +25 -24
- package/dist/types/index.js +21 -20
- package/dist/types/modelTypes.d.ts +10 -10
- package/dist/types/pptTypes.d.ts +14 -2
- package/dist/types/pptTypes.js +16 -0
- package/dist/types/streamTypes.d.ts +28 -8
- package/dist/types/streamTypes.js +1 -1
- package/dist/utils/async/delay.d.ts +40 -0
- package/dist/utils/async/delay.js +42 -0
- package/dist/utils/async/index.d.ts +23 -0
- package/dist/utils/async/index.js +23 -0
- package/dist/utils/async/retry.d.ts +141 -0
- package/dist/utils/async/retry.js +171 -0
- package/dist/utils/async/withTimeout.d.ts +73 -0
- package/dist/utils/async/withTimeout.js +96 -0
- package/dist/utils/fileDetector.d.ts +7 -1
- package/dist/utils/fileDetector.js +91 -18
- package/dist/utils/json/extract.d.ts +103 -0
- package/dist/utils/json/extract.js +248 -0
- package/dist/utils/json/index.d.ts +36 -0
- package/dist/utils/json/index.js +36 -0
- package/dist/utils/json/safeParse.d.ts +137 -0
- package/dist/utils/json/safeParse.js +190 -0
- package/dist/utils/messageBuilder.d.ts +2 -2
- package/dist/utils/messageBuilder.js +15 -7
- package/dist/utils/modelRouter.d.ts +4 -4
- package/dist/utils/modelRouter.js +4 -4
- package/dist/utils/sanitizers/filename.d.ts +137 -0
- package/dist/utils/sanitizers/filename.js +365 -0
- package/dist/utils/sanitizers/html.d.ts +170 -0
- package/dist/utils/sanitizers/html.js +325 -0
- package/dist/utils/sanitizers/index.d.ts +26 -0
- package/dist/utils/sanitizers/index.js +29 -0
- package/dist/utils/sanitizers/svg.d.ts +81 -0
- package/dist/utils/sanitizers/svg.js +482 -0
- package/package.json +2 -2
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Semantic Chunker
|
|
3
|
+
*
|
|
4
|
+
* LLM-powered semantic chunking that groups related content together.
|
|
5
|
+
* Uses embedding similarity to determine natural breakpoints.
|
|
6
|
+
* Best for complex documents where meaning should drive segmentation.
|
|
7
|
+
*/
|
|
8
|
+
import { randomUUID } from "crypto";
|
|
9
|
+
import { ProviderFactory } from "../../factories/providerFactory.js";
|
|
10
|
+
import { logger } from "../../utils/logger.js";
|
|
11
|
+
/**
 * Semantic chunker implementation.
 *
 * Splits text into paragraph-level segments, embeds each segment, and starts a
 * new chunk wherever the cosine similarity between neighbouring segments drops
 * below a threshold (or the size limit would be exceeded). If anything in the
 * embedding pipeline throws, it degrades to plain size-based chunking.
 */
export class SemanticChunker {
    strategy = "semantic";
    /**
     * Chunk `text` along semantic boundaries.
     *
     * @param {string} text - Text to split. Empty input yields an empty array.
     * @param {object} [config] - Optional settings: `maxSize` (default 1000),
     *   `overlap` (0), `joinThreshold` (100), `modelName`
     *   ("text-embedding-3-small"), `provider` ("openai"),
     *   `similarityThreshold` (0.7), `trimWhitespace` (true), `metadata` ({}).
     * @returns {Promise<Array>} Chunks with `id`, `text` and `metadata`.
     *   `startPosition`/`endPosition` are offsets into the reconstructed text
     *   (segments re-joined with blank lines), not exact offsets into `text`.
     */
    async chunk(text, config) {
        const { maxSize = 1000, overlap = 0, joinThreshold = 100, modelName = "text-embedding-3-small", provider = "openai", similarityThreshold = 0.7, trimWhitespace = true, metadata = {}, } = config || {};
        const documentId = randomUUID();
        const chunks = [];
        if (!text || text.length === 0) {
            return chunks;
        }
        // First, split into initial segments (paragraphs, with small ones merged)
        const segments = this.splitIntoSegments(text, joinThreshold);
        if (segments.length <= 1) {
            // Nothing to compare against, so no embeddings are needed.
            const onlyText = trimWhitespace ? text.trim() : text;
            // Same rule as the multi-segment path: never emit an empty chunk
            // (e.g. for whitespace-only input).
            if (onlyText.length > 0) {
                chunks.push({
                    id: randomUUID(),
                    text: onlyText,
                    metadata: {
                        documentId,
                        chunkIndex: 0,
                        totalChunks: 1,
                        startPosition: 0,
                        endPosition: text.length,
                        documentType: "text",
                        custom: metadata,
                    },
                });
            }
            return chunks;
        }
        try {
            const embeddings = await this.getEmbeddings(segments, provider, modelName);
            const breakpoints = this.findSemanticBreakpoints(embeddings, similarityThreshold);
            const groups = this.groupSegments(segments, breakpoints, maxSize);
            let chunkIndex = 0;
            let currentPosition = 0;
            for (const group of groups) {
                const chunkText = group.join("\n\n");
                const finalText = trimWhitespace ? chunkText.trim() : chunkText;
                if (finalText.length > 0) {
                    chunks.push({
                        id: randomUUID(),
                        text: finalText,
                        metadata: {
                            documentId,
                            chunkIndex,
                            totalChunks: 0, // filled in once the final count is known
                            startPosition: currentPosition,
                            endPosition: currentPosition + chunkText.length,
                            documentType: "text",
                            custom: {
                                ...metadata,
                                segmentCount: group.length,
                            },
                        },
                    });
                    chunkIndex++;
                }
                currentPosition += chunkText.length + 2; // +2 for separator
            }
            if (overlap > 0) {
                // Snapshot the texts first: reading the live previous chunk would
                // pick up the overlap that was just prepended to it and compound
                // it down the chain whenever a chunk is shorter than `overlap`.
                const originalTexts = chunks.map((chunk) => chunk.text);
                for (let i = 1; i < chunks.length; i++) {
                    chunks[i].text = originalTexts[i - 1].slice(-overlap) + "\n" + originalTexts[i];
                }
            }
        }
        catch (error) {
            // Fallback to simple chunking if embeddings fail
            logger.warn("[SemanticChunker] Embedding failed, falling back to simple chunking", {
                error: error instanceof Error ? error.message : String(error),
            });
            return this.fallbackChunk(text, maxSize, overlap, documentId, metadata, trimWhitespace);
        }
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }
    /**
     * Split text into initial segments for embedding.
     *
     * Paragraphs are separated by blank lines. A paragraph is merged into the
     * running segment while the combined length stays below `minSize`.
     *
     * @param {string} text - Text to segment.
     * @param {number} minSize - Combined-length limit below which paragraphs are merged.
     * @returns {string[]} Trimmed, non-empty segments.
     */
    splitIntoSegments(text, minSize) {
        const segments = [];
        let currentSegment = "";
        for (const paragraph of text.split(/\n\n+/)) {
            const trimmed = paragraph.trim();
            if (trimmed.length === 0) {
                continue;
            }
            if (currentSegment.length === 0) {
                currentSegment = trimmed;
            }
            else if (currentSegment.length + trimmed.length < minSize) {
                // Join small paragraphs
                currentSegment += "\n\n" + trimmed;
            }
            else {
                segments.push(currentSegment);
                currentSegment = trimmed;
            }
        }
        if (currentSegment.length > 0) {
            segments.push(currentSegment);
        }
        return segments;
    }
    /**
     * Get one embedding vector per segment.
     *
     * Segments are embedded one at a time, in order. A segment whose embedding
     * call fails is logged and represented by a zero vector; cosineSimilarity
     * returns 0 for a zero vector, so such a segment is treated as dissimilar
     * to both neighbours.
     *
     * @param {string[]} segments - Segments to embed.
     * @param {string} provider - Provider name passed to ProviderFactory.
     * @param {string} modelName - Model name passed to ProviderFactory.
     * @returns {Promise<number[][]>} Embeddings, index-aligned with `segments`.
     * @throws {Error} If the created provider has no `embed` function.
     */
    async getEmbeddings(segments, provider, modelName) {
        const embeddingProvider = await ProviderFactory.createProvider(provider, modelName);
        if (typeof embeddingProvider.embed !== "function") {
            throw new Error(`Provider ${provider} does not support embeddings`);
        }
        const zeroVectorDimensions = 1536;
        const embeddings = [];
        for (const segment of segments) {
            try {
                embeddings.push(await embeddingProvider.embed(segment));
            }
            catch (error) {
                logger.warn("[SemanticChunker] Failed to embed segment", {
                    error: error instanceof Error ? error.message : String(error),
                });
                // Use zero vector as fallback
                embeddings.push(new Array(zeroVectorDimensions).fill(0));
            }
        }
        return embeddings;
    }
    /**
     * Find semantic breakpoints using cosine similarity.
     *
     * @param {number[][]} embeddings - One vector per segment.
     * @param {number} threshold - Similarity below which a boundary is declared.
     * @returns {number[]} Ascending indices `i` where segment `i` starts a new group.
     */
    findSemanticBreakpoints(embeddings, threshold) {
        const breakpoints = [];
        for (let i = 1; i < embeddings.length; i++) {
            if (this.cosineSimilarity(embeddings[i - 1], embeddings[i]) < threshold) {
                breakpoints.push(i);
            }
        }
        return breakpoints;
    }
    /**
     * Group segments based on breakpoints and size limits.
     *
     * A new group starts at every breakpoint and whenever adding the next
     * segment (including the two-character "\n\n" joiner used when the group
     * is rendered) would push the group past `maxSize`. A single segment
     * longer than `maxSize` still forms its own group.
     *
     * @param {string[]} segments - Segments in document order.
     * @param {number[]} breakpoints - Indices that must start a new group.
     * @param {number} maxSize - Maximum rendered length of a group.
     * @returns {string[][]} Groups of segments.
     */
    groupSegments(segments, breakpoints, maxSize) {
        const separatorLength = 2; // length of the "\n\n" joiner
        const breakpointSet = new Set(breakpoints);
        const groups = [];
        let currentGroup = [];
        let currentSize = 0;
        for (let i = 0; i < segments.length; i++) {
            const segment = segments[i];
            if (currentGroup.length > 0) {
                const projectedSize = currentSize + separatorLength + segment.length;
                if (projectedSize > maxSize || breakpointSet.has(i)) {
                    groups.push(currentGroup);
                    currentGroup = [];
                    currentSize = 0;
                }
            }
            currentSize += (currentGroup.length > 0 ? separatorLength : 0) + segment.length;
            currentGroup.push(segment);
        }
        if (currentGroup.length > 0) {
            groups.push(currentGroup);
        }
        return groups;
    }
    /**
     * Calculate cosine similarity between two vectors.
     *
     * @param {number[]} a - First vector.
     * @param {number[]} b - Second vector.
     * @returns {number} Cosine similarity, or 0 when the lengths differ or
     *   either vector has zero magnitude.
     */
    cosineSimilarity(a, b) {
        if (a.length !== b.length) {
            return 0;
        }
        let dotProduct = 0;
        let normA = 0;
        let normB = 0;
        for (let i = 0; i < a.length; i++) {
            dotProduct += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        const denominator = Math.sqrt(normA) * Math.sqrt(normB);
        return denominator === 0 ? 0 : dotProduct / denominator;
    }
    /**
     * Fallback to simple size-based chunking when embeddings fail.
     *
     * Windows of at most `maxSize` characters are cut, preferring a paragraph
     * break found in the last 200 characters of the window. Consecutive
     * windows share up to `overlap` characters.
     *
     * @param {string} text - Text to split.
     * @param {number} maxSize - Window size (clamped to at least 1).
     * @param {number} overlap - Shared characters (clamped to [0, maxSize - 1]).
     * @param {string} documentId - Identifier stored on every chunk.
     * @param {object} metadata - Custom metadata copied onto every chunk.
     * @param {boolean} trimWhitespace - Whether to trim each chunk's text.
     * @returns {Array} Chunks flagged with `custom.fallbackChunking = true`.
     */
    fallbackChunk(text, maxSize, overlap, documentId, metadata, trimWhitespace) {
        const effectiveMaxSize = Math.max(maxSize, 1);
        const effectiveOverlap = Math.min(Math.max(overlap, 0), effectiveMaxSize - 1);
        const chunks = [];
        let start = 0;
        let chunkIndex = 0;
        while (start < text.length) {
            let end = Math.min(start + effectiveMaxSize, text.length);
            // Try to break at paragraph boundary
            if (end < text.length) {
                const searchStart = Math.max(start, end - 200);
                const paragraphBreak = text.slice(searchStart, end).lastIndexOf("\n\n");
                if (paragraphBreak > 0) {
                    end = searchStart + paragraphBreak;
                }
            }
            const chunkText = text.slice(start, end);
            const finalText = trimWhitespace ? chunkText.trim() : chunkText;
            if (finalText.length > 0) {
                chunks.push({
                    id: randomUUID(),
                    text: finalText,
                    metadata: {
                        documentId,
                        chunkIndex,
                        totalChunks: 0, // filled in below
                        startPosition: start,
                        endPosition: end,
                        documentType: "text",
                        custom: {
                            ...metadata,
                            fallbackChunking: true,
                        },
                    },
                });
                chunkIndex++;
            }
            // Once a window reaches the end of the text we are done. Stepping
            // back by the overlap here would re-emit ever-shorter copies of
            // the tail, one per overlap character.
            if (end >= text.length) {
                break;
            }
            start = Math.max(start + 1, end - effectiveOverlap);
        }
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }
    /**
     * Validate a semantic chunker configuration.
     *
     * @param {object} config - Configuration to check; a missing config is
     *   treated as empty.
     * @returns {{valid: boolean, errors: string[], warnings: string[]}} Result;
     *   always carries a warning that an embedding provider is required.
     */
    validateConfig(config) {
        const errors = [];
        const warnings = [];
        const semConfig = config || {};
        if (semConfig.maxSize !== undefined && semConfig.maxSize <= 0) {
            errors.push("maxSize must be greater than 0");
        }
        if (semConfig.overlap !== undefined && semConfig.overlap < 0) {
            errors.push("overlap must be non-negative");
        }
        if (semConfig.overlap !== undefined &&
            semConfig.maxSize !== undefined &&
            semConfig.overlap >= semConfig.maxSize) {
            errors.push("overlap must be less than maxSize");
        }
        if (semConfig.similarityThreshold !== undefined &&
            (semConfig.similarityThreshold < 0 || semConfig.similarityThreshold > 1)) {
            errors.push("similarityThreshold must be between 0 and 1");
        }
        if (semConfig.joinThreshold !== undefined && semConfig.joinThreshold < 0) {
            errors.push("joinThreshold must be non-negative");
        }
        warnings.push("Semantic chunking requires an embedding provider. Ensure API credentials are configured.");
        return {
            valid: errors.length === 0,
            errors,
            warnings,
        };
    }
}
|
|
307
|
+
//# sourceMappingURL=semanticChunker.js.map
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Sentence-based Chunker
|
|
3
|
+
*
|
|
4
|
+
* Splits text based on sentence boundaries while respecting size limits.
|
|
5
|
+
* Best for prose and natural language content where sentence integrity matters.
|
|
6
|
+
*/
|
|
7
|
+
import type { BaseChunkerConfig, Chunk, Chunker, ChunkerValidationResult, SentenceChunkerConfig } from "../types.js";
|
|
8
|
+
/**
 * Chunker that works at sentence granularity: text is divided into sentences,
 * which are then packed into chunks subject to the configured size limits.
 */
export declare class SentenceChunker implements Chunker {
    /** Strategy identifier for this chunker. */
    readonly strategy: "sentence";
    /** Sentence terminators applied when the config does not supply any. */
    private readonly defaultSentenceEnders;
    /**
     * Chunk the given text.
     *
     * @param text - Text to split.
     * @param config - Optional sentence-chunker settings.
     * @returns A promise resolving to the produced chunks.
     */
    chunk(text: string, config?: SentenceChunkerConfig): Promise<Chunk[]>;
    /** Divides text into sentences using the sentence enders. */
    private splitIntoSentences;
    /** Breaks a single over-long sentence into smaller pieces. */
    private splitLargeSentence;
    /**
     * Check a configuration object.
     *
     * @param config - Configuration to validate.
     * @returns The validation result.
     */
    validateConfig(config: BaseChunkerConfig): ChunkerValidationResult;
}
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Sentence-based Chunker
|
|
3
|
+
*
|
|
4
|
+
* Splits text based on sentence boundaries while respecting size limits.
|
|
5
|
+
* Best for prose and natural language content where sentence integrity matters.
|
|
6
|
+
*/
|
|
7
|
+
import { randomUUID } from "crypto";
|
|
8
|
+
/**
 * Sentence-aware chunker implementation.
 *
 * Splits text into sentences and packs consecutive sentences into chunks
 * while respecting size and sentence-count constraints. No sentence is ever
 * discarded: `minSentences` takes precedence over `maxSize`, and a trailing
 * group smaller than `minSentences` is still emitted.
 */
export class SentenceChunker {
    strategy = "sentence";
    defaultSentenceEnders = [".", "!", "?"];
    /**
     * Chunk `text` along sentence boundaries.
     *
     * @param {string} text - Text to split. Empty input yields an empty array.
     * @param {object} [config] - Optional settings: `maxSize` (default 1000),
     *   `overlap` (0), `sentenceEnders` (".", "!", "?"), `minSentences` (1),
     *   `maxSentences` (unlimited), `trimWhitespace` (true), `metadata` ({}).
     * @returns {Promise<Array>} Chunks with `id`, `text` and `metadata`.
     *   `startPosition`/`endPosition` are offsets into the normalised text
     *   (sentences re-joined with single spaces), not exact offsets into `text`.
     */
    async chunk(text, config) {
        const { maxSize = 1000, overlap = 0, sentenceEnders = this.defaultSentenceEnders, minSentences = 1, maxSentences, trimWhitespace = true, metadata = {}, } = config || {};
        const chunks = [];
        const documentId = randomUUID();
        if (!text || text.length === 0) {
            return chunks;
        }
        const sentences = this.splitIntoSentences(text, sentenceEnders);
        if (sentences.length === 0) {
            return chunks;
        }
        let pendingSentences = [];
        let pendingLength = 0;
        let chunkIndex = 0;
        let startPosition = 0;
        let currentPosition = 0;
        // Append one chunk built from `rawText`; empty text is skipped.
        const emit = (rawText, start) => {
            const finalText = trimWhitespace ? rawText.trim() : rawText;
            if (finalText.length === 0) {
                return;
            }
            chunks.push({
                id: randomUUID(),
                text: finalText,
                metadata: {
                    documentId,
                    chunkIndex,
                    totalChunks: 0, // filled in once the final count is known
                    startPosition: start,
                    endPosition: start + rawText.length,
                    documentType: "text",
                    custom: metadata,
                },
            });
            chunkIndex++;
        };
        for (const sentence of sentences) {
            const sentenceLength = sentence.length;
            const isOversized = sentenceLength > maxSize;
            if (pendingSentences.length > 0) {
                const wouldExceedSize = pendingLength + sentenceLength + 1 > maxSize;
                const wouldExceedSentences = maxSentences !== undefined &&
                    pendingSentences.length >= maxSentences;
                const minimumMet = pendingSentences.length >= minSentences;
                // Below the minimum we keep accumulating rather than dropping
                // text. An over-long sentence always forces a flush so that
                // output stays in document order.
                if (isOversized || ((wouldExceedSize || wouldExceedSentences) && minimumMet)) {
                    emit(pendingSentences.join(" "), startPosition);
                    if (overlap > 0 && !isOversized) {
                        // Carry trailing sentences (at most `overlap` characters)
                        // into the next chunk.
                        let overlapLength = 0;
                        const overlapSentences = [];
                        for (let j = pendingSentences.length - 1; j >= 0; j--) {
                            const candidate = pendingSentences[j];
                            if (overlapLength + candidate.length + 1 > overlap) {
                                break;
                            }
                            overlapSentences.unshift(candidate);
                            overlapLength += candidate.length + 1;
                        }
                        pendingSentences = overlapSentences;
                        pendingLength = overlapLength;
                        startPosition = currentPosition - overlapLength;
                    }
                    else {
                        // No overlap is carried across an over-long sentence: it
                        // would otherwise resurface after that sentence's pieces.
                        pendingSentences = [];
                        pendingLength = 0;
                        startPosition = currentPosition;
                    }
                }
            }
            if (isOversized) {
                // The sentence alone is too big: emit it as word-based pieces.
                for (const subChunk of this.splitLargeSentence(sentence, maxSize)) {
                    emit(subChunk, currentPosition);
                    currentPosition += subChunk.length;
                }
                startPosition = currentPosition;
            }
            else {
                pendingSentences.push(sentence);
                pendingLength += sentenceLength + 1; // +1 for space
                currentPosition += sentenceLength + 1;
            }
        }
        // Emit whatever is left, even if it is shorter than minSentences.
        if (pendingSentences.length > 0) {
            emit(pendingSentences.join(" "), startPosition);
        }
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }
    /**
     * Split text into sentences based on sentence enders.
     *
     * A sentence ends at a run of ender characters that is followed by
     * whitespace or the end of the text. Text after the last ender is
     * returned as a final sentence.
     *
     * @param {string} text - Text to split.
     * @param {string[]} sentenceEnders - Ender strings; every character of
     *   every entry is treated as a terminator. When no usable ender is
     *   given, the defaults are used.
     * @returns {string[]} Trimmed, non-empty sentences in order.
     */
    splitIntoSentences(text, sentenceEnders) {
        const sentences = [];
        const usableEnders = (sentenceEnders || []).filter((ender) => typeof ender === "string" && ender.length > 0);
        const enders = usableEnders.length > 0 ? usableEnders : this.defaultSentenceEnders;
        // Escape only what is special inside a character class. Blindly
        // prefixing a backslash would turn letters into classes ("n" -> "\n").
        const escapedEnders = enders
            .map((ender) => ender.replace(/[\\\]\[^-]/g, "\\$&"))
            .join("");
        const pattern = new RegExp(`([${escapedEnders}]+)(?=\\s|$)`, "g");
        let lastIndex = 0;
        let match;
        while ((match = pattern.exec(text)) !== null) {
            const endIndex = match.index + match[0].length;
            const sentence = text.slice(lastIndex, endIndex).trim();
            if (sentence.length > 0) {
                sentences.push(sentence);
            }
            lastIndex = endIndex;
            // Skip whitespace
            while (lastIndex < text.length && /\s/.test(text[lastIndex])) {
                lastIndex++;
            }
        }
        if (lastIndex < text.length) {
            const remaining = text.slice(lastIndex).trim();
            if (remaining.length > 0) {
                sentences.push(remaining);
            }
        }
        return sentences;
    }
    /**
     * Split a large sentence into smaller chunks on word boundaries.
     *
     * @param {string} sentence - Sentence to split.
     * @param {number} maxSize - Target maximum piece length.
     * @returns {string[]} Pieces; a single word longer than `maxSize` is kept
     *   whole and therefore exceeds the limit.
     */
    splitLargeSentence(sentence, maxSize) {
        const pieces = [];
        let currentPiece = "";
        for (const word of sentence.split(/\s+/)) {
            if (currentPiece.length + word.length + 1 <= maxSize) {
                currentPiece = currentPiece ? currentPiece + " " + word : word;
            }
            else {
                if (currentPiece.length > 0) {
                    pieces.push(currentPiece);
                }
                // If a single word is larger than maxSize, we have to include it anyway
                currentPiece = word;
            }
        }
        if (currentPiece.length > 0) {
            pieces.push(currentPiece);
        }
        return pieces;
    }
    /**
     * Validate a sentence chunker configuration.
     *
     * @param {object} config - Configuration to check; a missing config is
     *   treated as empty.
     * @returns {{valid: boolean, errors: string[], warnings: string[]}} Result.
     */
    validateConfig(config) {
        const errors = [];
        const warnings = [];
        const sentConfig = config || {};
        if (sentConfig.maxSize !== undefined && sentConfig.maxSize <= 0) {
            errors.push("maxSize must be greater than 0");
        }
        if (sentConfig.overlap !== undefined && sentConfig.overlap < 0) {
            errors.push("overlap must be non-negative");
        }
        if (sentConfig.overlap !== undefined &&
            sentConfig.maxSize !== undefined &&
            sentConfig.overlap >= sentConfig.maxSize) {
            errors.push("overlap must be less than maxSize");
        }
        if (sentConfig.minSentences !== undefined && sentConfig.minSentences < 1) {
            errors.push("minSentences must be at least 1");
        }
        if (sentConfig.maxSentences !== undefined &&
            sentConfig.minSentences !== undefined &&
            sentConfig.maxSentences < sentConfig.minSentences) {
            errors.push("maxSentences must be >= minSentences");
        }
        if (sentConfig.sentenceEnders !== undefined &&
            sentConfig.sentenceEnders.length === 0) {
            warnings.push("No sentence enders specified, using defaults");
        }
        return {
            valid: errors.length === 0,
            errors,
            warnings,
        };
    }
}
|
|
231
|
+
//# sourceMappingURL=sentenceChunker.js.map
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Token-based Chunker
|
|
3
|
+
*
|
|
4
|
+
* Splits text based on token counts using simple tokenization.
|
|
5
|
+
* Best for controlling context window usage with LLMs.
|
|
6
|
+
*/
|
|
7
|
+
import type { Chunker, Chunk, ChunkerValidationResult, TokenChunkerConfig, BaseChunkerConfig } from "../types.js";
|
|
8
|
+
/**
 * Chunker that splits text according to approximate token counts.
 *
 * Token counts are approximated with simple word-based tokenization. For
 * exact counts, integrate with tiktoken or a model-specific tokenizer.
 */
export declare class TokenChunker implements Chunker {
    /** Strategy identifier for this chunker. */
    readonly strategy: "token";
    private readonly CHARS_PER_TOKEN;
    /**
     * Chunk the given text.
     *
     * @param text - Text to split.
     * @param config - Optional token-chunker settings.
     * @returns A promise resolving to the produced chunks.
     */
    chunk(text: string, config?: TokenChunkerConfig): Promise<Chunk[]>;
    /** Simple word-based tokenization. */
    private tokenize;
    /** Looks up the characters-per-token figure for a tokenizer. */
    private getCharsPerToken;
    /** Estimates the average number of tokens per word. */
    private estimateTokensPerWord;
    /**
     * Estimate the token count of a piece of text.
     *
     * @param text - Text to measure.
     * @param tokenizer - Optional tokenizer name.
     * @returns The estimated number of tokens.
     */
    estimateTokenCount(text: string, tokenizer?: string): number;
    /**
     * Check a configuration object.
     *
     * @param config - Configuration to validate.
     * @returns The validation result.
     */
    validateConfig(config: BaseChunkerConfig): ChunkerValidationResult;
}
|