@juspay/neurolink 9.1.1 → 9.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/README.md +106 -37
- package/dist/agent/directTools.d.ts +11 -11
- package/dist/cli/commands/config.d.ts +6 -6
- package/dist/cli/commands/rag.d.ts +19 -0
- package/dist/cli/commands/rag.js +756 -0
- package/dist/cli/factories/commandFactory.js +146 -83
- package/dist/cli/parser.js +4 -1
- package/dist/core/baseProvider.d.ts +43 -30
- package/dist/core/baseProvider.js +98 -138
- package/dist/core/conversationMemoryFactory.d.ts +2 -2
- package/dist/core/conversationMemoryFactory.js +2 -2
- package/dist/core/conversationMemoryInitializer.d.ts +1 -2
- package/dist/core/conversationMemoryInitializer.js +2 -2
- package/dist/core/infrastructure/baseError.d.ts +21 -0
- package/dist/core/infrastructure/baseError.js +22 -0
- package/dist/core/infrastructure/baseFactory.d.ts +21 -0
- package/dist/core/infrastructure/baseFactory.js +54 -0
- package/dist/core/infrastructure/baseRegistry.d.ts +21 -0
- package/dist/core/infrastructure/baseRegistry.js +49 -0
- package/dist/core/infrastructure/index.d.ts +5 -0
- package/dist/core/infrastructure/index.js +5 -0
- package/dist/core/infrastructure/retry.d.ts +7 -0
- package/dist/core/infrastructure/retry.js +20 -0
- package/dist/core/infrastructure/typedEventEmitter.d.ts +8 -0
- package/dist/core/infrastructure/typedEventEmitter.js +23 -0
- package/dist/core/redisConversationMemoryManager.d.ts +1 -6
- package/dist/core/redisConversationMemoryManager.js +7 -19
- package/dist/factories/providerFactory.d.ts +5 -3
- package/dist/factories/providerFactory.js +31 -24
- package/dist/image-gen/ImageGenService.d.ts +143 -0
- package/dist/image-gen/ImageGenService.js +345 -0
- package/dist/image-gen/imageGenTools.d.ts +126 -0
- package/dist/image-gen/imageGenTools.js +304 -0
- package/dist/image-gen/index.d.ts +46 -0
- package/dist/image-gen/index.js +48 -0
- package/dist/image-gen/types.d.ts +237 -0
- package/dist/image-gen/types.js +24 -0
- package/dist/index.d.ts +46 -12
- package/dist/index.js +88 -36
- package/dist/lib/agent/directTools.d.ts +8 -8
- package/dist/lib/core/baseProvider.d.ts +43 -30
- package/dist/lib/core/baseProvider.js +98 -138
- package/dist/lib/core/conversationMemoryFactory.d.ts +2 -2
- package/dist/lib/core/conversationMemoryFactory.js +2 -2
- package/dist/lib/core/conversationMemoryInitializer.d.ts +1 -2
- package/dist/lib/core/conversationMemoryInitializer.js +2 -2
- package/dist/lib/core/infrastructure/baseError.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseError.js +23 -0
- package/dist/lib/core/infrastructure/baseFactory.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseFactory.js +55 -0
- package/dist/lib/core/infrastructure/baseRegistry.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseRegistry.js +50 -0
- package/dist/lib/core/infrastructure/index.d.ts +5 -0
- package/dist/lib/core/infrastructure/index.js +6 -0
- package/dist/lib/core/infrastructure/retry.d.ts +7 -0
- package/dist/lib/core/infrastructure/retry.js +21 -0
- package/dist/lib/core/infrastructure/typedEventEmitter.d.ts +8 -0
- package/dist/lib/core/infrastructure/typedEventEmitter.js +24 -0
- package/dist/lib/core/redisConversationMemoryManager.d.ts +1 -6
- package/dist/lib/core/redisConversationMemoryManager.js +7 -19
- package/dist/lib/factories/providerFactory.d.ts +5 -3
- package/dist/lib/factories/providerFactory.js +31 -24
- package/dist/lib/image-gen/ImageGenService.d.ts +143 -0
- package/dist/lib/image-gen/ImageGenService.js +346 -0
- package/dist/lib/image-gen/imageGenTools.d.ts +126 -0
- package/dist/lib/image-gen/imageGenTools.js +305 -0
- package/dist/lib/image-gen/index.d.ts +46 -0
- package/dist/lib/image-gen/index.js +49 -0
- package/dist/lib/image-gen/types.d.ts +237 -0
- package/dist/lib/image-gen/types.js +25 -0
- package/dist/lib/index.d.ts +46 -12
- package/dist/lib/index.js +88 -36
- package/dist/lib/mcp/index.d.ts +6 -5
- package/dist/lib/mcp/index.js +7 -5
- package/dist/lib/neurolink.d.ts +11 -13
- package/dist/lib/neurolink.js +95 -29
- package/dist/lib/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/lib/processors/base/BaseFileProcessor.js +614 -0
- package/dist/lib/processors/base/index.d.ts +14 -0
- package/dist/lib/processors/base/index.js +20 -0
- package/dist/lib/processors/base/types.d.ts +593 -0
- package/dist/lib/processors/base/types.js +77 -0
- package/dist/lib/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/lib/processors/cli/fileProcessorCli.js +389 -0
- package/dist/lib/processors/cli/index.d.ts +37 -0
- package/dist/lib/processors/cli/index.js +50 -0
- package/dist/lib/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/lib/processors/code/ConfigProcessor.js +401 -0
- package/dist/lib/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/lib/processors/code/SourceCodeProcessor.js +305 -0
- package/dist/lib/processors/code/index.d.ts +44 -0
- package/dist/lib/processors/code/index.js +61 -0
- package/dist/lib/processors/config/fileTypes.d.ts +283 -0
- package/dist/lib/processors/config/fileTypes.js +521 -0
- package/dist/lib/processors/config/index.d.ts +32 -0
- package/dist/lib/processors/config/index.js +93 -0
- package/dist/lib/processors/config/languageMap.d.ts +66 -0
- package/dist/lib/processors/config/languageMap.js +411 -0
- package/dist/lib/processors/config/mimeTypes.d.ts +376 -0
- package/dist/lib/processors/config/mimeTypes.js +339 -0
- package/dist/lib/processors/config/sizeLimits.d.ts +194 -0
- package/dist/lib/processors/config/sizeLimits.js +247 -0
- package/dist/lib/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/lib/processors/data/JsonProcessor.js +204 -0
- package/dist/lib/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/lib/processors/data/XmlProcessor.js +284 -0
- package/dist/lib/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/lib/processors/data/YamlProcessor.js +295 -0
- package/dist/lib/processors/data/index.d.ts +49 -0
- package/dist/lib/processors/data/index.js +77 -0
- package/dist/lib/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/lib/processors/document/ExcelProcessor.js +520 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.js +211 -0
- package/dist/lib/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/lib/processors/document/RtfProcessor.js +362 -0
- package/dist/lib/processors/document/WordProcessor.d.ts +168 -0
- package/dist/lib/processors/document/WordProcessor.js +354 -0
- package/dist/lib/processors/document/index.d.ts +54 -0
- package/dist/lib/processors/document/index.js +91 -0
- package/dist/lib/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/lib/processors/errors/FileErrorCode.js +256 -0
- package/dist/lib/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/lib/processors/errors/errorHelpers.js +379 -0
- package/dist/lib/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/lib/processors/errors/errorSerializer.js +508 -0
- package/dist/lib/processors/errors/index.d.ts +46 -0
- package/dist/lib/processors/errors/index.js +50 -0
- package/dist/lib/processors/index.d.ts +76 -0
- package/dist/lib/processors/index.js +113 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.js +273 -0
- package/dist/lib/processors/integration/index.d.ts +42 -0
- package/dist/lib/processors/integration/index.js +45 -0
- package/dist/lib/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/lib/processors/markup/HtmlProcessor.js +250 -0
- package/dist/lib/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/lib/processors/markup/MarkdownProcessor.js +245 -0
- package/dist/lib/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/lib/processors/markup/SvgProcessor.js +241 -0
- package/dist/lib/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/lib/processors/markup/TextProcessor.js +189 -0
- package/dist/lib/processors/markup/index.d.ts +66 -0
- package/dist/lib/processors/markup/index.js +103 -0
- package/dist/lib/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/lib/processors/registry/ProcessorRegistry.js +609 -0
- package/dist/lib/processors/registry/index.d.ts +12 -0
- package/dist/lib/processors/registry/index.js +17 -0
- package/dist/lib/processors/registry/types.d.ts +53 -0
- package/dist/lib/processors/registry/types.js +11 -0
- package/dist/lib/providers/amazonBedrock.d.ts +15 -2
- package/dist/lib/providers/amazonBedrock.js +65 -8
- package/dist/lib/providers/anthropic.d.ts +3 -3
- package/dist/lib/providers/anthropic.js +10 -7
- package/dist/lib/providers/googleAiStudio.d.ts +5 -5
- package/dist/lib/providers/googleAiStudio.js +10 -7
- package/dist/lib/providers/googleVertex.d.ts +16 -4
- package/dist/lib/providers/googleVertex.js +72 -16
- package/dist/lib/providers/litellm.d.ts +3 -3
- package/dist/lib/providers/litellm.js +10 -10
- package/dist/lib/providers/mistral.d.ts +3 -3
- package/dist/lib/providers/mistral.js +7 -6
- package/dist/lib/providers/ollama.d.ts +3 -4
- package/dist/lib/providers/ollama.js +7 -8
- package/dist/lib/providers/openAI.d.ts +14 -2
- package/dist/lib/providers/openAI.js +60 -6
- package/dist/lib/providers/openRouter.d.ts +2 -2
- package/dist/lib/providers/openRouter.js +10 -6
- package/dist/lib/providers/sagemaker/language-model.d.ts +2 -2
- package/dist/lib/rag/ChunkerFactory.d.ts +91 -0
- package/dist/lib/rag/ChunkerFactory.js +321 -0
- package/dist/lib/rag/ChunkerRegistry.d.ts +91 -0
- package/dist/lib/rag/ChunkerRegistry.js +422 -0
- package/dist/lib/rag/chunkers/BaseChunker.d.ts +53 -0
- package/dist/lib/rag/chunkers/BaseChunker.js +144 -0
- package/dist/lib/rag/chunkers/CharacterChunker.d.ts +18 -0
- package/dist/lib/rag/chunkers/CharacterChunker.js +29 -0
- package/dist/lib/rag/chunkers/HTMLChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/HTMLChunker.js +39 -0
- package/dist/lib/rag/chunkers/JSONChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/JSONChunker.js +69 -0
- package/dist/lib/rag/chunkers/LaTeXChunker.d.ts +15 -0
- package/dist/lib/rag/chunkers/LaTeXChunker.js +64 -0
- package/dist/lib/rag/chunkers/MarkdownChunker.d.ts +15 -0
- package/dist/lib/rag/chunkers/MarkdownChunker.js +103 -0
- package/dist/lib/rag/chunkers/RecursiveChunker.d.ts +27 -0
- package/dist/lib/rag/chunkers/RecursiveChunker.js +140 -0
- package/dist/lib/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
- package/dist/lib/rag/chunkers/SemanticMarkdownChunker.js +139 -0
- package/dist/lib/rag/chunkers/SentenceChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/SentenceChunker.js +67 -0
- package/dist/lib/rag/chunkers/TokenChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/TokenChunker.js +62 -0
- package/dist/lib/rag/chunkers/index.d.ts +15 -0
- package/dist/lib/rag/chunkers/index.js +16 -0
- package/dist/lib/rag/chunking/characterChunker.d.ts +16 -0
- package/dist/lib/rag/chunking/characterChunker.js +143 -0
- package/dist/lib/rag/chunking/chunkerRegistry.d.ts +67 -0
- package/dist/lib/rag/chunking/chunkerRegistry.js +195 -0
- package/dist/lib/rag/chunking/htmlChunker.d.ts +34 -0
- package/dist/lib/rag/chunking/htmlChunker.js +248 -0
- package/dist/lib/rag/chunking/index.d.ts +15 -0
- package/dist/lib/rag/chunking/index.js +18 -0
- package/dist/lib/rag/chunking/jsonChunker.d.ts +20 -0
- package/dist/lib/rag/chunking/jsonChunker.js +282 -0
- package/dist/lib/rag/chunking/latexChunker.d.ts +26 -0
- package/dist/lib/rag/chunking/latexChunker.js +252 -0
- package/dist/lib/rag/chunking/markdownChunker.d.ts +19 -0
- package/dist/lib/rag/chunking/markdownChunker.js +202 -0
- package/dist/lib/rag/chunking/recursiveChunker.d.ts +19 -0
- package/dist/lib/rag/chunking/recursiveChunker.js +149 -0
- package/dist/lib/rag/chunking/semanticChunker.d.ts +41 -0
- package/dist/lib/rag/chunking/semanticChunker.js +307 -0
- package/dist/lib/rag/chunking/sentenceChunker.d.ts +25 -0
- package/dist/lib/rag/chunking/sentenceChunker.js +231 -0
- package/dist/lib/rag/chunking/tokenChunker.d.ts +36 -0
- package/dist/lib/rag/chunking/tokenChunker.js +184 -0
- package/dist/lib/rag/document/MDocument.d.ts +198 -0
- package/dist/lib/rag/document/MDocument.js +393 -0
- package/dist/lib/rag/document/index.d.ts +5 -0
- package/dist/lib/rag/document/index.js +6 -0
- package/dist/lib/rag/document/loaders.d.ts +201 -0
- package/dist/lib/rag/document/loaders.js +501 -0
- package/dist/lib/rag/errors/RAGError.d.ts +244 -0
- package/dist/lib/rag/errors/RAGError.js +275 -0
- package/dist/lib/rag/errors/index.d.ts +6 -0
- package/dist/lib/rag/errors/index.js +7 -0
- package/dist/lib/rag/graphRag/graphRAG.d.ts +115 -0
- package/dist/lib/rag/graphRag/graphRAG.js +385 -0
- package/dist/lib/rag/graphRag/index.d.ts +4 -0
- package/dist/lib/rag/graphRag/index.js +5 -0
- package/dist/lib/rag/index.d.ts +103 -0
- package/dist/lib/rag/index.js +142 -0
- package/dist/lib/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
- package/dist/lib/rag/metadata/MetadataExtractorFactory.js +419 -0
- package/dist/lib/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
- package/dist/lib/rag/metadata/MetadataExtractorRegistry.js +363 -0
- package/dist/lib/rag/metadata/index.d.ts +6 -0
- package/dist/lib/rag/metadata/index.js +10 -0
- package/dist/lib/rag/metadata/metadataExtractor.d.ts +69 -0
- package/dist/lib/rag/metadata/metadataExtractor.js +278 -0
- package/dist/lib/rag/pipeline/RAGPipeline.d.ts +235 -0
- package/dist/lib/rag/pipeline/RAGPipeline.js +402 -0
- package/dist/lib/rag/pipeline/contextAssembly.d.ts +126 -0
- package/dist/lib/rag/pipeline/contextAssembly.js +338 -0
- package/dist/lib/rag/pipeline/index.d.ts +5 -0
- package/dist/lib/rag/pipeline/index.js +6 -0
- package/dist/lib/rag/ragIntegration.d.ts +38 -0
- package/dist/lib/rag/ragIntegration.js +212 -0
- package/dist/lib/rag/reranker/RerankerFactory.d.ts +184 -0
- package/dist/lib/rag/reranker/RerankerFactory.js +431 -0
- package/dist/lib/rag/reranker/RerankerRegistry.d.ts +119 -0
- package/dist/lib/rag/reranker/RerankerRegistry.js +403 -0
- package/dist/lib/rag/reranker/index.d.ts +6 -0
- package/dist/lib/rag/reranker/index.js +10 -0
- package/dist/lib/rag/reranker/reranker.d.ts +71 -0
- package/dist/lib/rag/reranker/reranker.js +278 -0
- package/dist/lib/rag/resilience/CircuitBreaker.d.ts +215 -0
- package/dist/lib/rag/resilience/CircuitBreaker.js +432 -0
- package/dist/lib/rag/resilience/RetryHandler.d.ts +115 -0
- package/dist/lib/rag/resilience/RetryHandler.js +301 -0
- package/dist/lib/rag/resilience/index.d.ts +7 -0
- package/dist/lib/rag/resilience/index.js +8 -0
- package/dist/lib/rag/retrieval/hybridSearch.d.ts +94 -0
- package/dist/lib/rag/retrieval/hybridSearch.js +314 -0
- package/dist/lib/rag/retrieval/index.d.ts +5 -0
- package/dist/lib/rag/retrieval/index.js +6 -0
- package/dist/lib/rag/retrieval/vectorQueryTool.d.ts +93 -0
- package/dist/lib/rag/retrieval/vectorQueryTool.js +290 -0
- package/dist/lib/rag/types.d.ts +768 -0
- package/dist/lib/rag/types.js +9 -0
- package/dist/lib/server/index.d.ts +15 -11
- package/dist/lib/server/index.js +55 -51
- package/dist/lib/server/utils/validation.d.ts +2 -2
- package/dist/lib/types/common.d.ts +0 -1
- package/dist/lib/types/fileTypes.d.ts +1 -1
- package/dist/lib/types/generateTypes.d.ts +42 -8
- package/dist/lib/types/generateTypes.js +1 -1
- package/dist/lib/types/index.d.ts +25 -24
- package/dist/lib/types/index.js +21 -20
- package/dist/lib/types/modelTypes.d.ts +16 -16
- package/dist/lib/types/pptTypes.d.ts +14 -2
- package/dist/lib/types/pptTypes.js +16 -0
- package/dist/lib/types/streamTypes.d.ts +28 -8
- package/dist/lib/types/streamTypes.js +1 -1
- package/dist/lib/utils/async/delay.d.ts +40 -0
- package/dist/lib/utils/async/delay.js +43 -0
- package/dist/lib/utils/async/index.d.ts +23 -0
- package/dist/lib/utils/async/index.js +24 -0
- package/dist/lib/utils/async/retry.d.ts +141 -0
- package/dist/lib/utils/async/retry.js +172 -0
- package/dist/lib/utils/async/withTimeout.d.ts +73 -0
- package/dist/lib/utils/async/withTimeout.js +97 -0
- package/dist/lib/utils/fileDetector.d.ts +7 -1
- package/dist/lib/utils/fileDetector.js +91 -18
- package/dist/lib/utils/json/extract.d.ts +103 -0
- package/dist/lib/utils/json/extract.js +249 -0
- package/dist/lib/utils/json/index.d.ts +36 -0
- package/dist/lib/utils/json/index.js +37 -0
- package/dist/lib/utils/json/safeParse.d.ts +137 -0
- package/dist/lib/utils/json/safeParse.js +191 -0
- package/dist/lib/utils/messageBuilder.d.ts +2 -2
- package/dist/lib/utils/messageBuilder.js +15 -7
- package/dist/lib/utils/modelRouter.d.ts +4 -4
- package/dist/lib/utils/modelRouter.js +4 -4
- package/dist/lib/utils/sanitizers/filename.d.ts +137 -0
- package/dist/lib/utils/sanitizers/filename.js +366 -0
- package/dist/lib/utils/sanitizers/html.d.ts +170 -0
- package/dist/lib/utils/sanitizers/html.js +326 -0
- package/dist/lib/utils/sanitizers/index.d.ts +26 -0
- package/dist/lib/utils/sanitizers/index.js +30 -0
- package/dist/lib/utils/sanitizers/svg.d.ts +81 -0
- package/dist/lib/utils/sanitizers/svg.js +483 -0
- package/dist/mcp/index.d.ts +6 -5
- package/dist/mcp/index.js +7 -5
- package/dist/neurolink.d.ts +11 -13
- package/dist/neurolink.js +95 -29
- package/dist/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/processors/base/BaseFileProcessor.js +613 -0
- package/dist/processors/base/index.d.ts +14 -0
- package/dist/processors/base/index.js +19 -0
- package/dist/processors/base/types.d.ts +593 -0
- package/dist/processors/base/types.js +76 -0
- package/dist/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/processors/cli/fileProcessorCli.js +388 -0
- package/dist/processors/cli/index.d.ts +37 -0
- package/dist/processors/cli/index.js +49 -0
- package/dist/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/processors/code/ConfigProcessor.js +400 -0
- package/dist/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/processors/code/SourceCodeProcessor.js +304 -0
- package/dist/processors/code/index.d.ts +44 -0
- package/dist/processors/code/index.js +60 -0
- package/dist/processors/config/fileTypes.d.ts +283 -0
- package/dist/processors/config/fileTypes.js +520 -0
- package/dist/processors/config/index.d.ts +32 -0
- package/dist/processors/config/index.js +92 -0
- package/dist/processors/config/languageMap.d.ts +66 -0
- package/dist/processors/config/languageMap.js +410 -0
- package/dist/processors/config/mimeTypes.d.ts +376 -0
- package/dist/processors/config/mimeTypes.js +338 -0
- package/dist/processors/config/sizeLimits.d.ts +194 -0
- package/dist/processors/config/sizeLimits.js +246 -0
- package/dist/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/processors/data/JsonProcessor.js +203 -0
- package/dist/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/processors/data/XmlProcessor.js +283 -0
- package/dist/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/processors/data/YamlProcessor.js +294 -0
- package/dist/processors/data/index.d.ts +49 -0
- package/dist/processors/data/index.js +76 -0
- package/dist/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/processors/document/ExcelProcessor.js +519 -0
- package/dist/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/processors/document/OpenDocumentProcessor.js +210 -0
- package/dist/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/processors/document/RtfProcessor.js +361 -0
- package/dist/processors/document/WordProcessor.d.ts +168 -0
- package/dist/processors/document/WordProcessor.js +353 -0
- package/dist/processors/document/index.d.ts +54 -0
- package/dist/processors/document/index.js +90 -0
- package/dist/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/processors/errors/FileErrorCode.js +255 -0
- package/dist/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/processors/errors/errorHelpers.js +378 -0
- package/dist/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/processors/errors/errorSerializer.js +507 -0
- package/dist/processors/errors/index.d.ts +46 -0
- package/dist/processors/errors/index.js +49 -0
- package/dist/processors/index.d.ts +76 -0
- package/dist/processors/index.js +112 -0
- package/dist/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/processors/integration/FileProcessorIntegration.js +272 -0
- package/dist/processors/integration/index.d.ts +42 -0
- package/dist/processors/integration/index.js +44 -0
- package/dist/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/processors/markup/HtmlProcessor.js +249 -0
- package/dist/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/processors/markup/MarkdownProcessor.js +244 -0
- package/dist/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/processors/markup/SvgProcessor.js +240 -0
- package/dist/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/processors/markup/TextProcessor.js +188 -0
- package/dist/processors/markup/index.d.ts +66 -0
- package/dist/processors/markup/index.js +102 -0
- package/dist/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/processors/registry/ProcessorRegistry.js +608 -0
- package/dist/processors/registry/index.d.ts +12 -0
- package/dist/processors/registry/index.js +16 -0
- package/dist/processors/registry/types.d.ts +53 -0
- package/dist/processors/registry/types.js +10 -0
- package/dist/providers/amazonBedrock.d.ts +15 -2
- package/dist/providers/amazonBedrock.js +65 -8
- package/dist/providers/anthropic.d.ts +3 -3
- package/dist/providers/anthropic.js +10 -7
- package/dist/providers/googleAiStudio.d.ts +5 -5
- package/dist/providers/googleAiStudio.js +10 -7
- package/dist/providers/googleVertex.d.ts +16 -4
- package/dist/providers/googleVertex.js +72 -16
- package/dist/providers/litellm.d.ts +3 -3
- package/dist/providers/litellm.js +10 -10
- package/dist/providers/mistral.d.ts +3 -3
- package/dist/providers/mistral.js +7 -6
- package/dist/providers/ollama.d.ts +3 -4
- package/dist/providers/ollama.js +7 -8
- package/dist/providers/openAI.d.ts +14 -2
- package/dist/providers/openAI.js +60 -6
- package/dist/providers/openRouter.d.ts +2 -2
- package/dist/providers/openRouter.js +10 -6
- package/dist/rag/ChunkerFactory.d.ts +91 -0
- package/dist/rag/ChunkerFactory.js +320 -0
- package/dist/rag/ChunkerRegistry.d.ts +91 -0
- package/dist/rag/ChunkerRegistry.js +421 -0
- package/dist/rag/chunkers/BaseChunker.d.ts +53 -0
- package/dist/rag/chunkers/BaseChunker.js +143 -0
- package/dist/rag/chunkers/CharacterChunker.d.ts +18 -0
- package/dist/rag/chunkers/CharacterChunker.js +28 -0
- package/dist/rag/chunkers/HTMLChunker.d.ts +19 -0
- package/dist/rag/chunkers/HTMLChunker.js +38 -0
- package/dist/rag/chunkers/JSONChunker.d.ts +19 -0
- package/dist/rag/chunkers/JSONChunker.js +68 -0
- package/dist/rag/chunkers/LaTeXChunker.d.ts +15 -0
- package/dist/rag/chunkers/LaTeXChunker.js +63 -0
- package/dist/rag/chunkers/MarkdownChunker.d.ts +15 -0
- package/dist/rag/chunkers/MarkdownChunker.js +102 -0
- package/dist/rag/chunkers/RecursiveChunker.d.ts +27 -0
- package/dist/rag/chunkers/RecursiveChunker.js +139 -0
- package/dist/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
- package/dist/rag/chunkers/SemanticMarkdownChunker.js +138 -0
- package/dist/rag/chunkers/SentenceChunker.d.ts +19 -0
- package/dist/rag/chunkers/SentenceChunker.js +66 -0
- package/dist/rag/chunkers/TokenChunker.d.ts +19 -0
- package/dist/rag/chunkers/TokenChunker.js +61 -0
- package/dist/rag/chunkers/index.d.ts +15 -0
- package/dist/rag/chunkers/index.js +15 -0
- package/dist/rag/chunking/characterChunker.d.ts +16 -0
- package/dist/rag/chunking/characterChunker.js +142 -0
- package/dist/rag/chunking/chunkerRegistry.d.ts +67 -0
- package/dist/rag/chunking/chunkerRegistry.js +194 -0
- package/dist/rag/chunking/htmlChunker.d.ts +34 -0
- package/dist/rag/chunking/htmlChunker.js +247 -0
- package/dist/rag/chunking/index.d.ts +15 -0
- package/dist/rag/chunking/index.js +17 -0
- package/dist/rag/chunking/jsonChunker.d.ts +20 -0
- package/dist/rag/chunking/jsonChunker.js +281 -0
- package/dist/rag/chunking/latexChunker.d.ts +26 -0
- package/dist/rag/chunking/latexChunker.js +251 -0
- package/dist/rag/chunking/markdownChunker.d.ts +19 -0
- package/dist/rag/chunking/markdownChunker.js +201 -0
- package/dist/rag/chunking/recursiveChunker.d.ts +19 -0
- package/dist/rag/chunking/recursiveChunker.js +148 -0
- package/dist/rag/chunking/semanticChunker.d.ts +41 -0
- package/dist/rag/chunking/semanticChunker.js +306 -0
- package/dist/rag/chunking/sentenceChunker.d.ts +25 -0
- package/dist/rag/chunking/sentenceChunker.js +230 -0
- package/dist/rag/chunking/tokenChunker.d.ts +36 -0
- package/dist/rag/chunking/tokenChunker.js +183 -0
- package/dist/rag/document/MDocument.d.ts +198 -0
- package/dist/rag/document/MDocument.js +392 -0
- package/dist/rag/document/index.d.ts +5 -0
- package/dist/rag/document/index.js +5 -0
- package/dist/rag/document/loaders.d.ts +201 -0
- package/dist/rag/document/loaders.js +500 -0
- package/dist/rag/errors/RAGError.d.ts +244 -0
- package/dist/rag/errors/RAGError.js +274 -0
- package/dist/rag/errors/index.d.ts +6 -0
- package/dist/rag/errors/index.js +6 -0
- package/dist/rag/graphRag/graphRAG.d.ts +115 -0
- package/dist/rag/graphRag/graphRAG.js +384 -0
- package/dist/rag/graphRag/index.d.ts +4 -0
- package/dist/rag/graphRag/index.js +4 -0
- package/dist/rag/index.d.ts +103 -0
- package/dist/rag/index.js +141 -0
- package/dist/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
- package/dist/rag/metadata/MetadataExtractorFactory.js +418 -0
- package/dist/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
- package/dist/rag/metadata/MetadataExtractorRegistry.js +362 -0
- package/dist/rag/metadata/index.d.ts +6 -0
- package/dist/rag/metadata/index.js +9 -0
- package/dist/rag/metadata/metadataExtractor.d.ts +69 -0
- package/dist/rag/metadata/metadataExtractor.js +277 -0
- package/dist/rag/pipeline/RAGPipeline.d.ts +235 -0
- package/dist/rag/pipeline/RAGPipeline.js +401 -0
- package/dist/rag/pipeline/contextAssembly.d.ts +126 -0
- package/dist/rag/pipeline/contextAssembly.js +337 -0
- package/dist/rag/pipeline/index.d.ts +5 -0
- package/dist/rag/pipeline/index.js +5 -0
- package/dist/rag/ragIntegration.d.ts +38 -0
- package/dist/rag/ragIntegration.js +211 -0
- package/dist/rag/reranker/RerankerFactory.d.ts +184 -0
- package/dist/rag/reranker/RerankerFactory.js +430 -0
- package/dist/rag/reranker/RerankerRegistry.d.ts +119 -0
- package/dist/rag/reranker/RerankerRegistry.js +402 -0
- package/dist/rag/reranker/index.d.ts +6 -0
- package/dist/rag/reranker/index.js +9 -0
- package/dist/rag/reranker/reranker.d.ts +71 -0
- package/dist/rag/reranker/reranker.js +277 -0
- package/dist/rag/resilience/CircuitBreaker.d.ts +215 -0
- package/dist/rag/resilience/CircuitBreaker.js +431 -0
- package/dist/rag/resilience/RetryHandler.d.ts +115 -0
- package/dist/rag/resilience/RetryHandler.js +300 -0
- package/dist/rag/resilience/index.d.ts +7 -0
- package/dist/rag/resilience/index.js +7 -0
- package/dist/rag/retrieval/hybridSearch.d.ts +94 -0
- package/dist/rag/retrieval/hybridSearch.js +313 -0
- package/dist/rag/retrieval/index.d.ts +5 -0
- package/dist/rag/retrieval/index.js +5 -0
- package/dist/rag/retrieval/vectorQueryTool.d.ts +93 -0
- package/dist/rag/retrieval/vectorQueryTool.js +289 -0
- package/dist/rag/types.d.ts +768 -0
- package/dist/rag/types.js +8 -0
- package/dist/server/index.d.ts +15 -11
- package/dist/server/index.js +55 -51
- package/dist/server/utils/validation.d.ts +8 -8
- package/dist/types/common.d.ts +0 -1
- package/dist/types/fileTypes.d.ts +1 -1
- package/dist/types/generateTypes.d.ts +42 -8
- package/dist/types/generateTypes.js +1 -1
- package/dist/types/index.d.ts +25 -24
- package/dist/types/index.js +21 -20
- package/dist/types/modelTypes.d.ts +10 -10
- package/dist/types/pptTypes.d.ts +14 -2
- package/dist/types/pptTypes.js +16 -0
- package/dist/types/streamTypes.d.ts +28 -8
- package/dist/types/streamTypes.js +1 -1
- package/dist/utils/async/delay.d.ts +40 -0
- package/dist/utils/async/delay.js +42 -0
- package/dist/utils/async/index.d.ts +23 -0
- package/dist/utils/async/index.js +23 -0
- package/dist/utils/async/retry.d.ts +141 -0
- package/dist/utils/async/retry.js +171 -0
- package/dist/utils/async/withTimeout.d.ts +73 -0
- package/dist/utils/async/withTimeout.js +96 -0
- package/dist/utils/fileDetector.d.ts +7 -1
- package/dist/utils/fileDetector.js +91 -18
- package/dist/utils/json/extract.d.ts +103 -0
- package/dist/utils/json/extract.js +248 -0
- package/dist/utils/json/index.d.ts +36 -0
- package/dist/utils/json/index.js +36 -0
- package/dist/utils/json/safeParse.d.ts +137 -0
- package/dist/utils/json/safeParse.js +190 -0
- package/dist/utils/messageBuilder.d.ts +2 -2
- package/dist/utils/messageBuilder.js +15 -7
- package/dist/utils/modelRouter.d.ts +4 -4
- package/dist/utils/modelRouter.js +4 -4
- package/dist/utils/sanitizers/filename.d.ts +137 -0
- package/dist/utils/sanitizers/filename.js +365 -0
- package/dist/utils/sanitizers/html.d.ts +170 -0
- package/dist/utils/sanitizers/html.js +325 -0
- package/dist/utils/sanitizers/index.d.ts +26 -0
- package/dist/utils/sanitizers/index.js +29 -0
- package/dist/utils/sanitizers/svg.d.ts +81 -0
- package/dist/utils/sanitizers/svg.js +482 -0
- package/package.json +2 -2
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LaTeX-aware Chunker
|
|
3
|
+
*
|
|
4
|
+
* Splits LaTeX documents based on structure (sections, environments, math).
|
|
5
|
+
* Best for academic papers, scientific documents, and mathematical content.
|
|
6
|
+
*/
|
|
7
|
+
import { randomUUID } from "crypto";
|
|
8
|
+
/**
|
|
9
|
+
* LaTeX-aware chunker implementation
|
|
10
|
+
* Splits based on LaTeX structure (sections, environments)
|
|
11
|
+
*/
|
|
12
|
+
export class LaTeXChunker {
    /** Strategy identifier reported by this chunker. */
    strategy = "latex";
    /** Sectioning commands used as split points when the caller supplies none. */
    defaultSplitEnvironments = [
        "section",
        "subsection",
        "subsubsection",
        "chapter",
        "part",
    ];
    /** Display-math environments hidden from the section splitter when `preserveMath` is on. */
    mathEnvironments = [
        "equation",
        "equation*",
        "align",
        "align*",
        "gather",
        "gather*",
        "multline",
        "multline*",
        "displaymath",
    ];
    /**
     * Split a LaTeX document into chunks along its sectioning commands.
     *
     * Steps: (1) if the text contains `\begin{document}…\end{document}`, everything
     * before it is the preamble and is optionally emitted as the first chunk;
     * (2) math is temporarily swapped for placeholders so that nothing inside a
     * formula is mistaken for a sectioning command; (3) the body is split by
     * section and the math is restored; (4) each section is cut down to `maxSize`.
     *
     * @param text   LaTeX source.
     * @param config Optional settings: `maxSize` (default 1000), `overlap` (0),
     *               `splitEnvironments`, `preserveMath` (true), `includePreamble`
     *               (true), `trimWhitespace` (true), `metadata` (copied to
     *               `metadata.custom` of every chunk).
     * @returns Promise resolving to the chunk list (empty for empty input).
     */
    async chunk(text, config) {
        const { maxSize = 1000, overlap = 0, splitEnvironments = this.defaultSplitEnvironments, preserveMath = true, includePreamble = true, trimWhitespace = true, metadata = {}, } = config || {};
        const documentId = randomUUID();
        const chunks = [];
        if (!text || text.length === 0) {
            return chunks;
        }
        // Extract preamble if present
        const preambleMatch = text.match(/^([\s\S]*?)\\begin\{document\}([\s\S]*?)\\end\{document\}/);
        let preamble = "";
        let documentContent = text;
        if (preambleMatch) {
            preamble = preambleMatch[1].trim();
            documentContent = preambleMatch[2];
            // Add preamble as first chunk if requested
            if (includePreamble && preamble.length > 0) {
                chunks.push({
                    id: randomUUID(),
                    text: preamble,
                    metadata: {
                        documentId,
                        chunkIndex: 0,
                        startPosition: 0,
                        endPosition: preamble.length,
                        documentType: "latex",
                        latexEnvironment: "preamble",
                        custom: metadata,
                    },
                });
            }
        }
        // Protect math so the section pattern cannot match inside a formula
        let processedContent = documentContent;
        const mathBlocks = [];
        if (preserveMath) {
            const stash = (match) => {
                const placeholder = `__MATH_${mathBlocks.length}__`;
                mathBlocks.push({ placeholder, content: match });
                return placeholder;
            };
            for (const env of this.mathEnvironments) {
                // Escape the name: the "*" of starred environments is a regex
                // quantifier and would otherwise stop them from ever matching.
                const name = env.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
                const envPattern = new RegExp(`\\\\begin\\{${name}\\}[\\s\\S]*?\\\\end\\{${name}\\}`, "g");
                processedContent = processedContent.replace(envPattern, stash);
            }
            processedContent = processedContent
                .replace(/\$\$[\s\S]*?\$\$/g, stash) // $$ … $$
                .replace(/\$[^$]+\$/g, stash) // $ … $
                .replace(/\\\[[\s\S]*?\\\]/g, stash); // \[ … \]
        }
        // Split by sectioning commands
        const sections = this.splitBySections(processedContent, splitEnvironments);
        let chunkIndex = chunks.length;
        let currentPosition = includePreamble && preamble.length > 0 ? preamble.length : 0;
        for (const { title, content, environment } of sections) {
            // Restore math, newest placeholder first: a later block may contain an
            // earlier placeholder (e.g. `$x$` inside `\[ … \]`), never the reverse.
            let restoredContent = content;
            for (let i = mathBlocks.length - 1; i >= 0; i--) {
                const { placeholder, content: mathContent } = mathBlocks[i];
                // Function replacer: a string replacement would interpret "$$",
                // "$&", "$'" … inside the math and corrupt it.
                restoredContent = restoredContent.replace(placeholder, () => mathContent);
            }
            // Split if content is too large
            const contentChunks = this.splitContent(restoredContent, maxSize, overlap);
            for (let i = 0; i < contentChunks.length; i++) {
                let chunkText = contentChunks[i];
                // Include section command in first chunk
                if (i === 0 && title && environment) {
                    chunkText = `\\${environment}{${title}}\n${chunkText}`;
                }
                const finalText = trimWhitespace ? chunkText.trim() : chunkText;
                if (finalText.length > 0) {
                    chunks.push({
                        id: randomUUID(),
                        text: finalText,
                        metadata: {
                            documentId,
                            chunkIndex,
                            startPosition: currentPosition,
                            endPosition: currentPosition + chunkText.length,
                            documentType: "latex",
                            latexEnvironment: environment ?? undefined,
                            header: title ?? undefined,
                            custom: metadata,
                        },
                    });
                    chunkIndex++;
                }
                // Positions are a running total of emitted text lengths.
                currentPosition += chunkText.length;
            }
        }
        // Update total chunks count
        chunks.forEach((chunk) => {
            chunk.metadata.totalChunks = chunks.length;
        });
        return chunks;
    }
    /**
     * Split LaTeX by sectioning commands.
     *
     * @param content           Text to split (math may already be replaced by placeholders).
     * @param splitEnvironments Command names (without backslash) that start a new
     *                          section; an empty or missing list falls back to
     *                          `defaultSplitEnvironments`.
     * @returns Array of `{ title, content, environment }`; text before the first
     *          command has `title` and `environment` set to `null`.
     */
    splitBySections(content, splitEnvironments) {
        const sections = [];
        // An empty list would build the pattern `\\()\*?\{…\}`, which matches any
        // escaped brace group; use the defaults instead, as validateConfig announces.
        const commands = splitEnvironments && splitEnvironments.length > 0
            ? splitEnvironments
            : this.defaultSplitEnvironments;
        const sectionPattern = new RegExp(`\\\\(${commands.join("|")})\\*?\\{([^}]*)\\}`, "g");
        let lastIndex = 0;
        let lastTitle = null;
        let lastEnvironment = null;
        const pushSection = (raw) => {
            const trimmed = raw.trim();
            if (trimmed) {
                sections.push({
                    title: lastTitle,
                    content: trimmed,
                    environment: lastEnvironment,
                });
            }
        };
        let match;
        while ((match = sectionPattern.exec(content)) !== null) {
            // Content before this command belongs to the previous section
            if (match.index > lastIndex) {
                pushSection(content.slice(lastIndex, match.index));
            }
            lastEnvironment = match[1];
            lastTitle = match[2];
            lastIndex = match.index + match[0].length;
        }
        // Content after the last sectioning command
        if (lastIndex < content.length) {
            pushSection(content.slice(lastIndex));
        }
        // If no sections found, return entire content
        if (sections.length === 0 && content.trim()) {
            sections.push({
                title: null,
                content: content.trim(),
                environment: null,
            });
        }
        return sections;
    }
    /**
     * Split content that exceeds max size.
     *
     * Cuts are preferably placed at a paragraph break, else at a sentence break,
     * found in the last 200 characters of the window; otherwise the cut is hard.
     *
     * @param content Text to split.
     * @param maxSize Maximum window length (values below 1 are treated as 1).
     * @param overlap Characters shared between consecutive pieces (clamped to
     *                `[0, maxSize - 1]`).
     * @returns Pieces in document order; `[content]` when it already fits.
     */
    splitContent(content, maxSize, overlap) {
        const effectiveMaxSize = Math.max(maxSize, 1);
        const effectiveOverlap = Math.min(Math.max(overlap, 0), effectiveMaxSize - 1);
        if (content.length <= effectiveMaxSize) {
            return [content];
        }
        const pieces = [];
        let start = 0;
        while (start < content.length) {
            let end = Math.min(start + effectiveMaxSize, content.length);
            if (end < content.length) {
                const searchStart = Math.max(start, end - 200);
                const searchText = content.slice(searchStart, end);
                const paragraphBreak = searchText.lastIndexOf("\n\n");
                if (paragraphBreak > 0) {
                    end = searchStart + paragraphBreak;
                }
                else {
                    const sentenceBreak = searchText.search(/[.!?]\s+[A-Z\\]/);
                    if (sentenceBreak > 0) {
                        end = searchStart + sentenceBreak + 1;
                    }
                }
            }
            pieces.push(content.slice(start, end));
            // The piece that reaches the end is the last one. Without this exit a
            // positive overlap re-enters the loop and emits one shrinking tail
            // fragment per overlapped character.
            if (end >= content.length) {
                break;
            }
            // Always advance by at least one character.
            start = Math.max(start + 1, end - effectiveOverlap);
        }
        return pieces;
    }
    /**
     * Validate a chunker configuration without running it.
     *
     * @param config Configuration to check.
     * @returns `{ valid, errors, warnings }`; `valid` is true when `errors` is empty.
     */
    validateConfig(config) {
        const errors = [];
        const warnings = [];
        const latexConfig = config;
        if (latexConfig.maxSize !== undefined && latexConfig.maxSize <= 0) {
            errors.push("maxSize must be greater than 0");
        }
        if (latexConfig.overlap !== undefined && latexConfig.overlap < 0) {
            errors.push("overlap must be non-negative");
        }
        if (latexConfig.overlap !== undefined &&
            latexConfig.maxSize !== undefined &&
            latexConfig.overlap >= latexConfig.maxSize) {
            errors.push("overlap must be less than maxSize");
        }
        if (latexConfig.splitEnvironments !== undefined &&
            latexConfig.splitEnvironments.length === 0) {
            warnings.push("No split environments specified, using defaults");
        }
        return {
            valid: errors.length === 0,
            errors,
            warnings,
        };
    }
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown-aware Chunker
|
|
3
|
+
*
|
|
4
|
+
* Splits markdown documents based on header structure while preserving formatting.
|
|
5
|
+
* Best for documentation, README files, and structured markdown content.
|
|
6
|
+
*/
|
|
7
|
+
import type { BaseChunkerConfig, Chunk, Chunker, ChunkerValidationResult, MarkdownChunkerConfig } from "../types.js";
|
|
8
|
+
/**
|
|
9
|
+
* Markdown-aware chunker implementation
|
|
10
|
+
* Splits based on markdown structure (headers, code blocks, etc.)
|
|
11
|
+
*/
|
|
12
|
+
export declare class MarkdownChunker implements Chunker {
    /** Strategy identifier; always the literal `"markdown"`. */
    readonly strategy: "markdown";
    /**
     * Split markdown text into chunks.
     *
     * @param text - Markdown source to split.
     * @param config - Optional markdown chunking options.
     * @returns A promise resolving to the produced chunks.
     */
    chunk(text: string, config?: MarkdownChunkerConfig): Promise<Chunk[]>;
    /** Internal helper (private); not part of the public contract. */
    private splitByHeaders;
    /** Internal helper (private); not part of the public contract. */
    private splitContent;
    /** Internal helper (private); not part of the public contract. */
    private stripMarkdown;
    /**
     * Check a configuration without chunking anything.
     *
     * @param config - Configuration to validate.
     * @returns The validation result.
     */
    validateConfig(config: BaseChunkerConfig): ChunkerValidationResult;
}
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown-aware Chunker
|
|
3
|
+
*
|
|
4
|
+
* Splits markdown documents based on header structure while preserving formatting.
|
|
5
|
+
* Best for documentation, README files, and structured markdown content.
|
|
6
|
+
*/
|
|
7
|
+
import { randomUUID } from "crypto";
|
|
8
|
+
/**
|
|
9
|
+
* Markdown-aware chunker implementation
|
|
10
|
+
* Splits based on markdown structure (headers, code blocks, etc.)
|
|
11
|
+
*/
|
|
12
|
+
export class MarkdownChunker {
    /** Strategy identifier reported by this chunker. */
    strategy = "markdown";
    /**
     * Split a markdown document into chunks along its headers.
     *
     * The text is cut into sections at headers whose level lies between the
     * smallest and largest entry of `headerLevels`. Inside each section, code
     * spans and fenced blocks are swapped for placeholders while the section is
     * cut down to size, then restored in every resulting chunk.
     *
     * @param text   Markdown source.
     * @param config Optional settings: `maxSize` (default 1000), `overlap` (0),
     *               `headerLevels` ([1, 2, 3]), `preserveCodeBlocks` (true),
     *               `includeHeader` (true), `stripFormatting` (false),
     *               `trimWhitespace` (true), `metadata` (copied to
     *               `metadata.custom` of every chunk).
     * @returns Promise resolving to the chunk list (empty for empty input).
     */
    async chunk(text, config) {
        const { maxSize = 1000, overlap = 0, headerLevels = [1, 2, 3], preserveCodeBlocks = true, includeHeader = true, stripFormatting = false, trimWhitespace = true, metadata = {}, } = config || {};
        const documentId = randomUUID();
        const chunks = [];
        if (!text || text.length === 0) {
            return chunks;
        }
        // An empty list would yield `#{Infinity,-Infinity}`, which never matches a
        // header; fall back to the default levels instead.
        const levels = Array.isArray(headerLevels) && headerLevels.length > 0
            ? headerLevels
            : [1, 2, 3];
        // Build header regex pattern
        const headerPattern = new RegExp(`^(#{${Math.min(...levels)},${Math.max(...levels)}})\\s+(.+)$`, "gm");
        // Split by headers while preserving them
        const sections = this.splitByHeaders(text, headerPattern, includeHeader);
        let chunkIndex = 0;
        let currentPosition = 0;
        for (const { header, content, level } of sections) {
            // Hide code so the size splitter treats each code span as one short token
            let processedContent = content;
            const codeBlocks = [];
            if (preserveCodeBlocks) {
                processedContent = content.replace(/```[\s\S]*?```|`[^`]+`/g, (match) => {
                    const placeholder = `__CODE_BLOCK_${codeBlocks.length}__`;
                    codeBlocks.push({ placeholder, code: match });
                    return placeholder;
                });
            }
            // Leave room for the header that is prepended to every chunk (at least 100 chars)
            const effectiveMaxSize = Math.max(maxSize - (header?.length || 0), 100);
            const contentChunks = this.splitContent(processedContent, effectiveMaxSize, overlap);
            for (const contentChunk of contentChunks) {
                let chunkText = header && includeHeader
                    ? `${header}\n\n${contentChunk}`
                    : contentChunk;
                // Restore code blocks. A function replacer is required: with a
                // string replacement, "$$", "$&", "$'" … inside the code would be
                // interpreted as replacement patterns and corrupt it.
                for (const { placeholder, code } of codeBlocks) {
                    chunkText = chunkText.replace(placeholder, () => code);
                }
                // Strip formatting if requested
                if (stripFormatting) {
                    chunkText = this.stripMarkdown(chunkText);
                }
                const finalText = trimWhitespace ? chunkText.trim() : chunkText;
                if (finalText.length > 0) {
                    chunks.push({
                        id: randomUUID(),
                        text: finalText,
                        metadata: {
                            documentId,
                            chunkIndex,
                            startPosition: currentPosition,
                            endPosition: currentPosition + chunkText.length,
                            documentType: "markdown",
                            headerLevel: level ?? undefined,
                            header: header?.replace(/^#+\s*/, "") ?? undefined,
                            custom: metadata,
                        },
                    });
                    chunkIndex++;
                }
                // Positions are a running total of emitted text lengths.
                currentPosition += chunkText.length;
            }
        }
        // Update total chunks count
        chunks.forEach((chunk) => {
            chunk.metadata.totalChunks = chunks.length;
        });
        return chunks;
    }
    /**
     * Cut text into sections at every match of `headerPattern`.
     *
     * @param text           Markdown source.
     * @param headerPattern  Global regex whose group 1 captures the `#` run.
     * @param _includeHeader Unused; kept for signature compatibility.
     * @returns Array of `{ header, content, level }`; text before the first header
     *          has `header` and `level` set to `null`. Headers followed by no
     *          content produce no section.
     */
    splitByHeaders(text, headerPattern, _includeHeader) {
        const sections = [];
        let lastIndex = 0;
        let currentHeader = null;
        let currentLevel = null;
        const pushSection = (raw) => {
            const trimmed = raw.trim();
            if (trimmed) {
                sections.push({
                    header: currentHeader,
                    content: trimmed,
                    level: currentLevel,
                });
            }
        };
        // Reset regex
        headerPattern.lastIndex = 0;
        let match;
        while ((match = headerPattern.exec(text)) !== null) {
            // Content before this header belongs to the previous header
            if (match.index > lastIndex) {
                pushSection(text.slice(lastIndex, match.index));
            }
            currentHeader = match[0];
            currentLevel = match[1].length; // Number of # characters
            lastIndex = match.index + match[0].length;
        }
        // Content after the last header
        if (lastIndex < text.length) {
            pushSection(text.slice(lastIndex));
        }
        // If no headers found, return entire text as one section
        if (sections.length === 0 && text.trim()) {
            sections.push({
                header: null,
                content: text.trim(),
                level: null,
            });
        }
        return sections;
    }
    /**
     * Cut content into pieces of at most `maxSize` characters.
     *
     * Cuts are preferably placed at a paragraph break, else at a sentence break,
     * found in the last 200 characters of the window; otherwise the cut is hard.
     *
     * @param content Text to split.
     * @param maxSize Maximum window length (values below 1 are treated as 1).
     * @param overlap Characters shared between consecutive pieces (clamped to
     *                `[0, maxSize - 1]`).
     * @returns Pieces in document order; `[content]` when it already fits.
     */
    splitContent(content, maxSize, overlap) {
        const effectiveMaxSize = Math.max(maxSize, 1);
        const effectiveOverlap = Math.min(Math.max(overlap, 0), effectiveMaxSize - 1);
        if (content.length <= effectiveMaxSize) {
            return [content];
        }
        const pieces = [];
        let start = 0;
        while (start < content.length) {
            let end = Math.min(start + effectiveMaxSize, content.length);
            if (end < content.length) {
                const searchStart = Math.max(start, end - 200);
                const searchText = content.slice(searchStart, end);
                const paragraphBreak = searchText.lastIndexOf("\n\n");
                if (paragraphBreak > 0) {
                    end = searchStart + paragraphBreak;
                }
                else {
                    const sentenceBreak = searchText.search(/[.!?]\s+[A-Z]/);
                    if (sentenceBreak > 0) {
                        end = searchStart + sentenceBreak + 1;
                    }
                }
            }
            pieces.push(content.slice(start, end));
            // The piece that reaches the end is the last one. Without this exit a
            // positive overlap re-enters the loop and emits one shrinking tail
            // fragment per overlapped character.
            if (end >= content.length) {
                break;
            }
            // Always advance by at least one character.
            start = Math.max(start + 1, end - effectiveOverlap);
        }
        return pieces;
    }
    /**
     * Remove common markdown formatting, keeping the visible text.
     *
     * Order matters: fenced code must be removed before the inline-code rule
     * (which would otherwise collapse each "```" fence into "`" so the block is
     * never removed), and images must be handled before links (the link rule
     * would otherwise consume `[alt](url)` and leave the leading "!").
     *
     * @param text Markdown text.
     * @returns Text with headers, emphasis, code, links and images unwrapped;
     *          fenced code blocks are dropped entirely.
     */
    stripMarkdown(text) {
        return text
            .replace(/```[\s\S]*?```/g, "") // Code blocks
            .replace(/^#+\s+/gm, "") // Headers
            .replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1") // Images
            .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1") // Links
            .replace(/\*\*(.+?)\*\*/g, "$1") // Bold
            .replace(/\*(.+?)\*/g, "$1") // Italic
            .replace(/__(.+?)__/g, "$1") // Bold (underscore)
            .replace(/_(.+?)_/g, "$1") // Italic (underscore)
            .replace(/`(.+?)`/g, "$1"); // Inline code
    }
    /**
     * Validate a chunker configuration without running it.
     *
     * @param config Configuration to check.
     * @returns `{ valid, errors, warnings }`; `valid` is true when `errors` is empty.
     */
    validateConfig(config) {
        const errors = [];
        const warnings = [];
        const mdConfig = config;
        if (mdConfig.maxSize !== undefined && mdConfig.maxSize <= 0) {
            errors.push("maxSize must be greater than 0");
        }
        if (mdConfig.headerLevels !== undefined) {
            if (mdConfig.headerLevels.length === 0) {
                errors.push("headerLevels must not be empty");
            }
            for (const level of mdConfig.headerLevels) {
                if (level < 1 || level > 6) {
                    errors.push(`Invalid header level: ${level}. Must be between 1 and 6`);
                }
            }
        }
        if (mdConfig.overlap !== undefined && mdConfig.overlap < 0) {
            errors.push("overlap must be non-negative");
        }
        if (mdConfig.overlap !== undefined &&
            mdConfig.maxSize !== undefined &&
            mdConfig.overlap >= mdConfig.maxSize) {
            errors.push("overlap must be less than maxSize");
        }
        return {
            valid: errors.length === 0,
            errors,
            warnings,
        };
    }
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Recursive Chunker
|
|
3
|
+
*
|
|
4
|
+
* Smart text splitting using hierarchical separators.
|
|
5
|
+
* Tries each separator in order, recursively splitting chunks that are too large.
|
|
6
|
+
* Best for general-purpose text that has natural boundaries.
|
|
7
|
+
*/
|
|
8
|
+
import type { Chunker, Chunk, ChunkerValidationResult, RecursiveChunkerConfig, BaseChunkerConfig } from "../types.js";
|
|
9
|
+
/**
|
|
10
|
+
* Recursive chunker implementation
|
|
11
|
+
* Smart splitting based on content structure using hierarchical separators
|
|
12
|
+
*/
|
|
13
|
+
export declare class RecursiveChunker implements Chunker {
    /** Strategy identifier; always the literal `"recursive"`. */
    readonly strategy: "recursive";
    /** Separator list used when the caller's config supplies none (private). */
    private readonly defaultSeparators;
    /**
     * Split text into chunks using the configured separators.
     *
     * @param text - Text to split.
     * @param config - Optional recursive chunking options.
     * @returns A promise resolving to the produced chunks.
     */
    chunk(text: string, config?: RecursiveChunkerConfig): Promise<Chunk[]>;
    /** Internal helper (private); not part of the public contract. */
    private recursiveSplit;
    /**
     * Check a configuration without chunking anything.
     *
     * @param config - Configuration to validate.
     * @returns The validation result.
     */
    validateConfig(config: BaseChunkerConfig): ChunkerValidationResult;
}
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Recursive Chunker
|
|
3
|
+
*
|
|
4
|
+
* Smart text splitting using hierarchical separators.
|
|
5
|
+
* Tries each separator in order, recursively splitting chunks that are too large.
|
|
6
|
+
* Best for general-purpose text that has natural boundaries.
|
|
7
|
+
*/
|
|
8
|
+
import { randomUUID } from "crypto";
|
|
9
|
+
/**
|
|
10
|
+
* Recursive chunker implementation
|
|
11
|
+
* Smart splitting based on content structure using hierarchical separators
|
|
12
|
+
*/
|
|
13
|
+
export class RecursiveChunker {
    /** Strategy identifier reported by this chunker. */
    strategy = "recursive";
    /** Separators tried in order, coarsest first; "" means split per character. */
    defaultSeparators = ["\n\n", "\n", ". ", " ", ""];
    /**
     * Split text into chunks using a hierarchy of separators.
     *
     * @param text   Text to split.
     * @param config Optional settings: `maxSize` (default 1000), `overlap` (200),
     *               `separators`, `isSeparatorRegex` (false), `trimWhitespace`
     *               (true), `metadata` (copied to `metadata.custom`).
     * @returns Promise resolving to the chunk list (empty for empty input).
     */
    async chunk(text, config) {
        const { maxSize = 1000, overlap = 200, separators = this.defaultSeparators, isSeparatorRegex = false, trimWhitespace = true, metadata = {}, } = config || {};
        const documentId = randomUUID();
        const chunks = [];
        if (!text || text.length === 0) {
            return chunks;
        }
        const splitTexts = this.recursiveSplit(text, separators, maxSize, overlap, isSeparatorRegex);
        let chunkIndex = 0;
        let currentPosition = 0;
        for (const splitText of splitTexts) {
            const chunkText = trimWhitespace ? splitText.trim() : splitText;
            if (chunkText.length === 0) {
                continue;
            }
            // Locate the untrimmed piece in the source, searching forward from
            // where the previous piece ended (minus the overlap).
            const found = text.indexOf(splitText, currentPosition);
            const startPosition = found >= 0 ? found : currentPosition;
            chunks.push({
                id: randomUUID(),
                text: chunkText,
                metadata: {
                    documentId,
                    chunkIndex,
                    startPosition,
                    endPosition: startPosition + splitText.length,
                    documentType: "text",
                    custom: metadata,
                },
            });
            chunkIndex++;
            if (found >= 0) {
                // Never let the search cursor go negative when overlap exceeds
                // the length of the first piece.
                currentPosition = Math.max(0, found + splitText.length - overlap);
            }
        }
        // Update total chunks count
        chunks.forEach((chunk) => {
            chunk.metadata.totalChunks = chunks.length;
        });
        return chunks;
    }
    /**
     * Recursively split text until every piece fits into `maxSize`.
     *
     * The first separator that occurs in the text is used to cut it; adjacent
     * pieces are merged back together while they fit. A piece that is still too
     * large is split again with the remaining separators. When no separator
     * matches (or none is left) the text is split per character, which always
     * terminates.
     *
     * @param text       Text to split.
     * @param separators Candidate separators, coarsest first.
     * @param maxSize    Maximum chunk length (values below 1 are treated as 1).
     * @param overlap    Characters of the previous chunk to repeat at the start of
     *                   the next one; reduced when the result would exceed `maxSize`.
     * @param isRegex    Treat separators as regular-expression sources.
     * @returns Pieces in document order.
     */
    recursiveSplit(text, separators, maxSize, overlap, isRegex) {
        const limit = maxSize >= 1 ? maxSize : 1;
        if (text.length <= limit) {
            return [text];
        }
        // Find the first separator that actually occurs in the text
        let chosen = -1;
        for (let i = 0; i < separators.length; i++) {
            const sep = separators[i];
            if (sep === "" || (isRegex ? new RegExp(sep).test(text) : text.includes(sep))) {
                chosen = i;
                break;
            }
        }
        // Nothing matches: fall back to per-character splitting. Re-using an
        // unmatched separator would return the text unsplit and recurse forever.
        const separator = chosen === -1 ? "" : separators[chosen];
        const remaining = chosen === -1 ? [] : separators.slice(chosen + 1);
        const useRegex = isRegex && chosen !== -1;
        // Split, remembering the exact delimiter text between pieces:
        // joiners[i] is what stood between pieces[i] and pieces[i + 1].
        const pieces = [];
        const joiners = [];
        if (useRegex) {
            let cursor = 0;
            for (const m of text.matchAll(new RegExp(separator, "g"))) {
                const at = m.index ?? 0;
                // Like String.prototype.split, ignore empty matches that would
                // produce an empty piece or fall on the end of the text.
                if (m[0].length === 0 && (at === cursor || at >= text.length)) {
                    continue;
                }
                pieces.push(text.slice(cursor, at));
                joiners.push(m[0]);
                cursor = at + m[0].length;
            }
            pieces.push(text.slice(cursor));
        }
        else {
            for (const part of text.split(separator)) {
                if (pieces.length > 0) {
                    joiners.push(separator);
                }
                pieces.push(part);
            }
        }
        // Merge pieces into chunks
        const results = [];
        let current = "";
        for (let i = 0; i < pieces.length; i++) {
            const piece = pieces[i];
            // Re-insert the delimiter text that was really there, not the regex source.
            const glue = i > 0 ? joiners[i - 1] : "";
            const candidate = current ? current + glue + piece : piece;
            if (candidate.length <= limit) {
                current = candidate;
                continue;
            }
            // Current chunk is ready
            if (current.length > 0) {
                results.push(current);
            }
            if (piece.length > limit) {
                // Still too large: split with the finer separators
                const subSplits = this.recursiveSplit(piece, remaining, limit, overlap, isRegex);
                results.push(...subSplits.slice(0, -1));
                current = subSplits[subSplits.length - 1] || "";
            }
            else {
                // Carry over the tail of the previous chunk, but only as much as
                // still fits next to the new piece.
                const room = limit - glue.length - piece.length;
                const take = Math.min(overlap, room);
                current = results.length > 0 && take > 0
                    ? results[results.length - 1].slice(-take) + glue + piece
                    : piece;
            }
        }
        // Don't forget the last chunk
        if (current.length > 0) {
            results.push(current);
        }
        return results;
    }
    /**
     * Validate a chunker configuration without running it.
     *
     * @param config Configuration to check.
     * @returns `{ valid, errors, warnings }`; `valid` is true when `errors` is empty.
     */
    validateConfig(config) {
        const errors = [];
        const warnings = [];
        const recConfig = config;
        if (recConfig.maxSize !== undefined && recConfig.maxSize <= 0) {
            errors.push("maxSize must be greater than 0");
        }
        if (recConfig.overlap !== undefined && recConfig.overlap < 0) {
            errors.push("overlap must be non-negative");
        }
        // Reported as a warning (not an error) so configurations accepted by
        // earlier versions stay valid; the splitter reduces the overlap itself.
        if (recConfig.overlap !== undefined &&
            recConfig.maxSize !== undefined &&
            recConfig.overlap >= recConfig.maxSize) {
            warnings.push("overlap should be less than maxSize; it will be reduced to fit");
        }
        if (recConfig.separators !== undefined &&
            recConfig.separators.length === 0) {
            errors.push("separators array must not be empty");
        }
        if (recConfig.isSeparatorRegex && recConfig.separators) {
            for (const sep of recConfig.separators) {
                try {
                    new RegExp(sep);
                }
                catch {
                    errors.push(`Invalid regex separator: ${sep}`);
                }
            }
        }
        return {
            valid: errors.length === 0,
            errors,
            warnings,
        };
    }
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Semantic Chunker
|
|
3
|
+
*
|
|
4
|
+
* LLM-powered semantic chunking that groups related content together.
|
|
5
|
+
* Uses embedding similarity to determine natural breakpoints.
|
|
6
|
+
* Best for complex documents where meaning should drive segmentation.
|
|
7
|
+
*/
|
|
8
|
+
import type { BaseChunkerConfig, Chunk, Chunker, ChunkerValidationResult, SemanticChunkerConfig } from "../types.js";
|
|
9
|
+
/**
|
|
10
|
+
* Semantic chunker implementation
|
|
11
|
+
* Uses embedding similarity to find natural content boundaries
|
|
12
|
+
*/
|
|
13
|
+
export declare class SemanticChunker implements Chunker {
    /** Strategy identifier; always the literal `"semantic"`. */
    readonly strategy: "semantic";
    /**
     * Split text into chunks.
     *
     * @param text - Text to split.
     * @param config - Optional semantic chunking options.
     * @returns A promise resolving to the produced chunks.
     */
    chunk(text: string, config?: SemanticChunkerConfig): Promise<Chunk[]>;
    /**
     * Split text into initial segments for embedding
     */
    private splitIntoSegments;
    /**
     * Get embeddings for segments
     */
    private getEmbeddings;
    /**
     * Find semantic breakpoints using cosine similarity
     */
    private findSemanticBreakpoints;
    /**
     * Group segments based on breakpoints and size limits
     */
    private groupSegments;
    /**
     * Calculate cosine similarity between two vectors
     */
    private cosineSimilarity;
    /**
     * Fallback to simple chunking when embeddings fail
     */
    private fallbackChunk;
    /**
     * Check a configuration without chunking anything.
     *
     * @param config - Configuration to validate.
     * @returns The validation result.
     */
    validateConfig(config: BaseChunkerConfig): ChunkerValidationResult;
}
|