@juspay/neurolink 9.1.1 → 9.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/README.md +106 -37
- package/dist/agent/directTools.d.ts +11 -11
- package/dist/cli/commands/config.d.ts +6 -6
- package/dist/cli/commands/rag.d.ts +19 -0
- package/dist/cli/commands/rag.js +756 -0
- package/dist/cli/factories/commandFactory.js +146 -83
- package/dist/cli/parser.js +4 -1
- package/dist/core/baseProvider.d.ts +43 -30
- package/dist/core/baseProvider.js +98 -138
- package/dist/core/conversationMemoryFactory.d.ts +2 -2
- package/dist/core/conversationMemoryFactory.js +2 -2
- package/dist/core/conversationMemoryInitializer.d.ts +1 -2
- package/dist/core/conversationMemoryInitializer.js +2 -2
- package/dist/core/infrastructure/baseError.d.ts +21 -0
- package/dist/core/infrastructure/baseError.js +22 -0
- package/dist/core/infrastructure/baseFactory.d.ts +21 -0
- package/dist/core/infrastructure/baseFactory.js +54 -0
- package/dist/core/infrastructure/baseRegistry.d.ts +21 -0
- package/dist/core/infrastructure/baseRegistry.js +49 -0
- package/dist/core/infrastructure/index.d.ts +5 -0
- package/dist/core/infrastructure/index.js +5 -0
- package/dist/core/infrastructure/retry.d.ts +7 -0
- package/dist/core/infrastructure/retry.js +20 -0
- package/dist/core/infrastructure/typedEventEmitter.d.ts +8 -0
- package/dist/core/infrastructure/typedEventEmitter.js +23 -0
- package/dist/core/redisConversationMemoryManager.d.ts +1 -6
- package/dist/core/redisConversationMemoryManager.js +7 -19
- package/dist/factories/providerFactory.d.ts +5 -3
- package/dist/factories/providerFactory.js +31 -24
- package/dist/image-gen/ImageGenService.d.ts +143 -0
- package/dist/image-gen/ImageGenService.js +345 -0
- package/dist/image-gen/imageGenTools.d.ts +126 -0
- package/dist/image-gen/imageGenTools.js +304 -0
- package/dist/image-gen/index.d.ts +46 -0
- package/dist/image-gen/index.js +48 -0
- package/dist/image-gen/types.d.ts +237 -0
- package/dist/image-gen/types.js +24 -0
- package/dist/index.d.ts +46 -12
- package/dist/index.js +88 -36
- package/dist/lib/agent/directTools.d.ts +8 -8
- package/dist/lib/core/baseProvider.d.ts +43 -30
- package/dist/lib/core/baseProvider.js +98 -138
- package/dist/lib/core/conversationMemoryFactory.d.ts +2 -2
- package/dist/lib/core/conversationMemoryFactory.js +2 -2
- package/dist/lib/core/conversationMemoryInitializer.d.ts +1 -2
- package/dist/lib/core/conversationMemoryInitializer.js +2 -2
- package/dist/lib/core/infrastructure/baseError.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseError.js +23 -0
- package/dist/lib/core/infrastructure/baseFactory.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseFactory.js +55 -0
- package/dist/lib/core/infrastructure/baseRegistry.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseRegistry.js +50 -0
- package/dist/lib/core/infrastructure/index.d.ts +5 -0
- package/dist/lib/core/infrastructure/index.js +6 -0
- package/dist/lib/core/infrastructure/retry.d.ts +7 -0
- package/dist/lib/core/infrastructure/retry.js +21 -0
- package/dist/lib/core/infrastructure/typedEventEmitter.d.ts +8 -0
- package/dist/lib/core/infrastructure/typedEventEmitter.js +24 -0
- package/dist/lib/core/redisConversationMemoryManager.d.ts +1 -6
- package/dist/lib/core/redisConversationMemoryManager.js +7 -19
- package/dist/lib/factories/providerFactory.d.ts +5 -3
- package/dist/lib/factories/providerFactory.js +31 -24
- package/dist/lib/image-gen/ImageGenService.d.ts +143 -0
- package/dist/lib/image-gen/ImageGenService.js +346 -0
- package/dist/lib/image-gen/imageGenTools.d.ts +126 -0
- package/dist/lib/image-gen/imageGenTools.js +305 -0
- package/dist/lib/image-gen/index.d.ts +46 -0
- package/dist/lib/image-gen/index.js +49 -0
- package/dist/lib/image-gen/types.d.ts +237 -0
- package/dist/lib/image-gen/types.js +25 -0
- package/dist/lib/index.d.ts +46 -12
- package/dist/lib/index.js +88 -36
- package/dist/lib/mcp/index.d.ts +6 -5
- package/dist/lib/mcp/index.js +7 -5
- package/dist/lib/neurolink.d.ts +11 -13
- package/dist/lib/neurolink.js +95 -29
- package/dist/lib/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/lib/processors/base/BaseFileProcessor.js +614 -0
- package/dist/lib/processors/base/index.d.ts +14 -0
- package/dist/lib/processors/base/index.js +20 -0
- package/dist/lib/processors/base/types.d.ts +593 -0
- package/dist/lib/processors/base/types.js +77 -0
- package/dist/lib/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/lib/processors/cli/fileProcessorCli.js +389 -0
- package/dist/lib/processors/cli/index.d.ts +37 -0
- package/dist/lib/processors/cli/index.js +50 -0
- package/dist/lib/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/lib/processors/code/ConfigProcessor.js +401 -0
- package/dist/lib/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/lib/processors/code/SourceCodeProcessor.js +305 -0
- package/dist/lib/processors/code/index.d.ts +44 -0
- package/dist/lib/processors/code/index.js +61 -0
- package/dist/lib/processors/config/fileTypes.d.ts +283 -0
- package/dist/lib/processors/config/fileTypes.js +521 -0
- package/dist/lib/processors/config/index.d.ts +32 -0
- package/dist/lib/processors/config/index.js +93 -0
- package/dist/lib/processors/config/languageMap.d.ts +66 -0
- package/dist/lib/processors/config/languageMap.js +411 -0
- package/dist/lib/processors/config/mimeTypes.d.ts +376 -0
- package/dist/lib/processors/config/mimeTypes.js +339 -0
- package/dist/lib/processors/config/sizeLimits.d.ts +194 -0
- package/dist/lib/processors/config/sizeLimits.js +247 -0
- package/dist/lib/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/lib/processors/data/JsonProcessor.js +204 -0
- package/dist/lib/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/lib/processors/data/XmlProcessor.js +284 -0
- package/dist/lib/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/lib/processors/data/YamlProcessor.js +295 -0
- package/dist/lib/processors/data/index.d.ts +49 -0
- package/dist/lib/processors/data/index.js +77 -0
- package/dist/lib/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/lib/processors/document/ExcelProcessor.js +520 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.js +211 -0
- package/dist/lib/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/lib/processors/document/RtfProcessor.js +362 -0
- package/dist/lib/processors/document/WordProcessor.d.ts +168 -0
- package/dist/lib/processors/document/WordProcessor.js +354 -0
- package/dist/lib/processors/document/index.d.ts +54 -0
- package/dist/lib/processors/document/index.js +91 -0
- package/dist/lib/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/lib/processors/errors/FileErrorCode.js +256 -0
- package/dist/lib/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/lib/processors/errors/errorHelpers.js +379 -0
- package/dist/lib/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/lib/processors/errors/errorSerializer.js +508 -0
- package/dist/lib/processors/errors/index.d.ts +46 -0
- package/dist/lib/processors/errors/index.js +50 -0
- package/dist/lib/processors/index.d.ts +76 -0
- package/dist/lib/processors/index.js +113 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.js +273 -0
- package/dist/lib/processors/integration/index.d.ts +42 -0
- package/dist/lib/processors/integration/index.js +45 -0
- package/dist/lib/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/lib/processors/markup/HtmlProcessor.js +250 -0
- package/dist/lib/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/lib/processors/markup/MarkdownProcessor.js +245 -0
- package/dist/lib/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/lib/processors/markup/SvgProcessor.js +241 -0
- package/dist/lib/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/lib/processors/markup/TextProcessor.js +189 -0
- package/dist/lib/processors/markup/index.d.ts +66 -0
- package/dist/lib/processors/markup/index.js +103 -0
- package/dist/lib/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/lib/processors/registry/ProcessorRegistry.js +609 -0
- package/dist/lib/processors/registry/index.d.ts +12 -0
- package/dist/lib/processors/registry/index.js +17 -0
- package/dist/lib/processors/registry/types.d.ts +53 -0
- package/dist/lib/processors/registry/types.js +11 -0
- package/dist/lib/providers/amazonBedrock.d.ts +15 -2
- package/dist/lib/providers/amazonBedrock.js +65 -8
- package/dist/lib/providers/anthropic.d.ts +3 -3
- package/dist/lib/providers/anthropic.js +10 -7
- package/dist/lib/providers/googleAiStudio.d.ts +5 -5
- package/dist/lib/providers/googleAiStudio.js +10 -7
- package/dist/lib/providers/googleVertex.d.ts +16 -4
- package/dist/lib/providers/googleVertex.js +72 -16
- package/dist/lib/providers/litellm.d.ts +3 -3
- package/dist/lib/providers/litellm.js +10 -10
- package/dist/lib/providers/mistral.d.ts +3 -3
- package/dist/lib/providers/mistral.js +7 -6
- package/dist/lib/providers/ollama.d.ts +3 -4
- package/dist/lib/providers/ollama.js +7 -8
- package/dist/lib/providers/openAI.d.ts +14 -2
- package/dist/lib/providers/openAI.js +60 -6
- package/dist/lib/providers/openRouter.d.ts +2 -2
- package/dist/lib/providers/openRouter.js +10 -6
- package/dist/lib/providers/sagemaker/language-model.d.ts +2 -2
- package/dist/lib/rag/ChunkerFactory.d.ts +91 -0
- package/dist/lib/rag/ChunkerFactory.js +321 -0
- package/dist/lib/rag/ChunkerRegistry.d.ts +91 -0
- package/dist/lib/rag/ChunkerRegistry.js +422 -0
- package/dist/lib/rag/chunkers/BaseChunker.d.ts +53 -0
- package/dist/lib/rag/chunkers/BaseChunker.js +144 -0
- package/dist/lib/rag/chunkers/CharacterChunker.d.ts +18 -0
- package/dist/lib/rag/chunkers/CharacterChunker.js +29 -0
- package/dist/lib/rag/chunkers/HTMLChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/HTMLChunker.js +39 -0
- package/dist/lib/rag/chunkers/JSONChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/JSONChunker.js +69 -0
- package/dist/lib/rag/chunkers/LaTeXChunker.d.ts +15 -0
- package/dist/lib/rag/chunkers/LaTeXChunker.js +64 -0
- package/dist/lib/rag/chunkers/MarkdownChunker.d.ts +15 -0
- package/dist/lib/rag/chunkers/MarkdownChunker.js +103 -0
- package/dist/lib/rag/chunkers/RecursiveChunker.d.ts +27 -0
- package/dist/lib/rag/chunkers/RecursiveChunker.js +140 -0
- package/dist/lib/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
- package/dist/lib/rag/chunkers/SemanticMarkdownChunker.js +139 -0
- package/dist/lib/rag/chunkers/SentenceChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/SentenceChunker.js +67 -0
- package/dist/lib/rag/chunkers/TokenChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/TokenChunker.js +62 -0
- package/dist/lib/rag/chunkers/index.d.ts +15 -0
- package/dist/lib/rag/chunkers/index.js +16 -0
- package/dist/lib/rag/chunking/characterChunker.d.ts +16 -0
- package/dist/lib/rag/chunking/characterChunker.js +143 -0
- package/dist/lib/rag/chunking/chunkerRegistry.d.ts +67 -0
- package/dist/lib/rag/chunking/chunkerRegistry.js +195 -0
- package/dist/lib/rag/chunking/htmlChunker.d.ts +34 -0
- package/dist/lib/rag/chunking/htmlChunker.js +248 -0
- package/dist/lib/rag/chunking/index.d.ts +15 -0
- package/dist/lib/rag/chunking/index.js +18 -0
- package/dist/lib/rag/chunking/jsonChunker.d.ts +20 -0
- package/dist/lib/rag/chunking/jsonChunker.js +282 -0
- package/dist/lib/rag/chunking/latexChunker.d.ts +26 -0
- package/dist/lib/rag/chunking/latexChunker.js +252 -0
- package/dist/lib/rag/chunking/markdownChunker.d.ts +19 -0
- package/dist/lib/rag/chunking/markdownChunker.js +202 -0
- package/dist/lib/rag/chunking/recursiveChunker.d.ts +19 -0
- package/dist/lib/rag/chunking/recursiveChunker.js +149 -0
- package/dist/lib/rag/chunking/semanticChunker.d.ts +41 -0
- package/dist/lib/rag/chunking/semanticChunker.js +307 -0
- package/dist/lib/rag/chunking/sentenceChunker.d.ts +25 -0
- package/dist/lib/rag/chunking/sentenceChunker.js +231 -0
- package/dist/lib/rag/chunking/tokenChunker.d.ts +36 -0
- package/dist/lib/rag/chunking/tokenChunker.js +184 -0
- package/dist/lib/rag/document/MDocument.d.ts +198 -0
- package/dist/lib/rag/document/MDocument.js +393 -0
- package/dist/lib/rag/document/index.d.ts +5 -0
- package/dist/lib/rag/document/index.js +6 -0
- package/dist/lib/rag/document/loaders.d.ts +201 -0
- package/dist/lib/rag/document/loaders.js +501 -0
- package/dist/lib/rag/errors/RAGError.d.ts +244 -0
- package/dist/lib/rag/errors/RAGError.js +275 -0
- package/dist/lib/rag/errors/index.d.ts +6 -0
- package/dist/lib/rag/errors/index.js +7 -0
- package/dist/lib/rag/graphRag/graphRAG.d.ts +115 -0
- package/dist/lib/rag/graphRag/graphRAG.js +385 -0
- package/dist/lib/rag/graphRag/index.d.ts +4 -0
- package/dist/lib/rag/graphRag/index.js +5 -0
- package/dist/lib/rag/index.d.ts +103 -0
- package/dist/lib/rag/index.js +142 -0
- package/dist/lib/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
- package/dist/lib/rag/metadata/MetadataExtractorFactory.js +419 -0
- package/dist/lib/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
- package/dist/lib/rag/metadata/MetadataExtractorRegistry.js +363 -0
- package/dist/lib/rag/metadata/index.d.ts +6 -0
- package/dist/lib/rag/metadata/index.js +10 -0
- package/dist/lib/rag/metadata/metadataExtractor.d.ts +69 -0
- package/dist/lib/rag/metadata/metadataExtractor.js +278 -0
- package/dist/lib/rag/pipeline/RAGPipeline.d.ts +235 -0
- package/dist/lib/rag/pipeline/RAGPipeline.js +402 -0
- package/dist/lib/rag/pipeline/contextAssembly.d.ts +126 -0
- package/dist/lib/rag/pipeline/contextAssembly.js +338 -0
- package/dist/lib/rag/pipeline/index.d.ts +5 -0
- package/dist/lib/rag/pipeline/index.js +6 -0
- package/dist/lib/rag/ragIntegration.d.ts +38 -0
- package/dist/lib/rag/ragIntegration.js +212 -0
- package/dist/lib/rag/reranker/RerankerFactory.d.ts +184 -0
- package/dist/lib/rag/reranker/RerankerFactory.js +431 -0
- package/dist/lib/rag/reranker/RerankerRegistry.d.ts +119 -0
- package/dist/lib/rag/reranker/RerankerRegistry.js +403 -0
- package/dist/lib/rag/reranker/index.d.ts +6 -0
- package/dist/lib/rag/reranker/index.js +10 -0
- package/dist/lib/rag/reranker/reranker.d.ts +71 -0
- package/dist/lib/rag/reranker/reranker.js +278 -0
- package/dist/lib/rag/resilience/CircuitBreaker.d.ts +215 -0
- package/dist/lib/rag/resilience/CircuitBreaker.js +432 -0
- package/dist/lib/rag/resilience/RetryHandler.d.ts +115 -0
- package/dist/lib/rag/resilience/RetryHandler.js +301 -0
- package/dist/lib/rag/resilience/index.d.ts +7 -0
- package/dist/lib/rag/resilience/index.js +8 -0
- package/dist/lib/rag/retrieval/hybridSearch.d.ts +94 -0
- package/dist/lib/rag/retrieval/hybridSearch.js +314 -0
- package/dist/lib/rag/retrieval/index.d.ts +5 -0
- package/dist/lib/rag/retrieval/index.js +6 -0
- package/dist/lib/rag/retrieval/vectorQueryTool.d.ts +93 -0
- package/dist/lib/rag/retrieval/vectorQueryTool.js +290 -0
- package/dist/lib/rag/types.d.ts +768 -0
- package/dist/lib/rag/types.js +9 -0
- package/dist/lib/server/index.d.ts +15 -11
- package/dist/lib/server/index.js +55 -51
- package/dist/lib/server/utils/validation.d.ts +2 -2
- package/dist/lib/types/common.d.ts +0 -1
- package/dist/lib/types/fileTypes.d.ts +1 -1
- package/dist/lib/types/generateTypes.d.ts +42 -8
- package/dist/lib/types/generateTypes.js +1 -1
- package/dist/lib/types/index.d.ts +25 -24
- package/dist/lib/types/index.js +21 -20
- package/dist/lib/types/modelTypes.d.ts +16 -16
- package/dist/lib/types/pptTypes.d.ts +14 -2
- package/dist/lib/types/pptTypes.js +16 -0
- package/dist/lib/types/streamTypes.d.ts +28 -8
- package/dist/lib/types/streamTypes.js +1 -1
- package/dist/lib/utils/async/delay.d.ts +40 -0
- package/dist/lib/utils/async/delay.js +43 -0
- package/dist/lib/utils/async/index.d.ts +23 -0
- package/dist/lib/utils/async/index.js +24 -0
- package/dist/lib/utils/async/retry.d.ts +141 -0
- package/dist/lib/utils/async/retry.js +172 -0
- package/dist/lib/utils/async/withTimeout.d.ts +73 -0
- package/dist/lib/utils/async/withTimeout.js +97 -0
- package/dist/lib/utils/fileDetector.d.ts +7 -1
- package/dist/lib/utils/fileDetector.js +91 -18
- package/dist/lib/utils/json/extract.d.ts +103 -0
- package/dist/lib/utils/json/extract.js +249 -0
- package/dist/lib/utils/json/index.d.ts +36 -0
- package/dist/lib/utils/json/index.js +37 -0
- package/dist/lib/utils/json/safeParse.d.ts +137 -0
- package/dist/lib/utils/json/safeParse.js +191 -0
- package/dist/lib/utils/messageBuilder.d.ts +2 -2
- package/dist/lib/utils/messageBuilder.js +15 -7
- package/dist/lib/utils/modelRouter.d.ts +4 -4
- package/dist/lib/utils/modelRouter.js +4 -4
- package/dist/lib/utils/sanitizers/filename.d.ts +137 -0
- package/dist/lib/utils/sanitizers/filename.js +366 -0
- package/dist/lib/utils/sanitizers/html.d.ts +170 -0
- package/dist/lib/utils/sanitizers/html.js +326 -0
- package/dist/lib/utils/sanitizers/index.d.ts +26 -0
- package/dist/lib/utils/sanitizers/index.js +30 -0
- package/dist/lib/utils/sanitizers/svg.d.ts +81 -0
- package/dist/lib/utils/sanitizers/svg.js +483 -0
- package/dist/mcp/index.d.ts +6 -5
- package/dist/mcp/index.js +7 -5
- package/dist/neurolink.d.ts +11 -13
- package/dist/neurolink.js +95 -29
- package/dist/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/processors/base/BaseFileProcessor.js +613 -0
- package/dist/processors/base/index.d.ts +14 -0
- package/dist/processors/base/index.js +19 -0
- package/dist/processors/base/types.d.ts +593 -0
- package/dist/processors/base/types.js +76 -0
- package/dist/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/processors/cli/fileProcessorCli.js +388 -0
- package/dist/processors/cli/index.d.ts +37 -0
- package/dist/processors/cli/index.js +49 -0
- package/dist/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/processors/code/ConfigProcessor.js +400 -0
- package/dist/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/processors/code/SourceCodeProcessor.js +304 -0
- package/dist/processors/code/index.d.ts +44 -0
- package/dist/processors/code/index.js +60 -0
- package/dist/processors/config/fileTypes.d.ts +283 -0
- package/dist/processors/config/fileTypes.js +520 -0
- package/dist/processors/config/index.d.ts +32 -0
- package/dist/processors/config/index.js +92 -0
- package/dist/processors/config/languageMap.d.ts +66 -0
- package/dist/processors/config/languageMap.js +410 -0
- package/dist/processors/config/mimeTypes.d.ts +376 -0
- package/dist/processors/config/mimeTypes.js +338 -0
- package/dist/processors/config/sizeLimits.d.ts +194 -0
- package/dist/processors/config/sizeLimits.js +246 -0
- package/dist/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/processors/data/JsonProcessor.js +203 -0
- package/dist/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/processors/data/XmlProcessor.js +283 -0
- package/dist/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/processors/data/YamlProcessor.js +294 -0
- package/dist/processors/data/index.d.ts +49 -0
- package/dist/processors/data/index.js +76 -0
- package/dist/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/processors/document/ExcelProcessor.js +519 -0
- package/dist/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/processors/document/OpenDocumentProcessor.js +210 -0
- package/dist/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/processors/document/RtfProcessor.js +361 -0
- package/dist/processors/document/WordProcessor.d.ts +168 -0
- package/dist/processors/document/WordProcessor.js +353 -0
- package/dist/processors/document/index.d.ts +54 -0
- package/dist/processors/document/index.js +90 -0
- package/dist/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/processors/errors/FileErrorCode.js +255 -0
- package/dist/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/processors/errors/errorHelpers.js +378 -0
- package/dist/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/processors/errors/errorSerializer.js +507 -0
- package/dist/processors/errors/index.d.ts +46 -0
- package/dist/processors/errors/index.js +49 -0
- package/dist/processors/index.d.ts +76 -0
- package/dist/processors/index.js +112 -0
- package/dist/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/processors/integration/FileProcessorIntegration.js +272 -0
- package/dist/processors/integration/index.d.ts +42 -0
- package/dist/processors/integration/index.js +44 -0
- package/dist/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/processors/markup/HtmlProcessor.js +249 -0
- package/dist/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/processors/markup/MarkdownProcessor.js +244 -0
- package/dist/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/processors/markup/SvgProcessor.js +240 -0
- package/dist/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/processors/markup/TextProcessor.js +188 -0
- package/dist/processors/markup/index.d.ts +66 -0
- package/dist/processors/markup/index.js +102 -0
- package/dist/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/processors/registry/ProcessorRegistry.js +608 -0
- package/dist/processors/registry/index.d.ts +12 -0
- package/dist/processors/registry/index.js +16 -0
- package/dist/processors/registry/types.d.ts +53 -0
- package/dist/processors/registry/types.js +10 -0
- package/dist/providers/amazonBedrock.d.ts +15 -2
- package/dist/providers/amazonBedrock.js +65 -8
- package/dist/providers/anthropic.d.ts +3 -3
- package/dist/providers/anthropic.js +10 -7
- package/dist/providers/googleAiStudio.d.ts +5 -5
- package/dist/providers/googleAiStudio.js +10 -7
- package/dist/providers/googleVertex.d.ts +16 -4
- package/dist/providers/googleVertex.js +72 -16
- package/dist/providers/litellm.d.ts +3 -3
- package/dist/providers/litellm.js +10 -10
- package/dist/providers/mistral.d.ts +3 -3
- package/dist/providers/mistral.js +7 -6
- package/dist/providers/ollama.d.ts +3 -4
- package/dist/providers/ollama.js +7 -8
- package/dist/providers/openAI.d.ts +14 -2
- package/dist/providers/openAI.js +60 -6
- package/dist/providers/openRouter.d.ts +2 -2
- package/dist/providers/openRouter.js +10 -6
- package/dist/rag/ChunkerFactory.d.ts +91 -0
- package/dist/rag/ChunkerFactory.js +320 -0
- package/dist/rag/ChunkerRegistry.d.ts +91 -0
- package/dist/rag/ChunkerRegistry.js +421 -0
- package/dist/rag/chunkers/BaseChunker.d.ts +53 -0
- package/dist/rag/chunkers/BaseChunker.js +143 -0
- package/dist/rag/chunkers/CharacterChunker.d.ts +18 -0
- package/dist/rag/chunkers/CharacterChunker.js +28 -0
- package/dist/rag/chunkers/HTMLChunker.d.ts +19 -0
- package/dist/rag/chunkers/HTMLChunker.js +38 -0
- package/dist/rag/chunkers/JSONChunker.d.ts +19 -0
- package/dist/rag/chunkers/JSONChunker.js +68 -0
- package/dist/rag/chunkers/LaTeXChunker.d.ts +15 -0
- package/dist/rag/chunkers/LaTeXChunker.js +63 -0
- package/dist/rag/chunkers/MarkdownChunker.d.ts +15 -0
- package/dist/rag/chunkers/MarkdownChunker.js +102 -0
- package/dist/rag/chunkers/RecursiveChunker.d.ts +27 -0
- package/dist/rag/chunkers/RecursiveChunker.js +139 -0
- package/dist/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
- package/dist/rag/chunkers/SemanticMarkdownChunker.js +138 -0
- package/dist/rag/chunkers/SentenceChunker.d.ts +19 -0
- package/dist/rag/chunkers/SentenceChunker.js +66 -0
- package/dist/rag/chunkers/TokenChunker.d.ts +19 -0
- package/dist/rag/chunkers/TokenChunker.js +61 -0
- package/dist/rag/chunkers/index.d.ts +15 -0
- package/dist/rag/chunkers/index.js +15 -0
- package/dist/rag/chunking/characterChunker.d.ts +16 -0
- package/dist/rag/chunking/characterChunker.js +142 -0
- package/dist/rag/chunking/chunkerRegistry.d.ts +67 -0
- package/dist/rag/chunking/chunkerRegistry.js +194 -0
- package/dist/rag/chunking/htmlChunker.d.ts +34 -0
- package/dist/rag/chunking/htmlChunker.js +247 -0
- package/dist/rag/chunking/index.d.ts +15 -0
- package/dist/rag/chunking/index.js +17 -0
- package/dist/rag/chunking/jsonChunker.d.ts +20 -0
- package/dist/rag/chunking/jsonChunker.js +281 -0
- package/dist/rag/chunking/latexChunker.d.ts +26 -0
- package/dist/rag/chunking/latexChunker.js +251 -0
- package/dist/rag/chunking/markdownChunker.d.ts +19 -0
- package/dist/rag/chunking/markdownChunker.js +201 -0
- package/dist/rag/chunking/recursiveChunker.d.ts +19 -0
- package/dist/rag/chunking/recursiveChunker.js +148 -0
- package/dist/rag/chunking/semanticChunker.d.ts +41 -0
- package/dist/rag/chunking/semanticChunker.js +306 -0
- package/dist/rag/chunking/sentenceChunker.d.ts +25 -0
- package/dist/rag/chunking/sentenceChunker.js +230 -0
- package/dist/rag/chunking/tokenChunker.d.ts +36 -0
- package/dist/rag/chunking/tokenChunker.js +183 -0
- package/dist/rag/document/MDocument.d.ts +198 -0
- package/dist/rag/document/MDocument.js +392 -0
- package/dist/rag/document/index.d.ts +5 -0
- package/dist/rag/document/index.js +5 -0
- package/dist/rag/document/loaders.d.ts +201 -0
- package/dist/rag/document/loaders.js +500 -0
- package/dist/rag/errors/RAGError.d.ts +244 -0
- package/dist/rag/errors/RAGError.js +274 -0
- package/dist/rag/errors/index.d.ts +6 -0
- package/dist/rag/errors/index.js +6 -0
- package/dist/rag/graphRag/graphRAG.d.ts +115 -0
- package/dist/rag/graphRag/graphRAG.js +384 -0
- package/dist/rag/graphRag/index.d.ts +4 -0
- package/dist/rag/graphRag/index.js +4 -0
- package/dist/rag/index.d.ts +103 -0
- package/dist/rag/index.js +141 -0
- package/dist/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
- package/dist/rag/metadata/MetadataExtractorFactory.js +418 -0
- package/dist/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
- package/dist/rag/metadata/MetadataExtractorRegistry.js +362 -0
- package/dist/rag/metadata/index.d.ts +6 -0
- package/dist/rag/metadata/index.js +9 -0
- package/dist/rag/metadata/metadataExtractor.d.ts +69 -0
- package/dist/rag/metadata/metadataExtractor.js +277 -0
- package/dist/rag/pipeline/RAGPipeline.d.ts +235 -0
- package/dist/rag/pipeline/RAGPipeline.js +401 -0
- package/dist/rag/pipeline/contextAssembly.d.ts +126 -0
- package/dist/rag/pipeline/contextAssembly.js +337 -0
- package/dist/rag/pipeline/index.d.ts +5 -0
- package/dist/rag/pipeline/index.js +5 -0
- package/dist/rag/ragIntegration.d.ts +38 -0
- package/dist/rag/ragIntegration.js +211 -0
- package/dist/rag/reranker/RerankerFactory.d.ts +184 -0
- package/dist/rag/reranker/RerankerFactory.js +430 -0
- package/dist/rag/reranker/RerankerRegistry.d.ts +119 -0
- package/dist/rag/reranker/RerankerRegistry.js +402 -0
- package/dist/rag/reranker/index.d.ts +6 -0
- package/dist/rag/reranker/index.js +9 -0
- package/dist/rag/reranker/reranker.d.ts +71 -0
- package/dist/rag/reranker/reranker.js +277 -0
- package/dist/rag/resilience/CircuitBreaker.d.ts +215 -0
- package/dist/rag/resilience/CircuitBreaker.js +431 -0
- package/dist/rag/resilience/RetryHandler.d.ts +115 -0
- package/dist/rag/resilience/RetryHandler.js +300 -0
- package/dist/rag/resilience/index.d.ts +7 -0
- package/dist/rag/resilience/index.js +7 -0
- package/dist/rag/retrieval/hybridSearch.d.ts +94 -0
- package/dist/rag/retrieval/hybridSearch.js +313 -0
- package/dist/rag/retrieval/index.d.ts +5 -0
- package/dist/rag/retrieval/index.js +5 -0
- package/dist/rag/retrieval/vectorQueryTool.d.ts +93 -0
- package/dist/rag/retrieval/vectorQueryTool.js +289 -0
- package/dist/rag/types.d.ts +768 -0
- package/dist/rag/types.js +8 -0
- package/dist/server/index.d.ts +15 -11
- package/dist/server/index.js +55 -51
- package/dist/server/utils/validation.d.ts +8 -8
- package/dist/types/common.d.ts +0 -1
- package/dist/types/fileTypes.d.ts +1 -1
- package/dist/types/generateTypes.d.ts +42 -8
- package/dist/types/generateTypes.js +1 -1
- package/dist/types/index.d.ts +25 -24
- package/dist/types/index.js +21 -20
- package/dist/types/modelTypes.d.ts +10 -10
- package/dist/types/pptTypes.d.ts +14 -2
- package/dist/types/pptTypes.js +16 -0
- package/dist/types/streamTypes.d.ts +28 -8
- package/dist/types/streamTypes.js +1 -1
- package/dist/utils/async/delay.d.ts +40 -0
- package/dist/utils/async/delay.js +42 -0
- package/dist/utils/async/index.d.ts +23 -0
- package/dist/utils/async/index.js +23 -0
- package/dist/utils/async/retry.d.ts +141 -0
- package/dist/utils/async/retry.js +171 -0
- package/dist/utils/async/withTimeout.d.ts +73 -0
- package/dist/utils/async/withTimeout.js +96 -0
- package/dist/utils/fileDetector.d.ts +7 -1
- package/dist/utils/fileDetector.js +91 -18
- package/dist/utils/json/extract.d.ts +103 -0
- package/dist/utils/json/extract.js +248 -0
- package/dist/utils/json/index.d.ts +36 -0
- package/dist/utils/json/index.js +36 -0
- package/dist/utils/json/safeParse.d.ts +137 -0
- package/dist/utils/json/safeParse.js +190 -0
- package/dist/utils/messageBuilder.d.ts +2 -2
- package/dist/utils/messageBuilder.js +15 -7
- package/dist/utils/modelRouter.d.ts +4 -4
- package/dist/utils/modelRouter.js +4 -4
- package/dist/utils/sanitizers/filename.d.ts +137 -0
- package/dist/utils/sanitizers/filename.js +365 -0
- package/dist/utils/sanitizers/html.d.ts +170 -0
- package/dist/utils/sanitizers/html.js +325 -0
- package/dist/utils/sanitizers/index.d.ts +26 -0
- package/dist/utils/sanitizers/index.js +29 -0
- package/dist/utils/sanitizers/svg.d.ts +81 -0
- package/dist/utils/sanitizers/svg.js +482 -0
- package/package.json +2 -2
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML-aware Chunker
|
|
3
|
+
*
|
|
4
|
+
* Splits HTML documents based on tag structure while preserving semantics.
|
|
5
|
+
* Best for web pages, email templates, and structured HTML content.
|
|
6
|
+
*/
|
|
7
|
+
import { randomUUID } from "crypto";
|
|
8
|
+
/**
|
|
9
|
+
* HTML-aware chunker implementation
|
|
10
|
+
* Splits based on HTML structure (tags, elements)
|
|
11
|
+
*/
|
|
12
|
+
export class HTMLChunker {
|
|
13
|
+
/** Strategy identifier carried by this chunker instance. */
strategy = "html";
/** Tag names `chunk` falls back to for `splitTags` when the caller's config supplies none. */
defaultSplitTags = [
    "div", "p", "section", "article", "main", "aside", "header",
    "footer", "nav", "li", "tr", "td", "th",
];
/** Tag names `chunk` falls back to for `preserveTags` when the caller's config supplies none. */
defaultPreserveTags = ["pre", "code", "table", "ul", "ol", "blockquote"];
|
|
37
|
+
async chunk(text, config) {
|
|
38
|
+
const { maxSize = 1000, overlap = 0, splitTags = this.defaultSplitTags, preserveTags = this.defaultPreserveTags, extractTextOnly = false, includeTagMetadata = true, trimWhitespace = true, metadata = {}, } = config || {};
|
|
39
|
+
const documentId = randomUUID();
|
|
40
|
+
const chunks = [];
|
|
41
|
+
if (!text || text.length === 0) {
|
|
42
|
+
return chunks;
|
|
43
|
+
}
|
|
44
|
+
// Extract and split by structural tags
|
|
45
|
+
const sections = this.splitByTags(text, splitTags, preserveTags);
|
|
46
|
+
let chunkIndex = 0;
|
|
47
|
+
let currentPosition = 0;
|
|
48
|
+
for (const section of sections) {
|
|
49
|
+
const { content, tagName, attributes } = section;
|
|
50
|
+
// Process content
|
|
51
|
+
let processedContent = content;
|
|
52
|
+
if (extractTextOnly) {
|
|
53
|
+
processedContent = this.extractText(content);
|
|
54
|
+
}
|
|
55
|
+
// Split if content is too large
|
|
56
|
+
const contentChunks = this.splitContent(processedContent, maxSize, overlap);
|
|
57
|
+
for (const contentChunk of contentChunks) {
|
|
58
|
+
const finalText = trimWhitespace ? contentChunk.trim() : contentChunk;
|
|
59
|
+
if (finalText.length > 0) {
|
|
60
|
+
const chunkMetadata = {
|
|
61
|
+
...metadata,
|
|
62
|
+
};
|
|
63
|
+
if (includeTagMetadata && tagName) {
|
|
64
|
+
chunkMetadata.tagName = tagName;
|
|
65
|
+
if (attributes && Object.keys(attributes).length > 0) {
|
|
66
|
+
chunkMetadata.attributes = attributes;
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
chunks.push({
|
|
70
|
+
id: randomUUID(),
|
|
71
|
+
text: finalText,
|
|
72
|
+
metadata: {
|
|
73
|
+
documentId,
|
|
74
|
+
chunkIndex,
|
|
75
|
+
startPosition: currentPosition,
|
|
76
|
+
endPosition: currentPosition + contentChunk.length,
|
|
77
|
+
documentType: "html",
|
|
78
|
+
custom: chunkMetadata,
|
|
79
|
+
},
|
|
80
|
+
});
|
|
81
|
+
chunkIndex++;
|
|
82
|
+
}
|
|
83
|
+
currentPosition += contentChunk.length;
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
// Update total chunks count
|
|
87
|
+
chunks.forEach((chunk) => {
|
|
88
|
+
chunk.metadata.totalChunks = chunks.length;
|
|
89
|
+
});
|
|
90
|
+
return chunks;
|
|
91
|
+
}
|
|
92
|
+
/**
|
|
93
|
+
* Split HTML by structural tags
|
|
94
|
+
*/
|
|
95
|
+
splitByTags(html, splitTags, preserveTags) {
|
|
96
|
+
const sections = [];
|
|
97
|
+
// Create regex pattern for split tags
|
|
98
|
+
const tagPattern = new RegExp(`<(${splitTags.join("|")})([^>]*)>([\\s\\S]*?)</\\1>`, "gi");
|
|
99
|
+
let lastIndex = 0;
|
|
100
|
+
let match;
|
|
101
|
+
// Reset regex
|
|
102
|
+
tagPattern.lastIndex = 0;
|
|
103
|
+
while ((match = tagPattern.exec(html)) !== null) {
|
|
104
|
+
// Content before this tag
|
|
105
|
+
if (match.index > lastIndex) {
|
|
106
|
+
const beforeContent = html.slice(lastIndex, match.index).trim();
|
|
107
|
+
if (beforeContent.length > 0) {
|
|
108
|
+
sections.push({
|
|
109
|
+
content: beforeContent,
|
|
110
|
+
});
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
const tagName = match[1].toLowerCase();
|
|
114
|
+
const attributeString = match[2];
|
|
115
|
+
const innerContent = match[3];
|
|
116
|
+
// Parse attributes
|
|
117
|
+
const attributes = this.parseAttributes(attributeString);
|
|
118
|
+
// Check if this tag should be preserved as a unit
|
|
119
|
+
const shouldPreserve = preserveTags.some((pt) => innerContent.toLowerCase().includes(`<${pt}`));
|
|
120
|
+
if (shouldPreserve) {
|
|
121
|
+
// Keep the full tag content
|
|
122
|
+
sections.push({
|
|
123
|
+
content: match[0],
|
|
124
|
+
tagName,
|
|
125
|
+
attributes,
|
|
126
|
+
});
|
|
127
|
+
}
|
|
128
|
+
else {
|
|
129
|
+
// Just the inner content
|
|
130
|
+
sections.push({
|
|
131
|
+
content: innerContent,
|
|
132
|
+
tagName,
|
|
133
|
+
attributes,
|
|
134
|
+
});
|
|
135
|
+
}
|
|
136
|
+
lastIndex = match.index + match[0].length;
|
|
137
|
+
}
|
|
138
|
+
// Don't forget content after the last tag
|
|
139
|
+
if (lastIndex < html.length) {
|
|
140
|
+
const remaining = html.slice(lastIndex).trim();
|
|
141
|
+
if (remaining.length > 0) {
|
|
142
|
+
sections.push({
|
|
143
|
+
content: remaining,
|
|
144
|
+
});
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
// If no tags found, return entire text as one section
|
|
148
|
+
if (sections.length === 0 && html.trim()) {
|
|
149
|
+
sections.push({
|
|
150
|
+
content: html.trim(),
|
|
151
|
+
});
|
|
152
|
+
}
|
|
153
|
+
return sections;
|
|
154
|
+
}
|
|
155
|
+
/**
|
|
156
|
+
* Parse HTML attributes from string
|
|
157
|
+
*/
|
|
158
|
+
parseAttributes(attributeString) {
|
|
159
|
+
const attributes = {};
|
|
160
|
+
const attrPattern = /(\w+)(?:=["']([^"']*?)["'])?/g;
|
|
161
|
+
let match;
|
|
162
|
+
while ((match = attrPattern.exec(attributeString)) !== null) {
|
|
163
|
+
const name = match[1];
|
|
164
|
+
const value = match[2] || "";
|
|
165
|
+
attributes[name] = value;
|
|
166
|
+
}
|
|
167
|
+
return attributes;
|
|
168
|
+
}
|
|
169
|
+
/**
|
|
170
|
+
* Extract plain text from HTML
|
|
171
|
+
*/
|
|
172
|
+
extractText(html) {
|
|
173
|
+
return (html
|
|
174
|
+
// Remove script and style elements
|
|
175
|
+
.replace(/<script[\s\S]*?<\/script>/gi, "")
|
|
176
|
+
.replace(/<style[\s\S]*?<\/style>/gi, "")
|
|
177
|
+
// Remove HTML comments
|
|
178
|
+
.replace(/<!--[\s\S]*?-->/g, "")
|
|
179
|
+
// Replace block elements with newlines
|
|
180
|
+
.replace(/<\/(p|div|br|h[1-6]|li|tr)>/gi, "\n")
|
|
181
|
+
// Remove remaining tags
|
|
182
|
+
.replace(/<[^>]+>/g, "")
|
|
183
|
+
// Decode common HTML entities
|
|
184
|
+
.replace(/ /gi, " ")
|
|
185
|
+
.replace(/&/gi, "&")
|
|
186
|
+
.replace(/</gi, "<")
|
|
187
|
+
.replace(/>/gi, ">")
|
|
188
|
+
.replace(/"/gi, '"')
|
|
189
|
+
.replace(/'/gi, "'")
|
|
190
|
+
// Normalize whitespace
|
|
191
|
+
.replace(/\s+/g, " ")
|
|
192
|
+
.trim());
|
|
193
|
+
}
|
|
194
|
+
/**
|
|
195
|
+
* Split content that exceeds max size
|
|
196
|
+
*/
|
|
197
|
+
splitContent(content, maxSize, overlap) {
|
|
198
|
+
const effectiveMaxSize = Math.max(maxSize, 1);
|
|
199
|
+
const effectiveOverlap = Math.min(Math.max(overlap, 0), effectiveMaxSize - 1);
|
|
200
|
+
if (content.length <= effectiveMaxSize) {
|
|
201
|
+
return [content];
|
|
202
|
+
}
|
|
203
|
+
const chunks = [];
|
|
204
|
+
let start = 0;
|
|
205
|
+
while (start < content.length) {
|
|
206
|
+
let end = Math.min(start + effectiveMaxSize, content.length);
|
|
207
|
+
// Try to break at a natural boundary
|
|
208
|
+
if (end < content.length) {
|
|
209
|
+
const searchStart = Math.max(start, end - 100);
|
|
210
|
+
const searchText = content.slice(searchStart, end);
|
|
211
|
+
// Look for paragraph/sentence break
|
|
212
|
+
const breakMatch = searchText.match(/[.!?\n]\s+/);
|
|
213
|
+
if (breakMatch && breakMatch.index !== undefined) {
|
|
214
|
+
end = searchStart + breakMatch.index + 1;
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
chunks.push(content.slice(start, end));
|
|
218
|
+
start = Math.max(start + 1, end - effectiveOverlap);
|
|
219
|
+
}
|
|
220
|
+
return chunks;
|
|
221
|
+
}
|
|
222
|
+
validateConfig(config) {
|
|
223
|
+
const errors = [];
|
|
224
|
+
const warnings = [];
|
|
225
|
+
const htmlConfig = config;
|
|
226
|
+
if (htmlConfig.maxSize !== undefined && htmlConfig.maxSize <= 0) {
|
|
227
|
+
errors.push("maxSize must be greater than 0");
|
|
228
|
+
}
|
|
229
|
+
if (htmlConfig.overlap !== undefined && htmlConfig.overlap < 0) {
|
|
230
|
+
errors.push("overlap must be non-negative");
|
|
231
|
+
}
|
|
232
|
+
if (htmlConfig.overlap !== undefined &&
|
|
233
|
+
htmlConfig.maxSize !== undefined &&
|
|
234
|
+
htmlConfig.overlap >= htmlConfig.maxSize) {
|
|
235
|
+
errors.push("overlap must be less than maxSize");
|
|
236
|
+
}
|
|
237
|
+
if (htmlConfig.splitTags !== undefined &&
|
|
238
|
+
htmlConfig.splitTags.length === 0) {
|
|
239
|
+
warnings.push("No split tags specified, using defaults");
|
|
240
|
+
}
|
|
241
|
+
return {
|
|
242
|
+
valid: errors.length === 0,
|
|
243
|
+
errors,
|
|
244
|
+
warnings,
|
|
245
|
+
};
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
//# sourceMappingURL=htmlChunker.js.map
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Chunking Module Exports
|
|
3
|
+
*
|
|
4
|
+
* Provides all chunking strategies and the chunker registry.
|
|
5
|
+
*/
|
|
6
|
+
export { ChunkerRegistry, chunkText } from "./chunkerRegistry.js";
|
|
7
|
+
export { CharacterChunker } from "./characterChunker.js";
|
|
8
|
+
export { RecursiveChunker } from "./recursiveChunker.js";
|
|
9
|
+
export { SentenceChunker } from "./sentenceChunker.js";
|
|
10
|
+
export { TokenChunker } from "./tokenChunker.js";
|
|
11
|
+
export { MarkdownChunker } from "./markdownChunker.js";
|
|
12
|
+
export { HTMLChunker } from "./htmlChunker.js";
|
|
13
|
+
export { JSONChunker } from "./jsonChunker.js";
|
|
14
|
+
export { LaTeXChunker } from "./latexChunker.js";
|
|
15
|
+
export { SemanticChunker } from "./semanticChunker.js";
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Chunking Module Exports
|
|
3
|
+
*
|
|
4
|
+
* Provides all chunking strategies and the chunker registry.
|
|
5
|
+
*/
|
|
6
|
+
// Registry
|
|
7
|
+
export { ChunkerRegistry, chunkText } from "./chunkerRegistry.js";
|
|
8
|
+
// Individual chunkers
|
|
9
|
+
export { CharacterChunker } from "./characterChunker.js";
|
|
10
|
+
export { RecursiveChunker } from "./recursiveChunker.js";
|
|
11
|
+
export { SentenceChunker } from "./sentenceChunker.js";
|
|
12
|
+
export { TokenChunker } from "./tokenChunker.js";
|
|
13
|
+
export { MarkdownChunker } from "./markdownChunker.js";
|
|
14
|
+
export { HTMLChunker } from "./htmlChunker.js";
|
|
15
|
+
export { JSONChunker } from "./jsonChunker.js";
|
|
16
|
+
export { LaTeXChunker } from "./latexChunker.js";
|
|
17
|
+
export { SemanticChunker } from "./semanticChunker.js";
|
|
18
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JSON-aware Chunker
|
|
3
|
+
*
|
|
4
|
+
* Splits JSON documents based on structure (arrays, objects, keys).
|
|
5
|
+
* Best for API responses, configuration files, and structured data.
|
|
6
|
+
*/
|
|
7
|
+
import type { Chunker, Chunk, ChunkerValidationResult, JSONChunkerConfig, BaseChunkerConfig } from "../types.js";
|
|
8
|
+
/**
 * Chunker for JSON documents.
 *
 * Public surface: the `"json"` strategy tag, `chunk` for producing chunks
 * and `validateConfig` for checking a configuration up front.
 */
export declare class JSONChunker implements Chunker {
    /** Strategy identifier of this chunker. */
    readonly strategy: "json";
    /**
     * Chunk a JSON string.
     *
     * @param text - Document to chunk.
     * @param config - Optional JSON chunker settings.
     * @returns Promise resolving to the produced chunks.
     */
    chunk(text: string, config?: JSONChunkerConfig): Promise<Chunk[]>;
    /**
     * Internal recursive walker over the parsed JSON structure.
     */
    private extractChunks;
    /**
     * Check a configuration and report the validation result.
     *
     * @param config - Configuration to validate.
     */
    validateConfig(config: BaseChunkerConfig): ChunkerValidationResult;
}
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JSON-aware Chunker
|
|
3
|
+
*
|
|
4
|
+
* Splits JSON documents based on structure (arrays, objects, keys).
|
|
5
|
+
* Best for API responses, configuration files, and structured data.
|
|
6
|
+
*/
|
|
7
|
+
import { randomUUID } from "crypto";
|
|
8
|
+
/**
 * JSON-aware chunker implementation.
 *
 * Parses the input and walks the structure, emitting pieces whose
 * pretty-printed size is kept around `maxSize`. Input that is not valid
 * JSON is returned as a single plain-text chunk flagged with `parseError`.
 */
export class JSONChunker {
    strategy = "json";
    /**
     * Chunk a JSON string.
     *
     * @param text - JSON source; an empty/falsy value yields `[]`.
     * @param config - Optional settings: `maxSize` (1000), `maxDepth` (10),
     *   `splitKeys` ([]), `preserveKeys` ([]), `includeJsonPath` (true),
     *   `trimWhitespace` (true), `metadata` ({}).
     * @returns Chunks whose `text` is the pretty-printed (2-space) JSON of each piece.
     */
    async chunk(text, config) {
        const { maxSize = 1000, maxDepth = 10, splitKeys = [], preserveKeys = [], includeJsonPath = true, trimWhitespace = true, metadata = {}, } = config || {};
        const documentId = randomUUID();
        const chunks = [];
        if (!text || text.length === 0) {
            return chunks;
        }
        let jsonData;
        try {
            jsonData = JSON.parse(text);
        }
        catch {
            // Not valid JSON: hand the text back as one plain chunk, but
            // never emit an empty chunk (same guard as the main path below).
            const fallbackText = trimWhitespace ? text.trim() : text;
            if (fallbackText.length === 0) {
                return chunks;
            }
            chunks.push({
                id: randomUUID(),
                text: fallbackText,
                metadata: {
                    documentId,
                    chunkIndex: 0,
                    totalChunks: 1,
                    startPosition: 0,
                    endPosition: text.length,
                    documentType: "json",
                    custom: {
                        ...metadata,
                        parseError: "Invalid JSON",
                    },
                },
            });
            return chunks;
        }
        const extractedChunks = this.extractChunks({
            data: jsonData,
            path: "",
            depth: 0,
            maxDepth,
            maxSize,
            splitKeys,
            preserveKeys,
            includeJsonPath,
        });
        let chunkIndex = 0;
        // Running offset over the emitted chunk texts (not an offset into `text`).
        let currentPosition = 0;
        for (const extracted of extractedChunks) {
            const chunkText = JSON.stringify(extracted.value, null, 2);
            const finalText = trimWhitespace ? chunkText.trim() : chunkText;
            if (finalText.length === 0) {
                continue;
            }
            const chunkMetadata = {
                ...metadata,
            };
            if (includeJsonPath && extracted.path) {
                chunkMetadata.jsonPath = extracted.path;
            }
            const chunkInfo = {
                documentId,
                chunkIndex,
                startPosition: currentPosition,
                endPosition: currentPosition + finalText.length,
                documentType: "json",
                custom: chunkMetadata,
            };
            // Honour includeJsonPath for the top-level field as well.
            if (includeJsonPath) {
                chunkInfo.jsonPath = extracted.path;
            }
            chunks.push({
                id: randomUUID(),
                text: finalText,
                metadata: chunkInfo,
            });
            chunkIndex++;
            currentPosition += finalText.length;
        }
        // totalChunks is only known once everything has been emitted.
        chunks.forEach((chunk) => {
            chunk.metadata.totalChunks = chunks.length;
        });
        return chunks;
    }
    /**
     * Recursively extract chunks from JSON structure.
     *
     * @param options - `{ data, path, depth, maxDepth, maxSize, splitKeys,
     *   preserveKeys, includeJsonPath }`.
     * @returns List of `{ value, path }` pieces in document order. Array
     *   groups use a slice-style path suffix such as `items[0:6]`.
     */
    extractChunks(options) {
        const { data, path, depth, maxDepth, maxSize, splitKeys, preserveKeys, includeJsonPath, } = options;
        const results = [];
        const descend = (value, childPath) => this.extractChunks({
            data: value,
            path: childPath,
            depth: depth + 1,
            maxDepth,
            maxSize,
            splitKeys,
            preserveKeys,
            includeJsonPath,
        });
        // Depth limit reached: emit whatever is here as-is.
        if (depth > maxDepth) {
            results.push({ value: data, path });
            return results;
        }
        // Values stored under a preserved key are never split.
        const currentKey = path.split(".").pop() || "";
        if (preserveKeys.includes(currentKey)) {
            results.push({ value: data, path });
            return results;
        }
        // Small enough: keep as one chunk.
        const serialized = JSON.stringify(data, null, 2);
        if (serialized.length <= maxSize) {
            results.push({ value: data, path });
            return results;
        }
        if (Array.isArray(data)) {
            const splitThisArray = splitKeys.length === 0 || splitKeys.some((k) => path.endsWith(k));
            if (!splitThisArray) {
                // Not selected for splitting: keep the array as one unit.
                results.push({ value: data, path });
                return results;
            }
            let group = [];
            let groupSize = 0;
            // Emit the pending run of items; endIndex is exclusive.
            const flushGroup = (endIndex) => {
                if (group.length === 0) {
                    return;
                }
                results.push({
                    value: group.length === 1 ? group[0] : group,
                    path: `${path}[${endIndex - group.length}:${endIndex}]`,
                });
                group = [];
                groupSize = 0;
            };
            for (let i = 0; i < data.length; i++) {
                const item = data[i];
                const itemSize = JSON.stringify(item, null, 2).length;
                if (groupSize + itemSize > maxSize) {
                    flushGroup(i);
                }
                if (itemSize > maxSize) {
                    // A single oversized item is split on its own.
                    results.push(...descend(item, `${path}[${i}]`));
                }
                else {
                    group.push(item);
                    groupSize += itemSize;
                }
            }
            flushGroup(data.length);
            return results;
        }
        if (data !== null && typeof data === "object") {
            const obj = data;
            const keys = Object.keys(obj);
            const keysToSplit = keys.filter((k) => splitKeys.length === 0 || splitKeys.includes(k));
            if (keysToSplit.length > 0) {
                // Collected as entries and materialised with Object.fromEntries
                // so that a "__proto__" key stays an ordinary own property.
                let entries = [];
                let entriesSize = 0;
                const flushEntries = () => {
                    if (entries.length === 0) {
                        return;
                    }
                    results.push({ value: Object.fromEntries(entries), path });
                    entries = [];
                    entriesSize = 0;
                };
                for (const key of keys) {
                    const value = obj[key];
                    const valueSize = JSON.stringify({ [key]: value }, null, 2).length;
                    const keyPath = path ? `${path}.${key}` : key;
                    // An array/object that cannot fit into any chunk must be
                    // walked, otherwise e.g. { "data": [ ...huge... ] } would
                    // come back as one giant chunk with the default splitKeys.
                    const oversizedContainer = value !== null &&
                        typeof value === "object" &&
                        valueSize > maxSize &&
                        JSON.stringify(value, null, 2).length > maxSize;
                    if (splitKeys.includes(key) || oversizedContainer) {
                        flushEntries();
                        results.push(...descend(value, keyPath));
                    }
                    else {
                        if (entriesSize + valueSize > maxSize) {
                            flushEntries();
                        }
                        entries.push([key, value]);
                        entriesSize += valueSize;
                    }
                }
                flushEntries();
            }
            else {
                // No key of this object is a split key: one piece per key,
                // recursing only into values that are too large.
                for (const key of keys) {
                    const value = obj[key];
                    const keyPath = path ? `${path}.${key}` : key;
                    const valueSize = JSON.stringify(value, null, 2).length;
                    if (valueSize > maxSize) {
                        results.push(...descend(value, keyPath));
                    }
                    else {
                        results.push({
                            value: { [key]: value },
                            path: keyPath,
                        });
                    }
                }
            }
            return results;
        }
        // Oversized primitive: cannot be split structurally.
        results.push({ value: data, path });
        return results;
    }
    /**
     * Validate a chunker configuration.
     *
     * @param config - Configuration to check (a missing config is treated as empty).
     * @returns `{ valid, errors, warnings }`; `valid` is true when there are no errors.
     */
    validateConfig(config) {
        const errors = [];
        const warnings = [];
        const jsonConfig = config || {};
        if (jsonConfig.maxSize !== undefined && jsonConfig.maxSize <= 0) {
            errors.push("maxSize must be greater than 0");
        }
        if (jsonConfig.maxDepth !== undefined && jsonConfig.maxDepth < 1) {
            errors.push("maxDepth must be at least 1");
        }
        if (jsonConfig.maxDepth !== undefined && jsonConfig.maxDepth > 100) {
            warnings.push("Very high maxDepth may cause performance issues");
        }
        return {
            valid: errors.length === 0,
            errors,
            warnings,
        };
    }
}
|
|
282
|
+
//# sourceMappingURL=jsonChunker.js.map
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LaTeX-aware Chunker
|
|
3
|
+
*
|
|
4
|
+
* Splits LaTeX documents based on structure (sections, environments, math).
|
|
5
|
+
* Best for academic papers, scientific documents, and mathematical content.
|
|
6
|
+
*/
|
|
7
|
+
import type { BaseChunkerConfig, Chunk, Chunker, ChunkerValidationResult, LaTeXChunkerConfig } from "../types.js";
|
|
8
|
+
/**
 * Chunker for LaTeX documents.
 *
 * Public surface: the `"latex"` strategy tag, `chunk` for producing chunks
 * and `validateConfig` for checking a configuration up front.
 */
export declare class LaTeXChunker implements Chunker {
    /** Strategy identifier of this chunker. */
    readonly strategy: "latex";
    /** Internal default list of environments used for splitting. */
    private readonly defaultSplitEnvironments;
    /** Internal list of math environments. */
    private readonly mathEnvironments;
    /**
     * Chunk a LaTeX string.
     *
     * @param text - Document to chunk.
     * @param config - Optional LaTeX chunker settings.
     * @returns Promise resolving to the produced chunks.
     */
    chunk(text: string, config?: LaTeXChunkerConfig): Promise<Chunk[]>;
    /**
     * Internal helper that splits the document on sectioning commands.
     */
    private splitBySections;
    /**
     * Internal helper that breaks up content larger than the maximum size.
     */
    private splitContent;
    /**
     * Check a configuration and report the validation result.
     *
     * @param config - Configuration to validate.
     */
    validateConfig(config: BaseChunkerConfig): ChunkerValidationResult;
}
|