@juspay/neurolink 9.1.1 → 9.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/README.md +106 -37
- package/dist/agent/directTools.d.ts +11 -11
- package/dist/cli/commands/config.d.ts +6 -6
- package/dist/cli/commands/rag.d.ts +19 -0
- package/dist/cli/commands/rag.js +756 -0
- package/dist/cli/factories/commandFactory.js +146 -83
- package/dist/cli/parser.js +4 -1
- package/dist/core/baseProvider.d.ts +43 -30
- package/dist/core/baseProvider.js +98 -138
- package/dist/core/conversationMemoryFactory.d.ts +2 -2
- package/dist/core/conversationMemoryFactory.js +2 -2
- package/dist/core/conversationMemoryInitializer.d.ts +1 -2
- package/dist/core/conversationMemoryInitializer.js +2 -2
- package/dist/core/infrastructure/baseError.d.ts +21 -0
- package/dist/core/infrastructure/baseError.js +22 -0
- package/dist/core/infrastructure/baseFactory.d.ts +21 -0
- package/dist/core/infrastructure/baseFactory.js +54 -0
- package/dist/core/infrastructure/baseRegistry.d.ts +21 -0
- package/dist/core/infrastructure/baseRegistry.js +49 -0
- package/dist/core/infrastructure/index.d.ts +5 -0
- package/dist/core/infrastructure/index.js +5 -0
- package/dist/core/infrastructure/retry.d.ts +7 -0
- package/dist/core/infrastructure/retry.js +20 -0
- package/dist/core/infrastructure/typedEventEmitter.d.ts +8 -0
- package/dist/core/infrastructure/typedEventEmitter.js +23 -0
- package/dist/core/redisConversationMemoryManager.d.ts +1 -6
- package/dist/core/redisConversationMemoryManager.js +7 -19
- package/dist/factories/providerFactory.d.ts +5 -3
- package/dist/factories/providerFactory.js +31 -24
- package/dist/image-gen/ImageGenService.d.ts +143 -0
- package/dist/image-gen/ImageGenService.js +345 -0
- package/dist/image-gen/imageGenTools.d.ts +126 -0
- package/dist/image-gen/imageGenTools.js +304 -0
- package/dist/image-gen/index.d.ts +46 -0
- package/dist/image-gen/index.js +48 -0
- package/dist/image-gen/types.d.ts +237 -0
- package/dist/image-gen/types.js +24 -0
- package/dist/index.d.ts +46 -12
- package/dist/index.js +88 -36
- package/dist/lib/agent/directTools.d.ts +8 -8
- package/dist/lib/core/baseProvider.d.ts +43 -30
- package/dist/lib/core/baseProvider.js +98 -138
- package/dist/lib/core/conversationMemoryFactory.d.ts +2 -2
- package/dist/lib/core/conversationMemoryFactory.js +2 -2
- package/dist/lib/core/conversationMemoryInitializer.d.ts +1 -2
- package/dist/lib/core/conversationMemoryInitializer.js +2 -2
- package/dist/lib/core/infrastructure/baseError.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseError.js +23 -0
- package/dist/lib/core/infrastructure/baseFactory.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseFactory.js +55 -0
- package/dist/lib/core/infrastructure/baseRegistry.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseRegistry.js +50 -0
- package/dist/lib/core/infrastructure/index.d.ts +5 -0
- package/dist/lib/core/infrastructure/index.js +6 -0
- package/dist/lib/core/infrastructure/retry.d.ts +7 -0
- package/dist/lib/core/infrastructure/retry.js +21 -0
- package/dist/lib/core/infrastructure/typedEventEmitter.d.ts +8 -0
- package/dist/lib/core/infrastructure/typedEventEmitter.js +24 -0
- package/dist/lib/core/redisConversationMemoryManager.d.ts +1 -6
- package/dist/lib/core/redisConversationMemoryManager.js +7 -19
- package/dist/lib/factories/providerFactory.d.ts +5 -3
- package/dist/lib/factories/providerFactory.js +31 -24
- package/dist/lib/image-gen/ImageGenService.d.ts +143 -0
- package/dist/lib/image-gen/ImageGenService.js +346 -0
- package/dist/lib/image-gen/imageGenTools.d.ts +126 -0
- package/dist/lib/image-gen/imageGenTools.js +305 -0
- package/dist/lib/image-gen/index.d.ts +46 -0
- package/dist/lib/image-gen/index.js +49 -0
- package/dist/lib/image-gen/types.d.ts +237 -0
- package/dist/lib/image-gen/types.js +25 -0
- package/dist/lib/index.d.ts +46 -12
- package/dist/lib/index.js +88 -36
- package/dist/lib/mcp/index.d.ts +6 -5
- package/dist/lib/mcp/index.js +7 -5
- package/dist/lib/neurolink.d.ts +11 -13
- package/dist/lib/neurolink.js +95 -29
- package/dist/lib/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/lib/processors/base/BaseFileProcessor.js +614 -0
- package/dist/lib/processors/base/index.d.ts +14 -0
- package/dist/lib/processors/base/index.js +20 -0
- package/dist/lib/processors/base/types.d.ts +593 -0
- package/dist/lib/processors/base/types.js +77 -0
- package/dist/lib/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/lib/processors/cli/fileProcessorCli.js +389 -0
- package/dist/lib/processors/cli/index.d.ts +37 -0
- package/dist/lib/processors/cli/index.js +50 -0
- package/dist/lib/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/lib/processors/code/ConfigProcessor.js +401 -0
- package/dist/lib/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/lib/processors/code/SourceCodeProcessor.js +305 -0
- package/dist/lib/processors/code/index.d.ts +44 -0
- package/dist/lib/processors/code/index.js +61 -0
- package/dist/lib/processors/config/fileTypes.d.ts +283 -0
- package/dist/lib/processors/config/fileTypes.js +521 -0
- package/dist/lib/processors/config/index.d.ts +32 -0
- package/dist/lib/processors/config/index.js +93 -0
- package/dist/lib/processors/config/languageMap.d.ts +66 -0
- package/dist/lib/processors/config/languageMap.js +411 -0
- package/dist/lib/processors/config/mimeTypes.d.ts +376 -0
- package/dist/lib/processors/config/mimeTypes.js +339 -0
- package/dist/lib/processors/config/sizeLimits.d.ts +194 -0
- package/dist/lib/processors/config/sizeLimits.js +247 -0
- package/dist/lib/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/lib/processors/data/JsonProcessor.js +204 -0
- package/dist/lib/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/lib/processors/data/XmlProcessor.js +284 -0
- package/dist/lib/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/lib/processors/data/YamlProcessor.js +295 -0
- package/dist/lib/processors/data/index.d.ts +49 -0
- package/dist/lib/processors/data/index.js +77 -0
- package/dist/lib/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/lib/processors/document/ExcelProcessor.js +520 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.js +211 -0
- package/dist/lib/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/lib/processors/document/RtfProcessor.js +362 -0
- package/dist/lib/processors/document/WordProcessor.d.ts +168 -0
- package/dist/lib/processors/document/WordProcessor.js +354 -0
- package/dist/lib/processors/document/index.d.ts +54 -0
- package/dist/lib/processors/document/index.js +91 -0
- package/dist/lib/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/lib/processors/errors/FileErrorCode.js +256 -0
- package/dist/lib/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/lib/processors/errors/errorHelpers.js +379 -0
- package/dist/lib/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/lib/processors/errors/errorSerializer.js +508 -0
- package/dist/lib/processors/errors/index.d.ts +46 -0
- package/dist/lib/processors/errors/index.js +50 -0
- package/dist/lib/processors/index.d.ts +76 -0
- package/dist/lib/processors/index.js +113 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.js +273 -0
- package/dist/lib/processors/integration/index.d.ts +42 -0
- package/dist/lib/processors/integration/index.js +45 -0
- package/dist/lib/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/lib/processors/markup/HtmlProcessor.js +250 -0
- package/dist/lib/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/lib/processors/markup/MarkdownProcessor.js +245 -0
- package/dist/lib/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/lib/processors/markup/SvgProcessor.js +241 -0
- package/dist/lib/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/lib/processors/markup/TextProcessor.js +189 -0
- package/dist/lib/processors/markup/index.d.ts +66 -0
- package/dist/lib/processors/markup/index.js +103 -0
- package/dist/lib/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/lib/processors/registry/ProcessorRegistry.js +609 -0
- package/dist/lib/processors/registry/index.d.ts +12 -0
- package/dist/lib/processors/registry/index.js +17 -0
- package/dist/lib/processors/registry/types.d.ts +53 -0
- package/dist/lib/processors/registry/types.js +11 -0
- package/dist/lib/providers/amazonBedrock.d.ts +15 -2
- package/dist/lib/providers/amazonBedrock.js +65 -8
- package/dist/lib/providers/anthropic.d.ts +3 -3
- package/dist/lib/providers/anthropic.js +10 -7
- package/dist/lib/providers/googleAiStudio.d.ts +5 -5
- package/dist/lib/providers/googleAiStudio.js +10 -7
- package/dist/lib/providers/googleVertex.d.ts +16 -4
- package/dist/lib/providers/googleVertex.js +72 -16
- package/dist/lib/providers/litellm.d.ts +3 -3
- package/dist/lib/providers/litellm.js +10 -10
- package/dist/lib/providers/mistral.d.ts +3 -3
- package/dist/lib/providers/mistral.js +7 -6
- package/dist/lib/providers/ollama.d.ts +3 -4
- package/dist/lib/providers/ollama.js +7 -8
- package/dist/lib/providers/openAI.d.ts +14 -2
- package/dist/lib/providers/openAI.js +60 -6
- package/dist/lib/providers/openRouter.d.ts +2 -2
- package/dist/lib/providers/openRouter.js +10 -6
- package/dist/lib/providers/sagemaker/language-model.d.ts +2 -2
- package/dist/lib/rag/ChunkerFactory.d.ts +91 -0
- package/dist/lib/rag/ChunkerFactory.js +321 -0
- package/dist/lib/rag/ChunkerRegistry.d.ts +91 -0
- package/dist/lib/rag/ChunkerRegistry.js +422 -0
- package/dist/lib/rag/chunkers/BaseChunker.d.ts +53 -0
- package/dist/lib/rag/chunkers/BaseChunker.js +144 -0
- package/dist/lib/rag/chunkers/CharacterChunker.d.ts +18 -0
- package/dist/lib/rag/chunkers/CharacterChunker.js +29 -0
- package/dist/lib/rag/chunkers/HTMLChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/HTMLChunker.js +39 -0
- package/dist/lib/rag/chunkers/JSONChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/JSONChunker.js +69 -0
- package/dist/lib/rag/chunkers/LaTeXChunker.d.ts +15 -0
- package/dist/lib/rag/chunkers/LaTeXChunker.js +64 -0
- package/dist/lib/rag/chunkers/MarkdownChunker.d.ts +15 -0
- package/dist/lib/rag/chunkers/MarkdownChunker.js +103 -0
- package/dist/lib/rag/chunkers/RecursiveChunker.d.ts +27 -0
- package/dist/lib/rag/chunkers/RecursiveChunker.js +140 -0
- package/dist/lib/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
- package/dist/lib/rag/chunkers/SemanticMarkdownChunker.js +139 -0
- package/dist/lib/rag/chunkers/SentenceChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/SentenceChunker.js +67 -0
- package/dist/lib/rag/chunkers/TokenChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/TokenChunker.js +62 -0
- package/dist/lib/rag/chunkers/index.d.ts +15 -0
- package/dist/lib/rag/chunkers/index.js +16 -0
- package/dist/lib/rag/chunking/characterChunker.d.ts +16 -0
- package/dist/lib/rag/chunking/characterChunker.js +143 -0
- package/dist/lib/rag/chunking/chunkerRegistry.d.ts +67 -0
- package/dist/lib/rag/chunking/chunkerRegistry.js +195 -0
- package/dist/lib/rag/chunking/htmlChunker.d.ts +34 -0
- package/dist/lib/rag/chunking/htmlChunker.js +248 -0
- package/dist/lib/rag/chunking/index.d.ts +15 -0
- package/dist/lib/rag/chunking/index.js +18 -0
- package/dist/lib/rag/chunking/jsonChunker.d.ts +20 -0
- package/dist/lib/rag/chunking/jsonChunker.js +282 -0
- package/dist/lib/rag/chunking/latexChunker.d.ts +26 -0
- package/dist/lib/rag/chunking/latexChunker.js +252 -0
- package/dist/lib/rag/chunking/markdownChunker.d.ts +19 -0
- package/dist/lib/rag/chunking/markdownChunker.js +202 -0
- package/dist/lib/rag/chunking/recursiveChunker.d.ts +19 -0
- package/dist/lib/rag/chunking/recursiveChunker.js +149 -0
- package/dist/lib/rag/chunking/semanticChunker.d.ts +41 -0
- package/dist/lib/rag/chunking/semanticChunker.js +307 -0
- package/dist/lib/rag/chunking/sentenceChunker.d.ts +25 -0
- package/dist/lib/rag/chunking/sentenceChunker.js +231 -0
- package/dist/lib/rag/chunking/tokenChunker.d.ts +36 -0
- package/dist/lib/rag/chunking/tokenChunker.js +184 -0
- package/dist/lib/rag/document/MDocument.d.ts +198 -0
- package/dist/lib/rag/document/MDocument.js +393 -0
- package/dist/lib/rag/document/index.d.ts +5 -0
- package/dist/lib/rag/document/index.js +6 -0
- package/dist/lib/rag/document/loaders.d.ts +201 -0
- package/dist/lib/rag/document/loaders.js +501 -0
- package/dist/lib/rag/errors/RAGError.d.ts +244 -0
- package/dist/lib/rag/errors/RAGError.js +275 -0
- package/dist/lib/rag/errors/index.d.ts +6 -0
- package/dist/lib/rag/errors/index.js +7 -0
- package/dist/lib/rag/graphRag/graphRAG.d.ts +115 -0
- package/dist/lib/rag/graphRag/graphRAG.js +385 -0
- package/dist/lib/rag/graphRag/index.d.ts +4 -0
- package/dist/lib/rag/graphRag/index.js +5 -0
- package/dist/lib/rag/index.d.ts +103 -0
- package/dist/lib/rag/index.js +142 -0
- package/dist/lib/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
- package/dist/lib/rag/metadata/MetadataExtractorFactory.js +419 -0
- package/dist/lib/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
- package/dist/lib/rag/metadata/MetadataExtractorRegistry.js +363 -0
- package/dist/lib/rag/metadata/index.d.ts +6 -0
- package/dist/lib/rag/metadata/index.js +10 -0
- package/dist/lib/rag/metadata/metadataExtractor.d.ts +69 -0
- package/dist/lib/rag/metadata/metadataExtractor.js +278 -0
- package/dist/lib/rag/pipeline/RAGPipeline.d.ts +235 -0
- package/dist/lib/rag/pipeline/RAGPipeline.js +402 -0
- package/dist/lib/rag/pipeline/contextAssembly.d.ts +126 -0
- package/dist/lib/rag/pipeline/contextAssembly.js +338 -0
- package/dist/lib/rag/pipeline/index.d.ts +5 -0
- package/dist/lib/rag/pipeline/index.js +6 -0
- package/dist/lib/rag/ragIntegration.d.ts +38 -0
- package/dist/lib/rag/ragIntegration.js +212 -0
- package/dist/lib/rag/reranker/RerankerFactory.d.ts +184 -0
- package/dist/lib/rag/reranker/RerankerFactory.js +431 -0
- package/dist/lib/rag/reranker/RerankerRegistry.d.ts +119 -0
- package/dist/lib/rag/reranker/RerankerRegistry.js +403 -0
- package/dist/lib/rag/reranker/index.d.ts +6 -0
- package/dist/lib/rag/reranker/index.js +10 -0
- package/dist/lib/rag/reranker/reranker.d.ts +71 -0
- package/dist/lib/rag/reranker/reranker.js +278 -0
- package/dist/lib/rag/resilience/CircuitBreaker.d.ts +215 -0
- package/dist/lib/rag/resilience/CircuitBreaker.js +432 -0
- package/dist/lib/rag/resilience/RetryHandler.d.ts +115 -0
- package/dist/lib/rag/resilience/RetryHandler.js +301 -0
- package/dist/lib/rag/resilience/index.d.ts +7 -0
- package/dist/lib/rag/resilience/index.js +8 -0
- package/dist/lib/rag/retrieval/hybridSearch.d.ts +94 -0
- package/dist/lib/rag/retrieval/hybridSearch.js +314 -0
- package/dist/lib/rag/retrieval/index.d.ts +5 -0
- package/dist/lib/rag/retrieval/index.js +6 -0
- package/dist/lib/rag/retrieval/vectorQueryTool.d.ts +93 -0
- package/dist/lib/rag/retrieval/vectorQueryTool.js +290 -0
- package/dist/lib/rag/types.d.ts +768 -0
- package/dist/lib/rag/types.js +9 -0
- package/dist/lib/server/index.d.ts +15 -11
- package/dist/lib/server/index.js +55 -51
- package/dist/lib/server/utils/validation.d.ts +2 -2
- package/dist/lib/types/common.d.ts +0 -1
- package/dist/lib/types/fileTypes.d.ts +1 -1
- package/dist/lib/types/generateTypes.d.ts +42 -8
- package/dist/lib/types/generateTypes.js +1 -1
- package/dist/lib/types/index.d.ts +25 -24
- package/dist/lib/types/index.js +21 -20
- package/dist/lib/types/modelTypes.d.ts +16 -16
- package/dist/lib/types/pptTypes.d.ts +14 -2
- package/dist/lib/types/pptTypes.js +16 -0
- package/dist/lib/types/streamTypes.d.ts +28 -8
- package/dist/lib/types/streamTypes.js +1 -1
- package/dist/lib/utils/async/delay.d.ts +40 -0
- package/dist/lib/utils/async/delay.js +43 -0
- package/dist/lib/utils/async/index.d.ts +23 -0
- package/dist/lib/utils/async/index.js +24 -0
- package/dist/lib/utils/async/retry.d.ts +141 -0
- package/dist/lib/utils/async/retry.js +172 -0
- package/dist/lib/utils/async/withTimeout.d.ts +73 -0
- package/dist/lib/utils/async/withTimeout.js +97 -0
- package/dist/lib/utils/fileDetector.d.ts +7 -1
- package/dist/lib/utils/fileDetector.js +91 -18
- package/dist/lib/utils/json/extract.d.ts +103 -0
- package/dist/lib/utils/json/extract.js +249 -0
- package/dist/lib/utils/json/index.d.ts +36 -0
- package/dist/lib/utils/json/index.js +37 -0
- package/dist/lib/utils/json/safeParse.d.ts +137 -0
- package/dist/lib/utils/json/safeParse.js +191 -0
- package/dist/lib/utils/messageBuilder.d.ts +2 -2
- package/dist/lib/utils/messageBuilder.js +15 -7
- package/dist/lib/utils/modelRouter.d.ts +4 -4
- package/dist/lib/utils/modelRouter.js +4 -4
- package/dist/lib/utils/sanitizers/filename.d.ts +137 -0
- package/dist/lib/utils/sanitizers/filename.js +366 -0
- package/dist/lib/utils/sanitizers/html.d.ts +170 -0
- package/dist/lib/utils/sanitizers/html.js +326 -0
- package/dist/lib/utils/sanitizers/index.d.ts +26 -0
- package/dist/lib/utils/sanitizers/index.js +30 -0
- package/dist/lib/utils/sanitizers/svg.d.ts +81 -0
- package/dist/lib/utils/sanitizers/svg.js +483 -0
- package/dist/mcp/index.d.ts +6 -5
- package/dist/mcp/index.js +7 -5
- package/dist/neurolink.d.ts +11 -13
- package/dist/neurolink.js +95 -29
- package/dist/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/processors/base/BaseFileProcessor.js +613 -0
- package/dist/processors/base/index.d.ts +14 -0
- package/dist/processors/base/index.js +19 -0
- package/dist/processors/base/types.d.ts +593 -0
- package/dist/processors/base/types.js +76 -0
- package/dist/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/processors/cli/fileProcessorCli.js +388 -0
- package/dist/processors/cli/index.d.ts +37 -0
- package/dist/processors/cli/index.js +49 -0
- package/dist/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/processors/code/ConfigProcessor.js +400 -0
- package/dist/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/processors/code/SourceCodeProcessor.js +304 -0
- package/dist/processors/code/index.d.ts +44 -0
- package/dist/processors/code/index.js +60 -0
- package/dist/processors/config/fileTypes.d.ts +283 -0
- package/dist/processors/config/fileTypes.js +520 -0
- package/dist/processors/config/index.d.ts +32 -0
- package/dist/processors/config/index.js +92 -0
- package/dist/processors/config/languageMap.d.ts +66 -0
- package/dist/processors/config/languageMap.js +410 -0
- package/dist/processors/config/mimeTypes.d.ts +376 -0
- package/dist/processors/config/mimeTypes.js +338 -0
- package/dist/processors/config/sizeLimits.d.ts +194 -0
- package/dist/processors/config/sizeLimits.js +246 -0
- package/dist/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/processors/data/JsonProcessor.js +203 -0
- package/dist/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/processors/data/XmlProcessor.js +283 -0
- package/dist/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/processors/data/YamlProcessor.js +294 -0
- package/dist/processors/data/index.d.ts +49 -0
- package/dist/processors/data/index.js +76 -0
- package/dist/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/processors/document/ExcelProcessor.js +519 -0
- package/dist/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/processors/document/OpenDocumentProcessor.js +210 -0
- package/dist/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/processors/document/RtfProcessor.js +361 -0
- package/dist/processors/document/WordProcessor.d.ts +168 -0
- package/dist/processors/document/WordProcessor.js +353 -0
- package/dist/processors/document/index.d.ts +54 -0
- package/dist/processors/document/index.js +90 -0
- package/dist/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/processors/errors/FileErrorCode.js +255 -0
- package/dist/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/processors/errors/errorHelpers.js +378 -0
- package/dist/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/processors/errors/errorSerializer.js +507 -0
- package/dist/processors/errors/index.d.ts +46 -0
- package/dist/processors/errors/index.js +49 -0
- package/dist/processors/index.d.ts +76 -0
- package/dist/processors/index.js +112 -0
- package/dist/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/processors/integration/FileProcessorIntegration.js +272 -0
- package/dist/processors/integration/index.d.ts +42 -0
- package/dist/processors/integration/index.js +44 -0
- package/dist/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/processors/markup/HtmlProcessor.js +249 -0
- package/dist/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/processors/markup/MarkdownProcessor.js +244 -0
- package/dist/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/processors/markup/SvgProcessor.js +240 -0
- package/dist/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/processors/markup/TextProcessor.js +188 -0
- package/dist/processors/markup/index.d.ts +66 -0
- package/dist/processors/markup/index.js +102 -0
- package/dist/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/processors/registry/ProcessorRegistry.js +608 -0
- package/dist/processors/registry/index.d.ts +12 -0
- package/dist/processors/registry/index.js +16 -0
- package/dist/processors/registry/types.d.ts +53 -0
- package/dist/processors/registry/types.js +10 -0
- package/dist/providers/amazonBedrock.d.ts +15 -2
- package/dist/providers/amazonBedrock.js +65 -8
- package/dist/providers/anthropic.d.ts +3 -3
- package/dist/providers/anthropic.js +10 -7
- package/dist/providers/googleAiStudio.d.ts +5 -5
- package/dist/providers/googleAiStudio.js +10 -7
- package/dist/providers/googleVertex.d.ts +16 -4
- package/dist/providers/googleVertex.js +72 -16
- package/dist/providers/litellm.d.ts +3 -3
- package/dist/providers/litellm.js +10 -10
- package/dist/providers/mistral.d.ts +3 -3
- package/dist/providers/mistral.js +7 -6
- package/dist/providers/ollama.d.ts +3 -4
- package/dist/providers/ollama.js +7 -8
- package/dist/providers/openAI.d.ts +14 -2
- package/dist/providers/openAI.js +60 -6
- package/dist/providers/openRouter.d.ts +2 -2
- package/dist/providers/openRouter.js +10 -6
- package/dist/rag/ChunkerFactory.d.ts +91 -0
- package/dist/rag/ChunkerFactory.js +320 -0
- package/dist/rag/ChunkerRegistry.d.ts +91 -0
- package/dist/rag/ChunkerRegistry.js +421 -0
- package/dist/rag/chunkers/BaseChunker.d.ts +53 -0
- package/dist/rag/chunkers/BaseChunker.js +143 -0
- package/dist/rag/chunkers/CharacterChunker.d.ts +18 -0
- package/dist/rag/chunkers/CharacterChunker.js +28 -0
- package/dist/rag/chunkers/HTMLChunker.d.ts +19 -0
- package/dist/rag/chunkers/HTMLChunker.js +38 -0
- package/dist/rag/chunkers/JSONChunker.d.ts +19 -0
- package/dist/rag/chunkers/JSONChunker.js +68 -0
- package/dist/rag/chunkers/LaTeXChunker.d.ts +15 -0
- package/dist/rag/chunkers/LaTeXChunker.js +63 -0
- package/dist/rag/chunkers/MarkdownChunker.d.ts +15 -0
- package/dist/rag/chunkers/MarkdownChunker.js +102 -0
- package/dist/rag/chunkers/RecursiveChunker.d.ts +27 -0
- package/dist/rag/chunkers/RecursiveChunker.js +139 -0
- package/dist/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
- package/dist/rag/chunkers/SemanticMarkdownChunker.js +138 -0
- package/dist/rag/chunkers/SentenceChunker.d.ts +19 -0
- package/dist/rag/chunkers/SentenceChunker.js +66 -0
- package/dist/rag/chunkers/TokenChunker.d.ts +19 -0
- package/dist/rag/chunkers/TokenChunker.js +61 -0
- package/dist/rag/chunkers/index.d.ts +15 -0
- package/dist/rag/chunkers/index.js +15 -0
- package/dist/rag/chunking/characterChunker.d.ts +16 -0
- package/dist/rag/chunking/characterChunker.js +142 -0
- package/dist/rag/chunking/chunkerRegistry.d.ts +67 -0
- package/dist/rag/chunking/chunkerRegistry.js +194 -0
- package/dist/rag/chunking/htmlChunker.d.ts +34 -0
- package/dist/rag/chunking/htmlChunker.js +247 -0
- package/dist/rag/chunking/index.d.ts +15 -0
- package/dist/rag/chunking/index.js +17 -0
- package/dist/rag/chunking/jsonChunker.d.ts +20 -0
- package/dist/rag/chunking/jsonChunker.js +281 -0
- package/dist/rag/chunking/latexChunker.d.ts +26 -0
- package/dist/rag/chunking/latexChunker.js +251 -0
- package/dist/rag/chunking/markdownChunker.d.ts +19 -0
- package/dist/rag/chunking/markdownChunker.js +201 -0
- package/dist/rag/chunking/recursiveChunker.d.ts +19 -0
- package/dist/rag/chunking/recursiveChunker.js +148 -0
- package/dist/rag/chunking/semanticChunker.d.ts +41 -0
- package/dist/rag/chunking/semanticChunker.js +306 -0
- package/dist/rag/chunking/sentenceChunker.d.ts +25 -0
- package/dist/rag/chunking/sentenceChunker.js +230 -0
- package/dist/rag/chunking/tokenChunker.d.ts +36 -0
- package/dist/rag/chunking/tokenChunker.js +183 -0
- package/dist/rag/document/MDocument.d.ts +198 -0
- package/dist/rag/document/MDocument.js +392 -0
- package/dist/rag/document/index.d.ts +5 -0
- package/dist/rag/document/index.js +5 -0
- package/dist/rag/document/loaders.d.ts +201 -0
- package/dist/rag/document/loaders.js +500 -0
- package/dist/rag/errors/RAGError.d.ts +244 -0
- package/dist/rag/errors/RAGError.js +274 -0
- package/dist/rag/errors/index.d.ts +6 -0
- package/dist/rag/errors/index.js +6 -0
- package/dist/rag/graphRag/graphRAG.d.ts +115 -0
- package/dist/rag/graphRag/graphRAG.js +384 -0
- package/dist/rag/graphRag/index.d.ts +4 -0
- package/dist/rag/graphRag/index.js +4 -0
- package/dist/rag/index.d.ts +103 -0
- package/dist/rag/index.js +141 -0
- package/dist/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
- package/dist/rag/metadata/MetadataExtractorFactory.js +418 -0
- package/dist/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
- package/dist/rag/metadata/MetadataExtractorRegistry.js +362 -0
- package/dist/rag/metadata/index.d.ts +6 -0
- package/dist/rag/metadata/index.js +9 -0
- package/dist/rag/metadata/metadataExtractor.d.ts +69 -0
- package/dist/rag/metadata/metadataExtractor.js +277 -0
- package/dist/rag/pipeline/RAGPipeline.d.ts +235 -0
- package/dist/rag/pipeline/RAGPipeline.js +401 -0
- package/dist/rag/pipeline/contextAssembly.d.ts +126 -0
- package/dist/rag/pipeline/contextAssembly.js +337 -0
- package/dist/rag/pipeline/index.d.ts +5 -0
- package/dist/rag/pipeline/index.js +5 -0
- package/dist/rag/ragIntegration.d.ts +38 -0
- package/dist/rag/ragIntegration.js +211 -0
- package/dist/rag/reranker/RerankerFactory.d.ts +184 -0
- package/dist/rag/reranker/RerankerFactory.js +430 -0
- package/dist/rag/reranker/RerankerRegistry.d.ts +119 -0
- package/dist/rag/reranker/RerankerRegistry.js +402 -0
- package/dist/rag/reranker/index.d.ts +6 -0
- package/dist/rag/reranker/index.js +9 -0
- package/dist/rag/reranker/reranker.d.ts +71 -0
- package/dist/rag/reranker/reranker.js +277 -0
- package/dist/rag/resilience/CircuitBreaker.d.ts +215 -0
- package/dist/rag/resilience/CircuitBreaker.js +431 -0
- package/dist/rag/resilience/RetryHandler.d.ts +115 -0
- package/dist/rag/resilience/RetryHandler.js +300 -0
- package/dist/rag/resilience/index.d.ts +7 -0
- package/dist/rag/resilience/index.js +7 -0
- package/dist/rag/retrieval/hybridSearch.d.ts +94 -0
- package/dist/rag/retrieval/hybridSearch.js +313 -0
- package/dist/rag/retrieval/index.d.ts +5 -0
- package/dist/rag/retrieval/index.js +5 -0
- package/dist/rag/retrieval/vectorQueryTool.d.ts +93 -0
- package/dist/rag/retrieval/vectorQueryTool.js +289 -0
- package/dist/rag/types.d.ts +768 -0
- package/dist/rag/types.js +8 -0
- package/dist/server/index.d.ts +15 -11
- package/dist/server/index.js +55 -51
- package/dist/server/utils/validation.d.ts +8 -8
- package/dist/types/common.d.ts +0 -1
- package/dist/types/fileTypes.d.ts +1 -1
- package/dist/types/generateTypes.d.ts +42 -8
- package/dist/types/generateTypes.js +1 -1
- package/dist/types/index.d.ts +25 -24
- package/dist/types/index.js +21 -20
- package/dist/types/modelTypes.d.ts +10 -10
- package/dist/types/pptTypes.d.ts +14 -2
- package/dist/types/pptTypes.js +16 -0
- package/dist/types/streamTypes.d.ts +28 -8
- package/dist/types/streamTypes.js +1 -1
- package/dist/utils/async/delay.d.ts +40 -0
- package/dist/utils/async/delay.js +42 -0
- package/dist/utils/async/index.d.ts +23 -0
- package/dist/utils/async/index.js +23 -0
- package/dist/utils/async/retry.d.ts +141 -0
- package/dist/utils/async/retry.js +171 -0
- package/dist/utils/async/withTimeout.d.ts +73 -0
- package/dist/utils/async/withTimeout.js +96 -0
- package/dist/utils/fileDetector.d.ts +7 -1
- package/dist/utils/fileDetector.js +91 -18
- package/dist/utils/json/extract.d.ts +103 -0
- package/dist/utils/json/extract.js +248 -0
- package/dist/utils/json/index.d.ts +36 -0
- package/dist/utils/json/index.js +36 -0
- package/dist/utils/json/safeParse.d.ts +137 -0
- package/dist/utils/json/safeParse.js +190 -0
- package/dist/utils/messageBuilder.d.ts +2 -2
- package/dist/utils/messageBuilder.js +15 -7
- package/dist/utils/modelRouter.d.ts +4 -4
- package/dist/utils/modelRouter.js +4 -4
- package/dist/utils/sanitizers/filename.d.ts +137 -0
- package/dist/utils/sanitizers/filename.js +365 -0
- package/dist/utils/sanitizers/html.d.ts +170 -0
- package/dist/utils/sanitizers/html.js +325 -0
- package/dist/utils/sanitizers/index.d.ts +26 -0
- package/dist/utils/sanitizers/index.js +29 -0
- package/dist/utils/sanitizers/svg.d.ts +81 -0
- package/dist/utils/sanitizers/svg.js +482 -0
- package/package.json +2 -2
|
@@ -0,0 +1,500 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Loaders
|
|
3
|
+
*
|
|
4
|
+
* Provides loaders for various document formats including:
|
|
5
|
+
* - Text files
|
|
6
|
+
* - Markdown files
|
|
7
|
+
* - HTML files and web pages
|
|
8
|
+
* - JSON files
|
|
9
|
+
* - CSV files
|
|
10
|
+
* - PDF files
|
|
11
|
+
*
|
|
12
|
+
* @example
|
|
13
|
+
* ```typescript
|
|
14
|
+
* import { loadDocument, WebLoader, PDFLoader } from 'neurolink/rag';
|
|
15
|
+
*
|
|
16
|
+
* // Load from file path
|
|
17
|
+
* const doc = await loadDocument('/path/to/document.md');
|
|
18
|
+
*
|
|
19
|
+
* // Load from URL
|
|
20
|
+
* const webDoc = await WebLoader.load('https://example.com/article');
|
|
21
|
+
*
|
|
22
|
+
* // Load PDF
|
|
23
|
+
* const pdfDoc = await PDFLoader.load('/path/to/document.pdf');
|
|
24
|
+
* ```
|
|
25
|
+
*/
|
|
26
|
+
import { readFile } from "fs/promises";
import { existsSync, statSync } from "fs";
import { extname, basename } from "path";
import { MDocument } from "./MDocument.js";
import { logger } from "../../utils/logger.js";
|
|
31
|
+
/**
 * Text file loader
 *
 * `source` may be either a path to an existing file or the text itself.
 * Anything that is not a readable filesystem entry (including a directory
 * whose name happens to match the text) is treated as inline content.
 */
export class TextLoader {
    /**
     * Read `source` and build a document from it with `MDocument.fromText`.
     *
     * @param {string} source - File path or inline text.
     * @param {{ encoding?: string, metadata?: object }} [options] - Optional
     *   file encoding and extra metadata. Metadata keys are spread after the
     *   derived `source` name, so a caller-supplied `source` key wins.
     * @returns {Promise<*>} Whatever `MDocument.fromText` returns.
     */
    async load(source, options) {
        const content = await this.loadContent(source, options?.encoding);
        return MDocument.fromText(content, {
            source: this.getSourceName(source),
            ...options?.metadata,
        });
    }
    /**
     * Report whether this loader is the right one for `source`.
     *
     * @param {string} source - File path to inspect.
     * @returns {boolean} True for a `.txt` extension (case-insensitive) or no
     *   extension at all.
     */
    canHandle(source) {
        const ext = extname(source).toLowerCase();
        return ext === ".txt" || ext === "";
    }
    /**
     * Resolve `source` to its text.
     *
     * @param {string} source - File path or inline text.
     * @param {string} [encoding="utf-8"] - Encoding passed to `readFile`.
     * @returns {Promise<string>} File contents when `source` names a readable
     *   entry, otherwise `source` unchanged.
     */
    async loadContent(source, encoding = "utf-8") {
        if (this.isReadablePath(source)) {
            return await readFile(source, encoding);
        }
        // Assume source is content if not a file
        return source;
    }
    /**
     * Derive the `source` metadata value for a document.
     *
     * @param {string} source - File path or inline text.
     * @returns {string} The file's base name, or "inline-content" when
     *   `source` is not a readable filesystem entry.
     */
    getSourceName(source) {
        return this.isReadablePath(source) ? basename(source) : "inline-content";
    }
    /**
     * Decide whether `source` should be read from disk.
     *
     * `existsSync` alone is also true for directories, which made short inline
     * text such as "." or "docs" fail with EISDIR in `readFile`. Directories
     * are therefore excluded; every other existing entry is still read.
     *
     * @param {string} source - Candidate path.
     * @returns {boolean} True when `source` exists and is not a directory.
     */
    isReadablePath(source) {
        if (!existsSync(source)) {
            return false;
        }
        try {
            return !statSync(source).isDirectory();
        }
        catch {
            // The entry vanished or became unreachable between the two checks;
            // fall back to treating the value as inline content.
            return false;
        }
    }
}
|
|
57
|
+
/**
 * Markdown file loader
 *
 * Reuses the content and source-name resolution inherited from TextLoader and
 * builds the document with `MDocument.fromMarkdown`.
 */
export class MarkdownLoader extends TextLoader {
    /**
     * Load Markdown from a file path or inline text.
     *
     * @param {string} source - File path or inline Markdown.
     * @param {{ encoding?: string, metadata?: object }} [options] - Optional
     *   encoding and metadata; metadata keys override the derived `source`.
     * @returns {Promise<*>} Whatever `MDocument.fromMarkdown` returns.
     */
    async load(source, options) {
        const markdown = await this.loadContent(source, options?.encoding);
        const metadata = { source: this.getSourceName(source), ...options?.metadata };
        return MDocument.fromMarkdown(markdown, metadata);
    }
    /**
     * @param {string} source - File path to inspect.
     * @returns {boolean} True for `.md`, `.markdown` or `.mdx` (case-insensitive).
     */
    canHandle(source) {
        const markdownExtensions = [".md", ".markdown", ".mdx"];
        return markdownExtensions.includes(extname(source).toLowerCase());
    }
}
|
|
73
|
+
/**
 * HTML file loader.
 *
 * Reuses TextLoader's content resolution (file path or inline content) and
 * builds the document with MDocument.fromHTML. The markup is passed through
 * as-is; no tag stripping happens here.
 */
export class HTMLLoader extends TextLoader {
    /**
     * Load an HTML source.
     *
     * @param source - Path to an existing file, or the raw HTML content
     * @param options - Optional `encoding` and `metadata` (spread over the
     *   derived `source` name)
     * @returns Promise resolving to the value returned by MDocument.fromHTML
     */
    async load(source, options) {
        const content = await this.loadContent(source, options?.encoding);
        return MDocument.fromHTML(content, {
            source: this.getSourceName(source),
            ...options?.metadata,
        });
    }
    /**
     * Accepts `.html`, `.htm` and `.xhtml` extensions (case-insensitive).
     */
    canHandle(source) {
        const ext = extname(source).toLowerCase();
        return ext === ".html" || ext === ".htm" || ext === ".xhtml";
    }
}
|
|
89
|
+
/**
 * JSON file loader.
 *
 * Handles regular JSON documents and JSON Lines (`.jsonl`) input. Content is
 * validated before a document is built; JSON Lines input is normalised to a
 * single JSON array so the resulting document always holds valid JSON.
 */
export class JSONLoader extends TextLoader {
    /**
     * Load and validate a JSON source.
     *
     * @param source - Path to an existing file, or the raw JSON content
     * @param options - Optional `encoding` and `metadata` (spread over the
     *   derived `source` name)
     * @returns Promise resolving to the value returned by MDocument.fromJSONContent
     * @throws Error ("Invalid JSON: ...") when the content is neither a valid
     *   JSON document nor valid JSON Lines
     */
    async load(source, options) {
        const content = await this.loadContent(source, options?.encoding);
        const validated = this.normalizeJSON(content);
        return MDocument.fromJSONContent(validated, {
            source: this.getSourceName(source),
            ...options?.metadata,
        });
    }
    /**
     * Accepts `.json` and `.jsonl` extensions (case-insensitive).
     */
    canHandle(source) {
        const ext = extname(source).toLowerCase();
        return ext === ".json" || ext === ".jsonl";
    }
    /**
     * Validate JSON content, falling back to JSON Lines.
     *
     * @param content - Raw text to validate
     * @returns `content` unchanged when it is one valid JSON document; a
     *   pretty-printed JSON array of the records when every non-empty line is
     *   itself valid JSON
     * @throws Error carrying the whole-document parse message (and the
     *   original error as `cause`) when neither interpretation succeeds
     */
    normalizeJSON(content) {
        let documentError;
        try {
            JSON.parse(content);
            return content;
        }
        catch (error) {
            documentError = error;
        }
        // A single line that failed above cannot be rescued by the per-line
        // pass, so only multi-line input is retried as JSON Lines.
        const lines = String(content)
            .split(/\r?\n/)
            .filter((line) => line.trim());
        if (lines.length > 1) {
            try {
                const records = lines.map((line) => JSON.parse(line));
                return JSON.stringify(records, null, 2);
            }
            catch {
                // Not JSON Lines either; report the whole-document error below.
            }
        }
        throw new Error(`Invalid JSON: ${documentError instanceof Error ? documentError.message : String(documentError)}`, { cause: documentError });
    }
}
|
|
112
|
+
/**
 * CSV file loader.
 *
 * Parses delimiter-separated text line by line and renders it as a text
 * table, a Markdown table, or JSON before building the document with
 * MDocument.fromCSV.
 *
 * Limitations: records are split on line breaks, so quoted fields that span
 * several lines are not supported, and the delimiter must be one character.
 */
export class CSVLoader extends TextLoader {
    /**
     * Load a CSV/TSV source.
     *
     * @param source - Path to an existing file, or the raw CSV content
     * @param options - Optional settings:
     *   - `delimiter`: field separator; defaults to a tab for `.tsv` sources
     *     and to "," otherwise
     *   - `hasHeader` (default true): treat the first row as column names
     *   - `columns`: column names to use when `hasHeader` is false
     *   - `outputFormat`: "json", "markdown", or anything else for a text table
     *   - `encoding`, `metadata`
     * @returns Promise resolving to the value returned by MDocument.fromCSV,
     *   with `rowCount`, `columnCount` and `columns` recorded in the metadata
     */
    async load(source, options) {
        const content = await this.loadContent(source, options?.encoding);
        const { hasHeader = true, columns, outputFormat = "text" } = options || {};
        // `.tsv` is accepted by canHandle, so it must not be parsed with ",".
        const defaultDelimiter = extname(String(source)).toLowerCase() === ".tsv" ? "\t" : ",";
        const delimiter = options?.delimiter ?? defaultDelimiter;
        const lines = String(content)
            .split(/\r?\n/)
            .filter((line) => line.trim());
        // Empty input yields an empty table instead of dereferencing lines[0].
        const firstRow = lines.length > 0 ? this.parseCSVLine(lines[0], delimiter) : [];
        const headers = hasHeader
            ? firstRow
            : columns || firstRow.map((_, i) => `col${i + 1}`);
        const dataLines = hasHeader ? lines.slice(1) : lines;
        const rows = dataLines.map((line) => this.parseCSVLine(line, delimiter));
        let formattedContent;
        switch (outputFormat) {
            case "json":
                formattedContent = JSON.stringify(rows.map((row) => Object.fromEntries(headers.map((h, i) => [h, row[i]]))), null, 2);
                break;
            case "markdown":
                formattedContent = this.toMarkdownTable(headers, rows);
                break;
            default:
                formattedContent = this.toTextTable(headers, rows);
        }
        return MDocument.fromCSV(formattedContent, {
            source: this.getSourceName(source),
            rowCount: rows.length,
            columnCount: headers.length,
            columns: headers,
            ...options?.metadata,
        });
    }
    /**
     * Accepts `.csv` and `.tsv` extensions (case-insensitive).
     */
    canHandle(source) {
        const ext = extname(source).toLowerCase();
        return ext === ".csv" || ext === ".tsv";
    }
    /**
     * Split one record into trimmed fields.
     *
     * Double quotes delimit a field and are removed; a delimiter inside quotes
     * is kept as data. Inside a quoted field a doubled quote (`""`) produces a
     * literal quote (RFC 4180). A quote preceded by a backslash is not treated
     * as a delimiter and is copied through together with the backslash.
     *
     * @param line - A single record without its line terminator
     * @param delimiter - Single-character field separator
     * @returns Array of field values
     */
    parseCSVLine(line, delimiter) {
        const result = [];
        let current = "";
        let inQuotes = false;
        for (let i = 0; i < line.length; i++) {
            const char = line[i];
            if (char === '"' && (i === 0 || line[i - 1] !== "\\")) {
                if (inQuotes && line[i + 1] === '"') {
                    // Escaped quote: emit one quote and skip its twin.
                    current += '"';
                    i++;
                }
                else {
                    inQuotes = !inQuotes;
                }
            }
            else if (char === delimiter && !inQuotes) {
                result.push(current.trim());
                current = "";
            }
            else {
                current += char;
            }
        }
        result.push(current.trim());
        return result;
    }
    /**
     * Render headers and rows as a Markdown table (empty string when there is
     * nothing to render).
     */
    toMarkdownTable(headers, rows) {
        if (headers.length === 0 && rows.length === 0) {
            return "";
        }
        const headerRow = `| ${headers.join(" | ")} |`;
        const separator = `| ${headers.map(() => "---").join(" | ")} |`;
        const dataRows = rows.map((row) => `| ${row.join(" | ")} |`);
        return [headerRow, separator, ...dataRows].join("\n");
    }
    /**
     * Render headers and rows as a padded plain-text table (empty string when
     * there is nothing to render). Column widths come from the widest cell in
     * each header column.
     */
    toTextTable(headers, rows) {
        if (headers.length === 0 && rows.length === 0) {
            return "";
        }
        const allRows = [headers, ...rows];
        const colWidths = headers.map((_, i) => Math.max(...allRows.map((row) => (row[i] || "").length)));
        const formatRow = (row) => row.map((cell, i) => (cell || "").padEnd(colWidths[i])).join(" | ");
        return [
            formatRow(headers),
            colWidths.map((w) => "-".repeat(w)).join("-+-"),
            ...rows.map(formatRow),
        ].join("\n");
    }
}
|
|
185
|
+
/**
 * PDF file loader.
 *
 * Text extraction relies on the optional `pdf-parse` package. Loading is
 * best-effort: when the package is missing, or when extraction fails, a
 * placeholder document is returned whose `parseError` metadata says which of
 * the two happened. Only a missing file raises.
 */
export class PDFLoader {
    /**
     * Load a PDF file.
     *
     * @param source - Path to an existing PDF file
     * @param options - Optional `pageRange` (currently only logged; the whole
     *   document text is returned) and `metadata`
     * @returns Promise resolving to an MDocument of type "pdf"
     * @throws Error when `source` does not exist on disk
     */
    async load(source, options) {
        if (!existsSync(source)) {
            throw new Error(`PDF file not found: ${source}`);
        }
        logger.debug("[PDFLoader] Loading PDF", {
            source,
            pageRange: options?.pageRange,
        });
        const fileName = basename(source);
        // Resolve the parser on its own so that a missing dependency is not
        // confused with a file that fails to parse.
        let pdfParse;
        try {
            pdfParse = await this.loadPdfParser();
        }
        catch (error) {
            logger.warn("[PDFLoader] pdf-parse not available, using fallback", {
                error: error instanceof Error ? error.message : String(error),
            });
            return new MDocument(`[PDF Document: ${fileName}]\n\nNote: PDF parsing requires the 'pdf-parse' package. Install it with:\n npm install pdf-parse`, {
                type: "pdf",
                metadata: {
                    source: fileName,
                    parseError: "pdf-parse not available",
                    ...options?.metadata,
                },
            });
        }
        try {
            const buffer = await readFile(source);
            const data = await pdfParse(buffer);
            if (options?.pageRange) {
                // Page selection is not applied to the extracted text yet; the
                // resolved pages are logged so the request stays visible.
                logger.debug("[PDFLoader] Page range requested but not fully supported", {
                    pageRange: options.pageRange,
                    totalPages: data.numpages,
                    requestedPages: this.parsePageRange(String(options.pageRange), data.numpages),
                });
            }
            return new MDocument(data.text, {
                type: "pdf",
                metadata: {
                    source: fileName,
                    pageCount: data.numpages,
                    info: data.info,
                    ...options?.metadata,
                },
            });
        }
        catch (error) {
            const message = error instanceof Error ? error.message : String(error);
            logger.warn("[PDFLoader] Failed to parse PDF, using fallback", {
                error: message,
            });
            return new MDocument(`[PDF Document: ${fileName}]\n\nNote: The PDF could not be parsed: ${message}`, {
                type: "pdf",
                metadata: {
                    source: fileName,
                    parseError: message,
                    ...options?.metadata,
                },
            });
        }
    }
    /**
     * Accepts the `.pdf` extension (case-insensitive).
     */
    canHandle(source) {
        const ext = extname(source).toLowerCase();
        return ext === ".pdf";
    }
    /**
     * Dynamically import the optional `pdf-parse` dependency.
     *
     * @returns The module's default export, or the module itself when there
     *   is no default export
     * @throws Error when the module cannot be imported or does not resolve to
     *   a callable parser
     */
    async loadPdfParser() {
        let parser;
        try {
            // @ts-expect-error pdf-parse is an optional dependency
            const pdfParse = await import("pdf-parse");
            parser = pdfParse.default || pdfParse;
        }
        catch (error) {
            throw new Error("pdf-parse module not available", { cause: error });
        }
        if (typeof parser !== "function") {
            throw new Error("pdf-parse module does not export a parser function");
        }
        return parser;
    }
    /**
     * Expand a page-range expression into a sorted list of unique page numbers.
     *
     * The expression is a comma-separated list of single pages ("4") and
     * ranges ("2-5"). A range may omit its start ("-3" means pages 1..3) or its
     * end ("3-" means page 3 through the last page). Pages are 1-based; values
     * below 1 or above `totalPages` are dropped, and parts that are not whole
     * numbers are ignored.
     *
     * @param range - Page-range expression
     * @param totalPages - Number of pages in the document
     * @returns Ascending array of page numbers
     */
    parsePageRange(range, totalPages) {
        const pages = new Set();
        for (const rawPart of range.split(",")) {
            const part = rawPart.trim();
            if (part === "" || part === "-") {
                continue;
            }
            let start;
            let end;
            if (part.includes("-")) {
                const [startText, endText] = part.split("-").map((piece) => piece.trim());
                start = startText === "" ? 1 : Number(startText);
                end = endText === "" ? totalPages : Number(endText);
            }
            else {
                start = Number(part);
                end = start;
            }
            if (!Number.isInteger(start) || !Number.isInteger(end)) {
                continue;
            }
            const lastPage = Math.min(end, totalPages);
            for (let page = Math.max(1, start); page <= lastPage; page++) {
                pages.add(page);
            }
        }
        return [...pages].sort((a, b) => a - b);
    }
}
|
|
275
|
+
/**
 * Web page loader.
 *
 * Fetches a page over HTTP(S) and converts it to plain text using simple
 * regular-expression based HTML handling (no external dependencies).
 */
export class WebLoader {
    defaultUserAgent = "Mozilla/5.0 (compatible; NeuroLink/1.0; +https://github.com/juspay/neurolink)";
    /**
     * Fetch a URL and return its text content.
     *
     * @param source - Absolute http: or https: URL
     * @param options - Optional `timeout` (ms), `userAgent`, extra `headers`,
     *   `extractMainContent`, `contentSelector` and `metadata`
     * @returns Promise resolving to an MDocument of type "html" whose content
     *   is the plain-text rendering of the page
     * @throws Error when the source is not an http(s) URL or the response
     *   status is not OK
     */
    async load(source, options) {
        if (!this.canHandle(source)) {
            throw new Error(`Invalid URL: ${source}`);
        }
        logger.debug("[WebLoader] Fetching URL", {
            url: source,
            timeout: options?.timeout,
        });
        const response = await fetch(source, {
            signal: options?.timeout
                ? AbortSignal.timeout(options.timeout)
                : undefined,
            headers: {
                "User-Agent": options?.userAgent || this.defaultUserAgent,
                Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                ...options?.headers,
            },
        });
        if (!response.ok) {
            throw new Error(`HTTP error ${response.status}: ${response.statusText}`);
        }
        const html = await response.text();
        let content = html;
        // Extract main content if requested
        if (options?.extractMainContent) {
            content = this.extractMainContent(html, options.contentSelector);
        }
        // Convert HTML to plain text for better processing
        const text = this.htmlToText(content);
        return new MDocument(text, {
            type: "html",
            metadata: {
                source,
                url: source,
                fetchedAt: new Date().toISOString(),
                contentType: response.headers.get("content-type") || "text/html",
                ...options?.metadata,
            },
        });
    }
    /**
     * Report whether the source parses as an absolute http: or https: URL.
     */
    canHandle(source) {
        try {
            const url = new URL(source);
            return url.protocol === "http:" || url.protocol === "https:";
        }
        catch {
            return false;
        }
    }
    /**
     * Extract main content from HTML
     *
     * Without a selector, the first of these containers that is present is
     * used: main, article, a div whose class contains "content", a div with
     * id "content", body. With a selector, only that element is searched.
     *
     * Matching is regular-expression based and stops at the first closing
     * tag, so nested elements of the same tag truncate the result. For
     * production use, consider a real HTML parser.
     *
     * @param html - HTML markup
     * @param selector - Optional tag name ("section"), id ("#main") or class
     *   (".post")
     * @returns Inner markup of the matched element, or `html` unchanged when
     *   nothing matches
     */
    extractMainContent(html, selector) {
        const custom = selector ? String(selector).trim() : "";
        const patterns = custom
            ? [this.buildSelectorPattern(custom)]
            : [
                /<main\b[^>]*>(?<content>[\s\S]*?)<\/main>/i,
                /<article\b[^>]*>(?<content>[\s\S]*?)<\/article>/i,
                /<div[^>]*class="[^"]*content[^"]*"[^>]*>(?<content>[\s\S]*?)<\/div>/i,
                /<div[^>]*id="content"[^>]*>(?<content>[\s\S]*?)<\/div>/i,
                /<body\b[^>]*>(?<content>[\s\S]*?)<\/body>/i,
            ];
        for (const pattern of patterns) {
            const match = html.match(pattern);
            if (match) {
                // An empty element falls back to the whole match.
                return match.groups.content || match[0];
            }
        }
        return html;
    }
    /**
     * Build the extraction pattern for a simple selector.
     *
     * The selector text is escaped before being placed in the expression, so
     * characters such as "[" or "." cannot break or widen the pattern.
     *
     * @param selector - Non-empty tag name, "#id" or ".class"
     * @returns Case-insensitive RegExp with a named `content` group
     */
    buildSelectorPattern(selector) {
        const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        const body = "(?<content>[\\s\\S]*?)";
        const byId = selector.startsWith("#");
        if (byId || selector.startsWith(".")) {
            const name = escapeRegExp(selector.slice(1));
            const attribute = byId
                ? `id=["']${name}["']`
                : `class=["'][^"']*(?<![\\w-])${name}(?![\\w-])[^"']*["']`;
            // The closing tag must repeat whatever tag name opened the element.
            return new RegExp(`<(?<tag>[a-z][a-z0-9-]*)\\b[^>]*\\s${attribute}[^>]*>${body}</\\k<tag>\\s*>`, "i");
        }
        const tag = escapeRegExp(selector);
        // The lookahead keeps "a" from matching "<article>".
        return new RegExp(`<${tag}(?=[\\s/>])[^>]*>${body}</${tag}\\s*>`, "i");
    }
    /**
     * Convert HTML to plain text
     *
     * Drops script/style elements and comments, turns block-level closing
     * tags and br/hr into line breaks, strips the remaining tags, decodes
     * character references and normalises whitespace.
     *
     * Character references are decoded in a single pass, so text that was
     * escaped twice is unescaped only once. Supported: the named references
     * nbsp, amp, lt, gt, quot and apos (case-insensitive) plus decimal and
     * hexadecimal numeric references; anything else is left untouched.
     *
     * @param html - HTML markup
     * @returns Plain-text rendering
     */
    htmlToText(html) {
        const namedEntities = new Map([
            ["nbsp", " "],
            ["amp", "&"],
            ["lt", "<"],
            ["gt", ">"],
            ["quot", '"'],
            ["apos", "'"],
        ]);
        const decodeEntity = (entity, reference) => {
            if (reference[0] === "#") {
                const isHex = reference[1] === "x" || reference[1] === "X";
                const codePoint = Number.parseInt(reference.slice(isHex ? 2 : 1), isHex ? 16 : 10);
                // fromCodePoint throws outside the Unicode range; keep such text as-is.
                return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
            }
            return namedEntities.get(reference.toLowerCase()) ?? entity;
        };
        return (html
            // Remove script and style elements
            .replace(/<script[\s\S]*?<\/script>/gi, "")
            .replace(/<style[\s\S]*?<\/style>/gi, "")
            // Remove HTML comments
            .replace(/<!--[\s\S]*?-->/g, "")
            // Replace common block elements with newlines
            .replace(/<\/(p|div|h[1-6]|br|li|tr|blockquote)>/gi, "\n")
            .replace(/<(br|hr)\s*\/?>/gi, "\n")
            // Remove remaining tags
            .replace(/<[^>]+>/g, "")
            // Decode named and numeric character references
            .replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, decodeEntity)
            // Normalize whitespace
            .replace(/\n\s*\n/g, "\n\n")
            .replace(/[ \t]+/g, " ")
            .trim());
    }
}
|
|
387
|
+
/**
 * Registry of document loaders
 *
 * Loaders are consulted in order and the first one whose canHandle() accepts
 * the source is used.
 */
const loaderRegistry = [
    // URLs must be matched before the extension-based file loaders: a URL
    // such as https://example.com/page.html also has a ".html" extension and
    // would otherwise be handed to a file loader, which treats a path that
    // does not exist on disk as the content itself.
    new WebLoader(),
    new MarkdownLoader(),
    new HTMLLoader(),
    new JSONLoader(),
    new CSVLoader(),
    new PDFLoader(),
    new TextLoader(), // Default fallback
];
|
|
399
|
+
/**
 * Detect document type from source
 *
 * An http: or https: URL is always reported as "html"; otherwise the type is
 * looked up from the (lower-cased) file extension.
 *
 * @param source - File path, URL, or raw content
 * @returns One of "markdown", "html", "json", "csv", "latex", "pdf", or
 *   "text" when the extension is not recognised
 */
function _detectDocumentType(source) {
    const ext = extname(source).toLowerCase();
    const typeMap = {
        ".md": "markdown",
        ".markdown": "markdown",
        ".mdx": "markdown",
        ".html": "html",
        ".htm": "html",
        ".xhtml": "html",
        ".json": "json",
        ".jsonl": "json",
        ".csv": "csv",
        ".tsv": "csv",
        ".tex": "latex",
        ".latex": "latex",
        ".pdf": "pdf",
    };
    // Check if it's a URL
    try {
        const url = new URL(source);
        if (url.protocol === "http:" || url.protocol === "https:") {
            return "html";
        }
    }
    catch {
        // Not a URL
    }
    return typeMap[ext] || "text";
}
|
|
431
|
+
/**
 * Load document from file path, URL, or content
 *
 * Picks the first registered loader whose canHandle() accepts the source and
 * delegates to it. When no loader accepts the source, it is loaded with a
 * fresh TextLoader.
 *
 * @param source - File path, URL, or raw content
 * @param options - Loader options, passed through to the selected loader
 * @returns Promise resolving to MDocument
 *
 * @example
 * ```typescript
 * // Load from file
 * const doc = await loadDocument('/path/to/document.md');
 *
 * // Load from URL
 * const webDoc = await loadDocument('https://example.com/article');
 *
 * // Load with options
 * const pdfDoc = await loadDocument('/path/to/doc.pdf', {
 *   pageRange: '1-5',
 *   metadata: { project: 'research' }
 * });
 * ```
 */
export async function loadDocument(source, options) {
    // Find appropriate loader
    const loader = loaderRegistry.find((l) => l.canHandle(source));
    if (!loader) {
        // Fall back to text loader
        return new TextLoader().load(source, options);
    }
    logger.debug("[loadDocument] Loading document", {
        // The source may be raw content, so only its first 100 characters are logged.
        source: source.slice(0, 100),
        loaderType: loader.constructor.name,
    });
    return loader.load(source, options);
}
|
|
468
|
+
/**
 * Load multiple documents
 *
 * All sources are loaded concurrently. A source that fails to load does not
 * fail the batch: it is left out of the result and reported in a single
 * warning, so the returned array can be shorter than `sources`.
 *
 * @param sources - Array of file paths, URLs, or content
 * @param options - Loader options (applied to all)
 * @returns Promise resolving to array of MDocuments, in source order
 */
export async function loadDocuments(sources, options) {
    const results = await Promise.allSettled(sources.map((source) => loadDocument(source, options)));
    const documents = [];
    const errors = [];
    results.forEach((result, index) => {
        if (result.status === "fulfilled") {
            documents.push(result.value);
        }
        else {
            errors.push({
                source: sources[index],
                error: result.reason instanceof Error
                    ? result.reason.message
                    : String(result.reason),
            });
        }
    });
    if (errors.length > 0) {
        logger.warn("[loadDocuments] Some documents failed to load", {
            loaded: documents.length,
            failed: errors.length,
            errors,
        });
    }
    return documents;
}
|