@juspay/neurolink 9.1.1 → 9.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/README.md +106 -37
- package/dist/agent/directTools.d.ts +11 -11
- package/dist/cli/commands/config.d.ts +6 -6
- package/dist/cli/commands/rag.d.ts +19 -0
- package/dist/cli/commands/rag.js +756 -0
- package/dist/cli/factories/commandFactory.js +146 -83
- package/dist/cli/parser.js +4 -1
- package/dist/core/baseProvider.d.ts +43 -30
- package/dist/core/baseProvider.js +98 -138
- package/dist/core/conversationMemoryFactory.d.ts +2 -2
- package/dist/core/conversationMemoryFactory.js +2 -2
- package/dist/core/conversationMemoryInitializer.d.ts +1 -2
- package/dist/core/conversationMemoryInitializer.js +2 -2
- package/dist/core/infrastructure/baseError.d.ts +21 -0
- package/dist/core/infrastructure/baseError.js +22 -0
- package/dist/core/infrastructure/baseFactory.d.ts +21 -0
- package/dist/core/infrastructure/baseFactory.js +54 -0
- package/dist/core/infrastructure/baseRegistry.d.ts +21 -0
- package/dist/core/infrastructure/baseRegistry.js +49 -0
- package/dist/core/infrastructure/index.d.ts +5 -0
- package/dist/core/infrastructure/index.js +5 -0
- package/dist/core/infrastructure/retry.d.ts +7 -0
- package/dist/core/infrastructure/retry.js +20 -0
- package/dist/core/infrastructure/typedEventEmitter.d.ts +8 -0
- package/dist/core/infrastructure/typedEventEmitter.js +23 -0
- package/dist/core/redisConversationMemoryManager.d.ts +1 -6
- package/dist/core/redisConversationMemoryManager.js +7 -19
- package/dist/factories/providerFactory.d.ts +5 -3
- package/dist/factories/providerFactory.js +31 -24
- package/dist/image-gen/ImageGenService.d.ts +143 -0
- package/dist/image-gen/ImageGenService.js +345 -0
- package/dist/image-gen/imageGenTools.d.ts +126 -0
- package/dist/image-gen/imageGenTools.js +304 -0
- package/dist/image-gen/index.d.ts +46 -0
- package/dist/image-gen/index.js +48 -0
- package/dist/image-gen/types.d.ts +237 -0
- package/dist/image-gen/types.js +24 -0
- package/dist/index.d.ts +46 -12
- package/dist/index.js +88 -36
- package/dist/lib/agent/directTools.d.ts +8 -8
- package/dist/lib/core/baseProvider.d.ts +43 -30
- package/dist/lib/core/baseProvider.js +98 -138
- package/dist/lib/core/conversationMemoryFactory.d.ts +2 -2
- package/dist/lib/core/conversationMemoryFactory.js +2 -2
- package/dist/lib/core/conversationMemoryInitializer.d.ts +1 -2
- package/dist/lib/core/conversationMemoryInitializer.js +2 -2
- package/dist/lib/core/infrastructure/baseError.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseError.js +23 -0
- package/dist/lib/core/infrastructure/baseFactory.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseFactory.js +55 -0
- package/dist/lib/core/infrastructure/baseRegistry.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseRegistry.js +50 -0
- package/dist/lib/core/infrastructure/index.d.ts +5 -0
- package/dist/lib/core/infrastructure/index.js +6 -0
- package/dist/lib/core/infrastructure/retry.d.ts +7 -0
- package/dist/lib/core/infrastructure/retry.js +21 -0
- package/dist/lib/core/infrastructure/typedEventEmitter.d.ts +8 -0
- package/dist/lib/core/infrastructure/typedEventEmitter.js +24 -0
- package/dist/lib/core/redisConversationMemoryManager.d.ts +1 -6
- package/dist/lib/core/redisConversationMemoryManager.js +7 -19
- package/dist/lib/factories/providerFactory.d.ts +5 -3
- package/dist/lib/factories/providerFactory.js +31 -24
- package/dist/lib/image-gen/ImageGenService.d.ts +143 -0
- package/dist/lib/image-gen/ImageGenService.js +346 -0
- package/dist/lib/image-gen/imageGenTools.d.ts +126 -0
- package/dist/lib/image-gen/imageGenTools.js +305 -0
- package/dist/lib/image-gen/index.d.ts +46 -0
- package/dist/lib/image-gen/index.js +49 -0
- package/dist/lib/image-gen/types.d.ts +237 -0
- package/dist/lib/image-gen/types.js +25 -0
- package/dist/lib/index.d.ts +46 -12
- package/dist/lib/index.js +88 -36
- package/dist/lib/mcp/index.d.ts +6 -5
- package/dist/lib/mcp/index.js +7 -5
- package/dist/lib/neurolink.d.ts +11 -13
- package/dist/lib/neurolink.js +95 -29
- package/dist/lib/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/lib/processors/base/BaseFileProcessor.js +614 -0
- package/dist/lib/processors/base/index.d.ts +14 -0
- package/dist/lib/processors/base/index.js +20 -0
- package/dist/lib/processors/base/types.d.ts +593 -0
- package/dist/lib/processors/base/types.js +77 -0
- package/dist/lib/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/lib/processors/cli/fileProcessorCli.js +389 -0
- package/dist/lib/processors/cli/index.d.ts +37 -0
- package/dist/lib/processors/cli/index.js +50 -0
- package/dist/lib/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/lib/processors/code/ConfigProcessor.js +401 -0
- package/dist/lib/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/lib/processors/code/SourceCodeProcessor.js +305 -0
- package/dist/lib/processors/code/index.d.ts +44 -0
- package/dist/lib/processors/code/index.js +61 -0
- package/dist/lib/processors/config/fileTypes.d.ts +283 -0
- package/dist/lib/processors/config/fileTypes.js +521 -0
- package/dist/lib/processors/config/index.d.ts +32 -0
- package/dist/lib/processors/config/index.js +93 -0
- package/dist/lib/processors/config/languageMap.d.ts +66 -0
- package/dist/lib/processors/config/languageMap.js +411 -0
- package/dist/lib/processors/config/mimeTypes.d.ts +376 -0
- package/dist/lib/processors/config/mimeTypes.js +339 -0
- package/dist/lib/processors/config/sizeLimits.d.ts +194 -0
- package/dist/lib/processors/config/sizeLimits.js +247 -0
- package/dist/lib/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/lib/processors/data/JsonProcessor.js +204 -0
- package/dist/lib/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/lib/processors/data/XmlProcessor.js +284 -0
- package/dist/lib/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/lib/processors/data/YamlProcessor.js +295 -0
- package/dist/lib/processors/data/index.d.ts +49 -0
- package/dist/lib/processors/data/index.js +77 -0
- package/dist/lib/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/lib/processors/document/ExcelProcessor.js +520 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.js +211 -0
- package/dist/lib/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/lib/processors/document/RtfProcessor.js +362 -0
- package/dist/lib/processors/document/WordProcessor.d.ts +168 -0
- package/dist/lib/processors/document/WordProcessor.js +354 -0
- package/dist/lib/processors/document/index.d.ts +54 -0
- package/dist/lib/processors/document/index.js +91 -0
- package/dist/lib/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/lib/processors/errors/FileErrorCode.js +256 -0
- package/dist/lib/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/lib/processors/errors/errorHelpers.js +379 -0
- package/dist/lib/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/lib/processors/errors/errorSerializer.js +508 -0
- package/dist/lib/processors/errors/index.d.ts +46 -0
- package/dist/lib/processors/errors/index.js +50 -0
- package/dist/lib/processors/index.d.ts +76 -0
- package/dist/lib/processors/index.js +113 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.js +273 -0
- package/dist/lib/processors/integration/index.d.ts +42 -0
- package/dist/lib/processors/integration/index.js +45 -0
- package/dist/lib/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/lib/processors/markup/HtmlProcessor.js +250 -0
- package/dist/lib/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/lib/processors/markup/MarkdownProcessor.js +245 -0
- package/dist/lib/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/lib/processors/markup/SvgProcessor.js +241 -0
- package/dist/lib/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/lib/processors/markup/TextProcessor.js +189 -0
- package/dist/lib/processors/markup/index.d.ts +66 -0
- package/dist/lib/processors/markup/index.js +103 -0
- package/dist/lib/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/lib/processors/registry/ProcessorRegistry.js +609 -0
- package/dist/lib/processors/registry/index.d.ts +12 -0
- package/dist/lib/processors/registry/index.js +17 -0
- package/dist/lib/processors/registry/types.d.ts +53 -0
- package/dist/lib/processors/registry/types.js +11 -0
- package/dist/lib/providers/amazonBedrock.d.ts +15 -2
- package/dist/lib/providers/amazonBedrock.js +65 -8
- package/dist/lib/providers/anthropic.d.ts +3 -3
- package/dist/lib/providers/anthropic.js +10 -7
- package/dist/lib/providers/googleAiStudio.d.ts +5 -5
- package/dist/lib/providers/googleAiStudio.js +10 -7
- package/dist/lib/providers/googleVertex.d.ts +16 -4
- package/dist/lib/providers/googleVertex.js +72 -16
- package/dist/lib/providers/litellm.d.ts +3 -3
- package/dist/lib/providers/litellm.js +10 -10
- package/dist/lib/providers/mistral.d.ts +3 -3
- package/dist/lib/providers/mistral.js +7 -6
- package/dist/lib/providers/ollama.d.ts +3 -4
- package/dist/lib/providers/ollama.js +7 -8
- package/dist/lib/providers/openAI.d.ts +14 -2
- package/dist/lib/providers/openAI.js +60 -6
- package/dist/lib/providers/openRouter.d.ts +2 -2
- package/dist/lib/providers/openRouter.js +10 -6
- package/dist/lib/providers/sagemaker/language-model.d.ts +2 -2
- package/dist/lib/rag/ChunkerFactory.d.ts +91 -0
- package/dist/lib/rag/ChunkerFactory.js +321 -0
- package/dist/lib/rag/ChunkerRegistry.d.ts +91 -0
- package/dist/lib/rag/ChunkerRegistry.js +422 -0
- package/dist/lib/rag/chunkers/BaseChunker.d.ts +53 -0
- package/dist/lib/rag/chunkers/BaseChunker.js +144 -0
- package/dist/lib/rag/chunkers/CharacterChunker.d.ts +18 -0
- package/dist/lib/rag/chunkers/CharacterChunker.js +29 -0
- package/dist/lib/rag/chunkers/HTMLChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/HTMLChunker.js +39 -0
- package/dist/lib/rag/chunkers/JSONChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/JSONChunker.js +69 -0
- package/dist/lib/rag/chunkers/LaTeXChunker.d.ts +15 -0
- package/dist/lib/rag/chunkers/LaTeXChunker.js +64 -0
- package/dist/lib/rag/chunkers/MarkdownChunker.d.ts +15 -0
- package/dist/lib/rag/chunkers/MarkdownChunker.js +103 -0
- package/dist/lib/rag/chunkers/RecursiveChunker.d.ts +27 -0
- package/dist/lib/rag/chunkers/RecursiveChunker.js +140 -0
- package/dist/lib/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
- package/dist/lib/rag/chunkers/SemanticMarkdownChunker.js +139 -0
- package/dist/lib/rag/chunkers/SentenceChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/SentenceChunker.js +67 -0
- package/dist/lib/rag/chunkers/TokenChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/TokenChunker.js +62 -0
- package/dist/lib/rag/chunkers/index.d.ts +15 -0
- package/dist/lib/rag/chunkers/index.js +16 -0
- package/dist/lib/rag/chunking/characterChunker.d.ts +16 -0
- package/dist/lib/rag/chunking/characterChunker.js +143 -0
- package/dist/lib/rag/chunking/chunkerRegistry.d.ts +67 -0
- package/dist/lib/rag/chunking/chunkerRegistry.js +195 -0
- package/dist/lib/rag/chunking/htmlChunker.d.ts +34 -0
- package/dist/lib/rag/chunking/htmlChunker.js +248 -0
- package/dist/lib/rag/chunking/index.d.ts +15 -0
- package/dist/lib/rag/chunking/index.js +18 -0
- package/dist/lib/rag/chunking/jsonChunker.d.ts +20 -0
- package/dist/lib/rag/chunking/jsonChunker.js +282 -0
- package/dist/lib/rag/chunking/latexChunker.d.ts +26 -0
- package/dist/lib/rag/chunking/latexChunker.js +252 -0
- package/dist/lib/rag/chunking/markdownChunker.d.ts +19 -0
- package/dist/lib/rag/chunking/markdownChunker.js +202 -0
- package/dist/lib/rag/chunking/recursiveChunker.d.ts +19 -0
- package/dist/lib/rag/chunking/recursiveChunker.js +149 -0
- package/dist/lib/rag/chunking/semanticChunker.d.ts +41 -0
- package/dist/lib/rag/chunking/semanticChunker.js +307 -0
- package/dist/lib/rag/chunking/sentenceChunker.d.ts +25 -0
- package/dist/lib/rag/chunking/sentenceChunker.js +231 -0
- package/dist/lib/rag/chunking/tokenChunker.d.ts +36 -0
- package/dist/lib/rag/chunking/tokenChunker.js +184 -0
- package/dist/lib/rag/document/MDocument.d.ts +198 -0
- package/dist/lib/rag/document/MDocument.js +393 -0
- package/dist/lib/rag/document/index.d.ts +5 -0
- package/dist/lib/rag/document/index.js +6 -0
- package/dist/lib/rag/document/loaders.d.ts +201 -0
- package/dist/lib/rag/document/loaders.js +501 -0
- package/dist/lib/rag/errors/RAGError.d.ts +244 -0
- package/dist/lib/rag/errors/RAGError.js +275 -0
- package/dist/lib/rag/errors/index.d.ts +6 -0
- package/dist/lib/rag/errors/index.js +7 -0
- package/dist/lib/rag/graphRag/graphRAG.d.ts +115 -0
- package/dist/lib/rag/graphRag/graphRAG.js +385 -0
- package/dist/lib/rag/graphRag/index.d.ts +4 -0
- package/dist/lib/rag/graphRag/index.js +5 -0
- package/dist/lib/rag/index.d.ts +103 -0
- package/dist/lib/rag/index.js +142 -0
- package/dist/lib/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
- package/dist/lib/rag/metadata/MetadataExtractorFactory.js +419 -0
- package/dist/lib/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
- package/dist/lib/rag/metadata/MetadataExtractorRegistry.js +363 -0
- package/dist/lib/rag/metadata/index.d.ts +6 -0
- package/dist/lib/rag/metadata/index.js +10 -0
- package/dist/lib/rag/metadata/metadataExtractor.d.ts +69 -0
- package/dist/lib/rag/metadata/metadataExtractor.js +278 -0
- package/dist/lib/rag/pipeline/RAGPipeline.d.ts +235 -0
- package/dist/lib/rag/pipeline/RAGPipeline.js +402 -0
- package/dist/lib/rag/pipeline/contextAssembly.d.ts +126 -0
- package/dist/lib/rag/pipeline/contextAssembly.js +338 -0
- package/dist/lib/rag/pipeline/index.d.ts +5 -0
- package/dist/lib/rag/pipeline/index.js +6 -0
- package/dist/lib/rag/ragIntegration.d.ts +38 -0
- package/dist/lib/rag/ragIntegration.js +212 -0
- package/dist/lib/rag/reranker/RerankerFactory.d.ts +184 -0
- package/dist/lib/rag/reranker/RerankerFactory.js +431 -0
- package/dist/lib/rag/reranker/RerankerRegistry.d.ts +119 -0
- package/dist/lib/rag/reranker/RerankerRegistry.js +403 -0
- package/dist/lib/rag/reranker/index.d.ts +6 -0
- package/dist/lib/rag/reranker/index.js +10 -0
- package/dist/lib/rag/reranker/reranker.d.ts +71 -0
- package/dist/lib/rag/reranker/reranker.js +278 -0
- package/dist/lib/rag/resilience/CircuitBreaker.d.ts +215 -0
- package/dist/lib/rag/resilience/CircuitBreaker.js +432 -0
- package/dist/lib/rag/resilience/RetryHandler.d.ts +115 -0
- package/dist/lib/rag/resilience/RetryHandler.js +301 -0
- package/dist/lib/rag/resilience/index.d.ts +7 -0
- package/dist/lib/rag/resilience/index.js +8 -0
- package/dist/lib/rag/retrieval/hybridSearch.d.ts +94 -0
- package/dist/lib/rag/retrieval/hybridSearch.js +314 -0
- package/dist/lib/rag/retrieval/index.d.ts +5 -0
- package/dist/lib/rag/retrieval/index.js +6 -0
- package/dist/lib/rag/retrieval/vectorQueryTool.d.ts +93 -0
- package/dist/lib/rag/retrieval/vectorQueryTool.js +290 -0
- package/dist/lib/rag/types.d.ts +768 -0
- package/dist/lib/rag/types.js +9 -0
- package/dist/lib/server/index.d.ts +15 -11
- package/dist/lib/server/index.js +55 -51
- package/dist/lib/server/utils/validation.d.ts +2 -2
- package/dist/lib/types/common.d.ts +0 -1
- package/dist/lib/types/fileTypes.d.ts +1 -1
- package/dist/lib/types/generateTypes.d.ts +42 -8
- package/dist/lib/types/generateTypes.js +1 -1
- package/dist/lib/types/index.d.ts +25 -24
- package/dist/lib/types/index.js +21 -20
- package/dist/lib/types/modelTypes.d.ts +16 -16
- package/dist/lib/types/pptTypes.d.ts +14 -2
- package/dist/lib/types/pptTypes.js +16 -0
- package/dist/lib/types/streamTypes.d.ts +28 -8
- package/dist/lib/types/streamTypes.js +1 -1
- package/dist/lib/utils/async/delay.d.ts +40 -0
- package/dist/lib/utils/async/delay.js +43 -0
- package/dist/lib/utils/async/index.d.ts +23 -0
- package/dist/lib/utils/async/index.js +24 -0
- package/dist/lib/utils/async/retry.d.ts +141 -0
- package/dist/lib/utils/async/retry.js +172 -0
- package/dist/lib/utils/async/withTimeout.d.ts +73 -0
- package/dist/lib/utils/async/withTimeout.js +97 -0
- package/dist/lib/utils/fileDetector.d.ts +7 -1
- package/dist/lib/utils/fileDetector.js +91 -18
- package/dist/lib/utils/json/extract.d.ts +103 -0
- package/dist/lib/utils/json/extract.js +249 -0
- package/dist/lib/utils/json/index.d.ts +36 -0
- package/dist/lib/utils/json/index.js +37 -0
- package/dist/lib/utils/json/safeParse.d.ts +137 -0
- package/dist/lib/utils/json/safeParse.js +191 -0
- package/dist/lib/utils/messageBuilder.d.ts +2 -2
- package/dist/lib/utils/messageBuilder.js +15 -7
- package/dist/lib/utils/modelRouter.d.ts +4 -4
- package/dist/lib/utils/modelRouter.js +4 -4
- package/dist/lib/utils/sanitizers/filename.d.ts +137 -0
- package/dist/lib/utils/sanitizers/filename.js +366 -0
- package/dist/lib/utils/sanitizers/html.d.ts +170 -0
- package/dist/lib/utils/sanitizers/html.js +326 -0
- package/dist/lib/utils/sanitizers/index.d.ts +26 -0
- package/dist/lib/utils/sanitizers/index.js +30 -0
- package/dist/lib/utils/sanitizers/svg.d.ts +81 -0
- package/dist/lib/utils/sanitizers/svg.js +483 -0
- package/dist/mcp/index.d.ts +6 -5
- package/dist/mcp/index.js +7 -5
- package/dist/neurolink.d.ts +11 -13
- package/dist/neurolink.js +95 -29
- package/dist/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/processors/base/BaseFileProcessor.js +613 -0
- package/dist/processors/base/index.d.ts +14 -0
- package/dist/processors/base/index.js +19 -0
- package/dist/processors/base/types.d.ts +593 -0
- package/dist/processors/base/types.js +76 -0
- package/dist/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/processors/cli/fileProcessorCli.js +388 -0
- package/dist/processors/cli/index.d.ts +37 -0
- package/dist/processors/cli/index.js +49 -0
- package/dist/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/processors/code/ConfigProcessor.js +400 -0
- package/dist/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/processors/code/SourceCodeProcessor.js +304 -0
- package/dist/processors/code/index.d.ts +44 -0
- package/dist/processors/code/index.js +60 -0
- package/dist/processors/config/fileTypes.d.ts +283 -0
- package/dist/processors/config/fileTypes.js +520 -0
- package/dist/processors/config/index.d.ts +32 -0
- package/dist/processors/config/index.js +92 -0
- package/dist/processors/config/languageMap.d.ts +66 -0
- package/dist/processors/config/languageMap.js +410 -0
- package/dist/processors/config/mimeTypes.d.ts +376 -0
- package/dist/processors/config/mimeTypes.js +338 -0
- package/dist/processors/config/sizeLimits.d.ts +194 -0
- package/dist/processors/config/sizeLimits.js +246 -0
- package/dist/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/processors/data/JsonProcessor.js +203 -0
- package/dist/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/processors/data/XmlProcessor.js +283 -0
- package/dist/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/processors/data/YamlProcessor.js +294 -0
- package/dist/processors/data/index.d.ts +49 -0
- package/dist/processors/data/index.js +76 -0
- package/dist/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/processors/document/ExcelProcessor.js +519 -0
- package/dist/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/processors/document/OpenDocumentProcessor.js +210 -0
- package/dist/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/processors/document/RtfProcessor.js +361 -0
- package/dist/processors/document/WordProcessor.d.ts +168 -0
- package/dist/processors/document/WordProcessor.js +353 -0
- package/dist/processors/document/index.d.ts +54 -0
- package/dist/processors/document/index.js +90 -0
- package/dist/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/processors/errors/FileErrorCode.js +255 -0
- package/dist/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/processors/errors/errorHelpers.js +378 -0
- package/dist/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/processors/errors/errorSerializer.js +507 -0
- package/dist/processors/errors/index.d.ts +46 -0
- package/dist/processors/errors/index.js +49 -0
- package/dist/processors/index.d.ts +76 -0
- package/dist/processors/index.js +112 -0
- package/dist/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/processors/integration/FileProcessorIntegration.js +272 -0
- package/dist/processors/integration/index.d.ts +42 -0
- package/dist/processors/integration/index.js +44 -0
- package/dist/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/processors/markup/HtmlProcessor.js +249 -0
- package/dist/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/processors/markup/MarkdownProcessor.js +244 -0
- package/dist/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/processors/markup/SvgProcessor.js +240 -0
- package/dist/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/processors/markup/TextProcessor.js +188 -0
- package/dist/processors/markup/index.d.ts +66 -0
- package/dist/processors/markup/index.js +102 -0
- package/dist/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/processors/registry/ProcessorRegistry.js +608 -0
- package/dist/processors/registry/index.d.ts +12 -0
- package/dist/processors/registry/index.js +16 -0
- package/dist/processors/registry/types.d.ts +53 -0
- package/dist/processors/registry/types.js +10 -0
- package/dist/providers/amazonBedrock.d.ts +15 -2
- package/dist/providers/amazonBedrock.js +65 -8
- package/dist/providers/anthropic.d.ts +3 -3
- package/dist/providers/anthropic.js +10 -7
- package/dist/providers/googleAiStudio.d.ts +5 -5
- package/dist/providers/googleAiStudio.js +10 -7
- package/dist/providers/googleVertex.d.ts +16 -4
- package/dist/providers/googleVertex.js +72 -16
- package/dist/providers/litellm.d.ts +3 -3
- package/dist/providers/litellm.js +10 -10
- package/dist/providers/mistral.d.ts +3 -3
- package/dist/providers/mistral.js +7 -6
- package/dist/providers/ollama.d.ts +3 -4
- package/dist/providers/ollama.js +7 -8
- package/dist/providers/openAI.d.ts +14 -2
- package/dist/providers/openAI.js +60 -6
- package/dist/providers/openRouter.d.ts +2 -2
- package/dist/providers/openRouter.js +10 -6
- package/dist/rag/ChunkerFactory.d.ts +91 -0
- package/dist/rag/ChunkerFactory.js +320 -0
- package/dist/rag/ChunkerRegistry.d.ts +91 -0
- package/dist/rag/ChunkerRegistry.js +421 -0
- package/dist/rag/chunkers/BaseChunker.d.ts +53 -0
- package/dist/rag/chunkers/BaseChunker.js +143 -0
- package/dist/rag/chunkers/CharacterChunker.d.ts +18 -0
- package/dist/rag/chunkers/CharacterChunker.js +28 -0
- package/dist/rag/chunkers/HTMLChunker.d.ts +19 -0
- package/dist/rag/chunkers/HTMLChunker.js +38 -0
- package/dist/rag/chunkers/JSONChunker.d.ts +19 -0
- package/dist/rag/chunkers/JSONChunker.js +68 -0
- package/dist/rag/chunkers/LaTeXChunker.d.ts +15 -0
- package/dist/rag/chunkers/LaTeXChunker.js +63 -0
- package/dist/rag/chunkers/MarkdownChunker.d.ts +15 -0
- package/dist/rag/chunkers/MarkdownChunker.js +102 -0
- package/dist/rag/chunkers/RecursiveChunker.d.ts +27 -0
- package/dist/rag/chunkers/RecursiveChunker.js +139 -0
- package/dist/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
- package/dist/rag/chunkers/SemanticMarkdownChunker.js +138 -0
- package/dist/rag/chunkers/SentenceChunker.d.ts +19 -0
- package/dist/rag/chunkers/SentenceChunker.js +66 -0
- package/dist/rag/chunkers/TokenChunker.d.ts +19 -0
- package/dist/rag/chunkers/TokenChunker.js +61 -0
- package/dist/rag/chunkers/index.d.ts +15 -0
- package/dist/rag/chunkers/index.js +15 -0
- package/dist/rag/chunking/characterChunker.d.ts +16 -0
- package/dist/rag/chunking/characterChunker.js +142 -0
- package/dist/rag/chunking/chunkerRegistry.d.ts +67 -0
- package/dist/rag/chunking/chunkerRegistry.js +194 -0
- package/dist/rag/chunking/htmlChunker.d.ts +34 -0
- package/dist/rag/chunking/htmlChunker.js +247 -0
- package/dist/rag/chunking/index.d.ts +15 -0
- package/dist/rag/chunking/index.js +17 -0
- package/dist/rag/chunking/jsonChunker.d.ts +20 -0
- package/dist/rag/chunking/jsonChunker.js +281 -0
- package/dist/rag/chunking/latexChunker.d.ts +26 -0
- package/dist/rag/chunking/latexChunker.js +251 -0
- package/dist/rag/chunking/markdownChunker.d.ts +19 -0
- package/dist/rag/chunking/markdownChunker.js +201 -0
- package/dist/rag/chunking/recursiveChunker.d.ts +19 -0
- package/dist/rag/chunking/recursiveChunker.js +148 -0
- package/dist/rag/chunking/semanticChunker.d.ts +41 -0
- package/dist/rag/chunking/semanticChunker.js +306 -0
- package/dist/rag/chunking/sentenceChunker.d.ts +25 -0
- package/dist/rag/chunking/sentenceChunker.js +230 -0
- package/dist/rag/chunking/tokenChunker.d.ts +36 -0
- package/dist/rag/chunking/tokenChunker.js +183 -0
- package/dist/rag/document/MDocument.d.ts +198 -0
- package/dist/rag/document/MDocument.js +392 -0
- package/dist/rag/document/index.d.ts +5 -0
- package/dist/rag/document/index.js +5 -0
- package/dist/rag/document/loaders.d.ts +201 -0
- package/dist/rag/document/loaders.js +500 -0
- package/dist/rag/errors/RAGError.d.ts +244 -0
- package/dist/rag/errors/RAGError.js +274 -0
- package/dist/rag/errors/index.d.ts +6 -0
- package/dist/rag/errors/index.js +6 -0
- package/dist/rag/graphRag/graphRAG.d.ts +115 -0
- package/dist/rag/graphRag/graphRAG.js +384 -0
- package/dist/rag/graphRag/index.d.ts +4 -0
- package/dist/rag/graphRag/index.js +4 -0
- package/dist/rag/index.d.ts +103 -0
- package/dist/rag/index.js +141 -0
- package/dist/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
- package/dist/rag/metadata/MetadataExtractorFactory.js +418 -0
- package/dist/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
- package/dist/rag/metadata/MetadataExtractorRegistry.js +362 -0
- package/dist/rag/metadata/index.d.ts +6 -0
- package/dist/rag/metadata/index.js +9 -0
- package/dist/rag/metadata/metadataExtractor.d.ts +69 -0
- package/dist/rag/metadata/metadataExtractor.js +277 -0
- package/dist/rag/pipeline/RAGPipeline.d.ts +235 -0
- package/dist/rag/pipeline/RAGPipeline.js +401 -0
- package/dist/rag/pipeline/contextAssembly.d.ts +126 -0
- package/dist/rag/pipeline/contextAssembly.js +337 -0
- package/dist/rag/pipeline/index.d.ts +5 -0
- package/dist/rag/pipeline/index.js +5 -0
- package/dist/rag/ragIntegration.d.ts +38 -0
- package/dist/rag/ragIntegration.js +211 -0
- package/dist/rag/reranker/RerankerFactory.d.ts +184 -0
- package/dist/rag/reranker/RerankerFactory.js +430 -0
- package/dist/rag/reranker/RerankerRegistry.d.ts +119 -0
- package/dist/rag/reranker/RerankerRegistry.js +402 -0
- package/dist/rag/reranker/index.d.ts +6 -0
- package/dist/rag/reranker/index.js +9 -0
- package/dist/rag/reranker/reranker.d.ts +71 -0
- package/dist/rag/reranker/reranker.js +277 -0
- package/dist/rag/resilience/CircuitBreaker.d.ts +215 -0
- package/dist/rag/resilience/CircuitBreaker.js +431 -0
- package/dist/rag/resilience/RetryHandler.d.ts +115 -0
- package/dist/rag/resilience/RetryHandler.js +300 -0
- package/dist/rag/resilience/index.d.ts +7 -0
- package/dist/rag/resilience/index.js +7 -0
- package/dist/rag/retrieval/hybridSearch.d.ts +94 -0
- package/dist/rag/retrieval/hybridSearch.js +313 -0
- package/dist/rag/retrieval/index.d.ts +5 -0
- package/dist/rag/retrieval/index.js +5 -0
- package/dist/rag/retrieval/vectorQueryTool.d.ts +93 -0
- package/dist/rag/retrieval/vectorQueryTool.js +289 -0
- package/dist/rag/types.d.ts +768 -0
- package/dist/rag/types.js +8 -0
- package/dist/server/index.d.ts +15 -11
- package/dist/server/index.js +55 -51
- package/dist/server/utils/validation.d.ts +8 -8
- package/dist/types/common.d.ts +0 -1
- package/dist/types/fileTypes.d.ts +1 -1
- package/dist/types/generateTypes.d.ts +42 -8
- package/dist/types/generateTypes.js +1 -1
- package/dist/types/index.d.ts +25 -24
- package/dist/types/index.js +21 -20
- package/dist/types/modelTypes.d.ts +10 -10
- package/dist/types/pptTypes.d.ts +14 -2
- package/dist/types/pptTypes.js +16 -0
- package/dist/types/streamTypes.d.ts +28 -8
- package/dist/types/streamTypes.js +1 -1
- package/dist/utils/async/delay.d.ts +40 -0
- package/dist/utils/async/delay.js +42 -0
- package/dist/utils/async/index.d.ts +23 -0
- package/dist/utils/async/index.js +23 -0
- package/dist/utils/async/retry.d.ts +141 -0
- package/dist/utils/async/retry.js +171 -0
- package/dist/utils/async/withTimeout.d.ts +73 -0
- package/dist/utils/async/withTimeout.js +96 -0
- package/dist/utils/fileDetector.d.ts +7 -1
- package/dist/utils/fileDetector.js +91 -18
- package/dist/utils/json/extract.d.ts +103 -0
- package/dist/utils/json/extract.js +248 -0
- package/dist/utils/json/index.d.ts +36 -0
- package/dist/utils/json/index.js +36 -0
- package/dist/utils/json/safeParse.d.ts +137 -0
- package/dist/utils/json/safeParse.js +190 -0
- package/dist/utils/messageBuilder.d.ts +2 -2
- package/dist/utils/messageBuilder.js +15 -7
- package/dist/utils/modelRouter.d.ts +4 -4
- package/dist/utils/modelRouter.js +4 -4
- package/dist/utils/sanitizers/filename.d.ts +137 -0
- package/dist/utils/sanitizers/filename.js +365 -0
- package/dist/utils/sanitizers/html.d.ts +170 -0
- package/dist/utils/sanitizers/html.js +325 -0
- package/dist/utils/sanitizers/index.d.ts +26 -0
- package/dist/utils/sanitizers/index.js +29 -0
- package/dist/utils/sanitizers/svg.d.ts +81 -0
- package/dist/utils/sanitizers/svg.js +482 -0
- package/package.json +2 -2
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MDocument - Main Document Processing Class
|
|
3
|
+
*
|
|
4
|
+
* Provides a fluent interface for document processing using the Factory + Registry pattern.
|
|
5
|
+
* Supports various document types, chunking strategies, and metadata extraction.
|
|
6
|
+
*
|
|
7
|
+
* @example
|
|
8
|
+
* ```typescript
|
|
9
|
+
* const doc = MDocument.fromText(content);
|
|
10
|
+
* const chunks = await doc.chunk({
|
|
11
|
+
* strategy: 'recursive',
|
|
12
|
+
* config: { maxSize: 1000, overlap: 200 }
|
|
13
|
+
* });
|
|
14
|
+
* const enriched = await doc.extractMetadata({
|
|
15
|
+
* title: true,
|
|
16
|
+
* summary: true,
|
|
17
|
+
* keywords: true
|
|
18
|
+
* });
|
|
19
|
+
* ```
|
|
20
|
+
*/
|
|
21
|
+
import { randomUUID } from "crypto";
|
|
22
|
+
import { logger } from "../../utils/logger.js";
|
|
23
|
+
import { ChunkerRegistry } from "../chunking/chunkerRegistry.js";
|
|
24
|
+
import { LLMMetadataExtractor } from "../metadata/metadataExtractor.js";
|
|
25
|
+
/**
 * MDocument: chainable container for one document moving through the RAG pipeline.
 *
 * An instance keeps the raw content together with its type, metadata, chunks,
 * embeddings and a history of the processing steps applied so far. It offers:
 * - creation from text, markdown, HTML, JSON, LaTeX or CSV content
 * - chunking through a strategy looked up in ChunkerRegistry
 * - per-chunk metadata extraction through LLMMetadataExtractor
 * - embedding generation through a provider created by ProviderFactory
 */
export class MDocument {
    /** Mutable processing state: content, type, metadata, chunks, embeddings, history. */
    state;
    /** Identifier generated with randomUUID() in the constructor; fromJSON() may replace it. */
    documentId;
    /**
     * Create a new MDocument instance.
     *
     * `documentId` and `createdAt` are always written after `config.metadata`
     * is spread, so same-named keys supplied by the caller are overwritten.
     *
     * @param content - Document content
     * @param config - Optional configuration ({ type, metadata }); type defaults to "text"
     */
    constructor(content, config) {
        this.documentId = randomUUID();
        this.state = {
            content,
            type: config?.type ?? "text",
            metadata: {
                ...config?.metadata,
                documentId: this.documentId,
                createdAt: new Date().toISOString(),
            },
            chunks: [],
            embeddings: [],
            history: ["created"],
        };
    }
    // ============================================================================
    // Static Factory Methods
    // ============================================================================
    /**
     * Create an MDocument of type "text".
     * @param text - Plain text content
     * @param metadata - Optional metadata
     * @returns MDocument instance
     */
    static fromText(text, metadata) {
        return new MDocument(text, { type: "text", metadata });
    }
    /**
     * Create an MDocument of type "markdown".
     * @param markdown - Markdown content
     * @param metadata - Optional metadata
     * @returns MDocument instance
     */
    static fromMarkdown(markdown, metadata) {
        return new MDocument(markdown, { type: "markdown", metadata });
    }
    /**
     * Create an MDocument of type "html".
     * @param html - HTML content
     * @param metadata - Optional metadata
     * @returns MDocument instance
     */
    static fromHTML(html, metadata) {
        return new MDocument(html, { type: "html", metadata });
    }
    /**
     * Create an MDocument of type "json".
     *
     * Strings are stored as-is; any other value is serialized with
     * JSON.stringify using a 2-space indent.
     *
     * @param json - JSON string or object
     * @param metadata - Optional metadata
     * @returns MDocument instance
     */
    static fromJSONContent(json, metadata) {
        const content = typeof json === "string" ? json : JSON.stringify(json, null, 2);
        return new MDocument(content, { type: "json", metadata });
    }
    /**
     * Create an MDocument of type "latex".
     * @param latex - LaTeX content
     * @param metadata - Optional metadata
     * @returns MDocument instance
     */
    static fromLaTeX(latex, metadata) {
        return new MDocument(latex, { type: "latex", metadata });
    }
    /**
     * Create an MDocument of type "csv".
     * @param csv - CSV content
     * @param metadata - Optional metadata
     * @returns MDocument instance
     */
    static fromCSV(csv, metadata) {
        return new MDocument(csv, { type: "csv", metadata });
    }
    // ============================================================================
    // Core Processing Methods
    // ============================================================================
    /**
     * Chunk the document using the given strategy.
     *
     * When no strategy is given, the one recommended by ChunkerRegistry for the
     * document type is used. Re-chunking discards previously stored embeddings,
     * because they were computed for the chunks that are being replaced.
     *
     * @param params - Optional chunking parameters ({ strategy, config })
     * @returns This MDocument instance (for chaining)
     */
    async chunk(params) {
        const { strategy = this.getDefaultStrategy(), config = {} } = params || {};
        logger.debug("[MDocument] Chunking document", {
            documentId: this.documentId,
            strategy,
            contentLength: this.state.content.length,
        });
        const chunker = ChunkerRegistry.get(strategy);
        // `source` and `documentType` are written last, so they always come from
        // the document and override same-named keys in config.metadata.
        const chunkConfig = {
            ...config,
            metadata: {
                ...config.metadata,
                source: this.state.metadata.source,
                documentType: this.state.type,
            },
        };
        this.state.chunks = await chunker.chunk(this.state.content, chunkConfig);
        // Embeddings from an earlier chunking no longer line up index-for-index
        // with the new chunks; drop them instead of leaving stale vectors behind.
        this.state.embeddings = [];
        this.state.history.push(`chunked:${strategy}`);
        logger.info("[MDocument] Document chunked", {
            documentId: this.documentId,
            strategy,
            chunkCount: this.state.chunks.length,
        });
        return this;
    }
    /**
     * Extract metadata for the existing chunks using LLMMetadataExtractor and
     * merge title, summary, keywords and custom fields into each chunk's metadata.
     *
     * Logs a warning and returns without doing anything when the document has
     * no chunks yet.
     *
     * @param params - Extraction parameters; its keys are logged and recorded in history
     * @param options - Options passed to the LLMMetadataExtractor constructor
     * @returns This MDocument instance (for chaining)
     */
    async extractMetadata(params, options) {
        if (this.state.chunks.length === 0) {
            logger.warn("[MDocument] No chunks to extract metadata from. Call chunk() first.");
            return this;
        }
        const requestedFields = Object.keys(params);
        logger.debug("[MDocument] Extracting metadata", {
            documentId: this.documentId,
            chunkCount: this.state.chunks.length,
            params: requestedFields,
        });
        const extractor = new LLMMetadataExtractor(options);
        const results = await extractor.extract(this.state.chunks, params);
        // Results are matched to chunks by position; extra entries on either side are ignored.
        const pairCount = Math.min(this.state.chunks.length, results.length);
        for (let i = 0; i < pairCount; i++) {
            const result = results[i];
            if (!result) {
                // Tolerate a missing entry instead of throwing on property access.
                continue;
            }
            const target = this.state.chunks[i].metadata;
            if (result.title) {
                target.title = result.title;
            }
            if (result.summary) {
                target.summary = result.summary;
            }
            if (result.keywords) {
                target.keywords = result.keywords;
            }
            if (result.custom) {
                // Newly extracted custom fields win over existing ones with the same key.
                target.custom = {
                    ...(target.custom || {}),
                    ...result.custom,
                };
            }
        }
        this.state.history.push(`metadata:${requestedFields.join(",")}`);
        logger.info("[MDocument] Metadata extracted", {
            documentId: this.documentId,
            extractedFields: requestedFields,
        });
        return this;
    }
    /**
     * Generate an embedding for every chunk, one chunk at a time.
     *
     * Results are committed only after every chunk has been embedded, so a
     * failure part-way through leaves the previous embeddings untouched rather
     * than a partial list.
     *
     * @param provider - Embedding provider name (default "openai")
     * @param modelName - Embedding model name (default "text-embedding-3-small")
     * @returns This MDocument instance (for chaining)
     * @throws Error when the created provider has no `embed` function
     */
    async embed(provider = "openai", modelName = "text-embedding-3-small") {
        if (this.state.chunks.length === 0) {
            logger.warn("[MDocument] No chunks to embed. Call chunk() first.");
            return this;
        }
        // Lazy import to avoid circular dependencies
        const { ProviderFactory } = await import("../../factories/providerFactory.js");
        logger.debug("[MDocument] Generating embeddings", {
            documentId: this.documentId,
            chunkCount: this.state.chunks.length,
            provider,
            model: modelName,
        });
        const embeddingProvider = await ProviderFactory.createProvider(provider, modelName);
        if (typeof embeddingProvider?.embed !== "function") {
            throw new Error(`Provider ${provider} does not support embeddings`);
        }
        const chunks = this.state.chunks;
        const vectors = [];
        for (const chunk of chunks) {
            vectors.push(await embeddingProvider.embed(chunk.text));
        }
        // Commit in one step now that every call has succeeded.
        chunks.forEach((chunk, i) => {
            chunk.embedding = vectors[i];
        });
        this.state.embeddings = vectors;
        this.state.history.push(`embedded:${provider}:${modelName}`);
        logger.info("[MDocument] Embeddings generated", {
            documentId: this.documentId,
            embeddingCount: this.state.embeddings.length,
            dimension: this.state.embeddings[0]?.length,
        });
        return this;
    }
    // ============================================================================
    // Accessor Methods
    // ============================================================================
    /**
     * Get document ID
     */
    getId() {
        return this.documentId;
    }
    /**
     * Get raw document content
     */
    getContent() {
        return this.state.content;
    }
    /**
     * Get document type
     */
    getType() {
        return this.state.type;
    }
    /**
     * Get a shallow copy of the document metadata
     */
    getMetadata() {
        return { ...this.state.metadata };
    }
    /**
     * Get a shallow copy of the chunk list (the chunk objects themselves are shared)
     */
    getChunks() {
        return [...this.state.chunks];
    }
    /**
     * Get a shallow copy of the embedding list
     */
    getEmbeddings() {
        return [...this.state.embeddings];
    }
    /**
     * Get a copy of the processing history
     */
    getHistory() {
        return [...this.state.history];
    }
    /**
     * Check if document has been chunked
     */
    isChunked() {
        return this.state.chunks.length > 0;
    }
    /**
     * Check if document has embeddings
     */
    hasEmbeddings() {
        return this.state.embeddings.length > 0;
    }
    /**
     * Get chunk count
     */
    getChunkCount() {
        return this.state.chunks.length;
    }
    // ============================================================================
    // Transformation Methods
    // ============================================================================
    /**
     * Set a single metadata entry.
     * @param key - Metadata key
     * @param value - Metadata value
     * @returns This MDocument instance (for chaining)
     */
    setMetadata(key, value) {
        this.state.metadata[key] = value;
        return this;
    }
    /**
     * Shallow-merge metadata into the document; incoming keys win.
     * @param metadata - Metadata to merge
     * @returns This MDocument instance (for chaining)
     */
    mergeMetadata(metadata) {
        this.state.metadata = { ...this.state.metadata, ...metadata };
        return this;
    }
    /**
     * Build a new MDocument containing only the chunks accepted by `predicate`.
     *
     * The predicate is evaluated exactly once per chunk, with the usual
     * (chunk, index, array) arguments, and the same decision is applied to the
     * chunk and to the embedding stored at the same index, so both lists stay
     * aligned. The new document goes through the constructor and therefore
     * receives its own documentId and createdAt.
     *
     * @param predicate - Filter function
     * @returns New MDocument with filtered chunks
     */
    filterChunks(predicate) {
        const doc = new MDocument(this.state.content, {
            type: this.state.type,
            metadata: this.state.metadata,
        });
        const keep = this.state.chunks.map((chunk, index, all) => Boolean(predicate(chunk, index, all)));
        doc.state.chunks = this.state.chunks.filter((_, i) => keep[i]);
        // Embeddings without a matching chunk have no decision recorded and are dropped.
        doc.state.embeddings = this.state.embeddings.filter((_, i) => keep[i] === true);
        doc.state.history = [...this.state.history, "filtered"];
        return doc;
    }
    /**
     * Build a new MDocument whose chunks are the result of `transform`.
     *
     * The embedding list is copied unchanged.
     * NOTE(review): embeddings are not recomputed here, so they may no longer
     * describe the chunks if `transform` changes chunk text. Confirm callers
     * re-embed when needed.
     *
     * @param transform - Transform function
     * @returns New MDocument with transformed chunks
     */
    mapChunks(transform) {
        const doc = new MDocument(this.state.content, {
            type: this.state.type,
            metadata: this.state.metadata,
        });
        doc.state.chunks = this.state.chunks.map(transform);
        doc.state.embeddings = [...this.state.embeddings];
        doc.state.history = [...this.state.history, "mapped"];
        return doc;
    }
    // ============================================================================
    // Serialization Methods
    // ============================================================================
    /**
     * Convert to a plain object for serialization.
     *
     * The returned object references the live metadata, chunks and history.
     * The separate embeddings list is not included.
     */
    toJSON() {
        return {
            id: this.documentId,
            content: this.state.content,
            type: this.state.type,
            metadata: this.state.metadata,
            chunks: this.state.chunks,
            history: this.state.history,
        };
    }
    /**
     * Create an MDocument from the shape produced by toJSON().
     *
     * When present, the serialized id is restored both as the document id and
     * as `metadata.documentId`, and the serialized `metadata.createdAt` is kept,
     * so a round trip preserves identity and creation time. Chunk and history
     * arrays are copied so later processing does not mutate the caller's input.
     *
     * @param json - Serialized document data
     * @returns MDocument instance
     */
    static fromJSON(json) {
        const doc = new MDocument(json.content, {
            type: json.type,
            metadata: json.metadata,
        });
        if (json.id) {
            doc.documentId = json.id;
            // The constructor stamped a fresh UUID into the metadata; keep both in sync.
            doc.state.metadata.documentId = json.id;
        }
        if (json.metadata?.createdAt) {
            // The constructor overwrote createdAt with "now"; restore the original.
            doc.state.metadata.createdAt = json.metadata.createdAt;
        }
        if (json.chunks) {
            doc.state.chunks = [...json.chunks];
        }
        if (json.history) {
            doc.state.history = [...json.history];
        }
        return doc;
    }
    // ============================================================================
    // Private Helper Methods
    // ============================================================================
    /**
     * Get the chunking strategy ChunkerRegistry recommends for this document type
     */
    getDefaultStrategy() {
        return ChunkerRegistry.getRecommendedStrategy(this.state.type);
    }
}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Module Exports
|
|
3
|
+
*/
|
|
4
|
+
export { MDocument } from "./MDocument.js";
|
|
5
|
+
export { type DocumentLoader, TextLoader, MarkdownLoader, HTMLLoader, JSONLoader, CSVLoader, PDFLoader, WebLoader, loadDocument, loadDocuments, type LoaderOptions, type WebLoaderOptions, type PDFLoaderOptions, type CSVLoaderOptions, } from "./loaders.js";
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Loaders
|
|
3
|
+
*
|
|
4
|
+
* Provides loaders for various document formats including:
|
|
5
|
+
* - Text files
|
|
6
|
+
* - Markdown files
|
|
7
|
+
* - HTML files and web pages
|
|
8
|
+
* - JSON files
|
|
9
|
+
* - CSV files
|
|
10
|
+
* - PDF files
|
|
11
|
+
*
|
|
12
|
+
* @example
|
|
13
|
+
* ```typescript
|
|
14
|
+
* import { loadDocument, WebLoader, PDFLoader } from 'neurolink/rag';
|
|
15
|
+
*
|
|
16
|
+
* // Load from file path
|
|
17
|
+
* const doc = await loadDocument('/path/to/document.md');
|
|
18
|
+
*
|
|
19
|
+
* // Load from URL
|
|
20
|
+
* const webDoc = await new WebLoader().load('https://example.com/article');
|
|
21
|
+
*
|
|
22
|
+
* // Load PDF
|
|
23
|
+
* const pdfDoc = await new PDFLoader().load('/path/to/document.pdf');
|
|
24
|
+
* ```
|
|
25
|
+
*/
|
|
26
|
+
import { MDocument } from "./MDocument.js";
|
|
27
|
+
import type { DocumentType } from "../types.js";
|
|
28
|
+
/**
 * Base options accepted by the document loaders.
 */
export interface LoaderOptions {
    /** Overrides the document type */
    type?: DocumentType;
    /** Encoding to use for text files */
    encoding?: BufferEncoding;
    /** Custom metadata to attach to the document */
    metadata?: Record<string, unknown>;
}
|
|
39
|
+
/**
 * Options for the web loader, in addition to the base LoaderOptions.
 */
export interface WebLoaderOptions extends LoaderOptions {
    /** User agent string */
    userAgent?: string;
    /** Custom request headers */
    headers?: Record<string, string>;
    /** Request timeout, in milliseconds */
    timeout?: number;
    /** Keep only the main content (drop navigation, ads, etc.) */
    extractMainContent?: boolean;
    /** CSS selector identifying the main content */
    contentSelector?: string;
}
|
|
54
|
+
/**
 * Options for the PDF loader, in addition to the base LoaderOptions.
 */
export interface PDFLoaderOptions extends LoaderOptions {
    /** Pages to extract, e.g. "1-5" or "1,3,5" */
    pageRange?: string;
    /** Keep layout formatting */
    preserveLayout?: boolean;
    /** Run OCR for scanned documents */
    enableOCR?: boolean;
    /** Extract images as base64 */
    extractImages?: boolean;
}
|
|
67
|
+
/**
 * Options for the CSV loader, in addition to the base LoaderOptions.
 */
export interface CSVLoaderOptions extends LoaderOptions {
    /** Whether the first row is a header */
    hasHeader?: boolean;
    /** Column names to use when there is no header */
    columns?: string[];
    /** Delimiter character */
    delimiter?: string;
    /** Format of the produced output */
    outputFormat?: "text" | "json" | "markdown";
}
|
|
80
|
+
/**
 * Contract implemented by every document loader.
 */
export interface DocumentLoader {
    /**
     * Report whether this loader can handle the given source.
     * @param source - File path, URL, or content
     * @returns True if the loader can handle the source
     */
    canHandle(source: string): boolean;
    /**
     * Load a document from the given source.
     * @param source - File path, URL, or content
     * @param options - Loader options
     * @returns Promise resolving to an MDocument
     */
    load(source: string, options?: LoaderOptions): Promise<MDocument>;
}
|
|
98
|
+
/**
 * Loader for text files.
 *
 * Exposes protected `loadContent` and `getSourceName` helpers to subclasses.
 */
export declare class TextLoader implements DocumentLoader {
    canHandle(source: string): boolean;
    load(source: string, options?: LoaderOptions): Promise<MDocument>;
    protected getSourceName(source: string): string;
    protected loadContent(source: string, encoding?: BufferEncoding): Promise<string>;
}
|
|
107
|
+
/**
 * Loader for Markdown files; a TextLoader subclass with its own
 * `load` and `canHandle`.
 */
export declare class MarkdownLoader extends TextLoader {
    canHandle(source: string): boolean;
    load(source: string, options?: LoaderOptions): Promise<MDocument>;
}
|
|
114
|
+
/**
 * Loader for HTML files; a TextLoader subclass with its own
 * `load` and `canHandle`.
 */
export declare class HTMLLoader extends TextLoader {
    canHandle(source: string): boolean;
    load(source: string, options?: LoaderOptions): Promise<MDocument>;
}
|
|
121
|
+
/**
 * Loader for JSON files; a TextLoader subclass with its own
 * `load` and `canHandle`.
 */
export declare class JSONLoader extends TextLoader {
    canHandle(source: string): boolean;
    load(source: string, options?: LoaderOptions): Promise<MDocument>;
}
|
|
128
|
+
/**
 * Loader for CSV files; a TextLoader subclass whose `load` accepts
 * CSVLoaderOptions.
 */
export declare class CSVLoader extends TextLoader {
    canHandle(source: string): boolean;
    load(source: string, options?: CSVLoaderOptions): Promise<MDocument>;
    private toTextTable;
    private toMarkdownTable;
    private parseCSVLine;
}
|
|
138
|
+
/**
 * Loader for PDF files.
 *
 * Note: full functionality requires an external PDF processing library.
 * If pdf-parse is not available, a placeholder implementation is used instead.
 */
export declare class PDFLoader implements DocumentLoader {
    canHandle(source: string): boolean;
    load(source: string, options?: PDFLoaderOptions): Promise<MDocument>;
    private parsePageRange;
    private loadPdfParser;
}
|
|
150
|
+
/**
 * Loader for web pages.
 *
 * Fetches a page and extracts its content, using basic HTML parsing
 * with no external dependencies.
 */
export declare class WebLoader implements DocumentLoader {
    private defaultUserAgent;
    canHandle(source: string): boolean;
    load(source: string, options?: WebLoaderOptions): Promise<MDocument>;
    /**
     * Convert HTML to plain text
     */
    private htmlToText;
    /**
     * Extract main content from HTML
     */
    private extractMainContent;
}
|
|
169
|
+
/**
 * Load a document from a file path, URL, or raw content.
 *
 * Automatically detects the document type and uses the appropriate loader.
 *
 * `options` accepts the loader-specific option shapes as well as the base
 * LoaderOptions, so object literals carrying loader-specific keys (such as
 * `pageRange` in the example below) pass excess-property checking.
 *
 * @param source - File path, URL, or raw content
 * @param options - Loader options (base or loader-specific)
 * @returns Promise resolving to MDocument
 *
 * @example
 * ```typescript
 * // Load from file
 * const doc = await loadDocument('/path/to/document.md');
 *
 * // Load from URL
 * const webDoc = await loadDocument('https://example.com/article');
 *
 * // Load with options
 * const pdfDoc = await loadDocument('/path/to/doc.pdf', {
 *   pageRange: '1-5',
 *   metadata: { project: 'research' }
 * });
 * ```
 */
export declare function loadDocument(source: string, options?: LoaderOptions | WebLoaderOptions | PDFLoaderOptions | CSVLoaderOptions): Promise<MDocument>;
|
|
194
|
+
/**
 * Load multiple documents.
 *
 * `options` accepts the loader-specific option shapes as well as the base
 * LoaderOptions, so object literals carrying loader-specific keys pass
 * excess-property checking.
 *
 * @param sources - Array of file paths, URLs, or content
 * @param options - Loader options (applied to all)
 * @returns Promise resolving to array of MDocuments
 */
export declare function loadDocuments(sources: string[], options?: LoaderOptions | WebLoaderOptions | PDFLoaderOptions | CSVLoaderOptions): Promise<MDocument[]>;
|