@juspay/neurolink 9.2.0 → 9.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +21 -0
- package/README.md +52 -30
- package/dist/agent/directTools.d.ts +8 -8
- package/dist/cli/commands/config.d.ts +3 -3
- package/dist/cli/commands/rag.d.ts +19 -0
- package/dist/cli/commands/rag.js +756 -0
- package/dist/cli/factories/commandFactory.js +146 -83
- package/dist/cli/parser.js +4 -1
- package/dist/core/baseProvider.d.ts +43 -30
- package/dist/core/baseProvider.js +98 -138
- package/dist/core/conversationMemoryFactory.d.ts +2 -2
- package/dist/core/conversationMemoryFactory.js +2 -2
- package/dist/core/conversationMemoryInitializer.d.ts +1 -2
- package/dist/core/conversationMemoryInitializer.js +2 -2
- package/dist/core/infrastructure/baseError.d.ts +21 -0
- package/dist/core/infrastructure/baseError.js +22 -0
- package/dist/core/infrastructure/baseFactory.d.ts +21 -0
- package/dist/core/infrastructure/baseFactory.js +54 -0
- package/dist/core/infrastructure/baseRegistry.d.ts +21 -0
- package/dist/core/infrastructure/baseRegistry.js +49 -0
- package/dist/core/infrastructure/index.d.ts +5 -0
- package/dist/core/infrastructure/index.js +5 -0
- package/dist/core/infrastructure/retry.d.ts +7 -0
- package/dist/core/infrastructure/retry.js +20 -0
- package/dist/core/infrastructure/typedEventEmitter.d.ts +8 -0
- package/dist/core/infrastructure/typedEventEmitter.js +23 -0
- package/dist/core/redisConversationMemoryManager.d.ts +1 -6
- package/dist/core/redisConversationMemoryManager.js +7 -19
- package/dist/factories/providerFactory.d.ts +5 -3
- package/dist/factories/providerFactory.js +31 -24
- package/dist/index.d.ts +46 -12
- package/dist/index.js +88 -36
- package/dist/lib/agent/directTools.d.ts +5 -5
- package/dist/lib/core/baseProvider.d.ts +43 -30
- package/dist/lib/core/baseProvider.js +98 -138
- package/dist/lib/core/conversationMemoryFactory.d.ts +2 -2
- package/dist/lib/core/conversationMemoryFactory.js +2 -2
- package/dist/lib/core/conversationMemoryInitializer.d.ts +1 -2
- package/dist/lib/core/conversationMemoryInitializer.js +2 -2
- package/dist/lib/core/infrastructure/baseError.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseError.js +23 -0
- package/dist/lib/core/infrastructure/baseFactory.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseFactory.js +55 -0
- package/dist/lib/core/infrastructure/baseRegistry.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseRegistry.js +50 -0
- package/dist/lib/core/infrastructure/index.d.ts +5 -0
- package/dist/lib/core/infrastructure/index.js +6 -0
- package/dist/lib/core/infrastructure/retry.d.ts +7 -0
- package/dist/lib/core/infrastructure/retry.js +21 -0
- package/dist/lib/core/infrastructure/typedEventEmitter.d.ts +8 -0
- package/dist/lib/core/infrastructure/typedEventEmitter.js +24 -0
- package/dist/lib/core/redisConversationMemoryManager.d.ts +1 -6
- package/dist/lib/core/redisConversationMemoryManager.js +7 -19
- package/dist/lib/factories/providerFactory.d.ts +5 -3
- package/dist/lib/factories/providerFactory.js +31 -24
- package/dist/lib/index.d.ts +46 -12
- package/dist/lib/index.js +88 -36
- package/dist/lib/mcp/index.d.ts +6 -5
- package/dist/lib/mcp/index.js +7 -5
- package/dist/lib/neurolink.d.ts +11 -13
- package/dist/lib/neurolink.js +95 -29
- package/dist/lib/providers/amazonBedrock.d.ts +15 -2
- package/dist/lib/providers/amazonBedrock.js +65 -8
- package/dist/lib/providers/anthropic.d.ts +3 -3
- package/dist/lib/providers/anthropic.js +10 -7
- package/dist/lib/providers/googleAiStudio.d.ts +5 -5
- package/dist/lib/providers/googleAiStudio.js +10 -7
- package/dist/lib/providers/googleVertex.d.ts +16 -4
- package/dist/lib/providers/googleVertex.js +72 -16
- package/dist/lib/providers/litellm.d.ts +3 -3
- package/dist/lib/providers/litellm.js +10 -10
- package/dist/lib/providers/mistral.d.ts +3 -3
- package/dist/lib/providers/mistral.js +7 -6
- package/dist/lib/providers/ollama.d.ts +3 -4
- package/dist/lib/providers/ollama.js +7 -8
- package/dist/lib/providers/openAI.d.ts +14 -2
- package/dist/lib/providers/openAI.js +60 -6
- package/dist/lib/providers/openRouter.d.ts +2 -2
- package/dist/lib/providers/openRouter.js +10 -6
- package/dist/lib/rag/ChunkerFactory.d.ts +91 -0
- package/dist/lib/rag/ChunkerFactory.js +321 -0
- package/dist/lib/rag/ChunkerRegistry.d.ts +91 -0
- package/dist/lib/rag/ChunkerRegistry.js +422 -0
- package/dist/lib/rag/chunkers/BaseChunker.d.ts +53 -0
- package/dist/lib/rag/chunkers/BaseChunker.js +144 -0
- package/dist/lib/rag/chunkers/CharacterChunker.d.ts +18 -0
- package/dist/lib/rag/chunkers/CharacterChunker.js +29 -0
- package/dist/lib/rag/chunkers/HTMLChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/HTMLChunker.js +39 -0
- package/dist/lib/rag/chunkers/JSONChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/JSONChunker.js +69 -0
- package/dist/lib/rag/chunkers/LaTeXChunker.d.ts +15 -0
- package/dist/lib/rag/chunkers/LaTeXChunker.js +64 -0
- package/dist/lib/rag/chunkers/MarkdownChunker.d.ts +15 -0
- package/dist/lib/rag/chunkers/MarkdownChunker.js +103 -0
- package/dist/lib/rag/chunkers/RecursiveChunker.d.ts +27 -0
- package/dist/lib/rag/chunkers/RecursiveChunker.js +140 -0
- package/dist/lib/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
- package/dist/lib/rag/chunkers/SemanticMarkdownChunker.js +139 -0
- package/dist/lib/rag/chunkers/SentenceChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/SentenceChunker.js +67 -0
- package/dist/lib/rag/chunkers/TokenChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/TokenChunker.js +62 -0
- package/dist/lib/rag/chunkers/index.d.ts +15 -0
- package/dist/lib/rag/chunkers/index.js +16 -0
- package/dist/lib/rag/chunking/characterChunker.d.ts +16 -0
- package/dist/lib/rag/chunking/characterChunker.js +143 -0
- package/dist/lib/rag/chunking/chunkerRegistry.d.ts +67 -0
- package/dist/lib/rag/chunking/chunkerRegistry.js +195 -0
- package/dist/lib/rag/chunking/htmlChunker.d.ts +34 -0
- package/dist/lib/rag/chunking/htmlChunker.js +248 -0
- package/dist/lib/rag/chunking/index.d.ts +15 -0
- package/dist/lib/rag/chunking/index.js +18 -0
- package/dist/lib/rag/chunking/jsonChunker.d.ts +20 -0
- package/dist/lib/rag/chunking/jsonChunker.js +282 -0
- package/dist/lib/rag/chunking/latexChunker.d.ts +26 -0
- package/dist/lib/rag/chunking/latexChunker.js +252 -0
- package/dist/lib/rag/chunking/markdownChunker.d.ts +19 -0
- package/dist/lib/rag/chunking/markdownChunker.js +202 -0
- package/dist/lib/rag/chunking/recursiveChunker.d.ts +19 -0
- package/dist/lib/rag/chunking/recursiveChunker.js +149 -0
- package/dist/lib/rag/chunking/semanticChunker.d.ts +41 -0
- package/dist/lib/rag/chunking/semanticChunker.js +307 -0
- package/dist/lib/rag/chunking/sentenceChunker.d.ts +25 -0
- package/dist/lib/rag/chunking/sentenceChunker.js +231 -0
- package/dist/lib/rag/chunking/tokenChunker.d.ts +36 -0
- package/dist/lib/rag/chunking/tokenChunker.js +184 -0
- package/dist/lib/rag/document/MDocument.d.ts +198 -0
- package/dist/lib/rag/document/MDocument.js +393 -0
- package/dist/lib/rag/document/index.d.ts +5 -0
- package/dist/lib/rag/document/index.js +6 -0
- package/dist/lib/rag/document/loaders.d.ts +201 -0
- package/dist/lib/rag/document/loaders.js +501 -0
- package/dist/lib/rag/errors/RAGError.d.ts +244 -0
- package/dist/lib/rag/errors/RAGError.js +275 -0
- package/dist/lib/rag/errors/index.d.ts +6 -0
- package/dist/lib/rag/errors/index.js +7 -0
- package/dist/lib/rag/graphRag/graphRAG.d.ts +115 -0
- package/dist/lib/rag/graphRag/graphRAG.js +385 -0
- package/dist/lib/rag/graphRag/index.d.ts +4 -0
- package/dist/lib/rag/graphRag/index.js +5 -0
- package/dist/lib/rag/index.d.ts +103 -0
- package/dist/lib/rag/index.js +142 -0
- package/dist/lib/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
- package/dist/lib/rag/metadata/MetadataExtractorFactory.js +419 -0
- package/dist/lib/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
- package/dist/lib/rag/metadata/MetadataExtractorRegistry.js +363 -0
- package/dist/lib/rag/metadata/index.d.ts +6 -0
- package/dist/lib/rag/metadata/index.js +10 -0
- package/dist/lib/rag/metadata/metadataExtractor.d.ts +69 -0
- package/dist/lib/rag/metadata/metadataExtractor.js +278 -0
- package/dist/lib/rag/pipeline/RAGPipeline.d.ts +235 -0
- package/dist/lib/rag/pipeline/RAGPipeline.js +402 -0
- package/dist/lib/rag/pipeline/contextAssembly.d.ts +126 -0
- package/dist/lib/rag/pipeline/contextAssembly.js +338 -0
- package/dist/lib/rag/pipeline/index.d.ts +5 -0
- package/dist/lib/rag/pipeline/index.js +6 -0
- package/dist/lib/rag/ragIntegration.d.ts +38 -0
- package/dist/lib/rag/ragIntegration.js +212 -0
- package/dist/lib/rag/reranker/RerankerFactory.d.ts +184 -0
- package/dist/lib/rag/reranker/RerankerFactory.js +431 -0
- package/dist/lib/rag/reranker/RerankerRegistry.d.ts +119 -0
- package/dist/lib/rag/reranker/RerankerRegistry.js +403 -0
- package/dist/lib/rag/reranker/index.d.ts +6 -0
- package/dist/lib/rag/reranker/index.js +10 -0
- package/dist/lib/rag/reranker/reranker.d.ts +71 -0
- package/dist/lib/rag/reranker/reranker.js +278 -0
- package/dist/lib/rag/resilience/CircuitBreaker.d.ts +215 -0
- package/dist/lib/rag/resilience/CircuitBreaker.js +432 -0
- package/dist/lib/rag/resilience/RetryHandler.d.ts +115 -0
- package/dist/lib/rag/resilience/RetryHandler.js +301 -0
- package/dist/lib/rag/resilience/index.d.ts +7 -0
- package/dist/lib/rag/resilience/index.js +8 -0
- package/dist/lib/rag/retrieval/hybridSearch.d.ts +94 -0
- package/dist/lib/rag/retrieval/hybridSearch.js +314 -0
- package/dist/lib/rag/retrieval/index.d.ts +5 -0
- package/dist/lib/rag/retrieval/index.js +6 -0
- package/dist/lib/rag/retrieval/vectorQueryTool.d.ts +93 -0
- package/dist/lib/rag/retrieval/vectorQueryTool.js +290 -0
- package/dist/lib/rag/types.d.ts +768 -0
- package/dist/lib/rag/types.js +9 -0
- package/dist/lib/server/index.d.ts +15 -11
- package/dist/lib/server/index.js +55 -51
- package/dist/lib/server/utils/validation.d.ts +8 -8
- package/dist/lib/types/common.d.ts +0 -1
- package/dist/lib/types/generateTypes.d.ts +42 -8
- package/dist/lib/types/generateTypes.js +1 -1
- package/dist/lib/types/modelTypes.d.ts +2 -2
- package/dist/lib/types/streamTypes.d.ts +28 -8
- package/dist/lib/types/streamTypes.js +1 -1
- package/dist/lib/utils/modelRouter.d.ts +4 -4
- package/dist/lib/utils/modelRouter.js +4 -4
- package/dist/mcp/index.d.ts +6 -5
- package/dist/mcp/index.js +7 -5
- package/dist/neurolink.d.ts +11 -13
- package/dist/neurolink.js +95 -29
- package/dist/providers/amazonBedrock.d.ts +15 -2
- package/dist/providers/amazonBedrock.js +65 -8
- package/dist/providers/anthropic.d.ts +3 -3
- package/dist/providers/anthropic.js +10 -7
- package/dist/providers/googleAiStudio.d.ts +5 -5
- package/dist/providers/googleAiStudio.js +10 -7
- package/dist/providers/googleVertex.d.ts +16 -4
- package/dist/providers/googleVertex.js +72 -16
- package/dist/providers/litellm.d.ts +3 -3
- package/dist/providers/litellm.js +10 -10
- package/dist/providers/mistral.d.ts +3 -3
- package/dist/providers/mistral.js +7 -6
- package/dist/providers/ollama.d.ts +3 -4
- package/dist/providers/ollama.js +7 -8
- package/dist/providers/openAI.d.ts +14 -2
- package/dist/providers/openAI.js +60 -6
- package/dist/providers/openRouter.d.ts +2 -2
- package/dist/providers/openRouter.js +10 -6
- package/dist/rag/ChunkerFactory.d.ts +91 -0
- package/dist/rag/ChunkerFactory.js +320 -0
- package/dist/rag/ChunkerRegistry.d.ts +91 -0
- package/dist/rag/ChunkerRegistry.js +421 -0
- package/dist/rag/chunkers/BaseChunker.d.ts +53 -0
- package/dist/rag/chunkers/BaseChunker.js +143 -0
- package/dist/rag/chunkers/CharacterChunker.d.ts +18 -0
- package/dist/rag/chunkers/CharacterChunker.js +28 -0
- package/dist/rag/chunkers/HTMLChunker.d.ts +19 -0
- package/dist/rag/chunkers/HTMLChunker.js +38 -0
- package/dist/rag/chunkers/JSONChunker.d.ts +19 -0
- package/dist/rag/chunkers/JSONChunker.js +68 -0
- package/dist/rag/chunkers/LaTeXChunker.d.ts +15 -0
- package/dist/rag/chunkers/LaTeXChunker.js +63 -0
- package/dist/rag/chunkers/MarkdownChunker.d.ts +15 -0
- package/dist/rag/chunkers/MarkdownChunker.js +102 -0
- package/dist/rag/chunkers/RecursiveChunker.d.ts +27 -0
- package/dist/rag/chunkers/RecursiveChunker.js +139 -0
- package/dist/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
- package/dist/rag/chunkers/SemanticMarkdownChunker.js +138 -0
- package/dist/rag/chunkers/SentenceChunker.d.ts +19 -0
- package/dist/rag/chunkers/SentenceChunker.js +66 -0
- package/dist/rag/chunkers/TokenChunker.d.ts +19 -0
- package/dist/rag/chunkers/TokenChunker.js +61 -0
- package/dist/rag/chunkers/index.d.ts +15 -0
- package/dist/rag/chunkers/index.js +15 -0
- package/dist/rag/chunking/characterChunker.d.ts +16 -0
- package/dist/rag/chunking/characterChunker.js +142 -0
- package/dist/rag/chunking/chunkerRegistry.d.ts +67 -0
- package/dist/rag/chunking/chunkerRegistry.js +194 -0
- package/dist/rag/chunking/htmlChunker.d.ts +34 -0
- package/dist/rag/chunking/htmlChunker.js +247 -0
- package/dist/rag/chunking/index.d.ts +15 -0
- package/dist/rag/chunking/index.js +17 -0
- package/dist/rag/chunking/jsonChunker.d.ts +20 -0
- package/dist/rag/chunking/jsonChunker.js +281 -0
- package/dist/rag/chunking/latexChunker.d.ts +26 -0
- package/dist/rag/chunking/latexChunker.js +251 -0
- package/dist/rag/chunking/markdownChunker.d.ts +19 -0
- package/dist/rag/chunking/markdownChunker.js +201 -0
- package/dist/rag/chunking/recursiveChunker.d.ts +19 -0
- package/dist/rag/chunking/recursiveChunker.js +148 -0
- package/dist/rag/chunking/semanticChunker.d.ts +41 -0
- package/dist/rag/chunking/semanticChunker.js +306 -0
- package/dist/rag/chunking/sentenceChunker.d.ts +25 -0
- package/dist/rag/chunking/sentenceChunker.js +230 -0
- package/dist/rag/chunking/tokenChunker.d.ts +36 -0
- package/dist/rag/chunking/tokenChunker.js +183 -0
- package/dist/rag/document/MDocument.d.ts +198 -0
- package/dist/rag/document/MDocument.js +392 -0
- package/dist/rag/document/index.d.ts +5 -0
- package/dist/rag/document/index.js +5 -0
- package/dist/rag/document/loaders.d.ts +201 -0
- package/dist/rag/document/loaders.js +500 -0
- package/dist/rag/errors/RAGError.d.ts +244 -0
- package/dist/rag/errors/RAGError.js +274 -0
- package/dist/rag/errors/index.d.ts +6 -0
- package/dist/rag/errors/index.js +6 -0
- package/dist/rag/graphRag/graphRAG.d.ts +115 -0
- package/dist/rag/graphRag/graphRAG.js +384 -0
- package/dist/rag/graphRag/index.d.ts +4 -0
- package/dist/rag/graphRag/index.js +4 -0
- package/dist/rag/index.d.ts +103 -0
- package/dist/rag/index.js +141 -0
- package/dist/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
- package/dist/rag/metadata/MetadataExtractorFactory.js +418 -0
- package/dist/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
- package/dist/rag/metadata/MetadataExtractorRegistry.js +362 -0
- package/dist/rag/metadata/index.d.ts +6 -0
- package/dist/rag/metadata/index.js +9 -0
- package/dist/rag/metadata/metadataExtractor.d.ts +69 -0
- package/dist/rag/metadata/metadataExtractor.js +277 -0
- package/dist/rag/pipeline/RAGPipeline.d.ts +235 -0
- package/dist/rag/pipeline/RAGPipeline.js +401 -0
- package/dist/rag/pipeline/contextAssembly.d.ts +126 -0
- package/dist/rag/pipeline/contextAssembly.js +337 -0
- package/dist/rag/pipeline/index.d.ts +5 -0
- package/dist/rag/pipeline/index.js +5 -0
- package/dist/rag/ragIntegration.d.ts +38 -0
- package/dist/rag/ragIntegration.js +211 -0
- package/dist/rag/reranker/RerankerFactory.d.ts +184 -0
- package/dist/rag/reranker/RerankerFactory.js +430 -0
- package/dist/rag/reranker/RerankerRegistry.d.ts +119 -0
- package/dist/rag/reranker/RerankerRegistry.js +402 -0
- package/dist/rag/reranker/index.d.ts +6 -0
- package/dist/rag/reranker/index.js +9 -0
- package/dist/rag/reranker/reranker.d.ts +71 -0
- package/dist/rag/reranker/reranker.js +277 -0
- package/dist/rag/resilience/CircuitBreaker.d.ts +215 -0
- package/dist/rag/resilience/CircuitBreaker.js +431 -0
- package/dist/rag/resilience/RetryHandler.d.ts +115 -0
- package/dist/rag/resilience/RetryHandler.js +300 -0
- package/dist/rag/resilience/index.d.ts +7 -0
- package/dist/rag/resilience/index.js +7 -0
- package/dist/rag/retrieval/hybridSearch.d.ts +94 -0
- package/dist/rag/retrieval/hybridSearch.js +313 -0
- package/dist/rag/retrieval/index.d.ts +5 -0
- package/dist/rag/retrieval/index.js +5 -0
- package/dist/rag/retrieval/vectorQueryTool.d.ts +93 -0
- package/dist/rag/retrieval/vectorQueryTool.js +289 -0
- package/dist/rag/types.d.ts +768 -0
- package/dist/rag/types.js +8 -0
- package/dist/server/index.d.ts +15 -11
- package/dist/server/index.js +55 -51
- package/dist/server/utils/validation.d.ts +2 -2
- package/dist/types/common.d.ts +0 -1
- package/dist/types/generateTypes.d.ts +42 -8
- package/dist/types/generateTypes.js +1 -1
- package/dist/types/modelTypes.d.ts +20 -20
- package/dist/types/streamTypes.d.ts +28 -8
- package/dist/types/streamTypes.js +1 -1
- package/dist/utils/modelRouter.d.ts +4 -4
- package/dist/utils/modelRouter.js +4 -4
- package/package.json +1 -1
|
@@ -0,0 +1,500 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Loaders
|
|
3
|
+
*
|
|
4
|
+
* Provides loaders for various document formats including:
|
|
5
|
+
* - Text files
|
|
6
|
+
* - Markdown files
|
|
7
|
+
* - HTML files and web pages
|
|
8
|
+
* - JSON files
|
|
9
|
+
* - CSV files
|
|
10
|
+
* - PDF files
|
|
11
|
+
*
|
|
12
|
+
* @example
|
|
13
|
+
* ```typescript
|
|
14
|
+
* import { loadDocument, WebLoader, PDFLoader } from 'neurolink/rag';
|
|
15
|
+
*
|
|
16
|
+
* // Load from file path
|
|
17
|
+
* const doc = await loadDocument('/path/to/document.md');
|
|
18
|
+
*
|
|
19
|
+
* // Load from URL
|
|
20
|
+
* const webDoc = await new WebLoader().load('https://example.com/article');
|
|
21
|
+
*
|
|
22
|
+
* // Load PDF
|
|
23
|
+
* const pdfDoc = await new PDFLoader().load('/path/to/document.pdf');
|
|
24
|
+
* ```
|
|
25
|
+
*/
|
|
26
|
+
import { existsSync, statSync } from "fs";
import { readFile } from "fs/promises";
import { basename, extname } from "path";
import { MDocument } from "./MDocument.js";
import { logger } from "../../utils/logger.js";
|
|
31
|
+
/**
 * Text file loader.
 *
 * `source` may be a path to a file or the raw text itself. A source that
 * does not resolve to a readable, non-directory filesystem entry is treated
 * as inline content.
 */
export class TextLoader {
    /**
     * Load plain text and wrap it with `MDocument.fromText`.
     *
     * @param {string} source - File path or raw text content.
     * @param {object} [options] - Loader options.
     * @param {string} [options.encoding] - Encoding used when reading a file ("utf-8" when omitted).
     * @param {object} [options.metadata] - Extra metadata; its keys override the computed `source` entry.
     * @returns {Promise<*>} Whatever `MDocument.fromText` produces for the content.
     */
    async load(source, options) {
        const content = await this.loadContent(source, options?.encoding);
        return MDocument.fromText(content, {
            source: this.getSourceName(source),
            ...options?.metadata,
        });
    }
    /**
     * @param {string} source - Candidate path.
     * @returns {boolean} True for `.txt` sources and sources without any extension.
     */
    canHandle(source) {
        const ext = extname(source).toLowerCase();
        return ext === ".txt" || ext === "";
    }
    /**
     * Resolve `source` to text: read it when it is a file, otherwise return it unchanged.
     *
     * @param {string} source - File path or raw text content.
     * @param {string} [encoding="utf-8"] - Encoding passed to `readFile`.
     * @returns {Promise<string>} File contents or the inline content itself.
     */
    async loadContent(source, encoding = "utf-8") {
        if (TextLoader.#isReadablePath(source)) {
            return await readFile(source, encoding);
        }
        // Assume source is content if not a file
        return source;
    }
    /**
     * @param {string} source - File path or raw text content.
     * @returns {string} The file's base name, or "inline-content" for raw text.
     */
    getSourceName(source) {
        return TextLoader.#isReadablePath(source) ? basename(source) : "inline-content";
    }
    /**
     * True when `source` names an existing filesystem entry that is not a directory.
     *
     * A bare existence check is not enough: short inline content such as "."
     * or "docs" can name an existing directory, and handing that to
     * `readFile` fails with EISDIR. Any stat failure (missing entry, over-long
     * name, embedded NUL, non-string input) simply means "not a path".
     */
    static #isReadablePath(source) {
        try {
            return !statSync(source).isDirectory();
        }
        catch {
            return false;
        }
    }
}
|
|
57
|
+
/**
 * Markdown file loader.
 *
 * Resolves the source through the helpers inherited from TextLoader and
 * hands the text to `MDocument.fromMarkdown`.
 */
export class MarkdownLoader extends TextLoader {
    /**
     * @param {string} source - File path or raw Markdown.
     * @param {object} [options] - Loader options (`encoding`, `metadata`).
     * @returns {Promise<*>} Whatever `MDocument.fromMarkdown` produces.
     */
    async load(source, options) {
        const markdown = await this.loadContent(source, options?.encoding);
        const metadata = {
            source: this.getSourceName(source),
            ...options?.metadata,
        };
        return MDocument.fromMarkdown(markdown, metadata);
    }
    /**
     * @param {string} source - Candidate path.
     * @returns {boolean} True for `.md`, `.markdown` and `.mdx` sources.
     */
    canHandle(source) {
        switch (extname(source).toLowerCase()) {
            case ".md":
            case ".markdown":
            case ".mdx":
                return true;
            default:
                return false;
        }
    }
}
|
|
73
|
+
/**
 * HTML file loader.
 *
 * Resolves the source through the helpers inherited from TextLoader and
 * hands the markup to `MDocument.fromHTML`.
 */
export class HTMLLoader extends TextLoader {
    /**
     * @param {string} source - File path or raw HTML.
     * @param {object} [options] - Loader options (`encoding`, `metadata`).
     * @returns {Promise<*>} Whatever `MDocument.fromHTML` produces.
     */
    async load(source, options) {
        const markup = await this.loadContent(source, options?.encoding);
        const metadata = {
            source: this.getSourceName(source),
            ...options?.metadata,
        };
        return MDocument.fromHTML(markup, metadata);
    }
    /**
     * @param {string} source - Candidate path.
     * @returns {boolean} True for `.html`, `.htm` and `.xhtml` sources.
     */
    canHandle(source) {
        switch (extname(source).toLowerCase()) {
            case ".html":
            case ".htm":
            case ".xhtml":
                return true;
            default:
                return false;
        }
    }
}
|
|
89
|
+
/**
 * JSON file loader.
 *
 * Accepts a single JSON value or JSON Lines (one JSON value per non-empty
 * line); `canHandle` advertises both `.json` and `.jsonl`.
 */
export class JSONLoader extends TextLoader {
    /**
     * Validate the content as JSON and wrap it with `MDocument.fromJSONContent`.
     *
     * @param {string} source - File path or raw JSON / JSON Lines text.
     * @param {object} [options] - Loader options (`encoding`, `metadata`).
     * @returns {Promise<*>} Whatever `MDocument.fromJSONContent` produces.
     * @throws {Error} "Invalid JSON: ..." when the content is neither valid JSON nor valid JSON Lines.
     */
    async load(source, options) {
        const raw = await this.loadContent(source, options?.encoding);
        let content = raw;
        // Validate JSON
        try {
            JSON.parse(raw);
        }
        catch (error) {
            // A multi-record .jsonl file is not one JSON value, so whole-document
            // parsing always fails for it. Re-serialize the records as one array.
            // NOTE(review): assumes MDocument.fromJSONContent expects a single
            // JSON document as text - confirm against MDocument.
            const records = JSONLoader.#parseJsonLines(raw);
            if (records === null) {
                throw new Error(`Invalid JSON: ${error instanceof Error ? error.message : String(error)}`, { cause: error });
            }
            content = JSON.stringify(records, null, 2);
        }
        return MDocument.fromJSONContent(content, {
            source: this.getSourceName(source),
            ...options?.metadata,
        });
    }
    /**
     * @param {string} source - Candidate path.
     * @returns {boolean} True for `.json` and `.jsonl` sources.
     */
    canHandle(source) {
        const ext = extname(source).toLowerCase();
        return ext === ".json" || ext === ".jsonl";
    }
    /**
     * Parse JSON Lines text.
     *
     * @param {string} raw - Candidate JSON Lines content.
     * @returns {Array|null} Parsed records, or null when there are fewer than
     *   two non-empty lines or any line is not valid JSON.
     */
    static #parseJsonLines(raw) {
        const lines = String(raw)
            .split(/\r?\n/)
            .filter((line) => line.trim() !== "");
        if (lines.length < 2) {
            return null;
        }
        try {
            return lines.map((line) => JSON.parse(line));
        }
        catch {
            return null;
        }
    }
}
|
|
112
|
+
/**
 * CSV file loader.
 *
 * Parsing is line based: a quoted field may contain the delimiter, doubled
 * quotes ("") and backslash-escaped quotes, but not a line break.
 */
export class CSVLoader extends TextLoader {
    /**
     * Parse delimited text and wrap a formatted rendering with `MDocument.fromCSV`.
     *
     * @param {string} source - File path or raw CSV text.
     * @param {object} [options] - Loader options.
     * @param {string} [options.delimiter] - Field delimiter. Defaults to a tab
     *   for an existing `.tsv` file and to "," otherwise.
     * @param {boolean} [options.hasHeader=true] - Whether the first line holds column names.
     * @param {string[]} [options.columns] - Column names used when `hasHeader` is false.
     * @param {string} [options.outputFormat="text"] - "json", "markdown", or anything else for a text table.
     * @param {string} [options.encoding] - Encoding used when reading a file.
     * @param {object} [options.metadata] - Extra metadata; its keys override the computed entries.
     * @returns {Promise<*>} Whatever `MDocument.fromCSV` produces.
     */
    async load(source, options) {
        const content = await this.loadContent(source, options?.encoding);
        // canHandle() accepts .tsv, so such files must not be split on commas.
        // The existence check keeps inline content that merely ends in ".tsv"
        // on the comma default.
        const isTsvFile = existsSync(source) && extname(source).toLowerCase() === ".tsv";
        const { delimiter = isTsvFile ? "\t" : ",", hasHeader = true, columns, outputFormat = "text", } = options || {};
        const lines = String(content)
            .split(/\r?\n/)
            .filter((line) => line.trim());
        // Empty input has no first line; treat it as zero columns instead of crashing.
        const firstRow = lines.length > 0 ? this.parseCSVLine(lines[0], delimiter) : [];
        let headers;
        if (hasHeader) {
            headers = firstRow;
        }
        else {
            // Count generated columns with the real parser so a quoted
            // delimiter does not inflate the count.
            headers = columns || firstRow.map((_, i) => `col${i + 1}`);
        }
        const dataLines = hasHeader ? lines.slice(1) : lines;
        const rows = dataLines.map((line) => this.parseCSVLine(line, delimiter));
        let formattedContent;
        switch (outputFormat) {
            case "json":
                formattedContent = JSON.stringify(rows.map((row) => Object.fromEntries(headers.map((h, i) => [h, row[i]]))), null, 2);
                break;
            case "markdown":
                formattedContent = this.toMarkdownTable(headers, rows);
                break;
            default:
                formattedContent = this.toTextTable(headers, rows);
        }
        return MDocument.fromCSV(formattedContent, {
            source: this.getSourceName(source),
            rowCount: rows.length,
            columnCount: headers.length,
            columns: headers,
            ...options?.metadata,
        });
    }
    /**
     * @param {string} source - Candidate path.
     * @returns {boolean} True for `.csv` and `.tsv` sources.
     */
    canHandle(source) {
        const ext = extname(source).toLowerCase();
        return ext === ".csv" || ext === ".tsv";
    }
    /**
     * Split one line into trimmed fields.
     *
     * @param {string} line - A single line of delimited text.
     * @param {string} delimiter - Single-character field delimiter.
     * @returns {string[]} Field values with the surrounding quotes removed.
     */
    parseCSVLine(line, delimiter) {
        const fields = [];
        let current = "";
        let inQuotes = false;
        for (let i = 0; i < line.length; i++) {
            const char = line[i];
            if (char === '"' && (i === 0 || line[i - 1] !== "\\")) {
                if (inQuotes && line[i + 1] === '"') {
                    // RFC 4180: a doubled quote inside a quoted field is one literal quote.
                    current += '"';
                    i++;
                }
                else {
                    inQuotes = !inQuotes;
                }
            }
            else if (char === delimiter && !inQuotes) {
                fields.push(current.trim());
                current = "";
            }
            else {
                current += char;
            }
        }
        fields.push(current.trim());
        return fields;
    }
    /**
     * Render headers and rows as a Markdown table.
     *
     * @param {string[]} headers - Column names.
     * @param {string[][]} rows - Parsed data rows.
     * @returns {string} Markdown table text.
     */
    toMarkdownTable(headers, rows) {
        // An unescaped "|" inside a cell would be read as a column boundary.
        const renderRow = (cells) => `| ${cells.map((cell) => String(cell ?? "").replaceAll("|", "\\|")).join(" | ")} |`;
        const separator = `| ${headers.map(() => "---").join(" | ")} |`;
        return [renderRow(headers), separator, ...rows.map(renderRow)].join("\n");
    }
    /**
     * Render headers and rows as a padded plain-text table.
     *
     * @param {string[]} headers - Column names.
     * @param {string[][]} rows - Parsed data rows.
     * @returns {string} Text table with a dashed separator under the header.
     */
    toTextTable(headers, rows) {
        // A plain loop instead of Math.max(...values): spreading one argument
        // per row overflows the call stack on large files.
        const colWidths = headers.map((header, i) => {
            let width = (header || "").length;
            for (const row of rows) {
                width = Math.max(width, (row[i] || "").length);
            }
            return width;
        });
        const formatRow = (row) => row.map((cell, i) => (cell || "").padEnd(colWidths[i])).join(" | ");
        return [
            formatRow(headers),
            colWidths.map((w) => "-".repeat(w)).join("-+-"),
            ...rows.map(formatRow),
        ].join("\n");
    }
}
|
|
185
|
+
/**
 * PDF file loader
 *
 * Note: Requires the optional `pdf-parse` package for text extraction.
 * When the package is missing, or extraction fails, a placeholder document
 * is returned whose `parseError` metadata states the reason.
 */
export class PDFLoader {
    /**
     * Extract the text of a PDF file.
     *
     * @param {string} source - Path to an existing PDF file.
     * @param {object} [options] - Loader options.
     * @param {string} [options.pageRange] - Page selection such as "1-3,5". It is
     *   resolved and logged only; the returned text always covers the whole file.
     * @param {object} [options.metadata] - Extra metadata; its keys override the computed entries.
     * @returns {Promise<MDocument>} Document of type "pdf".
     * @throws {Error} When `source` does not exist.
     */
    async load(source, options) {
        if (!existsSync(source)) {
            throw new Error(`PDF file not found: ${source}`);
        }
        logger.debug("[PDFLoader] Loading PDF", {
            source,
            pageRange: options?.pageRange,
        });
        let pdfParse;
        try {
            // Try to use pdf-parse if available
            pdfParse = await this.loadPdfParser();
        }
        catch (error) {
            // Fallback: Return placeholder document
            logger.warn("[PDFLoader] pdf-parse not available, using fallback", {
                error: error instanceof Error ? error.message : String(error),
            });
            return new MDocument(`[PDF Document: ${basename(source)}]\n\nNote: PDF parsing requires the 'pdf-parse' package. Install it with:\n npm install pdf-parse`, {
                type: "pdf",
                metadata: {
                    source: basename(source),
                    parseError: "pdf-parse not available",
                    ...options?.metadata,
                },
            });
        }
        try {
            const buffer = await readFile(source);
            const data = await pdfParse(buffer);
            const text = data.text;
            // Handle page range if specified
            if (options?.pageRange) {
                const pages = this.parsePageRange(options.pageRange, data.numpages);
                // Note: pdf-parse doesn't support page selection directly
                // This is a placeholder for more sophisticated page handling
                logger.debug("[PDFLoader] Page range requested but not fully supported", {
                    pageRange: options.pageRange,
                    requestedPages: pages,
                    totalPages: data.numpages,
                });
            }
            return new MDocument(text, {
                type: "pdf",
                metadata: {
                    source: basename(source),
                    pageCount: data.numpages,
                    info: data.info,
                    ...options?.metadata,
                },
            });
        }
        catch (error) {
            // Still best-effort (no throw), but report the real cause: a read or
            // parse failure is not fixed by installing pdf-parse.
            const reason = error instanceof Error ? error.message : String(error);
            logger.warn("[PDFLoader] Failed to parse PDF, using fallback", {
                error: reason,
            });
            return new MDocument(`[PDF Document: ${basename(source)}]\n\nNote: PDF content could not be extracted: ${reason}`, {
                type: "pdf",
                metadata: {
                    source: basename(source),
                    parseError: reason,
                    ...options?.metadata,
                },
            });
        }
    }
    /**
     * @param {string} source - Candidate path.
     * @returns {boolean} True for `.pdf` sources.
     */
    canHandle(source) {
        const ext = extname(source).toLowerCase();
        return ext === ".pdf";
    }
    /**
     * Dynamically import the optional `pdf-parse` dependency.
     *
     * @returns {Promise<*>} The module's default export, or the module namespace when there is none.
     * @throws {Error} "pdf-parse module not available" when the import fails.
     */
    async loadPdfParser() {
        try {
            // @ts-expect-error pdf-parse is an optional dependency
            const pdfParse = await import("pdf-parse");
            return pdfParse.default || pdfParse;
        }
        catch (error) {
            throw new Error("pdf-parse module not available", { cause: error });
        }
    }
    /**
     * Expand a page selection such as "1-3, 5" into sorted, unique page numbers.
     *
     * Parts that are not a non-negative integer or an `a-b` pair of them are
     * ignored, and every page is limited to 1..totalPages.
     *
     * @param {string} range - Comma-separated pages and inclusive ranges.
     * @param {number} totalPages - Number of pages in the document.
     * @returns {number[]} Ascending page numbers without duplicates.
     */
    parsePageRange(range, totalPages) {
        const pages = new Set();
        for (const part of String(range).split(",")) {
            // Digits only: rejects "-3", "1.5", "abc" and similar input that
            // Number() would silently coerce to 0, a fraction, or NaN.
            const match = /^\s*(\d+)\s*(?:-\s*(\d+)\s*)?$/.exec(part);
            if (!match) {
                continue;
            }
            const start = Number(match[1]);
            const end = match[2] === undefined ? start : Number(match[2]);
            // Pages are 1-based, so a range starting at 0 begins at page 1.
            for (let page = Math.max(1, start); page <= Math.min(end, totalPages); page++) {
                pages.add(page);
            }
        }
        return [...pages].sort((a, b) => a - b);
    }
}
|
|
275
|
+
/**
 * Web page loader
 *
 * Fetches a page over HTTP(S) and reduces it to plain text using
 * regular-expression based HTML handling (no external dependencies).
 */
export class WebLoader {
    defaultUserAgent = "Mozilla/5.0 (compatible; NeuroLink/1.0; +https://github.com/juspay/neurolink)";
    /** Replacement text for the named entities that htmlToText decodes. */
    static #NAMED_ENTITIES = Object.freeze({
        nbsp: " ",
        amp: "&",
        lt: "<",
        gt: ">",
        quot: '"',
        apos: "'",
    });
    /**
     * Fetch a URL and return its content as a plain-text document.
     *
     * @param {string} source - Absolute http: or https: URL.
     * @param {object} [options] - Loader options.
     * @param {number} [options.timeout] - Abort the request after this many milliseconds.
     * @param {string} [options.userAgent] - Overrides the default User-Agent header.
     * @param {object} [options.headers] - Extra request headers; they override the defaults.
     * @param {boolean} [options.extractMainContent] - Keep only the main content container.
     * @param {string} [options.contentSelector] - Tag name of the container to keep.
     * @param {object} [options.metadata] - Extra metadata; its keys override the computed entries.
     * @returns {Promise<MDocument>} Document of type "html" holding the extracted text.
     * @throws {Error} When `source` is not an http(s) URL or the response status is not OK.
     */
    async load(source, options) {
        if (!this.canHandle(source)) {
            throw new Error(`Invalid URL: ${source}`);
        }
        logger.debug("[WebLoader] Fetching URL", {
            url: source,
            timeout: options?.timeout,
        });
        const response = await fetch(source, {
            signal: options?.timeout
                ? AbortSignal.timeout(options.timeout)
                : undefined,
            headers: {
                "User-Agent": options?.userAgent || this.defaultUserAgent,
                Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                ...options?.headers,
            },
        });
        if (!response.ok) {
            throw new Error(`HTTP error ${response.status}: ${response.statusText}`);
        }
        const html = await response.text();
        let content = html;
        // Extract main content if requested
        if (options?.extractMainContent) {
            content = this.extractMainContent(html, options.contentSelector);
        }
        // Convert HTML to plain text for better processing
        const text = this.htmlToText(content);
        return new MDocument(text, {
            type: "html",
            metadata: {
                source,
                url: source,
                fetchedAt: new Date().toISOString(),
                contentType: response.headers.get("content-type") || "text/html",
                ...options?.metadata,
            },
        });
    }
    /**
     * @param {string} source - Candidate source.
     * @returns {boolean} True when `source` parses as an http: or https: URL.
     */
    canHandle(source) {
        try {
            const url = new URL(source);
            return url.protocol === "http:" || url.protocol === "https:";
        }
        catch {
            return false;
        }
    }
    /**
     * Extract main content from HTML
     *
     * Simple extraction based on common content patterns; for production use,
     * consider a real HTML parser such as cheerio.
     *
     * @param {string} html - Full page markup.
     * @param {string} [selector] - Tag name of the container to extract. This is
     *   matched as a literal tag name, not as a CSS selector.
     * @returns {string} Inner markup of the first matching container, or `html` unchanged when nothing matches.
     */
    extractMainContent(html, selector) {
        let patterns;
        if (selector) {
            // Escape the caller-supplied text so it is matched literally and
            // cannot produce an invalid or pathological pattern.
            const tag = String(selector).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
            patterns = [new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`, "i")];
        }
        else {
            // Try to extract content from common containers
            patterns = [
                /<main[^>]*>([\s\S]*?)<\/main>/i,
                /<article[^>]*>([\s\S]*?)<\/article>/i,
                /<div[^>]*class="[^"]*content[^"]*"[^>]*>([\s\S]*?)<\/div>/i,
                /<div[^>]*id="content"[^>]*>([\s\S]*?)<\/div>/i,
                /<body[^>]*>([\s\S]*?)<\/body>/i,
            ];
        }
        for (const pattern of patterns) {
            const match = html.match(pattern);
            if (match) {
                return match[1] || match[0];
            }
        }
        return html;
    }
    /**
     * Convert HTML to plain text
     *
     * @param {string} html - Markup to flatten.
     * @returns {string} Text with tags removed, entities decoded and whitespace normalized.
     */
    htmlToText(html) {
        return (html
            // Remove script and style elements
            .replace(/<script[\s\S]*?<\/script>/gi, "")
            .replace(/<style[\s\S]*?<\/style>/gi, "")
            // Remove HTML comments
            .replace(/<!--[\s\S]*?-->/g, "")
            // Replace common block elements with newlines
            .replace(/<\/(p|div|h[1-6]|br|li|tr|blockquote)>/gi, "\n")
            .replace(/<(br|hr)\s*\/?>/gi, "\n")
            // Remove remaining tags
            .replace(/<[^>]+>/g, "")
            // Decode named and numeric entities in ONE pass, so that text
            // produced by a replacement (the "&" from "&amp;") is never
            // decoded a second time.
            .replace(/&(#x[0-9a-f]+|#\d+|nbsp|amp|lt|gt|quot|apos);/gi, (entity, body) => {
                const key = body.toLowerCase();
                if (key[0] !== "#") {
                    return WebLoader.#NAMED_ENTITIES[key];
                }
                const codePoint = key[1] === "x"
                    ? Number.parseInt(key.slice(2), 16)
                    : Number.parseInt(key.slice(1), 10);
                // fromCodePoint handles astral characters (e.g. emoji) and throws
                // above U+10FFFF, so out-of-range references are left untouched.
                return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
            })
            // Normalize whitespace
            .replace(/\n\s*\n/g, "\n\n")
            .replace(/[ \t]+/g, " ")
            .trim());
    }
}
|
|
387
|
+
/**
 * Document loader instances, listed in lookup order.
 *
 * One instance is created per loader class. TextLoader is the default
 * fallback and is therefore kept at the end of the list.
 */
const loaderRegistry = [
    MarkdownLoader,
    HTMLLoader,
    JSONLoader,
    CSVLoader,
    PDFLoader,
    WebLoader,
    TextLoader, // Default fallback
].map((Loader) => new Loader());
|
|
399
|
+
/**
 * Classify a source string as a document type.
 *
 * Any source that parses as an http(s) URL is reported as "html", whatever
 * its path looks like. Otherwise the lower-cased file extension decides, and
 * unknown or missing extensions map to "text".
 *
 * @param source - File path or URL
 * @returns One of "markdown", "html", "json", "csv", "latex", "pdf" or "text"
 */
function _detectDocumentType(source) {
    const extension = extname(source).toLowerCase();
    let isWebAddress = false;
    try {
        const { protocol } = new URL(source);
        isWebAddress = protocol === "http:" || protocol === "https:";
    }
    catch {
        // Not parseable as a URL, so treat the source as a plain path
    }
    if (isWebAddress) {
        return "html";
    }
    switch (extension) {
        case ".md":
        case ".markdown":
        case ".mdx":
            return "markdown";
        case ".html":
        case ".htm":
        case ".xhtml":
            return "html";
        case ".json":
        case ".jsonl":
            return "json";
        case ".csv":
        case ".tsv":
            return "csv";
        case ".tex":
        case ".latex":
            return "latex";
        case ".pdf":
            return "pdf";
        default:
            return "text";
    }
}
|
|
431
|
+
/**
 * Load a single document from a file path, URL, or raw content.
 *
 * The registered loaders are probed in order and the first one whose
 * `canHandle(source)` accepts the source performs the load. When no loader
 * accepts it, a fresh TextLoader is used instead.
 *
 * @param source - File path, URL, or raw content
 * @param options - Loader options, passed through to the chosen loader
 * @returns Promise resolving to MDocument
 *
 * @example
 * ```typescript
 * // Load from file
 * const doc = await loadDocument('/path/to/document.md');
 *
 * // Load from URL
 * const webDoc = await loadDocument('https://example.com/article');
 *
 * // Load with options
 * const pdfDoc = await loadDocument('/path/to/doc.pdf', {
 *   pageRange: '1-5',
 *   metadata: { project: 'research' }
 * });
 * ```
 */
export async function loadDocument(source, options) {
    for (const candidate of loaderRegistry) {
        if (!candidate.canHandle(source)) {
            continue;
        }
        // Only the first 100 characters are logged, since the source may be raw content
        logger.debug("[loadDocument] Loading document", {
            source: source.slice(0, 100),
            loaderType: candidate.constructor.name,
        });
        return candidate.load(source, options);
    }
    // No registered loader accepted the source: treat it as plain text
    return new TextLoader().load(source, options);
}
|
|
468
|
+
/**
 * Load several documents concurrently, keeping the ones that succeed.
 *
 * Every source goes through `loadDocument`. A failing source does not reject
 * the returned promise; the failures are collected and reported in a single
 * warning log entry.
 *
 * @param sources - Array of file paths, URLs, or content
 * @param options - Loader options (applied to all)
 * @returns Promise resolving to the MDocuments that loaded, in source order
 */
export async function loadDocuments(sources, options) {
    const pending = sources.map((source) => loadDocument(source, options));
    const outcomes = await Promise.allSettled(pending);
    const documents = [];
    const errors = [];
    for (const [position, outcome] of outcomes.entries()) {
        if (outcome.status === "fulfilled") {
            documents.push(outcome.value);
            continue;
        }
        const { reason } = outcome;
        const message = reason instanceof Error ? reason.message : String(reason);
        errors.push({ source: sources[position], error: message });
    }
    if (errors.length === 0) {
        return documents;
    }
    logger.warn("[loadDocuments] Some documents failed to load", {
        loaded: documents.length,
        failed: errors.length,
        errors,
    });
    return documents;
}
|