@juspay/neurolink 9.2.0 → 9.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +21 -0
- package/README.md +52 -30
- package/dist/agent/directTools.d.ts +8 -8
- package/dist/cli/commands/config.d.ts +3 -3
- package/dist/cli/commands/rag.d.ts +19 -0
- package/dist/cli/commands/rag.js +756 -0
- package/dist/cli/factories/commandFactory.js +146 -83
- package/dist/cli/parser.js +4 -1
- package/dist/core/baseProvider.d.ts +43 -30
- package/dist/core/baseProvider.js +98 -138
- package/dist/core/conversationMemoryFactory.d.ts +2 -2
- package/dist/core/conversationMemoryFactory.js +2 -2
- package/dist/core/conversationMemoryInitializer.d.ts +1 -2
- package/dist/core/conversationMemoryInitializer.js +2 -2
- package/dist/core/infrastructure/baseError.d.ts +21 -0
- package/dist/core/infrastructure/baseError.js +22 -0
- package/dist/core/infrastructure/baseFactory.d.ts +21 -0
- package/dist/core/infrastructure/baseFactory.js +54 -0
- package/dist/core/infrastructure/baseRegistry.d.ts +21 -0
- package/dist/core/infrastructure/baseRegistry.js +49 -0
- package/dist/core/infrastructure/index.d.ts +5 -0
- package/dist/core/infrastructure/index.js +5 -0
- package/dist/core/infrastructure/retry.d.ts +7 -0
- package/dist/core/infrastructure/retry.js +20 -0
- package/dist/core/infrastructure/typedEventEmitter.d.ts +8 -0
- package/dist/core/infrastructure/typedEventEmitter.js +23 -0
- package/dist/core/redisConversationMemoryManager.d.ts +1 -6
- package/dist/core/redisConversationMemoryManager.js +7 -19
- package/dist/factories/providerFactory.d.ts +5 -3
- package/dist/factories/providerFactory.js +31 -24
- package/dist/index.d.ts +46 -12
- package/dist/index.js +88 -36
- package/dist/lib/agent/directTools.d.ts +5 -5
- package/dist/lib/core/baseProvider.d.ts +43 -30
- package/dist/lib/core/baseProvider.js +98 -138
- package/dist/lib/core/conversationMemoryFactory.d.ts +2 -2
- package/dist/lib/core/conversationMemoryFactory.js +2 -2
- package/dist/lib/core/conversationMemoryInitializer.d.ts +1 -2
- package/dist/lib/core/conversationMemoryInitializer.js +2 -2
- package/dist/lib/core/infrastructure/baseError.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseError.js +23 -0
- package/dist/lib/core/infrastructure/baseFactory.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseFactory.js +55 -0
- package/dist/lib/core/infrastructure/baseRegistry.d.ts +21 -0
- package/dist/lib/core/infrastructure/baseRegistry.js +50 -0
- package/dist/lib/core/infrastructure/index.d.ts +5 -0
- package/dist/lib/core/infrastructure/index.js +6 -0
- package/dist/lib/core/infrastructure/retry.d.ts +7 -0
- package/dist/lib/core/infrastructure/retry.js +21 -0
- package/dist/lib/core/infrastructure/typedEventEmitter.d.ts +8 -0
- package/dist/lib/core/infrastructure/typedEventEmitter.js +24 -0
- package/dist/lib/core/redisConversationMemoryManager.d.ts +1 -6
- package/dist/lib/core/redisConversationMemoryManager.js +7 -19
- package/dist/lib/factories/providerFactory.d.ts +5 -3
- package/dist/lib/factories/providerFactory.js +31 -24
- package/dist/lib/index.d.ts +46 -12
- package/dist/lib/index.js +88 -36
- package/dist/lib/mcp/index.d.ts +6 -5
- package/dist/lib/mcp/index.js +7 -5
- package/dist/lib/neurolink.d.ts +11 -13
- package/dist/lib/neurolink.js +95 -29
- package/dist/lib/providers/amazonBedrock.d.ts +15 -2
- package/dist/lib/providers/amazonBedrock.js +65 -8
- package/dist/lib/providers/anthropic.d.ts +3 -3
- package/dist/lib/providers/anthropic.js +10 -7
- package/dist/lib/providers/googleAiStudio.d.ts +5 -5
- package/dist/lib/providers/googleAiStudio.js +10 -7
- package/dist/lib/providers/googleVertex.d.ts +16 -4
- package/dist/lib/providers/googleVertex.js +72 -16
- package/dist/lib/providers/litellm.d.ts +3 -3
- package/dist/lib/providers/litellm.js +10 -10
- package/dist/lib/providers/mistral.d.ts +3 -3
- package/dist/lib/providers/mistral.js +7 -6
- package/dist/lib/providers/ollama.d.ts +3 -4
- package/dist/lib/providers/ollama.js +7 -8
- package/dist/lib/providers/openAI.d.ts +14 -2
- package/dist/lib/providers/openAI.js +60 -6
- package/dist/lib/providers/openRouter.d.ts +2 -2
- package/dist/lib/providers/openRouter.js +10 -6
- package/dist/lib/rag/ChunkerFactory.d.ts +91 -0
- package/dist/lib/rag/ChunkerFactory.js +321 -0
- package/dist/lib/rag/ChunkerRegistry.d.ts +91 -0
- package/dist/lib/rag/ChunkerRegistry.js +422 -0
- package/dist/lib/rag/chunkers/BaseChunker.d.ts +53 -0
- package/dist/lib/rag/chunkers/BaseChunker.js +144 -0
- package/dist/lib/rag/chunkers/CharacterChunker.d.ts +18 -0
- package/dist/lib/rag/chunkers/CharacterChunker.js +29 -0
- package/dist/lib/rag/chunkers/HTMLChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/HTMLChunker.js +39 -0
- package/dist/lib/rag/chunkers/JSONChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/JSONChunker.js +69 -0
- package/dist/lib/rag/chunkers/LaTeXChunker.d.ts +15 -0
- package/dist/lib/rag/chunkers/LaTeXChunker.js +64 -0
- package/dist/lib/rag/chunkers/MarkdownChunker.d.ts +15 -0
- package/dist/lib/rag/chunkers/MarkdownChunker.js +103 -0
- package/dist/lib/rag/chunkers/RecursiveChunker.d.ts +27 -0
- package/dist/lib/rag/chunkers/RecursiveChunker.js +140 -0
- package/dist/lib/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
- package/dist/lib/rag/chunkers/SemanticMarkdownChunker.js +139 -0
- package/dist/lib/rag/chunkers/SentenceChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/SentenceChunker.js +67 -0
- package/dist/lib/rag/chunkers/TokenChunker.d.ts +19 -0
- package/dist/lib/rag/chunkers/TokenChunker.js +62 -0
- package/dist/lib/rag/chunkers/index.d.ts +15 -0
- package/dist/lib/rag/chunkers/index.js +16 -0
- package/dist/lib/rag/chunking/characterChunker.d.ts +16 -0
- package/dist/lib/rag/chunking/characterChunker.js +143 -0
- package/dist/lib/rag/chunking/chunkerRegistry.d.ts +67 -0
- package/dist/lib/rag/chunking/chunkerRegistry.js +195 -0
- package/dist/lib/rag/chunking/htmlChunker.d.ts +34 -0
- package/dist/lib/rag/chunking/htmlChunker.js +248 -0
- package/dist/lib/rag/chunking/index.d.ts +15 -0
- package/dist/lib/rag/chunking/index.js +18 -0
- package/dist/lib/rag/chunking/jsonChunker.d.ts +20 -0
- package/dist/lib/rag/chunking/jsonChunker.js +282 -0
- package/dist/lib/rag/chunking/latexChunker.d.ts +26 -0
- package/dist/lib/rag/chunking/latexChunker.js +252 -0
- package/dist/lib/rag/chunking/markdownChunker.d.ts +19 -0
- package/dist/lib/rag/chunking/markdownChunker.js +202 -0
- package/dist/lib/rag/chunking/recursiveChunker.d.ts +19 -0
- package/dist/lib/rag/chunking/recursiveChunker.js +149 -0
- package/dist/lib/rag/chunking/semanticChunker.d.ts +41 -0
- package/dist/lib/rag/chunking/semanticChunker.js +307 -0
- package/dist/lib/rag/chunking/sentenceChunker.d.ts +25 -0
- package/dist/lib/rag/chunking/sentenceChunker.js +231 -0
- package/dist/lib/rag/chunking/tokenChunker.d.ts +36 -0
- package/dist/lib/rag/chunking/tokenChunker.js +184 -0
- package/dist/lib/rag/document/MDocument.d.ts +198 -0
- package/dist/lib/rag/document/MDocument.js +393 -0
- package/dist/lib/rag/document/index.d.ts +5 -0
- package/dist/lib/rag/document/index.js +6 -0
- package/dist/lib/rag/document/loaders.d.ts +201 -0
- package/dist/lib/rag/document/loaders.js +501 -0
- package/dist/lib/rag/errors/RAGError.d.ts +244 -0
- package/dist/lib/rag/errors/RAGError.js +275 -0
- package/dist/lib/rag/errors/index.d.ts +6 -0
- package/dist/lib/rag/errors/index.js +7 -0
- package/dist/lib/rag/graphRag/graphRAG.d.ts +115 -0
- package/dist/lib/rag/graphRag/graphRAG.js +385 -0
- package/dist/lib/rag/graphRag/index.d.ts +4 -0
- package/dist/lib/rag/graphRag/index.js +5 -0
- package/dist/lib/rag/index.d.ts +103 -0
- package/dist/lib/rag/index.js +142 -0
- package/dist/lib/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
- package/dist/lib/rag/metadata/MetadataExtractorFactory.js +419 -0
- package/dist/lib/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
- package/dist/lib/rag/metadata/MetadataExtractorRegistry.js +363 -0
- package/dist/lib/rag/metadata/index.d.ts +6 -0
- package/dist/lib/rag/metadata/index.js +10 -0
- package/dist/lib/rag/metadata/metadataExtractor.d.ts +69 -0
- package/dist/lib/rag/metadata/metadataExtractor.js +278 -0
- package/dist/lib/rag/pipeline/RAGPipeline.d.ts +235 -0
- package/dist/lib/rag/pipeline/RAGPipeline.js +402 -0
- package/dist/lib/rag/pipeline/contextAssembly.d.ts +126 -0
- package/dist/lib/rag/pipeline/contextAssembly.js +338 -0
- package/dist/lib/rag/pipeline/index.d.ts +5 -0
- package/dist/lib/rag/pipeline/index.js +6 -0
- package/dist/lib/rag/ragIntegration.d.ts +38 -0
- package/dist/lib/rag/ragIntegration.js +212 -0
- package/dist/lib/rag/reranker/RerankerFactory.d.ts +184 -0
- package/dist/lib/rag/reranker/RerankerFactory.js +431 -0
- package/dist/lib/rag/reranker/RerankerRegistry.d.ts +119 -0
- package/dist/lib/rag/reranker/RerankerRegistry.js +403 -0
- package/dist/lib/rag/reranker/index.d.ts +6 -0
- package/dist/lib/rag/reranker/index.js +10 -0
- package/dist/lib/rag/reranker/reranker.d.ts +71 -0
- package/dist/lib/rag/reranker/reranker.js +278 -0
- package/dist/lib/rag/resilience/CircuitBreaker.d.ts +215 -0
- package/dist/lib/rag/resilience/CircuitBreaker.js +432 -0
- package/dist/lib/rag/resilience/RetryHandler.d.ts +115 -0
- package/dist/lib/rag/resilience/RetryHandler.js +301 -0
- package/dist/lib/rag/resilience/index.d.ts +7 -0
- package/dist/lib/rag/resilience/index.js +8 -0
- package/dist/lib/rag/retrieval/hybridSearch.d.ts +94 -0
- package/dist/lib/rag/retrieval/hybridSearch.js +314 -0
- package/dist/lib/rag/retrieval/index.d.ts +5 -0
- package/dist/lib/rag/retrieval/index.js +6 -0
- package/dist/lib/rag/retrieval/vectorQueryTool.d.ts +93 -0
- package/dist/lib/rag/retrieval/vectorQueryTool.js +290 -0
- package/dist/lib/rag/types.d.ts +768 -0
- package/dist/lib/rag/types.js +9 -0
- package/dist/lib/server/index.d.ts +15 -11
- package/dist/lib/server/index.js +55 -51
- package/dist/lib/server/utils/validation.d.ts +8 -8
- package/dist/lib/types/common.d.ts +0 -1
- package/dist/lib/types/generateTypes.d.ts +42 -8
- package/dist/lib/types/generateTypes.js +1 -1
- package/dist/lib/types/modelTypes.d.ts +2 -2
- package/dist/lib/types/streamTypes.d.ts +28 -8
- package/dist/lib/types/streamTypes.js +1 -1
- package/dist/lib/utils/modelRouter.d.ts +4 -4
- package/dist/lib/utils/modelRouter.js +4 -4
- package/dist/mcp/index.d.ts +6 -5
- package/dist/mcp/index.js +7 -5
- package/dist/neurolink.d.ts +11 -13
- package/dist/neurolink.js +95 -29
- package/dist/providers/amazonBedrock.d.ts +15 -2
- package/dist/providers/amazonBedrock.js +65 -8
- package/dist/providers/anthropic.d.ts +3 -3
- package/dist/providers/anthropic.js +10 -7
- package/dist/providers/googleAiStudio.d.ts +5 -5
- package/dist/providers/googleAiStudio.js +10 -7
- package/dist/providers/googleVertex.d.ts +16 -4
- package/dist/providers/googleVertex.js +72 -16
- package/dist/providers/litellm.d.ts +3 -3
- package/dist/providers/litellm.js +10 -10
- package/dist/providers/mistral.d.ts +3 -3
- package/dist/providers/mistral.js +7 -6
- package/dist/providers/ollama.d.ts +3 -4
- package/dist/providers/ollama.js +7 -8
- package/dist/providers/openAI.d.ts +14 -2
- package/dist/providers/openAI.js +60 -6
- package/dist/providers/openRouter.d.ts +2 -2
- package/dist/providers/openRouter.js +10 -6
- package/dist/rag/ChunkerFactory.d.ts +91 -0
- package/dist/rag/ChunkerFactory.js +320 -0
- package/dist/rag/ChunkerRegistry.d.ts +91 -0
- package/dist/rag/ChunkerRegistry.js +421 -0
- package/dist/rag/chunkers/BaseChunker.d.ts +53 -0
- package/dist/rag/chunkers/BaseChunker.js +143 -0
- package/dist/rag/chunkers/CharacterChunker.d.ts +18 -0
- package/dist/rag/chunkers/CharacterChunker.js +28 -0
- package/dist/rag/chunkers/HTMLChunker.d.ts +19 -0
- package/dist/rag/chunkers/HTMLChunker.js +38 -0
- package/dist/rag/chunkers/JSONChunker.d.ts +19 -0
- package/dist/rag/chunkers/JSONChunker.js +68 -0
- package/dist/rag/chunkers/LaTeXChunker.d.ts +15 -0
- package/dist/rag/chunkers/LaTeXChunker.js +63 -0
- package/dist/rag/chunkers/MarkdownChunker.d.ts +15 -0
- package/dist/rag/chunkers/MarkdownChunker.js +102 -0
- package/dist/rag/chunkers/RecursiveChunker.d.ts +27 -0
- package/dist/rag/chunkers/RecursiveChunker.js +139 -0
- package/dist/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
- package/dist/rag/chunkers/SemanticMarkdownChunker.js +138 -0
- package/dist/rag/chunkers/SentenceChunker.d.ts +19 -0
- package/dist/rag/chunkers/SentenceChunker.js +66 -0
- package/dist/rag/chunkers/TokenChunker.d.ts +19 -0
- package/dist/rag/chunkers/TokenChunker.js +61 -0
- package/dist/rag/chunkers/index.d.ts +15 -0
- package/dist/rag/chunkers/index.js +15 -0
- package/dist/rag/chunking/characterChunker.d.ts +16 -0
- package/dist/rag/chunking/characterChunker.js +142 -0
- package/dist/rag/chunking/chunkerRegistry.d.ts +67 -0
- package/dist/rag/chunking/chunkerRegistry.js +194 -0
- package/dist/rag/chunking/htmlChunker.d.ts +34 -0
- package/dist/rag/chunking/htmlChunker.js +247 -0
- package/dist/rag/chunking/index.d.ts +15 -0
- package/dist/rag/chunking/index.js +17 -0
- package/dist/rag/chunking/jsonChunker.d.ts +20 -0
- package/dist/rag/chunking/jsonChunker.js +281 -0
- package/dist/rag/chunking/latexChunker.d.ts +26 -0
- package/dist/rag/chunking/latexChunker.js +251 -0
- package/dist/rag/chunking/markdownChunker.d.ts +19 -0
- package/dist/rag/chunking/markdownChunker.js +201 -0
- package/dist/rag/chunking/recursiveChunker.d.ts +19 -0
- package/dist/rag/chunking/recursiveChunker.js +148 -0
- package/dist/rag/chunking/semanticChunker.d.ts +41 -0
- package/dist/rag/chunking/semanticChunker.js +306 -0
- package/dist/rag/chunking/sentenceChunker.d.ts +25 -0
- package/dist/rag/chunking/sentenceChunker.js +230 -0
- package/dist/rag/chunking/tokenChunker.d.ts +36 -0
- package/dist/rag/chunking/tokenChunker.js +183 -0
- package/dist/rag/document/MDocument.d.ts +198 -0
- package/dist/rag/document/MDocument.js +392 -0
- package/dist/rag/document/index.d.ts +5 -0
- package/dist/rag/document/index.js +5 -0
- package/dist/rag/document/loaders.d.ts +201 -0
- package/dist/rag/document/loaders.js +500 -0
- package/dist/rag/errors/RAGError.d.ts +244 -0
- package/dist/rag/errors/RAGError.js +274 -0
- package/dist/rag/errors/index.d.ts +6 -0
- package/dist/rag/errors/index.js +6 -0
- package/dist/rag/graphRag/graphRAG.d.ts +115 -0
- package/dist/rag/graphRag/graphRAG.js +384 -0
- package/dist/rag/graphRag/index.d.ts +4 -0
- package/dist/rag/graphRag/index.js +4 -0
- package/dist/rag/index.d.ts +103 -0
- package/dist/rag/index.js +141 -0
- package/dist/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
- package/dist/rag/metadata/MetadataExtractorFactory.js +418 -0
- package/dist/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
- package/dist/rag/metadata/MetadataExtractorRegistry.js +362 -0
- package/dist/rag/metadata/index.d.ts +6 -0
- package/dist/rag/metadata/index.js +9 -0
- package/dist/rag/metadata/metadataExtractor.d.ts +69 -0
- package/dist/rag/metadata/metadataExtractor.js +277 -0
- package/dist/rag/pipeline/RAGPipeline.d.ts +235 -0
- package/dist/rag/pipeline/RAGPipeline.js +401 -0
- package/dist/rag/pipeline/contextAssembly.d.ts +126 -0
- package/dist/rag/pipeline/contextAssembly.js +337 -0
- package/dist/rag/pipeline/index.d.ts +5 -0
- package/dist/rag/pipeline/index.js +5 -0
- package/dist/rag/ragIntegration.d.ts +38 -0
- package/dist/rag/ragIntegration.js +211 -0
- package/dist/rag/reranker/RerankerFactory.d.ts +184 -0
- package/dist/rag/reranker/RerankerFactory.js +430 -0
- package/dist/rag/reranker/RerankerRegistry.d.ts +119 -0
- package/dist/rag/reranker/RerankerRegistry.js +402 -0
- package/dist/rag/reranker/index.d.ts +6 -0
- package/dist/rag/reranker/index.js +9 -0
- package/dist/rag/reranker/reranker.d.ts +71 -0
- package/dist/rag/reranker/reranker.js +277 -0
- package/dist/rag/resilience/CircuitBreaker.d.ts +215 -0
- package/dist/rag/resilience/CircuitBreaker.js +431 -0
- package/dist/rag/resilience/RetryHandler.d.ts +115 -0
- package/dist/rag/resilience/RetryHandler.js +300 -0
- package/dist/rag/resilience/index.d.ts +7 -0
- package/dist/rag/resilience/index.js +7 -0
- package/dist/rag/retrieval/hybridSearch.d.ts +94 -0
- package/dist/rag/retrieval/hybridSearch.js +313 -0
- package/dist/rag/retrieval/index.d.ts +5 -0
- package/dist/rag/retrieval/index.js +5 -0
- package/dist/rag/retrieval/vectorQueryTool.d.ts +93 -0
- package/dist/rag/retrieval/vectorQueryTool.js +289 -0
- package/dist/rag/types.d.ts +768 -0
- package/dist/rag/types.js +8 -0
- package/dist/server/index.d.ts +15 -11
- package/dist/server/index.js +55 -51
- package/dist/server/utils/validation.d.ts +2 -2
- package/dist/types/common.d.ts +0 -1
- package/dist/types/generateTypes.d.ts +42 -8
- package/dist/types/generateTypes.js +1 -1
- package/dist/types/modelTypes.d.ts +20 -20
- package/dist/types/streamTypes.d.ts +28 -8
- package/dist/types/streamTypes.js +1 -1
- package/dist/utils/modelRouter.d.ts +4 -4
- package/dist/utils/modelRouter.js +4 -4
- package/package.json +1 -1
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML-aware Chunker
|
|
3
|
+
*
|
|
4
|
+
* Splits HTML documents based on tag structure while preserving semantics.
|
|
5
|
+
* Best for web pages, email templates, and structured HTML content.
|
|
6
|
+
*/
|
|
7
|
+
import { randomUUID } from "crypto";
|
|
8
|
+
/**
|
|
9
|
+
* HTML-aware chunker implementation
|
|
10
|
+
* Splits based on HTML structure (tags, elements)
|
|
11
|
+
*/
|
|
12
|
+
export class HTMLChunker {
    /** Strategy identifier reported by this chunker. */
    strategy = "html";
    /** Tags whose elements become individual sections when no `splitTags` are configured. */
    defaultSplitTags = [
        "div",
        "p",
        "section",
        "article",
        "main",
        "aside",
        "header",
        "footer",
        "nav",
        "li",
        "tr",
        "td",
        "th",
    ];
    /** Tags that, when found inside a split element, cause that element to be kept with its markup. */
    defaultPreserveTags = [
        "pre",
        "code",
        "table",
        "ul",
        "ol",
        "blockquote",
    ];
    /**
     * Chunk an HTML string.
     *
     * The text is split into sections by `splitTags`, each section is optionally
     * reduced to plain text, and sections longer than `maxSize` are split further.
     *
     * @param text - HTML source. Empty or missing input yields an empty array.
     * @param config - Optional settings: `maxSize` (default 1000), `overlap` (default 0),
     *   `splitTags`, `preserveTags`, `extractTextOnly` (default false),
     *   `includeTagMetadata` (default true), `trimWhitespace` (default true) and
     *   `metadata` (copied into every chunk's `metadata.custom`).
     * @returns Chunks in document order, each carrying `documentId`, `chunkIndex`,
     *   `totalChunks`, `startPosition`/`endPosition`, `documentType` and `custom`.
     */
    async chunk(text, config) {
        const { maxSize = 1000, overlap = 0, splitTags = this.defaultSplitTags, preserveTags = this.defaultPreserveTags, extractTextOnly = false, includeTagMetadata = true, trimWhitespace = true, metadata = {}, } = config || {};
        const documentId = randomUUID();
        const chunks = [];
        if (!text || text.length === 0) {
            return chunks;
        }
        // Extract and split by structural tags
        const sections = this.splitByTags(text, splitTags, preserveTags);
        let chunkIndex = 0;
        // Running offset over the emitted pieces. It is NOT an offset into the
        // original HTML: markup removed by splitting/extraction is not counted,
        // and overlapping text is counted once per piece it appears in.
        let currentPosition = 0;
        for (const section of sections) {
            const { content, tagName, attributes } = section;
            const processedContent = extractTextOnly
                ? this.extractText(content)
                : content;
            // Split if content is too large
            for (const contentChunk of this.splitContent(processedContent, maxSize, overlap)) {
                const finalText = trimWhitespace ? contentChunk.trim() : contentChunk;
                if (finalText.length > 0) {
                    const chunkMetadata = {
                        ...metadata,
                    };
                    if (includeTagMetadata && tagName) {
                        chunkMetadata.tagName = tagName;
                        if (attributes && Object.keys(attributes).length > 0) {
                            chunkMetadata.attributes = attributes;
                        }
                    }
                    chunks.push({
                        id: randomUUID(),
                        text: finalText,
                        metadata: {
                            documentId,
                            chunkIndex,
                            startPosition: currentPosition,
                            endPosition: currentPosition + contentChunk.length,
                            documentType: "html",
                            custom: chunkMetadata,
                        },
                    });
                    chunkIndex++;
                }
                // Whitespace-only pieces are dropped but still advance the offset.
                currentPosition += contentChunk.length;
            }
        }
        // The total is only known once every section has been processed.
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }
    /**
     * Split HTML by structural tags.
     *
     * Matching is regex based, not a real HTML parse: an element ends at the
     * first closing tag of the same name, so nested elements of the same tag
     * are not balanced.
     *
     * @param html - HTML source to split.
     * @param splitTags - Tag names that start a new section. An empty list falls
     *   back to `defaultSplitTags`, matching the warning issued by `validateConfig`.
     * @param preserveTags - Tag names that, when present inside a matched element,
     *   make the section keep the element's full markup instead of its inner content.
     * @returns Sections in document order. Text between matched elements is
     *   returned as sections without `tagName`/`attributes`.
     */
    splitByTags(html, splitTags, preserveTags) {
        const sections = [];
        const escapeForRegex = (name) => String(name).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        const activeSplitTags = Array.isArray(splitTags) && splitTags.length > 0
            ? splitTags
            : this.defaultSplitTags;
        const activePreserveTags = preserveTags ?? [];
        // The lookahead forces a tag-name boundary so "<p" cannot match "<pre>".
        const tagPattern = new RegExp(`<(${activeSplitTags.map(escapeForRegex).join("|")})(?=[\\s/>])([^>]*)>([\\s\\S]*?)</\\1\\s*>`, "gi");
        const preservePattern = activePreserveTags.length > 0
            ? new RegExp(`<(?:${activePreserveTags.map(escapeForRegex).join("|")})(?=[\\s/>])`, "i")
            : null;
        let lastIndex = 0;
        for (const match of html.matchAll(tagPattern)) {
            // Content before this tag
            if (match.index > lastIndex) {
                const beforeContent = html.slice(lastIndex, match.index).trim();
                if (beforeContent.length > 0) {
                    sections.push({
                        content: beforeContent,
                    });
                }
            }
            const tagName = match[1].toLowerCase();
            const innerContent = match[3];
            const attributes = this.parseAttributes(match[2]);
            // Keep the full element when it wraps a tag that must stay intact.
            const shouldPreserve = preservePattern !== null && preservePattern.test(innerContent);
            sections.push({
                content: shouldPreserve ? match[0] : innerContent,
                tagName,
                attributes,
            });
            lastIndex = match.index + match[0].length;
        }
        // Don't forget content after the last tag
        if (lastIndex < html.length) {
            const remaining = html.slice(lastIndex).trim();
            if (remaining.length > 0) {
                sections.push({
                    content: remaining,
                });
            }
        }
        return sections;
    }
    /**
     * Parse HTML attributes from the text between a tag name and its closing `>`.
     *
     * Supports double-quoted, single-quoted and unquoted values as well as
     * hyphenated or namespaced attribute names. Attributes without a value map
     * to an empty string; if a name repeats, the last occurrence wins.
     *
     * @param attributeString - Raw attribute text, e.g. ` class="a" hidden`.
     * @returns Plain object mapping attribute names to their values.
     */
    parseAttributes(attributeString) {
        if (!attributeString) {
            return {};
        }
        const attrPattern = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;
        const entries = [];
        for (const match of attributeString.matchAll(attrPattern)) {
            entries.push([match[1], match[2] ?? match[3] ?? match[4] ?? ""]);
        }
        // fromEntries defines own properties, so a name such as "__proto__"
        // cannot alter the prototype of the result.
        return Object.fromEntries(entries);
    }
    /**
     * Extract plain text from HTML.
     *
     * Drops script/style elements and comments, turns block-level boundaries and
     * `<br>` into whitespace, strips the remaining tags, decodes a small set of
     * common entities and collapses whitespace runs to single spaces.
     *
     * @param html - HTML fragment.
     * @returns Trimmed plain text.
     */
    extractText(html) {
        const entityTable = new Map([
            ["nbsp", " "],
            ["amp", "&"],
            ["lt", "<"],
            ["gt", ">"],
            ["quot", '"'],
            ["apos", "'"],
            ["#39", "'"],
        ]);
        return (html
            // Remove script and style elements
            .replace(/<script[\s\S]*?<\/script>/gi, "")
            .replace(/<style[\s\S]*?<\/style>/gi, "")
            // Remove HTML comments
            .replace(/<!--[\s\S]*?-->/g, "")
            // <br> has no closing tag, so it needs its own rule
            .replace(/<br\s*\/?>/gi, "\n")
            // Replace block elements with newlines
            .replace(/<\/(p|div|br|h[1-6]|li|tr)>/gi, "\n")
            // Remove remaining tags
            .replace(/<[^>]+>/g, "")
            // Decode common entities in one pass so "&amp;lt;" becomes "&lt;", not "<"
            .replace(/&(nbsp|amp|lt|gt|quot|apos|#39);/gi, (whole, name) => entityTable.get(name.toLowerCase()) ?? whole)
            // Normalize whitespace
            .replace(/\s+/g, " ")
            .trim());
    }
    /**
     * Split content that exceeds max size.
     *
     * @param content - Text to split.
     * @param maxSize - Maximum piece length; values below 1 are treated as 1.
     * @param overlap - Characters shared between consecutive pieces; clamped to
     *   the range `0 .. maxSize - 1`.
     * @returns The content as one piece when it fits, otherwise consecutive
     *   pieces. The final piece ends at the end of the content and is never
     *   followed by overlap-only leftovers.
     */
    splitContent(content, maxSize, overlap) {
        const effectiveMaxSize = Math.max(maxSize, 1);
        const effectiveOverlap = Math.min(Math.max(overlap, 0), effectiveMaxSize - 1);
        if (content.length <= effectiveMaxSize) {
            return [content];
        }
        const pieces = [];
        let start = 0;
        while (start < content.length) {
            let end = Math.min(start + effectiveMaxSize, content.length);
            // Try to break at a natural boundary
            if (end < content.length) {
                const searchStart = Math.max(start, end - 100);
                const searchText = content.slice(searchStart, end);
                // Look for paragraph/sentence break
                const breakMatch = searchText.match(/[.!?\n]\s+/);
                if (breakMatch && breakMatch.index !== undefined) {
                    // Cut right after the punctuation / newline character.
                    end = searchStart + breakMatch.index + 1;
                }
            }
            pieces.push(content.slice(start, end));
            if (end >= content.length) {
                // Everything is consumed; stepping back by the overlap here would
                // only re-emit shrinking copies of the tail.
                break;
            }
            // Always advance by at least one character to guarantee termination.
            start = Math.max(start + 1, end - effectiveOverlap);
        }
        return pieces;
    }
    /**
     * Validate an HTML chunker configuration.
     *
     * @param config - Configuration to check; a missing config is treated as empty.
     * @returns `{ valid, errors, warnings }`, where `valid` is true when no errors were found.
     */
    validateConfig(config) {
        const errors = [];
        const warnings = [];
        const htmlConfig = config ?? {};
        if (htmlConfig.maxSize !== undefined && htmlConfig.maxSize <= 0) {
            errors.push("maxSize must be greater than 0");
        }
        if (htmlConfig.overlap !== undefined && htmlConfig.overlap < 0) {
            errors.push("overlap must be non-negative");
        }
        if (htmlConfig.overlap !== undefined &&
            htmlConfig.maxSize !== undefined &&
            htmlConfig.overlap >= htmlConfig.maxSize) {
            errors.push("overlap must be less than maxSize");
        }
        if (htmlConfig.splitTags !== undefined &&
            htmlConfig.splitTags.length === 0) {
            warnings.push("No split tags specified, using defaults");
        }
        return {
            valid: errors.length === 0,
            errors,
            warnings,
        };
    }
}
|
|
248
|
+
//# sourceMappingURL=htmlChunker.js.map
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Chunking Module Exports
|
|
3
|
+
*
|
|
4
|
+
* Provides all chunking strategies and the chunker registry.
|
|
5
|
+
*/
|
|
6
|
+
export { ChunkerRegistry, chunkText } from "./chunkerRegistry.js";
|
|
7
|
+
export { CharacterChunker } from "./characterChunker.js";
|
|
8
|
+
export { RecursiveChunker } from "./recursiveChunker.js";
|
|
9
|
+
export { SentenceChunker } from "./sentenceChunker.js";
|
|
10
|
+
export { TokenChunker } from "./tokenChunker.js";
|
|
11
|
+
export { MarkdownChunker } from "./markdownChunker.js";
|
|
12
|
+
export { HTMLChunker } from "./htmlChunker.js";
|
|
13
|
+
export { JSONChunker } from "./jsonChunker.js";
|
|
14
|
+
export { LaTeXChunker } from "./latexChunker.js";
|
|
15
|
+
export { SemanticChunker } from "./semanticChunker.js";
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Chunking Module Exports
|
|
3
|
+
*
|
|
4
|
+
* Provides all chunking strategies and the chunker registry.
|
|
5
|
+
*/
|
|
6
|
+
// Registry
|
|
7
|
+
export { ChunkerRegistry, chunkText } from "./chunkerRegistry.js";
|
|
8
|
+
// Individual chunkers
|
|
9
|
+
export { CharacterChunker } from "./characterChunker.js";
|
|
10
|
+
export { RecursiveChunker } from "./recursiveChunker.js";
|
|
11
|
+
export { SentenceChunker } from "./sentenceChunker.js";
|
|
12
|
+
export { TokenChunker } from "./tokenChunker.js";
|
|
13
|
+
export { MarkdownChunker } from "./markdownChunker.js";
|
|
14
|
+
export { HTMLChunker } from "./htmlChunker.js";
|
|
15
|
+
export { JSONChunker } from "./jsonChunker.js";
|
|
16
|
+
export { LaTeXChunker } from "./latexChunker.js";
|
|
17
|
+
export { SemanticChunker } from "./semanticChunker.js";
|
|
18
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JSON-aware Chunker
|
|
3
|
+
*
|
|
4
|
+
* Splits JSON documents based on structure (arrays, objects, keys).
|
|
5
|
+
* Best for API responses, configuration files, and structured data.
|
|
6
|
+
*/
|
|
7
|
+
import type { Chunker, Chunk, ChunkerValidationResult, JSONChunkerConfig, BaseChunkerConfig } from "../types.js";
|
|
8
|
+
/**
|
|
9
|
+
* JSON-aware chunker implementation
|
|
10
|
+
* Splits based on JSON structure
|
|
11
|
+
*/
|
|
12
|
+
export declare class JSONChunker implements Chunker {
|
|
13
|
+
/** Strategy identifier; always the literal "json". */
readonly strategy: "json";
|
|
14
|
+
/** Chunks `text` and resolves to the resulting chunks. `config` is optional; empty text resolves to an empty array, and text that fails JSON.parse resolves to a single chunk whose custom metadata carries parseError "Invalid JSON". */
chunk(text: string, config?: JSONChunkerConfig): Promise<Chunk[]>;
|
|
15
|
+
/**
|
|
16
|
+
* Recursively extract chunks from JSON structure
|
|
17
|
+
*/
|
|
18
|
+
private extractChunks;
|
|
19
|
+
/** Checks a chunker configuration and returns a ChunkerValidationResult. NOTE(review): the specific rules are not visible in this declaration - confirm against the implementation. */
validateConfig(config: BaseChunkerConfig): ChunkerValidationResult;
|
|
20
|
+
}
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JSON-aware Chunker
|
|
3
|
+
*
|
|
4
|
+
* Splits JSON documents based on structure (arrays, objects, keys).
|
|
5
|
+
* Best for API responses, configuration files, and structured data.
|
|
6
|
+
*/
|
|
7
|
+
import { randomUUID } from "crypto";
|
|
8
|
+
/**
|
|
9
|
+
* JSON-aware chunker implementation
|
|
10
|
+
* Splits based on JSON structure
|
|
11
|
+
*/
|
|
12
|
+
export class JSONChunker {
|
|
13
|
+
strategy = "json";
|
|
14
|
+
async chunk(text, config) {
|
|
15
|
+
const { maxSize = 1000, maxDepth = 10, splitKeys = [], preserveKeys = [], includeJsonPath = true, trimWhitespace = true, metadata = {}, } = config || {};
|
|
16
|
+
const documentId = randomUUID();
|
|
17
|
+
const chunks = [];
|
|
18
|
+
if (!text || text.length === 0) {
|
|
19
|
+
return chunks;
|
|
20
|
+
}
|
|
21
|
+
// Parse JSON
|
|
22
|
+
let jsonData;
|
|
23
|
+
try {
|
|
24
|
+
jsonData = JSON.parse(text);
|
|
25
|
+
}
|
|
26
|
+
catch {
|
|
27
|
+
// If not valid JSON, treat as plain text
|
|
28
|
+
chunks.push({
|
|
29
|
+
id: randomUUID(),
|
|
30
|
+
text: trimWhitespace ? text.trim() : text,
|
|
31
|
+
metadata: {
|
|
32
|
+
documentId,
|
|
33
|
+
chunkIndex: 0,
|
|
34
|
+
totalChunks: 1,
|
|
35
|
+
startPosition: 0,
|
|
36
|
+
endPosition: text.length,
|
|
37
|
+
documentType: "json",
|
|
38
|
+
custom: {
|
|
39
|
+
...metadata,
|
|
40
|
+
parseError: "Invalid JSON",
|
|
41
|
+
},
|
|
42
|
+
},
|
|
43
|
+
});
|
|
44
|
+
return chunks;
|
|
45
|
+
}
|
|
46
|
+
// Extract chunks from JSON structure
|
|
47
|
+
const extractedChunks = this.extractChunks({
|
|
48
|
+
data: jsonData,
|
|
49
|
+
path: "",
|
|
50
|
+
depth: 0,
|
|
51
|
+
maxDepth,
|
|
52
|
+
maxSize,
|
|
53
|
+
splitKeys,
|
|
54
|
+
preserveKeys,
|
|
55
|
+
includeJsonPath,
|
|
56
|
+
});
|
|
57
|
+
// Convert to Chunk objects
|
|
58
|
+
let chunkIndex = 0;
|
|
59
|
+
let currentPosition = 0;
|
|
60
|
+
for (const extracted of extractedChunks) {
|
|
61
|
+
const chunkText = JSON.stringify(extracted.value, null, 2);
|
|
62
|
+
const finalText = trimWhitespace ? chunkText.trim() : chunkText;
|
|
63
|
+
if (finalText.length > 0) {
|
|
64
|
+
const chunkMetadata = {
|
|
65
|
+
...metadata,
|
|
66
|
+
};
|
|
67
|
+
if (includeJsonPath && extracted.path) {
|
|
68
|
+
chunkMetadata.jsonPath = extracted.path;
|
|
69
|
+
}
|
|
70
|
+
chunks.push({
|
|
71
|
+
id: randomUUID(),
|
|
72
|
+
text: finalText,
|
|
73
|
+
metadata: {
|
|
74
|
+
documentId,
|
|
75
|
+
chunkIndex,
|
|
76
|
+
startPosition: currentPosition,
|
|
77
|
+
endPosition: currentPosition + finalText.length,
|
|
78
|
+
documentType: "json",
|
|
79
|
+
jsonPath: extracted.path,
|
|
80
|
+
custom: chunkMetadata,
|
|
81
|
+
},
|
|
82
|
+
});
|
|
83
|
+
chunkIndex++;
|
|
84
|
+
currentPosition += finalText.length;
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
// Update total chunks count
|
|
88
|
+
chunks.forEach((chunk) => {
|
|
89
|
+
chunk.metadata.totalChunks = chunks.length;
|
|
90
|
+
});
|
|
91
|
+
return chunks;
|
|
92
|
+
}
|
|
93
|
+
/**
|
|
94
|
+
* Recursively extract chunks from JSON structure
|
|
95
|
+
*/
|
|
96
|
+
extractChunks(options) {
|
|
97
|
+
const { data, path, depth, maxDepth, maxSize, splitKeys, preserveKeys, includeJsonPath, } = options;
|
|
98
|
+
const results = [];
|
|
99
|
+
// Check depth limit
|
|
100
|
+
if (depth > maxDepth) {
|
|
101
|
+
results.push({ value: data, path });
|
|
102
|
+
return results;
|
|
103
|
+
}
|
|
104
|
+
// Check if this should be preserved as a unit
|
|
105
|
+
const currentKey = path.split(".").pop() || "";
|
|
106
|
+
if (preserveKeys.includes(currentKey)) {
|
|
107
|
+
results.push({ value: data, path });
|
|
108
|
+
return results;
|
|
109
|
+
}
|
|
110
|
+
// Check size - if small enough, keep as one chunk
|
|
111
|
+
const serialized = JSON.stringify(data, null, 2);
|
|
112
|
+
if (serialized.length <= maxSize) {
|
|
113
|
+
results.push({ value: data, path });
|
|
114
|
+
return results;
|
|
115
|
+
}
|
|
116
|
+
// Handle arrays
|
|
117
|
+
if (Array.isArray(data)) {
|
|
118
|
+
// Check if array should be split by index
|
|
119
|
+
if (splitKeys.length === 0 || splitKeys.some((k) => path.endsWith(k))) {
|
|
120
|
+
// Split array into individual elements or groups
|
|
121
|
+
let currentGroup = [];
|
|
122
|
+
let currentGroupSize = 0;
|
|
123
|
+
for (let i = 0; i < data.length; i++) {
|
|
124
|
+
const item = data[i];
|
|
125
|
+
const itemSize = JSON.stringify(item, null, 2).length;
|
|
126
|
+
if (currentGroupSize + itemSize > maxSize &&
|
|
127
|
+
currentGroup.length > 0) {
|
|
128
|
+
// Save current group
|
|
129
|
+
results.push({
|
|
130
|
+
value: currentGroup.length === 1 ? currentGroup[0] : currentGroup,
|
|
131
|
+
path: `${path}[${i - currentGroup.length}:${i}]`,
|
|
132
|
+
});
|
|
133
|
+
currentGroup = [];
|
|
134
|
+
currentGroupSize = 0;
|
|
135
|
+
}
|
|
136
|
+
// If single item is too large, recursively split it
|
|
137
|
+
if (itemSize > maxSize) {
|
|
138
|
+
const subChunks = this.extractChunks({
|
|
139
|
+
data: item,
|
|
140
|
+
path: `${path}[${i}]`,
|
|
141
|
+
depth: depth + 1,
|
|
142
|
+
maxDepth,
|
|
143
|
+
maxSize,
|
|
144
|
+
splitKeys,
|
|
145
|
+
preserveKeys,
|
|
146
|
+
includeJsonPath,
|
|
147
|
+
});
|
|
148
|
+
results.push(...subChunks);
|
|
149
|
+
}
|
|
150
|
+
else {
|
|
151
|
+
currentGroup.push(item);
|
|
152
|
+
currentGroupSize += itemSize;
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
// Don't forget the last group
|
|
156
|
+
if (currentGroup.length > 0) {
|
|
157
|
+
results.push({
|
|
158
|
+
value: currentGroup.length === 1 ? currentGroup[0] : currentGroup,
|
|
159
|
+
path: `${path}[${data.length - currentGroup.length}:${data.length}]`,
|
|
160
|
+
});
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
else {
|
|
164
|
+
// Keep array as one unit but may need to truncate
|
|
165
|
+
results.push({ value: data, path });
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
// Handle objects
|
|
169
|
+
else if (data !== null && typeof data === "object") {
|
|
170
|
+
const obj = data;
|
|
171
|
+
const keys = Object.keys(obj);
|
|
172
|
+
// Check if any keys should be split
|
|
173
|
+
const keysToSplit = keys.filter((k) => splitKeys.length === 0 || splitKeys.includes(k));
|
|
174
|
+
if (keysToSplit.length > 0) {
|
|
175
|
+
let currentObj = {};
|
|
176
|
+
let currentObjSize = 0;
|
|
177
|
+
for (const key of keys) {
|
|
178
|
+
const value = obj[key];
|
|
179
|
+
const valueSize = JSON.stringify({ [key]: value }, null, 2).length;
|
|
180
|
+
// Check if this key should be split out
|
|
181
|
+
if (splitKeys.includes(key)) {
|
|
182
|
+
// Save current object first if it has content
|
|
183
|
+
if (Object.keys(currentObj).length > 0) {
|
|
184
|
+
results.push({
|
|
185
|
+
value: currentObj,
|
|
186
|
+
path: path,
|
|
187
|
+
});
|
|
188
|
+
currentObj = {};
|
|
189
|
+
currentObjSize = 0;
|
|
190
|
+
}
|
|
191
|
+
// Recursively process this value
|
|
192
|
+
const subChunks = this.extractChunks({
|
|
193
|
+
data: value,
|
|
194
|
+
path: path ? `${path}.${key}` : key,
|
|
195
|
+
depth: depth + 1,
|
|
196
|
+
maxDepth,
|
|
197
|
+
maxSize,
|
|
198
|
+
splitKeys,
|
|
199
|
+
preserveKeys,
|
|
200
|
+
includeJsonPath,
|
|
201
|
+
});
|
|
202
|
+
results.push(...subChunks);
|
|
203
|
+
}
|
|
204
|
+
else if (currentObjSize + valueSize > maxSize &&
|
|
205
|
+
Object.keys(currentObj).length > 0) {
|
|
206
|
+
// Save current object
|
|
207
|
+
results.push({
|
|
208
|
+
value: currentObj,
|
|
209
|
+
path: path,
|
|
210
|
+
});
|
|
211
|
+
currentObj = { [key]: value };
|
|
212
|
+
currentObjSize = valueSize;
|
|
213
|
+
}
|
|
214
|
+
else {
|
|
215
|
+
currentObj[key] = value;
|
|
216
|
+
currentObjSize += valueSize;
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
// Don't forget the last object
|
|
220
|
+
if (Object.keys(currentObj).length > 0) {
|
|
221
|
+
results.push({
|
|
222
|
+
value: currentObj,
|
|
223
|
+
path: path,
|
|
224
|
+
});
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
else {
|
|
228
|
+
// Process each key individually
|
|
229
|
+
for (const key of keys) {
|
|
230
|
+
const value = obj[key];
|
|
231
|
+
const keyPath = path ? `${path}.${key}` : key;
|
|
232
|
+
const valueSize = JSON.stringify(value, null, 2).length;
|
|
233
|
+
if (valueSize > maxSize) {
|
|
234
|
+
// Recursively split
|
|
235
|
+
const subChunks = this.extractChunks({
|
|
236
|
+
data: value,
|
|
237
|
+
path: keyPath,
|
|
238
|
+
depth: depth + 1,
|
|
239
|
+
maxDepth,
|
|
240
|
+
maxSize,
|
|
241
|
+
splitKeys,
|
|
242
|
+
preserveKeys,
|
|
243
|
+
includeJsonPath,
|
|
244
|
+
});
|
|
245
|
+
results.push(...subChunks);
|
|
246
|
+
}
|
|
247
|
+
else {
|
|
248
|
+
results.push({
|
|
249
|
+
value: { [key]: value },
|
|
250
|
+
path: keyPath,
|
|
251
|
+
});
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
// Primitive values
|
|
257
|
+
else {
|
|
258
|
+
results.push({ value: data, path });
|
|
259
|
+
}
|
|
260
|
+
return results;
|
|
261
|
+
}
|
|
262
|
+
validateConfig(config) {
|
|
263
|
+
const errors = [];
|
|
264
|
+
const warnings = [];
|
|
265
|
+
const jsonConfig = config;
|
|
266
|
+
if (jsonConfig.maxSize !== undefined && jsonConfig.maxSize <= 0) {
|
|
267
|
+
errors.push("maxSize must be greater than 0");
|
|
268
|
+
}
|
|
269
|
+
if (jsonConfig.maxDepth !== undefined && jsonConfig.maxDepth < 1) {
|
|
270
|
+
errors.push("maxDepth must be at least 1");
|
|
271
|
+
}
|
|
272
|
+
if (jsonConfig.maxDepth !== undefined && jsonConfig.maxDepth > 100) {
|
|
273
|
+
warnings.push("Very high maxDepth may cause performance issues");
|
|
274
|
+
}
|
|
275
|
+
return {
|
|
276
|
+
valid: errors.length === 0,
|
|
277
|
+
errors,
|
|
278
|
+
warnings,
|
|
279
|
+
};
|
|
280
|
+
}
|
|
281
|
+
}
|
|
282
|
+
//# sourceMappingURL=jsonChunker.js.map
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LaTeX-aware Chunker
|
|
3
|
+
*
|
|
4
|
+
* Splits LaTeX documents based on structure (sections, environments, math).
|
|
5
|
+
* Best for academic papers, scientific documents, and mathematical content.
|
|
6
|
+
*/
|
|
7
|
+
import type { BaseChunkerConfig, Chunk, Chunker, ChunkerValidationResult, LaTeXChunkerConfig } from "../types.js";
|
|
8
|
+
/**
 * Chunker for LaTeX sources.
 *
 * Implements the shared `Chunker` contract and splits documents along LaTeX
 * structure (sectioning commands and environments) rather than at fixed
 * character offsets.
 */
export declare class LaTeXChunker implements Chunker {
    /** Strategy identifier of this chunker; always the literal "latex". */
    readonly strategy: "latex";
    /**
     * NOTE(review): presumably the environment names used as split points
     * when the config supplies none; the implementation is not visible in
     * this declaration, so confirm against latexChunker.js.
     */
    private readonly defaultSplitEnvironments;
    /**
     * NOTE(review): presumably the names of math environments; confirm
     * against the implementation.
     */
    private readonly mathEnvironments;
    /**
     * Split LaTeX text into chunks.
     *
     * @param text - LaTeX source to split.
     * @param config - Optional LaTeX chunker settings.
     * @returns Promise resolving to the produced chunks.
     */
    chunk(text: string, config?: LaTeXChunkerConfig): Promise<Chunk[]>;
    /**
     * Split LaTeX by sectioning commands.
     */
    private splitBySections;
    /**
     * Split content that exceeds the maximum size.
     */
    private splitContent;
    /**
     * Validate a chunker configuration.
     *
     * @param config - Configuration to check.
     * @returns The validation result for `config`.
     */
    validateConfig(config: BaseChunkerConfig): ChunkerValidationResult;
}
|