@juspay/neurolink 9.1.1 → 9.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (555)
  1. package/CHANGELOG.md +27 -0
  2. package/README.md +106 -37
  3. package/dist/agent/directTools.d.ts +11 -11
  4. package/dist/cli/commands/config.d.ts +6 -6
  5. package/dist/cli/commands/rag.d.ts +19 -0
  6. package/dist/cli/commands/rag.js +756 -0
  7. package/dist/cli/factories/commandFactory.js +146 -83
  8. package/dist/cli/parser.js +4 -1
  9. package/dist/core/baseProvider.d.ts +43 -30
  10. package/dist/core/baseProvider.js +98 -138
  11. package/dist/core/conversationMemoryFactory.d.ts +2 -2
  12. package/dist/core/conversationMemoryFactory.js +2 -2
  13. package/dist/core/conversationMemoryInitializer.d.ts +1 -2
  14. package/dist/core/conversationMemoryInitializer.js +2 -2
  15. package/dist/core/infrastructure/baseError.d.ts +21 -0
  16. package/dist/core/infrastructure/baseError.js +22 -0
  17. package/dist/core/infrastructure/baseFactory.d.ts +21 -0
  18. package/dist/core/infrastructure/baseFactory.js +54 -0
  19. package/dist/core/infrastructure/baseRegistry.d.ts +21 -0
  20. package/dist/core/infrastructure/baseRegistry.js +49 -0
  21. package/dist/core/infrastructure/index.d.ts +5 -0
  22. package/dist/core/infrastructure/index.js +5 -0
  23. package/dist/core/infrastructure/retry.d.ts +7 -0
  24. package/dist/core/infrastructure/retry.js +20 -0
  25. package/dist/core/infrastructure/typedEventEmitter.d.ts +8 -0
  26. package/dist/core/infrastructure/typedEventEmitter.js +23 -0
  27. package/dist/core/redisConversationMemoryManager.d.ts +1 -6
  28. package/dist/core/redisConversationMemoryManager.js +7 -19
  29. package/dist/factories/providerFactory.d.ts +5 -3
  30. package/dist/factories/providerFactory.js +31 -24
  31. package/dist/image-gen/ImageGenService.d.ts +143 -0
  32. package/dist/image-gen/ImageGenService.js +345 -0
  33. package/dist/image-gen/imageGenTools.d.ts +126 -0
  34. package/dist/image-gen/imageGenTools.js +304 -0
  35. package/dist/image-gen/index.d.ts +46 -0
  36. package/dist/image-gen/index.js +48 -0
  37. package/dist/image-gen/types.d.ts +237 -0
  38. package/dist/image-gen/types.js +24 -0
  39. package/dist/index.d.ts +46 -12
  40. package/dist/index.js +88 -36
  41. package/dist/lib/agent/directTools.d.ts +8 -8
  42. package/dist/lib/core/baseProvider.d.ts +43 -30
  43. package/dist/lib/core/baseProvider.js +98 -138
  44. package/dist/lib/core/conversationMemoryFactory.d.ts +2 -2
  45. package/dist/lib/core/conversationMemoryFactory.js +2 -2
  46. package/dist/lib/core/conversationMemoryInitializer.d.ts +1 -2
  47. package/dist/lib/core/conversationMemoryInitializer.js +2 -2
  48. package/dist/lib/core/infrastructure/baseError.d.ts +21 -0
  49. package/dist/lib/core/infrastructure/baseError.js +23 -0
  50. package/dist/lib/core/infrastructure/baseFactory.d.ts +21 -0
  51. package/dist/lib/core/infrastructure/baseFactory.js +55 -0
  52. package/dist/lib/core/infrastructure/baseRegistry.d.ts +21 -0
  53. package/dist/lib/core/infrastructure/baseRegistry.js +50 -0
  54. package/dist/lib/core/infrastructure/index.d.ts +5 -0
  55. package/dist/lib/core/infrastructure/index.js +6 -0
  56. package/dist/lib/core/infrastructure/retry.d.ts +7 -0
  57. package/dist/lib/core/infrastructure/retry.js +21 -0
  58. package/dist/lib/core/infrastructure/typedEventEmitter.d.ts +8 -0
  59. package/dist/lib/core/infrastructure/typedEventEmitter.js +24 -0
  60. package/dist/lib/core/redisConversationMemoryManager.d.ts +1 -6
  61. package/dist/lib/core/redisConversationMemoryManager.js +7 -19
  62. package/dist/lib/factories/providerFactory.d.ts +5 -3
  63. package/dist/lib/factories/providerFactory.js +31 -24
  64. package/dist/lib/image-gen/ImageGenService.d.ts +143 -0
  65. package/dist/lib/image-gen/ImageGenService.js +346 -0
  66. package/dist/lib/image-gen/imageGenTools.d.ts +126 -0
  67. package/dist/lib/image-gen/imageGenTools.js +305 -0
  68. package/dist/lib/image-gen/index.d.ts +46 -0
  69. package/dist/lib/image-gen/index.js +49 -0
  70. package/dist/lib/image-gen/types.d.ts +237 -0
  71. package/dist/lib/image-gen/types.js +25 -0
  72. package/dist/lib/index.d.ts +46 -12
  73. package/dist/lib/index.js +88 -36
  74. package/dist/lib/mcp/index.d.ts +6 -5
  75. package/dist/lib/mcp/index.js +7 -5
  76. package/dist/lib/neurolink.d.ts +11 -13
  77. package/dist/lib/neurolink.js +95 -29
  78. package/dist/lib/processors/base/BaseFileProcessor.d.ts +273 -0
  79. package/dist/lib/processors/base/BaseFileProcessor.js +614 -0
  80. package/dist/lib/processors/base/index.d.ts +14 -0
  81. package/dist/lib/processors/base/index.js +20 -0
  82. package/dist/lib/processors/base/types.d.ts +593 -0
  83. package/dist/lib/processors/base/types.js +77 -0
  84. package/dist/lib/processors/cli/fileProcessorCli.d.ts +163 -0
  85. package/dist/lib/processors/cli/fileProcessorCli.js +389 -0
  86. package/dist/lib/processors/cli/index.d.ts +37 -0
  87. package/dist/lib/processors/cli/index.js +50 -0
  88. package/dist/lib/processors/code/ConfigProcessor.d.ts +171 -0
  89. package/dist/lib/processors/code/ConfigProcessor.js +401 -0
  90. package/dist/lib/processors/code/SourceCodeProcessor.d.ts +174 -0
  91. package/dist/lib/processors/code/SourceCodeProcessor.js +305 -0
  92. package/dist/lib/processors/code/index.d.ts +44 -0
  93. package/dist/lib/processors/code/index.js +61 -0
  94. package/dist/lib/processors/config/fileTypes.d.ts +283 -0
  95. package/dist/lib/processors/config/fileTypes.js +521 -0
  96. package/dist/lib/processors/config/index.d.ts +32 -0
  97. package/dist/lib/processors/config/index.js +93 -0
  98. package/dist/lib/processors/config/languageMap.d.ts +66 -0
  99. package/dist/lib/processors/config/languageMap.js +411 -0
  100. package/dist/lib/processors/config/mimeTypes.d.ts +376 -0
  101. package/dist/lib/processors/config/mimeTypes.js +339 -0
  102. package/dist/lib/processors/config/sizeLimits.d.ts +194 -0
  103. package/dist/lib/processors/config/sizeLimits.js +247 -0
  104. package/dist/lib/processors/data/JsonProcessor.d.ts +122 -0
  105. package/dist/lib/processors/data/JsonProcessor.js +204 -0
  106. package/dist/lib/processors/data/XmlProcessor.d.ts +160 -0
  107. package/dist/lib/processors/data/XmlProcessor.js +284 -0
  108. package/dist/lib/processors/data/YamlProcessor.d.ts +163 -0
  109. package/dist/lib/processors/data/YamlProcessor.js +295 -0
  110. package/dist/lib/processors/data/index.d.ts +49 -0
  111. package/dist/lib/processors/data/index.js +77 -0
  112. package/dist/lib/processors/document/ExcelProcessor.d.ts +238 -0
  113. package/dist/lib/processors/document/ExcelProcessor.js +520 -0
  114. package/dist/lib/processors/document/OpenDocumentProcessor.d.ts +69 -0
  115. package/dist/lib/processors/document/OpenDocumentProcessor.js +211 -0
  116. package/dist/lib/processors/document/RtfProcessor.d.ts +152 -0
  117. package/dist/lib/processors/document/RtfProcessor.js +362 -0
  118. package/dist/lib/processors/document/WordProcessor.d.ts +168 -0
  119. package/dist/lib/processors/document/WordProcessor.js +354 -0
  120. package/dist/lib/processors/document/index.d.ts +54 -0
  121. package/dist/lib/processors/document/index.js +91 -0
  122. package/dist/lib/processors/errors/FileErrorCode.d.ts +98 -0
  123. package/dist/lib/processors/errors/FileErrorCode.js +256 -0
  124. package/dist/lib/processors/errors/errorHelpers.d.ts +151 -0
  125. package/dist/lib/processors/errors/errorHelpers.js +379 -0
  126. package/dist/lib/processors/errors/errorSerializer.d.ts +139 -0
  127. package/dist/lib/processors/errors/errorSerializer.js +508 -0
  128. package/dist/lib/processors/errors/index.d.ts +46 -0
  129. package/dist/lib/processors/errors/index.js +50 -0
  130. package/dist/lib/processors/index.d.ts +76 -0
  131. package/dist/lib/processors/index.js +113 -0
  132. package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +244 -0
  133. package/dist/lib/processors/integration/FileProcessorIntegration.js +273 -0
  134. package/dist/lib/processors/integration/index.d.ts +42 -0
  135. package/dist/lib/processors/integration/index.js +45 -0
  136. package/dist/lib/processors/markup/HtmlProcessor.d.ts +169 -0
  137. package/dist/lib/processors/markup/HtmlProcessor.js +250 -0
  138. package/dist/lib/processors/markup/MarkdownProcessor.d.ts +165 -0
  139. package/dist/lib/processors/markup/MarkdownProcessor.js +245 -0
  140. package/dist/lib/processors/markup/SvgProcessor.d.ts +156 -0
  141. package/dist/lib/processors/markup/SvgProcessor.js +241 -0
  142. package/dist/lib/processors/markup/TextProcessor.d.ts +135 -0
  143. package/dist/lib/processors/markup/TextProcessor.js +189 -0
  144. package/dist/lib/processors/markup/index.d.ts +66 -0
  145. package/dist/lib/processors/markup/index.js +103 -0
  146. package/dist/lib/processors/registry/ProcessorRegistry.d.ts +334 -0
  147. package/dist/lib/processors/registry/ProcessorRegistry.js +609 -0
  148. package/dist/lib/processors/registry/index.d.ts +12 -0
  149. package/dist/lib/processors/registry/index.js +17 -0
  150. package/dist/lib/processors/registry/types.d.ts +53 -0
  151. package/dist/lib/processors/registry/types.js +11 -0
  152. package/dist/lib/providers/amazonBedrock.d.ts +15 -2
  153. package/dist/lib/providers/amazonBedrock.js +65 -8
  154. package/dist/lib/providers/anthropic.d.ts +3 -3
  155. package/dist/lib/providers/anthropic.js +10 -7
  156. package/dist/lib/providers/googleAiStudio.d.ts +5 -5
  157. package/dist/lib/providers/googleAiStudio.js +10 -7
  158. package/dist/lib/providers/googleVertex.d.ts +16 -4
  159. package/dist/lib/providers/googleVertex.js +72 -16
  160. package/dist/lib/providers/litellm.d.ts +3 -3
  161. package/dist/lib/providers/litellm.js +10 -10
  162. package/dist/lib/providers/mistral.d.ts +3 -3
  163. package/dist/lib/providers/mistral.js +7 -6
  164. package/dist/lib/providers/ollama.d.ts +3 -4
  165. package/dist/lib/providers/ollama.js +7 -8
  166. package/dist/lib/providers/openAI.d.ts +14 -2
  167. package/dist/lib/providers/openAI.js +60 -6
  168. package/dist/lib/providers/openRouter.d.ts +2 -2
  169. package/dist/lib/providers/openRouter.js +10 -6
  170. package/dist/lib/providers/sagemaker/language-model.d.ts +2 -2
  171. package/dist/lib/rag/ChunkerFactory.d.ts +91 -0
  172. package/dist/lib/rag/ChunkerFactory.js +321 -0
  173. package/dist/lib/rag/ChunkerRegistry.d.ts +91 -0
  174. package/dist/lib/rag/ChunkerRegistry.js +422 -0
  175. package/dist/lib/rag/chunkers/BaseChunker.d.ts +53 -0
  176. package/dist/lib/rag/chunkers/BaseChunker.js +144 -0
  177. package/dist/lib/rag/chunkers/CharacterChunker.d.ts +18 -0
  178. package/dist/lib/rag/chunkers/CharacterChunker.js +29 -0
  179. package/dist/lib/rag/chunkers/HTMLChunker.d.ts +19 -0
  180. package/dist/lib/rag/chunkers/HTMLChunker.js +39 -0
  181. package/dist/lib/rag/chunkers/JSONChunker.d.ts +19 -0
  182. package/dist/lib/rag/chunkers/JSONChunker.js +69 -0
  183. package/dist/lib/rag/chunkers/LaTeXChunker.d.ts +15 -0
  184. package/dist/lib/rag/chunkers/LaTeXChunker.js +64 -0
  185. package/dist/lib/rag/chunkers/MarkdownChunker.d.ts +15 -0
  186. package/dist/lib/rag/chunkers/MarkdownChunker.js +103 -0
  187. package/dist/lib/rag/chunkers/RecursiveChunker.d.ts +27 -0
  188. package/dist/lib/rag/chunkers/RecursiveChunker.js +140 -0
  189. package/dist/lib/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
  190. package/dist/lib/rag/chunkers/SemanticMarkdownChunker.js +139 -0
  191. package/dist/lib/rag/chunkers/SentenceChunker.d.ts +19 -0
  192. package/dist/lib/rag/chunkers/SentenceChunker.js +67 -0
  193. package/dist/lib/rag/chunkers/TokenChunker.d.ts +19 -0
  194. package/dist/lib/rag/chunkers/TokenChunker.js +62 -0
  195. package/dist/lib/rag/chunkers/index.d.ts +15 -0
  196. package/dist/lib/rag/chunkers/index.js +16 -0
  197. package/dist/lib/rag/chunking/characterChunker.d.ts +16 -0
  198. package/dist/lib/rag/chunking/characterChunker.js +143 -0
  199. package/dist/lib/rag/chunking/chunkerRegistry.d.ts +67 -0
  200. package/dist/lib/rag/chunking/chunkerRegistry.js +195 -0
  201. package/dist/lib/rag/chunking/htmlChunker.d.ts +34 -0
  202. package/dist/lib/rag/chunking/htmlChunker.js +248 -0
  203. package/dist/lib/rag/chunking/index.d.ts +15 -0
  204. package/dist/lib/rag/chunking/index.js +18 -0
  205. package/dist/lib/rag/chunking/jsonChunker.d.ts +20 -0
  206. package/dist/lib/rag/chunking/jsonChunker.js +282 -0
  207. package/dist/lib/rag/chunking/latexChunker.d.ts +26 -0
  208. package/dist/lib/rag/chunking/latexChunker.js +252 -0
  209. package/dist/lib/rag/chunking/markdownChunker.d.ts +19 -0
  210. package/dist/lib/rag/chunking/markdownChunker.js +202 -0
  211. package/dist/lib/rag/chunking/recursiveChunker.d.ts +19 -0
  212. package/dist/lib/rag/chunking/recursiveChunker.js +149 -0
  213. package/dist/lib/rag/chunking/semanticChunker.d.ts +41 -0
  214. package/dist/lib/rag/chunking/semanticChunker.js +307 -0
  215. package/dist/lib/rag/chunking/sentenceChunker.d.ts +25 -0
  216. package/dist/lib/rag/chunking/sentenceChunker.js +231 -0
  217. package/dist/lib/rag/chunking/tokenChunker.d.ts +36 -0
  218. package/dist/lib/rag/chunking/tokenChunker.js +184 -0
  219. package/dist/lib/rag/document/MDocument.d.ts +198 -0
  220. package/dist/lib/rag/document/MDocument.js +393 -0
  221. package/dist/lib/rag/document/index.d.ts +5 -0
  222. package/dist/lib/rag/document/index.js +6 -0
  223. package/dist/lib/rag/document/loaders.d.ts +201 -0
  224. package/dist/lib/rag/document/loaders.js +501 -0
  225. package/dist/lib/rag/errors/RAGError.d.ts +244 -0
  226. package/dist/lib/rag/errors/RAGError.js +275 -0
  227. package/dist/lib/rag/errors/index.d.ts +6 -0
  228. package/dist/lib/rag/errors/index.js +7 -0
  229. package/dist/lib/rag/graphRag/graphRAG.d.ts +115 -0
  230. package/dist/lib/rag/graphRag/graphRAG.js +385 -0
  231. package/dist/lib/rag/graphRag/index.d.ts +4 -0
  232. package/dist/lib/rag/graphRag/index.js +5 -0
  233. package/dist/lib/rag/index.d.ts +103 -0
  234. package/dist/lib/rag/index.js +142 -0
  235. package/dist/lib/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
  236. package/dist/lib/rag/metadata/MetadataExtractorFactory.js +419 -0
  237. package/dist/lib/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
  238. package/dist/lib/rag/metadata/MetadataExtractorRegistry.js +363 -0
  239. package/dist/lib/rag/metadata/index.d.ts +6 -0
  240. package/dist/lib/rag/metadata/index.js +10 -0
  241. package/dist/lib/rag/metadata/metadataExtractor.d.ts +69 -0
  242. package/dist/lib/rag/metadata/metadataExtractor.js +278 -0
  243. package/dist/lib/rag/pipeline/RAGPipeline.d.ts +235 -0
  244. package/dist/lib/rag/pipeline/RAGPipeline.js +402 -0
  245. package/dist/lib/rag/pipeline/contextAssembly.d.ts +126 -0
  246. package/dist/lib/rag/pipeline/contextAssembly.js +338 -0
  247. package/dist/lib/rag/pipeline/index.d.ts +5 -0
  248. package/dist/lib/rag/pipeline/index.js +6 -0
  249. package/dist/lib/rag/ragIntegration.d.ts +38 -0
  250. package/dist/lib/rag/ragIntegration.js +212 -0
  251. package/dist/lib/rag/reranker/RerankerFactory.d.ts +184 -0
  252. package/dist/lib/rag/reranker/RerankerFactory.js +431 -0
  253. package/dist/lib/rag/reranker/RerankerRegistry.d.ts +119 -0
  254. package/dist/lib/rag/reranker/RerankerRegistry.js +403 -0
  255. package/dist/lib/rag/reranker/index.d.ts +6 -0
  256. package/dist/lib/rag/reranker/index.js +10 -0
  257. package/dist/lib/rag/reranker/reranker.d.ts +71 -0
  258. package/dist/lib/rag/reranker/reranker.js +278 -0
  259. package/dist/lib/rag/resilience/CircuitBreaker.d.ts +215 -0
  260. package/dist/lib/rag/resilience/CircuitBreaker.js +432 -0
  261. package/dist/lib/rag/resilience/RetryHandler.d.ts +115 -0
  262. package/dist/lib/rag/resilience/RetryHandler.js +301 -0
  263. package/dist/lib/rag/resilience/index.d.ts +7 -0
  264. package/dist/lib/rag/resilience/index.js +8 -0
  265. package/dist/lib/rag/retrieval/hybridSearch.d.ts +94 -0
  266. package/dist/lib/rag/retrieval/hybridSearch.js +314 -0
  267. package/dist/lib/rag/retrieval/index.d.ts +5 -0
  268. package/dist/lib/rag/retrieval/index.js +6 -0
  269. package/dist/lib/rag/retrieval/vectorQueryTool.d.ts +93 -0
  270. package/dist/lib/rag/retrieval/vectorQueryTool.js +290 -0
  271. package/dist/lib/rag/types.d.ts +768 -0
  272. package/dist/lib/rag/types.js +9 -0
  273. package/dist/lib/server/index.d.ts +15 -11
  274. package/dist/lib/server/index.js +55 -51
  275. package/dist/lib/server/utils/validation.d.ts +2 -2
  276. package/dist/lib/types/common.d.ts +0 -1
  277. package/dist/lib/types/fileTypes.d.ts +1 -1
  278. package/dist/lib/types/generateTypes.d.ts +42 -8
  279. package/dist/lib/types/generateTypes.js +1 -1
  280. package/dist/lib/types/index.d.ts +25 -24
  281. package/dist/lib/types/index.js +21 -20
  282. package/dist/lib/types/modelTypes.d.ts +16 -16
  283. package/dist/lib/types/pptTypes.d.ts +14 -2
  284. package/dist/lib/types/pptTypes.js +16 -0
  285. package/dist/lib/types/streamTypes.d.ts +28 -8
  286. package/dist/lib/types/streamTypes.js +1 -1
  287. package/dist/lib/utils/async/delay.d.ts +40 -0
  288. package/dist/lib/utils/async/delay.js +43 -0
  289. package/dist/lib/utils/async/index.d.ts +23 -0
  290. package/dist/lib/utils/async/index.js +24 -0
  291. package/dist/lib/utils/async/retry.d.ts +141 -0
  292. package/dist/lib/utils/async/retry.js +172 -0
  293. package/dist/lib/utils/async/withTimeout.d.ts +73 -0
  294. package/dist/lib/utils/async/withTimeout.js +97 -0
  295. package/dist/lib/utils/fileDetector.d.ts +7 -1
  296. package/dist/lib/utils/fileDetector.js +91 -18
  297. package/dist/lib/utils/json/extract.d.ts +103 -0
  298. package/dist/lib/utils/json/extract.js +249 -0
  299. package/dist/lib/utils/json/index.d.ts +36 -0
  300. package/dist/lib/utils/json/index.js +37 -0
  301. package/dist/lib/utils/json/safeParse.d.ts +137 -0
  302. package/dist/lib/utils/json/safeParse.js +191 -0
  303. package/dist/lib/utils/messageBuilder.d.ts +2 -2
  304. package/dist/lib/utils/messageBuilder.js +15 -7
  305. package/dist/lib/utils/modelRouter.d.ts +4 -4
  306. package/dist/lib/utils/modelRouter.js +4 -4
  307. package/dist/lib/utils/sanitizers/filename.d.ts +137 -0
  308. package/dist/lib/utils/sanitizers/filename.js +366 -0
  309. package/dist/lib/utils/sanitizers/html.d.ts +170 -0
  310. package/dist/lib/utils/sanitizers/html.js +326 -0
  311. package/dist/lib/utils/sanitizers/index.d.ts +26 -0
  312. package/dist/lib/utils/sanitizers/index.js +30 -0
  313. package/dist/lib/utils/sanitizers/svg.d.ts +81 -0
  314. package/dist/lib/utils/sanitizers/svg.js +483 -0
  315. package/dist/mcp/index.d.ts +6 -5
  316. package/dist/mcp/index.js +7 -5
  317. package/dist/neurolink.d.ts +11 -13
  318. package/dist/neurolink.js +95 -29
  319. package/dist/processors/base/BaseFileProcessor.d.ts +273 -0
  320. package/dist/processors/base/BaseFileProcessor.js +613 -0
  321. package/dist/processors/base/index.d.ts +14 -0
  322. package/dist/processors/base/index.js +19 -0
  323. package/dist/processors/base/types.d.ts +593 -0
  324. package/dist/processors/base/types.js +76 -0
  325. package/dist/processors/cli/fileProcessorCli.d.ts +163 -0
  326. package/dist/processors/cli/fileProcessorCli.js +388 -0
  327. package/dist/processors/cli/index.d.ts +37 -0
  328. package/dist/processors/cli/index.js +49 -0
  329. package/dist/processors/code/ConfigProcessor.d.ts +171 -0
  330. package/dist/processors/code/ConfigProcessor.js +400 -0
  331. package/dist/processors/code/SourceCodeProcessor.d.ts +174 -0
  332. package/dist/processors/code/SourceCodeProcessor.js +304 -0
  333. package/dist/processors/code/index.d.ts +44 -0
  334. package/dist/processors/code/index.js +60 -0
  335. package/dist/processors/config/fileTypes.d.ts +283 -0
  336. package/dist/processors/config/fileTypes.js +520 -0
  337. package/dist/processors/config/index.d.ts +32 -0
  338. package/dist/processors/config/index.js +92 -0
  339. package/dist/processors/config/languageMap.d.ts +66 -0
  340. package/dist/processors/config/languageMap.js +410 -0
  341. package/dist/processors/config/mimeTypes.d.ts +376 -0
  342. package/dist/processors/config/mimeTypes.js +338 -0
  343. package/dist/processors/config/sizeLimits.d.ts +194 -0
  344. package/dist/processors/config/sizeLimits.js +246 -0
  345. package/dist/processors/data/JsonProcessor.d.ts +122 -0
  346. package/dist/processors/data/JsonProcessor.js +203 -0
  347. package/dist/processors/data/XmlProcessor.d.ts +160 -0
  348. package/dist/processors/data/XmlProcessor.js +283 -0
  349. package/dist/processors/data/YamlProcessor.d.ts +163 -0
  350. package/dist/processors/data/YamlProcessor.js +294 -0
  351. package/dist/processors/data/index.d.ts +49 -0
  352. package/dist/processors/data/index.js +76 -0
  353. package/dist/processors/document/ExcelProcessor.d.ts +238 -0
  354. package/dist/processors/document/ExcelProcessor.js +519 -0
  355. package/dist/processors/document/OpenDocumentProcessor.d.ts +69 -0
  356. package/dist/processors/document/OpenDocumentProcessor.js +210 -0
  357. package/dist/processors/document/RtfProcessor.d.ts +152 -0
  358. package/dist/processors/document/RtfProcessor.js +361 -0
  359. package/dist/processors/document/WordProcessor.d.ts +168 -0
  360. package/dist/processors/document/WordProcessor.js +353 -0
  361. package/dist/processors/document/index.d.ts +54 -0
  362. package/dist/processors/document/index.js +90 -0
  363. package/dist/processors/errors/FileErrorCode.d.ts +98 -0
  364. package/dist/processors/errors/FileErrorCode.js +255 -0
  365. package/dist/processors/errors/errorHelpers.d.ts +151 -0
  366. package/dist/processors/errors/errorHelpers.js +378 -0
  367. package/dist/processors/errors/errorSerializer.d.ts +139 -0
  368. package/dist/processors/errors/errorSerializer.js +507 -0
  369. package/dist/processors/errors/index.d.ts +46 -0
  370. package/dist/processors/errors/index.js +49 -0
  371. package/dist/processors/index.d.ts +76 -0
  372. package/dist/processors/index.js +112 -0
  373. package/dist/processors/integration/FileProcessorIntegration.d.ts +244 -0
  374. package/dist/processors/integration/FileProcessorIntegration.js +272 -0
  375. package/dist/processors/integration/index.d.ts +42 -0
  376. package/dist/processors/integration/index.js +44 -0
  377. package/dist/processors/markup/HtmlProcessor.d.ts +169 -0
  378. package/dist/processors/markup/HtmlProcessor.js +249 -0
  379. package/dist/processors/markup/MarkdownProcessor.d.ts +165 -0
  380. package/dist/processors/markup/MarkdownProcessor.js +244 -0
  381. package/dist/processors/markup/SvgProcessor.d.ts +156 -0
  382. package/dist/processors/markup/SvgProcessor.js +240 -0
  383. package/dist/processors/markup/TextProcessor.d.ts +135 -0
  384. package/dist/processors/markup/TextProcessor.js +188 -0
  385. package/dist/processors/markup/index.d.ts +66 -0
  386. package/dist/processors/markup/index.js +102 -0
  387. package/dist/processors/registry/ProcessorRegistry.d.ts +334 -0
  388. package/dist/processors/registry/ProcessorRegistry.js +608 -0
  389. package/dist/processors/registry/index.d.ts +12 -0
  390. package/dist/processors/registry/index.js +16 -0
  391. package/dist/processors/registry/types.d.ts +53 -0
  392. package/dist/processors/registry/types.js +10 -0
  393. package/dist/providers/amazonBedrock.d.ts +15 -2
  394. package/dist/providers/amazonBedrock.js +65 -8
  395. package/dist/providers/anthropic.d.ts +3 -3
  396. package/dist/providers/anthropic.js +10 -7
  397. package/dist/providers/googleAiStudio.d.ts +5 -5
  398. package/dist/providers/googleAiStudio.js +10 -7
  399. package/dist/providers/googleVertex.d.ts +16 -4
  400. package/dist/providers/googleVertex.js +72 -16
  401. package/dist/providers/litellm.d.ts +3 -3
  402. package/dist/providers/litellm.js +10 -10
  403. package/dist/providers/mistral.d.ts +3 -3
  404. package/dist/providers/mistral.js +7 -6
  405. package/dist/providers/ollama.d.ts +3 -4
  406. package/dist/providers/ollama.js +7 -8
  407. package/dist/providers/openAI.d.ts +14 -2
  408. package/dist/providers/openAI.js +60 -6
  409. package/dist/providers/openRouter.d.ts +2 -2
  410. package/dist/providers/openRouter.js +10 -6
  411. package/dist/rag/ChunkerFactory.d.ts +91 -0
  412. package/dist/rag/ChunkerFactory.js +320 -0
  413. package/dist/rag/ChunkerRegistry.d.ts +91 -0
  414. package/dist/rag/ChunkerRegistry.js +421 -0
  415. package/dist/rag/chunkers/BaseChunker.d.ts +53 -0
  416. package/dist/rag/chunkers/BaseChunker.js +143 -0
  417. package/dist/rag/chunkers/CharacterChunker.d.ts +18 -0
  418. package/dist/rag/chunkers/CharacterChunker.js +28 -0
  419. package/dist/rag/chunkers/HTMLChunker.d.ts +19 -0
  420. package/dist/rag/chunkers/HTMLChunker.js +38 -0
  421. package/dist/rag/chunkers/JSONChunker.d.ts +19 -0
  422. package/dist/rag/chunkers/JSONChunker.js +68 -0
  423. package/dist/rag/chunkers/LaTeXChunker.d.ts +15 -0
  424. package/dist/rag/chunkers/LaTeXChunker.js +63 -0
  425. package/dist/rag/chunkers/MarkdownChunker.d.ts +15 -0
  426. package/dist/rag/chunkers/MarkdownChunker.js +102 -0
  427. package/dist/rag/chunkers/RecursiveChunker.d.ts +27 -0
  428. package/dist/rag/chunkers/RecursiveChunker.js +139 -0
  429. package/dist/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
  430. package/dist/rag/chunkers/SemanticMarkdownChunker.js +138 -0
  431. package/dist/rag/chunkers/SentenceChunker.d.ts +19 -0
  432. package/dist/rag/chunkers/SentenceChunker.js +66 -0
  433. package/dist/rag/chunkers/TokenChunker.d.ts +19 -0
  434. package/dist/rag/chunkers/TokenChunker.js +61 -0
  435. package/dist/rag/chunkers/index.d.ts +15 -0
  436. package/dist/rag/chunkers/index.js +15 -0
  437. package/dist/rag/chunking/characterChunker.d.ts +16 -0
  438. package/dist/rag/chunking/characterChunker.js +142 -0
  439. package/dist/rag/chunking/chunkerRegistry.d.ts +67 -0
  440. package/dist/rag/chunking/chunkerRegistry.js +194 -0
  441. package/dist/rag/chunking/htmlChunker.d.ts +34 -0
  442. package/dist/rag/chunking/htmlChunker.js +247 -0
  443. package/dist/rag/chunking/index.d.ts +15 -0
  444. package/dist/rag/chunking/index.js +17 -0
  445. package/dist/rag/chunking/jsonChunker.d.ts +20 -0
  446. package/dist/rag/chunking/jsonChunker.js +281 -0
  447. package/dist/rag/chunking/latexChunker.d.ts +26 -0
  448. package/dist/rag/chunking/latexChunker.js +251 -0
  449. package/dist/rag/chunking/markdownChunker.d.ts +19 -0
  450. package/dist/rag/chunking/markdownChunker.js +201 -0
  451. package/dist/rag/chunking/recursiveChunker.d.ts +19 -0
  452. package/dist/rag/chunking/recursiveChunker.js +148 -0
  453. package/dist/rag/chunking/semanticChunker.d.ts +41 -0
  454. package/dist/rag/chunking/semanticChunker.js +306 -0
  455. package/dist/rag/chunking/sentenceChunker.d.ts +25 -0
  456. package/dist/rag/chunking/sentenceChunker.js +230 -0
  457. package/dist/rag/chunking/tokenChunker.d.ts +36 -0
  458. package/dist/rag/chunking/tokenChunker.js +183 -0
  459. package/dist/rag/document/MDocument.d.ts +198 -0
  460. package/dist/rag/document/MDocument.js +392 -0
  461. package/dist/rag/document/index.d.ts +5 -0
  462. package/dist/rag/document/index.js +5 -0
  463. package/dist/rag/document/loaders.d.ts +201 -0
  464. package/dist/rag/document/loaders.js +500 -0
  465. package/dist/rag/errors/RAGError.d.ts +244 -0
  466. package/dist/rag/errors/RAGError.js +274 -0
  467. package/dist/rag/errors/index.d.ts +6 -0
  468. package/dist/rag/errors/index.js +6 -0
  469. package/dist/rag/graphRag/graphRAG.d.ts +115 -0
  470. package/dist/rag/graphRag/graphRAG.js +384 -0
  471. package/dist/rag/graphRag/index.d.ts +4 -0
  472. package/dist/rag/graphRag/index.js +4 -0
  473. package/dist/rag/index.d.ts +103 -0
  474. package/dist/rag/index.js +141 -0
  475. package/dist/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
  476. package/dist/rag/metadata/MetadataExtractorFactory.js +418 -0
  477. package/dist/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
  478. package/dist/rag/metadata/MetadataExtractorRegistry.js +362 -0
  479. package/dist/rag/metadata/index.d.ts +6 -0
  480. package/dist/rag/metadata/index.js +9 -0
  481. package/dist/rag/metadata/metadataExtractor.d.ts +69 -0
  482. package/dist/rag/metadata/metadataExtractor.js +277 -0
  483. package/dist/rag/pipeline/RAGPipeline.d.ts +235 -0
  484. package/dist/rag/pipeline/RAGPipeline.js +401 -0
  485. package/dist/rag/pipeline/contextAssembly.d.ts +126 -0
  486. package/dist/rag/pipeline/contextAssembly.js +337 -0
  487. package/dist/rag/pipeline/index.d.ts +5 -0
  488. package/dist/rag/pipeline/index.js +5 -0
  489. package/dist/rag/ragIntegration.d.ts +38 -0
  490. package/dist/rag/ragIntegration.js +211 -0
  491. package/dist/rag/reranker/RerankerFactory.d.ts +184 -0
  492. package/dist/rag/reranker/RerankerFactory.js +430 -0
  493. package/dist/rag/reranker/RerankerRegistry.d.ts +119 -0
  494. package/dist/rag/reranker/RerankerRegistry.js +402 -0
  495. package/dist/rag/reranker/index.d.ts +6 -0
  496. package/dist/rag/reranker/index.js +9 -0
  497. package/dist/rag/reranker/reranker.d.ts +71 -0
  498. package/dist/rag/reranker/reranker.js +277 -0
  499. package/dist/rag/resilience/CircuitBreaker.d.ts +215 -0
  500. package/dist/rag/resilience/CircuitBreaker.js +431 -0
  501. package/dist/rag/resilience/RetryHandler.d.ts +115 -0
  502. package/dist/rag/resilience/RetryHandler.js +300 -0
  503. package/dist/rag/resilience/index.d.ts +7 -0
  504. package/dist/rag/resilience/index.js +7 -0
  505. package/dist/rag/retrieval/hybridSearch.d.ts +94 -0
  506. package/dist/rag/retrieval/hybridSearch.js +313 -0
  507. package/dist/rag/retrieval/index.d.ts +5 -0
  508. package/dist/rag/retrieval/index.js +5 -0
  509. package/dist/rag/retrieval/vectorQueryTool.d.ts +93 -0
  510. package/dist/rag/retrieval/vectorQueryTool.js +289 -0
  511. package/dist/rag/types.d.ts +768 -0
  512. package/dist/rag/types.js +8 -0
  513. package/dist/server/index.d.ts +15 -11
  514. package/dist/server/index.js +55 -51
  515. package/dist/server/utils/validation.d.ts +8 -8
  516. package/dist/types/common.d.ts +0 -1
  517. package/dist/types/fileTypes.d.ts +1 -1
  518. package/dist/types/generateTypes.d.ts +42 -8
  519. package/dist/types/generateTypes.js +1 -1
  520. package/dist/types/index.d.ts +25 -24
  521. package/dist/types/index.js +21 -20
  522. package/dist/types/modelTypes.d.ts +10 -10
  523. package/dist/types/pptTypes.d.ts +14 -2
  524. package/dist/types/pptTypes.js +16 -0
  525. package/dist/types/streamTypes.d.ts +28 -8
  526. package/dist/types/streamTypes.js +1 -1
  527. package/dist/utils/async/delay.d.ts +40 -0
  528. package/dist/utils/async/delay.js +42 -0
  529. package/dist/utils/async/index.d.ts +23 -0
  530. package/dist/utils/async/index.js +23 -0
  531. package/dist/utils/async/retry.d.ts +141 -0
  532. package/dist/utils/async/retry.js +171 -0
  533. package/dist/utils/async/withTimeout.d.ts +73 -0
  534. package/dist/utils/async/withTimeout.js +96 -0
  535. package/dist/utils/fileDetector.d.ts +7 -1
  536. package/dist/utils/fileDetector.js +91 -18
  537. package/dist/utils/json/extract.d.ts +103 -0
  538. package/dist/utils/json/extract.js +248 -0
  539. package/dist/utils/json/index.d.ts +36 -0
  540. package/dist/utils/json/index.js +36 -0
  541. package/dist/utils/json/safeParse.d.ts +137 -0
  542. package/dist/utils/json/safeParse.js +190 -0
  543. package/dist/utils/messageBuilder.d.ts +2 -2
  544. package/dist/utils/messageBuilder.js +15 -7
  545. package/dist/utils/modelRouter.d.ts +4 -4
  546. package/dist/utils/modelRouter.js +4 -4
  547. package/dist/utils/sanitizers/filename.d.ts +137 -0
  548. package/dist/utils/sanitizers/filename.js +365 -0
  549. package/dist/utils/sanitizers/html.d.ts +170 -0
  550. package/dist/utils/sanitizers/html.js +325 -0
  551. package/dist/utils/sanitizers/index.d.ts +26 -0
  552. package/dist/utils/sanitizers/index.js +29 -0
  553. package/dist/utils/sanitizers/svg.d.ts +81 -0
  554. package/dist/utils/sanitizers/svg.js +482 -0
  555. package/package.json +2 -2
@@ -0,0 +1,251 @@
1
+ /**
2
+ * LaTeX-aware Chunker
3
+ *
4
+ * Splits LaTeX documents based on structure (sections, environments, math).
5
+ * Best for academic papers, scientific documents, and mathematical content.
6
+ */
7
+ import { randomUUID } from "crypto";
8
+ /**
9
+ * LaTeX-aware chunker implementation
10
+ * Splits based on LaTeX structure (sections, environments)
11
+ */
12
export class LaTeXChunker {
    /** Strategy identifier reported by this chunker. */
    strategy = "latex";
    /** Sectioning commands used as split points when the caller supplies none (or an empty list). */
    defaultSplitEnvironments = [
        "section",
        "subsection",
        "subsubsection",
        "chapter",
        "part",
    ];
    /** Display-math environments that are swapped for placeholders while sections are located. */
    mathEnvironments = [
        "equation",
        "equation*",
        "align",
        "align*",
        "gather",
        "gather*",
        "multline",
        "multline*",
        "displaymath",
    ];
    /**
     * Splits a LaTeX document into chunks along its sectioning commands.
     *
     * Steps: (1) if the text has a `\begin{document}…\end{document}` body, the
     * text before it is treated as the preamble and optionally emitted as the
     * first chunk; (2) math is temporarily replaced by placeholders so the
     * section scan cannot be confused by it; (3) the body is split at the
     * sectioning commands; (4) math is restored and each section is split
     * again if it is longer than `maxSize`.
     *
     * `startPosition` / `endPosition` are running character counts over the
     * emitted chunk text, not exact offsets into `text`.
     *
     * @param text - LaTeX source to chunk.
     * @param config - Optional settings: `maxSize` (1000), `overlap` (0),
     *   `splitEnvironments` (defaults above), `preserveMath` (true),
     *   `includePreamble` (true), `trimWhitespace` (true), `metadata` ({}).
     * @returns The chunks in document order; an empty array for empty input.
     */
    async chunk(text, config) {
        const { maxSize = 1000, overlap = 0, splitEnvironments = this.defaultSplitEnvironments, preserveMath = true, includePreamble = true, trimWhitespace = true, metadata = {}, } = config || {};
        const documentId = randomUUID();
        const chunks = [];
        if (!text || text.length === 0) {
            return chunks;
        }
        // Extract preamble if present
        const preambleMatch = text.match(/^([\s\S]*?)\\begin\{document\}([\s\S]*?)\\end\{document\}/);
        let preamble = "";
        let documentContent = text;
        if (preambleMatch) {
            preamble = preambleMatch[1].trim();
            documentContent = preambleMatch[2];
            // Add preamble as first chunk if requested
            if (includePreamble && preamble.length > 0) {
                chunks.push({
                    id: randomUUID(),
                    text: preamble,
                    metadata: {
                        documentId,
                        chunkIndex: 0,
                        startPosition: 0,
                        endPosition: preamble.length,
                        documentType: "latex",
                        latexEnvironment: "preamble",
                        custom: metadata,
                    },
                });
            }
        }
        // Protect math so "{", "}" and "\" inside formulas cannot affect the section scan
        let processedContent = documentContent;
        let mathBlocks = [];
        if (preserveMath) {
            const shielded = this.protectMath(documentContent);
            processedContent = shielded.content;
            mathBlocks = shielded.mathBlocks;
        }
        // Split by sectioning commands
        const sections = this.splitBySections(processedContent, splitEnvironments);
        let chunkIndex = chunks.length;
        let currentPosition = includePreamble && preamble.length > 0 ? preamble.length : 0;
        for (const section of sections) {
            const { title, content, environment } = section;
            // Restore math in the body AND in the title: titles such as
            // "\section{The $\alpha$ case}" were captured from the protected text.
            const restoredContent = this.restoreMath(content, mathBlocks);
            const restoredTitle = title ? this.restoreMath(title, mathBlocks) : title;
            // Split if content is too large
            const contentChunks = this.splitContent(restoredContent, maxSize, overlap);
            for (let i = 0; i < contentChunks.length; i++) {
                let chunkText = contentChunks[i];
                // Include section command in first chunk
                if (i === 0 && restoredTitle && environment) {
                    chunkText = `\\${environment}{${restoredTitle}}\n${chunkText}`;
                }
                const finalText = trimWhitespace ? chunkText.trim() : chunkText;
                if (finalText.length > 0) {
                    chunks.push({
                        id: randomUUID(),
                        text: finalText,
                        metadata: {
                            documentId,
                            chunkIndex,
                            startPosition: currentPosition,
                            endPosition: currentPosition + chunkText.length,
                            documentType: "latex",
                            latexEnvironment: environment ?? undefined,
                            header: restoredTitle ?? undefined,
                            custom: metadata,
                        },
                    });
                    chunkIndex++;
                }
                currentPosition += chunkText.length;
            }
        }
        // Update total chunks count
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }
    /**
     * Escapes regular-expression metacharacters so a name can be embedded
     * literally in a pattern (needed for starred names such as "align*").
     */
    escapeRegExp(value) {
        return String(value).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    }
    /**
     * Replaces math (listed environments, `$$…$$`, `$…$`, `\[…\]`) with
     * `__MATH_n__` placeholders.
     *
     * @returns `content` with placeholders plus the list of
     *   `{ placeholder, content }` records needed to undo the replacement.
     */
    protectMath(content) {
        const mathBlocks = [];
        const stash = (match) => {
            const placeholder = `__MATH_${mathBlocks.length}__`;
            mathBlocks.push({ placeholder, content: match });
            return placeholder;
        };
        let shielded = content;
        // Display math environments. The name is escaped so that "equation*"
        // matches a literal "*" instead of acting as a quantifier on "n".
        for (const env of this.mathEnvironments) {
            const name = this.escapeRegExp(env);
            const envPattern = new RegExp(`\\\\begin\\{${name}\\}[\\s\\S]*?\\\\end\\{${name}\\}`, "g");
            shielded = shielded.replace(envPattern, stash);
        }
        // $$ … $$ must be handled before $ … $
        shielded = shielded.replace(/\$\$[\s\S]*?\$\$/g, stash);
        shielded = shielded.replace(/\$[^$]+\$/g, stash);
        // \[ … \] math
        shielded = shielded.replace(/\\\[[\s\S]*?\\\]/g, stash);
        return { content: shielded, mathBlocks };
    }
    /**
     * Puts the original math back in place of its placeholders.
     *
     * Blocks are restored newest-first because a block captured by a later
     * pass may itself contain placeholders from an earlier pass. A replacer
     * function is used so "$" sequences in the math (e.g. "$$") are inserted
     * verbatim instead of being read as replacement patterns.
     */
    restoreMath(content, mathBlocks) {
        let restored = content;
        for (let i = mathBlocks.length - 1; i >= 0; i--) {
            const { placeholder, content: mathContent } = mathBlocks[i];
            restored = restored.replace(placeholder, () => mathContent);
        }
        return restored;
    }
    /**
     * Split LaTeX by sectioning commands.
     *
     * @param content - Document body (math may already be replaced by placeholders).
     * @param splitEnvironments - Command names (without backslash) to split on;
     *   an empty or missing list falls back to `defaultSplitEnvironments`.
     * @returns Sections as `{ title, content, environment }`; text before the
     *   first command has `title` and `environment` set to `null`.
     */
    splitBySections(content, splitEnvironments) {
        const sections = [];
        const environments = Array.isArray(splitEnvironments) && splitEnvironments.length > 0
            ? splitEnvironments
            : this.defaultSplitEnvironments;
        // Build pattern for sectioning commands (optionally starred)
        const envPattern = environments.map((name) => this.escapeRegExp(name)).join("|");
        const sectionPattern = new RegExp(`\\\\(${envPattern})\\*?\\{([^}]*)\\}`, "g");
        let lastIndex = 0;
        let lastTitle = null;
        let lastEnvironment = null;
        let match;
        while ((match = sectionPattern.exec(content)) !== null) {
            // Content before this command belongs to the previous section
            if (match.index > lastIndex) {
                const sectionContent = content.slice(lastIndex, match.index);
                if (sectionContent.trim()) {
                    sections.push({
                        title: lastTitle,
                        content: sectionContent.trim(),
                        environment: lastEnvironment,
                    });
                }
            }
            lastEnvironment = match[1];
            lastTitle = match[2];
            lastIndex = match.index + match[0].length;
        }
        // Don't forget content after the last section
        if (lastIndex < content.length) {
            const remaining = content.slice(lastIndex);
            if (remaining.trim()) {
                sections.push({
                    title: lastTitle,
                    content: remaining.trim(),
                    environment: lastEnvironment,
                });
            }
        }
        // If no sections found, return entire content
        if (sections.length === 0 && content.trim()) {
            sections.push({
                title: null,
                content: content.trim(),
                environment: null,
            });
        }
        return sections;
    }
    /**
     * Split content that exceeds max size.
     *
     * Pieces are at most `maxSize` characters. Within the last 200 characters
     * of each window a paragraph break is preferred as the cut point, then a
     * sentence break. Consecutive pieces share `overlap` characters.
     *
     * @returns The pieces in order; `[content]` when it already fits.
     */
    splitContent(content, maxSize, overlap) {
        const effectiveMaxSize = Math.max(maxSize, 1);
        const effectiveOverlap = Math.min(Math.max(overlap, 0), effectiveMaxSize - 1);
        if (content.length <= effectiveMaxSize) {
            return [content];
        }
        const sentenceBoundary = /[.!?]\s+[A-Z\\]/;
        const chunks = [];
        let start = 0;
        while (start < content.length) {
            let end = Math.min(start + effectiveMaxSize, content.length);
            // Try to break at paragraph boundary
            if (end < content.length) {
                const searchStart = Math.max(start, end - 200);
                const searchText = content.slice(searchStart, end);
                let candidate = -1;
                const paragraphBreak = searchText.lastIndexOf("\n\n");
                if (paragraphBreak > 0) {
                    candidate = searchStart + paragraphBreak;
                }
                else {
                    const sentenceBreak = searchText.search(sentenceBoundary);
                    if (sentenceBreak > 0) {
                        candidate = searchStart + sentenceBreak + 1;
                    }
                }
                // Only cut early if the piece is still longer than the overlap;
                // otherwise the next window would barely advance.
                if (candidate - start > effectiveOverlap) {
                    end = candidate;
                }
            }
            chunks.push(content.slice(start, end));
            // Stop once the tail has been emitted. Without this, a non-zero
            // overlap re-enters the loop and emits ever-shorter duplicate tails.
            if (end >= content.length) {
                break;
            }
            start = Math.max(start + 1, end - effectiveOverlap);
        }
        return chunks;
    }
    /**
     * Checks a chunker configuration.
     *
     * @returns `{ valid, errors, warnings }`; `valid` is true when `errors` is empty.
     */
    validateConfig(config) {
        const errors = [];
        const warnings = [];
        const latexConfig = config;
        if (latexConfig.maxSize !== undefined && latexConfig.maxSize <= 0) {
            errors.push("maxSize must be greater than 0");
        }
        if (latexConfig.overlap !== undefined && latexConfig.overlap < 0) {
            errors.push("overlap must be non-negative");
        }
        if (latexConfig.overlap !== undefined &&
            latexConfig.maxSize !== undefined &&
            latexConfig.overlap >= latexConfig.maxSize) {
            errors.push("overlap must be less than maxSize");
        }
        if (latexConfig.splitEnvironments !== undefined &&
            latexConfig.splitEnvironments.length === 0) {
            warnings.push("No split environments specified, using defaults");
        }
        return {
            valid: errors.length === 0,
            errors,
            warnings,
        };
    }
}
@@ -0,0 +1,19 @@
1
+ /**
2
+ * Markdown-aware Chunker
3
+ *
4
+ * Splits markdown documents based on header structure while preserving formatting.
5
+ * Best for documentation, README files, and structured markdown content.
6
+ */
7
+ import type { BaseChunkerConfig, Chunk, Chunker, ChunkerValidationResult, MarkdownChunkerConfig } from "../types.js";
8
+ /**
9
+ * Markdown-aware chunker implementation
10
+ * Splits based on markdown structure (headers, code blocks, etc.)
11
+ */
12
+ export declare class MarkdownChunker implements Chunker {
13
+ /** Strategy identifier; always the literal "markdown". */ readonly strategy: "markdown";
14
+ /** Splits `text` into sections at markdown headers, then splits sections that are too long; resolves to the chunks in order (an empty array for empty input). */ chunk(text: string, config?: MarkdownChunkerConfig): Promise<Chunk[]>;
15
+ /** Walks the header matches in the text and returns `{ header, content, level }` sections. */ private splitByHeaders;
16
+ /** Cuts a string into pieces within the size limit, preferring a paragraph break and then a sentence break as the cut point. */ private splitContent;
17
+ /** Removes markdown formatting markers (headers, emphasis, code, links, images) from a string. */ private stripMarkdown;
18
+ /** Checks `maxSize`, `headerLevels` and `overlap`; the result lists errors and warnings and is valid when there are no errors. */ validateConfig(config: BaseChunkerConfig): ChunkerValidationResult;
19
+ }
@@ -0,0 +1,201 @@
1
+ /**
2
+ * Markdown-aware Chunker
3
+ *
4
+ * Splits markdown documents based on header structure while preserving formatting.
5
+ * Best for documentation, README files, and structured markdown content.
6
+ */
7
+ import { randomUUID } from "crypto";
8
+ /**
9
+ * Markdown-aware chunker implementation
10
+ * Splits based on markdown structure (headers, code blocks, etc.)
11
+ */
12
export class MarkdownChunker {
    /** Strategy identifier reported by this chunker. */
    strategy = "markdown";
    /**
     * Splits a markdown document into chunks along its headers.
     *
     * Each section (a header plus the text up to the next header) is split
     * further if it is longer than `maxSize`; every resulting chunk is
     * prefixed with the section header when `includeHeader` is set.
     *
     * `startPosition` / `endPosition` are running character counts over the
     * emitted chunk text, not exact offsets into `text`.
     *
     * @param text - Markdown source to chunk.
     * @param config - Optional settings: `maxSize` (1000), `overlap` (0),
     *   `headerLevels` ([1, 2, 3]), `preserveCodeBlocks` (true),
     *   `includeHeader` (true), `stripFormatting` (false),
     *   `trimWhitespace` (true), `metadata` ({}).
     * @returns The chunks in document order; an empty array for empty input.
     */
    async chunk(text, config) {
        const { maxSize = 1000, overlap = 0, headerLevels = [1, 2, 3], preserveCodeBlocks = true, includeHeader = true, stripFormatting = false, trimWhitespace = true, metadata = {}, } = config || {};
        const documentId = randomUUID();
        const chunks = [];
        if (!text || text.length === 0) {
            return chunks;
        }
        // Build header regex pattern (exactly the requested levels)
        const headerPattern = this.buildHeaderPattern(headerLevels);
        // Split by headers while preserving them
        const sections = this.splitByHeaders(text, headerPattern, includeHeader);
        let chunkIndex = 0;
        let currentPosition = 0;
        for (const section of sections) {
            const { header, content, level } = section;
            // Swap code for short placeholders so the size split treats each
            // code span as a small token.
            // NOTE(review): a placeholder can still be cut in two by splitContent
            // when it straddles a window edge; it is then left unrestored.
            let processedContent = content;
            const codeBlocks = [];
            if (preserveCodeBlocks) {
                processedContent = content.replace(/```[\s\S]*?```|`[^`]+`/g, (match) => {
                    const placeholder = `__CODE_BLOCK_${codeBlocks.length}__`;
                    codeBlocks.push({ placeholder, code: match });
                    return placeholder;
                });
            }
            // Split content if too large (leave room for the header, floor of 100)
            const effectiveMaxSize = Math.max(maxSize - (header?.length || 0), 100);
            const contentChunks = this.splitContent(processedContent, effectiveMaxSize, overlap);
            for (const contentChunk of contentChunks) {
                let chunkText = header && includeHeader
                    ? `${header}\n\n${contentChunk}`
                    : contentChunk;
                // Restore code blocks. A replacer function keeps "$" sequences in
                // the code (e.g. shell "$$") from being read as replacement patterns.
                for (const { placeholder, code } of codeBlocks) {
                    chunkText = chunkText.replace(placeholder, () => code);
                }
                // Strip formatting if requested
                if (stripFormatting) {
                    chunkText = this.stripMarkdown(chunkText);
                }
                const finalText = trimWhitespace ? chunkText.trim() : chunkText;
                if (finalText.length > 0) {
                    chunks.push({
                        id: randomUUID(),
                        text: finalText,
                        metadata: {
                            documentId,
                            chunkIndex,
                            startPosition: currentPosition,
                            endPosition: currentPosition + chunkText.length,
                            documentType: "markdown",
                            headerLevel: level ?? undefined,
                            header: header?.replace(/^#+\s*/, "") ?? undefined,
                            custom: metadata,
                        },
                    });
                    chunkIndex++;
                }
                currentPosition += chunkText.length;
            }
        }
        // Update total chunks count
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }
    /**
     * Builds the global, multiline regex that recognises ATX headers of
     * exactly the requested levels.
     *
     * A min..max range would also match levels that were not requested
     * (e.g. `[1, 3]` matching `##`), so the levels are listed as alternatives.
     * With no usable level the returned pattern never matches, which makes
     * the whole document a single section.
     */
    buildHeaderPattern(headerLevels) {
        const levels = [...new Set(Array.isArray(headerLevels) ? headerLevels : [])]
            .filter((level) => Number.isInteger(level) && level >= 1)
            .sort((a, b) => a - b);
        if (levels.length === 0) {
            return /(?!)/g;
        }
        const hashes = levels.map((level) => `#{${level}}`).join("|");
        // [ \t]+ rather than \s+ so a bare "#" line cannot swallow the next line
        return new RegExp(`^(${hashes})[ \\t]+(.+)$`, "gm");
    }
    /**
     * Partitions the text into sections at header matches.
     *
     * @param text - Full markdown text.
     * @param headerPattern - Global regex whose first capture group is the run
     *   of "#" characters.
     * @param _includeHeader - Unused; kept for signature compatibility.
     * @returns Sections as `{ header, content, level }`; text before the first
     *   header has `header` and `level` set to `null`.
     */
    splitByHeaders(text, headerPattern, _includeHeader) {
        const sections = [];
        // Lines such as "# comment" inside fenced code are not headers;
        // remember where the fences are so those matches can be skipped.
        const fencedRanges = [];
        const fencePattern = /```[\s\S]*?```/g;
        let fence;
        while ((fence = fencePattern.exec(text)) !== null) {
            fencedRanges.push([fence.index, fence.index + fence[0].length]);
        }
        const isInsideFence = (index) => fencedRanges.some(([from, to]) => index > from && index < to);
        let lastIndex = 0;
        let match;
        let currentHeader = null;
        let currentLevel = null;
        // Reset regex
        headerPattern.lastIndex = 0;
        while ((match = headerPattern.exec(text)) !== null) {
            if (match[0].length === 0) {
                // Defensive: never spin on a zero-length match
                headerPattern.lastIndex++;
                continue;
            }
            if (isInsideFence(match.index)) {
                continue;
            }
            // Content before this header
            if (match.index > lastIndex) {
                const content = text.slice(lastIndex, match.index);
                if (content.trim()) {
                    sections.push({
                        header: currentHeader,
                        content: content.trim(),
                        level: currentLevel,
                    });
                }
            }
            currentHeader = match[0];
            currentLevel = match[1].length; // Number of # characters
            lastIndex = match.index + match[0].length;
        }
        // Don't forget content after the last header
        if (lastIndex < text.length) {
            const content = text.slice(lastIndex);
            if (content.trim()) {
                sections.push({
                    header: currentHeader,
                    content: content.trim(),
                    level: currentLevel,
                });
            }
        }
        // If no headers found, return entire text as one section
        if (sections.length === 0 && text.trim()) {
            sections.push({
                header: null,
                content: text.trim(),
                level: null,
            });
        }
        return sections;
    }
    /**
     * Splits content that exceeds `maxSize`.
     *
     * Within the last 200 characters of each window a paragraph break is
     * preferred as the cut point, then a sentence break. Consecutive pieces
     * share `overlap` characters.
     *
     * @returns The pieces in order; `[content]` when it already fits.
     */
    splitContent(content, maxSize, overlap) {
        const effectiveMaxSize = Math.max(maxSize, 1);
        const effectiveOverlap = Math.min(Math.max(overlap, 0), effectiveMaxSize - 1);
        if (content.length <= effectiveMaxSize) {
            return [content];
        }
        const sentenceBoundary = /[.!?]\s+[A-Z]/;
        const chunks = [];
        let start = 0;
        while (start < content.length) {
            let end = Math.min(start + effectiveMaxSize, content.length);
            // Try to break at a paragraph or sentence boundary
            if (end < content.length) {
                const searchStart = Math.max(start, end - 200);
                const searchText = content.slice(searchStart, end);
                let candidate = -1;
                const paragraphBreak = searchText.lastIndexOf("\n\n");
                if (paragraphBreak > 0) {
                    candidate = searchStart + paragraphBreak;
                }
                else {
                    const sentenceBreak = searchText.search(sentenceBoundary);
                    if (sentenceBreak > 0) {
                        candidate = searchStart + sentenceBreak + 1;
                    }
                }
                // Only cut early if the piece is still longer than the overlap;
                // otherwise the next window would barely advance.
                if (candidate - start > effectiveOverlap) {
                    end = candidate;
                }
            }
            chunks.push(content.slice(start, end));
            // Stop once the tail has been emitted. Without this, a non-zero
            // overlap re-enters the loop and emits ever-shorter duplicate tails.
            if (end >= content.length) {
                break;
            }
            start = Math.max(start + 1, end - effectiveOverlap);
        }
        return chunks;
    }
    /**
     * Removes markdown formatting markers while keeping the readable text.
     *
     * Order matters: fenced code is handled before inline code (otherwise the
     * inline pattern mangles the fence markers) and images before links
     * (otherwise the link pattern leaves a stray "!"). Fenced code keeps its
     * content; only the fences and the info string are dropped.
     */
    stripMarkdown(text) {
        return text
            .replace(/```(?:[^\n`]*\n)?([\s\S]*?)```/g, "$1") // Code blocks
            .replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1") // Images
            .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1") // Links
            .replace(/^#+\s+/gm, "") // Headers
            .replace(/\*\*(.+?)\*\*/g, "$1") // Bold
            .replace(/\*(.+?)\*/g, "$1") // Italic
            .replace(/__(.+?)__/g, "$1") // Bold (underscore)
            .replace(/_(.+?)_/g, "$1") // Italic (underscore)
            .replace(/`(.+?)`/g, "$1"); // Inline code
    }
    /**
     * Checks a chunker configuration.
     *
     * @returns `{ valid, errors, warnings }`; `valid` is true when `errors` is empty.
     */
    validateConfig(config) {
        const errors = [];
        const warnings = [];
        const mdConfig = config;
        if (mdConfig.maxSize !== undefined && mdConfig.maxSize <= 0) {
            errors.push("maxSize must be greater than 0");
        }
        if (mdConfig.headerLevels !== undefined) {
            if (mdConfig.headerLevels.length === 0) {
                errors.push("headerLevels must not be empty");
            }
            for (const level of mdConfig.headerLevels) {
                if (level < 1 || level > 6) {
                    errors.push(`Invalid header level: ${level}. Must be between 1 and 6`);
                }
            }
        }
        if (mdConfig.overlap !== undefined && mdConfig.overlap < 0) {
            errors.push("overlap must be non-negative");
        }
        if (mdConfig.overlap !== undefined &&
            mdConfig.maxSize !== undefined &&
            mdConfig.overlap >= mdConfig.maxSize) {
            errors.push("overlap must be less than maxSize");
        }
        return {
            valid: errors.length === 0,
            errors,
            warnings,
        };
    }
}
@@ -0,0 +1,19 @@
1
+ /**
2
+ * Recursive Chunker
3
+ *
4
+ * Smart text splitting using hierarchical separators.
5
+ * Tries each separator in order, recursively splitting chunks that are too large.
6
+ * Best for general-purpose text that has natural boundaries.
7
+ */
8
+ import type { Chunker, Chunk, ChunkerValidationResult, RecursiveChunkerConfig, BaseChunkerConfig } from "../types.js";
9
+ /**
10
+ * Recursive chunker implementation
11
+ * Smart splitting based on content structure using hierarchical separators
12
+ */
13
+ export declare class RecursiveChunker implements Chunker {
14
+ /** Strategy identifier; always the literal "recursive". */ readonly strategy: "recursive";
15
+ /** Separators tried in order when the config supplies none: blank line, newline, ". ", space, then the empty string. */ private readonly defaultSeparators;
16
+ /** Splits `text` with the configured separators and resolves to the non-empty chunks in order (an empty array for empty input). */ chunk(text: string, config?: RecursiveChunkerConfig): Promise<Chunk[]>;
17
+ /** Splits on the first separator found in the text, merges the pieces up to the size limit, and recurses with the remaining separators on any piece that is still too long. */ private recursiveSplit;
18
+ /** Checks `maxSize`, `overlap` and `separators` (including that regex separators compile); the result is valid when there are no errors. */ validateConfig(config: BaseChunkerConfig): ChunkerValidationResult;
19
+ }
@@ -0,0 +1,148 @@
1
+ /**
2
+ * Recursive Chunker
3
+ *
4
+ * Smart text splitting using hierarchical separators.
5
+ * Tries each separator in order, recursively splitting chunks that are too large.
6
+ * Best for general-purpose text that has natural boundaries.
7
+ */
8
+ import { randomUUID } from "crypto";
9
+ /**
10
+ * Recursive chunker implementation
11
+ * Smart splitting based on content structure using hierarchical separators
12
+ */
13
export class RecursiveChunker {
    /** Strategy identifier reported by this chunker. */
    strategy = "recursive";
    /** Separators tried in order, from coarsest (blank line) to finest (every character). */
    defaultSeparators = ["\n\n", "\n", ". ", " ", ""];
    /**
     * Splits text using a hierarchy of separators.
     *
     * @param text - Text to chunk.
     * @param config - Optional settings: `maxSize` (1000), `overlap` (200),
     *   `separators` (defaults above), `isSeparatorRegex` (false),
     *   `trimWhitespace` (true), `metadata` ({}).
     * @returns The non-empty chunks in order; an empty array for empty input.
     *   `startPosition` / `endPosition` come from locating each piece in
     *   `text`; when a piece cannot be located the running cursor is used.
     */
    async chunk(text, config) {
        const { maxSize = 1000, overlap = 200, separators = this.defaultSeparators, isSeparatorRegex = false, trimWhitespace = true, metadata = {}, } = config || {};
        const documentId = randomUUID();
        const chunks = [];
        if (!text || text.length === 0) {
            return chunks;
        }
        const splitTexts = this.recursiveSplit(text, separators, maxSize, overlap, isSeparatorRegex);
        const lookBack = Math.max(0, overlap);
        let chunkIndex = 0;
        let currentPosition = 0;
        for (const splitText of splitTexts) {
            const chunkText = trimWhitespace ? splitText.trim() : splitText;
            if (chunkText.length > 0) {
                const startPosition = text.indexOf(splitText, currentPosition);
                chunks.push({
                    id: randomUUID(),
                    text: chunkText,
                    metadata: {
                        documentId,
                        chunkIndex,
                        startPosition: startPosition >= 0 ? startPosition : currentPosition,
                        endPosition: startPosition >= 0
                            ? startPosition + splitText.length
                            : currentPosition + splitText.length,
                        documentType: "text",
                        custom: metadata,
                    },
                });
                chunkIndex++;
                if (startPosition >= 0) {
                    // Step back by the overlap so the next (overlapping) piece can
                    // be found, but never before the start of the piece just placed.
                    currentPosition = Math.max(startPosition, startPosition + splitText.length - lookBack);
                }
            }
        }
        // Update total chunks count
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }
    /**
     * Splits `text` into pieces together with the delimiter that preceded
     * each piece in the original text.
     *
     * For regex separators the joiner is the text that actually matched, not
     * the pattern source, so re-joined pieces reproduce the original text.
     *
     * @returns An array of `{ piece, joiner }`; the first joiner is "".
     */
    splitWithJoiners(text, separator, useRegex) {
        if (!useRegex) {
            return text
                .split(separator)
                .map((piece, index) => ({ piece, joiner: index === 0 ? "" : separator }));
        }
        const parts = [];
        let cursor = 0;
        let joiner = "";
        for (const match of text.matchAll(new RegExp(separator, "g"))) {
            const matched = match[0];
            // A zero-length match at the current cursor or at the very end would
            // only produce an empty piece (String.prototype.split skips these too).
            if (matched.length === 0 && (match.index === cursor || match.index >= text.length)) {
                continue;
            }
            parts.push({ piece: text.slice(cursor, match.index), joiner });
            joiner = matched;
            cursor = match.index + matched.length;
        }
        parts.push({ piece: text.slice(cursor), joiner });
        return parts;
    }
    /**
     * Recursively splits `text` so that every piece fits in `maxSize`.
     *
     * The first separator that occurs in the text is used; pieces are merged
     * back together up to the size limit, and a piece that is still too long
     * is split again with the remaining separators. When no separator applies
     * the text is split per character, so the recursion always terminates
     * (previously this case recursed forever).
     *
     * When a new chunk is started, up to `overlap` characters from the end of
     * the previous chunk are carried over, capped so the chunk never exceeds
     * the size limit.
     *
     * @param text - Text to split.
     * @param separators - Candidate separators, coarsest first.
     * @param maxSize - Maximum piece length (values below 1 are treated as 1).
     * @param overlap - Desired number of shared characters between neighbours.
     * @param isRegex - Whether separators are regular-expression sources.
     * @returns The pieces in order.
     */
    recursiveSplit(text, separators, maxSize, overlap, isRegex) {
        const limit = maxSize >= 1 ? maxSize : 1;
        if (text.length <= limit) {
            return [text];
        }
        // Find the best separator to use; fall back to per-character splitting
        const candidates = Array.isArray(separators) ? separators : [];
        let separator = "";
        let newSeparators = [];
        for (let i = 0; i < candidates.length; i++) {
            const sep = candidates[i];
            const hasMatch = sep === "" ||
                (isRegex ? new RegExp(sep).test(text) : text.includes(sep));
            if (hasMatch) {
                separator = sep;
                newSeparators = candidates.slice(i + 1);
                break;
            }
        }
        // Split the text, remembering the real delimiter in front of each piece
        const parts = this.splitWithJoiners(text, separator, Boolean(isRegex) && separator !== "");
        // Merge pieces into chunks
        const results = [];
        let currentChunk = "";
        for (const { piece, joiner } of parts) {
            const potentialChunk = currentChunk
                ? currentChunk + joiner + piece
                : piece;
            if (potentialChunk.length <= limit) {
                currentChunk = potentialChunk;
                continue;
            }
            // Current chunk is ready
            if (currentChunk.length > 0) {
                results.push(currentChunk);
            }
            if (piece.length > limit) {
                // Still too large: split again with the finer separators
                const subSplits = this.recursiveSplit(piece, newSeparators, limit, overlap, isRegex);
                results.push(...subSplits.slice(0, -1));
                currentChunk = subSplits[subSplits.length - 1] || "";
            }
            else {
                // Carry overlap from the previous chunk, but only as much as fits.
                // (slice(-0) would return the whole string, hence the > 0 guard.)
                const room = limit - piece.length - joiner.length;
                const carry = Math.min(overlap, room);
                if (results.length > 0 && carry > 0) {
                    const lastChunk = results[results.length - 1];
                    currentChunk = lastChunk.slice(-carry) + joiner + piece;
                }
                else {
                    currentChunk = piece;
                }
            }
        }
        // Don't forget the last chunk
        if (currentChunk.length > 0) {
            results.push(currentChunk);
        }
        return results;
    }
    /**
     * Checks a chunker configuration.
     *
     * @returns `{ valid, errors, warnings }`; `valid` is true when `errors` is empty.
     */
    validateConfig(config) {
        const errors = [];
        const warnings = [];
        const recConfig = config;
        if (recConfig.maxSize !== undefined && recConfig.maxSize <= 0) {
            errors.push("maxSize must be greater than 0");
        }
        if (recConfig.overlap !== undefined && recConfig.overlap < 0) {
            errors.push("overlap must be non-negative");
        }
        if (recConfig.overlap !== undefined &&
            recConfig.maxSize !== undefined &&
            recConfig.maxSize > 0 &&
            recConfig.overlap >= recConfig.maxSize) {
            // Warning rather than error so existing configs keep validating
            warnings.push("overlap is not less than maxSize; it will be reduced to fit");
        }
        if (recConfig.separators !== undefined &&
            recConfig.separators.length === 0) {
            errors.push("separators array must not be empty");
        }
        if (recConfig.isSeparatorRegex && recConfig.separators) {
            for (const sep of recConfig.separators) {
                try {
                    new RegExp(sep);
                }
                catch {
                    errors.push(`Invalid regex separator: ${sep}`);
                }
            }
        }
        return {
            valid: errors.length === 0,
            errors,
            warnings,
        };
    }
}
@@ -0,0 +1,41 @@
1
+ /**
2
+ * Semantic Chunker
3
+ *
4
+ * LLM-powered semantic chunking that groups related content together.
5
+ * Uses embedding similarity to determine natural breakpoints.
6
+ * Best for complex documents where meaning should drive segmentation.
7
+ */
8
+ import type { BaseChunkerConfig, Chunk, Chunker, ChunkerValidationResult, SemanticChunkerConfig } from "../types.js";
9
+ /**
10
+ * Semantic chunker implementation
11
+ * Uses embedding similarity to find natural content boundaries
12
+ */
13
+ export declare class SemanticChunker implements Chunker {
14
+ /** Strategy identifier; always the literal "semantic". */ readonly strategy: "semantic";
15
+ /** Resolves to the chunks produced for `text`. NOTE(review): the implementation is not part of this declaration; presumably it combines the private steps listed below — confirm against the .js file. */ chunk(text: string, config?: SemanticChunkerConfig): Promise<Chunk[]>;
16
+ /**
17
+ * Split text into initial segments for embedding
18
+ */
19
+ private splitIntoSegments;
20
+ /**
21
+ * Get embeddings for segments
22
+ */
23
+ private getEmbeddings;
24
+ /**
25
+ * Find semantic breakpoints using cosine similarity
26
+ */
27
+ private findSemanticBreakpoints;
28
+ /**
29
+ * Group segments based on breakpoints and size limits
30
+ */
31
+ private groupSegments;
32
+ /**
33
+ * Calculate cosine similarity between two vectors
34
+ */
35
+ private cosineSimilarity;
36
+ /**
37
+ * Fallback to simple chunking when embeddings fail
38
+ */
39
+ private fallbackChunk;
40
+ /** Validates a chunker configuration and returns the validation result. */ validateConfig(config: BaseChunkerConfig): ChunkerValidationResult;
41
+ }