@juspay/neurolink 9.1.1 → 9.3.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (555)
  1. package/CHANGELOG.md +27 -0
  2. package/README.md +106 -37
  3. package/dist/agent/directTools.d.ts +11 -11
  4. package/dist/cli/commands/config.d.ts +6 -6
  5. package/dist/cli/commands/rag.d.ts +19 -0
  6. package/dist/cli/commands/rag.js +756 -0
  7. package/dist/cli/factories/commandFactory.js +146 -83
  8. package/dist/cli/parser.js +4 -1
  9. package/dist/core/baseProvider.d.ts +43 -30
  10. package/dist/core/baseProvider.js +98 -138
  11. package/dist/core/conversationMemoryFactory.d.ts +2 -2
  12. package/dist/core/conversationMemoryFactory.js +2 -2
  13. package/dist/core/conversationMemoryInitializer.d.ts +1 -2
  14. package/dist/core/conversationMemoryInitializer.js +2 -2
  15. package/dist/core/infrastructure/baseError.d.ts +21 -0
  16. package/dist/core/infrastructure/baseError.js +22 -0
  17. package/dist/core/infrastructure/baseFactory.d.ts +21 -0
  18. package/dist/core/infrastructure/baseFactory.js +54 -0
  19. package/dist/core/infrastructure/baseRegistry.d.ts +21 -0
  20. package/dist/core/infrastructure/baseRegistry.js +49 -0
  21. package/dist/core/infrastructure/index.d.ts +5 -0
  22. package/dist/core/infrastructure/index.js +5 -0
  23. package/dist/core/infrastructure/retry.d.ts +7 -0
  24. package/dist/core/infrastructure/retry.js +20 -0
  25. package/dist/core/infrastructure/typedEventEmitter.d.ts +8 -0
  26. package/dist/core/infrastructure/typedEventEmitter.js +23 -0
  27. package/dist/core/redisConversationMemoryManager.d.ts +1 -6
  28. package/dist/core/redisConversationMemoryManager.js +7 -19
  29. package/dist/factories/providerFactory.d.ts +5 -3
  30. package/dist/factories/providerFactory.js +31 -24
  31. package/dist/image-gen/ImageGenService.d.ts +143 -0
  32. package/dist/image-gen/ImageGenService.js +345 -0
  33. package/dist/image-gen/imageGenTools.d.ts +126 -0
  34. package/dist/image-gen/imageGenTools.js +304 -0
  35. package/dist/image-gen/index.d.ts +46 -0
  36. package/dist/image-gen/index.js +48 -0
  37. package/dist/image-gen/types.d.ts +237 -0
  38. package/dist/image-gen/types.js +24 -0
  39. package/dist/index.d.ts +46 -12
  40. package/dist/index.js +88 -36
  41. package/dist/lib/agent/directTools.d.ts +8 -8
  42. package/dist/lib/core/baseProvider.d.ts +43 -30
  43. package/dist/lib/core/baseProvider.js +98 -138
  44. package/dist/lib/core/conversationMemoryFactory.d.ts +2 -2
  45. package/dist/lib/core/conversationMemoryFactory.js +2 -2
  46. package/dist/lib/core/conversationMemoryInitializer.d.ts +1 -2
  47. package/dist/lib/core/conversationMemoryInitializer.js +2 -2
  48. package/dist/lib/core/infrastructure/baseError.d.ts +21 -0
  49. package/dist/lib/core/infrastructure/baseError.js +23 -0
  50. package/dist/lib/core/infrastructure/baseFactory.d.ts +21 -0
  51. package/dist/lib/core/infrastructure/baseFactory.js +55 -0
  52. package/dist/lib/core/infrastructure/baseRegistry.d.ts +21 -0
  53. package/dist/lib/core/infrastructure/baseRegistry.js +50 -0
  54. package/dist/lib/core/infrastructure/index.d.ts +5 -0
  55. package/dist/lib/core/infrastructure/index.js +6 -0
  56. package/dist/lib/core/infrastructure/retry.d.ts +7 -0
  57. package/dist/lib/core/infrastructure/retry.js +21 -0
  58. package/dist/lib/core/infrastructure/typedEventEmitter.d.ts +8 -0
  59. package/dist/lib/core/infrastructure/typedEventEmitter.js +24 -0
  60. package/dist/lib/core/redisConversationMemoryManager.d.ts +1 -6
  61. package/dist/lib/core/redisConversationMemoryManager.js +7 -19
  62. package/dist/lib/factories/providerFactory.d.ts +5 -3
  63. package/dist/lib/factories/providerFactory.js +31 -24
  64. package/dist/lib/image-gen/ImageGenService.d.ts +143 -0
  65. package/dist/lib/image-gen/ImageGenService.js +346 -0
  66. package/dist/lib/image-gen/imageGenTools.d.ts +126 -0
  67. package/dist/lib/image-gen/imageGenTools.js +305 -0
  68. package/dist/lib/image-gen/index.d.ts +46 -0
  69. package/dist/lib/image-gen/index.js +49 -0
  70. package/dist/lib/image-gen/types.d.ts +237 -0
  71. package/dist/lib/image-gen/types.js +25 -0
  72. package/dist/lib/index.d.ts +46 -12
  73. package/dist/lib/index.js +88 -36
  74. package/dist/lib/mcp/index.d.ts +6 -5
  75. package/dist/lib/mcp/index.js +7 -5
  76. package/dist/lib/neurolink.d.ts +11 -13
  77. package/dist/lib/neurolink.js +95 -29
  78. package/dist/lib/processors/base/BaseFileProcessor.d.ts +273 -0
  79. package/dist/lib/processors/base/BaseFileProcessor.js +614 -0
  80. package/dist/lib/processors/base/index.d.ts +14 -0
  81. package/dist/lib/processors/base/index.js +20 -0
  82. package/dist/lib/processors/base/types.d.ts +593 -0
  83. package/dist/lib/processors/base/types.js +77 -0
  84. package/dist/lib/processors/cli/fileProcessorCli.d.ts +163 -0
  85. package/dist/lib/processors/cli/fileProcessorCli.js +389 -0
  86. package/dist/lib/processors/cli/index.d.ts +37 -0
  87. package/dist/lib/processors/cli/index.js +50 -0
  88. package/dist/lib/processors/code/ConfigProcessor.d.ts +171 -0
  89. package/dist/lib/processors/code/ConfigProcessor.js +401 -0
  90. package/dist/lib/processors/code/SourceCodeProcessor.d.ts +174 -0
  91. package/dist/lib/processors/code/SourceCodeProcessor.js +305 -0
  92. package/dist/lib/processors/code/index.d.ts +44 -0
  93. package/dist/lib/processors/code/index.js +61 -0
  94. package/dist/lib/processors/config/fileTypes.d.ts +283 -0
  95. package/dist/lib/processors/config/fileTypes.js +521 -0
  96. package/dist/lib/processors/config/index.d.ts +32 -0
  97. package/dist/lib/processors/config/index.js +93 -0
  98. package/dist/lib/processors/config/languageMap.d.ts +66 -0
  99. package/dist/lib/processors/config/languageMap.js +411 -0
  100. package/dist/lib/processors/config/mimeTypes.d.ts +376 -0
  101. package/dist/lib/processors/config/mimeTypes.js +339 -0
  102. package/dist/lib/processors/config/sizeLimits.d.ts +194 -0
  103. package/dist/lib/processors/config/sizeLimits.js +247 -0
  104. package/dist/lib/processors/data/JsonProcessor.d.ts +122 -0
  105. package/dist/lib/processors/data/JsonProcessor.js +204 -0
  106. package/dist/lib/processors/data/XmlProcessor.d.ts +160 -0
  107. package/dist/lib/processors/data/XmlProcessor.js +284 -0
  108. package/dist/lib/processors/data/YamlProcessor.d.ts +163 -0
  109. package/dist/lib/processors/data/YamlProcessor.js +295 -0
  110. package/dist/lib/processors/data/index.d.ts +49 -0
  111. package/dist/lib/processors/data/index.js +77 -0
  112. package/dist/lib/processors/document/ExcelProcessor.d.ts +238 -0
  113. package/dist/lib/processors/document/ExcelProcessor.js +520 -0
  114. package/dist/lib/processors/document/OpenDocumentProcessor.d.ts +69 -0
  115. package/dist/lib/processors/document/OpenDocumentProcessor.js +211 -0
  116. package/dist/lib/processors/document/RtfProcessor.d.ts +152 -0
  117. package/dist/lib/processors/document/RtfProcessor.js +362 -0
  118. package/dist/lib/processors/document/WordProcessor.d.ts +168 -0
  119. package/dist/lib/processors/document/WordProcessor.js +354 -0
  120. package/dist/lib/processors/document/index.d.ts +54 -0
  121. package/dist/lib/processors/document/index.js +91 -0
  122. package/dist/lib/processors/errors/FileErrorCode.d.ts +98 -0
  123. package/dist/lib/processors/errors/FileErrorCode.js +256 -0
  124. package/dist/lib/processors/errors/errorHelpers.d.ts +151 -0
  125. package/dist/lib/processors/errors/errorHelpers.js +379 -0
  126. package/dist/lib/processors/errors/errorSerializer.d.ts +139 -0
  127. package/dist/lib/processors/errors/errorSerializer.js +508 -0
  128. package/dist/lib/processors/errors/index.d.ts +46 -0
  129. package/dist/lib/processors/errors/index.js +50 -0
  130. package/dist/lib/processors/index.d.ts +76 -0
  131. package/dist/lib/processors/index.js +113 -0
  132. package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +244 -0
  133. package/dist/lib/processors/integration/FileProcessorIntegration.js +273 -0
  134. package/dist/lib/processors/integration/index.d.ts +42 -0
  135. package/dist/lib/processors/integration/index.js +45 -0
  136. package/dist/lib/processors/markup/HtmlProcessor.d.ts +169 -0
  137. package/dist/lib/processors/markup/HtmlProcessor.js +250 -0
  138. package/dist/lib/processors/markup/MarkdownProcessor.d.ts +165 -0
  139. package/dist/lib/processors/markup/MarkdownProcessor.js +245 -0
  140. package/dist/lib/processors/markup/SvgProcessor.d.ts +156 -0
  141. package/dist/lib/processors/markup/SvgProcessor.js +241 -0
  142. package/dist/lib/processors/markup/TextProcessor.d.ts +135 -0
  143. package/dist/lib/processors/markup/TextProcessor.js +189 -0
  144. package/dist/lib/processors/markup/index.d.ts +66 -0
  145. package/dist/lib/processors/markup/index.js +103 -0
  146. package/dist/lib/processors/registry/ProcessorRegistry.d.ts +334 -0
  147. package/dist/lib/processors/registry/ProcessorRegistry.js +609 -0
  148. package/dist/lib/processors/registry/index.d.ts +12 -0
  149. package/dist/lib/processors/registry/index.js +17 -0
  150. package/dist/lib/processors/registry/types.d.ts +53 -0
  151. package/dist/lib/processors/registry/types.js +11 -0
  152. package/dist/lib/providers/amazonBedrock.d.ts +15 -2
  153. package/dist/lib/providers/amazonBedrock.js +65 -8
  154. package/dist/lib/providers/anthropic.d.ts +3 -3
  155. package/dist/lib/providers/anthropic.js +10 -7
  156. package/dist/lib/providers/googleAiStudio.d.ts +5 -5
  157. package/dist/lib/providers/googleAiStudio.js +10 -7
  158. package/dist/lib/providers/googleVertex.d.ts +16 -4
  159. package/dist/lib/providers/googleVertex.js +72 -16
  160. package/dist/lib/providers/litellm.d.ts +3 -3
  161. package/dist/lib/providers/litellm.js +10 -10
  162. package/dist/lib/providers/mistral.d.ts +3 -3
  163. package/dist/lib/providers/mistral.js +7 -6
  164. package/dist/lib/providers/ollama.d.ts +3 -4
  165. package/dist/lib/providers/ollama.js +7 -8
  166. package/dist/lib/providers/openAI.d.ts +14 -2
  167. package/dist/lib/providers/openAI.js +60 -6
  168. package/dist/lib/providers/openRouter.d.ts +2 -2
  169. package/dist/lib/providers/openRouter.js +10 -6
  170. package/dist/lib/providers/sagemaker/language-model.d.ts +2 -2
  171. package/dist/lib/rag/ChunkerFactory.d.ts +91 -0
  172. package/dist/lib/rag/ChunkerFactory.js +321 -0
  173. package/dist/lib/rag/ChunkerRegistry.d.ts +91 -0
  174. package/dist/lib/rag/ChunkerRegistry.js +422 -0
  175. package/dist/lib/rag/chunkers/BaseChunker.d.ts +53 -0
  176. package/dist/lib/rag/chunkers/BaseChunker.js +144 -0
  177. package/dist/lib/rag/chunkers/CharacterChunker.d.ts +18 -0
  178. package/dist/lib/rag/chunkers/CharacterChunker.js +29 -0
  179. package/dist/lib/rag/chunkers/HTMLChunker.d.ts +19 -0
  180. package/dist/lib/rag/chunkers/HTMLChunker.js +39 -0
  181. package/dist/lib/rag/chunkers/JSONChunker.d.ts +19 -0
  182. package/dist/lib/rag/chunkers/JSONChunker.js +69 -0
  183. package/dist/lib/rag/chunkers/LaTeXChunker.d.ts +15 -0
  184. package/dist/lib/rag/chunkers/LaTeXChunker.js +64 -0
  185. package/dist/lib/rag/chunkers/MarkdownChunker.d.ts +15 -0
  186. package/dist/lib/rag/chunkers/MarkdownChunker.js +103 -0
  187. package/dist/lib/rag/chunkers/RecursiveChunker.d.ts +27 -0
  188. package/dist/lib/rag/chunkers/RecursiveChunker.js +140 -0
  189. package/dist/lib/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
  190. package/dist/lib/rag/chunkers/SemanticMarkdownChunker.js +139 -0
  191. package/dist/lib/rag/chunkers/SentenceChunker.d.ts +19 -0
  192. package/dist/lib/rag/chunkers/SentenceChunker.js +67 -0
  193. package/dist/lib/rag/chunkers/TokenChunker.d.ts +19 -0
  194. package/dist/lib/rag/chunkers/TokenChunker.js +62 -0
  195. package/dist/lib/rag/chunkers/index.d.ts +15 -0
  196. package/dist/lib/rag/chunkers/index.js +16 -0
  197. package/dist/lib/rag/chunking/characterChunker.d.ts +16 -0
  198. package/dist/lib/rag/chunking/characterChunker.js +143 -0
  199. package/dist/lib/rag/chunking/chunkerRegistry.d.ts +67 -0
  200. package/dist/lib/rag/chunking/chunkerRegistry.js +195 -0
  201. package/dist/lib/rag/chunking/htmlChunker.d.ts +34 -0
  202. package/dist/lib/rag/chunking/htmlChunker.js +248 -0
  203. package/dist/lib/rag/chunking/index.d.ts +15 -0
  204. package/dist/lib/rag/chunking/index.js +18 -0
  205. package/dist/lib/rag/chunking/jsonChunker.d.ts +20 -0
  206. package/dist/lib/rag/chunking/jsonChunker.js +282 -0
  207. package/dist/lib/rag/chunking/latexChunker.d.ts +26 -0
  208. package/dist/lib/rag/chunking/latexChunker.js +252 -0
  209. package/dist/lib/rag/chunking/markdownChunker.d.ts +19 -0
  210. package/dist/lib/rag/chunking/markdownChunker.js +202 -0
  211. package/dist/lib/rag/chunking/recursiveChunker.d.ts +19 -0
  212. package/dist/lib/rag/chunking/recursiveChunker.js +149 -0
  213. package/dist/lib/rag/chunking/semanticChunker.d.ts +41 -0
  214. package/dist/lib/rag/chunking/semanticChunker.js +307 -0
  215. package/dist/lib/rag/chunking/sentenceChunker.d.ts +25 -0
  216. package/dist/lib/rag/chunking/sentenceChunker.js +231 -0
  217. package/dist/lib/rag/chunking/tokenChunker.d.ts +36 -0
  218. package/dist/lib/rag/chunking/tokenChunker.js +184 -0
  219. package/dist/lib/rag/document/MDocument.d.ts +198 -0
  220. package/dist/lib/rag/document/MDocument.js +393 -0
  221. package/dist/lib/rag/document/index.d.ts +5 -0
  222. package/dist/lib/rag/document/index.js +6 -0
  223. package/dist/lib/rag/document/loaders.d.ts +201 -0
  224. package/dist/lib/rag/document/loaders.js +501 -0
  225. package/dist/lib/rag/errors/RAGError.d.ts +244 -0
  226. package/dist/lib/rag/errors/RAGError.js +275 -0
  227. package/dist/lib/rag/errors/index.d.ts +6 -0
  228. package/dist/lib/rag/errors/index.js +7 -0
  229. package/dist/lib/rag/graphRag/graphRAG.d.ts +115 -0
  230. package/dist/lib/rag/graphRag/graphRAG.js +385 -0
  231. package/dist/lib/rag/graphRag/index.d.ts +4 -0
  232. package/dist/lib/rag/graphRag/index.js +5 -0
  233. package/dist/lib/rag/index.d.ts +103 -0
  234. package/dist/lib/rag/index.js +142 -0
  235. package/dist/lib/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
  236. package/dist/lib/rag/metadata/MetadataExtractorFactory.js +419 -0
  237. package/dist/lib/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
  238. package/dist/lib/rag/metadata/MetadataExtractorRegistry.js +363 -0
  239. package/dist/lib/rag/metadata/index.d.ts +6 -0
  240. package/dist/lib/rag/metadata/index.js +10 -0
  241. package/dist/lib/rag/metadata/metadataExtractor.d.ts +69 -0
  242. package/dist/lib/rag/metadata/metadataExtractor.js +278 -0
  243. package/dist/lib/rag/pipeline/RAGPipeline.d.ts +235 -0
  244. package/dist/lib/rag/pipeline/RAGPipeline.js +402 -0
  245. package/dist/lib/rag/pipeline/contextAssembly.d.ts +126 -0
  246. package/dist/lib/rag/pipeline/contextAssembly.js +338 -0
  247. package/dist/lib/rag/pipeline/index.d.ts +5 -0
  248. package/dist/lib/rag/pipeline/index.js +6 -0
  249. package/dist/lib/rag/ragIntegration.d.ts +38 -0
  250. package/dist/lib/rag/ragIntegration.js +212 -0
  251. package/dist/lib/rag/reranker/RerankerFactory.d.ts +184 -0
  252. package/dist/lib/rag/reranker/RerankerFactory.js +431 -0
  253. package/dist/lib/rag/reranker/RerankerRegistry.d.ts +119 -0
  254. package/dist/lib/rag/reranker/RerankerRegistry.js +403 -0
  255. package/dist/lib/rag/reranker/index.d.ts +6 -0
  256. package/dist/lib/rag/reranker/index.js +10 -0
  257. package/dist/lib/rag/reranker/reranker.d.ts +71 -0
  258. package/dist/lib/rag/reranker/reranker.js +278 -0
  259. package/dist/lib/rag/resilience/CircuitBreaker.d.ts +215 -0
  260. package/dist/lib/rag/resilience/CircuitBreaker.js +432 -0
  261. package/dist/lib/rag/resilience/RetryHandler.d.ts +115 -0
  262. package/dist/lib/rag/resilience/RetryHandler.js +301 -0
  263. package/dist/lib/rag/resilience/index.d.ts +7 -0
  264. package/dist/lib/rag/resilience/index.js +8 -0
  265. package/dist/lib/rag/retrieval/hybridSearch.d.ts +94 -0
  266. package/dist/lib/rag/retrieval/hybridSearch.js +314 -0
  267. package/dist/lib/rag/retrieval/index.d.ts +5 -0
  268. package/dist/lib/rag/retrieval/index.js +6 -0
  269. package/dist/lib/rag/retrieval/vectorQueryTool.d.ts +93 -0
  270. package/dist/lib/rag/retrieval/vectorQueryTool.js +290 -0
  271. package/dist/lib/rag/types.d.ts +768 -0
  272. package/dist/lib/rag/types.js +9 -0
  273. package/dist/lib/server/index.d.ts +15 -11
  274. package/dist/lib/server/index.js +55 -51
  275. package/dist/lib/server/utils/validation.d.ts +2 -2
  276. package/dist/lib/types/common.d.ts +0 -1
  277. package/dist/lib/types/fileTypes.d.ts +1 -1
  278. package/dist/lib/types/generateTypes.d.ts +42 -8
  279. package/dist/lib/types/generateTypes.js +1 -1
  280. package/dist/lib/types/index.d.ts +25 -24
  281. package/dist/lib/types/index.js +21 -20
  282. package/dist/lib/types/modelTypes.d.ts +16 -16
  283. package/dist/lib/types/pptTypes.d.ts +14 -2
  284. package/dist/lib/types/pptTypes.js +16 -0
  285. package/dist/lib/types/streamTypes.d.ts +28 -8
  286. package/dist/lib/types/streamTypes.js +1 -1
  287. package/dist/lib/utils/async/delay.d.ts +40 -0
  288. package/dist/lib/utils/async/delay.js +43 -0
  289. package/dist/lib/utils/async/index.d.ts +23 -0
  290. package/dist/lib/utils/async/index.js +24 -0
  291. package/dist/lib/utils/async/retry.d.ts +141 -0
  292. package/dist/lib/utils/async/retry.js +172 -0
  293. package/dist/lib/utils/async/withTimeout.d.ts +73 -0
  294. package/dist/lib/utils/async/withTimeout.js +97 -0
  295. package/dist/lib/utils/fileDetector.d.ts +7 -1
  296. package/dist/lib/utils/fileDetector.js +91 -18
  297. package/dist/lib/utils/json/extract.d.ts +103 -0
  298. package/dist/lib/utils/json/extract.js +249 -0
  299. package/dist/lib/utils/json/index.d.ts +36 -0
  300. package/dist/lib/utils/json/index.js +37 -0
  301. package/dist/lib/utils/json/safeParse.d.ts +137 -0
  302. package/dist/lib/utils/json/safeParse.js +191 -0
  303. package/dist/lib/utils/messageBuilder.d.ts +2 -2
  304. package/dist/lib/utils/messageBuilder.js +15 -7
  305. package/dist/lib/utils/modelRouter.d.ts +4 -4
  306. package/dist/lib/utils/modelRouter.js +4 -4
  307. package/dist/lib/utils/sanitizers/filename.d.ts +137 -0
  308. package/dist/lib/utils/sanitizers/filename.js +366 -0
  309. package/dist/lib/utils/sanitizers/html.d.ts +170 -0
  310. package/dist/lib/utils/sanitizers/html.js +326 -0
  311. package/dist/lib/utils/sanitizers/index.d.ts +26 -0
  312. package/dist/lib/utils/sanitizers/index.js +30 -0
  313. package/dist/lib/utils/sanitizers/svg.d.ts +81 -0
  314. package/dist/lib/utils/sanitizers/svg.js +483 -0
  315. package/dist/mcp/index.d.ts +6 -5
  316. package/dist/mcp/index.js +7 -5
  317. package/dist/neurolink.d.ts +11 -13
  318. package/dist/neurolink.js +95 -29
  319. package/dist/processors/base/BaseFileProcessor.d.ts +273 -0
  320. package/dist/processors/base/BaseFileProcessor.js +613 -0
  321. package/dist/processors/base/index.d.ts +14 -0
  322. package/dist/processors/base/index.js +19 -0
  323. package/dist/processors/base/types.d.ts +593 -0
  324. package/dist/processors/base/types.js +76 -0
  325. package/dist/processors/cli/fileProcessorCli.d.ts +163 -0
  326. package/dist/processors/cli/fileProcessorCli.js +388 -0
  327. package/dist/processors/cli/index.d.ts +37 -0
  328. package/dist/processors/cli/index.js +49 -0
  329. package/dist/processors/code/ConfigProcessor.d.ts +171 -0
  330. package/dist/processors/code/ConfigProcessor.js +400 -0
  331. package/dist/processors/code/SourceCodeProcessor.d.ts +174 -0
  332. package/dist/processors/code/SourceCodeProcessor.js +304 -0
  333. package/dist/processors/code/index.d.ts +44 -0
  334. package/dist/processors/code/index.js +60 -0
  335. package/dist/processors/config/fileTypes.d.ts +283 -0
  336. package/dist/processors/config/fileTypes.js +520 -0
  337. package/dist/processors/config/index.d.ts +32 -0
  338. package/dist/processors/config/index.js +92 -0
  339. package/dist/processors/config/languageMap.d.ts +66 -0
  340. package/dist/processors/config/languageMap.js +410 -0
  341. package/dist/processors/config/mimeTypes.d.ts +376 -0
  342. package/dist/processors/config/mimeTypes.js +338 -0
  343. package/dist/processors/config/sizeLimits.d.ts +194 -0
  344. package/dist/processors/config/sizeLimits.js +246 -0
  345. package/dist/processors/data/JsonProcessor.d.ts +122 -0
  346. package/dist/processors/data/JsonProcessor.js +203 -0
  347. package/dist/processors/data/XmlProcessor.d.ts +160 -0
  348. package/dist/processors/data/XmlProcessor.js +283 -0
  349. package/dist/processors/data/YamlProcessor.d.ts +163 -0
  350. package/dist/processors/data/YamlProcessor.js +294 -0
  351. package/dist/processors/data/index.d.ts +49 -0
  352. package/dist/processors/data/index.js +76 -0
  353. package/dist/processors/document/ExcelProcessor.d.ts +238 -0
  354. package/dist/processors/document/ExcelProcessor.js +519 -0
  355. package/dist/processors/document/OpenDocumentProcessor.d.ts +69 -0
  356. package/dist/processors/document/OpenDocumentProcessor.js +210 -0
  357. package/dist/processors/document/RtfProcessor.d.ts +152 -0
  358. package/dist/processors/document/RtfProcessor.js +361 -0
  359. package/dist/processors/document/WordProcessor.d.ts +168 -0
  360. package/dist/processors/document/WordProcessor.js +353 -0
  361. package/dist/processors/document/index.d.ts +54 -0
  362. package/dist/processors/document/index.js +90 -0
  363. package/dist/processors/errors/FileErrorCode.d.ts +98 -0
  364. package/dist/processors/errors/FileErrorCode.js +255 -0
  365. package/dist/processors/errors/errorHelpers.d.ts +151 -0
  366. package/dist/processors/errors/errorHelpers.js +378 -0
  367. package/dist/processors/errors/errorSerializer.d.ts +139 -0
  368. package/dist/processors/errors/errorSerializer.js +507 -0
  369. package/dist/processors/errors/index.d.ts +46 -0
  370. package/dist/processors/errors/index.js +49 -0
  371. package/dist/processors/index.d.ts +76 -0
  372. package/dist/processors/index.js +112 -0
  373. package/dist/processors/integration/FileProcessorIntegration.d.ts +244 -0
  374. package/dist/processors/integration/FileProcessorIntegration.js +272 -0
  375. package/dist/processors/integration/index.d.ts +42 -0
  376. package/dist/processors/integration/index.js +44 -0
  377. package/dist/processors/markup/HtmlProcessor.d.ts +169 -0
  378. package/dist/processors/markup/HtmlProcessor.js +249 -0
  379. package/dist/processors/markup/MarkdownProcessor.d.ts +165 -0
  380. package/dist/processors/markup/MarkdownProcessor.js +244 -0
  381. package/dist/processors/markup/SvgProcessor.d.ts +156 -0
  382. package/dist/processors/markup/SvgProcessor.js +240 -0
  383. package/dist/processors/markup/TextProcessor.d.ts +135 -0
  384. package/dist/processors/markup/TextProcessor.js +188 -0
  385. package/dist/processors/markup/index.d.ts +66 -0
  386. package/dist/processors/markup/index.js +102 -0
  387. package/dist/processors/registry/ProcessorRegistry.d.ts +334 -0
  388. package/dist/processors/registry/ProcessorRegistry.js +608 -0
  389. package/dist/processors/registry/index.d.ts +12 -0
  390. package/dist/processors/registry/index.js +16 -0
  391. package/dist/processors/registry/types.d.ts +53 -0
  392. package/dist/processors/registry/types.js +10 -0
  393. package/dist/providers/amazonBedrock.d.ts +15 -2
  394. package/dist/providers/amazonBedrock.js +65 -8
  395. package/dist/providers/anthropic.d.ts +3 -3
  396. package/dist/providers/anthropic.js +10 -7
  397. package/dist/providers/googleAiStudio.d.ts +5 -5
  398. package/dist/providers/googleAiStudio.js +10 -7
  399. package/dist/providers/googleVertex.d.ts +16 -4
  400. package/dist/providers/googleVertex.js +72 -16
  401. package/dist/providers/litellm.d.ts +3 -3
  402. package/dist/providers/litellm.js +10 -10
  403. package/dist/providers/mistral.d.ts +3 -3
  404. package/dist/providers/mistral.js +7 -6
  405. package/dist/providers/ollama.d.ts +3 -4
  406. package/dist/providers/ollama.js +7 -8
  407. package/dist/providers/openAI.d.ts +14 -2
  408. package/dist/providers/openAI.js +60 -6
  409. package/dist/providers/openRouter.d.ts +2 -2
  410. package/dist/providers/openRouter.js +10 -6
  411. package/dist/rag/ChunkerFactory.d.ts +91 -0
  412. package/dist/rag/ChunkerFactory.js +320 -0
  413. package/dist/rag/ChunkerRegistry.d.ts +91 -0
  414. package/dist/rag/ChunkerRegistry.js +421 -0
  415. package/dist/rag/chunkers/BaseChunker.d.ts +53 -0
  416. package/dist/rag/chunkers/BaseChunker.js +143 -0
  417. package/dist/rag/chunkers/CharacterChunker.d.ts +18 -0
  418. package/dist/rag/chunkers/CharacterChunker.js +28 -0
  419. package/dist/rag/chunkers/HTMLChunker.d.ts +19 -0
  420. package/dist/rag/chunkers/HTMLChunker.js +38 -0
  421. package/dist/rag/chunkers/JSONChunker.d.ts +19 -0
  422. package/dist/rag/chunkers/JSONChunker.js +68 -0
  423. package/dist/rag/chunkers/LaTeXChunker.d.ts +15 -0
  424. package/dist/rag/chunkers/LaTeXChunker.js +63 -0
  425. package/dist/rag/chunkers/MarkdownChunker.d.ts +15 -0
  426. package/dist/rag/chunkers/MarkdownChunker.js +102 -0
  427. package/dist/rag/chunkers/RecursiveChunker.d.ts +27 -0
  428. package/dist/rag/chunkers/RecursiveChunker.js +139 -0
  429. package/dist/rag/chunkers/SemanticMarkdownChunker.d.ts +22 -0
  430. package/dist/rag/chunkers/SemanticMarkdownChunker.js +138 -0
  431. package/dist/rag/chunkers/SentenceChunker.d.ts +19 -0
  432. package/dist/rag/chunkers/SentenceChunker.js +66 -0
  433. package/dist/rag/chunkers/TokenChunker.d.ts +19 -0
  434. package/dist/rag/chunkers/TokenChunker.js +61 -0
  435. package/dist/rag/chunkers/index.d.ts +15 -0
  436. package/dist/rag/chunkers/index.js +15 -0
  437. package/dist/rag/chunking/characterChunker.d.ts +16 -0
  438. package/dist/rag/chunking/characterChunker.js +142 -0
  439. package/dist/rag/chunking/chunkerRegistry.d.ts +67 -0
  440. package/dist/rag/chunking/chunkerRegistry.js +194 -0
  441. package/dist/rag/chunking/htmlChunker.d.ts +34 -0
  442. package/dist/rag/chunking/htmlChunker.js +247 -0
  443. package/dist/rag/chunking/index.d.ts +15 -0
  444. package/dist/rag/chunking/index.js +17 -0
  445. package/dist/rag/chunking/jsonChunker.d.ts +20 -0
  446. package/dist/rag/chunking/jsonChunker.js +281 -0
  447. package/dist/rag/chunking/latexChunker.d.ts +26 -0
  448. package/dist/rag/chunking/latexChunker.js +251 -0
  449. package/dist/rag/chunking/markdownChunker.d.ts +19 -0
  450. package/dist/rag/chunking/markdownChunker.js +201 -0
  451. package/dist/rag/chunking/recursiveChunker.d.ts +19 -0
  452. package/dist/rag/chunking/recursiveChunker.js +148 -0
  453. package/dist/rag/chunking/semanticChunker.d.ts +41 -0
  454. package/dist/rag/chunking/semanticChunker.js +306 -0
  455. package/dist/rag/chunking/sentenceChunker.d.ts +25 -0
  456. package/dist/rag/chunking/sentenceChunker.js +230 -0
  457. package/dist/rag/chunking/tokenChunker.d.ts +36 -0
  458. package/dist/rag/chunking/tokenChunker.js +183 -0
  459. package/dist/rag/document/MDocument.d.ts +198 -0
  460. package/dist/rag/document/MDocument.js +392 -0
  461. package/dist/rag/document/index.d.ts +5 -0
  462. package/dist/rag/document/index.js +5 -0
  463. package/dist/rag/document/loaders.d.ts +201 -0
  464. package/dist/rag/document/loaders.js +500 -0
  465. package/dist/rag/errors/RAGError.d.ts +244 -0
  466. package/dist/rag/errors/RAGError.js +274 -0
  467. package/dist/rag/errors/index.d.ts +6 -0
  468. package/dist/rag/errors/index.js +6 -0
  469. package/dist/rag/graphRag/graphRAG.d.ts +115 -0
  470. package/dist/rag/graphRag/graphRAG.js +384 -0
  471. package/dist/rag/graphRag/index.d.ts +4 -0
  472. package/dist/rag/graphRag/index.js +4 -0
  473. package/dist/rag/index.d.ts +103 -0
  474. package/dist/rag/index.js +141 -0
  475. package/dist/rag/metadata/MetadataExtractorFactory.d.ts +157 -0
  476. package/dist/rag/metadata/MetadataExtractorFactory.js +418 -0
  477. package/dist/rag/metadata/MetadataExtractorRegistry.d.ts +99 -0
  478. package/dist/rag/metadata/MetadataExtractorRegistry.js +362 -0
  479. package/dist/rag/metadata/index.d.ts +6 -0
  480. package/dist/rag/metadata/index.js +9 -0
  481. package/dist/rag/metadata/metadataExtractor.d.ts +69 -0
  482. package/dist/rag/metadata/metadataExtractor.js +277 -0
  483. package/dist/rag/pipeline/RAGPipeline.d.ts +235 -0
  484. package/dist/rag/pipeline/RAGPipeline.js +401 -0
  485. package/dist/rag/pipeline/contextAssembly.d.ts +126 -0
  486. package/dist/rag/pipeline/contextAssembly.js +337 -0
  487. package/dist/rag/pipeline/index.d.ts +5 -0
  488. package/dist/rag/pipeline/index.js +5 -0
  489. package/dist/rag/ragIntegration.d.ts +38 -0
  490. package/dist/rag/ragIntegration.js +211 -0
  491. package/dist/rag/reranker/RerankerFactory.d.ts +184 -0
  492. package/dist/rag/reranker/RerankerFactory.js +430 -0
  493. package/dist/rag/reranker/RerankerRegistry.d.ts +119 -0
  494. package/dist/rag/reranker/RerankerRegistry.js +402 -0
  495. package/dist/rag/reranker/index.d.ts +6 -0
  496. package/dist/rag/reranker/index.js +9 -0
  497. package/dist/rag/reranker/reranker.d.ts +71 -0
  498. package/dist/rag/reranker/reranker.js +277 -0
  499. package/dist/rag/resilience/CircuitBreaker.d.ts +215 -0
  500. package/dist/rag/resilience/CircuitBreaker.js +431 -0
  501. package/dist/rag/resilience/RetryHandler.d.ts +115 -0
  502. package/dist/rag/resilience/RetryHandler.js +300 -0
  503. package/dist/rag/resilience/index.d.ts +7 -0
  504. package/dist/rag/resilience/index.js +7 -0
  505. package/dist/rag/retrieval/hybridSearch.d.ts +94 -0
  506. package/dist/rag/retrieval/hybridSearch.js +313 -0
  507. package/dist/rag/retrieval/index.d.ts +5 -0
  508. package/dist/rag/retrieval/index.js +5 -0
  509. package/dist/rag/retrieval/vectorQueryTool.d.ts +93 -0
  510. package/dist/rag/retrieval/vectorQueryTool.js +289 -0
  511. package/dist/rag/types.d.ts +768 -0
  512. package/dist/rag/types.js +8 -0
  513. package/dist/server/index.d.ts +15 -11
  514. package/dist/server/index.js +55 -51
  515. package/dist/server/utils/validation.d.ts +8 -8
  516. package/dist/types/common.d.ts +0 -1
  517. package/dist/types/fileTypes.d.ts +1 -1
  518. package/dist/types/generateTypes.d.ts +42 -8
  519. package/dist/types/generateTypes.js +1 -1
  520. package/dist/types/index.d.ts +25 -24
  521. package/dist/types/index.js +21 -20
  522. package/dist/types/modelTypes.d.ts +10 -10
  523. package/dist/types/pptTypes.d.ts +14 -2
  524. package/dist/types/pptTypes.js +16 -0
  525. package/dist/types/streamTypes.d.ts +28 -8
  526. package/dist/types/streamTypes.js +1 -1
  527. package/dist/utils/async/delay.d.ts +40 -0
  528. package/dist/utils/async/delay.js +42 -0
  529. package/dist/utils/async/index.d.ts +23 -0
  530. package/dist/utils/async/index.js +23 -0
  531. package/dist/utils/async/retry.d.ts +141 -0
  532. package/dist/utils/async/retry.js +171 -0
  533. package/dist/utils/async/withTimeout.d.ts +73 -0
  534. package/dist/utils/async/withTimeout.js +96 -0
  535. package/dist/utils/fileDetector.d.ts +7 -1
  536. package/dist/utils/fileDetector.js +91 -18
  537. package/dist/utils/json/extract.d.ts +103 -0
  538. package/dist/utils/json/extract.js +248 -0
  539. package/dist/utils/json/index.d.ts +36 -0
  540. package/dist/utils/json/index.js +36 -0
  541. package/dist/utils/json/safeParse.d.ts +137 -0
  542. package/dist/utils/json/safeParse.js +190 -0
  543. package/dist/utils/messageBuilder.d.ts +2 -2
  544. package/dist/utils/messageBuilder.js +15 -7
  545. package/dist/utils/modelRouter.d.ts +4 -4
  546. package/dist/utils/modelRouter.js +4 -4
  547. package/dist/utils/sanitizers/filename.d.ts +137 -0
  548. package/dist/utils/sanitizers/filename.js +365 -0
  549. package/dist/utils/sanitizers/html.d.ts +170 -0
  550. package/dist/utils/sanitizers/html.js +325 -0
  551. package/dist/utils/sanitizers/index.d.ts +26 -0
  552. package/dist/utils/sanitizers/index.js +29 -0
  553. package/dist/utils/sanitizers/svg.d.ts +81 -0
  554. package/dist/utils/sanitizers/svg.js +482 -0
  555. package/package.json +2 -2
@@ -0,0 +1,307 @@
1
+ /**
2
+ * Semantic Chunker
3
+ *
4
+ * LLM-powered semantic chunking that groups related content together.
5
+ * Uses embedding similarity to determine natural breakpoints.
6
+ * Best for complex documents where meaning should drive segmentation.
7
+ */
8
+ import { randomUUID } from "crypto";
9
+ import { ProviderFactory } from "../../factories/providerFactory.js";
10
+ import { logger } from "../../utils/logger.js";
11
+ /**
12
+ * Semantic chunker implementation
13
+ * Uses embedding similarity to find natural content boundaries
14
+ */
15
+ export class SemanticChunker {
16
+ strategy = "semantic";
17
+ async chunk(text, config) {
18
+ const { maxSize = 1000, overlap = 0, joinThreshold = 100, modelName = "text-embedding-3-small", provider = "openai", similarityThreshold = 0.7, trimWhitespace = true, metadata = {}, } = config || {};
19
+ const documentId = randomUUID();
20
+ const chunks = [];
21
+ if (!text || text.length === 0) {
22
+ return chunks;
23
+ }
24
+ // First, split into initial segments (paragraphs or sentences)
25
+ const segments = this.splitIntoSegments(text, joinThreshold);
26
+ if (segments.length <= 1) {
27
+ // Single segment, no need for semantic analysis
28
+ chunks.push({
29
+ id: randomUUID(),
30
+ text: trimWhitespace ? text.trim() : text,
31
+ metadata: {
32
+ documentId,
33
+ chunkIndex: 0,
34
+ totalChunks: 1,
35
+ startPosition: 0,
36
+ endPosition: text.length,
37
+ documentType: "text",
38
+ custom: metadata,
39
+ },
40
+ });
41
+ return chunks;
42
+ }
43
+ try {
44
+ // Get embeddings for each segment
45
+ const embeddings = await this.getEmbeddings(segments, provider, modelName);
46
+ // Find semantic breakpoints
47
+ const breakpoints = this.findSemanticBreakpoints(embeddings, similarityThreshold);
48
+ // Group segments by semantic similarity
49
+ const groups = this.groupSegments(segments, breakpoints, maxSize);
50
+ // Create chunks from groups
51
+ let chunkIndex = 0;
52
+ let currentPosition = 0;
53
+ for (const group of groups) {
54
+ const chunkText = group.join("\n\n");
55
+ const finalText = trimWhitespace ? chunkText.trim() : chunkText;
56
+ if (finalText.length > 0) {
57
+ chunks.push({
58
+ id: randomUUID(),
59
+ text: finalText,
60
+ metadata: {
61
+ documentId,
62
+ chunkIndex,
63
+ startPosition: currentPosition,
64
+ endPosition: currentPosition + chunkText.length,
65
+ documentType: "text",
66
+ custom: {
67
+ ...metadata,
68
+ segmentCount: group.length,
69
+ },
70
+ },
71
+ });
72
+ chunkIndex++;
73
+ }
74
+ currentPosition += chunkText.length + 2; // +2 for separator
75
+ }
76
+ // Handle overlap if configured
77
+ if (overlap > 0) {
78
+ chunks.forEach((chunk, i) => {
79
+ if (i > 0) {
80
+ // Add overlap from previous chunk
81
+ const prevText = chunks[i - 1].text;
82
+ const overlapText = prevText.slice(-overlap);
83
+ chunk.text = overlapText + "\n" + chunk.text;
84
+ }
85
+ });
86
+ }
87
+ }
88
+ catch (error) {
89
+ // Fallback to simple chunking if embeddings fail
90
+ logger.warn("[SemanticChunker] Embedding failed, falling back to simple chunking", {
91
+ error: error instanceof Error ? error.message : String(error),
92
+ });
93
+ return this.fallbackChunk(text, maxSize, overlap, documentId, metadata, trimWhitespace);
94
+ }
95
+ // Update total chunks count
96
+ chunks.forEach((chunk) => {
97
+ chunk.metadata.totalChunks = chunks.length;
98
+ });
99
+ return chunks;
100
+ }
101
+ /**
102
+ * Split text into initial segments for embedding
103
+ */
104
+ splitIntoSegments(text, minSize) {
105
+ const segments = [];
106
+ // Split by double newlines (paragraphs)
107
+ const paragraphs = text.split(/\n\n+/);
108
+ let currentSegment = "";
109
+ for (const paragraph of paragraphs) {
110
+ const trimmed = paragraph.trim();
111
+ if (trimmed.length === 0) {
112
+ continue;
113
+ }
114
+ if (currentSegment.length === 0) {
115
+ currentSegment = trimmed;
116
+ }
117
+ else if (currentSegment.length + trimmed.length < minSize) {
118
+ // Join small paragraphs
119
+ currentSegment += "\n\n" + trimmed;
120
+ }
121
+ else {
122
+ // Save current and start new
123
+ if (currentSegment.length > 0) {
124
+ segments.push(currentSegment);
125
+ }
126
+ currentSegment = trimmed;
127
+ }
128
+ }
129
+ // Don't forget the last segment
130
+ if (currentSegment.length > 0) {
131
+ segments.push(currentSegment);
132
+ }
133
+ return segments;
134
+ }
135
+ /**
136
+ * Get embeddings for segments
137
+ */
138
+ async getEmbeddings(segments, provider, modelName) {
139
+ const embeddingProvider = await ProviderFactory.createProvider(provider, modelName);
140
+ // Check if provider has embed method
141
+ if (typeof embeddingProvider.embed !==
142
+ "function") {
143
+ throw new Error(`Provider ${provider} does not support embeddings`);
144
+ }
145
+ const embeddings = [];
146
+ // Process in batches to avoid rate limits
147
+ const batchSize = 10;
148
+ for (let i = 0; i < segments.length; i += batchSize) {
149
+ const batch = segments.slice(i, i + batchSize);
150
+ for (const segment of batch) {
151
+ try {
152
+ const embedding = await embeddingProvider.embed(segment);
153
+ embeddings.push(embedding);
154
+ }
155
+ catch (error) {
156
+ logger.warn("[SemanticChunker] Failed to embed segment", {
157
+ error: error instanceof Error ? error.message : String(error),
158
+ });
159
+ // Use zero vector as fallback
160
+ embeddings.push(new Array(1536).fill(0));
161
+ }
162
+ }
163
+ }
164
+ return embeddings;
165
+ }
166
+ /**
167
+ * Find semantic breakpoints using cosine similarity
168
+ */
169
+ findSemanticBreakpoints(embeddings, threshold) {
170
+ const breakpoints = [];
171
+ for (let i = 1; i < embeddings.length; i++) {
172
+ const similarity = this.cosineSimilarity(embeddings[i - 1], embeddings[i]);
173
+ // If similarity is below threshold, it's a breakpoint
174
+ if (similarity < threshold) {
175
+ breakpoints.push(i);
176
+ }
177
+ }
178
+ return breakpoints;
179
+ }
180
+ /**
181
+ * Group segments based on breakpoints and size limits
182
+ */
183
+ groupSegments(segments, breakpoints, maxSize) {
184
+ const groups = [];
185
+ let currentGroup = [];
186
+ let currentSize = 0;
187
+ let breakpointIndex = 0;
188
+ for (let i = 0; i < segments.length; i++) {
189
+ const segment = segments[i];
190
+ const segmentSize = segment.length;
191
+ // Check if we're at a breakpoint or exceeding size
192
+ const isBreakpoint = breakpointIndex < breakpoints.length &&
193
+ breakpoints[breakpointIndex] === i;
194
+ if ((currentSize + segmentSize > maxSize && currentGroup.length > 0) ||
195
+ (isBreakpoint && currentGroup.length > 0)) {
196
+ // Save current group
197
+ groups.push(currentGroup);
198
+ currentGroup = [];
199
+ currentSize = 0;
200
+ }
201
+ if (isBreakpoint) {
202
+ breakpointIndex++;
203
+ }
204
+ currentGroup.push(segment);
205
+ currentSize += segmentSize;
206
+ }
207
+ // Don't forget the last group
208
+ if (currentGroup.length > 0) {
209
+ groups.push(currentGroup);
210
+ }
211
+ return groups;
212
+ }
213
+ /**
214
+ * Calculate cosine similarity between two vectors
215
+ */
216
+ cosineSimilarity(a, b) {
217
+ if (a.length !== b.length) {
218
+ return 0;
219
+ }
220
+ let dotProduct = 0;
221
+ let normA = 0;
222
+ let normB = 0;
223
+ for (let i = 0; i < a.length; i++) {
224
+ dotProduct += a[i] * b[i];
225
+ normA += a[i] * a[i];
226
+ normB += b[i] * b[i];
227
+ }
228
+ const denominator = Math.sqrt(normA) * Math.sqrt(normB);
229
+ return denominator === 0 ? 0 : dotProduct / denominator;
230
+ }
231
+ /**
232
+ * Fallback to simple chunking when embeddings fail
233
+ */
234
+ fallbackChunk(text, maxSize, overlap, documentId, metadata, trimWhitespace) {
235
+ const effectiveMaxSize = Math.max(maxSize, 1);
236
+ const effectiveOverlap = Math.min(Math.max(overlap, 0), effectiveMaxSize - 1);
237
+ const chunks = [];
238
+ let start = 0;
239
+ let chunkIndex = 0;
240
+ while (start < text.length) {
241
+ let end = Math.min(start + effectiveMaxSize, text.length);
242
+ // Try to break at paragraph boundary
243
+ if (end < text.length) {
244
+ const searchStart = Math.max(start, end - 200);
245
+ const searchText = text.slice(searchStart, end);
246
+ const paragraphBreak = searchText.lastIndexOf("\n\n");
247
+ if (paragraphBreak > 0) {
248
+ end = searchStart + paragraphBreak;
249
+ }
250
+ }
251
+ const chunkText = text.slice(start, end);
252
+ const finalText = trimWhitespace ? chunkText.trim() : chunkText;
253
+ if (finalText.length > 0) {
254
+ chunks.push({
255
+ id: randomUUID(),
256
+ text: finalText,
257
+ metadata: {
258
+ documentId,
259
+ chunkIndex,
260
+ startPosition: start,
261
+ endPosition: end,
262
+ documentType: "text",
263
+ custom: {
264
+ ...metadata,
265
+ fallbackChunking: true,
266
+ },
267
+ },
268
+ });
269
+ chunkIndex++;
270
+ }
271
+ start = Math.max(start + 1, end - effectiveOverlap);
272
+ }
273
+ return chunks;
274
+ }
275
+ validateConfig(config) {
276
+ const errors = [];
277
+ const warnings = [];
278
+ const semConfig = config;
279
+ if (semConfig.maxSize !== undefined && semConfig.maxSize <= 0) {
280
+ errors.push("maxSize must be greater than 0");
281
+ }
282
+ if (semConfig.overlap !== undefined && semConfig.overlap < 0) {
283
+ errors.push("overlap must be non-negative");
284
+ }
285
+ if (semConfig.overlap !== undefined &&
286
+ semConfig.maxSize !== undefined &&
287
+ semConfig.overlap >= semConfig.maxSize) {
288
+ errors.push("overlap must be less than maxSize");
289
+ }
290
+ if (semConfig.similarityThreshold !== undefined) {
291
+ if (semConfig.similarityThreshold < 0 ||
292
+ semConfig.similarityThreshold > 1) {
293
+ errors.push("similarityThreshold must be between 0 and 1");
294
+ }
295
+ }
296
+ if (semConfig.joinThreshold !== undefined && semConfig.joinThreshold < 0) {
297
+ errors.push("joinThreshold must be non-negative");
298
+ }
299
+ warnings.push("Semantic chunking requires an embedding provider. Ensure API credentials are configured.");
300
+ return {
301
+ valid: errors.length === 0,
302
+ errors,
303
+ warnings,
304
+ };
305
+ }
306
+ }
307
+ //# sourceMappingURL=semanticChunker.js.map
@@ -0,0 +1,25 @@
1
+ /**
2
+ * Sentence-based Chunker
3
+ *
4
+ * Splits text based on sentence boundaries while respecting size limits.
5
+ * Best for prose and natural language content where sentence integrity matters.
6
+ */
7
+ import type { BaseChunkerConfig, Chunk, Chunker, ChunkerValidationResult, SentenceChunkerConfig } from "../types.js";
8
/**
 * Chunker that keeps sentences intact.
 *
 * Text is first split into sentences, which are then packed into chunks
 * subject to the size and sentence-count settings in the configuration.
 */
export declare class SentenceChunker implements Chunker {
    /** Strategy identifier for this chunker. */
    readonly strategy: "sentence";
    /** Sentence terminators applied when the configuration supplies none. */
    private readonly defaultSentenceEnders;
    /**
     * Split `text` into sentence-aligned chunks.
     *
     * @param text - Text to chunk.
     * @param config - Optional sentence chunker settings.
     * @returns A promise resolving to the produced chunks.
     */
    chunk(text: string, config?: SentenceChunkerConfig): Promise<Chunk[]>;
    /**
     * Break text into sentences at the configured sentence enders.
     */
    private splitIntoSentences;
    /**
     * Break one oversized sentence into smaller pieces.
     */
    private splitLargeSentence;
    /**
     * Check a configuration and report errors and warnings.
     */
    validateConfig(config: BaseChunkerConfig): ChunkerValidationResult;
}
@@ -0,0 +1,231 @@
1
+ /**
2
+ * Sentence-based Chunker
3
+ *
4
+ * Splits text based on sentence boundaries while respecting size limits.
5
+ * Best for prose and natural language content where sentence integrity matters.
6
+ */
7
+ import { randomUUID } from "crypto";
8
+ /**
9
+ * Sentence-aware chunker implementation
10
+ * Splits text by sentences while respecting size constraints
11
+ */
12
+ export class SentenceChunker {
13
+ strategy = "sentence";
14
+ defaultSentenceEnders = [".", "!", "?"];
15
+ async chunk(text, config) {
16
+ const { maxSize = 1000, overlap = 0, sentenceEnders = this.defaultSentenceEnders, minSentences = 1, maxSentences, trimWhitespace = true, metadata = {}, } = config || {};
17
+ const chunks = [];
18
+ const documentId = randomUUID();
19
+ if (!text || text.length === 0) {
20
+ return chunks;
21
+ }
22
+ // Split text into sentences
23
+ const sentences = this.splitIntoSentences(text, sentenceEnders);
24
+ if (sentences.length === 0) {
25
+ return chunks;
26
+ }
27
+ let currentChunkSentences = [];
28
+ let currentChunkLength = 0;
29
+ let chunkIndex = 0;
30
+ let startPosition = 0;
31
+ let currentPosition = 0;
32
+ for (let i = 0; i < sentences.length; i++) {
33
+ const sentence = sentences[i];
34
+ const sentenceLength = sentence.length;
35
+ // Check if adding this sentence would exceed limits
36
+ const wouldExceedSize = currentChunkLength + sentenceLength + 1 > maxSize;
37
+ const wouldExceedSentences = maxSentences !== undefined &&
38
+ currentChunkSentences.length >= maxSentences;
39
+ if (currentChunkSentences.length > 0 &&
40
+ (wouldExceedSize || wouldExceedSentences)) {
41
+ // Save current chunk if it meets minimum requirements
42
+ if (currentChunkSentences.length >= minSentences) {
43
+ const chunkText = currentChunkSentences.join(" ");
44
+ const finalText = trimWhitespace ? chunkText.trim() : chunkText;
45
+ if (finalText.length > 0) {
46
+ chunks.push({
47
+ id: randomUUID(),
48
+ text: finalText,
49
+ metadata: {
50
+ documentId,
51
+ chunkIndex,
52
+ startPosition,
53
+ endPosition: startPosition + chunkText.length,
54
+ documentType: "text",
55
+ custom: metadata,
56
+ },
57
+ });
58
+ chunkIndex++;
59
+ }
60
+ }
61
+ // Handle overlap by keeping some sentences
62
+ if (overlap > 0 && currentChunkSentences.length > 0) {
63
+ // Calculate how many sentences to keep for overlap
64
+ let overlapLength = 0;
65
+ const overlapSentences = [];
66
+ for (let j = currentChunkSentences.length - 1; j >= 0; j--) {
67
+ const s = currentChunkSentences[j];
68
+ if (overlapLength + s.length + 1 <= overlap) {
69
+ overlapSentences.unshift(s);
70
+ overlapLength += s.length + 1;
71
+ }
72
+ else {
73
+ break;
74
+ }
75
+ }
76
+ currentChunkSentences = overlapSentences;
77
+ currentChunkLength = overlapLength;
78
+ startPosition = currentPosition - overlapLength;
79
+ }
80
+ else {
81
+ currentChunkSentences = [];
82
+ currentChunkLength = 0;
83
+ startPosition = currentPosition;
84
+ }
85
+ }
86
+ // Handle sentences larger than maxSize
87
+ if (sentenceLength > maxSize) {
88
+ // Split the sentence itself if necessary
89
+ const subChunks = this.splitLargeSentence(sentence, maxSize);
90
+ for (const subChunk of subChunks) {
91
+ chunks.push({
92
+ id: randomUUID(),
93
+ text: trimWhitespace ? subChunk.trim() : subChunk,
94
+ metadata: {
95
+ documentId,
96
+ chunkIndex,
97
+ startPosition: currentPosition,
98
+ endPosition: currentPosition + subChunk.length,
99
+ documentType: "text",
100
+ custom: metadata,
101
+ },
102
+ });
103
+ chunkIndex++;
104
+ currentPosition += subChunk.length;
105
+ }
106
+ startPosition = currentPosition;
107
+ }
108
+ else {
109
+ currentChunkSentences.push(sentence);
110
+ currentChunkLength += sentenceLength + 1; // +1 for space
111
+ currentPosition += sentenceLength + 1;
112
+ }
113
+ }
114
+ // Don't forget the last chunk
115
+ if (currentChunkSentences.length >= minSentences) {
116
+ const chunkText = currentChunkSentences.join(" ");
117
+ const finalText = trimWhitespace ? chunkText.trim() : chunkText;
118
+ if (finalText.length > 0) {
119
+ chunks.push({
120
+ id: randomUUID(),
121
+ text: finalText,
122
+ metadata: {
123
+ documentId,
124
+ chunkIndex,
125
+ startPosition,
126
+ endPosition: startPosition + chunkText.length,
127
+ documentType: "text",
128
+ custom: metadata,
129
+ },
130
+ });
131
+ }
132
+ }
133
+ // Update total chunks count
134
+ chunks.forEach((chunk) => {
135
+ chunk.metadata.totalChunks = chunks.length;
136
+ });
137
+ return chunks;
138
+ }
139
+ /**
140
+ * Split text into sentences based on sentence enders
141
+ */
142
+ splitIntoSentences(text, sentenceEnders) {
143
+ const sentences = [];
144
+ // Build regex pattern for sentence splitting
145
+ // Look for sentence enders followed by whitespace or end of string
146
+ const pattern = new RegExp(`([${sentenceEnders.map((e) => "\\" + e).join("")}]+)(?=\\s|$)`, "g");
147
+ let lastIndex = 0;
148
+ let match;
149
+ // Reset regex state
150
+ pattern.lastIndex = 0;
151
+ while ((match = pattern.exec(text)) !== null) {
152
+ const endIndex = match.index + match[0].length;
153
+ const sentence = text.slice(lastIndex, endIndex).trim();
154
+ if (sentence.length > 0) {
155
+ sentences.push(sentence);
156
+ }
157
+ lastIndex = endIndex;
158
+ // Skip whitespace
159
+ while (lastIndex < text.length && /\s/.test(text[lastIndex])) {
160
+ lastIndex++;
161
+ }
162
+ }
163
+ // Don't forget the last part
164
+ if (lastIndex < text.length) {
165
+ const remaining = text.slice(lastIndex).trim();
166
+ if (remaining.length > 0) {
167
+ sentences.push(remaining);
168
+ }
169
+ }
170
+ return sentences;
171
+ }
172
+ /**
173
+ * Split a large sentence into smaller chunks
174
+ */
175
+ splitLargeSentence(sentence, maxSize) {
176
+ const chunks = [];
177
+ const words = sentence.split(/\s+/);
178
+ let currentChunk = "";
179
+ for (const word of words) {
180
+ if (currentChunk.length + word.length + 1 <= maxSize) {
181
+ currentChunk = currentChunk ? currentChunk + " " + word : word;
182
+ }
183
+ else {
184
+ if (currentChunk.length > 0) {
185
+ chunks.push(currentChunk);
186
+ }
187
+ // If a single word is larger than maxSize, we have to include it anyway
188
+ currentChunk = word;
189
+ }
190
+ }
191
+ if (currentChunk.length > 0) {
192
+ chunks.push(currentChunk);
193
+ }
194
+ return chunks;
195
+ }
196
+ validateConfig(config) {
197
+ const errors = [];
198
+ const warnings = [];
199
+ const sentConfig = config;
200
+ if (sentConfig.maxSize !== undefined && sentConfig.maxSize <= 0) {
201
+ errors.push("maxSize must be greater than 0");
202
+ }
203
+ if (sentConfig.overlap !== undefined && sentConfig.overlap < 0) {
204
+ errors.push("overlap must be non-negative");
205
+ }
206
+ if (sentConfig.overlap !== undefined &&
207
+ sentConfig.maxSize !== undefined &&
208
+ sentConfig.overlap >= sentConfig.maxSize) {
209
+ errors.push("overlap must be less than maxSize");
210
+ }
211
+ if (sentConfig.minSentences !== undefined && sentConfig.minSentences < 1) {
212
+ errors.push("minSentences must be at least 1");
213
+ }
214
+ if (sentConfig.maxSentences !== undefined &&
215
+ sentConfig.minSentences !== undefined) {
216
+ if (sentConfig.maxSentences < sentConfig.minSentences) {
217
+ errors.push("maxSentences must be >= minSentences");
218
+ }
219
+ }
220
+ if (sentConfig.sentenceEnders !== undefined &&
221
+ sentConfig.sentenceEnders.length === 0) {
222
+ warnings.push("No sentence enders specified, using defaults");
223
+ }
224
+ return {
225
+ valid: errors.length === 0,
226
+ errors,
227
+ warnings,
228
+ };
229
+ }
230
+ }
231
+ //# sourceMappingURL=sentenceChunker.js.map
@@ -0,0 +1,36 @@
1
+ /**
2
+ * Token-based Chunker
3
+ *
4
+ * Splits text based on token counts using simple tokenization.
5
+ * Best for controlling context window usage with LLMs.
6
+ */
7
+ import type { Chunker, Chunk, ChunkerValidationResult, TokenChunkerConfig, BaseChunkerConfig } from "../types.js";
8
/**
 * Chunker that sizes chunks by approximate token count.
 *
 * Token counts are an approximation based on simple word-based tokenization.
 * When exact counts matter, integrate tiktoken or a model-specific tokenizer.
 */
export declare class TokenChunker implements Chunker {
    /** Strategy identifier for this chunker. */
    readonly strategy: "token";
    private readonly CHARS_PER_TOKEN;
    /**
     * Split `text` into chunks.
     *
     * @param text - Text to chunk.
     * @param config - Optional token chunker settings.
     * @returns A promise resolving to the produced chunks.
     */
    chunk(text: string, config?: TokenChunkerConfig): Promise<Chunk[]>;
    /**
     * Word-based tokenization (simple approximation).
     */
    private tokenize;
    /**
     * Look up the characters-per-token figure for a tokenizer.
     */
    private getCharsPerToken;
    /**
     * Estimate the average number of tokens per word.
     */
    private estimateTokensPerWord;
    /**
     * Estimate how many tokens `text` contains.
     *
     * @param text - Text to measure.
     * @param tokenizer - Optional tokenizer name.
     * @returns The estimated token count.
     */
    estimateTokenCount(text: string, tokenizer?: string): number;
    /**
     * Check a configuration and report errors and warnings.
     */
    validateConfig(config: BaseChunkerConfig): ChunkerValidationResult;
}