rag-lite-ts 2.1.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (328) hide show
  1. package/README.md +88 -5
  2. package/dist/{cli → cjs/cli}/indexer.js +73 -15
  3. package/dist/cjs/cli/ui-server.d.ts +5 -0
  4. package/dist/cjs/cli/ui-server.js +152 -0
  5. package/dist/{cli.js → cjs/cli.js} +25 -6
  6. package/dist/{core → cjs/core}/binary-index-format.js +6 -3
  7. package/dist/{core → cjs/core}/db.d.ts +56 -0
  8. package/dist/{core → cjs/core}/db.js +105 -0
  9. package/dist/{core → cjs/core}/ingestion.js +3 -0
  10. package/dist/cjs/core/knowledge-base-manager.d.ts +109 -0
  11. package/dist/cjs/core/knowledge-base-manager.js +256 -0
  12. package/dist/{core → cjs/core}/model-validator.js +1 -1
  13. package/dist/{core → cjs/core}/search-pipeline.js +1 -1
  14. package/dist/{core → cjs/core}/search.js +1 -1
  15. package/dist/cjs/core/vector-index-messages.d.ts +52 -0
  16. package/dist/cjs/core/vector-index-messages.js +5 -0
  17. package/dist/cjs/core/vector-index-worker.d.ts +6 -0
  18. package/dist/cjs/core/vector-index-worker.js +304 -0
  19. package/dist/cjs/core/vector-index.d.ts +107 -0
  20. package/dist/cjs/core/vector-index.js +344 -0
  21. package/dist/{factories → cjs/factories}/ingestion-factory.js +3 -7
  22. package/dist/{factories → cjs/factories}/search-factory.js +11 -0
  23. package/dist/{index-manager.d.ts → cjs/index-manager.d.ts} +23 -3
  24. package/dist/{index-manager.js → cjs/index-manager.js} +84 -15
  25. package/dist/{index.d.ts → cjs/index.d.ts} +2 -1
  26. package/dist/{index.js → cjs/index.js} +3 -1
  27. package/dist/esm/api-errors.d.ts +90 -0
  28. package/dist/esm/api-errors.js +320 -0
  29. package/dist/esm/cli/indexer.d.ts +11 -0
  30. package/dist/esm/cli/indexer.js +529 -0
  31. package/dist/esm/cli/search.d.ts +7 -0
  32. package/dist/esm/cli/search.js +332 -0
  33. package/dist/esm/cli/ui-server.d.ts +5 -0
  34. package/dist/esm/cli/ui-server.js +152 -0
  35. package/dist/esm/cli.d.ts +3 -0
  36. package/dist/esm/cli.js +548 -0
  37. package/dist/esm/config.d.ts +51 -0
  38. package/dist/esm/config.js +79 -0
  39. package/dist/esm/core/abstract-embedder.d.ts +125 -0
  40. package/dist/esm/core/abstract-embedder.js +264 -0
  41. package/dist/esm/core/actionable-error-messages.d.ts +60 -0
  42. package/dist/esm/core/actionable-error-messages.js +397 -0
  43. package/dist/esm/core/adapters.d.ts +93 -0
  44. package/dist/esm/core/adapters.js +139 -0
  45. package/dist/esm/core/batch-processing-optimizer.d.ts +155 -0
  46. package/dist/esm/core/batch-processing-optimizer.js +536 -0
  47. package/dist/esm/core/binary-index-format.d.ts +78 -0
  48. package/dist/esm/core/binary-index-format.js +294 -0
  49. package/dist/esm/core/chunker.d.ts +119 -0
  50. package/dist/esm/core/chunker.js +73 -0
  51. package/dist/esm/core/cli-database-utils.d.ts +53 -0
  52. package/dist/esm/core/cli-database-utils.js +239 -0
  53. package/dist/esm/core/config.d.ts +102 -0
  54. package/dist/esm/core/config.js +247 -0
  55. package/dist/esm/core/content-errors.d.ts +111 -0
  56. package/dist/esm/core/content-errors.js +362 -0
  57. package/dist/esm/core/content-manager.d.ts +335 -0
  58. package/dist/esm/core/content-manager.js +1476 -0
  59. package/dist/esm/core/content-performance-optimizer.d.ts +150 -0
  60. package/dist/esm/core/content-performance-optimizer.js +516 -0
  61. package/dist/esm/core/content-resolver.d.ts +104 -0
  62. package/dist/esm/core/content-resolver.js +285 -0
  63. package/dist/esm/core/cross-modal-search.d.ts +164 -0
  64. package/dist/esm/core/cross-modal-search.js +342 -0
  65. package/dist/esm/core/database-connection-manager.d.ts +109 -0
  66. package/dist/esm/core/database-connection-manager.js +310 -0
  67. package/dist/esm/core/db.d.ts +269 -0
  68. package/dist/esm/core/db.js +1000 -0
  69. package/dist/esm/core/embedder-factory.d.ts +154 -0
  70. package/dist/esm/core/embedder-factory.js +311 -0
  71. package/dist/esm/core/error-handler.d.ts +112 -0
  72. package/dist/esm/core/error-handler.js +239 -0
  73. package/dist/esm/core/index.d.ts +59 -0
  74. package/dist/esm/core/index.js +69 -0
  75. package/dist/esm/core/ingestion.d.ts +202 -0
  76. package/dist/esm/core/ingestion.js +904 -0
  77. package/dist/esm/core/interfaces.d.ts +408 -0
  78. package/dist/esm/core/interfaces.js +106 -0
  79. package/dist/esm/core/knowledge-base-manager.d.ts +109 -0
  80. package/dist/esm/core/knowledge-base-manager.js +256 -0
  81. package/dist/esm/core/lazy-dependency-loader.d.ts +147 -0
  82. package/dist/esm/core/lazy-dependency-loader.js +435 -0
  83. package/dist/esm/core/mode-detection-service.d.ts +150 -0
  84. package/dist/esm/core/mode-detection-service.js +565 -0
  85. package/dist/esm/core/mode-model-validator.d.ts +92 -0
  86. package/dist/esm/core/mode-model-validator.js +203 -0
  87. package/dist/esm/core/model-registry.d.ts +116 -0
  88. package/dist/esm/core/model-registry.js +411 -0
  89. package/dist/esm/core/model-validator.d.ts +217 -0
  90. package/dist/esm/core/model-validator.js +782 -0
  91. package/dist/esm/core/path-manager.d.ts +47 -0
  92. package/dist/esm/core/path-manager.js +71 -0
  93. package/dist/esm/core/raglite-paths.d.ts +121 -0
  94. package/dist/esm/core/raglite-paths.js +145 -0
  95. package/dist/esm/core/reranking-config.d.ts +42 -0
  96. package/dist/esm/core/reranking-config.js +147 -0
  97. package/dist/esm/core/reranking-factory.d.ts +92 -0
  98. package/dist/esm/core/reranking-factory.js +410 -0
  99. package/dist/esm/core/reranking-strategies.d.ts +310 -0
  100. package/dist/esm/core/reranking-strategies.js +650 -0
  101. package/dist/esm/core/resource-cleanup.d.ts +163 -0
  102. package/dist/esm/core/resource-cleanup.js +371 -0
  103. package/dist/esm/core/resource-manager.d.ts +212 -0
  104. package/dist/esm/core/resource-manager.js +564 -0
  105. package/dist/esm/core/search-pipeline.d.ts +111 -0
  106. package/dist/esm/core/search-pipeline.js +287 -0
  107. package/dist/esm/core/search.d.ts +141 -0
  108. package/dist/esm/core/search.js +320 -0
  109. package/dist/esm/core/streaming-operations.d.ts +145 -0
  110. package/dist/esm/core/streaming-operations.js +409 -0
  111. package/dist/esm/core/types.d.ts +66 -0
  112. package/dist/esm/core/types.js +6 -0
  113. package/dist/esm/core/universal-embedder.d.ts +177 -0
  114. package/dist/esm/core/universal-embedder.js +139 -0
  115. package/dist/esm/core/validation-messages.d.ts +99 -0
  116. package/dist/esm/core/validation-messages.js +334 -0
  117. package/dist/esm/core/vector-index-messages.d.ts +52 -0
  118. package/dist/esm/core/vector-index-messages.js +5 -0
  119. package/dist/esm/core/vector-index-worker.d.ts +6 -0
  120. package/dist/esm/core/vector-index-worker.js +304 -0
  121. package/dist/esm/core/vector-index.d.ts +107 -0
  122. package/dist/esm/core/vector-index.js +344 -0
  123. package/dist/esm/dom-polyfills.d.ts +6 -0
  124. package/dist/esm/dom-polyfills.js +37 -0
  125. package/dist/esm/factories/index.d.ts +27 -0
  126. package/dist/esm/factories/index.js +29 -0
  127. package/dist/esm/factories/ingestion-factory.d.ts +200 -0
  128. package/dist/esm/factories/ingestion-factory.js +473 -0
  129. package/dist/esm/factories/search-factory.d.ts +154 -0
  130. package/dist/esm/factories/search-factory.js +355 -0
  131. package/dist/esm/file-processor.d.ts +147 -0
  132. package/dist/esm/file-processor.js +963 -0
  133. package/dist/esm/index-manager.d.ts +136 -0
  134. package/dist/esm/index-manager.js +667 -0
  135. package/dist/esm/index.d.ts +76 -0
  136. package/dist/esm/index.js +112 -0
  137. package/dist/esm/indexer.d.ts +7 -0
  138. package/dist/esm/indexer.js +54 -0
  139. package/dist/esm/ingestion.d.ts +63 -0
  140. package/dist/esm/ingestion.js +124 -0
  141. package/dist/esm/mcp-server.d.ts +46 -0
  142. package/dist/esm/mcp-server.js +1820 -0
  143. package/dist/esm/multimodal/clip-embedder.d.ts +327 -0
  144. package/dist/esm/multimodal/clip-embedder.js +996 -0
  145. package/dist/esm/multimodal/index.d.ts +6 -0
  146. package/dist/esm/multimodal/index.js +6 -0
  147. package/dist/esm/preprocess.d.ts +19 -0
  148. package/dist/esm/preprocess.js +203 -0
  149. package/dist/esm/preprocessors/index.d.ts +17 -0
  150. package/dist/esm/preprocessors/index.js +38 -0
  151. package/dist/esm/preprocessors/mdx.d.ts +25 -0
  152. package/dist/esm/preprocessors/mdx.js +101 -0
  153. package/dist/esm/preprocessors/mermaid.d.ts +68 -0
  154. package/dist/esm/preprocessors/mermaid.js +329 -0
  155. package/dist/esm/preprocessors/registry.d.ts +56 -0
  156. package/dist/esm/preprocessors/registry.js +179 -0
  157. package/dist/esm/run-error-recovery-tests.d.ts +7 -0
  158. package/dist/esm/run-error-recovery-tests.js +101 -0
  159. package/dist/esm/search-standalone.d.ts +7 -0
  160. package/dist/esm/search-standalone.js +117 -0
  161. package/dist/esm/search.d.ts +99 -0
  162. package/dist/esm/search.js +177 -0
  163. package/dist/esm/test-utils.d.ts +18 -0
  164. package/dist/esm/test-utils.js +27 -0
  165. package/dist/esm/text/chunker.d.ts +33 -0
  166. package/dist/esm/text/chunker.js +279 -0
  167. package/dist/esm/text/embedder.d.ts +111 -0
  168. package/dist/esm/text/embedder.js +386 -0
  169. package/dist/esm/text/index.d.ts +8 -0
  170. package/dist/esm/text/index.js +9 -0
  171. package/dist/esm/text/preprocessors/index.d.ts +17 -0
  172. package/dist/esm/text/preprocessors/index.js +38 -0
  173. package/dist/esm/text/preprocessors/mdx.d.ts +25 -0
  174. package/dist/esm/text/preprocessors/mdx.js +101 -0
  175. package/dist/esm/text/preprocessors/mermaid.d.ts +68 -0
  176. package/dist/esm/text/preprocessors/mermaid.js +330 -0
  177. package/dist/esm/text/preprocessors/registry.d.ts +56 -0
  178. package/dist/esm/text/preprocessors/registry.js +180 -0
  179. package/dist/esm/text/reranker.d.ts +49 -0
  180. package/dist/esm/text/reranker.js +274 -0
  181. package/dist/esm/text/sentence-transformer-embedder.d.ts +96 -0
  182. package/dist/esm/text/sentence-transformer-embedder.js +340 -0
  183. package/dist/esm/text/tokenizer.d.ts +22 -0
  184. package/dist/esm/text/tokenizer.js +64 -0
  185. package/dist/esm/types.d.ts +83 -0
  186. package/dist/esm/types.js +3 -0
  187. package/dist/esm/utils/vector-math.d.ts +31 -0
  188. package/dist/esm/utils/vector-math.js +70 -0
  189. package/package.json +39 -14
  190. package/dist/core/vector-index.d.ts +0 -72
  191. package/dist/core/vector-index.js +0 -331
  192. /package/dist/{api-errors.d.ts → cjs/api-errors.d.ts} +0 -0
  193. /package/dist/{api-errors.js → cjs/api-errors.js} +0 -0
  194. /package/dist/{cli → cjs/cli}/indexer.d.ts +0 -0
  195. /package/dist/{cli → cjs/cli}/search.d.ts +0 -0
  196. /package/dist/{cli → cjs/cli}/search.js +0 -0
  197. /package/dist/{cli.d.ts → cjs/cli.d.ts} +0 -0
  198. /package/dist/{config.d.ts → cjs/config.d.ts} +0 -0
  199. /package/dist/{config.js → cjs/config.js} +0 -0
  200. /package/dist/{core → cjs/core}/abstract-embedder.d.ts +0 -0
  201. /package/dist/{core → cjs/core}/abstract-embedder.js +0 -0
  202. /package/dist/{core → cjs/core}/actionable-error-messages.d.ts +0 -0
  203. /package/dist/{core → cjs/core}/actionable-error-messages.js +0 -0
  204. /package/dist/{core → cjs/core}/adapters.d.ts +0 -0
  205. /package/dist/{core → cjs/core}/adapters.js +0 -0
  206. /package/dist/{core → cjs/core}/batch-processing-optimizer.d.ts +0 -0
  207. /package/dist/{core → cjs/core}/batch-processing-optimizer.js +0 -0
  208. /package/dist/{core → cjs/core}/binary-index-format.d.ts +0 -0
  209. /package/dist/{core → cjs/core}/chunker.d.ts +0 -0
  210. /package/dist/{core → cjs/core}/chunker.js +0 -0
  211. /package/dist/{core → cjs/core}/cli-database-utils.d.ts +0 -0
  212. /package/dist/{core → cjs/core}/cli-database-utils.js +0 -0
  213. /package/dist/{core → cjs/core}/config.d.ts +0 -0
  214. /package/dist/{core → cjs/core}/config.js +0 -0
  215. /package/dist/{core → cjs/core}/content-errors.d.ts +0 -0
  216. /package/dist/{core → cjs/core}/content-errors.js +0 -0
  217. /package/dist/{core → cjs/core}/content-manager.d.ts +0 -0
  218. /package/dist/{core → cjs/core}/content-manager.js +0 -0
  219. /package/dist/{core → cjs/core}/content-performance-optimizer.d.ts +0 -0
  220. /package/dist/{core → cjs/core}/content-performance-optimizer.js +0 -0
  221. /package/dist/{core → cjs/core}/content-resolver.d.ts +0 -0
  222. /package/dist/{core → cjs/core}/content-resolver.js +0 -0
  223. /package/dist/{core → cjs/core}/cross-modal-search.d.ts +0 -0
  224. /package/dist/{core → cjs/core}/cross-modal-search.js +0 -0
  225. /package/dist/{core → cjs/core}/database-connection-manager.d.ts +0 -0
  226. /package/dist/{core → cjs/core}/database-connection-manager.js +0 -0
  227. /package/dist/{core → cjs/core}/embedder-factory.d.ts +0 -0
  228. /package/dist/{core → cjs/core}/embedder-factory.js +0 -0
  229. /package/dist/{core → cjs/core}/error-handler.d.ts +0 -0
  230. /package/dist/{core → cjs/core}/error-handler.js +0 -0
  231. /package/dist/{core → cjs/core}/index.d.ts +0 -0
  232. /package/dist/{core → cjs/core}/index.js +0 -0
  233. /package/dist/{core → cjs/core}/ingestion.d.ts +0 -0
  234. /package/dist/{core → cjs/core}/interfaces.d.ts +0 -0
  235. /package/dist/{core → cjs/core}/interfaces.js +0 -0
  236. /package/dist/{core → cjs/core}/lazy-dependency-loader.d.ts +0 -0
  237. /package/dist/{core → cjs/core}/lazy-dependency-loader.js +0 -0
  238. /package/dist/{core → cjs/core}/mode-detection-service.d.ts +0 -0
  239. /package/dist/{core → cjs/core}/mode-detection-service.js +0 -0
  240. /package/dist/{core → cjs/core}/mode-model-validator.d.ts +0 -0
  241. /package/dist/{core → cjs/core}/mode-model-validator.js +0 -0
  242. /package/dist/{core → cjs/core}/model-registry.d.ts +0 -0
  243. /package/dist/{core → cjs/core}/model-registry.js +0 -0
  244. /package/dist/{core → cjs/core}/model-validator.d.ts +0 -0
  245. /package/dist/{core → cjs/core}/path-manager.d.ts +0 -0
  246. /package/dist/{core → cjs/core}/path-manager.js +0 -0
  247. /package/dist/{core → cjs/core}/raglite-paths.d.ts +0 -0
  248. /package/dist/{core → cjs/core}/raglite-paths.js +0 -0
  249. /package/dist/{core → cjs/core}/reranking-config.d.ts +0 -0
  250. /package/dist/{core → cjs/core}/reranking-config.js +0 -0
  251. /package/dist/{core → cjs/core}/reranking-factory.d.ts +0 -0
  252. /package/dist/{core → cjs/core}/reranking-factory.js +0 -0
  253. /package/dist/{core → cjs/core}/reranking-strategies.d.ts +0 -0
  254. /package/dist/{core → cjs/core}/reranking-strategies.js +0 -0
  255. /package/dist/{core → cjs/core}/resource-cleanup.d.ts +0 -0
  256. /package/dist/{core → cjs/core}/resource-cleanup.js +0 -0
  257. /package/dist/{core → cjs/core}/resource-manager.d.ts +0 -0
  258. /package/dist/{core → cjs/core}/resource-manager.js +0 -0
  259. /package/dist/{core → cjs/core}/search-pipeline.d.ts +0 -0
  260. /package/dist/{core → cjs/core}/search.d.ts +0 -0
  261. /package/dist/{core → cjs/core}/streaming-operations.d.ts +0 -0
  262. /package/dist/{core → cjs/core}/streaming-operations.js +0 -0
  263. /package/dist/{core → cjs/core}/types.d.ts +0 -0
  264. /package/dist/{core → cjs/core}/types.js +0 -0
  265. /package/dist/{core → cjs/core}/universal-embedder.d.ts +0 -0
  266. /package/dist/{core → cjs/core}/universal-embedder.js +0 -0
  267. /package/dist/{core → cjs/core}/validation-messages.d.ts +0 -0
  268. /package/dist/{core → cjs/core}/validation-messages.js +0 -0
  269. /package/dist/{dom-polyfills.d.ts → cjs/dom-polyfills.d.ts} +0 -0
  270. /package/dist/{dom-polyfills.js → cjs/dom-polyfills.js} +0 -0
  271. /package/dist/{factories → cjs/factories}/index.d.ts +0 -0
  272. /package/dist/{factories → cjs/factories}/index.js +0 -0
  273. /package/dist/{factories → cjs/factories}/ingestion-factory.d.ts +0 -0
  274. /package/dist/{factories → cjs/factories}/search-factory.d.ts +0 -0
  275. /package/dist/{file-processor.d.ts → cjs/file-processor.d.ts} +0 -0
  276. /package/dist/{file-processor.js → cjs/file-processor.js} +0 -0
  277. /package/dist/{indexer.d.ts → cjs/indexer.d.ts} +0 -0
  278. /package/dist/{indexer.js → cjs/indexer.js} +0 -0
  279. /package/dist/{ingestion.d.ts → cjs/ingestion.d.ts} +0 -0
  280. /package/dist/{ingestion.js → cjs/ingestion.js} +0 -0
  281. /package/dist/{mcp-server.d.ts → cjs/mcp-server.d.ts} +0 -0
  282. /package/dist/{mcp-server.js → cjs/mcp-server.js} +0 -0
  283. /package/dist/{multimodal → cjs/multimodal}/clip-embedder.d.ts +0 -0
  284. /package/dist/{multimodal → cjs/multimodal}/clip-embedder.js +0 -0
  285. /package/dist/{multimodal → cjs/multimodal}/index.d.ts +0 -0
  286. /package/dist/{multimodal → cjs/multimodal}/index.js +0 -0
  287. /package/dist/{preprocess.d.ts → cjs/preprocess.d.ts} +0 -0
  288. /package/dist/{preprocess.js → cjs/preprocess.js} +0 -0
  289. /package/dist/{preprocessors → cjs/preprocessors}/index.d.ts +0 -0
  290. /package/dist/{preprocessors → cjs/preprocessors}/index.js +0 -0
  291. /package/dist/{preprocessors → cjs/preprocessors}/mdx.d.ts +0 -0
  292. /package/dist/{preprocessors → cjs/preprocessors}/mdx.js +0 -0
  293. /package/dist/{preprocessors → cjs/preprocessors}/mermaid.d.ts +0 -0
  294. /package/dist/{preprocessors → cjs/preprocessors}/mermaid.js +0 -0
  295. /package/dist/{preprocessors → cjs/preprocessors}/registry.d.ts +0 -0
  296. /package/dist/{preprocessors → cjs/preprocessors}/registry.js +0 -0
  297. /package/dist/{run-error-recovery-tests.d.ts → cjs/run-error-recovery-tests.d.ts} +0 -0
  298. /package/dist/{run-error-recovery-tests.js → cjs/run-error-recovery-tests.js} +0 -0
  299. /package/dist/{search-standalone.d.ts → cjs/search-standalone.d.ts} +0 -0
  300. /package/dist/{search-standalone.js → cjs/search-standalone.js} +0 -0
  301. /package/dist/{search.d.ts → cjs/search.d.ts} +0 -0
  302. /package/dist/{search.js → cjs/search.js} +0 -0
  303. /package/dist/{test-utils.d.ts → cjs/test-utils.d.ts} +0 -0
  304. /package/dist/{test-utils.js → cjs/test-utils.js} +0 -0
  305. /package/dist/{text → cjs/text}/chunker.d.ts +0 -0
  306. /package/dist/{text → cjs/text}/chunker.js +0 -0
  307. /package/dist/{text → cjs/text}/embedder.d.ts +0 -0
  308. /package/dist/{text → cjs/text}/embedder.js +0 -0
  309. /package/dist/{text → cjs/text}/index.d.ts +0 -0
  310. /package/dist/{text → cjs/text}/index.js +0 -0
  311. /package/dist/{text → cjs/text}/preprocessors/index.d.ts +0 -0
  312. /package/dist/{text → cjs/text}/preprocessors/index.js +0 -0
  313. /package/dist/{text → cjs/text}/preprocessors/mdx.d.ts +0 -0
  314. /package/dist/{text → cjs/text}/preprocessors/mdx.js +0 -0
  315. /package/dist/{text → cjs/text}/preprocessors/mermaid.d.ts +0 -0
  316. /package/dist/{text → cjs/text}/preprocessors/mermaid.js +0 -0
  317. /package/dist/{text → cjs/text}/preprocessors/registry.d.ts +0 -0
  318. /package/dist/{text → cjs/text}/preprocessors/registry.js +0 -0
  319. /package/dist/{text → cjs/text}/reranker.d.ts +0 -0
  320. /package/dist/{text → cjs/text}/reranker.js +0 -0
  321. /package/dist/{text → cjs/text}/sentence-transformer-embedder.d.ts +0 -0
  322. /package/dist/{text → cjs/text}/sentence-transformer-embedder.js +0 -0
  323. /package/dist/{text → cjs/text}/tokenizer.d.ts +0 -0
  324. /package/dist/{text → cjs/text}/tokenizer.js +0 -0
  325. /package/dist/{types.d.ts → cjs/types.d.ts} +0 -0
  326. /package/dist/{types.js → cjs/types.js} +0 -0
  327. /package/dist/{utils → cjs/utils}/vector-math.d.ts +0 -0
  328. /package/dist/{utils → cjs/utils}/vector-math.js +0 -0
@@ -0,0 +1,279 @@
1
+ /**
2
+ * Text-specific chunking implementation
3
+ * Implements the ChunkingStrategy interface for text content
4
+ */
5
+ import '../dom-polyfills.js';
6
+ import { DEFAULT_CHUNK_CONFIG } from '../core/chunker.js';
7
+ import { countTokens } from './tokenizer.js';
8
/**
 * Tier 1 of the chunking strategy: break text into paragraphs.
 *
 * A paragraph boundary is a newline, optional whitespace, and another newline
 * (i.e. one or more blank lines). Each paragraph is trimmed and empty results
 * are discarded.
 *
 * @param text - Raw text to split
 * @returns Non-empty, trimmed paragraphs in their original order
 */
function splitIntoParagraphs(text) {
    const paragraphs = [];
    for (const piece of text.split(/\n\s*\n/)) {
        const trimmed = piece.trim();
        if (trimmed.length > 0) {
            paragraphs.push(trimmed);
        }
    }
    return paragraphs;
}
19
/**
 * Tier 2 of the chunking strategy: break text into sentences.
 *
 * The split point is any run of whitespace that directly follows `.`, `!` or
 * `?`; the punctuation stays attached to the sentence before it. This is a
 * purely punctuation-based split, so an abbreviation such as "Dr. Smith" is
 * cut after the period.
 *
 * @param text - Text to split
 * @returns Non-empty, trimmed sentences in their original order
 */
function splitIntoSentences(text) {
    const collected = [];
    for (const part of text.split(/(?<=[.!?])\s+/)) {
        const sentence = part.trim();
        if (sentence.length > 0) {
            collected.push(sentence);
        }
    }
    return collected;
}
32
+ /**
33
+ * Split text into fixed-size chunks based on character count
34
+ * This is the fallback tier when semantic splitting fails
35
+ */
36
+ async function splitIntoFixedSizeChunks(text, maxTokens, overlapTokens) {
37
+ const chunks = [];
38
+ const words = text.split(/\s+/);
39
+ let currentChunk = '';
40
+ let currentTokens = 0;
41
+ let i = 0;
42
+ while (i < words.length) {
43
+ const word = words[i];
44
+ const testChunk = currentChunk ? `${currentChunk} ${word}` : word;
45
+ const testTokens = await countTokens(testChunk);
46
+ if (testTokens <= maxTokens) {
47
+ currentChunk = testChunk;
48
+ currentTokens = testTokens;
49
+ i++;
50
+ }
51
+ else {
52
+ // Current chunk is full, save it
53
+ if (currentChunk) {
54
+ chunks.push(currentChunk);
55
+ // Create overlap for next chunk
56
+ if (overlapTokens > 0 && chunks.length > 0) {
57
+ const overlapText = await createOverlapFromWords(currentChunk, overlapTokens);
58
+ currentChunk = overlapText;
59
+ currentTokens = await countTokens(currentChunk);
60
+ }
61
+ else {
62
+ currentChunk = '';
63
+ currentTokens = 0;
64
+ }
65
+ }
66
+ else {
67
+ // Single word exceeds limit, add it anyway
68
+ chunks.push(word);
69
+ i++;
70
+ }
71
+ }
72
+ }
73
+ // Add final chunk if it has content
74
+ if (currentChunk.trim()) {
75
+ chunks.push(currentChunk.trim());
76
+ }
77
+ return chunks;
78
+ }
79
+ /**
80
+ * Create overlap text from words at the end of a chunk
81
+ */
82
+ async function createOverlapFromWords(text, overlapTokens) {
83
+ const words = text.split(/\s+/);
84
+ let overlapText = '';
85
+ let tokens = 0;
86
+ // Work backwards from the end
87
+ for (let i = words.length - 1; i >= 0; i--) {
88
+ const word = words[i];
89
+ const testText = word + (overlapText ? ' ' + overlapText : '');
90
+ const testTokens = await countTokens(testText);
91
+ if (testTokens <= overlapTokens) {
92
+ overlapText = testText;
93
+ tokens = testTokens;
94
+ }
95
+ else {
96
+ break;
97
+ }
98
+ }
99
+ return overlapText;
100
+ }
101
+ /**
102
+ * Create chunks from a list of text segments, respecting token limits
103
+ */
104
+ async function createChunksFromSegments(segments, config) {
105
+ const chunks = [];
106
+ let currentChunk = '';
107
+ let currentTokens = 0;
108
+ for (const segment of segments) {
109
+ const segmentTokens = await countTokens(segment);
110
+ // If this single segment exceeds our limit, we need to split it further
111
+ if (segmentTokens > config.chunkSize) {
112
+ // Save current chunk if it has content
113
+ if (currentChunk.trim()) {
114
+ chunks.push(currentChunk.trim());
115
+ currentChunk = '';
116
+ currentTokens = 0;
117
+ }
118
+ // Split the large segment using fixed-size chunking based on tokens
119
+ const subChunks = await splitIntoFixedSizeChunks(segment, config.chunkSize, config.chunkOverlap);
120
+ chunks.push(...subChunks);
121
+ continue;
122
+ }
123
+ // Check if adding this segment would exceed our token limit
124
+ const potentialChunk = currentChunk ? `${currentChunk}\n\n${segment}` : segment;
125
+ const potentialTokens = await countTokens(potentialChunk);
126
+ if (potentialTokens <= config.chunkSize) {
127
+ // Add to current chunk
128
+ currentChunk = potentialChunk;
129
+ currentTokens = potentialTokens;
130
+ }
131
+ else {
132
+ // Save current chunk and start a new one
133
+ if (currentChunk.trim()) {
134
+ chunks.push(currentChunk.trim());
135
+ }
136
+ // Start new chunk with overlap if possible
137
+ if (config.chunkOverlap > 0 && currentChunk) {
138
+ const overlapText = await createOverlapText(currentChunk, config.chunkOverlap);
139
+ currentChunk = overlapText ? `${overlapText}\n\n${segment}` : segment;
140
+ }
141
+ else {
142
+ currentChunk = segment;
143
+ }
144
+ currentTokens = await countTokens(currentChunk);
145
+ }
146
+ }
147
+ // Add final chunk if it has content
148
+ if (currentChunk.trim()) {
149
+ chunks.push(currentChunk.trim());
150
+ }
151
+ return chunks;
152
+ }
153
+ /**
154
+ * Create overlap text from the end of a chunk
155
+ */
156
+ async function createOverlapText(text, overlapTokens) {
157
+ // Split into sentences and work backwards to get approximately the right amount of overlap
158
+ const sentences = splitIntoSentences(text);
159
+ let overlapText = '';
160
+ let tokens = 0;
161
+ for (let i = sentences.length - 1; i >= 0; i--) {
162
+ const sentence = sentences[i];
163
+ const sentenceTokens = await countTokens(sentence);
164
+ if (tokens + sentenceTokens <= overlapTokens) {
165
+ overlapText = sentence + (overlapText ? ' ' + overlapText : '');
166
+ tokens += sentenceTokens;
167
+ }
168
+ else {
169
+ break;
170
+ }
171
+ }
172
+ return overlapText;
173
+ }
174
+ /**
175
+ * Text chunking strategy implementation
176
+ */
177
+ export class TextChunkingStrategy {
178
+ appliesTo(contentType) {
179
+ return contentType === 'text';
180
+ }
181
+ async chunk(document, config) {
182
+ console.log(`📝 Chunking document "${document.title}" with config: chunkSize=${config.chunkSize}, chunkOverlap=${config.chunkOverlap}`);
183
+ if (!document.content || document.content.trim().length === 0) {
184
+ return [];
185
+ }
186
+ // Tier 1: Split into paragraphs
187
+ const paragraphs = splitIntoParagraphs(document.content);
188
+ // Tier 2: For large paragraphs, split into sentences
189
+ const segments = [];
190
+ for (const paragraph of paragraphs) {
191
+ const paragraphTokens = await countTokens(paragraph);
192
+ if (paragraphTokens <= config.chunkSize) {
193
+ // Paragraph is small enough, use as-is
194
+ segments.push(paragraph);
195
+ }
196
+ else {
197
+ // Paragraph is too large, split into sentences
198
+ const sentences = splitIntoSentences(paragraph);
199
+ // Group sentences that fit within token limits
200
+ let currentGroup = '';
201
+ let currentTokens = 0;
202
+ for (const sentence of sentences) {
203
+ const sentenceTokens = await countTokens(sentence);
204
+ // If single sentence exceeds limit, it will be handled in createChunksFromSegments
205
+ if (sentenceTokens > config.chunkSize) {
206
+ // Save current group if it has content
207
+ if (currentGroup.trim()) {
208
+ segments.push(currentGroup.trim());
209
+ currentGroup = '';
210
+ currentTokens = 0;
211
+ }
212
+ // Add the large sentence as its own segment (will be split later)
213
+ segments.push(sentence);
214
+ continue;
215
+ }
216
+ const potentialGroup = currentGroup ? `${currentGroup} ${sentence}` : sentence;
217
+ const potentialTokens = await countTokens(potentialGroup);
218
+ if (potentialTokens <= config.chunkSize) {
219
+ currentGroup = potentialGroup;
220
+ currentTokens = potentialTokens;
221
+ }
222
+ else {
223
+ // Save current group and start new one
224
+ if (currentGroup.trim()) {
225
+ segments.push(currentGroup.trim());
226
+ }
227
+ currentGroup = sentence;
228
+ currentTokens = sentenceTokens;
229
+ }
230
+ }
231
+ // Add final group if it has content
232
+ if (currentGroup.trim()) {
233
+ segments.push(currentGroup.trim());
234
+ }
235
+ }
236
+ }
237
+ // Tier 3: Create final chunks with overlap handling
238
+ const chunkTexts = await createChunksFromSegments(segments, config);
239
+ // Convert to GenericChunk objects
240
+ const chunks = [];
241
+ for (let i = 0; i < chunkTexts.length; i++) {
242
+ const content = chunkTexts[i];
243
+ chunks.push({
244
+ content,
245
+ contentType: document.contentType,
246
+ chunkIndex: i,
247
+ metadata: {
248
+ tokenCount: await countTokens(content),
249
+ ...document.metadata
250
+ }
251
+ });
252
+ }
253
+ return chunks;
254
+ }
255
+ }
256
+ /**
257
+ * Text document chunking function
258
+ * Converts between text-specific and generic interfaces
259
+ */
260
+ export async function chunkDocument(document, config = DEFAULT_CHUNK_CONFIG) {
261
+ const strategy = new TextChunkingStrategy();
262
+ // Convert Document to GenericDocument
263
+ const genericDocument = {
264
+ source: document.source,
265
+ title: document.title,
266
+ content: document.content,
267
+ contentType: 'text'
268
+ };
269
+ // Use the strategy to chunk
270
+ const genericChunks = await strategy.chunk(genericDocument, config);
271
+ // Convert GenericChunk back to Chunk format
272
+ const chunks = genericChunks.map(chunk => ({
273
+ text: chunk.content,
274
+ chunkIndex: chunk.chunkIndex,
275
+ tokenCount: chunk.metadata?.tokenCount || 0
276
+ }));
277
+ return chunks;
278
+ }
279
+ //# sourceMappingURL=chunker.js.map
@@ -0,0 +1,111 @@
1
+ import '../dom-polyfills.js';
2
+ import type { EmbeddingResult, EmbedFunction } from '../core/types.js';
3
/**
 * Text embedding engine built on transformers.js.
 */
export declare class EmbeddingEngine {
    private model;
    private modelVersion;
    private readonly modelName;
    private readonly batchSize;
    constructor(modelName?: string, batchSize?: number);
    /**
     * Loads the embedding model.
     *
     * @throws Error when the model cannot be loaded
     */
    loadModel(): Promise<void>;
    /**
     * Embeds a batch of texts.
     *
     * @param texts - Text strings to embed
     * @returns Promise resolving to the embedding results
     */
    embedBatch(texts: string[]): Promise<EmbeddingResult[]>;
    /**
     * Processes one batch, handling errors for individual chunks.
     *
     * @param batch - Text strings belonging to this batch
     * @param startIndex - Position of this batch's first item in the original array
     * @returns Promise resolving to the embedding results
     */
    private processBatchWithErrorHandling;
    /**
     * Falls back to processing chunks one at a time when a batch fails.
     */
    private fallbackToIndividualProcessing;
    /**
     * Processes a single chunk with error handling.
     *
     * @param text - Text to embed
     * @param index - Index of this chunk
     * @returns Promise resolving to the embedding result, or null on failure
     */
    private processSingleChunk;
    /**
     * Embeds a single text.
     *
     * @param text - Text string to embed
     * @returns Promise resolving to the embedding result
     */
    embedSingle(text: string): Promise<EmbeddingResult>;
    /**
     * Embeds document chunks with progress logging; intended for large
     * document ingestion using batch processing.
     *
     * @param chunks - Text chunks taken from documents
     * @returns Promise resolving to the embedding results
     */
    embedDocumentBatch(chunks: string[]): Promise<EmbeddingResult[]>;
    /**
     * @returns The current model version identifier
     */
    getModelVersion(): string;
    /**
     * @returns True if the model is loaded
     */
    isLoaded(): boolean;
    /**
     * @returns The model name
     */
    getModelName(): string;
    /**
     * @returns The batch size
     */
    getBatchSize(): number;
    /**
     * Produces a deterministic model version identifier from the model name
     * and configuration, so versioning stays consistent.
     *
     * @returns Model version string
     */
    private generateModelVersion;
    /**
     * Produces a deterministic embedding ID for a text chunk.
     *
     * @param text - The text content
     * @param index - Index in the batch
     * @returns Deterministic embedding ID
     */
    private generateEmbeddingId;
}
88
/**
 * Returns the singleton embedding engine instance.
 *
 * @param modelName - Optional model name override
 * @param batchSize - Optional batch size override
 * @returns The EmbeddingEngine instance
 */
export declare function getEmbeddingEngine(modelName?: string, batchSize?: number): EmbeddingEngine;
95
/**
 * Initializes the embedding engine and loads its model.
 *
 * @param modelName - Optional model name override
 * @param batchSize - Optional batch size override
 * @returns Promise resolving to the loaded embedding engine
 */
export declare function initializeEmbeddingEngine(modelName?: string, batchSize?: number): Promise<EmbeddingEngine>;
102
/**
 * Creates an EmbedFunction backed by the text embedding engine.
 *
 * The returned function implements the core EmbedFunction interface so it can
 * be supplied to core components through dependency injection.
 *
 * @param modelName - Optional model name override
 * @param batchSize - Optional batch size override
 * @returns EmbedFunction that can be injected into core components
 */
export declare function createTextEmbedFunction(modelName?: string, batchSize?: number): EmbedFunction;
111
+ //# sourceMappingURL=embedder.d.ts.map