rag-lite-ts 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (310)
  1. package/dist/{cli → cjs/cli}/indexer.js +1 -1
  2. package/dist/{cli → cjs/cli}/search.js +5 -10
  3. package/dist/{core → cjs/core}/binary-index-format.d.ts +28 -2
  4. package/dist/cjs/core/binary-index-format.js +291 -0
  5. package/dist/{core → cjs/core}/ingestion.d.ts +5 -1
  6. package/dist/{core → cjs/core}/ingestion.js +76 -9
  7. package/dist/{core → cjs/core}/model-validator.js +1 -1
  8. package/dist/{core → cjs/core}/reranking-strategies.js +4 -5
  9. package/dist/{core → cjs/core}/search.js +2 -1
  10. package/dist/{core → cjs/core}/types.d.ts +1 -1
  11. package/dist/{core → cjs/core}/vector-index.d.ts +4 -0
  12. package/dist/{core → cjs/core}/vector-index.js +10 -2
  13. package/dist/{file-processor.d.ts → cjs/file-processor.d.ts} +2 -0
  14. package/dist/{file-processor.js → cjs/file-processor.js} +20 -0
  15. package/dist/{index-manager.d.ts → cjs/index-manager.d.ts} +17 -1
  16. package/dist/{index-manager.js → cjs/index-manager.js} +148 -7
  17. package/dist/{multimodal → cjs/multimodal}/clip-embedder.js +71 -66
  18. package/dist/esm/api-errors.d.ts +90 -0
  19. package/dist/esm/api-errors.js +320 -0
  20. package/dist/esm/cli/indexer.d.ts +11 -0
  21. package/dist/esm/cli/indexer.js +471 -0
  22. package/dist/esm/cli/search.d.ts +7 -0
  23. package/dist/esm/cli/search.js +332 -0
  24. package/dist/esm/cli.d.ts +3 -0
  25. package/dist/esm/cli.js +529 -0
  26. package/dist/esm/config.d.ts +51 -0
  27. package/dist/esm/config.js +79 -0
  28. package/dist/esm/core/abstract-embedder.d.ts +125 -0
  29. package/dist/esm/core/abstract-embedder.js +264 -0
  30. package/dist/esm/core/actionable-error-messages.d.ts +60 -0
  31. package/dist/esm/core/actionable-error-messages.js +397 -0
  32. package/dist/esm/core/adapters.d.ts +93 -0
  33. package/dist/esm/core/adapters.js +139 -0
  34. package/dist/esm/core/batch-processing-optimizer.d.ts +155 -0
  35. package/dist/esm/core/batch-processing-optimizer.js +536 -0
  36. package/dist/esm/core/binary-index-format.d.ts +78 -0
  37. package/dist/esm/core/binary-index-format.js +291 -0
  38. package/dist/esm/core/chunker.d.ts +119 -0
  39. package/dist/esm/core/chunker.js +73 -0
  40. package/dist/esm/core/cli-database-utils.d.ts +53 -0
  41. package/dist/esm/core/cli-database-utils.js +239 -0
  42. package/dist/esm/core/config.d.ts +102 -0
  43. package/dist/esm/core/config.js +247 -0
  44. package/dist/esm/core/content-errors.d.ts +111 -0
  45. package/dist/esm/core/content-errors.js +362 -0
  46. package/dist/esm/core/content-manager.d.ts +335 -0
  47. package/dist/esm/core/content-manager.js +1476 -0
  48. package/dist/esm/core/content-performance-optimizer.d.ts +150 -0
  49. package/dist/esm/core/content-performance-optimizer.js +516 -0
  50. package/dist/esm/core/content-resolver.d.ts +104 -0
  51. package/dist/esm/core/content-resolver.js +285 -0
  52. package/dist/esm/core/cross-modal-search.d.ts +164 -0
  53. package/dist/esm/core/cross-modal-search.js +342 -0
  54. package/dist/esm/core/database-connection-manager.d.ts +109 -0
  55. package/dist/esm/core/database-connection-manager.js +310 -0
  56. package/dist/esm/core/db.d.ts +213 -0
  57. package/dist/esm/core/db.js +895 -0
  58. package/dist/esm/core/embedder-factory.d.ts +154 -0
  59. package/dist/esm/core/embedder-factory.js +311 -0
  60. package/dist/esm/core/error-handler.d.ts +112 -0
  61. package/dist/esm/core/error-handler.js +239 -0
  62. package/dist/esm/core/index.d.ts +59 -0
  63. package/dist/esm/core/index.js +69 -0
  64. package/dist/esm/core/ingestion.d.ts +202 -0
  65. package/dist/esm/core/ingestion.js +901 -0
  66. package/dist/esm/core/interfaces.d.ts +408 -0
  67. package/dist/esm/core/interfaces.js +106 -0
  68. package/dist/esm/core/lazy-dependency-loader.d.ts +147 -0
  69. package/dist/esm/core/lazy-dependency-loader.js +435 -0
  70. package/dist/esm/core/mode-detection-service.d.ts +150 -0
  71. package/dist/esm/core/mode-detection-service.js +565 -0
  72. package/dist/esm/core/mode-model-validator.d.ts +92 -0
  73. package/dist/esm/core/mode-model-validator.js +203 -0
  74. package/dist/esm/core/model-registry.d.ts +116 -0
  75. package/dist/esm/core/model-registry.js +411 -0
  76. package/dist/esm/core/model-validator.d.ts +217 -0
  77. package/dist/esm/core/model-validator.js +782 -0
  78. package/dist/esm/core/path-manager.d.ts +47 -0
  79. package/dist/esm/core/path-manager.js +71 -0
  80. package/dist/esm/core/raglite-paths.d.ts +121 -0
  81. package/dist/esm/core/raglite-paths.js +145 -0
  82. package/dist/esm/core/reranking-config.d.ts +42 -0
  83. package/dist/esm/core/reranking-config.js +147 -0
  84. package/dist/esm/core/reranking-factory.d.ts +92 -0
  85. package/dist/esm/core/reranking-factory.js +410 -0
  86. package/dist/esm/core/reranking-strategies.d.ts +310 -0
  87. package/dist/esm/core/reranking-strategies.js +650 -0
  88. package/dist/esm/core/resource-cleanup.d.ts +163 -0
  89. package/dist/esm/core/resource-cleanup.js +371 -0
  90. package/dist/esm/core/resource-manager.d.ts +212 -0
  91. package/dist/esm/core/resource-manager.js +564 -0
  92. package/dist/esm/core/search-pipeline.d.ts +111 -0
  93. package/dist/esm/core/search-pipeline.js +287 -0
  94. package/dist/esm/core/search.d.ts +141 -0
  95. package/dist/esm/core/search.js +320 -0
  96. package/dist/esm/core/streaming-operations.d.ts +145 -0
  97. package/dist/esm/core/streaming-operations.js +409 -0
  98. package/dist/esm/core/types.d.ts +66 -0
  99. package/dist/esm/core/types.js +6 -0
  100. package/dist/esm/core/universal-embedder.d.ts +177 -0
  101. package/dist/esm/core/universal-embedder.js +139 -0
  102. package/dist/esm/core/validation-messages.d.ts +99 -0
  103. package/dist/esm/core/validation-messages.js +334 -0
  104. package/dist/esm/core/vector-index.d.ts +72 -0
  105. package/dist/esm/core/vector-index.js +333 -0
  106. package/dist/esm/dom-polyfills.d.ts +6 -0
  107. package/dist/esm/dom-polyfills.js +37 -0
  108. package/dist/esm/factories/index.d.ts +27 -0
  109. package/dist/esm/factories/index.js +29 -0
  110. package/dist/esm/factories/ingestion-factory.d.ts +200 -0
  111. package/dist/esm/factories/ingestion-factory.js +477 -0
  112. package/dist/esm/factories/search-factory.d.ts +154 -0
  113. package/dist/esm/factories/search-factory.js +344 -0
  114. package/dist/esm/file-processor.d.ts +147 -0
  115. package/dist/esm/file-processor.js +963 -0
  116. package/dist/esm/index-manager.d.ts +116 -0
  117. package/dist/esm/index-manager.js +598 -0
  118. package/dist/esm/index.d.ts +75 -0
  119. package/dist/esm/index.js +110 -0
  120. package/dist/esm/indexer.d.ts +7 -0
  121. package/dist/esm/indexer.js +54 -0
  122. package/dist/esm/ingestion.d.ts +63 -0
  123. package/dist/esm/ingestion.js +124 -0
  124. package/dist/esm/mcp-server.d.ts +46 -0
  125. package/dist/esm/mcp-server.js +1820 -0
  126. package/dist/esm/multimodal/clip-embedder.d.ts +327 -0
  127. package/dist/esm/multimodal/clip-embedder.js +996 -0
  128. package/dist/esm/multimodal/index.d.ts +6 -0
  129. package/dist/esm/multimodal/index.js +6 -0
  130. package/dist/esm/preprocess.d.ts +19 -0
  131. package/dist/esm/preprocess.js +203 -0
  132. package/dist/esm/preprocessors/index.d.ts +17 -0
  133. package/dist/esm/preprocessors/index.js +38 -0
  134. package/dist/esm/preprocessors/mdx.d.ts +25 -0
  135. package/dist/esm/preprocessors/mdx.js +101 -0
  136. package/dist/esm/preprocessors/mermaid.d.ts +68 -0
  137. package/dist/esm/preprocessors/mermaid.js +329 -0
  138. package/dist/esm/preprocessors/registry.d.ts +56 -0
  139. package/dist/esm/preprocessors/registry.js +179 -0
  140. package/dist/esm/run-error-recovery-tests.d.ts +7 -0
  141. package/dist/esm/run-error-recovery-tests.js +101 -0
  142. package/dist/esm/search-standalone.d.ts +7 -0
  143. package/dist/esm/search-standalone.js +117 -0
  144. package/dist/esm/search.d.ts +99 -0
  145. package/dist/esm/search.js +177 -0
  146. package/dist/esm/test-utils.d.ts +18 -0
  147. package/dist/esm/test-utils.js +27 -0
  148. package/dist/esm/text/chunker.d.ts +33 -0
  149. package/dist/esm/text/chunker.js +279 -0
  150. package/dist/esm/text/embedder.d.ts +111 -0
  151. package/dist/esm/text/embedder.js +386 -0
  152. package/dist/esm/text/index.d.ts +8 -0
  153. package/dist/esm/text/index.js +9 -0
  154. package/dist/esm/text/preprocessors/index.d.ts +17 -0
  155. package/dist/esm/text/preprocessors/index.js +38 -0
  156. package/dist/esm/text/preprocessors/mdx.d.ts +25 -0
  157. package/dist/esm/text/preprocessors/mdx.js +101 -0
  158. package/dist/esm/text/preprocessors/mermaid.d.ts +68 -0
  159. package/dist/esm/text/preprocessors/mermaid.js +330 -0
  160. package/dist/esm/text/preprocessors/registry.d.ts +56 -0
  161. package/dist/esm/text/preprocessors/registry.js +180 -0
  162. package/dist/esm/text/reranker.d.ts +49 -0
  163. package/dist/esm/text/reranker.js +274 -0
  164. package/dist/esm/text/sentence-transformer-embedder.d.ts +96 -0
  165. package/dist/esm/text/sentence-transformer-embedder.js +340 -0
  166. package/dist/esm/text/tokenizer.d.ts +22 -0
  167. package/dist/esm/text/tokenizer.js +64 -0
  168. package/dist/esm/types.d.ts +83 -0
  169. package/dist/esm/types.js +3 -0
  170. package/dist/esm/utils/vector-math.d.ts +31 -0
  171. package/dist/esm/utils/vector-math.js +70 -0
  172. package/package.json +30 -12
  173. package/dist/core/binary-index-format.js +0 -122
  174. /package/dist/{api-errors.d.ts → cjs/api-errors.d.ts} +0 -0
  175. /package/dist/{api-errors.js → cjs/api-errors.js} +0 -0
  176. /package/dist/{cli → cjs/cli}/indexer.d.ts +0 -0
  177. /package/dist/{cli → cjs/cli}/search.d.ts +0 -0
  178. /package/dist/{cli.d.ts → cjs/cli.d.ts} +0 -0
  179. /package/dist/{cli.js → cjs/cli.js} +0 -0
  180. /package/dist/{config.d.ts → cjs/config.d.ts} +0 -0
  181. /package/dist/{config.js → cjs/config.js} +0 -0
  182. /package/dist/{core → cjs/core}/abstract-embedder.d.ts +0 -0
  183. /package/dist/{core → cjs/core}/abstract-embedder.js +0 -0
  184. /package/dist/{core → cjs/core}/actionable-error-messages.d.ts +0 -0
  185. /package/dist/{core → cjs/core}/actionable-error-messages.js +0 -0
  186. /package/dist/{core → cjs/core}/adapters.d.ts +0 -0
  187. /package/dist/{core → cjs/core}/adapters.js +0 -0
  188. /package/dist/{core → cjs/core}/batch-processing-optimizer.d.ts +0 -0
  189. /package/dist/{core → cjs/core}/batch-processing-optimizer.js +0 -0
  190. /package/dist/{core → cjs/core}/chunker.d.ts +0 -0
  191. /package/dist/{core → cjs/core}/chunker.js +0 -0
  192. /package/dist/{core → cjs/core}/cli-database-utils.d.ts +0 -0
  193. /package/dist/{core → cjs/core}/cli-database-utils.js +0 -0
  194. /package/dist/{core → cjs/core}/config.d.ts +0 -0
  195. /package/dist/{core → cjs/core}/config.js +0 -0
  196. /package/dist/{core → cjs/core}/content-errors.d.ts +0 -0
  197. /package/dist/{core → cjs/core}/content-errors.js +0 -0
  198. /package/dist/{core → cjs/core}/content-manager.d.ts +0 -0
  199. /package/dist/{core → cjs/core}/content-manager.js +0 -0
  200. /package/dist/{core → cjs/core}/content-performance-optimizer.d.ts +0 -0
  201. /package/dist/{core → cjs/core}/content-performance-optimizer.js +0 -0
  202. /package/dist/{core → cjs/core}/content-resolver.d.ts +0 -0
  203. /package/dist/{core → cjs/core}/content-resolver.js +0 -0
  204. /package/dist/{core → cjs/core}/cross-modal-search.d.ts +0 -0
  205. /package/dist/{core → cjs/core}/cross-modal-search.js +0 -0
  206. /package/dist/{core → cjs/core}/database-connection-manager.d.ts +0 -0
  207. /package/dist/{core → cjs/core}/database-connection-manager.js +0 -0
  208. /package/dist/{core → cjs/core}/db.d.ts +0 -0
  209. /package/dist/{core → cjs/core}/db.js +0 -0
  210. /package/dist/{core → cjs/core}/embedder-factory.d.ts +0 -0
  211. /package/dist/{core → cjs/core}/embedder-factory.js +0 -0
  212. /package/dist/{core → cjs/core}/error-handler.d.ts +0 -0
  213. /package/dist/{core → cjs/core}/error-handler.js +0 -0
  214. /package/dist/{core → cjs/core}/index.d.ts +0 -0
  215. /package/dist/{core → cjs/core}/index.js +0 -0
  216. /package/dist/{core → cjs/core}/interfaces.d.ts +0 -0
  217. /package/dist/{core → cjs/core}/interfaces.js +0 -0
  218. /package/dist/{core → cjs/core}/lazy-dependency-loader.d.ts +0 -0
  219. /package/dist/{core → cjs/core}/lazy-dependency-loader.js +0 -0
  220. /package/dist/{core → cjs/core}/mode-detection-service.d.ts +0 -0
  221. /package/dist/{core → cjs/core}/mode-detection-service.js +0 -0
  222. /package/dist/{core → cjs/core}/mode-model-validator.d.ts +0 -0
  223. /package/dist/{core → cjs/core}/mode-model-validator.js +0 -0
  224. /package/dist/{core → cjs/core}/model-registry.d.ts +0 -0
  225. /package/dist/{core → cjs/core}/model-registry.js +0 -0
  226. /package/dist/{core → cjs/core}/model-validator.d.ts +0 -0
  227. /package/dist/{core → cjs/core}/path-manager.d.ts +0 -0
  228. /package/dist/{core → cjs/core}/path-manager.js +0 -0
  229. /package/dist/{core → cjs/core}/raglite-paths.d.ts +0 -0
  230. /package/dist/{core → cjs/core}/raglite-paths.js +0 -0
  231. /package/dist/{core → cjs/core}/reranking-config.d.ts +0 -0
  232. /package/dist/{core → cjs/core}/reranking-config.js +0 -0
  233. /package/dist/{core → cjs/core}/reranking-factory.d.ts +0 -0
  234. /package/dist/{core → cjs/core}/reranking-factory.js +0 -0
  235. /package/dist/{core → cjs/core}/reranking-strategies.d.ts +0 -0
  236. /package/dist/{core → cjs/core}/resource-cleanup.d.ts +0 -0
  237. /package/dist/{core → cjs/core}/resource-cleanup.js +0 -0
  238. /package/dist/{core → cjs/core}/resource-manager.d.ts +0 -0
  239. /package/dist/{core → cjs/core}/resource-manager.js +0 -0
  240. /package/dist/{core → cjs/core}/search-pipeline.d.ts +0 -0
  241. /package/dist/{core → cjs/core}/search-pipeline.js +0 -0
  242. /package/dist/{core → cjs/core}/search.d.ts +0 -0
  243. /package/dist/{core → cjs/core}/streaming-operations.d.ts +0 -0
  244. /package/dist/{core → cjs/core}/streaming-operations.js +0 -0
  245. /package/dist/{core → cjs/core}/types.js +0 -0
  246. /package/dist/{core → cjs/core}/universal-embedder.d.ts +0 -0
  247. /package/dist/{core → cjs/core}/universal-embedder.js +0 -0
  248. /package/dist/{core → cjs/core}/validation-messages.d.ts +0 -0
  249. /package/dist/{core → cjs/core}/validation-messages.js +0 -0
  250. /package/dist/{dom-polyfills.d.ts → cjs/dom-polyfills.d.ts} +0 -0
  251. /package/dist/{dom-polyfills.js → cjs/dom-polyfills.js} +0 -0
  252. /package/dist/{factories → cjs/factories}/index.d.ts +0 -0
  253. /package/dist/{factories → cjs/factories}/index.js +0 -0
  254. /package/dist/{factories → cjs/factories}/ingestion-factory.d.ts +0 -0
  255. /package/dist/{factories → cjs/factories}/ingestion-factory.js +0 -0
  256. /package/dist/{factories → cjs/factories}/search-factory.d.ts +0 -0
  257. /package/dist/{factories → cjs/factories}/search-factory.js +0 -0
  258. /package/dist/{index.d.ts → cjs/index.d.ts} +0 -0
  259. /package/dist/{index.js → cjs/index.js} +0 -0
  260. /package/dist/{indexer.d.ts → cjs/indexer.d.ts} +0 -0
  261. /package/dist/{indexer.js → cjs/indexer.js} +0 -0
  262. /package/dist/{ingestion.d.ts → cjs/ingestion.d.ts} +0 -0
  263. /package/dist/{ingestion.js → cjs/ingestion.js} +0 -0
  264. /package/dist/{mcp-server.d.ts → cjs/mcp-server.d.ts} +0 -0
  265. /package/dist/{mcp-server.js → cjs/mcp-server.js} +0 -0
  266. /package/dist/{multimodal → cjs/multimodal}/clip-embedder.d.ts +0 -0
  267. /package/dist/{multimodal → cjs/multimodal}/index.d.ts +0 -0
  268. /package/dist/{multimodal → cjs/multimodal}/index.js +0 -0
  269. /package/dist/{preprocess.d.ts → cjs/preprocess.d.ts} +0 -0
  270. /package/dist/{preprocess.js → cjs/preprocess.js} +0 -0
  271. /package/dist/{preprocessors → cjs/preprocessors}/index.d.ts +0 -0
  272. /package/dist/{preprocessors → cjs/preprocessors}/index.js +0 -0
  273. /package/dist/{preprocessors → cjs/preprocessors}/mdx.d.ts +0 -0
  274. /package/dist/{preprocessors → cjs/preprocessors}/mdx.js +0 -0
  275. /package/dist/{preprocessors → cjs/preprocessors}/mermaid.d.ts +0 -0
  276. /package/dist/{preprocessors → cjs/preprocessors}/mermaid.js +0 -0
  277. /package/dist/{preprocessors → cjs/preprocessors}/registry.d.ts +0 -0
  278. /package/dist/{preprocessors → cjs/preprocessors}/registry.js +0 -0
  279. /package/dist/{run-error-recovery-tests.d.ts → cjs/run-error-recovery-tests.d.ts} +0 -0
  280. /package/dist/{run-error-recovery-tests.js → cjs/run-error-recovery-tests.js} +0 -0
  281. /package/dist/{search-standalone.d.ts → cjs/search-standalone.d.ts} +0 -0
  282. /package/dist/{search-standalone.js → cjs/search-standalone.js} +0 -0
  283. /package/dist/{search.d.ts → cjs/search.d.ts} +0 -0
  284. /package/dist/{search.js → cjs/search.js} +0 -0
  285. /package/dist/{test-utils.d.ts → cjs/test-utils.d.ts} +0 -0
  286. /package/dist/{test-utils.js → cjs/test-utils.js} +0 -0
  287. /package/dist/{text → cjs/text}/chunker.d.ts +0 -0
  288. /package/dist/{text → cjs/text}/chunker.js +0 -0
  289. /package/dist/{text → cjs/text}/embedder.d.ts +0 -0
  290. /package/dist/{text → cjs/text}/embedder.js +0 -0
  291. /package/dist/{text → cjs/text}/index.d.ts +0 -0
  292. /package/dist/{text → cjs/text}/index.js +0 -0
  293. /package/dist/{text → cjs/text}/preprocessors/index.d.ts +0 -0
  294. /package/dist/{text → cjs/text}/preprocessors/index.js +0 -0
  295. /package/dist/{text → cjs/text}/preprocessors/mdx.d.ts +0 -0
  296. /package/dist/{text → cjs/text}/preprocessors/mdx.js +0 -0
  297. /package/dist/{text → cjs/text}/preprocessors/mermaid.d.ts +0 -0
  298. /package/dist/{text → cjs/text}/preprocessors/mermaid.js +0 -0
  299. /package/dist/{text → cjs/text}/preprocessors/registry.d.ts +0 -0
  300. /package/dist/{text → cjs/text}/preprocessors/registry.js +0 -0
  301. /package/dist/{text → cjs/text}/reranker.d.ts +0 -0
  302. /package/dist/{text → cjs/text}/reranker.js +0 -0
  303. /package/dist/{text → cjs/text}/sentence-transformer-embedder.d.ts +0 -0
  304. /package/dist/{text → cjs/text}/sentence-transformer-embedder.js +0 -0
  305. /package/dist/{text → cjs/text}/tokenizer.d.ts +0 -0
  306. /package/dist/{text → cjs/text}/tokenizer.js +0 -0
  307. /package/dist/{types.d.ts → cjs/types.d.ts} +0 -0
  308. /package/dist/{types.js → cjs/types.js} +0 -0
  309. /package/dist/{utils → cjs/utils}/vector-math.d.ts +0 -0
  310. /package/dist/{utils → cjs/utils}/vector-math.js +0 -0
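The headline change in this list is the build restructure: everything that previously lived directly under package/dist/ now has a CommonJS copy under dist/cjs/ and a new ES module build under dist/esm/, with package.json updated (+30 -12) to wire the two up. As an illustration only (the actual package.json exports map is not shown in this diff), a consumer would typically reach the two builds as sketched below, assuming the root entry re-exports the public API from dist/esm/index.js and dist/cjs/index.js:

```typescript
// Illustrative sketch, not the package's verified manifest wiring:
// assumes "." resolves to dist/esm/index.js for `import` and dist/cjs/index.js for `require`.
import { IngestionPipeline } from 'rag-lite-ts';           // ESM consumers → dist/esm/index.js

// CommonJS consumers would instead write:
// const { IngestionPipeline } = require('rag-lite-ts');   // → dist/cjs/index.js
```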
package/dist/esm/core/ingestion.js (new file)
@@ -0,0 +1,901 @@
+ /**
+ * CORE MODULE — Shared between text-only (rag-lite-ts) and future multimodal (rag-lite-mm)
+ * Model-agnostic. No transformer or modality-specific logic.
+ */
+ import { discoverAndProcessFiles } from '../file-processor.js';
+ import { chunkDocument } from './chunker.js';
+ import { insertChunk, upsertDocument } from './db.js';
+ import { config } from './config.js';
+ import { DocumentPathManager } from './path-manager.js';
+ import { existsSync } from 'fs';
+ import { ContentManager } from './content-manager.js';
+ import { createRequire } from 'module';
+ // Create require for CommonJS modules in ES module context
+ const require = createRequire(import.meta.url);
+ /**
+ * Main ingestion pipeline class
+ * Coordinates the entire process from file discovery to vector storage
+ * Uses explicit dependency injection for clean architecture
+ */
+ export class IngestionPipeline {
+ embedFn;
+ indexManager;
+ db;
+ defaultChunkConfig;
+ pathManager;
+ contentManager;
+ /**
+ * Creates a new IngestionPipeline with explicit dependency injection
+ * Enhanced with ContentManager integration for unified content system
+ *
+ * DEPENDENCY INJECTION PATTERN:
+ * This constructor requires all dependencies to be explicitly provided, enabling:
+ * - Clean separation between core ingestion logic and implementation-specific components
+ * - Support for different embedding models and content types
+ * - Testability through mock injection
+ * - Future extensibility for multimodal content processing
+ * - Unified content management for both filesystem and memory-based ingestion
+ *
+ * @param embedFn - Function to embed document chunks into vectors
+ * - Signature: (query: string, contentType?: string) => Promise<EmbeddingResult>
+ * - Must handle chunk text and return consistent embedding format
+ * - Examples:
+ * - Text: const embedFn = (text) => textEmbedder.embedSingle(text)
+ * - Multimodal: const embedFn = (content, type) => type === 'image' ? clipEmbedder.embedImage(content) : clipEmbedder.embedText(content)
+ * - Custom: const embedFn = (text) => customModel.embed(text)
+ *
+ * @param indexManager - Vector index manager for storing embeddings
+ * - Handles vector storage and indexing operations
+ * - Must support the embedding dimensions produced by embedFn
+ * - Example: new IndexManager('./index.bin')
+ *
+ * @param db - Database connection for metadata storage
+ * - Stores document and chunk metadata with content type support
+ * - Supports different content types through metadata fields
+ * - Example: await openDatabase('./db.sqlite')
+ *
+ * @param contentManager - Optional ContentManager for unified content system
+ * - Handles content storage routing and deduplication
+ * - If not provided, creates default instance with standard configuration
+ * - Example: new ContentManager(db, { contentDir: '.raglite/content' })
+ *
+ * USAGE EXAMPLES:
+ * ```typescript
+ * // Text-only ingestion pipeline with unified content system
+ * const textEmbedFn = createTextEmbedFunction();
+ * const indexManager = new IndexManager('./index.bin');
+ * const db = await openDatabase('./db.sqlite');
+ * const contentManager = new ContentManager(db);
+ * const ingestion = new IngestionPipeline(textEmbedFn, indexManager, db, undefined, contentManager);
+ *
+ * // Simple usage (ContentManager created automatically)
+ * const ingestion = new IngestionPipeline(textEmbedFn, indexManager, db);
+ *
+ * // Custom embedding implementation with memory ingestion
+ * const customEmbedFn = async (text) => ({
+ * embedding_id: generateId(),
+ * vector: await myCustomModel.embed(text)
+ * });
+ * const ingestion = new IngestionPipeline(customEmbedFn, indexManager, db);
+ * await ingestion.ingestFromMemory(buffer, { displayName: 'file.txt' });
+ * ```
+ */
+ constructor(embedFn, indexManager, db, defaultChunkConfig, contentManager) {
+ this.embedFn = embedFn;
+ this.indexManager = indexManager;
+ this.db = db;
+ this.defaultChunkConfig = defaultChunkConfig;
+ // Validate required dependencies
+ if (!embedFn || typeof embedFn !== 'function') {
+ throw new Error('embedFn must be a valid function');
+ }
+ if (!indexManager) {
+ throw new Error('indexManager is required');
+ }
+ if (!db) {
+ throw new Error('db connection is required');
+ }
+ // Initialize path manager with default configuration
+ this.pathManager = new DocumentPathManager(config.path_storage_strategy, process.cwd());
+ // Initialize ContentManager (create default if not provided)
+ this.contentManager = contentManager || new ContentManager(this.db);
+ }
+ /**
+ * Ingest documents from a directory
+ * @param directoryPath - Path to directory containing documents
+ * @param options - Optional ingestion configuration
+ * @returns Promise resolving to ingestion results
+ */
+ async ingestDirectory(directoryPath, options = {}) {
+ if (!existsSync(directoryPath)) {
+ throw new Error(`Directory not found: ${directoryPath}`);
+ }
+ return this.ingestPath(directoryPath, options);
+ }
+ /**
+ * Ingest a single file
+ * @param filePath - Path to the file to ingest
+ * @param options - Optional ingestion configuration
+ * @returns Promise resolving to ingestion results
+ */
+ async ingestFile(filePath, options = {}) {
+ if (!existsSync(filePath)) {
+ throw new Error(`File not found: ${filePath}`);
+ }
+ return this.ingestPath(filePath, options);
+ }
+ /**
+ * Ingest content from memory buffer
+ * Enables MCP integration and real-time content processing
+ * @param content - Buffer containing the content to ingest
+ * @param metadata - Memory content metadata including display name and content type
+ * @param options - Optional ingestion configuration
+ * @returns Promise resolving to content ID for the ingested content
+ */
+ async ingestFromMemory(content, metadata, options = {}) {
+ const startTime = Date.now();
+ console.log(`\n=== Starting memory ingestion: ${metadata.displayName} ===`);
+ try {
+ // Phase 1: Content Storage via ContentManager
+ console.log('\n--- Phase 1: Content Storage ---');
+ const contentResult = await this.contentManager.ingestFromMemory(content, metadata);
+ if (contentResult.wasDeduped) {
+ console.log(`✓ Content deduplicated: ${metadata.displayName} (ID: ${contentResult.contentId})`);
+ return contentResult.contentId;
+ }
+ console.log(`✓ Content stored: ${metadata.displayName} (ID: ${contentResult.contentId})`);
+ // Phase 2: Document Processing
+ console.log('\n--- Phase 2: Document Processing ---');
+ // Determine content type for processing
+ const detectedContentType = metadata.contentType || 'text/plain';
+ const isImageContent = detectedContentType.startsWith('image/');
+ let document;
+ if (isImageContent) {
+ // Process image content using the existing image processing pipeline
+ console.log(`Processing image content: ${metadata.displayName} (${detectedContentType})`);
+ document = await this.processImageFromMemory(content, contentResult, metadata, options);
+ }
+ else if (detectedContentType === 'application/pdf') {
+ // Process PDF content
+ console.log(`Processing PDF content: ${metadata.displayName}`);
+ document = await this.processPDFFromMemory(content, contentResult, metadata, options);
+ }
+ else if (detectedContentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
+ // Process DOCX content
+ console.log(`Processing DOCX content: ${metadata.displayName}`);
+ document = await this.processDOCXFromMemory(content, contentResult, metadata, options);
+ }
+ else {
+ // Process as text content
+ console.log(`Processing text content: ${metadata.displayName} (${detectedContentType})`);
+ document = {
+ source: metadata.displayName,
+ title: metadata.displayName,
+ content: content.toString('utf8'), // Convert buffer to string for processing
+ metadata: {
+ contentType: detectedContentType,
+ contentId: contentResult.contentId,
+ storageType: contentResult.storageType,
+ originalPath: metadata.originalPath
+ }
+ };
+ }
+ // Phase 3: Document Chunking
+ console.log('\n--- Phase 3: Document Chunking ---');
+ const effectiveChunkConfig = options.chunkConfig || this.defaultChunkConfig || {
+ chunkSize: config.chunk_size,
+ chunkOverlap: config.chunk_overlap
+ };
+ const chunks = await chunkDocument(document, effectiveChunkConfig);
+ console.log(`✓ Created ${chunks.length} chunks from memory content`);
+ if (chunks.length === 0) {
+ console.log('No chunks created from memory content');
+ return contentResult.contentId;
+ }
+ // Phase 4: Embedding Generation
+ console.log('\n--- Phase 4: Embedding Generation ---');
+ const embeddings = [];
+ let embeddingErrors = 0;
+ for (let i = 0; i < chunks.length; i++) {
+ const chunk = chunks[i];
+ try {
+ // Convert MIME type to simple content type for embedding function
+ const contentTypeForEmbedding = this.getContentTypeForEmbedding(document.metadata?.contentType);
+ // For images, use the image path from metadata instead of text description
+ let contentForEmbedding = chunk.text;
+ if (contentTypeForEmbedding === 'image' && document.metadata) {
+ // Try to get image path from metadata (contentPath, originalPath, or source)
+ // contentPath is where the image is stored (from contentResult)
+ const imagePath = document.metadata.contentPath ||
+ document.metadata.originalPath ||
+ document.metadata.source;
+ if (imagePath) {
+ contentForEmbedding = imagePath;
+ }
+ else {
+ // Fallback: try to extract path from source if available
+ console.warn(`Image chunk ${i + 1} missing image path in metadata, using text content as fallback`);
+ }
+ }
+ const embedding = await this.embedFn(contentForEmbedding, contentTypeForEmbedding);
+ // Enhance embedding result with content type metadata
+ if (!embedding.contentType) {
+ embedding.contentType = contentTypeForEmbedding;
+ }
+ if (!embedding.metadata) {
+ embedding.metadata = document.metadata;
+ }
+ embeddings.push(embedding);
+ }
+ catch (error) {
+ console.warn(`Failed to embed chunk ${i + 1}:`, error instanceof Error ? error.message : String(error));
+ embeddingErrors++;
+ }
+ }
+ console.log(`✓ Generated ${embeddings.length} embeddings for memory content`);
+ if (embeddings.length === 0) {
+ console.log('No embeddings generated from memory content');
+ return contentResult.contentId;
+ }
+ // Phase 5: Database Storage
+ console.log('\n--- Phase 5: Database Storage ---');
+ // Insert document with content_id reference
+ const documentContentType = this.getContentTypeForEmbedding(document.metadata?.contentType);
+ const documentId = await upsertDocument(this.db, document.source, document.title, documentContentType, document.metadata, contentResult.contentId);
+ // Insert chunks with embeddings
+ let chunksStored = 0;
+ for (let i = 0; i < chunks.length && i < embeddings.length; i++) {
+ const chunk = chunks[i];
+ const embedding = embeddings[i];
+ try {
+ await insertChunk(this.db, embedding.embedding_id, documentId, chunk.text, chunk.chunkIndex, documentContentType, document.metadata);
+ chunksStored++;
+ }
+ catch (error) {
+ console.error(`Failed to store chunk ${i + 1}:`, error instanceof Error ? error.message : String(error));
+ }
+ }
+ console.log(`✓ Stored document and ${chunksStored} chunks in database`);
+ // Phase 6: Vector Index Updates
+ console.log('\n--- Phase 6: Vector Index Updates ---');
+ await this.updateVectorIndex(embeddings);
+ const endTime = Date.now();
+ const processingTimeMs = endTime - startTime;
+ console.log('\n=== Memory Ingestion Complete ===');
+ console.log(`Content ID: ${contentResult.contentId}`);
+ console.log(`Chunks created: ${chunks.length}`);
+ console.log(`Embeddings generated: ${embeddings.length}`);
+ console.log(`Chunks stored: ${chunksStored}`);
+ console.log(`Embedding errors: ${embeddingErrors}`);
+ console.log(`Total time: ${(processingTimeMs / 1000).toFixed(2)}s`);
+ return contentResult.contentId;
+ }
+ catch (error) {
+ console.error('\n=== Memory Ingestion Failed ===');
+ console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
+ throw new Error(`Memory ingestion failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
+ }
+ }
+ /**
+ * Ingest documents from a path (file or directory)
+ * Implements the complete pipeline: file processing → chunking → embedding → storage
+ * Enhanced to handle mixed content types (text and images) in multimodal mode
+ */
+ async ingestPath(path, options = {}) {
+ const startTime = Date.now();
+ console.log(`\n=== Starting ingestion from: ${path} ===`);
+ try {
+ // Phase 1: File Discovery and Processing with Content-Type Detection
+ console.log('\n--- Phase 1: File Discovery and Processing ---');
+ const mode = options.mode || 'text';
+ const fileOptions = {
+ recursive: true,
+ maxFileSize: 10 * 1024 * 1024, // 10MB
+ ...options.fileOptions,
+ mode
+ };
+ const fileResult = await discoverAndProcessFiles(path, fileOptions, this.pathManager);
+ // Additional filtering as fallback (should be minimal with mode-aware discovery)
+ const filteredResult = this.filterDocumentsByMode(fileResult, mode);
+ if (filteredResult.documents.length === 0) {
+ console.log('No documents found to process');
+ return {
+ documentsProcessed: 0,
+ chunksCreated: 0,
+ embeddingsGenerated: 0,
+ documentErrors: filteredResult.processingResult.errors.length,
+ embeddingErrors: 0,
+ processingTimeMs: Date.now() - startTime,
+ contentIds: []
+ };
+ }
+ // Content-type detection and routing
+ const contentTypeStats = this.analyzeContentTypes(filteredResult.documents);
+ console.log(`📊 Content analysis: ${contentTypeStats.text} text, ${contentTypeStats.image} image, ${contentTypeStats.other} other files`);
+ // Phase 2: Document Chunking with Content-Type Awareness
+ console.log('\n--- Phase 2: Document Chunking ---');
+ const effectiveChunkConfig = options.chunkConfig || this.defaultChunkConfig || {
+ chunkSize: config.chunk_size,
+ chunkOverlap: config.chunk_overlap
+ };
+ const chunkingResult = await this.chunkDocumentsWithContentTypes(filteredResult.documents, effectiveChunkConfig, options.mode);
+ if (chunkingResult.totalChunks === 0) {
+ console.log('No chunks created from documents');
+ return {
+ documentsProcessed: fileResult.documents.length,
+ chunksCreated: 0,
+ embeddingsGenerated: 0,
+ documentErrors: fileResult.processingResult.errors.length,
+ embeddingErrors: 0,
+ processingTimeMs: Date.now() - startTime,
+ contentIds: []
+ };
+ }
+ // Phase 3: Embedding Generation with Content-Type Support
+ console.log('\n--- Phase 3: Embedding Generation ---');
+ const embeddingResult = await this.generateEmbeddingsWithContentTypes(chunkingResult.allChunks);
+ // Phase 4: Database and Index Storage with Content-Type Metadata
+ console.log('\n--- Phase 4: Storage Operations ---');
+ const contentIds = await this.storeDocumentsAndChunksWithContentTypes(chunkingResult.documentChunks, embeddingResult.embeddings);
+ // Phase 5: Vector Index Updates
+ console.log('\n--- Phase 5: Vector Index Updates ---');
+ await this.updateVectorIndex(embeddingResult.embeddings);
+ const endTime = Date.now();
+ const processingTimeMs = endTime - startTime;
+ const result = {
+ documentsProcessed: filteredResult.documents.length,
+ chunksCreated: chunkingResult.totalChunks,
+ embeddingsGenerated: embeddingResult.embeddings.length,
+ documentErrors: filteredResult.processingResult.errors.length,
+ embeddingErrors: embeddingResult.errors,
+ processingTimeMs,
+ contentIds
+ };
+ console.log('\n=== Ingestion Complete ===');
+ console.log(`Documents processed: ${result.documentsProcessed}`);
+ console.log(`Chunks created: ${result.chunksCreated}`);
+ console.log(`Embeddings generated: ${result.embeddingsGenerated}`);
+ console.log(`Document errors: ${result.documentErrors}`);
+ console.log(`Embedding errors: ${result.embeddingErrors}`);
+ console.log(`Total time: ${(processingTimeMs / 1000).toFixed(2)}s`);
+ return result;
+ }
+ catch (error) {
+ console.error('\n=== Ingestion Failed ===');
+ console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
+ throw new Error(`Ingestion failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
+ }
+ }
+ /**
+ * Analyze content types in the document collection
+ * @private
+ */
+ analyzeContentTypes(documents) {
+ const stats = { text: 0, image: 0, other: 0 };
+ for (const document of documents) {
+ const contentType = document.metadata?.contentType || 'text';
+ switch (contentType) {
+ case 'text':
+ stats.text++;
+ break;
+ case 'image':
+ stats.image++;
+ break;
+ default:
+ stats.other++;
+ break;
+ }
+ }
+ return stats;
+ }
+ /**
+ * Chunk all documents and organize results with content-type awareness
+ * Enhanced to handle different content types appropriately
+ */
+ async chunkDocumentsWithContentTypes(documents, chunkConfig, mode) {
+ const documentChunks = [];
+ const allChunks = [];
+ let totalChunks = 0;
+ console.log(`Processing ${documents.length} document${documents.length === 1 ? '' : 's'} for chunking...`);
+ for (let i = 0; i < documents.length; i++) {
+ const document = documents[i];
+ try {
+ const contentType = document.metadata?.contentType || 'text';
+ // Handle different content types appropriately
+ let chunks;
+ if (contentType === 'image') {
+ // For images, create a single chunk with the full content (description + metadata)
+ chunks = [{
+ text: document.content,
+ chunkIndex: 0,
+ contentType: 'image',
+ metadata: document.metadata
+ }];
+ }
+ else if (mode === 'multimodal') {
+ // In multimodal mode, don't chunk text - CLIP handles truncation at 77 tokens
+ // Chunking doesn't make sense because CLIP can't handle long text anyway
+ chunks = [{
+ text: document.content,
+ chunkIndex: 0,
+ contentType: 'text',
+ metadata: document.metadata
+ }];
+ }
+ else {
+ // For text mode, use normal chunking
+ const textChunks = await chunkDocument(document, chunkConfig);
+ chunks = textChunks.map(chunk => ({
+ ...chunk,
+ contentType: 'text',
+ metadata: document.metadata
+ }));
+ }
+ documentChunks.push({ document, chunks });
+ // Collect all chunks with their content type information
+ for (const chunk of chunks) {
+ allChunks.push({
+ text: chunk.text,
+ contentType: chunk.contentType,
+ metadata: chunk.metadata
+ });
+ }
+ totalChunks += chunks.length;
+ // Progress logging - more frequent for better user experience
+ if (documents.length <= 10 || (i + 1) % Math.max(1, Math.floor(documents.length / 10)) === 0 || i === documents.length - 1) {
+ const percentage = Math.round(((i + 1) / documents.length) * 100);
+ console.log(`Processed ${i + 1} of ${documents.length} documents (${percentage}%) - ${totalChunks} chunks created`);
+ }
+ }
+ catch (error) {
+ console.error(`Failed to chunk document ${document.source}:`, error instanceof Error ? error.message : String(error));
+ // Continue with other documents
+ continue;
+ }
+ }
+ console.log(`✓ Chunking complete: Created ${totalChunks} chunks from ${documentChunks.length} documents`);
+ return { documentChunks, allChunks, totalChunks };
+ }
+ /**
+ * Generate embeddings for all chunks with content-type support
+ * Enhanced to handle different content types and pass metadata to embedding function
+ */
+ async generateEmbeddingsWithContentTypes(chunks) {
+ console.log(`Generating embeddings for ${chunks.length} chunk${chunks.length === 1 ? '' : 's'}...`);
+ console.log('This may take a few minutes depending on the number of chunks...');
+ try {
+ // Generate embeddings using injected embed function with content type support
+ const embeddings = [];
+ let errors = 0;
+ for (let i = 0; i < chunks.length; i++) {
+ const chunk = chunks[i];
+ try {
+ // Convert MIME type to simple content type for embedding function
+ const contentTypeForEmbedding = this.getContentTypeForEmbedding(chunk.contentType);
+ // For images, use the image path from metadata instead of text description
+ let contentForEmbedding = chunk.text;
+ if (contentTypeForEmbedding === 'image' && chunk.metadata) {
+ // Try to get image path from metadata (originalPath or contentPath)
+ const imagePath = chunk.metadata.originalPath || chunk.metadata.contentPath || chunk.metadata.source;
+ if (imagePath) {
+ contentForEmbedding = imagePath;
+ }
+ else {
+ // Fallback: try to extract path from source if available
+ console.warn(`Image chunk ${i + 1} missing image path in metadata, using text content as fallback`);
+ }
+ }
+ const embedding = await this.embedFn(contentForEmbedding, contentTypeForEmbedding);
+ // Enhance embedding result with content type metadata if not already present
+ if (!embedding.contentType) {
+ embedding.contentType = contentTypeForEmbedding;
+ }
+ if (!embedding.metadata && chunk.metadata) {
+ embedding.metadata = chunk.metadata;
+ }
+ embeddings.push(embedding);
+ }
+ catch (error) {
+ console.warn(`Failed to embed ${chunk.contentType} chunk ${i + 1}:`, error instanceof Error ? error.message : String(error));
+ errors++;
+ }
+ // Progress logging
+ if (chunks.length > 10 && (i + 1) % Math.max(1, Math.floor(chunks.length / 10)) === 0) {
+ const percentage = Math.round(((i + 1) / chunks.length) * 100);
+ console.log(`Generated ${i + 1} of ${chunks.length} embeddings (${percentage}%)`);
+ }
+ }
+ if (errors > 0) {
+ console.warn(`⚠ Warning: ${errors} chunk${errors === 1 ? '' : 's'} failed embedding and ${errors === 1 ? 'was' : 'were'} skipped`);
+ }
+ console.log(`✓ Generated ${embeddings.length} embeddings successfully`);
+ return { embeddings, errors };
+ }
+ catch (error) {
+ console.error('Critical embedding failure:', error instanceof Error ? error.message : String(error));
+ throw new Error(`Embedding generation failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
+ }
+ }
+ /**
+ * Store documents and chunks in database with content-type support
+ * Enhanced to handle content type metadata and multimodal content
+ * @returns Array of content IDs for successfully stored documents
+ */
+ async storeDocumentsAndChunksWithContentTypes(documentChunks, embeddings) {
+ console.log(`Storing ${documentChunks.length} document${documentChunks.length === 1 ? '' : 's'} and chunks in database...`);
+ // Create a mapping of chunk text to embedding for efficient lookup
+ const embeddingMap = new Map();
+ let embeddingIndex = 0;
+ // Build mapping - this assumes embeddings are in the same order as chunks were processed
+ for (const { chunks } of documentChunks) {
+ for (const chunk of chunks) {
+ if (embeddingIndex < embeddings.length) {
+ embeddingMap.set(chunk.text, embeddings[embeddingIndex]);
+ embeddingIndex++;
+ }
+ }
+ }
+ let totalChunksStored = 0;
+ let documentsStored = 0;
+ const contentIds = [];
+ // Process each document sequentially
+ for (const { document, chunks } of documentChunks) {
+ try {
+ // Generate content ID for filesystem content using ContentManager
+ let contentId = document.metadata?.contentId;
+ if (!contentId) {
+ try {
+ // Use ContentManager to create filesystem reference and get content ID
+ const contentResult = await this.contentManager.ingestFromFilesystem(document.source);
+ contentId = contentResult.contentId;
+ // Update document metadata with content ID
+ if (!document.metadata) {
+ document.metadata = {};
+ }
+ document.metadata.contentId = contentId;
+ document.metadata.storageType = contentResult.storageType;
+ }
+ catch (contentError) {
+ console.warn(`Failed to create content reference for ${document.source}:`, contentError instanceof Error ? contentError.message : String(contentError));
+ // Continue without content ID - fallback to legacy behavior
+ }
+ }
+ // Insert or get existing document with content type support and content_id reference
+ const documentContentType = document.metadata?.contentType || 'text';
+ const documentId = await upsertDocument(this.db, document.source, document.title, documentContentType, document.metadata, contentId);
+ documentsStored++;
+ // Add content ID to results if available
+ if (contentId) {
+ contentIds.push(contentId);
+ }
+ // Insert all chunks for this document with content type support
+ let chunksStoredForDoc = 0;
+ for (const chunk of chunks) {
+ const embedding = embeddingMap.get(chunk.text);
+ if (embedding) {
+ try {
+ const chunkContentType = chunk.contentType || documentContentType;
+ const chunkMetadata = chunk.metadata || document.metadata;
+ await insertChunk(this.db, embedding.embedding_id, documentId, chunk.text, chunk.chunkIndex, chunkContentType, chunkMetadata);
+ chunksStoredForDoc++;
+ totalChunksStored++;
+ }
+ catch (chunkError) {
+ console.error(`Failed to store ${chunk.contentType || 'text'} chunk ${chunk.chunkIndex} for document ${document.source}:`, chunkError instanceof Error ? chunkError.message : String(chunkError));
+ // Continue with other chunks
+ }
+ }
+ else {
+ console.warn(`No embedding found for chunk ${chunk.chunkIndex} in document ${document.source}`);
+ }
+ }
+ // Progress logging for storage
+ if (documentChunks.length <= 20 || documentsStored % Math.max(1, Math.floor(documentChunks.length / 10)) === 0 || documentsStored === documentChunks.length) {
+ const percentage = Math.round((documentsStored / documentChunks.length) * 100);
+ console.log(`Stored ${documentsStored} of ${documentChunks.length} documents (${percentage}%) - ${totalChunksStored} chunks total`);
+ }
+ }
+ catch (docError) {
+ console.error(`Failed to store document ${document.source}:`, docError instanceof Error ? docError.message : String(docError));
+ // Continue with other documents
+ }
+ }
+ console.log(`✓ Storage complete: ${documentsStored} documents, ${totalChunksStored} chunks saved to database`);
+ return contentIds;
+ }
+ /**
+ * Update vector index with new embeddings (supports grouped content type storage)
+ */
+ async updateVectorIndex(embeddings) {
+ console.log('updateVectorIndex called with', embeddings.length, 'embeddings');
+ if (embeddings.length === 0) {
+ console.log('No embeddings to add to vector index');
+ return;
+ }
+ console.log(`Adding ${embeddings.length} vector${embeddings.length === 1 ? '' : 's'} to search index...`);
+ try {
+ // Group embeddings by content type for optimized storage
+ const groupedEmbeddings = embeddings.reduce((groups, embedding) => {
+ const contentType = embedding.contentType || 'text';
+ if (!groups[contentType]) {
+ groups[contentType] = [];
+ }
+ groups[contentType].push(embedding);
+ return groups;
+ }, {});
+ const textEmbeddings = groupedEmbeddings.text || [];
+ const imageEmbeddings = groupedEmbeddings.image || [];
+ console.log(`Grouped: ${textEmbeddings.length} text, ${imageEmbeddings.length} image vectors`);
+ // Use grouped storage method if available, fallback to regular method
+ if (this.indexManager.addGroupedEmbeddings) {
+ await this.indexManager.addGroupedEmbeddings(textEmbeddings, imageEmbeddings);
+ }
+ else {
+ await this.indexManager.addVectors(embeddings);
+ }
+ console.log(`✓ Vector index updated successfully with ${embeddings.length} new vectors`);
+ }
+ catch (error) {
+ console.error('Failed to update vector index:', error instanceof Error ? error.message : String(error));
+ throw error;
+ }
+ }
+ /**
+ * Filter documents based on ingestion mode to avoid processing incompatible content types
+ */
+ filterDocumentsByMode(fileResult, mode) {
+ if (mode === 'multimodal') {
+ // In multimodal mode, keep all documents
+ return fileResult;
+ }
+ // In text mode, filter out image documents
+ const filteredDocuments = fileResult.documents.filter(doc => {
+ const contentType = doc.metadata?.contentType || 'text';
+ const isCompatible = contentType === 'text' ||
+ contentType.startsWith('text/') ||
+ contentType === 'application/pdf' ||
+ contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
+ if (!isCompatible) {
+ console.log(`⚠️ Skipping ${doc.source} (${contentType}) - not compatible with text mode`);
+ }
+ return isCompatible;
+ });
+ // Update processing result to reflect filtering
+ const filteredProcessingResult = {
+ ...fileResult.processingResult,
+ skippedFiles: [
+ ...(fileResult.processingResult.skippedFiles || []),
+ ...fileResult.documents
+ .filter(doc => !filteredDocuments.includes(doc))
+ .map(doc => ({
+ path: doc.source,
+ reason: `Content type not compatible with ${mode} mode`
+ }))
+ ]
+ };
+ return {
+ documents: filteredDocuments,
+ discoveryResult: fileResult.discoveryResult,
+ processingResult: filteredProcessingResult
+ };
+ }
+ /**
+ * Converts MIME type to simple content type for embedding function
+ * @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
+ * @returns Simple content type ('text', 'image', etc.)
+ */
+ getContentTypeForEmbedding(contentType) {
+ if (!contentType) {
+ return 'text';
+ }
+ // Handle simple content type strings (used by chunking)
+ if (contentType === 'image') {
+ return 'image';
+ }
+ else if (contentType === 'text') {
+ return 'text';
+ }
+ // Convert MIME types to simple content types (legacy support)
+ if (contentType.startsWith('text/')) {
+ return 'text';
+ }
+ else if (contentType.startsWith('image/')) {
+ return 'image';
+ }
+ else if (contentType === 'application/pdf') {
+ return 'text'; // PDFs are processed as text
+ }
+ else if (contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
+ return 'text'; // DOCX files are processed as text
+ }
+ else {
+ return 'text'; // Default to text for unknown types
+ }
+ }
+ /**
+ * Save the vector index to disk
+ */
+ async saveIndex() {
+ await this.indexManager.saveIndex();
+ }
+ /**
+ * Process image content from memory using the existing image processing pipeline
+ * @private
+ */
+ async processImageFromMemory(content, contentResult, metadata, options) {
+ try {
+ // Import image processing functions
+ const { generateImageDescriptionForFile, extractImageMetadataForFile } = await import('../file-processor.js');
+ // Use the content path from the content manager (where the image is stored)
+ const imagePath = contentResult.contentPath;
+ // Extract image metadata
+ let imageMetadata = {};
+ try {
+ imageMetadata = await extractImageMetadataForFile(imagePath);
+ }
+ catch (error) {
+ console.warn(`Failed to extract image metadata for ${metadata.displayName}:`, error instanceof Error ? error.message : String(error));
+ // Continue with empty metadata
+ }
+ // Generate text description for the image
+ let descriptionResult = { description: 'Image content', model: 'none', confidence: 0 };
+ try {
+ const imageToTextOptions = {}; // Use default options for now
+ descriptionResult = await generateImageDescriptionForFile(imagePath, imageToTextOptions);
+ console.log(`✓ Generated image description: "${descriptionResult.description}"`);
+ }
+ catch (error) {
+ console.warn(`Failed to generate image description for ${metadata.displayName}:`, error instanceof Error ? error.message : String(error));
+ // Continue with fallback description
+ }
+ // Update metadata with description information
+ imageMetadata.description = descriptionResult.description;
+ imageMetadata.descriptionModel = descriptionResult.model;
+ imageMetadata.descriptionConfidence = descriptionResult.confidence;
+ // Create document with image description as content
+ const title = metadata.displayName;
+ // Create content that includes description and key metadata
+ const contentParts = [
+ `Image: ${title}`,
+ `Description: ${descriptionResult.description}`
+ ];
+ if (imageMetadata.dimensions) {
+ contentParts.push(`Dimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}`);
+ }
+ if (imageMetadata.format) {
+ contentParts.push(`Format: ${imageMetadata.format}`);
+ }
+ const documentContent = contentParts.join('\n');
+ return {
+ source: metadata.displayName,
+ title,
+ content: documentContent.trim(),
+ metadata: {
+ contentType: 'image',
+ contentId: contentResult.contentId,
+ storageType: contentResult.storageType,
+ contentPath: contentResult.contentPath, // Store contentPath for embedding
+ originalPath: metadata.originalPath,
+ ...imageMetadata // Spread all image metadata fields
+ }
+ };
+ }
+ catch (error) {
+ console.warn(`Failed to process image from memory, falling back to basic processing:`, error instanceof Error ? error.message : String(error));
+ // Fallback to basic document creation
+ return {
+ source: metadata.displayName,
+ title: metadata.displayName,
+ content: `Image: ${metadata.displayName}\nPath: ${contentResult.contentPath}`,
+ metadata: {
+ contentType: 'image',
+ contentId: contentResult.contentId,
+ storageType: contentResult.storageType,
+ contentPath: contentResult.contentPath, // Store contentPath for embedding
+ originalPath: metadata.originalPath,
+ processingError: error instanceof Error ? error.message : String(error)
+ }
+ };
+ }
+ }
+ /**
+ * Process PDF content from memory using the existing PDF processing pipeline
+ * @private
+ */
+ async processPDFFromMemory(content, contentResult, metadata, options) {
+ try {
+ // Import PDF processing
+ const pdfParse = require('pdf-parse');
+ // Parse PDF content directly from buffer
+ const pdfData = await pdfParse(content);
+ console.log(`✓ Extracted ${pdfData.text.length} characters from PDF`);
+ return {
+ source: metadata.displayName,
+ title: metadata.displayName,
+ content: pdfData.text.trim(),
+ metadata: {
+ contentType: 'application/pdf',
+ contentId: contentResult.contentId,
+ storageType: contentResult.storageType,
+ originalPath: metadata.originalPath,
+ pages: pdfData.numpages,
+ pdfInfo: pdfData.info
+ }
+ };
+ }
+ catch (error) {
+ console.warn(`Failed to process PDF from memory, falling back to basic processing:`, error instanceof Error ? error.message : String(error));
+ // Fallback to basic document creation
+ return {
+ source: metadata.displayName,
+ title: metadata.displayName,
+ content: `PDF Document: ${metadata.displayName}\nPath: ${contentResult.contentPath}`,
+ metadata: {
+ contentType: 'application/pdf',
+ contentId: contentResult.contentId,
+ storageType: contentResult.storageType,
+ originalPath: metadata.originalPath,
+ processingError: error instanceof Error ? error.message : String(error)
+ }
+ };
+ }
+ }
+ /**
+ * Process DOCX content from memory using the existing DOCX processing pipeline
+ * @private
+ */
+ async processDOCXFromMemory(content, contentResult, metadata, options) {
+ try {
+ // Import DOCX processing
+ const mammoth = await import('mammoth');
+ // Parse DOCX content directly from buffer
+ const docxResult = await mammoth.extractRawText({ buffer: content });
+ console.log(`✓ Extracted ${docxResult.value.length} characters from DOCX`);
+ return {
+ source: metadata.displayName,
+ title: metadata.displayName,
+ content: docxResult.value.trim(),
+ metadata: {
+ contentType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+ contentId: contentResult.contentId,
+ storageType: contentResult.storageType,
+ originalPath: metadata.originalPath,
+ messages: docxResult.messages
+ }
+ };
+ }
+ catch (error) {
+ console.warn(`Failed to process DOCX from memory, falling back to basic processing:`, error instanceof Error ? error.message : String(error));
+ // Fallback to basic document creation
+ return {
+ source: metadata.displayName,
+ title: metadata.displayName,
+ content: `DOCX Document: ${metadata.displayName}\nPath: ${contentResult.contentPath}`,
+ metadata: {
+ contentType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+ contentId: contentResult.contentId,
+ storageType: contentResult.storageType,
+ originalPath: metadata.originalPath,
+ processingError: error instanceof Error ? error.message : String(error)
+ }
+ };
+ }
+ }
+ /**
+ * Clean up resources - explicit cleanup method
+ */
+ async cleanup() {
+ try {
+ // Clean up ContentManager to prevent resource leaks
+ if (this.contentManager && typeof this.contentManager.cleanup === 'function') {
+ this.contentManager.cleanup();
+ }
+ await this.db.close();
+ await this.indexManager.close();
+ }
+ catch (error) {
+ console.error('Error during IngestionPipeline cleanup:', error instanceof Error ? error.message : String(error));
+ }
+ }
+ }
+ //# sourceMappingURL=ingestion.js.map
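
Read together with the JSDoc examples embedded in the file above, the pipeline is used roughly as follows. This is a minimal sketch assembled from those examples: the factory names (createTextEmbedFunction, openDatabase, IndexManager) and the root import path are assumptions about the public surface rather than verified exports.

```typescript
// Minimal sketch based on the JSDoc usage examples in the diff above.
// Assumed public surface: createTextEmbedFunction, IndexManager, openDatabase,
// and IngestionPipeline re-exported from the package root.
import { IngestionPipeline, IndexManager, openDatabase, createTextEmbedFunction } from 'rag-lite-ts';

const embedFn = createTextEmbedFunction();              // text embedding function
const indexManager = new IndexManager('./index.bin');   // vector index storage
const db = await openDatabase('./db.sqlite');           // metadata database

// When the contentManager argument is omitted, the constructor creates a default ContentManager.
const pipeline = new IngestionPipeline(embedFn, indexManager, db);

// Memory ingestion returns a content ID (deduplicated when the same content was already stored).
const contentId = await pipeline.ingestFromMemory(
  Buffer.from('# Release notes\n\nrag-lite-ts now ships parallel CJS and ESM builds.', 'utf8'),
  { displayName: 'release-notes.md', contentType: 'text/markdown' }
);

await pipeline.saveIndex();   // persist the vector index to disk
await pipeline.cleanup();     // closes the database and index manager
```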