rag-lite-ts 2.1.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (328)
  1. package/README.md +88 -5
  2. package/dist/{cli → cjs/cli}/indexer.js +73 -15
  3. package/dist/cjs/cli/ui-server.d.ts +5 -0
  4. package/dist/cjs/cli/ui-server.js +152 -0
  5. package/dist/{cli.js → cjs/cli.js} +25 -6
  6. package/dist/{core → cjs/core}/binary-index-format.js +6 -3
  7. package/dist/{core → cjs/core}/db.d.ts +56 -0
  8. package/dist/{core → cjs/core}/db.js +105 -0
  9. package/dist/{core → cjs/core}/ingestion.js +3 -0
  10. package/dist/cjs/core/knowledge-base-manager.d.ts +109 -0
  11. package/dist/cjs/core/knowledge-base-manager.js +256 -0
  12. package/dist/{core → cjs/core}/model-validator.js +1 -1
  13. package/dist/{core → cjs/core}/search-pipeline.js +1 -1
  14. package/dist/{core → cjs/core}/search.js +1 -1
  15. package/dist/cjs/core/vector-index-messages.d.ts +52 -0
  16. package/dist/cjs/core/vector-index-messages.js +5 -0
  17. package/dist/cjs/core/vector-index-worker.d.ts +6 -0
  18. package/dist/cjs/core/vector-index-worker.js +304 -0
  19. package/dist/cjs/core/vector-index.d.ts +107 -0
  20. package/dist/cjs/core/vector-index.js +344 -0
  21. package/dist/{factories → cjs/factories}/ingestion-factory.js +3 -7
  22. package/dist/{factories → cjs/factories}/search-factory.js +11 -0
  23. package/dist/{index-manager.d.ts → cjs/index-manager.d.ts} +23 -3
  24. package/dist/{index-manager.js → cjs/index-manager.js} +84 -15
  25. package/dist/{index.d.ts → cjs/index.d.ts} +2 -1
  26. package/dist/{index.js → cjs/index.js} +3 -1
  27. package/dist/esm/api-errors.d.ts +90 -0
  28. package/dist/esm/api-errors.js +320 -0
  29. package/dist/esm/cli/indexer.d.ts +11 -0
  30. package/dist/esm/cli/indexer.js +529 -0
  31. package/dist/esm/cli/search.d.ts +7 -0
  32. package/dist/esm/cli/search.js +332 -0
  33. package/dist/esm/cli/ui-server.d.ts +5 -0
  34. package/dist/esm/cli/ui-server.js +152 -0
  35. package/dist/esm/cli.d.ts +3 -0
  36. package/dist/esm/cli.js +548 -0
  37. package/dist/esm/config.d.ts +51 -0
  38. package/dist/esm/config.js +79 -0
  39. package/dist/esm/core/abstract-embedder.d.ts +125 -0
  40. package/dist/esm/core/abstract-embedder.js +264 -0
  41. package/dist/esm/core/actionable-error-messages.d.ts +60 -0
  42. package/dist/esm/core/actionable-error-messages.js +397 -0
  43. package/dist/esm/core/adapters.d.ts +93 -0
  44. package/dist/esm/core/adapters.js +139 -0
  45. package/dist/esm/core/batch-processing-optimizer.d.ts +155 -0
  46. package/dist/esm/core/batch-processing-optimizer.js +536 -0
  47. package/dist/esm/core/binary-index-format.d.ts +78 -0
  48. package/dist/esm/core/binary-index-format.js +294 -0
  49. package/dist/esm/core/chunker.d.ts +119 -0
  50. package/dist/esm/core/chunker.js +73 -0
  51. package/dist/esm/core/cli-database-utils.d.ts +53 -0
  52. package/dist/esm/core/cli-database-utils.js +239 -0
  53. package/dist/esm/core/config.d.ts +102 -0
  54. package/dist/esm/core/config.js +247 -0
  55. package/dist/esm/core/content-errors.d.ts +111 -0
  56. package/dist/esm/core/content-errors.js +362 -0
  57. package/dist/esm/core/content-manager.d.ts +335 -0
  58. package/dist/esm/core/content-manager.js +1476 -0
  59. package/dist/esm/core/content-performance-optimizer.d.ts +150 -0
  60. package/dist/esm/core/content-performance-optimizer.js +516 -0
  61. package/dist/esm/core/content-resolver.d.ts +104 -0
  62. package/dist/esm/core/content-resolver.js +285 -0
  63. package/dist/esm/core/cross-modal-search.d.ts +164 -0
  64. package/dist/esm/core/cross-modal-search.js +342 -0
  65. package/dist/esm/core/database-connection-manager.d.ts +109 -0
  66. package/dist/esm/core/database-connection-manager.js +310 -0
  67. package/dist/esm/core/db.d.ts +269 -0
  68. package/dist/esm/core/db.js +1000 -0
  69. package/dist/esm/core/embedder-factory.d.ts +154 -0
  70. package/dist/esm/core/embedder-factory.js +311 -0
  71. package/dist/esm/core/error-handler.d.ts +112 -0
  72. package/dist/esm/core/error-handler.js +239 -0
  73. package/dist/esm/core/index.d.ts +59 -0
  74. package/dist/esm/core/index.js +69 -0
  75. package/dist/esm/core/ingestion.d.ts +202 -0
  76. package/dist/esm/core/ingestion.js +904 -0
  77. package/dist/esm/core/interfaces.d.ts +408 -0
  78. package/dist/esm/core/interfaces.js +106 -0
  79. package/dist/esm/core/knowledge-base-manager.d.ts +109 -0
  80. package/dist/esm/core/knowledge-base-manager.js +256 -0
  81. package/dist/esm/core/lazy-dependency-loader.d.ts +147 -0
  82. package/dist/esm/core/lazy-dependency-loader.js +435 -0
  83. package/dist/esm/core/mode-detection-service.d.ts +150 -0
  84. package/dist/esm/core/mode-detection-service.js +565 -0
  85. package/dist/esm/core/mode-model-validator.d.ts +92 -0
  86. package/dist/esm/core/mode-model-validator.js +203 -0
  87. package/dist/esm/core/model-registry.d.ts +116 -0
  88. package/dist/esm/core/model-registry.js +411 -0
  89. package/dist/esm/core/model-validator.d.ts +217 -0
  90. package/dist/esm/core/model-validator.js +782 -0
  91. package/dist/esm/core/path-manager.d.ts +47 -0
  92. package/dist/esm/core/path-manager.js +71 -0
  93. package/dist/esm/core/raglite-paths.d.ts +121 -0
  94. package/dist/esm/core/raglite-paths.js +145 -0
  95. package/dist/esm/core/reranking-config.d.ts +42 -0
  96. package/dist/esm/core/reranking-config.js +147 -0
  97. package/dist/esm/core/reranking-factory.d.ts +92 -0
  98. package/dist/esm/core/reranking-factory.js +410 -0
  99. package/dist/esm/core/reranking-strategies.d.ts +310 -0
  100. package/dist/esm/core/reranking-strategies.js +650 -0
  101. package/dist/esm/core/resource-cleanup.d.ts +163 -0
  102. package/dist/esm/core/resource-cleanup.js +371 -0
  103. package/dist/esm/core/resource-manager.d.ts +212 -0
  104. package/dist/esm/core/resource-manager.js +564 -0
  105. package/dist/esm/core/search-pipeline.d.ts +111 -0
  106. package/dist/esm/core/search-pipeline.js +287 -0
  107. package/dist/esm/core/search.d.ts +141 -0
  108. package/dist/esm/core/search.js +320 -0
  109. package/dist/esm/core/streaming-operations.d.ts +145 -0
  110. package/dist/esm/core/streaming-operations.js +409 -0
  111. package/dist/esm/core/types.d.ts +66 -0
  112. package/dist/esm/core/types.js +6 -0
  113. package/dist/esm/core/universal-embedder.d.ts +177 -0
  114. package/dist/esm/core/universal-embedder.js +139 -0
  115. package/dist/esm/core/validation-messages.d.ts +99 -0
  116. package/dist/esm/core/validation-messages.js +334 -0
  117. package/dist/esm/core/vector-index-messages.d.ts +52 -0
  118. package/dist/esm/core/vector-index-messages.js +5 -0
  119. package/dist/esm/core/vector-index-worker.d.ts +6 -0
  120. package/dist/esm/core/vector-index-worker.js +304 -0
  121. package/dist/esm/core/vector-index.d.ts +107 -0
  122. package/dist/esm/core/vector-index.js +344 -0
  123. package/dist/esm/dom-polyfills.d.ts +6 -0
  124. package/dist/esm/dom-polyfills.js +37 -0
  125. package/dist/esm/factories/index.d.ts +27 -0
  126. package/dist/esm/factories/index.js +29 -0
  127. package/dist/esm/factories/ingestion-factory.d.ts +200 -0
  128. package/dist/esm/factories/ingestion-factory.js +473 -0
  129. package/dist/esm/factories/search-factory.d.ts +154 -0
  130. package/dist/esm/factories/search-factory.js +355 -0
  131. package/dist/esm/file-processor.d.ts +147 -0
  132. package/dist/esm/file-processor.js +963 -0
  133. package/dist/esm/index-manager.d.ts +136 -0
  134. package/dist/esm/index-manager.js +667 -0
  135. package/dist/esm/index.d.ts +76 -0
  136. package/dist/esm/index.js +112 -0
  137. package/dist/esm/indexer.d.ts +7 -0
  138. package/dist/esm/indexer.js +54 -0
  139. package/dist/esm/ingestion.d.ts +63 -0
  140. package/dist/esm/ingestion.js +124 -0
  141. package/dist/esm/mcp-server.d.ts +46 -0
  142. package/dist/esm/mcp-server.js +1820 -0
  143. package/dist/esm/multimodal/clip-embedder.d.ts +327 -0
  144. package/dist/esm/multimodal/clip-embedder.js +996 -0
  145. package/dist/esm/multimodal/index.d.ts +6 -0
  146. package/dist/esm/multimodal/index.js +6 -0
  147. package/dist/esm/preprocess.d.ts +19 -0
  148. package/dist/esm/preprocess.js +203 -0
  149. package/dist/esm/preprocessors/index.d.ts +17 -0
  150. package/dist/esm/preprocessors/index.js +38 -0
  151. package/dist/esm/preprocessors/mdx.d.ts +25 -0
  152. package/dist/esm/preprocessors/mdx.js +101 -0
  153. package/dist/esm/preprocessors/mermaid.d.ts +68 -0
  154. package/dist/esm/preprocessors/mermaid.js +329 -0
  155. package/dist/esm/preprocessors/registry.d.ts +56 -0
  156. package/dist/esm/preprocessors/registry.js +179 -0
  157. package/dist/esm/run-error-recovery-tests.d.ts +7 -0
  158. package/dist/esm/run-error-recovery-tests.js +101 -0
  159. package/dist/esm/search-standalone.d.ts +7 -0
  160. package/dist/esm/search-standalone.js +117 -0
  161. package/dist/esm/search.d.ts +99 -0
  162. package/dist/esm/search.js +177 -0
  163. package/dist/esm/test-utils.d.ts +18 -0
  164. package/dist/esm/test-utils.js +27 -0
  165. package/dist/esm/text/chunker.d.ts +33 -0
  166. package/dist/esm/text/chunker.js +279 -0
  167. package/dist/esm/text/embedder.d.ts +111 -0
  168. package/dist/esm/text/embedder.js +386 -0
  169. package/dist/esm/text/index.d.ts +8 -0
  170. package/dist/esm/text/index.js +9 -0
  171. package/dist/esm/text/preprocessors/index.d.ts +17 -0
  172. package/dist/esm/text/preprocessors/index.js +38 -0
  173. package/dist/esm/text/preprocessors/mdx.d.ts +25 -0
  174. package/dist/esm/text/preprocessors/mdx.js +101 -0
  175. package/dist/esm/text/preprocessors/mermaid.d.ts +68 -0
  176. package/dist/esm/text/preprocessors/mermaid.js +330 -0
  177. package/dist/esm/text/preprocessors/registry.d.ts +56 -0
  178. package/dist/esm/text/preprocessors/registry.js +180 -0
  179. package/dist/esm/text/reranker.d.ts +49 -0
  180. package/dist/esm/text/reranker.js +274 -0
  181. package/dist/esm/text/sentence-transformer-embedder.d.ts +96 -0
  182. package/dist/esm/text/sentence-transformer-embedder.js +340 -0
  183. package/dist/esm/text/tokenizer.d.ts +22 -0
  184. package/dist/esm/text/tokenizer.js +64 -0
  185. package/dist/esm/types.d.ts +83 -0
  186. package/dist/esm/types.js +3 -0
  187. package/dist/esm/utils/vector-math.d.ts +31 -0
  188. package/dist/esm/utils/vector-math.js +70 -0
  189. package/package.json +39 -14
  190. package/dist/core/vector-index.d.ts +0 -72
  191. package/dist/core/vector-index.js +0 -331
  192. /package/dist/{api-errors.d.ts → cjs/api-errors.d.ts} +0 -0
  193. /package/dist/{api-errors.js → cjs/api-errors.js} +0 -0
  194. /package/dist/{cli → cjs/cli}/indexer.d.ts +0 -0
  195. /package/dist/{cli → cjs/cli}/search.d.ts +0 -0
  196. /package/dist/{cli → cjs/cli}/search.js +0 -0
  197. /package/dist/{cli.d.ts → cjs/cli.d.ts} +0 -0
  198. /package/dist/{config.d.ts → cjs/config.d.ts} +0 -0
  199. /package/dist/{config.js → cjs/config.js} +0 -0
  200. /package/dist/{core → cjs/core}/abstract-embedder.d.ts +0 -0
  201. /package/dist/{core → cjs/core}/abstract-embedder.js +0 -0
  202. /package/dist/{core → cjs/core}/actionable-error-messages.d.ts +0 -0
  203. /package/dist/{core → cjs/core}/actionable-error-messages.js +0 -0
  204. /package/dist/{core → cjs/core}/adapters.d.ts +0 -0
  205. /package/dist/{core → cjs/core}/adapters.js +0 -0
  206. /package/dist/{core → cjs/core}/batch-processing-optimizer.d.ts +0 -0
  207. /package/dist/{core → cjs/core}/batch-processing-optimizer.js +0 -0
  208. /package/dist/{core → cjs/core}/binary-index-format.d.ts +0 -0
  209. /package/dist/{core → cjs/core}/chunker.d.ts +0 -0
  210. /package/dist/{core → cjs/core}/chunker.js +0 -0
  211. /package/dist/{core → cjs/core}/cli-database-utils.d.ts +0 -0
  212. /package/dist/{core → cjs/core}/cli-database-utils.js +0 -0
  213. /package/dist/{core → cjs/core}/config.d.ts +0 -0
  214. /package/dist/{core → cjs/core}/config.js +0 -0
  215. /package/dist/{core → cjs/core}/content-errors.d.ts +0 -0
  216. /package/dist/{core → cjs/core}/content-errors.js +0 -0
  217. /package/dist/{core → cjs/core}/content-manager.d.ts +0 -0
  218. /package/dist/{core → cjs/core}/content-manager.js +0 -0
  219. /package/dist/{core → cjs/core}/content-performance-optimizer.d.ts +0 -0
  220. /package/dist/{core → cjs/core}/content-performance-optimizer.js +0 -0
  221. /package/dist/{core → cjs/core}/content-resolver.d.ts +0 -0
  222. /package/dist/{core → cjs/core}/content-resolver.js +0 -0
  223. /package/dist/{core → cjs/core}/cross-modal-search.d.ts +0 -0
  224. /package/dist/{core → cjs/core}/cross-modal-search.js +0 -0
  225. /package/dist/{core → cjs/core}/database-connection-manager.d.ts +0 -0
  226. /package/dist/{core → cjs/core}/database-connection-manager.js +0 -0
  227. /package/dist/{core → cjs/core}/embedder-factory.d.ts +0 -0
  228. /package/dist/{core → cjs/core}/embedder-factory.js +0 -0
  229. /package/dist/{core → cjs/core}/error-handler.d.ts +0 -0
  230. /package/dist/{core → cjs/core}/error-handler.js +0 -0
  231. /package/dist/{core → cjs/core}/index.d.ts +0 -0
  232. /package/dist/{core → cjs/core}/index.js +0 -0
  233. /package/dist/{core → cjs/core}/ingestion.d.ts +0 -0
  234. /package/dist/{core → cjs/core}/interfaces.d.ts +0 -0
  235. /package/dist/{core → cjs/core}/interfaces.js +0 -0
  236. /package/dist/{core → cjs/core}/lazy-dependency-loader.d.ts +0 -0
  237. /package/dist/{core → cjs/core}/lazy-dependency-loader.js +0 -0
  238. /package/dist/{core → cjs/core}/mode-detection-service.d.ts +0 -0
  239. /package/dist/{core → cjs/core}/mode-detection-service.js +0 -0
  240. /package/dist/{core → cjs/core}/mode-model-validator.d.ts +0 -0
  241. /package/dist/{core → cjs/core}/mode-model-validator.js +0 -0
  242. /package/dist/{core → cjs/core}/model-registry.d.ts +0 -0
  243. /package/dist/{core → cjs/core}/model-registry.js +0 -0
  244. /package/dist/{core → cjs/core}/model-validator.d.ts +0 -0
  245. /package/dist/{core → cjs/core}/path-manager.d.ts +0 -0
  246. /package/dist/{core → cjs/core}/path-manager.js +0 -0
  247. /package/dist/{core → cjs/core}/raglite-paths.d.ts +0 -0
  248. /package/dist/{core → cjs/core}/raglite-paths.js +0 -0
  249. /package/dist/{core → cjs/core}/reranking-config.d.ts +0 -0
  250. /package/dist/{core → cjs/core}/reranking-config.js +0 -0
  251. /package/dist/{core → cjs/core}/reranking-factory.d.ts +0 -0
  252. /package/dist/{core → cjs/core}/reranking-factory.js +0 -0
  253. /package/dist/{core → cjs/core}/reranking-strategies.d.ts +0 -0
  254. /package/dist/{core → cjs/core}/reranking-strategies.js +0 -0
  255. /package/dist/{core → cjs/core}/resource-cleanup.d.ts +0 -0
  256. /package/dist/{core → cjs/core}/resource-cleanup.js +0 -0
  257. /package/dist/{core → cjs/core}/resource-manager.d.ts +0 -0
  258. /package/dist/{core → cjs/core}/resource-manager.js +0 -0
  259. /package/dist/{core → cjs/core}/search-pipeline.d.ts +0 -0
  260. /package/dist/{core → cjs/core}/search.d.ts +0 -0
  261. /package/dist/{core → cjs/core}/streaming-operations.d.ts +0 -0
  262. /package/dist/{core → cjs/core}/streaming-operations.js +0 -0
  263. /package/dist/{core → cjs/core}/types.d.ts +0 -0
  264. /package/dist/{core → cjs/core}/types.js +0 -0
  265. /package/dist/{core → cjs/core}/universal-embedder.d.ts +0 -0
  266. /package/dist/{core → cjs/core}/universal-embedder.js +0 -0
  267. /package/dist/{core → cjs/core}/validation-messages.d.ts +0 -0
  268. /package/dist/{core → cjs/core}/validation-messages.js +0 -0
  269. /package/dist/{dom-polyfills.d.ts → cjs/dom-polyfills.d.ts} +0 -0
  270. /package/dist/{dom-polyfills.js → cjs/dom-polyfills.js} +0 -0
  271. /package/dist/{factories → cjs/factories}/index.d.ts +0 -0
  272. /package/dist/{factories → cjs/factories}/index.js +0 -0
  273. /package/dist/{factories → cjs/factories}/ingestion-factory.d.ts +0 -0
  274. /package/dist/{factories → cjs/factories}/search-factory.d.ts +0 -0
  275. /package/dist/{file-processor.d.ts → cjs/file-processor.d.ts} +0 -0
  276. /package/dist/{file-processor.js → cjs/file-processor.js} +0 -0
  277. /package/dist/{indexer.d.ts → cjs/indexer.d.ts} +0 -0
  278. /package/dist/{indexer.js → cjs/indexer.js} +0 -0
  279. /package/dist/{ingestion.d.ts → cjs/ingestion.d.ts} +0 -0
  280. /package/dist/{ingestion.js → cjs/ingestion.js} +0 -0
  281. /package/dist/{mcp-server.d.ts → cjs/mcp-server.d.ts} +0 -0
  282. /package/dist/{mcp-server.js → cjs/mcp-server.js} +0 -0
  283. /package/dist/{multimodal → cjs/multimodal}/clip-embedder.d.ts +0 -0
  284. /package/dist/{multimodal → cjs/multimodal}/clip-embedder.js +0 -0
  285. /package/dist/{multimodal → cjs/multimodal}/index.d.ts +0 -0
  286. /package/dist/{multimodal → cjs/multimodal}/index.js +0 -0
  287. /package/dist/{preprocess.d.ts → cjs/preprocess.d.ts} +0 -0
  288. /package/dist/{preprocess.js → cjs/preprocess.js} +0 -0
  289. /package/dist/{preprocessors → cjs/preprocessors}/index.d.ts +0 -0
  290. /package/dist/{preprocessors → cjs/preprocessors}/index.js +0 -0
  291. /package/dist/{preprocessors → cjs/preprocessors}/mdx.d.ts +0 -0
  292. /package/dist/{preprocessors → cjs/preprocessors}/mdx.js +0 -0
  293. /package/dist/{preprocessors → cjs/preprocessors}/mermaid.d.ts +0 -0
  294. /package/dist/{preprocessors → cjs/preprocessors}/mermaid.js +0 -0
  295. /package/dist/{preprocessors → cjs/preprocessors}/registry.d.ts +0 -0
  296. /package/dist/{preprocessors → cjs/preprocessors}/registry.js +0 -0
  297. /package/dist/{run-error-recovery-tests.d.ts → cjs/run-error-recovery-tests.d.ts} +0 -0
  298. /package/dist/{run-error-recovery-tests.js → cjs/run-error-recovery-tests.js} +0 -0
  299. /package/dist/{search-standalone.d.ts → cjs/search-standalone.d.ts} +0 -0
  300. /package/dist/{search-standalone.js → cjs/search-standalone.js} +0 -0
  301. /package/dist/{search.d.ts → cjs/search.d.ts} +0 -0
  302. /package/dist/{search.js → cjs/search.js} +0 -0
  303. /package/dist/{test-utils.d.ts → cjs/test-utils.d.ts} +0 -0
  304. /package/dist/{test-utils.js → cjs/test-utils.js} +0 -0
  305. /package/dist/{text → cjs/text}/chunker.d.ts +0 -0
  306. /package/dist/{text → cjs/text}/chunker.js +0 -0
  307. /package/dist/{text → cjs/text}/embedder.d.ts +0 -0
  308. /package/dist/{text → cjs/text}/embedder.js +0 -0
  309. /package/dist/{text → cjs/text}/index.d.ts +0 -0
  310. /package/dist/{text → cjs/text}/index.js +0 -0
  311. /package/dist/{text → cjs/text}/preprocessors/index.d.ts +0 -0
  312. /package/dist/{text → cjs/text}/preprocessors/index.js +0 -0
  313. /package/dist/{text → cjs/text}/preprocessors/mdx.d.ts +0 -0
  314. /package/dist/{text → cjs/text}/preprocessors/mdx.js +0 -0
  315. /package/dist/{text → cjs/text}/preprocessors/mermaid.d.ts +0 -0
  316. /package/dist/{text → cjs/text}/preprocessors/mermaid.js +0 -0
  317. /package/dist/{text → cjs/text}/preprocessors/registry.d.ts +0 -0
  318. /package/dist/{text → cjs/text}/preprocessors/registry.js +0 -0
  319. /package/dist/{text → cjs/text}/reranker.d.ts +0 -0
  320. /package/dist/{text → cjs/text}/reranker.js +0 -0
  321. /package/dist/{text → cjs/text}/sentence-transformer-embedder.d.ts +0 -0
  322. /package/dist/{text → cjs/text}/sentence-transformer-embedder.js +0 -0
  323. /package/dist/{text → cjs/text}/tokenizer.d.ts +0 -0
  324. /package/dist/{text → cjs/text}/tokenizer.js +0 -0
  325. /package/dist/{types.d.ts → cjs/types.d.ts} +0 -0
  326. /package/dist/{types.js → cjs/types.js} +0 -0
  327. /package/dist/{utils → cjs/utils}/vector-math.d.ts +0 -0
  328. /package/dist/{utils → cjs/utils}/vector-math.js +0 -0
@@ -0,0 +1,963 @@
1
+ import { promises as fs } from 'fs';
2
+ import { join, extname, basename, resolve } from 'path';
3
+ import { ErrorCategory, ErrorSeverity, safeExecute } from './core/error-handler.js';
4
+ import { preprocessDocument } from './preprocess.js';
5
+ import { config } from './core/config.js';
6
+ import { DocumentPathManager } from './core/path-manager.js';
7
+ import { createRequire } from 'module';
8
+ import { JSDOM } from 'jsdom';
9
// pdf-parse is loaded with require() further down, so this ES module needs its own require function.
const require = createRequire(import.meta.url);
// A jsdom window supplies the browser classes that pdf-parse looks up on the global object.
const dom = new JSDOM('<!DOCTYPE html><html><body></body></html>', {
    pretendToBeVisual: true,
    resources: 'usable'
});
const domWindow = dom.window;
// Each global prefers the jsdom implementation and falls back to a minimal stand-in class
// when the jsdom window does not expose one.
global.DOMMatrix = domWindow.DOMMatrix || class {
    // Fields start out as the identity-transform values.
    a = 1;
    b = 0;
    c = 0;
    d = 1;
    e = 0;
    f = 0;
    constructor() { }
};
global.ImageData = domWindow.ImageData || class {
    width;
    height;
    data;
    constructor(width, height) {
        this.width = width;
        this.height = height;
        // Four bytes per pixel.
        this.data = new Uint8ClampedArray(width * height * 4);
    }
};
global.Path2D = domWindow.Path2D || class {
    // Path-building calls are accepted and ignored.
    constructor() { }
    moveTo() { }
    lineTo() { }
    closePath() { }
};
// Must come after the globals above are in place.
const pdfParse = require('pdf-parse');
42
+ import * as mammoth from 'mammoth';
43
/**
 * File extensions treated as text documents during ingestion
 */
const SUPPORTED_TEXT_EXTENSIONS = [
    '.md',
    '.txt',
    '.mdx',
    '.pdf',
    '.docx'
];
/**
 * File extensions treated as images during multimodal ingestion
 */
const SUPPORTED_IMAGE_EXTENSIONS = [
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.webp',
    '.bmp'
];
/**
 * Every supported extension: the text list followed by the image list
 */
const SUPPORTED_EXTENSIONS = SUPPORTED_TEXT_EXTENSIONS.concat(SUPPORTED_IMAGE_EXTENSIONS);
/**
 * Options used for file discovery when the caller supplies none
 */
export const DEFAULT_FILE_PROCESSOR_OPTIONS = {
    recursive: true,
    maxFileSize: 10 * 1024 * 1024 // 10MB
};
/**
 * Options used for image-to-text processing when the caller supplies none
 */
export const DEFAULT_IMAGE_TO_TEXT_OPTIONS = {
    model: 'Xenova/vit-gpt2-image-captioning',
    maxLength: 50,
    batchSize: 4,
    includeConfidence: false
};
71
/**
 * Report whether a path's extension, compared case-insensitively, appears in
 * SUPPORTED_EXTENSIONS.
 */
function isSupportedFile(filePath) {
    return SUPPORTED_EXTENSIONS.includes(extname(filePath).toLowerCase());
}
78
/**
 * Classify a path as 'image' or 'text' from its extension.
 * Anything that is not a known image extension is reported as 'text'.
 */
function getContentType(filePath) {
    const extension = extname(filePath).toLowerCase();
    return SUPPORTED_IMAGE_EXTENSIONS.includes(extension) ? 'image' : 'text';
}
88
/**
 * Report whether a path's extension, compared case-insensitively, is one of
 * the supported image extensions.
 */
function isImageFile(filePath) {
    const extension = extname(filePath).toLowerCase();
    return SUPPORTED_IMAGE_EXTENSIONS.some((candidate) => candidate === extension);
}
95
/**
 * Validate that a path points at a readable, non-empty image whose leading
 * bytes match the magic number implied by its extension.
 *
 * Only the first 12 bytes are read from disk; that is enough for every
 * signature checked here, so large images are never loaded into memory just
 * to be validated.
 *
 * @param {string} filePath - Path of the image to check.
 * @returns {Promise<{valid: boolean, error?: string}>} `{ valid: true }` on
 *   success, otherwise `{ valid: false, error }` with a human-readable reason.
 *   Never rejects: filesystem failures are reported through `error`.
 */
async function validateImageFile(filePath) {
    const maxImageSize = 50 * 1024 * 1024; // 50MB for images
    const headerLength = 12; // longest span inspected below (WebP: bytes 0-11)
    const jpegChecks = [
        { offset: 0, bytes: [0xFF, 0xD8], error: 'Invalid JPEG file format' }
    ];
    // Per-extension signature checks, evaluated in order; the first mismatch wins.
    // Extensions without an entry are accepted without a signature check.
    const signatureChecks = {
        '.jpg': jpegChecks,
        '.jpeg': jpegChecks,
        '.png': [
            { offset: 0, bytes: [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A], error: 'Invalid PNG file format' }
        ],
        '.gif': [
            { offset: 0, bytes: [0x47, 0x49, 0x46], error: 'Invalid GIF file format' } // "GIF"
        ],
        '.webp': [
            // WebP: "RIFF" at start and "WEBP" at offset 8
            { offset: 0, bytes: [0x52, 0x49, 0x46, 0x46], error: 'Invalid WebP file format (missing RIFF header)' },
            { offset: 8, bytes: [0x57, 0x45, 0x42, 0x50], error: 'Invalid WebP file format (missing WEBP signature)' }
        ],
        '.bmp': [
            { offset: 0, bytes: [0x42, 0x4D], error: 'Invalid BMP file format' } // "BM"
        ]
    };
    try {
        const stats = await fs.stat(filePath);
        if (!stats.isFile()) {
            return { valid: false, error: 'Path is not a file' };
        }
        // Images are allowed to be larger than text files
        if (stats.size > maxImageSize) {
            return {
                valid: false,
                error: `Image file size (${Math.round(stats.size / 1024 / 1024)}MB) exceeds maximum (50MB)`
            };
        }
        if (stats.size === 0) {
            return { valid: false, error: 'Image file is empty' };
        }
        // Read just the header; always release the handle, even if the read fails.
        const headerBuffer = Buffer.alloc(headerLength);
        const handle = await fs.open(filePath, 'r');
        let bytesRead = 0;
        try {
            ({ bytesRead } = await handle.read(headerBuffer, 0, headerLength, 0));
        }
        finally {
            await handle.close();
        }
        // Trim to what was actually read so bytes past the end of a short file
        // compare as undefined (a mismatch) instead of as zero padding.
        const header = headerBuffer.subarray(0, bytesRead);
        const ext = extname(filePath).toLowerCase();
        for (const check of signatureChecks[ext] ?? []) {
            const matches = check.bytes.every((byte, i) => header[check.offset + i] === byte);
            if (!matches) {
                return { valid: false, error: check.error };
            }
        }
        return { valid: true };
    }
    catch (error) {
        return {
            valid: false,
            error: `Failed to validate image file: ${error instanceof Error ? error.message : String(error)}`
        };
    }
}
165
/**
 * Walk a directory and sort its supported files into accepted and skipped lists.
 *
 * Subdirectories are descended into only when options.recursive is truthy.
 * Entries that are neither directories nor regular files, and files with an
 * unsupported extension, are ignored without being reported. Failures are
 * recorded in `skipped` rather than thrown.
 *
 * @returns {Promise<{files: string[], skipped: Array<{path: string, reason: string}>}>}
 */
async function discoverFilesRecursive(dirPath, options) {
    const files = [];
    const skipped = [];
    const describe = (error) => (error instanceof Error ? error.message : String(error));
    try {
        const entries = await fs.readdir(dirPath, { withFileTypes: true });
        for (const entry of entries) {
            const fullPath = join(dirPath, entry.name);
            if (entry.isDirectory()) {
                if (options.recursive) {
                    // Merge the subdirectory's findings into this level's lists
                    const nested = await discoverFilesRecursive(fullPath, options);
                    files.push(...nested.files);
                    skipped.push(...nested.skipped);
                }
                continue;
            }
            if (!entry.isFile() || !isSupportedFile(fullPath)) {
                continue;
            }
            try {
                const stats = await fs.stat(fullPath);
                const contentType = getContentType(fullPath);
                const isImage = contentType === 'image';
                // Mode defaults to 'text', in which images are rejected
                const mode = options.mode || 'text';
                // Images get a fixed 50MB cap; text files use options.maxFileSize or 10MB
                const maxSize = isImage
                    ? 50 * 1024 * 1024
                    : (options.maxFileSize || 10 * 1024 * 1024);
                let skipReason = null;
                if (isImage && mode === 'text') {
                    skipReason = `Image files not supported in text mode. Use --mode multimodal for image processing.`;
                }
                else if (stats.size > maxSize) {
                    skipReason = `File size (${Math.round(stats.size / 1024 / 1024)}MB) exceeds maximum (${Math.round(maxSize / 1024 / 1024)}MB) for ${contentType} files`;
                }
                else if (isImage) {
                    // Images must additionally pass header validation
                    const validation = await validateImageFile(fullPath);
                    if (!validation.valid) {
                        skipReason = validation.error || 'Invalid image file';
                    }
                }
                if (skipReason === null) {
                    files.push(fullPath);
                }
                else {
                    skipped.push({ path: fullPath, reason: skipReason });
                }
            }
            catch (error) {
                skipped.push({
                    path: fullPath,
                    reason: `Failed to validate file: ${describe(error)}`
                });
            }
        }
    }
    catch (error) {
        skipped.push({
            path: dirPath,
            reason: `Failed to read directory: ${describe(error)}`
        });
    }
    return { files, skipped };
}
242
/**
 * Find the files to ingest under a path.
 *
 * A regular file is checked on its own (extension, mode compatibility, size,
 * and image header when applicable); a directory is handed to
 * discoverFilesRecursive. Problems are reported as entries in `skipped`
 * instead of being thrown.
 *
 * @returns {Promise<{files: string[], skipped: Array<{path: string, reason: string}>}>}
 */
export async function discoverFiles(path, options = DEFAULT_FILE_PROCESSOR_OPTIONS) {
    const resolvedPath = resolve(path);
    // Result shape for "nothing accepted, this one path skipped"
    const skipOnly = (reason) => ({
        files: [],
        skipped: [{
                path: resolvedPath,
                reason
            }]
    });
    try {
        const stats = await fs.stat(resolvedPath);
        if (!stats.isFile()) {
            if (stats.isDirectory()) {
                return await discoverFilesRecursive(resolvedPath, options);
            }
            return skipOnly('Path is neither a file nor a directory');
        }
        // From here on: single file processing
        if (!isSupportedFile(resolvedPath)) {
            return skipOnly(`Unsupported file extension. Supported text: ${SUPPORTED_TEXT_EXTENSIONS.join(', ')}, images: ${SUPPORTED_IMAGE_EXTENSIONS.join(', ')}`);
        }
        const contentType = getContentType(resolvedPath);
        const isImage = contentType === 'image';
        // Mode defaults to 'text', in which images are rejected
        const mode = options.mode || 'text';
        if (isImage && mode === 'text') {
            return skipOnly(`Image files not supported in text mode. Use --mode multimodal for image processing.`);
        }
        // Images get a fixed 50MB cap; text files use options.maxFileSize or 10MB
        const maxSize = isImage
            ? 50 * 1024 * 1024
            : (options.maxFileSize || 10 * 1024 * 1024);
        if (stats.size > maxSize) {
            return skipOnly(`File size (${Math.round(stats.size / 1024 / 1024)}MB) exceeds maximum (${Math.round(maxSize / 1024 / 1024)}MB) for ${contentType} files`);
        }
        if (isImage) {
            // Images must additionally pass header validation
            const validation = await validateImageFile(resolvedPath);
            if (!validation.valid) {
                return skipOnly(validation.error || 'Invalid image file');
            }
        }
        return {
            files: [resolvedPath],
            skipped: []
        };
    }
    catch (error) {
        return skipOnly(`Failed to access path: ${error instanceof Error ? error.message : String(error)}`);
    }
}
328
/**
 * Read a PDF from disk and return the `text` field of the pdfParse result.
 * Read and parse failures propagate to the caller.
 */
async function extractPdfContent(filePath) {
    const pdfBytes = await fs.readFile(filePath);
    const parsed = await pdfParse(pdfBytes);
    return parsed.text;
}
336
/**
 * Read a DOCX file from disk and return the `value` field of mammoth's
 * raw-text extraction result. Read and extraction failures propagate to the caller.
 */
async function extractDocxContent(filePath) {
    const docxBytes = await fs.readFile(filePath);
    const extraction = await mammoth.extractRawText({ buffer: docxBytes });
    return extraction.value;
}
344
/**
 * Extract a document title from its content.
 *
 * Returns the text of the first non-empty markdown H1 (`# Title`) found
 * outside fenced code blocks; when there is none, falls back to the file name
 * without its extension.
 *
 * Lines inside ``` or ~~~ fences are skipped so that code comments such as
 * `# install dependencies` in a shell snippet are not mistaken for the title.
 *
 * @param {string} content - Full document text.
 * @param {string} filePath - Path used for the file-name fallback.
 * @returns {string} The detected title.
 */
function extractTitle(content, filePath) {
    // Marker (e.g. "```" or "~~~~") of the fence currently open, or null outside code
    let openFence = null;
    for (const line of content.split('\n')) {
        const trimmed = line.trim();
        const fenceMatch = /^(`{3,}|~{3,})/.exec(trimmed);
        if (fenceMatch) {
            const marker = fenceMatch[1];
            const rest = trimmed.slice(marker.length);
            if (openFence === null) {
                // A backtick fence may not have backticks in its info string;
                // such a line is inline code, not a fence opener.
                if (marker[0] !== '`' || !rest.includes('`')) {
                    openFence = marker;
                    continue;
                }
            }
            else if (rest === '' && marker[0] === openFence[0] && marker.length >= openFence.length) {
                // A closer uses the same character, is at least as long, and carries no info string
                openFence = null;
                continue;
            }
        }
        if (openFence !== null) {
            continue;
        }
        if (trimmed.startsWith('# ')) {
            const title = trimmed.substring(2).trim();
            if (title) {
                return title;
            }
        }
    }
    // Fallback to filename without extension
    const filename = basename(filePath);
    const ext = extname(filename);
    return ext ? filename.slice(0, -ext.length) : filename;
}
365
/**
 * Module-level cache for the image-to-text pipeline: the loaded pipeline, and
 * the promise of a load that is still in flight.
 */
let imageToTextPipeline = null;
let imageToTextPipelinePromise = null;
/**
 * Load the image-to-text pipeline once and share it between callers.
 *
 * Concurrent callers receive the same in-flight promise. A failed load clears
 * that promise so a later call can retry. Note that an already cached pipeline
 * (or in-flight load) is returned as-is, without comparing modelName against
 * the model it was created for.
 */
async function initializeImageToTextPipeline(modelName = 'Xenova/vit-gpt2-image-captioning') {
    if (imageToTextPipeline) {
        return imageToTextPipeline;
    }
    if (!imageToTextPipelinePromise) {
        const loadPipeline = async () => {
            try {
                const transformers = await import('@huggingface/transformers');
                console.log(`Loading image-to-text model: ${modelName}`);
                imageToTextPipeline = await transformers.pipeline('image-to-text', modelName);
                console.log(`Successfully loaded image-to-text model: ${modelName}`);
                return imageToTextPipeline;
            }
            catch (error) {
                console.error(`Failed to load image-to-text model ${modelName}:`, error);
                // Forget the failed attempt so the next call starts a fresh load
                imageToTextPipelinePromise = null;
                const detail = error instanceof Error ? error.message : String(error);
                throw new Error(`Failed to initialize image-to-text pipeline: ${detail}`);
            }
        };
        imageToTextPipelinePromise = loadPipeline();
    }
    return imageToTextPipelinePromise;
}
399
/**
 * Parse PNG image dimensions from file buffer.
 *
 * Validates the 8-byte PNG signature and that the first chunk is IHDR before
 * reading the big-endian width/height at bytes 16-23, so a truncated or
 * corrupted file with a valid signature does not produce garbage dimensions.
 *
 * @param {Buffer} buffer - Raw file contents.
 * @returns {{width: number, height: number} | null} Dimensions, or null when
 *   the buffer is too short, is not a PNG, or cannot be read.
 */
function parsePngDimensions(buffer) {
    try {
        // Signature (8) + chunk length (4) + chunk type (4) + width (4) + height (4)
        if (buffer.length < 24) {
            return null;
        }
        // PNG signature: 89 50 4E 47 0D 0A 1A 0A
        const pngSignature = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
        if (!pngSignature.every((byte, index) => buffer[index] === byte)) {
            return null;
        }
        // The first chunk must be IHDR ("IHDR" = 0x49484452 at bytes 12-15)
        if (buffer.readUInt32BE(12) !== 0x49484452) {
            return null;
        }
        const width = buffer.readUInt32BE(16);
        const height = buffer.readUInt32BE(20);
        return { width, height };
    }
    catch {
        // Non-buffer input or an out-of-range read: treat as "unknown dimensions"
        return null;
    }
}
422
/**
 * Parse JPEG image dimensions from file buffer.
 *
 * Walks the marker segments after SOI until a Start-Of-Frame marker is found
 * and reads the 16-bit big-endian height/width stored in it.
 *
 * Handles details of the marker stream that a naive walk gets wrong:
 *  - every SOF variant (0xC0-0xCF except DHT 0xC4, JPG 0xC8 and DAC 0xCC)
 *    carries the frame size, not only SOF0/SOF2;
 *  - runs of 0xFF are fill bytes that precede the real marker;
 *  - TEM, RSTn, SOI and EOI (and the stuffed 0xFF00 pair) have no length
 *    field, so they must not be skipped as length-prefixed segments.
 *
 * @param {Buffer} buffer - Raw file contents.
 * @returns {{width: number, height: number} | null} Dimensions, or null when
 *   the buffer is not a JPEG or no frame header is found.
 */
function parseJpegDimensions(buffer) {
    try {
        if (buffer.length < 4) {
            return null;
        }
        // Check JPEG signature (SOI marker)
        if (buffer[0] !== 0xFF || buffer[1] !== 0xD8) {
            return null;
        }
        let offset = 2;
        // A frame header needs bytes offset..offset+8 (marker, length, precision, height, width)
        while (offset + 9 <= buffer.length) {
            if (buffer[offset] !== 0xFF) {
                offset++;
                continue;
            }
            const marker = buffer[offset + 1];
            if (marker === 0xFF) {
                // Fill byte: the real marker code follows
                offset++;
                continue;
            }
            const isStartOfFrame = marker >= 0xC0 && marker <= 0xCF
                && marker !== 0xC4 && marker !== 0xC8 && marker !== 0xCC;
            if (isStartOfFrame) {
                const height = buffer.readUInt16BE(offset + 5);
                const width = buffer.readUInt16BE(offset + 7);
                return { width, height };
            }
            const hasNoLengthField = marker === 0x00 || marker === 0x01
                || (marker >= 0xD0 && marker <= 0xD9);
            if (hasNoLengthField) {
                offset += 2;
                continue;
            }
            // Skip to next marker: the length field counts itself but not the marker bytes
            const segmentLength = buffer.readUInt16BE(offset + 2);
            offset += 2 + segmentLength;
        }
        return null;
    }
    catch {
        // Non-buffer input or an out-of-range read: treat as "unknown dimensions"
        return null;
    }
}
457
/**
 * Read the logical screen size from a GIF header.
 *
 * @param {Buffer} buffer - Raw file contents.
 * @returns {{width: number, height: number} | null} Dimensions, or null when
 *   the buffer is shorter than 10 bytes, does not start with "GIF", or
 *   cannot be read.
 */
function parseGifDimensions(buffer) {
    try {
        if (buffer.length < 10) {
            return null;
        }
        // First three bytes must spell "GIF"
        const magic = [0x47, 0x49, 0x46];
        const hasMagic = magic.every((byte, position) => buffer[position] === byte);
        if (!hasMagic) {
            return null;
        }
        // Width at bytes 6-7 and height at bytes 8-9, both little endian
        return {
            width: buffer.readUInt16LE(6),
            height: buffer.readUInt16LE(8)
        };
    }
    catch (error) {
        return null;
    }
}
479
/**
 * Parse WebP image dimensions from file buffer.
 *
 * Supports all three WebP bitstream layouts found after the RIFF/WEBP header:
 *  - "VP8 " (lossy): 14-bit width/height at bytes 26 and 28 (little endian);
 *  - "VP8L" (lossless): 14-bit (width-1) and (height-1) packed from byte 21;
 *  - "VP8X" (extended, used for alpha/animation/metadata): 24-bit
 *    (canvas width-1) at bytes 24-26 and (canvas height-1) at bytes 27-29.
 *
 * @param {Buffer} buffer - Raw file contents.
 * @returns {{width: number, height: number} | null} Dimensions, or null when
 *   the buffer is shorter than 30 bytes, is not a WebP, or uses an unknown
 *   chunk type.
 */
function parseWebpDimensions(buffer) {
    try {
        if (buffer.length < 30) {
            return null;
        }
        // Check WebP signature
        if (buffer.readUInt32BE(0) !== 0x52494646) {
            return null; // "RIFF"
        }
        if (buffer.readUInt32BE(8) !== 0x57454250) {
            return null; // "WEBP"
        }
        const chunkType = buffer.readUInt32BE(12);
        // VP8 format
        if (chunkType === 0x56503820) { // "VP8 "
            const width = buffer.readUInt16LE(26) & 0x3FFF;
            const height = buffer.readUInt16LE(28) & 0x3FFF;
            return { width, height };
        }
        // VP8L format
        if (chunkType === 0x5650384C) { // "VP8L"
            const bits = buffer.readUInt32LE(21);
            const width = (bits & 0x3FFF) + 1;
            const height = ((bits >>> 14) & 0x3FFF) + 1;
            return { width, height };
        }
        // VP8X (extended) format: canvas size is stored minus one in 24 bits each
        if (chunkType === 0x56503858) { // "VP8X"
            const width = buffer.readUIntLE(24, 3) + 1;
            const height = buffer.readUIntLE(27, 3) + 1;
            return { width, height };
        }
        return null;
    }
    catch {
        // Non-buffer input or an out-of-range read: treat as "unknown dimensions"
        return null;
    }
}
510
/**
 * Read width and height from a BMP header.
 *
 * @param {Buffer} buffer - Raw file contents.
 * @returns {{width: number, height: number} | null} Dimensions, or null when
 *   the buffer is shorter than 26 bytes, does not start with "BM", or cannot
 *   be read.
 */
function parseBmpDimensions(buffer) {
    try {
        if (buffer.length < 26) {
            return null;
        }
        // Magic bytes "BM"
        const startsWithBm = buffer[0] === 0x42 && buffer[1] === 0x4D;
        if (!startsWithBm) {
            return null;
        }
        // Signed 32-bit little-endian values at bytes 18-21 (width) and 22-25 (height)
        const rawWidth = buffer.readInt32LE(18);
        const rawHeight = buffer.readInt32LE(22);
        // The stored height can be negative, so only its magnitude is reported
        return { width: rawWidth, height: Math.abs(rawHeight) };
    }
    catch (error) {
        return null;
    }
}
529
/**
 * Dispatch to the dimension parser that matches an image format name.
 *
 * @param {Buffer} buffer - Raw file contents, passed through to the parser.
 * @param {string} format - Format name, matched case-insensitively against
 *   png, jpg, jpeg, gif, webp and bmp.
 * @returns {{width: number, height: number} | null} The parser's result, or
 *   null for any other format name.
 */
function extractImageDimensions(buffer, format) {
    const kind = format.toLowerCase();
    if (kind === 'png') {
        return parsePngDimensions(buffer);
    }
    if (kind === 'jpg' || kind === 'jpeg') {
        return parseJpegDimensions(buffer);
    }
    if (kind === 'gif') {
        return parseGifDimensions(buffer);
    }
    if (kind === 'webp') {
        return parseWebpDimensions(buffer);
    }
    if (kind === 'bmp') {
        return parseBmpDimensions(buffer);
    }
    return null;
}
549
/**
 * Collect file-level metadata for an image.
 *
 * Stats the file, reads it into memory and asks extractImageDimensions for
 * its size, using the lower-cased file extension (without the dot) as the
 * format name.
 *
 * @param {string} imagePath - Path of the image file.
 * @returns {Promise<object>} Object with originalPath, dimensions
 *   ({width: 0, height: 0} when they could not be parsed), fileSize, format
 *   and createdAt (birthtime, falling back to mtime).
 * @throws {Error} Wrapping any failure, prefixed with the image path.
 */
async function extractImageMetadata(imagePath) {
    try {
        const fileStats = await fs.stat(imagePath);
        const extension = extname(imagePath).toLowerCase().substring(1);
        const fileBytes = await fs.readFile(imagePath);
        const parsedSize = extractImageDimensions(fileBytes, extension);
        return {
            originalPath: imagePath,
            dimensions: parsedSize || { width: 0, height: 0 },
            fileSize: fileStats.size,
            format: extension,
            createdAt: fileStats.birthtime || fileStats.mtime
        };
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        throw new Error(`Failed to extract metadata for image ${imagePath}: ${reason}`);
    }
}
573
/**
 * Caption a single image with the image-to-text pipeline.
 *
 * @param {string} imagePath - Image location handed to RawImage.fromURL.
 * @param {object} [options] - Uses `model`, `maxLength` (50 when falsy) and
 *   `includeConfidence`.
 * @returns {Promise<{description: string, confidence: (number|undefined), model: string}>}
 *   Trimmed caption, the score only when `includeConfidence` is truthy, and
 *   the model name (the default model when `options.model` is falsy).
 * @throws {Error} Wrapping any failure (including an empty caption),
 *   prefixed with the image path.
 */
async function generateImageDescription(imagePath, options = DEFAULT_IMAGE_TO_TEXT_OPTIONS) {
    try {
        const captioner = await initializeImageToTextPipeline(options.model);
        // RawImage.fromURL also accepts local file paths
        const transformers = await import('@huggingface/transformers');
        const image = await transformers.RawImage.fromURL(imagePath);
        const generationOptions = {
            max_length: options.maxLength || 50,
            num_beams: 4,
            early_stopping: true
        };
        const output = await captioner(image, generationOptions);
        // The pipeline may return either a single candidate or a list of them
        const firstCandidate = Array.isArray(output) ? output[0] : output;
        const description = firstCandidate?.generated_text;
        const confidence = firstCandidate?.score;
        if (!description) {
            throw new Error('No description generated for image');
        }
        return {
            description: description.trim(),
            confidence: options.includeConfidence ? confidence : undefined,
            model: options.model || DEFAULT_IMAGE_TO_TEXT_OPTIONS.model
        };
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        throw new Error(`Failed to generate description for image ${imagePath}: ${reason}`);
    }
}
606
/**
 * Generate text descriptions for multiple images in batches.
 *
 * Images within one batch are described in parallel; batches run one after
 * another. A failure for one image is reported in that image's entry and
 * does not abort the others.
 *
 * The batch size is sanitised before use: a missing, non-numeric, NaN or
 * non-positive value falls back to DEFAULT_IMAGE_TO_TEXT_OPTIONS.batchSize,
 * and to 1 if that is unusable too. Without this, a negative size loops
 * forever and an undefined/NaN size silently returns no results.
 *
 * @param {string[]} imagePaths - Images to describe.
 * @param {object} [options] - Passed to generateImageDescription; `batchSize`
 *   controls how many images are processed concurrently.
 * @returns {Promise<Array<{path: string, result?: object, error?: string}>>}
 *   One entry per input path, in input order.
 */
async function generateImageDescriptionsBatch(imagePaths, options = DEFAULT_IMAGE_TO_TEXT_OPTIONS) {
    const toUsableBatchSize = (value) => (typeof value === 'number' && Number.isFinite(value) && value >= 1
        ? Math.floor(value)
        : null);
    const batchSize = toUsableBatchSize(options.batchSize)
        ?? toUsableBatchSize(DEFAULT_IMAGE_TO_TEXT_OPTIONS.batchSize)
        ?? 1;
    const results = [];
    const totalBatches = Math.ceil(imagePaths.length / batchSize);
    // Process images in batches
    for (let i = 0; i < imagePaths.length; i += batchSize) {
        const batch = imagePaths.slice(i, i + batchSize);
        console.log(`Processing image batch ${Math.floor(i / batchSize) + 1}/${totalBatches} (${batch.length} images)`);
        // Process batch in parallel; each image captures its own failure
        const batchPromises = batch.map(async (imagePath) => {
            try {
                const result = await generateImageDescription(imagePath, options);
                return { path: imagePath, result };
            }
            catch (error) {
                return {
                    path: imagePath,
                    error: error instanceof Error ? error.message : String(error)
                };
            }
        });
        const batchResults = await Promise.all(batchPromises);
        results.push(...batchResults);
    }
    return results;
}
634
/**
 * Turn an image file into a document, degrading gracefully on failure.
 *
 * Three tiers are attempted in order:
 *  1. metadata + generated description;
 *  2. metadata only (the first failure is recorded as `processingError`);
 *  3. file name and path only (both failures are recorded).
 *
 * @param {string} filePath - Path of the image file.
 * @param {object} pathManager - Provides `toStoragePath(filePath)` for the document source.
 * @param {object} [options] - Image-to-text options forwarded to generateImageDescription.
 * @returns {Promise<{source: *, title: string, content: string, metadata: object}>}
 *   The document built by the first tier that succeeds.
 */
async function processImageFile(filePath, pathManager, options = DEFAULT_IMAGE_TO_TEXT_OPTIONS) {
    const messageOf = (failure) => (failure instanceof Error ? failure.message : String(failure));
    try {
        // Tier 1: metadata first, then the generated description
        const imageMetadata = await extractImageMetadata(filePath);
        const described = await generateImageDescription(filePath, options);
        imageMetadata.description = described.description;
        imageMetadata.descriptionModel = described.model;
        imageMetadata.descriptionConfidence = described.confidence;
        // Images have no text to scan, so the title comes from the file name
        const title = extractTitle('', filePath);
        const content = [
            `Image: ${title}`,
            `Description: ${described.description}`,
            `Dimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}`,
            `Format: ${imageMetadata.format}`
        ].join('\n');
        return {
            source: pathManager.toStoragePath(filePath),
            title,
            content: content.trim(),
            metadata: {
                contentType: 'image',
                ...imageMetadata
            }
        };
    }
    catch (error) {
        console.warn(`Failed to fully process image ${filePath}, attempting basic metadata extraction: ${messageOf(error)}`);
        try {
            // Tier 2: metadata without a description
            const imageMetadata = await extractImageMetadata(filePath);
            const title = extractTitle('', filePath);
            const content = [
                `Image: ${title}`,
                `Dimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}`,
                `Format: ${imageMetadata.format}`
            ].join('\n');
            return {
                source: pathManager.toStoragePath(filePath),
                title,
                content: content.trim(),
                metadata: {
                    contentType: 'image',
                    ...imageMetadata,
                    processingError: messageOf(error)
                }
            };
        }
        catch (metadataError) {
            // Tier 3: nothing but the name and path
            console.warn(`Failed to extract any metadata for image ${filePath}, using minimal fallback: ${messageOf(metadataError)}`);
            const title = extractTitle('', filePath);
            const content = [`Image: ${title}`, `Path: ${filePath}`].join('\n');
            return {
                source: pathManager.toStoragePath(filePath),
                title,
                content: content.trim(),
                metadata: {
                    contentType: 'image',
                    originalPath: filePath,
                    processingError: messageOf(error),
                    metadataError: messageOf(metadataError)
                }
            };
        }
    }
}
699
/**
 * Convert one file into a document, running the work through safeExecute.
 *
 * Images are delegated to processImageFile. Everything else is read as text
 * (.pdf and .docx through their extractors, any other extension as UTF-8),
 * preprocessed, and given a title.
 *
 * @param {string} filePath - File to process.
 * @param {object} pathManager - Provides `toStoragePath(filePath)` for the document source.
 * @param {object} [imageToTextOptions] - Forwarded to processImageFile for images.
 * @returns {Promise<object>} The resulting document.
 * @throws {Error} When safeExecute yields a falsy result.
 */
async function processFile(filePath, pathManager, imageToTextOptions) {
    const buildDocument = async () => {
        const kind = getContentType(filePath);
        if (kind === 'image') {
            return await processImageFile(filePath, pathManager, imageToTextOptions);
        }
        // Pick the reader from the extension; unknown extensions are read as plain text
        const extension = extname(filePath).toLowerCase();
        let rawText;
        if (extension === '.pdf') {
            rawText = await extractPdfContent(filePath);
        }
        else if (extension === '.docx') {
            rawText = await extractDocxContent(filePath);
        }
        else {
            rawText = await fs.readFile(filePath, 'utf-8');
        }
        if (!rawText.trim()) {
            throw new Error('File is empty or contains only whitespace');
        }
        const processedText = preprocessDocument(rawText, filePath, config.preprocessing);
        if (!processedText.trim()) {
            throw new Error('File contains no content after preprocessing');
        }
        const title = extractTitle(processedText, filePath);
        return {
            source: pathManager.toStoragePath(filePath),
            title,
            content: processedText.trim(),
            metadata: {
                contentType: 'text'
            }
        };
    };
    const document = await safeExecute(buildDocument, `File Processing: ${filePath}`, {
        category: ErrorCategory.FILE_SYSTEM,
        severity: ErrorSeverity.ERROR
    });
    if (document) {
        return document;
    }
    throw new Error(`Failed to process file: ${filePath}`);
}
755
/**
 * Convert a list of files into documents, collecting per-file failures
 * instead of throwing.
 *
 * Text files are processed one at a time through processFile. Image files
 * are described in batches and then combined with their metadata; if the
 * batch step itself throws, every image is retried individually through
 * processFile.
 *
 * @param {string[]} filePaths - Files to process.
 * @param {object} pathManager - Provides `toStoragePath(path)` for document sources.
 * @param {object} [imageToTextOptions] - Forwarded to the image description helpers.
 * @returns {Promise<{documents: object[], errors: Array<{path: string, error: string}>}>}
 *   Documents for text files first, then images, plus the recorded failures.
 */
export async function processFiles(filePaths, pathManager, imageToTextOptions) {
    const messageOf = (failure) => (failure instanceof Error ? failure.message : String(failure));
    const result = {
        documents: [],
        errors: []
    };
    // Runs one file through processFile and records the outcome
    const processIndividually = async (filePath) => {
        try {
            const document = await processFile(filePath, pathManager, imageToTextOptions);
            result.documents.push(document);
        }
        catch (error) {
            result.errors.push({ path: filePath, error: messageOf(error) });
        }
    };
    const imageFiles = filePaths.filter((candidate) => getContentType(candidate) === 'image');
    const textFiles = filePaths.filter((candidate) => getContentType(candidate) === 'text');
    // Text files: sequential
    for (const filePath of textFiles) {
        await processIndividually(filePath);
    }
    if (imageFiles.length === 0) {
        return result;
    }
    // Image files: batched description generation
    console.log(`Processing ${imageFiles.length} image files with optimized batch processing`);
    try {
        const batchResults = await generateImageDescriptionsBatch(imageFiles, imageToTextOptions);
        for (const batchResult of batchResults) {
            try {
                const imageMetadata = await extractImageMetadata(batchResult.path);
                if (batchResult.result) {
                    // Description succeeded: fold it into the metadata and the content
                    imageMetadata.description = batchResult.result.description;
                    imageMetadata.descriptionModel = batchResult.result.model;
                    imageMetadata.descriptionConfidence = batchResult.result.confidence;
                    const title = extractTitle('', batchResult.path);
                    const content = [
                        `Image: ${title}`,
                        `Description: ${batchResult.result.description}`,
                        `Dimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}`,
                        `Format: ${imageMetadata.format}`
                    ].join('\n');
                    result.documents.push({
                        source: pathManager.toStoragePath(batchResult.path),
                        title,
                        content: content.trim(),
                        metadata: {
                            contentType: 'image',
                            ...imageMetadata
                        }
                    });
                    continue;
                }
                // Description failed: keep the image with metadata only and record why
                const title = extractTitle('', batchResult.path);
                const content = [
                    `Image: ${title}`,
                    `Dimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}`,
                    `Format: ${imageMetadata.format}`
                ].join('\n');
                result.documents.push({
                    source: pathManager.toStoragePath(batchResult.path),
                    title,
                    content: content.trim(),
                    metadata: {
                        contentType: 'image',
                        ...imageMetadata,
                        processingError: batchResult.error
                    }
                });
            }
            catch (error) {
                result.errors.push({ path: batchResult.path, error: messageOf(error) });
            }
        }
    }
    catch (error) {
        // The batch step itself failed: retry every image on its own
        console.warn(`Batch processing failed, falling back to individual processing: ${messageOf(error)}`);
        for (const filePath of imageFiles) {
            await processIndividually(filePath);
        }
    }
    return result;
}
851
/**
 * Run discovery and processing for a path in one call, logging progress.
 *
 * @param {string} path - File or directory to ingest.
 * @param {object} [options] - Discovery options passed to discoverFiles.
 * @param {object} [pathManager] - Storage path manager; when omitted a
 *   DocumentPathManager is created from config.path_storage_strategy and the
 *   resolved input path.
 * @param {object} [imageToTextOptions] - Forwarded to processFiles.
 * @returns {Promise<{documents: object[], discoveryResult: object, processingResult: object}>}
 *   The processed documents together with both intermediate results.
 */
export async function discoverAndProcessFiles(path, options = DEFAULT_FILE_PROCESSOR_OPTIONS, pathManager, imageToTextOptions) {
    console.log(`Discovering files in: ${path}`);
    const discoveryResult = await discoverFiles(path, options);
    // Report what discovery skipped
    if (discoveryResult.skipped.length > 0) {
        console.log(`Skipped ${discoveryResult.skipped.length} files:`);
        discoveryResult.skipped.forEach((entry) => {
            console.error(`  - ${entry.path}: ${entry.reason}`);
        });
    }
    console.log(`Found ${discoveryResult.files.length} supported files`);
    // Break the count down by content type when images are present
    const imageFiles = discoveryResult.files.filter((file) => getContentType(file) === 'image');
    const textFiles = discoveryResult.files.filter((file) => getContentType(file) === 'text');
    if (imageFiles.length > 0) {
        console.log(`  - ${textFiles.length} text files`);
        console.log(`  - ${imageFiles.length} image files`);
        const modelMessage = imageToTextOptions?.model
            ? `Using image-to-text model: ${imageToTextOptions.model}`
            : `Using default image-to-text model: ${DEFAULT_IMAGE_TO_TEXT_OPTIONS.model}`;
        console.log(modelMessage);
    }
    const effectivePathManager = pathManager || new DocumentPathManager(config.path_storage_strategy, resolve(path));
    const processingResult = await processFiles(discoveryResult.files, effectivePathManager, imageToTextOptions);
    // Report what processing could not handle
    if (processingResult.errors.length > 0) {
        console.log(`Failed to process ${processingResult.errors.length} files:`);
        processingResult.errors.forEach((failure) => {
            console.error(`  - ${failure.path}: ${failure.error}`);
        });
    }
    console.log(`Successfully processed ${processingResult.documents.length} documents`);
    const { documents } = processingResult;
    return { documents, discoveryResult, processingResult };
}
898
/**
 * Clean up image processing resources.
 * Call this when shutting down the application to free memory.
 *
 * The cached pipeline references are cleared before disposal is attempted,
 * so a failing `dispose()` can no longer leave a half-disposed pipeline in
 * the cache for later callers to pick up. Disposal errors are logged as
 * warnings and never thrown. Does nothing when no pipeline is cached.
 *
 * @returns {Promise<void>}
 */
export async function cleanupImageProcessingResources() {
    const pipelineToDispose = imageToTextPipeline;
    if (!pipelineToDispose) {
        return;
    }
    // Detach first: whatever happens below, the cache must not keep this instance
    imageToTextPipeline = null;
    imageToTextPipelinePromise = null;
    try {
        // Dispose of the pipeline if it has a dispose method
        if (typeof pipelineToDispose.dispose === 'function') {
            await pipelineToDispose.dispose();
        }
        console.log('Image-to-text pipeline cleaned up');
    }
    catch (error) {
        console.warn('Error cleaning up image-to-text pipeline:', error);
    }
}
919
/**
 * Legacy alias kept for backward compatibility; forwards to
 * cleanupImageProcessingResources().
 * @deprecated Use cleanupImageProcessingResources() instead
 */
export async function cleanupImageToTextPipeline() {
    const outcome = await cleanupImageProcessingResources();
    return outcome;
}
926
/**
 * Public wrapper around generateImageDescription for one image.
 *
 * @param {string} imagePath - Image to describe.
 * @param {object} [options] - Overrides layered on top of DEFAULT_IMAGE_TO_TEXT_OPTIONS.
 * @returns {Promise<object>} Whatever generateImageDescription resolves to.
 */
export async function generateImageDescriptionForFile(imagePath, options) {
    const mergedOptions = { ...DEFAULT_IMAGE_TO_TEXT_OPTIONS, ...options };
    return generateImageDescription(imagePath, mergedOptions);
}
932
/**
 * Public wrapper around generateImageDescriptionsBatch for several images.
 *
 * @param {string[]} imagePaths - Images to describe.
 * @param {object} [options] - Overrides layered on top of DEFAULT_IMAGE_TO_TEXT_OPTIONS.
 * @returns {Promise<Array<object>>} Whatever generateImageDescriptionsBatch resolves to.
 */
export async function generateImageDescriptionsForFiles(imagePaths, options) {
    const mergedOptions = { ...DEFAULT_IMAGE_TO_TEXT_OPTIONS, ...options };
    return generateImageDescriptionsBatch(imagePaths, mergedOptions);
}
938
/**
 * Public wrapper around extractImageMetadata for one image.
 *
 * @param {string} imagePath - Image whose metadata is wanted.
 * @returns {Promise<object>} Whatever extractImageMetadata resolves to.
 */
export async function extractImageMetadataForFile(imagePath) {
    const metadata = await extractImageMetadata(imagePath);
    return metadata;
}
944
/**
 * Extract metadata for several images, one after another.
 *
 * A failure for one image is captured in its own entry and does not stop
 * the remaining images.
 *
 * @param {string[]} imagePaths - Images whose metadata is wanted.
 * @returns {Promise<Array<{path: string, metadata?: object, error?: string}>>}
 *   One entry per input path, in input order.
 */
export async function extractImageMetadataForFiles(imagePaths) {
    const outcomes = [];
    for (const imagePath of imagePaths) {
        let entry;
        try {
            entry = { path: imagePath, metadata: await extractImageMetadata(imagePath) };
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            entry = { path: imagePath, error: reason };
        }
        outcomes.push(entry);
    }
    return outcomes;
}
963
+ //# sourceMappingURL=file-processor.js.map