ocr-provenance-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocr-provenance-mcp might be problematic. Click here for more details.

Files changed (578)
  1. package/.env.example +55 -0
  2. package/LICENSE +78 -0
  3. package/README.md +1154 -0
  4. package/dist/bin-http.d.ts +24 -0
  5. package/dist/bin-http.d.ts.map +1 -0
  6. package/dist/bin-http.js +275 -0
  7. package/dist/bin-http.js.map +1 -0
  8. package/dist/bin-setup.d.ts +11 -0
  9. package/dist/bin-setup.d.ts.map +1 -0
  10. package/dist/bin-setup.js +610 -0
  11. package/dist/bin-setup.js.map +1 -0
  12. package/dist/bin.d.ts +16 -0
  13. package/dist/bin.d.ts.map +1 -0
  14. package/dist/bin.js +16 -0
  15. package/dist/bin.js.map +1 -0
  16. package/dist/index.d.ts +13 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +90 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/models/chunk.d.ts +136 -0
  21. package/dist/models/chunk.d.ts.map +1 -0
  22. package/dist/models/chunk.js +27 -0
  23. package/dist/models/chunk.js.map +1 -0
  24. package/dist/models/cluster.d.ts +79 -0
  25. package/dist/models/cluster.d.ts.map +1 -0
  26. package/dist/models/cluster.js +10 -0
  27. package/dist/models/cluster.js.map +1 -0
  28. package/dist/models/comparison.d.ts +62 -0
  29. package/dist/models/comparison.d.ts.map +1 -0
  30. package/dist/models/comparison.js +8 -0
  31. package/dist/models/comparison.js.map +1 -0
  32. package/dist/models/document.d.ts +104 -0
  33. package/dist/models/document.d.ts.map +1 -0
  34. package/dist/models/document.js +15 -0
  35. package/dist/models/document.js.map +1 -0
  36. package/dist/models/embedding.d.ts +87 -0
  37. package/dist/models/embedding.d.ts.map +1 -0
  38. package/dist/models/embedding.js +23 -0
  39. package/dist/models/embedding.js.map +1 -0
  40. package/dist/models/extraction.d.ts +15 -0
  41. package/dist/models/extraction.d.ts.map +1 -0
  42. package/dist/models/extraction.js +2 -0
  43. package/dist/models/extraction.js.map +1 -0
  44. package/dist/models/form-fill.d.ts +23 -0
  45. package/dist/models/form-fill.d.ts.map +1 -0
  46. package/dist/models/form-fill.js +2 -0
  47. package/dist/models/form-fill.js.map +1 -0
  48. package/dist/models/image.d.ts +177 -0
  49. package/dist/models/image.d.ts.map +1 -0
  50. package/dist/models/image.js +8 -0
  51. package/dist/models/image.js.map +1 -0
  52. package/dist/models/index.d.ts +14 -0
  53. package/dist/models/index.d.ts.map +1 -0
  54. package/dist/models/index.js +22 -0
  55. package/dist/models/index.js.map +1 -0
  56. package/dist/models/provenance.d.ts +174 -0
  57. package/dist/models/provenance.d.ts.map +1 -0
  58. package/dist/models/provenance.js +53 -0
  59. package/dist/models/provenance.js.map +1 -0
  60. package/dist/models/uploaded-file.d.ts +20 -0
  61. package/dist/models/uploaded-file.d.ts.map +1 -0
  62. package/dist/models/uploaded-file.js +2 -0
  63. package/dist/models/uploaded-file.js.map +1 -0
  64. package/dist/server/errors.d.ts +93 -0
  65. package/dist/server/errors.d.ts.map +1 -0
  66. package/dist/server/errors.js +256 -0
  67. package/dist/server/errors.js.map +1 -0
  68. package/dist/server/events.d.ts +36 -0
  69. package/dist/server/events.d.ts.map +1 -0
  70. package/dist/server/events.js +48 -0
  71. package/dist/server/events.js.map +1 -0
  72. package/dist/server/permissions.d.ts +26 -0
  73. package/dist/server/permissions.d.ts.map +1 -0
  74. package/dist/server/permissions.js +194 -0
  75. package/dist/server/permissions.js.map +1 -0
  76. package/dist/server/register-tools.d.ts +25 -0
  77. package/dist/server/register-tools.d.ts.map +1 -0
  78. package/dist/server/register-tools.js +102 -0
  79. package/dist/server/register-tools.js.map +1 -0
  80. package/dist/server/startup.d.ts +16 -0
  81. package/dist/server/startup.d.ts.map +1 -0
  82. package/dist/server/startup.js +37 -0
  83. package/dist/server/startup.js.map +1 -0
  84. package/dist/server/state.d.ts +166 -0
  85. package/dist/server/state.d.ts.map +1 -0
  86. package/dist/server/state.js +424 -0
  87. package/dist/server/state.js.map +1 -0
  88. package/dist/server/transports/http-transport.d.ts +37 -0
  89. package/dist/server/transports/http-transport.d.ts.map +1 -0
  90. package/dist/server/transports/http-transport.js +204 -0
  91. package/dist/server/transports/http-transport.js.map +1 -0
  92. package/dist/server/transports/index.d.ts +9 -0
  93. package/dist/server/transports/index.d.ts.map +1 -0
  94. package/dist/server/transports/index.js +9 -0
  95. package/dist/server/transports/index.js.map +1 -0
  96. package/dist/server/transports/session-manager.d.ts +40 -0
  97. package/dist/server/transports/session-manager.d.ts.map +1 -0
  98. package/dist/server/transports/session-manager.js +74 -0
  99. package/dist/server/transports/session-manager.js.map +1 -0
  100. package/dist/server/types.d.ts +82 -0
  101. package/dist/server/types.d.ts.map +1 -0
  102. package/dist/server/types.js +14 -0
  103. package/dist/server/types.js.map +1 -0
  104. package/dist/services/audit.d.ts +26 -0
  105. package/dist/services/audit.d.ts.map +1 -0
  106. package/dist/services/audit.js +43 -0
  107. package/dist/services/audit.js.map +1 -0
  108. package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
  109. package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
  110. package/dist/services/chunking/chunk-deduplicator.js +46 -0
  111. package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
  112. package/dist/services/chunking/chunk-merger.d.ts +26 -0
  113. package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
  114. package/dist/services/chunking/chunk-merger.js +94 -0
  115. package/dist/services/chunking/chunk-merger.js.map +1 -0
  116. package/dist/services/chunking/chunker.d.ts +62 -0
  117. package/dist/services/chunking/chunker.d.ts.map +1 -0
  118. package/dist/services/chunking/chunker.js +566 -0
  119. package/dist/services/chunking/chunker.js.map +1 -0
  120. package/dist/services/chunking/heading-normalizer.d.ts +33 -0
  121. package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
  122. package/dist/services/chunking/heading-normalizer.js +101 -0
  123. package/dist/services/chunking/heading-normalizer.js.map +1 -0
  124. package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
  125. package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
  126. package/dist/services/chunking/json-block-analyzer.js +1033 -0
  127. package/dist/services/chunking/json-block-analyzer.js.map +1 -0
  128. package/dist/services/chunking/markdown-parser.d.ts +75 -0
  129. package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
  130. package/dist/services/chunking/markdown-parser.js +428 -0
  131. package/dist/services/chunking/markdown-parser.js.map +1 -0
  132. package/dist/services/chunking/text-normalizer.d.ts +20 -0
  133. package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
  134. package/dist/services/chunking/text-normalizer.js +36 -0
  135. package/dist/services/chunking/text-normalizer.js.map +1 -0
  136. package/dist/services/clm/contract-schemas.d.ts +36 -0
  137. package/dist/services/clm/contract-schemas.d.ts.map +1 -0
  138. package/dist/services/clm/contract-schemas.js +92 -0
  139. package/dist/services/clm/contract-schemas.js.map +1 -0
  140. package/dist/services/clm/summarization.d.ts +46 -0
  141. package/dist/services/clm/summarization.d.ts.map +1 -0
  142. package/dist/services/clm/summarization.js +61 -0
  143. package/dist/services/clm/summarization.js.map +1 -0
  144. package/dist/services/clustering/clustering-service.d.ts +58 -0
  145. package/dist/services/clustering/clustering-service.d.ts.map +1 -0
  146. package/dist/services/clustering/clustering-service.js +467 -0
  147. package/dist/services/clustering/clustering-service.js.map +1 -0
  148. package/dist/services/comparison/diff-service.d.ts +41 -0
  149. package/dist/services/comparison/diff-service.d.ts.map +1 -0
  150. package/dist/services/comparison/diff-service.js +120 -0
  151. package/dist/services/comparison/diff-service.js.map +1 -0
  152. package/dist/services/embedding/embedder.d.ts +55 -0
  153. package/dist/services/embedding/embedder.d.ts.map +1 -0
  154. package/dist/services/embedding/embedder.js +202 -0
  155. package/dist/services/embedding/embedder.js.map +1 -0
  156. package/dist/services/embedding/nomic.d.ts +67 -0
  157. package/dist/services/embedding/nomic.d.ts.map +1 -0
  158. package/dist/services/embedding/nomic.js +280 -0
  159. package/dist/services/embedding/nomic.js.map +1 -0
  160. package/dist/services/gemini/circuit-breaker.d.ts +106 -0
  161. package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
  162. package/dist/services/gemini/circuit-breaker.js +237 -0
  163. package/dist/services/gemini/circuit-breaker.js.map +1 -0
  164. package/dist/services/gemini/client.d.ts +173 -0
  165. package/dist/services/gemini/client.d.ts.map +1 -0
  166. package/dist/services/gemini/client.js +483 -0
  167. package/dist/services/gemini/client.js.map +1 -0
  168. package/dist/services/gemini/config.d.ts +116 -0
  169. package/dist/services/gemini/config.d.ts.map +1 -0
  170. package/dist/services/gemini/config.js +118 -0
  171. package/dist/services/gemini/config.js.map +1 -0
  172. package/dist/services/gemini/index.d.ts +9 -0
  173. package/dist/services/gemini/index.d.ts.map +1 -0
  174. package/dist/services/gemini/index.js +13 -0
  175. package/dist/services/gemini/index.js.map +1 -0
  176. package/dist/services/gemini/rate-limiter.d.ts +62 -0
  177. package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
  178. package/dist/services/gemini/rate-limiter.js +120 -0
  179. package/dist/services/gemini/rate-limiter.js.map +1 -0
  180. package/dist/services/images/extractor.d.ts +88 -0
  181. package/dist/services/images/extractor.d.ts.map +1 -0
  182. package/dist/services/images/extractor.js +340 -0
  183. package/dist/services/images/extractor.js.map +1 -0
  184. package/dist/services/images/optimizer.d.ts +130 -0
  185. package/dist/services/images/optimizer.d.ts.map +1 -0
  186. package/dist/services/images/optimizer.js +228 -0
  187. package/dist/services/images/optimizer.js.map +1 -0
  188. package/dist/services/ocr/datalab.d.ts +64 -0
  189. package/dist/services/ocr/datalab.d.ts.map +1 -0
  190. package/dist/services/ocr/datalab.js +425 -0
  191. package/dist/services/ocr/datalab.js.map +1 -0
  192. package/dist/services/ocr/errors.d.ts +38 -0
  193. package/dist/services/ocr/errors.d.ts.map +1 -0
  194. package/dist/services/ocr/errors.js +83 -0
  195. package/dist/services/ocr/errors.js.map +1 -0
  196. package/dist/services/ocr/file-manager.d.ts +76 -0
  197. package/dist/services/ocr/file-manager.d.ts.map +1 -0
  198. package/dist/services/ocr/file-manager.js +238 -0
  199. package/dist/services/ocr/file-manager.js.map +1 -0
  200. package/dist/services/ocr/form-fill.d.ts +48 -0
  201. package/dist/services/ocr/form-fill.d.ts.map +1 -0
  202. package/dist/services/ocr/form-fill.js +213 -0
  203. package/dist/services/ocr/form-fill.js.map +1 -0
  204. package/dist/services/ocr/processor.d.ts +95 -0
  205. package/dist/services/ocr/processor.d.ts.map +1 -0
  206. package/dist/services/ocr/processor.js +259 -0
  207. package/dist/services/ocr/processor.js.map +1 -0
  208. package/dist/services/provenance/agent-metadata.d.ts +82 -0
  209. package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
  210. package/dist/services/provenance/agent-metadata.js +106 -0
  211. package/dist/services/provenance/agent-metadata.js.map +1 -0
  212. package/dist/services/provenance/chain-hash.d.ts +57 -0
  213. package/dist/services/provenance/chain-hash.d.ts.map +1 -0
  214. package/dist/services/provenance/chain-hash.js +131 -0
  215. package/dist/services/provenance/chain-hash.js.map +1 -0
  216. package/dist/services/provenance/exporter.d.ts +202 -0
  217. package/dist/services/provenance/exporter.d.ts.map +1 -0
  218. package/dist/services/provenance/exporter.js +457 -0
  219. package/dist/services/provenance/exporter.js.map +1 -0
  220. package/dist/services/provenance/index.d.ts +15 -0
  221. package/dist/services/provenance/index.d.ts.map +1 -0
  222. package/dist/services/provenance/index.js +17 -0
  223. package/dist/services/provenance/index.js.map +1 -0
  224. package/dist/services/provenance/tracker.d.ts +138 -0
  225. package/dist/services/provenance/tracker.d.ts.map +1 -0
  226. package/dist/services/provenance/tracker.js +293 -0
  227. package/dist/services/provenance/tracker.js.map +1 -0
  228. package/dist/services/provenance/verifier.d.ts +153 -0
  229. package/dist/services/provenance/verifier.d.ts.map +1 -0
  230. package/dist/services/provenance/verifier.js +536 -0
  231. package/dist/services/provenance/verifier.js.map +1 -0
  232. package/dist/services/python-pool.d.ts +70 -0
  233. package/dist/services/python-pool.d.ts.map +1 -0
  234. package/dist/services/python-pool.js +265 -0
  235. package/dist/services/python-pool.js.map +1 -0
  236. package/dist/services/search/bm25.d.ts +180 -0
  237. package/dist/services/search/bm25.d.ts.map +1 -0
  238. package/dist/services/search/bm25.js +656 -0
  239. package/dist/services/search/bm25.js.map +1 -0
  240. package/dist/services/search/fusion.d.ts +103 -0
  241. package/dist/services/search/fusion.d.ts.map +1 -0
  242. package/dist/services/search/fusion.js +122 -0
  243. package/dist/services/search/fusion.js.map +1 -0
  244. package/dist/services/search/local-reranker.d.ts +30 -0
  245. package/dist/services/search/local-reranker.d.ts.map +1 -0
  246. package/dist/services/search/local-reranker.js +123 -0
  247. package/dist/services/search/local-reranker.js.map +1 -0
  248. package/dist/services/search/quality.d.ts +11 -0
  249. package/dist/services/search/quality.d.ts.map +1 -0
  250. package/dist/services/search/quality.js +17 -0
  251. package/dist/services/search/quality.js.map +1 -0
  252. package/dist/services/search/query-classifier.d.ts +34 -0
  253. package/dist/services/search/query-classifier.d.ts.map +1 -0
  254. package/dist/services/search/query-classifier.js +114 -0
  255. package/dist/services/search/query-classifier.js.map +1 -0
  256. package/dist/services/search/query-expander.d.ts +73 -0
  257. package/dist/services/search/query-expander.d.ts.map +1 -0
  258. package/dist/services/search/query-expander.js +281 -0
  259. package/dist/services/search/query-expander.js.map +1 -0
  260. package/dist/services/search/reranker.d.ts +44 -0
  261. package/dist/services/search/reranker.d.ts.map +1 -0
  262. package/dist/services/search/reranker.js +101 -0
  263. package/dist/services/search/reranker.js.map +1 -0
  264. package/dist/services/storage/database/annotation-operations.d.ts +113 -0
  265. package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
  266. package/dist/services/storage/database/annotation-operations.js +177 -0
  267. package/dist/services/storage/database/annotation-operations.js.map +1 -0
  268. package/dist/services/storage/database/approval-operations.d.ts +132 -0
  269. package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
  270. package/dist/services/storage/database/approval-operations.js +206 -0
  271. package/dist/services/storage/database/approval-operations.js.map +1 -0
  272. package/dist/services/storage/database/chunk-operations.d.ts +132 -0
  273. package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
  274. package/dist/services/storage/database/chunk-operations.js +306 -0
  275. package/dist/services/storage/database/chunk-operations.js.map +1 -0
  276. package/dist/services/storage/database/cluster-operations.d.ts +97 -0
  277. package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
  278. package/dist/services/storage/database/cluster-operations.js +258 -0
  279. package/dist/services/storage/database/cluster-operations.js.map +1 -0
  280. package/dist/services/storage/database/comparison-operations.d.ts +41 -0
  281. package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
  282. package/dist/services/storage/database/comparison-operations.js +65 -0
  283. package/dist/services/storage/database/comparison-operations.js.map +1 -0
  284. package/dist/services/storage/database/converters.d.ts +36 -0
  285. package/dist/services/storage/database/converters.d.ts.map +1 -0
  286. package/dist/services/storage/database/converters.js +244 -0
  287. package/dist/services/storage/database/converters.js.map +1 -0
  288. package/dist/services/storage/database/document-operations.d.ts +145 -0
  289. package/dist/services/storage/database/document-operations.d.ts.map +1 -0
  290. package/dist/services/storage/database/document-operations.js +498 -0
  291. package/dist/services/storage/database/document-operations.js.map +1 -0
  292. package/dist/services/storage/database/embedding-operations.d.ts +130 -0
  293. package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
  294. package/dist/services/storage/database/embedding-operations.js +315 -0
  295. package/dist/services/storage/database/embedding-operations.js.map +1 -0
  296. package/dist/services/storage/database/extraction-operations.d.ts +47 -0
  297. package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
  298. package/dist/services/storage/database/extraction-operations.js +85 -0
  299. package/dist/services/storage/database/extraction-operations.js.map +1 -0
  300. package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
  301. package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
  302. package/dist/services/storage/database/form-fill-operations.js +116 -0
  303. package/dist/services/storage/database/form-fill-operations.js.map +1 -0
  304. package/dist/services/storage/database/helpers.d.ts +29 -0
  305. package/dist/services/storage/database/helpers.d.ts.map +1 -0
  306. package/dist/services/storage/database/helpers.js +55 -0
  307. package/dist/services/storage/database/helpers.js.map +1 -0
  308. package/dist/services/storage/database/image-operations.d.ts +202 -0
  309. package/dist/services/storage/database/image-operations.d.ts.map +1 -0
  310. package/dist/services/storage/database/image-operations.js +484 -0
  311. package/dist/services/storage/database/image-operations.js.map +1 -0
  312. package/dist/services/storage/database/index.d.ts +13 -0
  313. package/dist/services/storage/database/index.d.ts.map +1 -0
  314. package/dist/services/storage/database/index.js +16 -0
  315. package/dist/services/storage/database/index.js.map +1 -0
  316. package/dist/services/storage/database/lock-operations.d.ts +59 -0
  317. package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
  318. package/dist/services/storage/database/lock-operations.js +89 -0
  319. package/dist/services/storage/database/lock-operations.js.map +1 -0
  320. package/dist/services/storage/database/obligation-operations.d.ts +88 -0
  321. package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
  322. package/dist/services/storage/database/obligation-operations.js +206 -0
  323. package/dist/services/storage/database/obligation-operations.js.map +1 -0
  324. package/dist/services/storage/database/ocr-operations.d.ts +33 -0
  325. package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
  326. package/dist/services/storage/database/ocr-operations.js +70 -0
  327. package/dist/services/storage/database/ocr-operations.js.map +1 -0
  328. package/dist/services/storage/database/playbook-operations.d.ts +72 -0
  329. package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
  330. package/dist/services/storage/database/playbook-operations.js +247 -0
  331. package/dist/services/storage/database/playbook-operations.js.map +1 -0
  332. package/dist/services/storage/database/provenance-operations.d.ts +112 -0
  333. package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
  334. package/dist/services/storage/database/provenance-operations.js +251 -0
  335. package/dist/services/storage/database/provenance-operations.js.map +1 -0
  336. package/dist/services/storage/database/service.d.ts +142 -0
  337. package/dist/services/storage/database/service.d.ts.map +1 -0
  338. package/dist/services/storage/database/service.js +310 -0
  339. package/dist/services/storage/database/service.js.map +1 -0
  340. package/dist/services/storage/database/static-operations.d.ts +30 -0
  341. package/dist/services/storage/database/static-operations.d.ts.map +1 -0
  342. package/dist/services/storage/database/static-operations.js +218 -0
  343. package/dist/services/storage/database/static-operations.js.map +1 -0
  344. package/dist/services/storage/database/stats-operations.d.ts +101 -0
  345. package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
  346. package/dist/services/storage/database/stats-operations.js +394 -0
  347. package/dist/services/storage/database/stats-operations.js.map +1 -0
  348. package/dist/services/storage/database/tag-operations.d.ts +76 -0
  349. package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
  350. package/dist/services/storage/database/tag-operations.js +178 -0
  351. package/dist/services/storage/database/tag-operations.js.map +1 -0
  352. package/dist/services/storage/database/types.d.ts +286 -0
  353. package/dist/services/storage/database/types.d.ts.map +1 -0
  354. package/dist/services/storage/database/types.js +39 -0
  355. package/dist/services/storage/database/types.js.map +1 -0
  356. package/dist/services/storage/database/upload-operations.d.ts +71 -0
  357. package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
  358. package/dist/services/storage/database/upload-operations.js +124 -0
  359. package/dist/services/storage/database/upload-operations.js.map +1 -0
  360. package/dist/services/storage/database/user-operations.d.ts +102 -0
  361. package/dist/services/storage/database/user-operations.d.ts.map +1 -0
  362. package/dist/services/storage/database/user-operations.js +151 -0
  363. package/dist/services/storage/database/user-operations.js.map +1 -0
  364. package/dist/services/storage/database/workflow-operations.d.ts +98 -0
  365. package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
  366. package/dist/services/storage/database/workflow-operations.js +157 -0
  367. package/dist/services/storage/database/workflow-operations.js.map +1 -0
  368. package/dist/services/storage/database.d.ts +16 -0
  369. package/dist/services/storage/database.d.ts.map +1 -0
  370. package/dist/services/storage/database.js +15 -0
  371. package/dist/services/storage/database.js.map +1 -0
  372. package/dist/services/storage/index.d.ts +10 -0
  373. package/dist/services/storage/index.d.ts.map +1 -0
  374. package/dist/services/storage/index.js +10 -0
  375. package/dist/services/storage/index.js.map +1 -0
  376. package/dist/services/storage/migrations/index.d.ts +16 -0
  377. package/dist/services/storage/migrations/index.d.ts.map +1 -0
  378. package/dist/services/storage/migrations/index.js +20 -0
  379. package/dist/services/storage/migrations/index.js.map +1 -0
  380. package/dist/services/storage/migrations/operations.d.ts +40 -0
  381. package/dist/services/storage/migrations/operations.d.ts.map +1 -0
  382. package/dist/services/storage/migrations/operations.js +2910 -0
  383. package/dist/services/storage/migrations/operations.js.map +1 -0
  384. package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
  385. package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
  386. package/dist/services/storage/migrations/schema-definitions.js +1006 -0
  387. package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
  388. package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
  389. package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
  390. package/dist/services/storage/migrations/schema-helpers.js +176 -0
  391. package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
  392. package/dist/services/storage/migrations/types.d.ts +15 -0
  393. package/dist/services/storage/migrations/types.d.ts.map +1 -0
  394. package/dist/services/storage/migrations/types.js +21 -0
  395. package/dist/services/storage/migrations/types.js.map +1 -0
  396. package/dist/services/storage/migrations/verification.d.ts +20 -0
  397. package/dist/services/storage/migrations/verification.d.ts.map +1 -0
  398. package/dist/services/storage/migrations/verification.js +78 -0
  399. package/dist/services/storage/migrations/verification.js.map +1 -0
  400. package/dist/services/storage/migrations.d.ts +16 -0
  401. package/dist/services/storage/migrations.d.ts.map +1 -0
  402. package/dist/services/storage/migrations.js +17 -0
  403. package/dist/services/storage/migrations.js.map +1 -0
  404. package/dist/services/storage/types.d.ts +12 -0
  405. package/dist/services/storage/types.d.ts.map +1 -0
  406. package/dist/services/storage/types.js +5 -0
  407. package/dist/services/storage/types.js.map +1 -0
  408. package/dist/services/storage/vector.d.ts +208 -0
  409. package/dist/services/storage/vector.d.ts.map +1 -0
  410. package/dist/services/storage/vector.js +526 -0
  411. package/dist/services/storage/vector.js.map +1 -0
  412. package/dist/services/vlm/pipeline.d.ts +194 -0
  413. package/dist/services/vlm/pipeline.d.ts.map +1 -0
  414. package/dist/services/vlm/pipeline.js +800 -0
  415. package/dist/services/vlm/pipeline.js.map +1 -0
  416. package/dist/services/vlm/prompts.d.ts +171 -0
  417. package/dist/services/vlm/prompts.d.ts.map +1 -0
  418. package/dist/services/vlm/prompts.js +229 -0
  419. package/dist/services/vlm/prompts.js.map +1 -0
  420. package/dist/services/vlm/service.d.ts +174 -0
  421. package/dist/services/vlm/service.d.ts.map +1 -0
  422. package/dist/services/vlm/service.js +256 -0
  423. package/dist/services/vlm/service.js.map +1 -0
  424. package/dist/services/webhook-delivery.d.ts +4 -0
  425. package/dist/services/webhook-delivery.d.ts.map +1 -0
  426. package/dist/services/webhook-delivery.js +140 -0
  427. package/dist/services/webhook-delivery.js.map +1 -0
  428. package/dist/tools/chunks.d.ts +19 -0
  429. package/dist/tools/chunks.d.ts.map +1 -0
  430. package/dist/tools/chunks.js +392 -0
  431. package/dist/tools/chunks.js.map +1 -0
  432. package/dist/tools/clm.d.ts +16 -0
  433. package/dist/tools/clm.d.ts.map +1 -0
  434. package/dist/tools/clm.js +668 -0
  435. package/dist/tools/clm.js.map +1 -0
  436. package/dist/tools/clustering.d.ts +13 -0
  437. package/dist/tools/clustering.d.ts.map +1 -0
  438. package/dist/tools/clustering.js +498 -0
  439. package/dist/tools/clustering.js.map +1 -0
  440. package/dist/tools/collaboration.d.ts +15 -0
  441. package/dist/tools/collaboration.d.ts.map +1 -0
  442. package/dist/tools/collaboration.js +516 -0
  443. package/dist/tools/collaboration.js.map +1 -0
  444. package/dist/tools/comparison.d.ts +13 -0
  445. package/dist/tools/comparison.d.ts.map +1 -0
  446. package/dist/tools/comparison.js +735 -0
  447. package/dist/tools/comparison.js.map +1 -0
  448. package/dist/tools/compliance.d.ts +15 -0
  449. package/dist/tools/compliance.d.ts.map +1 -0
  450. package/dist/tools/compliance.js +640 -0
  451. package/dist/tools/compliance.js.map +1 -0
  452. package/dist/tools/config.d.ts +19 -0
  453. package/dist/tools/config.d.ts.map +1 -0
  454. package/dist/tools/config.js +213 -0
  455. package/dist/tools/config.js.map +1 -0
  456. package/dist/tools/database.d.ts +62 -0
  457. package/dist/tools/database.d.ts.map +1 -0
  458. package/dist/tools/database.js +288 -0
  459. package/dist/tools/database.js.map +1 -0
  460. package/dist/tools/documents.d.ts +61 -0
  461. package/dist/tools/documents.d.ts.map +1 -0
  462. package/dist/tools/documents.js +1624 -0
  463. package/dist/tools/documents.js.map +1 -0
  464. package/dist/tools/embeddings.d.ts +14 -0
  465. package/dist/tools/embeddings.d.ts.map +1 -0
  466. package/dist/tools/embeddings.js +626 -0
  467. package/dist/tools/embeddings.js.map +1 -0
  468. package/dist/tools/evaluation.d.ts +25 -0
  469. package/dist/tools/evaluation.d.ts.map +1 -0
  470. package/dist/tools/evaluation.js +523 -0
  471. package/dist/tools/evaluation.js.map +1 -0
  472. package/dist/tools/events.d.ts +16 -0
  473. package/dist/tools/events.d.ts.map +1 -0
  474. package/dist/tools/events.js +493 -0
  475. package/dist/tools/events.js.map +1 -0
  476. package/dist/tools/extraction-structured.d.ts +13 -0
  477. package/dist/tools/extraction-structured.d.ts.map +1 -0
  478. package/dist/tools/extraction-structured.js +390 -0
  479. package/dist/tools/extraction-structured.js.map +1 -0
  480. package/dist/tools/extraction.d.ts +24 -0
  481. package/dist/tools/extraction.d.ts.map +1 -0
  482. package/dist/tools/extraction.js +424 -0
  483. package/dist/tools/extraction.js.map +1 -0
  484. package/dist/tools/file-management.d.ts +14 -0
  485. package/dist/tools/file-management.d.ts.map +1 -0
  486. package/dist/tools/file-management.js +523 -0
  487. package/dist/tools/file-management.js.map +1 -0
  488. package/dist/tools/form-fill.d.ts +13 -0
  489. package/dist/tools/form-fill.d.ts.map +1 -0
  490. package/dist/tools/form-fill.js +250 -0
  491. package/dist/tools/form-fill.js.map +1 -0
  492. package/dist/tools/health.d.ts +19 -0
  493. package/dist/tools/health.d.ts.map +1 -0
  494. package/dist/tools/health.js +229 -0
  495. package/dist/tools/health.js.map +1 -0
  496. package/dist/tools/images.d.ts +54 -0
  497. package/dist/tools/images.d.ts.map +1 -0
  498. package/dist/tools/images.js +787 -0
  499. package/dist/tools/images.js.map +1 -0
  500. package/dist/tools/ingestion.d.ts +94 -0
  501. package/dist/tools/ingestion.d.ts.map +1 -0
  502. package/dist/tools/ingestion.js +1659 -0
  503. package/dist/tools/ingestion.js.map +1 -0
  504. package/dist/tools/intelligence.d.ts +18 -0
  505. package/dist/tools/intelligence.d.ts.map +1 -0
  506. package/dist/tools/intelligence.js +1039 -0
  507. package/dist/tools/intelligence.js.map +1 -0
  508. package/dist/tools/provenance.d.ts +51 -0
  509. package/dist/tools/provenance.d.ts.map +1 -0
  510. package/dist/tools/provenance.js +691 -0
  511. package/dist/tools/provenance.js.map +1 -0
  512. package/dist/tools/reports.d.ts +41 -0
  513. package/dist/tools/reports.d.ts.map +1 -0
  514. package/dist/tools/reports.js +1394 -0
  515. package/dist/tools/reports.js.map +1 -0
  516. package/dist/tools/search.d.ts +35 -0
  517. package/dist/tools/search.d.ts.map +1 -0
  518. package/dist/tools/search.js +2528 -0
  519. package/dist/tools/search.js.map +1 -0
  520. package/dist/tools/shared.d.ts +52 -0
  521. package/dist/tools/shared.d.ts.map +1 -0
  522. package/dist/tools/shared.js +54 -0
  523. package/dist/tools/shared.js.map +1 -0
  524. package/dist/tools/tags.d.ts +15 -0
  525. package/dist/tools/tags.d.ts.map +1 -0
  526. package/dist/tools/tags.js +287 -0
  527. package/dist/tools/tags.js.map +1 -0
  528. package/dist/tools/timeline.d.ts +15 -0
  529. package/dist/tools/timeline.d.ts.map +1 -0
  530. package/dist/tools/timeline.js +14 -0
  531. package/dist/tools/timeline.js.map +1 -0
  532. package/dist/tools/users.d.ts +14 -0
  533. package/dist/tools/users.d.ts.map +1 -0
  534. package/dist/tools/users.js +257 -0
  535. package/dist/tools/users.js.map +1 -0
  536. package/dist/tools/vlm.d.ts +40 -0
  537. package/dist/tools/vlm.d.ts.map +1 -0
  538. package/dist/tools/vlm.js +475 -0
  539. package/dist/tools/vlm.js.map +1 -0
  540. package/dist/tools/workflow.d.ts +16 -0
  541. package/dist/tools/workflow.d.ts.map +1 -0
  542. package/dist/tools/workflow.js +495 -0
  543. package/dist/tools/workflow.js.map +1 -0
  544. package/dist/utils/backoff.d.ts +53 -0
  545. package/dist/utils/backoff.d.ts.map +1 -0
  546. package/dist/utils/backoff.js +78 -0
  547. package/dist/utils/backoff.js.map +1 -0
  548. package/dist/utils/config-persistence.d.ts +33 -0
  549. package/dist/utils/config-persistence.d.ts.map +1 -0
  550. package/dist/utils/config-persistence.js +61 -0
  551. package/dist/utils/config-persistence.js.map +1 -0
  552. package/dist/utils/hash.d.ts +65 -0
  553. package/dist/utils/hash.d.ts.map +1 -0
  554. package/dist/utils/hash.js +146 -0
  555. package/dist/utils/hash.js.map +1 -0
  556. package/dist/utils/math.d.ts +21 -0
  557. package/dist/utils/math.d.ts.map +1 -0
  558. package/dist/utils/math.js +39 -0
  559. package/dist/utils/math.js.map +1 -0
  560. package/dist/utils/validation.d.ts +697 -0
  561. package/dist/utils/validation.d.ts.map +1 -0
  562. package/dist/utils/validation.js +529 -0
  563. package/dist/utils/validation.js.map +1 -0
  564. package/package.json +96 -0
  565. package/python/.gitkeep +0 -0
  566. package/python/__init__.py +104 -0
  567. package/python/clustering_worker.py +440 -0
  568. package/python/docx_image_extractor.py +524 -0
  569. package/python/embedding_worker.py +552 -0
  570. package/python/file_manager_worker.py +564 -0
  571. package/python/form_fill_worker.py +399 -0
  572. package/python/gpu_utils.py +582 -0
  573. package/python/image_extractor.py +317 -0
  574. package/python/image_optimizer.py +444 -0
  575. package/python/ocr_worker.py +712 -0
  576. package/python/pyproject.toml +76 -0
  577. package/python/requirements.txt +51 -0
  578. package/python/reranker_worker.py +87 -0
@@ -0,0 +1,1659 @@
1
+ /**
2
+ * Ingestion MCP Tools
3
+ *
4
+ * Extracted from src/index.ts Task 20.
5
+ * Tools: ocr_ingest_directory, ocr_ingest_files, ocr_process_pending, ocr_status
6
+ *
7
+ * CRITICAL: NEVER use console.log() - stdout is reserved for JSON-RPC protocol.
8
+ * Use console.error() for all logging.
9
+ *
10
+ * @module tools/ingestion
11
+ */
12
+ import { z } from 'zod';
13
+ import { v4 as uuidv4 } from 'uuid';
14
+ import { existsSync, statSync, lstatSync, readdirSync, mkdirSync, writeFileSync } from 'fs';
15
+ import { resolve, extname, basename } from 'path';
16
+ import { OCRProcessor } from '../services/ocr/processor.js';
17
+ import { DatalabClient } from '../services/ocr/datalab.js';
18
+ import { chunkHybridSectionAware, DEFAULT_CHUNKING_CONFIG, } from '../services/chunking/chunker.js';
19
+ import { extractPageOffsetsFromText } from '../services/chunking/markdown-parser.js';
20
+ import { EmbeddingService } from '../services/embedding/embedder.js';
21
+ import { ProvenanceTracker } from '../services/provenance/tracker.js';
22
+ import { computeHash, hashFile, computeFileHashSync } from '../utils/hash.js';
23
+ import { state, requireDatabase, getConfig, withDatabaseOperation } from '../server/state.js';
24
+ import { successResult } from '../server/types.js';
25
+ import { validateInput, sanitizePath, IngestDirectoryInput, IngestFilesInput, ProcessPendingInput, OCRStatusInput, RetryFailedInput, DEFAULT_FILE_TYPES, } from '../utils/validation.js';
26
+ import { pathNotFoundError, pathNotDirectoryError, documentNotFoundError, } from '../server/errors.js';
27
+ import { formatResponse, handleError } from './shared.js';
28
+ import { ProvenanceType } from '../models/provenance.js';
29
+ import { insertImageBatch, updateImageProvenance, } from '../services/storage/database/image-operations.js';
30
+ import { getProvenanceTracker } from '../services/provenance/index.js';
31
+ import { createVLMPipeline } from '../services/vlm/pipeline.js';
32
+ import { ImageExtractor } from '../services/images/extractor.js';
33
+ import { computeBlockTypeStats, detectRepeatedHeadersFooters, isRepeatedHeaderFooter } from '../services/chunking/json-block-analyzer.js';
34
+ // ═══════════════════════════════════════════════════════════════════════════════
35
+ // HELPER FUNCTIONS
36
+ // ═══════════════════════════════════════════════════════════════════════════════
37
/**
 * Persist chunker output for a single document.
 *
 * For each chunk result this creates a CHUNK provenance record (noted upstream
 * as chain_depth=2) whose source is the OCR result's provenance, then inserts
 * the chunk row through db.insertChunk(). The returned objects are assembled
 * from the same data that was inserted, plus the initial embedding fields, so
 * the caller can embed them without re-reading the database.
 *
 * @param db - Database service; insertChunk() is called once per chunk
 * @param doc - Document record (id, provenance_id and file_hash are read)
 * @param ocrResult - OCR result record (id, provenance_id, content_hash and
 *   parse_quality_score are read)
 * @param chunkResults - Chunk results in document order
 * @param config - Chunking settings recorded in the provenance params
 * @returns Array of stored chunk objects with embedding_status 'pending'
 */
function storeChunks(db, doc, ocrResult, chunkResults, config = DEFAULT_CHUNKING_CONFIG) {
    const tracker = new ProvenanceTracker(db);
    const stored = [];
    const createdAt = new Date().toISOString();
    for (const [index, result] of chunkResults.entries()) {
        const chunkId = uuidv4();
        const textHash = computeHash(result.text);
        // Table details are only recorded when the chunker attached table metadata.
        const table = result.tableMetadata;
        const tableParams = {};
        if (table) {
            tableParams.table_columns = table.columnHeaders;
            tableParams.table_row_count = table.rowCount;
            tableParams.table_column_count = table.columnCount;
            if (table.summary) {
                tableParams.table_summary = table.summary;
            }
            if (table.caption) {
                tableParams.table_caption = table.caption;
            }
            if (table.continuationOf !== undefined) {
                tableParams.table_continuation_of = table.continuationOf;
            }
        }
        const provenanceId = tracker.createProvenance({
            type: ProvenanceType.CHUNK,
            source_type: 'CHUNKING',
            source_id: ocrResult.provenance_id,
            root_document_id: doc.provenance_id,
            content_hash: textHash,
            input_hash: ocrResult.content_hash,
            file_hash: doc.file_hash,
            processor: 'chunker',
            processor_version: '2.0.0',
            processing_params: {
                strategy: 'hybrid_section',
                max_chunk_size: config.maxChunkSize,
                chunk_size: config.chunkSize,
                overlap_percent: config.overlapPercent,
                chunk_index: index,
                total_chunks: chunkResults.length,
                character_start: result.startOffset,
                character_end: result.endOffset,
                heading_context: result.headingContext ?? null,
                section_path: result.sectionPath ?? null,
                is_atomic: result.isAtomic,
                content_types: result.contentTypes,
                ...tableParams,
            },
            location: {
                chunk_index: index,
                character_start: result.startOffset,
                character_end: result.endOffset,
                page_number: result.pageNumber ?? undefined,
                page_range: result.pageRange ?? undefined,
            },
        });
        // Fields shared by the inserted row and the returned object, split in two
        // so the returned object keeps its embedding fields in the same position.
        const positionFields = {
            id: chunkId,
            document_id: doc.id,
            ocr_result_id: ocrResult.id,
            text: result.text,
            text_hash: textHash,
            chunk_index: index,
            character_start: result.startOffset,
            character_end: result.endOffset,
            page_number: result.pageNumber,
            page_range: result.pageRange,
            overlap_previous: result.overlapWithPrevious,
            overlap_next: result.overlapWithNext,
            provenance_id: provenanceId,
        };
        const structureFields = {
            ocr_quality_score: ocrResult.parse_quality_score ?? null,
            heading_context: result.headingContext ?? null,
            heading_level: result.headingLevel ?? null,
            section_path: result.sectionPath ?? null,
            content_types: JSON.stringify(result.contentTypes),
            is_atomic: result.isAtomic ? 1 : 0,
            chunking_strategy: 'hybrid_section',
        };
        db.insertChunk({ ...positionFields, ...structureFields });
        // Mirror the inserted row instead of re-fetching it from the database.
        stored.push({
            ...positionFields,
            created_at: createdAt,
            embedding_status: 'pending',
            embedded_at: null,
            ...structureFields,
        });
    }
    return stored;
}
143
/**
 * Return a short window of OCR text that belongs to (or lies near) a page.
 *
 * If pageOffsets contains an entry for targetPage, the window is the start of
 * that page's exact character range, capped at 1000 characters. Otherwise the
 * page position is estimated proportionally within the text and a window of
 * up to 500 characters on each side is returned, trimmed to word boundaries.
 *
 * @param ocrText - Full OCR extracted text
 * @param pageCount - Total number of pages in the document
 * @param targetPage - Page to extract context for (1-indexed)
 * @param pageOffsets - Optional list of { page, charStart, charEnd } entries
 * @returns Trimmed context text, or '' when there is no text or no pages
 */
function extractContextText(ocrText, pageCount, targetPage, pageOffsets) {
    const MAX_EXACT_CHARS = 1000;
    const HALF_WINDOW = 500;
    const BOUNDARY_SEARCH = 50;
    if (!ocrText || ocrText.length === 0 || pageCount <= 0) {
        return '';
    }
    const total = ocrText.length;
    // Exact path: use the recorded character range for the page when we have one.
    const exact = pageOffsets && pageOffsets.length > 0
        ? pageOffsets.find((entry) => entry.page === targetPage)
        : undefined;
    if (exact) {
        const from = Math.max(0, Math.min(exact.charStart, total));
        const to = Math.min(exact.charEnd, total, from + MAX_EXACT_CHARS);
        return ocrText.slice(from, to).trim();
    }
    // Heuristic path: map the page linearly onto the text. The denominator is
    // (pages - 1) so that the last page lands at the end of the text.
    const pages = Math.max(1, pageCount);
    const page = Math.max(1, Math.min(targetPage, pages));
    const center = Math.floor(((page - 1) / Math.max(1, pages - 1)) * total);
    const from = Math.max(0, center - HALF_WINDOW);
    const to = Math.min(total, center + HALF_WINDOW);
    let windowText = ocrText.slice(from, to);
    // Drop a partial leading word when the window does not start at the text start.
    if (from > 0) {
        const leadingSpace = windowText.indexOf(' ');
        if (leadingSpace > 0 && leadingSpace < BOUNDARY_SEARCH) {
            windowText = windowText.slice(leadingSpace + 1);
        }
    }
    // Drop a partial trailing word when the window does not reach the text end.
    if (to < total) {
        const trailingSpace = windowText.lastIndexOf(' ');
        if (trailingSpace > 0 && trailingSpace > windowText.length - BOUNDARY_SEARCH) {
            windowText = windowText.slice(0, trailingSpace);
        }
    }
    return windowText.trim();
}
195
/**
 * Read the Datalab block type embedded in an image filename.
 *
 * Datalab image names look like `_page_0_Picture_21.jpeg` or
 * `_page_0_Figure_3.jpeg`; the alphabetic segment between the page number and
 * the trailing index is the block type.
 *
 * @param filename - Image filename to inspect
 * @returns The block type (e.g. 'Picture'), or null when the name does not
 *   follow the `_page_<n>_<Type>_<n>.` pattern
 */
export function parseBlockTypeFromFilename(filename) {
    const found = filename.match(/_page_\d+_([A-Za-z]+)_\d+\./);
    if (!found) {
        return null;
    }
    const [, blockType] = found;
    return blockType;
}
204
/**
 * Classify the image regions of every page in a Datalab JSON block hierarchy.
 *
 * The top-level `children` (or `blocks`) array holds the pages; each page's
 * descendants are walked recursively. Blocks of type 'Figure'/'FigureGroup'
 * count as figures, blocks of type 'Picture'/'PictureGroup' count as pictures
 * and are attributed to the header/footer when they are (or sit beneath) a
 * 'PageHeader' or 'PageFooter' block, otherwise to the page body.
 *
 * Top-level entries whose block_type is 'Page' or missing are numbered as
 * pages starting at 1; entries with any other block_type are ignored.
 *
 * Malformed input is tolerated rather than thrown on: a null/undefined
 * `jsonBlocks` or a non-array children value yields an empty map with a
 * warning, a null page entry is numbered as an empty page (so later pages
 * keep their position), and null child entries are skipped.
 *
 * @param jsonBlocks - Datalab JSON block tree
 * @returns Map of page number (1-indexed) to
 *   { hasFigure, hasPicture, pictureInHeaderFooter, pictureInBody, figureCount }
 */
export function buildPageBlockClassification(jsonBlocks) {
    const pageMap = new Map();
    if (jsonBlocks == null) {
        console.error('[WARN] JSON blocks has no children/blocks array');
        return pageMap;
    }
    const topChildren = jsonBlocks.children ?? jsonBlocks.blocks ?? [];
    if (!Array.isArray(topChildren)) {
        console.error('[WARN] JSON blocks has no children/blocks array');
        return pageMap;
    }
    // Recursively tally figure/picture blocks into the given page classification.
    const walkChildren = (children, inHeaderFooter, classification) => {
        if (!Array.isArray(children)) {
            return;
        }
        for (const child of children) {
            if (child == null) {
                continue;
            }
            const btype = child.block_type;
            // Header/footer status is inherited by everything nested below.
            const isHF = inHeaderFooter || btype === 'PageHeader' || btype === 'PageFooter';
            if (btype === 'Figure' || btype === 'FigureGroup') {
                classification.hasFigure = true;
                classification.figureCount++;
            }
            if (btype === 'Picture' || btype === 'PictureGroup') {
                classification.hasPicture = true;
                if (isHF) {
                    classification.pictureInHeaderFooter++;
                }
                else {
                    classification.pictureInBody++;
                }
            }
            if (child.children) {
                walkChildren(child.children, isHF, classification);
            }
        }
    };
    let pageNum = 0;
    for (const pageBlock of topChildren) {
        // A null entry is treated like an untyped block: an empty page.
        const block = pageBlock ?? {};
        if (block.block_type !== 'Page' && block.block_type) {
            continue;
        }
        pageNum++;
        const classification = {
            hasFigure: false,
            hasPicture: false,
            pictureInHeaderFooter: 0,
            pictureInBody: 0,
            figureCount: 0,
        };
        walkChildren(block.children ?? [], false, classification);
        pageMap.set(pageNum, classification);
    }
    return pageMap;
}
267
/**
 * Save images returned by Datalab to disk and register them in the database.
 *
 * Images arrive as {filename: base64_data}. This function:
 *   1. Creates the output directory if needed
 *   2. Decodes and writes each image to disk
 *   3. Inserts image records (for later VLM processing) in one batch
 *   4. Creates an IMAGE provenance record per inserted image and links it
 *
 * Filenames come from an external service response, so only their final path
 * component is used when writing; a name such as `../x.png` or an absolute
 * path can therefore never be written outside `outputDir`.
 *
 * NOTE: entries are deleted from `images` as they are decoded to lower peak
 * memory, so the caller's object is emptied by this call.
 *
 * @param db - Database connection wrapper (getConnection() is used)
 * @param doc - Document record
 * @param ocrResult - OCR result for the provenance chain and context text
 * @param images - Images from Datalab: {filename: base64}
 * @param outputDir - Directory to save images into
 * @param jsonBlocks - Optional Datalab JSON block tree, used to detect images
 *   that sit in page headers/footers
 * @param pageOffsets - Optional per-page character offsets, used to pick the
 *   context text for each image
 * @returns Array of inserted image records (empty when there are no images)
 * @throws Error when a filename has no usable final path component, and
 *   rethrows any error raised while creating an IMAGE provenance record
 */
function saveAndStoreImages(db, doc, ocrResult, images, outputDir, jsonBlocks, pageOffsets) {
    // Create output directory
    if (!existsSync(outputDir)) {
        mkdirSync(outputDir, { recursive: true });
    }
    // Build page-level image classification from JSON blocks
    const pageClassification = jsonBlocks
        ? buildPageBlockClassification(jsonBlocks)
        : new Map();
    const imageRefs = [];
    const pageImageCounts = new Map();
    for (const filename of Object.keys(images)) {
        const buffer = Buffer.from(images[filename], 'base64');
        // Release base64 string immediately to reduce peak memory
        delete images[filename];
        // Strip any directory components so the write stays inside outputDir.
        const safeName = basename(filename);
        if (safeName === '' || safeName === '.' || safeName === '..') {
            throw new Error(`Invalid image filename from OCR provider: ${JSON.stringify(filename)}`);
        }
        const filePath = resolve(outputDir, safeName);
        writeFileSync(filePath, buffer);
        // Parse page number from filename (e.g., "page_1_image_0.png" or "p001_i000.png").
        // The number in the filename is treated as 0-based, hence the +1.
        const pageMatch = filename.match(/page_(\d+)|p(\d+)/i);
        const pageNumber = pageMatch ? parseInt(pageMatch[1] || pageMatch[2], 10) + 1 : 1;
        // Per-page image index
        const currentPageCount = pageImageCounts.get(pageNumber) ?? 0;
        pageImageCounts.set(pageNumber, currentPageCount + 1);
        const imageIndex = currentPageCount;
        const contentHash = computeHash(buffer);
        // Parse block type from Datalab filename
        const blockType = parseBlockTypeFromFilename(filename);
        // An image is header/footer when its own block type says so, or when the
        // page has no figures and all of its pictures are in the header/footer.
        const pageInfo = pageClassification.get(pageNumber);
        const isHeaderFooter = blockType === 'PageHeader' ||
            blockType === 'PageFooter' ||
            (pageInfo !== undefined &&
                !pageInfo.hasFigure &&
                pageInfo.pictureInHeaderFooter > 0 &&
                pageInfo.pictureInBody === 0);
        // Get image format from extension
        const ext = extname(filename).slice(1).toLowerCase();
        const format = ext || 'png';
        // Extract context text from OCR for this page (uses exact pageOffsets when available)
        const contextText = extractContextText(ocrResult.extracted_text, ocrResult.page_count ?? 1, pageNumber, pageOffsets);
        // Create image reference for database
        imageRefs.push({
            document_id: doc.id,
            ocr_result_id: ocrResult.id,
            page_number: pageNumber,
            bounding_box: { x: 0, y: 0, width: 0, height: 0 }, // Datalab doesn't provide bbox
            image_index: imageIndex,
            format,
            dimensions: { width: 0, height: 0 }, // Datalab does not provide dimensions; filtering pipeline bypasses dimension check when both are 0
            extracted_path: filePath,
            file_size: buffer.length,
            context_text: contextText || null,
            provenance_id: null, // Will be set after insert with provenance record
            block_type: blockType,
            is_header_footer: isHeaderFooter,
            content_hash: contentHash,
        });
    }
    if (imageRefs.length === 0) {
        return [];
    }
    // Batch insert all images
    const insertedImages = insertImageBatch(db.getConnection(), imageRefs);
    // Create IMAGE provenance records and update image records
    const tracker = getProvenanceTracker(db);
    for (const img of insertedImages) {
        try {
            const provenanceId = tracker.createProvenance({
                type: ProvenanceType.IMAGE,
                source_type: 'IMAGE_EXTRACTION',
                source_id: ocrResult.provenance_id,
                root_document_id: doc.provenance_id,
                // Prefer the stored hash; otherwise hash the file, or the id as a last resort.
                content_hash: img.content_hash ??
                    (img.extracted_path && existsSync(img.extracted_path)
                        ? computeFileHashSync(img.extracted_path)
                        : computeHash(img.id)),
                source_path: img.extracted_path ?? undefined,
                processor: 'datalab-image-extraction',
                processor_version: '1.0.0',
                processing_params: {
                    page_number: img.page_number,
                    image_index: img.image_index,
                    format: img.format,
                    block_type: img.block_type,
                    is_header_footer: img.is_header_footer,
                },
                location: {
                    page_number: img.page_number,
                },
            });
            // Update the image record with the provenance ID
            updateImageProvenance(db.getConnection(), img.id, provenanceId);
            img.provenance_id = provenanceId;
        }
        catch (error) {
            console.error(`[WARN] Failed to create IMAGE provenance for ${img.id}: ${error instanceof Error ? error.message : String(error)}`);
            throw error;
        }
    }
    return insertedImages;
}
385
/**
 * Process a single document through the full OCR pipeline.
 *
 * Pipeline: OCR -> Extract Images -> Chunk -> Embed -> VLM -> Structured Extraction -> Complete
 *
 * This function is the core processing unit used by both handleProcessPending (batch)
 * and handleReprocess (single document). Extracting it prevents the race condition
 * where handleReprocess calls handleProcessPending and the target document may not
 * be claimed when other pending documents exist (M-11).
 *
 * Failure policy: OCR, image storage, chunking, embedding and structured
 * extraction errors propagate to the caller. Header/footer tagging and
 * extras enrichment are best-effort; their failures are logged and reported
 * through the returned warnings. Per-image VLM failures are logged only.
 *
 * @param doc - Document record (must already have status='processing')
 * @param params - Processing parameters:
 *   { db, vector, ocrMode, ocrOptions, pageSchema, imagesBaseDir }
 * @returns Array of non-fatal warning messages (empty when nothing degraded)
 * @throws On any fatal pipeline failure (see failure policy above)
 */
async function processOneDocument(doc, params) {
    const warnings = [];
    const { db, vector, ocrMode, ocrOptions, pageSchema, imagesBaseDir } = params;
    console.error(`[INFO] Processing document: ${doc.id} (${doc.file_name})`);
    // Step 1: OCR via Datalab
    // OCRProcessor.processDocument() throws on failure (FAIL-FAST).
    // It handles status='processing' internally and marks 'failed' before throwing.
    const ocrProcessor = new OCRProcessor(db);
    const processResult = await ocrProcessor.processDocument(doc.id, ocrMode, ocrOptions);
    // Get the OCR result
    const ocrResult = db.getOCRResultByDocumentId(doc.id);
    if (!ocrResult) {
        throw new Error('OCR result not found after processing');
    }
    console.error(`[INFO] OCR complete: ${ocrResult.text_length} chars, ${ocrResult.page_count} pages`);
    // Step 1.5: Extract and store images from OCR result (if any)
    let imageCount = 0;
    const imageOutputDir = resolve(imagesBaseDir, doc.id);
    if (processResult.images && Object.keys(processResult.images).length > 0) {
        const imageRefs = saveAndStoreImages(db, doc, ocrResult, processResult.images, imageOutputDir, processResult.jsonBlocks, processResult.pageOffsets);
        imageCount = imageRefs.length;
        console.error(`[INFO] Images from Datalab: ${imageCount}`);
    }
    // Step 1.6: File-based image extraction fallback
    // If Datalab didn't return images, extract directly from file (PDF or DOCX)
    if (imageCount === 0 &&
        !ocrOptions.disableImageExtraction &&
        ImageExtractor.isSupported(doc.file_path)) {
        console.error(`[INFO] No images from Datalab for ${doc.file_type} file, running file-based extraction`);
        const extractor = new ImageExtractor();
        const extractedImages = await extractor.extractImages(doc.file_path, {
            outputDir: imageOutputDir,
            minSize: 50,
            maxImages: 500,
        });
        if (extractedImages.length > 0) {
            // Build page classification from JSON blocks for header/footer detection
            const pageClassification = processResult.jsonBlocks
                ? buildPageBlockClassification(processResult.jsonBlocks)
                : new Map();
            const imageRefs = extractedImages.map((img) => {
                const contentHash = computeFileHashSync(img.path);
                const pageInfo = pageClassification.get(img.page);
                const isHeaderFooter = pageInfo !== undefined &&
                    !pageInfo.hasFigure &&
                    pageInfo.pictureInHeaderFooter > 0 &&
                    pageInfo.pictureInBody === 0;
                // Pass the OCR page offsets (as the Datalab image path does) so the
                // context comes from the page's exact text range when it is known.
                const contextText = extractContextText(ocrResult.extracted_text, ocrResult.page_count ?? 1, img.page, processResult.pageOffsets);
                return {
                    document_id: doc.id,
                    ocr_result_id: ocrResult.id,
                    page_number: img.page,
                    bounding_box: img.bbox,
                    image_index: img.index,
                    format: img.format,
                    dimensions: { width: img.width, height: img.height },
                    extracted_path: img.path,
                    file_size: img.size,
                    context_text: contextText || null,
                    provenance_id: null,
                    block_type: null, // File-based extraction has no block type
                    is_header_footer: isHeaderFooter,
                    content_hash: contentHash,
                };
            });
            const insertedImages = insertImageBatch(db.getConnection(), imageRefs);
            // Create IMAGE provenance records
            const tracker = getProvenanceTracker(db);
            for (const img of insertedImages) {
                try {
                    const provenanceId = tracker.createProvenance({
                        type: ProvenanceType.IMAGE,
                        source_type: 'IMAGE_EXTRACTION',
                        source_id: ocrResult.provenance_id,
                        root_document_id: doc.provenance_id,
                        content_hash: img.content_hash ??
                            (img.extracted_path && existsSync(img.extracted_path)
                                ? computeFileHashSync(img.extracted_path)
                                : computeHash(img.id)),
                        source_path: img.extracted_path ?? undefined,
                        processor: `${doc.file_type}-image-extraction`,
                        processor_version: '1.0.0',
                        processing_params: {
                            page_number: img.page_number,
                            image_index: img.image_index,
                            format: img.format,
                            extraction_method: 'file-based',
                            is_header_footer: img.is_header_footer,
                        },
                        location: {
                            page_number: img.page_number,
                        },
                    });
                    updateImageProvenance(db.getConnection(), img.id, provenanceId);
                }
                catch (provError) {
                    console.error(`[ERROR] Failed to create IMAGE provenance for ${img.id}: ` +
                        `${provError instanceof Error ? provError.message : String(provError)}`);
                    throw provError;
                }
            }
            imageCount = insertedImages.length;
            console.error(`[INFO] File-based extraction: ${imageCount} images`);
        }
        else {
            console.error(`[INFO] File-based extraction: no images found in document`);
        }
    }
    // Step 2: Chunk the OCR text using hybrid section-aware chunker
    const chunkConfig = {
        chunkSize: state.config.chunkSize,
        overlapPercent: state.config.chunkOverlapPercent,
        maxChunkSize: state.config.maxChunkSize,
    };
    let pageOffsets = processResult.pageOffsets ?? [];
    // Fallback: if Python returned a single page offset covering the entire text,
    // re-extract using TypeScript's extractPageOffsetsFromText which handles both
    // HTML comment (<!-- Page N -->) and Datalab ({N}---) separator formats.
    if (pageOffsets.length <= 1 && ocrResult.extracted_text.length > 0) {
        const extracted = extractPageOffsetsFromText(ocrResult.extracted_text);
        if (extracted.length > pageOffsets.length) {
            pageOffsets = extracted;
        }
    }
    const chunkResults = chunkHybridSectionAware(ocrResult.extracted_text, pageOffsets, processResult.jsonBlocks ?? null, chunkConfig);
    console.error(`[INFO] Chunking complete: ${chunkResults.length} chunks`);
    // Step 3: Store chunks in database with provenance
    const chunks = storeChunks(db, doc, ocrResult, chunkResults, chunkConfig);
    console.error(`[INFO] Chunks stored: ${chunks.length}`);
    // Step 3.4: Detect repeated headers/footers and tag matching chunks (T2.8)
    // Best-effort: a failure here becomes a warning, not a document failure.
    if (processResult.jsonBlocks) {
        try {
            const headerFooterInfo = detectRepeatedHeadersFooters(processResult.jsonBlocks);
            const allRepeated = [...headerFooterInfo.repeatedHeaders, ...headerFooterInfo.repeatedFooters];
            if (allRepeated.length > 0) {
                const conn = db.getConnection();
                // Reuse the system tag if it already exists, otherwise create it.
                let tagRow = conn.prepare("SELECT id FROM tags WHERE name = ?").get('system:repeated_header_footer');
                if (!tagRow) {
                    const tagId = uuidv4();
                    conn.prepare("INSERT INTO tags (id, name, description, color) VALUES (?, ?, ?, ?)").run(tagId, 'system:repeated_header_footer', 'Auto-detected repeated page header or footer content', '#888888');
                    tagRow = { id: tagId };
                }
                // Prepared on first match and reused for every later chunk.
                let insertEntityTag = null;
                let taggedCount = 0;
                for (const chunk of chunks) {
                    if (isRepeatedHeaderFooter(chunk.text, allRepeated)) {
                        if (!insertEntityTag) {
                            insertEntityTag = conn.prepare("INSERT OR IGNORE INTO entity_tags (id, tag_id, entity_id, entity_type) VALUES (?, ?, ?, 'chunk')");
                        }
                        const entityTagId = uuidv4();
                        insertEntityTag.run(entityTagId, tagRow.id, chunk.id);
                        taggedCount++;
                    }
                }
                console.error(`[T2.8] Tagged ${taggedCount} chunks as repeated header/footer (${allRepeated.length} patterns detected) for document ${doc.id}`);
            }
        }
        catch (tagError) {
            const tagErrMsg = tagError instanceof Error ? tagError.message : String(tagError);
            console.error(`[WARN] Header/footer tagging failed for ${doc.id}: ${tagErrMsg}`);
            warnings.push(`Header/footer auto-tagging failed: ${tagErrMsg}. Chunks stored but repeated headers/footers not tagged.`);
        }
    }
    // Step 3.5: Enrich extras_json with block stats, links, and structural fingerprint
    // (Tasks 4.1, 4.2, 4.4 - Ingestion Pipeline Enrichment)
    // Best-effort: a failure here becomes a warning, not a document failure.
    try {
        const existingExtras = ocrResult.extras_json
            ? JSON.parse(ocrResult.extras_json)
            : {};
        // Task 4.1: Block-type statistics from json_blocks
        const blockStats = computeBlockTypeStats(processResult.jsonBlocks ?? null);
        if (blockStats) {
            existingExtras.block_type_stats = blockStats;
        }
        // Task 4.2: Extract structured hyperlinks from Datalab metadata
        const metadataObj = (existingExtras.metadata ?? processResult.metadata ?? null);
        if (metadataObj) {
            // Datalab stores links under metadata.extras_features.links or metadata.links
            const extrasFeatures = metadataObj.extras_features;
            const rawLinks = (extrasFeatures?.links ?? metadataObj.links ?? null);
            if (Array.isArray(rawLinks) && rawLinks.length > 0) {
                // Keep only links that carry a non-empty url/href.
                const structuredLinks = rawLinks
                    .filter((link) => {
                    const url = (link.url ?? link.href ?? '');
                    return url.length > 0;
                })
                    .map((link) => ({
                    url: (link.url ?? link.href),
                    anchor_text: (link.anchor_text ?? link.text ?? link.title ?? ''),
                    page_number: (link.page_number ?? link.page ?? null),
                }));
                existingExtras.structured_links = structuredLinks;
                existingExtras.link_count = structuredLinks.length;
            }
            else {
                existingExtras.link_count = 0;
            }
        }
        // Task 4.4: Structural fingerprint from chunks
        const headingDepths = {};
        let totalChunkSize = 0;
        let atomicChunkCount = 0;
        let tableCount = 0;
        let figureCount = 0;
        const contentTypeDist = {};
        for (const cr of chunkResults) {
            totalChunkSize += cr.text.length;
            if (cr.isAtomic) {
                atomicChunkCount++;
            }
            // Count heading depths from heading level
            if (cr.headingLevel !== null && cr.headingLevel !== undefined) {
                const key = `h${cr.headingLevel}`;
                headingDepths[key] = (headingDepths[key] ?? 0) + 1;
            }
            // Count content types
            for (const ct of cr.contentTypes) {
                contentTypeDist[ct] = (contentTypeDist[ct] ?? 0) + 1;
                if (ct === 'Table' || ct === 'TableGroup') {
                    tableCount++;
                }
                if (ct === 'Figure' || ct === 'FigureGroup') {
                    figureCount++;
                }
            }
        }
        existingExtras.structural_fingerprint = {
            page_count: ocrResult.page_count ?? 0,
            chunk_count: chunkResults.length,
            table_count: tableCount,
            figure_count: figureCount,
            heading_depths: headingDepths,
            avg_chunk_size: chunkResults.length > 0
                ? Math.round(totalChunkSize / chunkResults.length)
                : 0,
            // Rounded to two decimal places.
            atomic_chunk_ratio: chunkResults.length > 0
                ? Math.round((atomicChunkCount / chunkResults.length) * 100) / 100
                : 0,
            content_type_distribution: contentTypeDist,
        };
        // Persist enriched extras_json back to ocr_results
        const updatedExtrasJson = JSON.stringify(existingExtras);
        db.getConnection()
            .prepare('UPDATE ocr_results SET extras_json = ? WHERE id = ?')
            .run(updatedExtrasJson, ocrResult.id);
        console.error(`[INFO] Extras enriched: block_stats=${blockStats ? 'yes' : 'no'}, ` +
            `links=${existingExtras.link_count ?? 0}, fingerprint=yes`);
    }
    catch (enrichError) {
        const enrichErrMsg = enrichError instanceof Error ? enrichError.message : String(enrichError);
        console.error(`[WARN] Extras enrichment failed for ${doc.id}: ${enrichErrMsg}`);
        warnings.push(`Metadata enrichment failed: ${enrichErrMsg}. Document complete but block stats, links, and structural fingerprint are missing.`);
    }
    // Step 4: Generate embeddings for text chunks
    const embeddingService = new EmbeddingService();
    const documentInfo = {
        documentId: doc.id,
        filePath: doc.file_path,
        fileName: doc.file_name,
        fileHash: doc.file_hash,
        documentProvenanceId: doc.provenance_id,
    };
    const embedResult = await embeddingService.embedDocumentChunks(db, vector, chunks, documentInfo);
    if (!embedResult.success) {
        throw new Error(embedResult.error ?? 'Embedding generation failed');
    }
    console.error(`[INFO] Embeddings complete: ${embedResult.embeddingIds.length} embeddings in ${embedResult.elapsedMs}ms`);
    // Step 5: VLM process images (generate 3+ paragraph descriptions)
    // Only run if document had images extracted.
    // VLM failures for individual images are logged as warnings but do NOT fail
    // the document -- OCR, chunking, and embeddings already succeeded. Each image
    // has its own vlm_status ('complete'|'failed'|'skipped') tracked independently.
    if (imageCount > 0) {
        const vlmPipeline = createVLMPipeline(db, vector, {
            batchSize: 5,
            concurrency: 3,
            minConfidence: 0.5,
        });
        const vlmResult = await vlmPipeline.processDocument(doc.id);
        console.error(`[INFO] VLM complete: ${vlmResult.successful}/${vlmResult.total} images processed, ` +
            `${vlmResult.skipped} skipped, ${vlmResult.failed} failed, ` +
            `${vlmResult.totalTokens} tokens used`);
        if (vlmResult.failed > 0) {
            const failedDetails = vlmResult.results
                .filter((r) => !r.success)
                .map((r) => `${r.imageId}: ${r.error ?? 'unknown error'}`)
                .join('; ');
            console.error(`[WARN] VLM processing failed for ${vlmResult.failed}/${vlmResult.total} images ` +
                `in document ${doc.id}. Individual images marked as failed; document will ` +
                `complete normally. Details: ${failedDetails}`);
        }
    }
    // Step 5.5: Store structured extraction if present
    // Errors propagate to fail the document (no swallowing)
    if (processResult.extractionJson && pageSchema) {
        const extractionContent = JSON.stringify(processResult.extractionJson);
        const extractionHash = computeHash(extractionContent);
        // Create EXTRACTION provenance record
        const extractionProvId = uuidv4();
        const ocrProvId = processResult.provenanceId;
        const docProvId = doc.provenance_id;
        const now = new Date().toISOString();
        db.insertProvenance({
            id: extractionProvId,
            type: ProvenanceType.EXTRACTION,
            created_at: now,
            processed_at: now,
            source_file_created_at: null,
            source_file_modified_at: null,
            source_type: 'EXTRACTION',
            source_path: doc.file_path,
            source_id: ocrProvId,
            root_document_id: docProvId,
            location: null,
            content_hash: extractionHash,
            input_hash: ocrResult.content_hash,
            file_hash: doc.file_hash,
            processor: 'datalab-extraction',
            processor_version: '1.0.0',
            processing_params: { page_schema: pageSchema },
            processing_duration_ms: null,
            processing_quality_score: null,
            parent_id: ocrProvId,
            parent_ids: JSON.stringify([docProvId, ocrProvId]),
            chain_depth: 2,
            chain_path: JSON.stringify(['DOCUMENT', 'OCR_RESULT', 'EXTRACTION']),
        });
        db.insertExtraction({
            id: uuidv4(),
            document_id: doc.id,
            ocr_result_id: ocrResult.id,
            schema_json: pageSchema,
            extraction_json: extractionContent,
            content_hash: extractionHash,
            provenance_id: extractionProvId,
            created_at: now,
        });
        console.error(`[INFO] Stored structured extraction for document ${doc.id}`);
    }
    // Step 5.6: Update document metadata if available
    if (processResult.docTitle || processResult.docAuthor || processResult.docSubject) {
        db.updateDocumentMetadata(doc.id, {
            docTitle: processResult.docTitle ?? null,
            docAuthor: processResult.docAuthor ?? null,
            docSubject: processResult.docSubject ?? null,
        });
    }
    // Step 6: Mark document complete (OCR + chunks + embeddings succeeded)
    // Note: Generation validation is handled by withDatabaseOperation() in the caller.
    db.updateDocumentStatus(doc.id, 'complete');
    console.error(`[INFO] Document ${doc.id} processing complete`);
    return warnings;
}
745
+ // ═══════════════════════════════════════════════════════════════════════════════
746
+ // INGESTION TOOL HANDLERS
747
+ // ═══════════════════════════════════════════════════════════════════════════════
748
/**
 * Handle ocr_ingest_directory - Ingest all documents from a directory.
 *
 * Walks `directory_path` (descending into subdirectories when `recursive` is set),
 * skips symlinks, and registers each file whose extension is in the allowed set:
 * a DOCUMENT provenance record plus a document row with status 'pending'.
 *
 * Per-file outcomes reported in `items`:
 *  - 'skipped'          same path with unchanged hash, or same hash under another path
 *  - 'version_updated'  known path whose content hash changed (new document row)
 *  - 'pending'          newly registered file
 *  - 'error'            any exception while handling that file (other files continue)
 *
 * @param {object} params - Raw tool input, validated against IngestDirectoryInput.
 *   Fields read here: directory_path, recursive, file_types.
 * @returns {Promise<object>} Formatted success response with counts and items, or
 *   the result of handleError() for failures outside the per-file loop.
 */
export async function handleIngestDirectory(params) {
    try {
        const input = validateInput(IngestDirectoryInput, params);
        const { db } = requireDatabase();
        const safeDirPath = sanitizePath(input.directory_path);
        // Validate directory exists - FAIL FAST
        if (!existsSync(safeDirPath)) {
            throw pathNotFoundError(safeDirPath);
        }
        const dirStats = statSync(safeDirPath);
        if (!dirStats.isDirectory()) {
            throw pathNotDirectoryError(safeDirPath);
        }
        // Discovered extensions are lower-cased with the dot removed, so bring the
        // allow-list into the same form; otherwise "PDF" or ".pdf" would silently
        // match no files.
        const allowedTypes = new Set((input.file_types ?? [...DEFAULT_FILE_TYPES]).map((type) => String(type).trim().replace(/^\./, '').toLowerCase()));
        const items = [];
        const collectFiles = (dirPath) => {
            const found = [];
            const entries = readdirSync(dirPath, { withFileTypes: true });
            for (const entry of entries) {
                const fullPath = resolve(dirPath, entry.name);
                try {
                    if (lstatSync(fullPath).isSymbolicLink()) {
                        console.error(`[WARN] Skipping symlink during ingestion: ${fullPath}`);
                        continue;
                    }
                }
                catch (error) {
                    console.error(`[WARN] Could not stat entry, skipping: ${fullPath}:`, error instanceof Error ? error.message : String(error));
                    continue;
                }
                if (entry.isDirectory() && input.recursive) {
                    found.push(...collectFiles(fullPath));
                }
                else if (entry.isFile()) {
                    const ext = extname(entry.name).slice(1).toLowerCase();
                    if (allowedTypes.has(ext)) {
                        found.push(fullPath);
                    }
                }
            }
            return found;
        };
        const files = collectFiles(safeDirPath);
        // Ingest each file
        for (const filePath of files) {
            try {
                // Check if already ingested by path
                const existingByPath = db.getDocumentByPath(filePath);
                const stats = statSync(filePath);
                const fileHash = await hashFile(filePath);
                if (existingByPath) {
                    if (fileHash === existingByPath.file_hash) {
                        items.push({
                            file_path: filePath,
                            file_name: basename(filePath),
                            document_id: existingByPath.id,
                            status: 'skipped',
                            error_message: 'Already ingested, content unchanged',
                        });
                        continue;
                    }
                    // Version change detected - continue with normal ingestion flow below
                    console.error(`[Ingestion] Version update detected for ${filePath}: ${existingByPath.file_hash} -> ${fileHash}`);
                }
                else {
                    // Check for duplicate by file hash (same content, different path)
                    const existingByHash = db.getDocumentByHash(fileHash);
                    if (existingByHash) {
                        items.push({
                            file_path: filePath,
                            file_name: basename(filePath),
                            document_id: existingByHash.id,
                            status: 'skipped',
                            error_message: `Duplicate file (same hash as ${existingByHash.file_path})`,
                        });
                        continue;
                    }
                }
                // Determine if this is a version update
                const isVersionUpdate = !!existingByPath;
                // Create document record
                const documentId = uuidv4();
                const provenanceId = uuidv4();
                const now = new Date().toISOString();
                const ext = extname(filePath).slice(1).toLowerCase();
                // Create document provenance (chain root: depth 0, no parents)
                db.insertProvenance({
                    id: provenanceId,
                    type: ProvenanceType.DOCUMENT,
                    created_at: now,
                    processed_at: now,
                    source_file_created_at: null,
                    source_file_modified_at: null,
                    source_type: 'FILE',
                    source_path: filePath,
                    source_id: null,
                    root_document_id: provenanceId,
                    location: null,
                    content_hash: fileHash,
                    input_hash: null,
                    file_hash: fileHash,
                    processor: 'file-scanner',
                    processor_version: '1.0.0',
                    processing_params: {
                        directory_path: safeDirPath,
                        recursive: input.recursive,
                        ...(isVersionUpdate ? { previous_version_id: existingByPath.id } : {}),
                    },
                    processing_duration_ms: null,
                    processing_quality_score: null,
                    parent_id: null,
                    parent_ids: '[]',
                    chain_depth: 0,
                    chain_path: '["DOCUMENT"]',
                });
                // Insert document
                db.insertDocument({
                    id: documentId,
                    file_path: filePath,
                    file_name: basename(filePath),
                    file_hash: fileHash,
                    file_size: stats.size,
                    file_type: ext,
                    status: 'pending',
                    page_count: null,
                    provenance_id: provenanceId,
                    error_message: null,
                    modified_at: null,
                    ocr_completed_at: null,
                    doc_title: null,
                    doc_author: null,
                    doc_subject: null,
                    datalab_file_id: null,
                });
                items.push({
                    file_path: filePath,
                    file_name: basename(filePath),
                    document_id: documentId,
                    status: isVersionUpdate ? 'version_updated' : 'pending',
                    ...(isVersionUpdate ? { previous_version_id: existingByPath.id } : {}),
                });
            }
            catch (error) {
                // One unreadable or rejected file must not abort the rest of the scan.
                const errorMsg = error instanceof Error ? error.message : String(error);
                console.error(`[ERROR] Failed to ingest ${filePath}: ${errorMsg}`);
                items.push({
                    file_path: filePath,
                    file_name: basename(filePath),
                    document_id: '',
                    status: 'error',
                    error_message: errorMsg,
                });
            }
        }
        const result = {
            directory_path: safeDirPath,
            files_found: files.length,
            files_ingested: items.filter((i) => i.status === 'pending').length,
            files_version_updated: items.filter((i) => i.status === 'version_updated').length,
            files_skipped: items.filter((i) => i.status === 'skipped').length,
            files_errored: items.filter((i) => i.status === 'error').length,
            items,
            next_steps: [
                { tool: 'ocr_process_pending', description: 'Run OCR pipeline on the ingested files' },
            ],
        };
        return formatResponse(successResult(result));
    }
    catch (error) {
        return handleError(error);
    }
}
923
/**
 * Handle ocr_ingest_files - Ingest specific files.
 *
 * Each entry of `file_paths` is handled independently and produces one item:
 *  - 'error'            path rejected by sanitizePath, missing, not a regular file,
 *                       unsupported extension, or any exception during ingestion
 *  - 'skipped'          same path with unchanged hash, or same hash under another path
 *  - 'version_updated'  known path whose content hash changed (new document row)
 *  - 'pending'          newly registered file
 *
 * For accepted files a DOCUMENT provenance record and a document row with status
 * 'pending' are inserted.
 *
 * @param {object} params - Raw tool input, validated against IngestFilesInput.
 *   Field read here: file_paths.
 * @returns {Promise<object>} Formatted success response with counts and items, or
 *   the result of handleError() for failures outside the per-file loop.
 */
export async function handleIngestFiles(params) {
    try {
        const input = validateInput(IngestFilesInput, params);
        const { db } = requireDatabase();
        const items = [];
        for (const rawFilePath of input.file_paths) {
            // Falls back to the raw path for error reporting if sanitization itself fails.
            let filePath = rawFilePath;
            try {
                // Sanitize inside the per-file try: a single rejected path becomes an
                // 'error' item instead of aborting the call after earlier files were
                // already inserted.
                filePath = sanitizePath(rawFilePath);
                // Validate file exists - FAIL FAST
                if (!existsSync(filePath)) {
                    items.push({
                        file_path: filePath,
                        file_name: basename(filePath),
                        document_id: '',
                        status: 'error',
                        error_message: 'File not found',
                    });
                    continue;
                }
                const stats = statSync(filePath);
                if (!stats.isFile()) {
                    items.push({
                        file_path: filePath,
                        file_name: basename(filePath),
                        document_id: '',
                        status: 'error',
                        error_message: 'Path is not a file',
                    });
                    continue;
                }
                // Validate file type is supported
                const ext = extname(filePath).slice(1).toLowerCase();
                if (!DEFAULT_FILE_TYPES.includes(ext)) {
                    items.push({
                        file_path: filePath,
                        file_name: basename(filePath),
                        document_id: '',
                        status: 'error',
                        error_message: `Unsupported file type: .${ext}. Supported: ${DEFAULT_FILE_TYPES.join(', ')}`,
                    });
                    continue;
                }
                // Check if already ingested
                const existingByPath = db.getDocumentByPath(filePath);
                const fileHash = await hashFile(filePath);
                if (existingByPath) {
                    if (fileHash === existingByPath.file_hash) {
                        items.push({
                            file_path: filePath,
                            file_name: basename(filePath),
                            document_id: existingByPath.id,
                            status: 'skipped',
                            error_message: 'Already ingested, content unchanged',
                        });
                        continue;
                    }
                    // Version change detected - continue with normal ingestion flow below
                    console.error(`[Ingestion] Version update detected for ${filePath}: ${existingByPath.file_hash} -> ${fileHash}`);
                }
                else {
                    // Check for duplicate by file hash (same content, different path)
                    const existingByHash = db.getDocumentByHash(fileHash);
                    if (existingByHash) {
                        items.push({
                            file_path: filePath,
                            file_name: basename(filePath),
                            document_id: existingByHash.id,
                            status: 'skipped',
                            error_message: `Duplicate file (same hash as ${existingByHash.file_path})`,
                        });
                        continue;
                    }
                }
                // Determine if this is a version update
                const isVersionUpdate = !!existingByPath;
                // Create document record identifiers only once the file is accepted
                const documentId = uuidv4();
                const provenanceId = uuidv4();
                const now = new Date().toISOString();
                // Create document provenance (chain root: depth 0, no parents)
                db.insertProvenance({
                    id: provenanceId,
                    type: ProvenanceType.DOCUMENT,
                    created_at: now,
                    processed_at: now,
                    source_file_created_at: null,
                    source_file_modified_at: null,
                    source_type: 'FILE',
                    source_path: filePath,
                    source_id: null,
                    root_document_id: provenanceId,
                    location: null,
                    content_hash: fileHash,
                    input_hash: null,
                    file_hash: fileHash,
                    processor: 'file-scanner',
                    processor_version: '1.0.0',
                    processing_params: isVersionUpdate ? { previous_version_id: existingByPath.id } : {},
                    processing_duration_ms: null,
                    processing_quality_score: null,
                    parent_id: null,
                    parent_ids: '[]',
                    chain_depth: 0,
                    chain_path: '["DOCUMENT"]',
                });
                // Insert document
                db.insertDocument({
                    id: documentId,
                    file_path: filePath,
                    file_name: basename(filePath),
                    file_hash: fileHash,
                    file_size: stats.size,
                    file_type: ext,
                    status: 'pending',
                    page_count: null,
                    provenance_id: provenanceId,
                    error_message: null,
                    modified_at: null,
                    ocr_completed_at: null,
                    doc_title: null,
                    doc_author: null,
                    doc_subject: null,
                    datalab_file_id: null,
                });
                items.push({
                    file_path: filePath,
                    file_name: basename(filePath),
                    document_id: documentId,
                    status: isVersionUpdate ? 'version_updated' : 'pending',
                    ...(isVersionUpdate ? { previous_version_id: existingByPath.id } : {}),
                });
            }
            catch (error) {
                const errorMsg = error instanceof Error ? error.message : String(error);
                console.error(`[ERROR] Failed to ingest ${filePath}: ${errorMsg}`);
                items.push({
                    file_path: filePath,
                    file_name: basename(filePath),
                    document_id: '',
                    status: 'error',
                    error_message: errorMsg,
                });
            }
        }
        return formatResponse(successResult({
            files_ingested: items.filter((i) => i.status === 'pending').length,
            files_version_updated: items.filter((i) => i.status === 'version_updated').length,
            files_skipped: items.filter((i) => i.status === 'skipped').length,
            files_errored: items.filter((i) => i.status === 'error').length,
            items,
            next_steps: [
                { tool: 'ocr_process_pending', description: 'Run OCR pipeline on the ingested files' },
            ],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
1085
/**
 * Handle ocr_process_pending - Process pending documents through full OCR pipeline
 *
 * Pipeline: OCR -> Extract Images -> Chunk -> Embed -> VLM Process Images -> Complete
 * Provenance chain: DOCUMENT(0) -> OCR_RESULT(1) -> CHUNK(2)/IMAGE(2) -> EMBEDDING(3)/VLM_DESC(3)
 *
 * Claims up to `max_concurrent` (default 3) of the oldest 'pending' documents, runs
 * processOneDocument on each in parallel, marks failures as 'failed' after cleaning
 * their partial derived data, and optionally triggers auto-clustering when the
 * configuration enables it.
 *
 * @param {object} params - Raw tool input, validated against ProcessPendingInput.
 * @returns {Promise<object>} Formatted success response (batch id, duration,
 *   processed/failed/remaining counts, optional errors/warnings/auto_clustering),
 *   or the result of handleError() on failure.
 */
export async function handleProcessPending(params) {
    try {
        const input = validateInput(ProcessPendingInput, params);
        if (!process.env.DATALAB_API_KEY) {
            throw new Error('DATALAB_API_KEY environment variable is required for OCR processing');
        }
        // H-1/H-2: Use withDatabaseOperation to track this long-running async operation.
        // This prevents database switches while processing is in-flight and validates
        // generation on completion.
        return await withDatabaseOperation(async ({ db, vector, generation }) => {
            // FIX-P1-2: max_concurrent bounds both how many documents are claimed and
            // how many are processed in parallel.
            const maxConcurrent = input.max_concurrent ?? 3;
            const conn = db.getConnection();
            // Document claiming (F-INTEG-3): pick the oldest pending candidates, then
            // flip each one to 'processing' with a guarded UPDATE. Only rows this call
            // actually changed are processed. Listing every document in 'processing'
            // instead would also pick up documents claimed by a concurrent call or by
            // a reprocess run, and process them a second time.
            const candidateRows = conn
                .prepare(`SELECT id FROM documents WHERE status = 'pending' ORDER BY created_at ASC LIMIT ?`)
                .all(maxConcurrent);
            const claimStmt = conn.prepare(`UPDATE documents SET status = 'processing', modified_at = ? WHERE id = ? AND status = 'pending'`);
            const claimedAt = new Date().toISOString();
            const pendingDocs = [];
            for (const { id } of candidateRows) {
                // changes === 0 means another caller claimed the row first.
                if (claimStmt.run(claimedAt, id).changes !== 1) {
                    continue;
                }
                const claimedDoc = db.getDocument(id);
                if (claimedDoc) {
                    pendingDocs.push(claimedDoc);
                }
            }
            if (pendingDocs.length === 0) {
                return formatResponse(successResult({
                    processed: 0,
                    failed: 0,
                    remaining: 0,
                    message: 'No pending documents to process',
                    next_steps: [{ tool: 'ocr_status', description: 'Check overall processing status' }],
                }));
            }
            const ocrMode = input.ocr_mode ?? state.config.defaultOCRMode;
            const ocrOptions = {
                maxPages: input.max_pages,
                pageRange: input.page_range,
                skipCache: input.skip_cache,
                disableImageExtraction: input.disable_image_extraction,
                extras: input.extras,
                pageSchema: input.page_schema,
                additionalConfig: input.additional_config,
            };
            const results = {
                processed: 0,
                failed: 0,
                errors: [],
                warnings: [],
            };
            const batchId = uuidv4();
            const batchStartTime = Date.now();
            console.error(`[INFO] Batch ${batchId}: processing ${pendingDocs.length} documents`);
            // Default images output directory
            const imagesBaseDir = resolve(state.config.defaultStoragePath, 'images');
            // Build shared processing params for the module-level processOneDocument function
            const processingParams = {
                db,
                vector,
                generation,
                ocrMode,
                ocrOptions,
                pageSchema: input.page_schema,
                imagesBaseDir,
            };
            // Wrapper that handles per-document error tracking and cleanup
            const processDocWithTracking = async (doc) => {
                try {
                    const docWarnings = await processOneDocument(doc, processingParams);
                    results.processed++;
                    if (docWarnings.length > 0) {
                        results.warnings.push({ document_id: doc.id, warnings: docWarnings });
                    }
                }
                catch (error) {
                    const errorMsg = error instanceof Error ? error.message : String(error);
                    console.error(`[ERROR] Document ${doc.id} failed: ${errorMsg}`);
                    // F-INTEG-1: Clean up partial derived data (orphaned chunks, embeddings)
                    // before marking as failed, so a retry starts from a clean state.
                    try {
                        db.cleanDocumentDerivedData(doc.id);
                        console.error(`[INFO] Cleaned partial data for failed document ${doc.id}`);
                    }
                    catch (cleanupError) {
                        const cleanupMsg = cleanupError instanceof Error ? cleanupError.message : String(cleanupError);
                        console.error(`[WARN] Cleanup of partial data failed for ${doc.id}: ${cleanupMsg}`);
                    }
                    db.updateDocumentStatus(doc.id, 'failed', errorMsg);
                    results.failed++;
                    results.errors.push({ document_id: doc.id, error: errorMsg });
                }
                // Only available when Node runs with --expose-gc
                if (typeof global.gc === 'function') {
                    global.gc();
                }
            };
            // FIX-P1-2: Execute documents in parallel batches
            for (let batchStart = 0; batchStart < pendingDocs.length; batchStart += maxConcurrent) {
                const batch = pendingDocs.slice(batchStart, batchStart + maxConcurrent);
                if (batch.length > 1) {
                    console.error(`[INFO] Processing document batch ${Math.floor(batchStart / maxConcurrent) + 1}: ` +
                        `${batch.length} documents (${batchStart + 1}-${batchStart + batch.length} of ${pendingDocs.length})`);
                }
                const outcomes = await Promise.allSettled(batch.map(processDocWithTracking));
                // The wrapper only rejects when its own failure bookkeeping throws
                // (e.g. the status update); surface that instead of dropping it.
                outcomes.forEach((outcome, index) => {
                    if (outcome.status !== 'rejected') {
                        return;
                    }
                    const failedDoc = batch[index];
                    const reason = outcome.reason instanceof Error ? outcome.reason.message : String(outcome.reason);
                    console.error(`[ERROR] Could not record outcome for document ${failedDoc.id}: ${reason}`);
                    results.failed++;
                    results.errors.push({ document_id: failedDoc.id, error: reason });
                });
            }
            // Get remaining count - CRITICAL: use 'status' not 'statusFilter'
            const remaining = db.listDocuments({ status: 'pending' }).length;
            // Auto-clustering check
            let autoClusterResult;
            const config = getConfig();
            if (config.autoClusterEnabled && results.processed > 0) {
                const totalDocs = conn.prepare('SELECT COUNT(*) as cnt FROM documents WHERE status = ?').get('complete').cnt;
                const threshold = config.autoClusterThreshold ?? 10;
                // Check if we have enough docs and no recent clustering run
                const lastCluster = conn.prepare('SELECT MAX(created_at) as latest FROM clusters').get();
                const lastClusterDate = lastCluster?.latest ? new Date(lastCluster.latest) : null;
                const hoursSinceLastCluster = lastClusterDate ? (Date.now() - lastClusterDate.getTime()) / 3600000 : Infinity;
                if (totalDocs >= threshold && hoursSinceLastCluster > 1) {
                    try {
                        const { runClustering } = await import('../services/clustering/clustering-service.js');
                        const algorithm = config.autoClusterAlgorithm ?? 'hdbscan';
                        const clusterResult = await runClustering(db, vector, { algorithm, n_clusters: null, min_cluster_size: 3, distance_threshold: null, linkage: 'average' });
                        autoClusterResult = { triggered: true, run_id: clusterResult.run_id, clusters: clusterResult.n_clusters, algorithm };
                        console.error(`[Ingestion] Auto-clustering triggered: ${clusterResult.n_clusters} clusters via ${algorithm}`);
                    }
                    catch (e) {
                        // Clustering is best-effort: report the failure without failing the batch.
                        console.error(`[Ingestion] Auto-clustering failed: ${e instanceof Error ? e.message : String(e)}`);
                        autoClusterResult = { triggered: true, error: e instanceof Error ? e.message : String(e) };
                    }
                }
            }
            // Build response
            const response = {
                batch_id: batchId,
                batch_duration_ms: Date.now() - batchStartTime,
                processed: results.processed,
                failed: results.failed,
                remaining,
                errors: results.errors.length > 0 ? results.errors : undefined,
                warnings: results.warnings.length > 0 ? results.warnings : undefined,
            };
            response.next_steps = [
                { tool: 'ocr_search', description: 'Search across all processed documents' },
                { tool: 'ocr_document_list', description: 'Browse all documents in the database' },
            ];
            try {
                const totalDocCount = db
                    .getConnection()
                    .prepare('SELECT COUNT(*) as cnt FROM documents WHERE status = ?')
                    .get('complete').cnt;
                // Comparison only makes sense with at least two completed documents.
                if (totalDocCount > 1) {
                    response.next_steps.push({ tool: 'ocr_document_compare', description: 'Compare differences between documents' });
                }
            }
            catch (error) {
                console.error(`[Ingestion] Failed to query document count for auto-compare hint: ${String(error)}`);
            }
            if (autoClusterResult) {
                response.auto_clustering = autoClusterResult;
            }
            return formatResponse(successResult(response));
        });
    }
    catch (error) {
        return handleError(error);
    }
}
1255
/**
 * Handle ocr_status - Get OCR processing status.
 *
 * With `document_id`: returns that single document and a one-document summary.
 * Without it: returns up to 1000 documents (optionally filtered by
 * `status_filter`) together with database-wide counts from db.getStats().
 *
 * @param {object} params - Raw tool input, validated against OCRStatusInput.
 * @returns {Promise<object>} Formatted success response, or the result of
 *   handleError() (including when the requested document does not exist).
 */
export async function handleOCRStatus(params) {
    try {
        const input = validateInput(OCRStatusInput, params);
        const { db } = requireDatabase();
        // Shared projection of a document row into the response shape.
        const toStatusEntry = (record) => ({
            document_id: record.id,
            file_name: record.file_name,
            status: record.status,
            page_count: record.page_count,
            error_message: record.error_message ?? undefined,
            created_at: record.created_at,
        });
        if (input.document_id) {
            const doc = db.getDocument(input.document_id);
            if (!doc) {
                throw documentNotFoundError(input.document_id);
            }
            // 1 when the document is in the given state, otherwise 0.
            const countIf = (statusName) => (doc.status === statusName ? 1 : 0);
            const singleSummary = {
                total: 1,
                pending: countIf('pending'),
                processing: countIf('processing'),
                complete: countIf('complete'),
                failed: countIf('failed'),
            };
            return formatResponse(successResult({
                documents: [toStatusEntry(doc)],
                summary: singleSummary,
                next_steps: [
                    { tool: 'ocr_document_get', description: 'View full document details and metadata' },
                    { tool: 'ocr_process_pending', description: 'Process documents still pending OCR' },
                ],
            }));
        }
        // Map filter values - CRITICAL: use 'status' not 'statusFilter' for listDocuments
        const filterMap = {
            pending: 'pending',
            processing: 'processing',
            complete: 'complete',
            failed: 'failed',
            all: undefined,
        };
        const requestedFilter = input.status_filter ?? 'all';
        const listOptions = {
            status: filterMap[requestedFilter],
            limit: 1000,
        };
        const documents = db.listDocuments(listOptions);
        const stats = db.getStats();
        const byStatus = stats.documents_by_status;
        return formatResponse(successResult({
            documents: documents.map(toStatusEntry),
            summary: {
                total: stats.total_documents,
                pending: byStatus.pending,
                processing: byStatus.processing,
                complete: byStatus.complete,
                failed: byStatus.failed,
            },
            supplementary: {
                total_chunks: stats.total_chunks,
                total_embeddings: stats.total_embeddings,
                total_extractions: stats.total_extractions,
                total_form_fills: stats.total_form_fills,
                ocr_quality: stats.ocr_quality,
                costs: stats.costs,
            },
            next_steps: [
                { tool: 'ocr_process_pending', description: 'Process documents still pending OCR' },
                { tool: 'ocr_retry_failed', description: 'Reset failed documents for reprocessing' },
            ],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
1339
/**
 * Handle ocr_retry_failed - Reset failed documents back to pending for reprocessing
 *
 * Each reset first calls db.cleanDocumentDerivedData() and then sets the status to
 * 'pending', so reprocessing does not produce duplicate derived data.
 *
 * With `document_id`, only that document is considered (and only if it is currently
 * 'failed'); otherwise the failed documents returned by listDocuments (limit 1000)
 * are reset.
 *
 * @param {object} params - Raw tool input, validated against RetryFailedInput.
 * @returns {Promise<object>} Formatted success response with the reset count, or
 *   the result of handleError() (including when the document does not exist).
 */
export async function handleRetryFailed(params) {
    try {
        const input = validateInput(RetryFailedInput, params);
        const { db } = requireDatabase();
        // Clean all derived data before resetting to pending
        const resetToPending = (documentId) => {
            db.cleanDocumentDerivedData(documentId);
            db.updateDocumentStatus(documentId, 'pending');
        };
        let resetCount = 0;
        if (input.document_id) {
            const target = db.getDocument(input.document_id);
            if (!target) {
                throw documentNotFoundError(input.document_id);
            }
            if (target.status !== 'failed') {
                // Nothing to reset: report the current state instead of touching the document.
                return formatResponse(successResult({
                    reset: 0,
                    message: `Document ${input.document_id} is not in failed state (current: ${target.status})`,
                    next_steps: [{ tool: 'ocr_status', description: 'Check document processing status' }],
                }));
            }
            resetToPending(input.document_id);
            resetCount = 1;
        }
        else {
            const failedDocs = db.listDocuments({ status: 'failed', limit: 1000 });
            for (const failedDoc of failedDocs) {
                resetToPending(failedDoc.id);
                resetCount += 1;
            }
        }
        return formatResponse(successResult({
            reset: resetCount,
            message: `Reset ${resetCount} failed document(s) to pending (derived data cleaned)`,
            next_steps: [
                { tool: 'ocr_process_pending', description: 'Process the reset documents' },
                { tool: 'ocr_status', description: 'Check processing status after retry' },
            ],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
1389
+ // ═══════════════════════════════════════════════════════════════════════════════
1390
+ // RAW CONVERSION HANDLER (AI-4)
1391
+ // ═══════════════════════════════════════════════════════════════════════════════
1392
/**
 * Handle ocr_convert_raw - Convert a document via OCR and return raw results
 * without storing in database. Quick one-off conversions.
 *
 * The caller-supplied path goes through sanitizePath() before any filesystem
 * access, matching the ingestion handlers in this module; the sanitized path is
 * what gets checked, sent to the OCR client, and echoed in the response.
 *
 * @param {object} params - Raw tool input: file_path (required), ocr_mode
 *   ('fast' | 'balanced' | 'accurate', default 'balanced'), max_pages (1-7000,
 *   optional), page_range (optional string).
 * @returns {Promise<object>} Formatted success response with the markdown and
 *   the metadata/quality/cost/duration values returned by the client, or the
 *   result of handleError() on failure.
 */
async function handleConvertRaw(params) {
    try {
        const input = validateInput(z.object({
            file_path: z.string().min(1),
            ocr_mode: z.enum(['fast', 'balanced', 'accurate']).default('balanced'),
            max_pages: z.number().int().min(1).max(7000).optional(),
            page_range: z.string().optional(),
        }), params);
        const safeFilePath = sanitizePath(input.file_path);
        // Verify file exists - FAIL FAST
        if (!existsSync(safeFilePath)) {
            throw new Error(`File not found: ${safeFilePath}`);
        }
        const stats = statSync(safeFilePath);
        if (!stats.isFile()) {
            throw new Error(`Not a file: ${safeFilePath}`);
        }
        // Use DatalabClient directly without DB storage
        const client = new DatalabClient();
        const result = await client.processRaw(safeFilePath, input.ocr_mode, {
            maxPages: input.max_pages,
            pageRange: input.page_range,
        });
        return formatResponse(successResult({
            file_path: safeFilePath,
            text_length: result.markdown.length,
            page_count: result.pageCount,
            markdown: result.markdown,
            metadata: result.metadata ?? {},
            quality_score: result.qualityScore,
            cost_cents: result.costCents,
            processing_duration_ms: result.durationMs,
            next_steps: [
                { tool: 'ocr_ingest_files', description: 'Ingest the file for full pipeline processing' },
            ],
        }));
    }
    catch (error) {
        return handleError(error);
    }
}
1436
+ // ═══════════════════════════════════════════════════════════════════════════════
1437
+ // REPROCESS HANDLER
1438
+ // ═══════════════════════════════════════════════════════════════════════════════
1439
/**
 * Handle ocr_reprocess - Reprocess a document with different OCR settings
 * Cleans all derived data first, then re-runs the pipeline.
 *
 * M-11 FIX: the target document is claimed directly (status set to 'processing')
 * and passed to the module-level processOneDocument function, rather than being
 * set to 'pending' and left to handleProcessPending(), whose batch claim might
 * pick other pending documents instead.
 *
 * @param {object} params - Raw tool input: document_id (required), ocr_mode
 *   ('fast' | 'balanced' | 'accurate', optional), skip_cache (default true).
 * @returns {Promise<object>} Formatted success response with previous/new quality
 *   scores and duration, or the result of handleError() on failure. On a pipeline
 *   failure the document is marked 'failed' before the error is reported.
 */
async function handleReprocess(params) {
    try {
        const reprocessSchema = z.object({
            document_id: z.string().min(1),
            ocr_mode: z.enum(['fast', 'balanced', 'accurate']).optional(),
            skip_cache: z.boolean().default(true),
        });
        const input = validateInput(reprocessSchema, params);
        if (!process.env.DATALAB_API_KEY) {
            throw new Error('DATALAB_API_KEY environment variable is required for OCR processing');
        }
        // H-1/H-2: Use withDatabaseOperation to track this long-running async operation.
        return await withDatabaseOperation(async ({ db, vector, generation }) => {
            const doc = db.getDocument(input.document_id);
            if (!doc) {
                throw documentNotFoundError(input.document_id);
            }
            const isReprocessable = doc.status === 'complete' || doc.status === 'failed';
            if (!isReprocessable) {
                throw new Error(`Document status must be 'complete' or 'failed' to reprocess (current: ${doc.status})`);
            }
            // Remember the old quality score before the OCR result is deleted below.
            const previousQuality = db.getOCRResultByDocumentId(doc.id)?.parse_quality_score ?? null;
            // Clean all derived data (chunks, embeddings, images, ocr_results, extractions)
            db.cleanDocumentDerivedData(doc.id);
            // M-11 FIX: claim exactly this document; no batch claiming involved.
            db.updateDocumentStatus(doc.id, 'processing');
            const ocrMode = input.ocr_mode ?? state.config.defaultOCRMode;
            const imagesBaseDir = resolve(state.config.defaultStoragePath, 'images');
            const startTime = Date.now();
            const pipelineParams = {
                db,
                vector,
                generation,
                ocrMode,
                ocrOptions: {
                    skipCache: input.skip_cache,
                },
                imagesBaseDir,
            };
            let reprocessWarnings = [];
            try {
                reprocessWarnings = await processOneDocument(doc, pipelineParams);
            }
            catch (error) {
                const errorMsg = error instanceof Error ? error.message : String(error);
                console.error(`[ERROR] Reprocess failed for document ${doc.id}: ${errorMsg}`);
                // Clean up partial data and mark as failed
                try {
                    db.cleanDocumentDerivedData(doc.id);
                }
                catch (cleanupError) {
                    const cleanupMsg = cleanupError instanceof Error ? cleanupError.message : String(cleanupError);
                    console.error(`[WARN] Cleanup of partial data failed for ${doc.id}: ` + `${cleanupMsg}`);
                }
                db.updateDocumentStatus(doc.id, 'failed', errorMsg);
                throw error;
            }
            // Get new quality score
            const newQuality = db.getOCRResultByDocumentId(doc.id)?.parse_quality_score ?? null;
            // The delta is only reported when both scores exist; it is a 2-decimal string.
            let qualityChange = null;
            if (previousQuality !== null && newQuality !== null) {
                qualityChange = (newQuality - previousQuality).toFixed(2);
            }
            const payload = {
                document_id: doc.id,
                previous_quality: previousQuality,
                new_quality: newQuality,
                quality_change: qualityChange,
                processing_duration_ms: Date.now() - startTime,
                ...(reprocessWarnings.length > 0 ? { warnings: reprocessWarnings } : {}),
                next_steps: [
                    { tool: 'ocr_status', description: 'Check processing status' },
                    { tool: 'ocr_document_get', description: 'View updated document details' },
                ],
            };
            return formatResponse(successResult(payload));
        });
    }
    catch (error) {
        return handleError(error);
    }
}
1530
+ // ═══════════════════════════════════════════════════════════════════════════════
1531
+ // TOOL DEFINITIONS FOR MCP REGISTRATION
1532
+ // ═══════════════════════════════════════════════════════════════════════════════
1533
/**
 * Ingestion tools collection for MCP server registration.
 *
 * Each key is the tool name exposed to clients. Each entry provides:
 *   - `description`: human-readable guidance shown to the caller,
 *   - `inputSchema`: a record of parameter validators built with `z`,
 *   - `handler`: the function in this module that services the call.
 */
export const ingestionTools = {
    // Bulk ingestion: scan a directory for supported files.
    ocr_ingest_directory: {
        description: '[PROCESSING] Use to bulk-ingest all supported files from a directory. Returns per-file status (pending/skipped/error). Follow with ocr_process_pending to run OCR.',
        inputSchema: {
            directory_path: z.string().min(1).describe('Path to directory to scan'),
            recursive: z.boolean().default(true).describe('Scan subdirectories'),
            file_types: z
                .array(z.string())
                .optional()
                .describe('File types to include (default: pdf, png, jpg, docx, etc.)'),
        },
        handler: handleIngestDirectory,
    },
    // Targeted ingestion: at least one non-empty path is required.
    ocr_ingest_files: {
        description: '[ESSENTIAL] Use to ingest specific files by path into the current database. Returns per-file status. Follow with ocr_process_pending to run OCR.',
        inputSchema: {
            file_paths: z.array(z.string().min(1)).min(1).describe('Array of file paths to ingest'),
        },
        handler: handleIngestFiles,
    },
    // Runs the pipeline over documents that were ingested but not yet processed.
    ocr_process_pending: {
        description: '[ESSENTIAL] Use after ingesting files to run the full OCR pipeline (OCR, chunking, embedding, VLM). Returns processed/failed counts. Requires DATALAB_API_KEY.',
        inputSchema: {
            max_concurrent: z
                .number()
                .int()
                .min(1)
                .max(10)
                .default(3)
                .describe('Maximum concurrent OCR operations'),
            ocr_mode: z
                .enum(['fast', 'balanced', 'accurate'])
                .optional()
                .describe('OCR processing mode override'),
            max_pages: z
                .number()
                .int()
                .min(1)
                .max(7000)
                .optional()
                .describe('Maximum pages to process per document (Datalab limit: 7000)'),
            // Only digits, commas, hyphens and whitespace are accepted.
            page_range: z
                .string()
                .regex(/^[0-9,\-\s]+$/)
                .optional()
                .describe('Specific pages to process, 0-indexed (e.g., "0-5,10")'),
            skip_cache: z.boolean().optional().describe('Force reprocessing, skip Datalab cache'),
            disable_image_extraction: z
                .boolean()
                .optional()
                .describe('Skip image extraction for text-only processing'),
            extras: z
                .array(z.enum([
                    'track_changes',
                    'chart_understanding',
                    'extract_links',
                    'table_row_bboxes',
                    'infographic',
                    'new_block_types',
                ]))
                .optional()
                .describe('Extra Datalab features to enable'),
            page_schema: z
                .string()
                .optional()
                .describe('JSON schema string for structured data extraction per page'),
            additional_config: z
                .record(z.unknown())
                .optional()
                .describe('Additional Datalab config: keep_pageheader_in_output, keep_pagefooter_in_output, keep_spreadsheet_formatting'),
        },
        handler: handleProcessPending,
    },
    // Status lookup for one document or for all documents matching a status.
    ocr_status: {
        description: '[STATUS] Use to check processing status of documents (pending/processing/complete/failed). Returns per-document status and summary counts.',
        inputSchema: {
            document_id: z.string().optional().describe('Specific document ID to check'),
            status_filter: z
                .enum(['pending', 'processing', 'complete', 'failed', 'all'])
                .default('all')
                .describe('Filter by status'),
        },
        handler: handleOCRStatus,
    },
    // Resets failed documents to pending; omit document_id to retry all failed.
    ocr_retry_failed: {
        description: '[PROCESSING] Use to reset failed documents back to pending for reprocessing. Cleans derived data first. Follow with ocr_process_pending.',
        inputSchema: {
            document_id: z
                .string()
                .optional()
                .describe('Specific document ID to retry (omit to retry all failed)'),
        },
        handler: handleRetryFailed,
    },
    // Re-runs OCR for a single document; skip_cache defaults to true here.
    ocr_reprocess: {
        description: '[PROCESSING] Use to re-run OCR on a document with different settings. Cleans existing data first. Returns quality comparison (before/after).',
        inputSchema: {
            document_id: z.string().min(1).describe('Document ID to reprocess'),
            ocr_mode: z.enum(['fast', 'balanced', 'accurate']).optional().describe('OCR mode override'),
            skip_cache: z
                .boolean()
                .default(true)
                .describe('Skip Datalab cache (default: true for reprocessing)'),
        },
        handler: handleReprocess,
    },
    // One-off conversion that does not create database records.
    ocr_convert_raw: {
        description: '[PROCESSING] Use when you need a quick OCR preview of a file without creating database records. Converts a file to markdown text via Datalab API and returns the raw result. Use ocr_ingest_files + ocr_process_pending instead for full pipeline processing.',
        inputSchema: {
            file_path: z.string().min(1).describe('Path to file to convert'),
            ocr_mode: z
                .enum(['fast', 'balanced', 'accurate'])
                .default('balanced')
                .describe('OCR processing mode'),
            max_pages: z.number().int().min(1).max(7000).optional().describe('Maximum pages to process'),
            // Apply the same character whitelist used by
            // ocr_process_pending.page_range so that malformed ranges are
            // rejected during schema validation rather than reaching the handler.
            page_range: z
                .string()
                .regex(/^[0-9,\-\s]+$/)
                .optional()
                .describe('Specific pages to process (0-indexed, e.g., "0-5,10")'),
        },
        handler: handleConvertRaw,
    },
};
1659
+ //# sourceMappingURL=ingestion.js.map