ocr-provenance-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocr-provenance-mcp might be problematic. Click here for more details.

Files changed (578) hide show
  1. package/.env.example +55 -0
  2. package/LICENSE +78 -0
  3. package/README.md +1154 -0
  4. package/dist/bin-http.d.ts +24 -0
  5. package/dist/bin-http.d.ts.map +1 -0
  6. package/dist/bin-http.js +275 -0
  7. package/dist/bin-http.js.map +1 -0
  8. package/dist/bin-setup.d.ts +11 -0
  9. package/dist/bin-setup.d.ts.map +1 -0
  10. package/dist/bin-setup.js +610 -0
  11. package/dist/bin-setup.js.map +1 -0
  12. package/dist/bin.d.ts +16 -0
  13. package/dist/bin.d.ts.map +1 -0
  14. package/dist/bin.js +16 -0
  15. package/dist/bin.js.map +1 -0
  16. package/dist/index.d.ts +13 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +90 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/models/chunk.d.ts +136 -0
  21. package/dist/models/chunk.d.ts.map +1 -0
  22. package/dist/models/chunk.js +27 -0
  23. package/dist/models/chunk.js.map +1 -0
  24. package/dist/models/cluster.d.ts +79 -0
  25. package/dist/models/cluster.d.ts.map +1 -0
  26. package/dist/models/cluster.js +10 -0
  27. package/dist/models/cluster.js.map +1 -0
  28. package/dist/models/comparison.d.ts +62 -0
  29. package/dist/models/comparison.d.ts.map +1 -0
  30. package/dist/models/comparison.js +8 -0
  31. package/dist/models/comparison.js.map +1 -0
  32. package/dist/models/document.d.ts +104 -0
  33. package/dist/models/document.d.ts.map +1 -0
  34. package/dist/models/document.js +15 -0
  35. package/dist/models/document.js.map +1 -0
  36. package/dist/models/embedding.d.ts +87 -0
  37. package/dist/models/embedding.d.ts.map +1 -0
  38. package/dist/models/embedding.js +23 -0
  39. package/dist/models/embedding.js.map +1 -0
  40. package/dist/models/extraction.d.ts +15 -0
  41. package/dist/models/extraction.d.ts.map +1 -0
  42. package/dist/models/extraction.js +2 -0
  43. package/dist/models/extraction.js.map +1 -0
  44. package/dist/models/form-fill.d.ts +23 -0
  45. package/dist/models/form-fill.d.ts.map +1 -0
  46. package/dist/models/form-fill.js +2 -0
  47. package/dist/models/form-fill.js.map +1 -0
  48. package/dist/models/image.d.ts +177 -0
  49. package/dist/models/image.d.ts.map +1 -0
  50. package/dist/models/image.js +8 -0
  51. package/dist/models/image.js.map +1 -0
  52. package/dist/models/index.d.ts +14 -0
  53. package/dist/models/index.d.ts.map +1 -0
  54. package/dist/models/index.js +22 -0
  55. package/dist/models/index.js.map +1 -0
  56. package/dist/models/provenance.d.ts +174 -0
  57. package/dist/models/provenance.d.ts.map +1 -0
  58. package/dist/models/provenance.js +53 -0
  59. package/dist/models/provenance.js.map +1 -0
  60. package/dist/models/uploaded-file.d.ts +20 -0
  61. package/dist/models/uploaded-file.d.ts.map +1 -0
  62. package/dist/models/uploaded-file.js +2 -0
  63. package/dist/models/uploaded-file.js.map +1 -0
  64. package/dist/server/errors.d.ts +93 -0
  65. package/dist/server/errors.d.ts.map +1 -0
  66. package/dist/server/errors.js +256 -0
  67. package/dist/server/errors.js.map +1 -0
  68. package/dist/server/events.d.ts +36 -0
  69. package/dist/server/events.d.ts.map +1 -0
  70. package/dist/server/events.js +48 -0
  71. package/dist/server/events.js.map +1 -0
  72. package/dist/server/permissions.d.ts +26 -0
  73. package/dist/server/permissions.d.ts.map +1 -0
  74. package/dist/server/permissions.js +194 -0
  75. package/dist/server/permissions.js.map +1 -0
  76. package/dist/server/register-tools.d.ts +25 -0
  77. package/dist/server/register-tools.d.ts.map +1 -0
  78. package/dist/server/register-tools.js +102 -0
  79. package/dist/server/register-tools.js.map +1 -0
  80. package/dist/server/startup.d.ts +16 -0
  81. package/dist/server/startup.d.ts.map +1 -0
  82. package/dist/server/startup.js +37 -0
  83. package/dist/server/startup.js.map +1 -0
  84. package/dist/server/state.d.ts +166 -0
  85. package/dist/server/state.d.ts.map +1 -0
  86. package/dist/server/state.js +424 -0
  87. package/dist/server/state.js.map +1 -0
  88. package/dist/server/transports/http-transport.d.ts +37 -0
  89. package/dist/server/transports/http-transport.d.ts.map +1 -0
  90. package/dist/server/transports/http-transport.js +204 -0
  91. package/dist/server/transports/http-transport.js.map +1 -0
  92. package/dist/server/transports/index.d.ts +9 -0
  93. package/dist/server/transports/index.d.ts.map +1 -0
  94. package/dist/server/transports/index.js +9 -0
  95. package/dist/server/transports/index.js.map +1 -0
  96. package/dist/server/transports/session-manager.d.ts +40 -0
  97. package/dist/server/transports/session-manager.d.ts.map +1 -0
  98. package/dist/server/transports/session-manager.js +74 -0
  99. package/dist/server/transports/session-manager.js.map +1 -0
  100. package/dist/server/types.d.ts +82 -0
  101. package/dist/server/types.d.ts.map +1 -0
  102. package/dist/server/types.js +14 -0
  103. package/dist/server/types.js.map +1 -0
  104. package/dist/services/audit.d.ts +26 -0
  105. package/dist/services/audit.d.ts.map +1 -0
  106. package/dist/services/audit.js +43 -0
  107. package/dist/services/audit.js.map +1 -0
  108. package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
  109. package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
  110. package/dist/services/chunking/chunk-deduplicator.js +46 -0
  111. package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
  112. package/dist/services/chunking/chunk-merger.d.ts +26 -0
  113. package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
  114. package/dist/services/chunking/chunk-merger.js +94 -0
  115. package/dist/services/chunking/chunk-merger.js.map +1 -0
  116. package/dist/services/chunking/chunker.d.ts +62 -0
  117. package/dist/services/chunking/chunker.d.ts.map +1 -0
  118. package/dist/services/chunking/chunker.js +566 -0
  119. package/dist/services/chunking/chunker.js.map +1 -0
  120. package/dist/services/chunking/heading-normalizer.d.ts +33 -0
  121. package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
  122. package/dist/services/chunking/heading-normalizer.js +101 -0
  123. package/dist/services/chunking/heading-normalizer.js.map +1 -0
  124. package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
  125. package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
  126. package/dist/services/chunking/json-block-analyzer.js +1033 -0
  127. package/dist/services/chunking/json-block-analyzer.js.map +1 -0
  128. package/dist/services/chunking/markdown-parser.d.ts +75 -0
  129. package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
  130. package/dist/services/chunking/markdown-parser.js +428 -0
  131. package/dist/services/chunking/markdown-parser.js.map +1 -0
  132. package/dist/services/chunking/text-normalizer.d.ts +20 -0
  133. package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
  134. package/dist/services/chunking/text-normalizer.js +36 -0
  135. package/dist/services/chunking/text-normalizer.js.map +1 -0
  136. package/dist/services/clm/contract-schemas.d.ts +36 -0
  137. package/dist/services/clm/contract-schemas.d.ts.map +1 -0
  138. package/dist/services/clm/contract-schemas.js +92 -0
  139. package/dist/services/clm/contract-schemas.js.map +1 -0
  140. package/dist/services/clm/summarization.d.ts +46 -0
  141. package/dist/services/clm/summarization.d.ts.map +1 -0
  142. package/dist/services/clm/summarization.js +61 -0
  143. package/dist/services/clm/summarization.js.map +1 -0
  144. package/dist/services/clustering/clustering-service.d.ts +58 -0
  145. package/dist/services/clustering/clustering-service.d.ts.map +1 -0
  146. package/dist/services/clustering/clustering-service.js +467 -0
  147. package/dist/services/clustering/clustering-service.js.map +1 -0
  148. package/dist/services/comparison/diff-service.d.ts +41 -0
  149. package/dist/services/comparison/diff-service.d.ts.map +1 -0
  150. package/dist/services/comparison/diff-service.js +120 -0
  151. package/dist/services/comparison/diff-service.js.map +1 -0
  152. package/dist/services/embedding/embedder.d.ts +55 -0
  153. package/dist/services/embedding/embedder.d.ts.map +1 -0
  154. package/dist/services/embedding/embedder.js +202 -0
  155. package/dist/services/embedding/embedder.js.map +1 -0
  156. package/dist/services/embedding/nomic.d.ts +67 -0
  157. package/dist/services/embedding/nomic.d.ts.map +1 -0
  158. package/dist/services/embedding/nomic.js +280 -0
  159. package/dist/services/embedding/nomic.js.map +1 -0
  160. package/dist/services/gemini/circuit-breaker.d.ts +106 -0
  161. package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
  162. package/dist/services/gemini/circuit-breaker.js +237 -0
  163. package/dist/services/gemini/circuit-breaker.js.map +1 -0
  164. package/dist/services/gemini/client.d.ts +173 -0
  165. package/dist/services/gemini/client.d.ts.map +1 -0
  166. package/dist/services/gemini/client.js +483 -0
  167. package/dist/services/gemini/client.js.map +1 -0
  168. package/dist/services/gemini/config.d.ts +116 -0
  169. package/dist/services/gemini/config.d.ts.map +1 -0
  170. package/dist/services/gemini/config.js +118 -0
  171. package/dist/services/gemini/config.js.map +1 -0
  172. package/dist/services/gemini/index.d.ts +9 -0
  173. package/dist/services/gemini/index.d.ts.map +1 -0
  174. package/dist/services/gemini/index.js +13 -0
  175. package/dist/services/gemini/index.js.map +1 -0
  176. package/dist/services/gemini/rate-limiter.d.ts +62 -0
  177. package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
  178. package/dist/services/gemini/rate-limiter.js +120 -0
  179. package/dist/services/gemini/rate-limiter.js.map +1 -0
  180. package/dist/services/images/extractor.d.ts +88 -0
  181. package/dist/services/images/extractor.d.ts.map +1 -0
  182. package/dist/services/images/extractor.js +340 -0
  183. package/dist/services/images/extractor.js.map +1 -0
  184. package/dist/services/images/optimizer.d.ts +130 -0
  185. package/dist/services/images/optimizer.d.ts.map +1 -0
  186. package/dist/services/images/optimizer.js +228 -0
  187. package/dist/services/images/optimizer.js.map +1 -0
  188. package/dist/services/ocr/datalab.d.ts +64 -0
  189. package/dist/services/ocr/datalab.d.ts.map +1 -0
  190. package/dist/services/ocr/datalab.js +425 -0
  191. package/dist/services/ocr/datalab.js.map +1 -0
  192. package/dist/services/ocr/errors.d.ts +38 -0
  193. package/dist/services/ocr/errors.d.ts.map +1 -0
  194. package/dist/services/ocr/errors.js +83 -0
  195. package/dist/services/ocr/errors.js.map +1 -0
  196. package/dist/services/ocr/file-manager.d.ts +76 -0
  197. package/dist/services/ocr/file-manager.d.ts.map +1 -0
  198. package/dist/services/ocr/file-manager.js +238 -0
  199. package/dist/services/ocr/file-manager.js.map +1 -0
  200. package/dist/services/ocr/form-fill.d.ts +48 -0
  201. package/dist/services/ocr/form-fill.d.ts.map +1 -0
  202. package/dist/services/ocr/form-fill.js +213 -0
  203. package/dist/services/ocr/form-fill.js.map +1 -0
  204. package/dist/services/ocr/processor.d.ts +95 -0
  205. package/dist/services/ocr/processor.d.ts.map +1 -0
  206. package/dist/services/ocr/processor.js +259 -0
  207. package/dist/services/ocr/processor.js.map +1 -0
  208. package/dist/services/provenance/agent-metadata.d.ts +82 -0
  209. package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
  210. package/dist/services/provenance/agent-metadata.js +106 -0
  211. package/dist/services/provenance/agent-metadata.js.map +1 -0
  212. package/dist/services/provenance/chain-hash.d.ts +57 -0
  213. package/dist/services/provenance/chain-hash.d.ts.map +1 -0
  214. package/dist/services/provenance/chain-hash.js +131 -0
  215. package/dist/services/provenance/chain-hash.js.map +1 -0
  216. package/dist/services/provenance/exporter.d.ts +202 -0
  217. package/dist/services/provenance/exporter.d.ts.map +1 -0
  218. package/dist/services/provenance/exporter.js +457 -0
  219. package/dist/services/provenance/exporter.js.map +1 -0
  220. package/dist/services/provenance/index.d.ts +15 -0
  221. package/dist/services/provenance/index.d.ts.map +1 -0
  222. package/dist/services/provenance/index.js +17 -0
  223. package/dist/services/provenance/index.js.map +1 -0
  224. package/dist/services/provenance/tracker.d.ts +138 -0
  225. package/dist/services/provenance/tracker.d.ts.map +1 -0
  226. package/dist/services/provenance/tracker.js +293 -0
  227. package/dist/services/provenance/tracker.js.map +1 -0
  228. package/dist/services/provenance/verifier.d.ts +153 -0
  229. package/dist/services/provenance/verifier.d.ts.map +1 -0
  230. package/dist/services/provenance/verifier.js +536 -0
  231. package/dist/services/provenance/verifier.js.map +1 -0
  232. package/dist/services/python-pool.d.ts +70 -0
  233. package/dist/services/python-pool.d.ts.map +1 -0
  234. package/dist/services/python-pool.js +265 -0
  235. package/dist/services/python-pool.js.map +1 -0
  236. package/dist/services/search/bm25.d.ts +180 -0
  237. package/dist/services/search/bm25.d.ts.map +1 -0
  238. package/dist/services/search/bm25.js +656 -0
  239. package/dist/services/search/bm25.js.map +1 -0
  240. package/dist/services/search/fusion.d.ts +103 -0
  241. package/dist/services/search/fusion.d.ts.map +1 -0
  242. package/dist/services/search/fusion.js +122 -0
  243. package/dist/services/search/fusion.js.map +1 -0
  244. package/dist/services/search/local-reranker.d.ts +30 -0
  245. package/dist/services/search/local-reranker.d.ts.map +1 -0
  246. package/dist/services/search/local-reranker.js +123 -0
  247. package/dist/services/search/local-reranker.js.map +1 -0
  248. package/dist/services/search/quality.d.ts +11 -0
  249. package/dist/services/search/quality.d.ts.map +1 -0
  250. package/dist/services/search/quality.js +17 -0
  251. package/dist/services/search/quality.js.map +1 -0
  252. package/dist/services/search/query-classifier.d.ts +34 -0
  253. package/dist/services/search/query-classifier.d.ts.map +1 -0
  254. package/dist/services/search/query-classifier.js +114 -0
  255. package/dist/services/search/query-classifier.js.map +1 -0
  256. package/dist/services/search/query-expander.d.ts +73 -0
  257. package/dist/services/search/query-expander.d.ts.map +1 -0
  258. package/dist/services/search/query-expander.js +281 -0
  259. package/dist/services/search/query-expander.js.map +1 -0
  260. package/dist/services/search/reranker.d.ts +44 -0
  261. package/dist/services/search/reranker.d.ts.map +1 -0
  262. package/dist/services/search/reranker.js +101 -0
  263. package/dist/services/search/reranker.js.map +1 -0
  264. package/dist/services/storage/database/annotation-operations.d.ts +113 -0
  265. package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
  266. package/dist/services/storage/database/annotation-operations.js +177 -0
  267. package/dist/services/storage/database/annotation-operations.js.map +1 -0
  268. package/dist/services/storage/database/approval-operations.d.ts +132 -0
  269. package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
  270. package/dist/services/storage/database/approval-operations.js +206 -0
  271. package/dist/services/storage/database/approval-operations.js.map +1 -0
  272. package/dist/services/storage/database/chunk-operations.d.ts +132 -0
  273. package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
  274. package/dist/services/storage/database/chunk-operations.js +306 -0
  275. package/dist/services/storage/database/chunk-operations.js.map +1 -0
  276. package/dist/services/storage/database/cluster-operations.d.ts +97 -0
  277. package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
  278. package/dist/services/storage/database/cluster-operations.js +258 -0
  279. package/dist/services/storage/database/cluster-operations.js.map +1 -0
  280. package/dist/services/storage/database/comparison-operations.d.ts +41 -0
  281. package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
  282. package/dist/services/storage/database/comparison-operations.js +65 -0
  283. package/dist/services/storage/database/comparison-operations.js.map +1 -0
  284. package/dist/services/storage/database/converters.d.ts +36 -0
  285. package/dist/services/storage/database/converters.d.ts.map +1 -0
  286. package/dist/services/storage/database/converters.js +244 -0
  287. package/dist/services/storage/database/converters.js.map +1 -0
  288. package/dist/services/storage/database/document-operations.d.ts +145 -0
  289. package/dist/services/storage/database/document-operations.d.ts.map +1 -0
  290. package/dist/services/storage/database/document-operations.js +498 -0
  291. package/dist/services/storage/database/document-operations.js.map +1 -0
  292. package/dist/services/storage/database/embedding-operations.d.ts +130 -0
  293. package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
  294. package/dist/services/storage/database/embedding-operations.js +315 -0
  295. package/dist/services/storage/database/embedding-operations.js.map +1 -0
  296. package/dist/services/storage/database/extraction-operations.d.ts +47 -0
  297. package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
  298. package/dist/services/storage/database/extraction-operations.js +85 -0
  299. package/dist/services/storage/database/extraction-operations.js.map +1 -0
  300. package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
  301. package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
  302. package/dist/services/storage/database/form-fill-operations.js +116 -0
  303. package/dist/services/storage/database/form-fill-operations.js.map +1 -0
  304. package/dist/services/storage/database/helpers.d.ts +29 -0
  305. package/dist/services/storage/database/helpers.d.ts.map +1 -0
  306. package/dist/services/storage/database/helpers.js +55 -0
  307. package/dist/services/storage/database/helpers.js.map +1 -0
  308. package/dist/services/storage/database/image-operations.d.ts +202 -0
  309. package/dist/services/storage/database/image-operations.d.ts.map +1 -0
  310. package/dist/services/storage/database/image-operations.js +484 -0
  311. package/dist/services/storage/database/image-operations.js.map +1 -0
  312. package/dist/services/storage/database/index.d.ts +13 -0
  313. package/dist/services/storage/database/index.d.ts.map +1 -0
  314. package/dist/services/storage/database/index.js +16 -0
  315. package/dist/services/storage/database/index.js.map +1 -0
  316. package/dist/services/storage/database/lock-operations.d.ts +59 -0
  317. package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
  318. package/dist/services/storage/database/lock-operations.js +89 -0
  319. package/dist/services/storage/database/lock-operations.js.map +1 -0
  320. package/dist/services/storage/database/obligation-operations.d.ts +88 -0
  321. package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
  322. package/dist/services/storage/database/obligation-operations.js +206 -0
  323. package/dist/services/storage/database/obligation-operations.js.map +1 -0
  324. package/dist/services/storage/database/ocr-operations.d.ts +33 -0
  325. package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
  326. package/dist/services/storage/database/ocr-operations.js +70 -0
  327. package/dist/services/storage/database/ocr-operations.js.map +1 -0
  328. package/dist/services/storage/database/playbook-operations.d.ts +72 -0
  329. package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
  330. package/dist/services/storage/database/playbook-operations.js +247 -0
  331. package/dist/services/storage/database/playbook-operations.js.map +1 -0
  332. package/dist/services/storage/database/provenance-operations.d.ts +112 -0
  333. package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
  334. package/dist/services/storage/database/provenance-operations.js +251 -0
  335. package/dist/services/storage/database/provenance-operations.js.map +1 -0
  336. package/dist/services/storage/database/service.d.ts +142 -0
  337. package/dist/services/storage/database/service.d.ts.map +1 -0
  338. package/dist/services/storage/database/service.js +310 -0
  339. package/dist/services/storage/database/service.js.map +1 -0
  340. package/dist/services/storage/database/static-operations.d.ts +30 -0
  341. package/dist/services/storage/database/static-operations.d.ts.map +1 -0
  342. package/dist/services/storage/database/static-operations.js +218 -0
  343. package/dist/services/storage/database/static-operations.js.map +1 -0
  344. package/dist/services/storage/database/stats-operations.d.ts +101 -0
  345. package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
  346. package/dist/services/storage/database/stats-operations.js +394 -0
  347. package/dist/services/storage/database/stats-operations.js.map +1 -0
  348. package/dist/services/storage/database/tag-operations.d.ts +76 -0
  349. package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
  350. package/dist/services/storage/database/tag-operations.js +178 -0
  351. package/dist/services/storage/database/tag-operations.js.map +1 -0
  352. package/dist/services/storage/database/types.d.ts +286 -0
  353. package/dist/services/storage/database/types.d.ts.map +1 -0
  354. package/dist/services/storage/database/types.js +39 -0
  355. package/dist/services/storage/database/types.js.map +1 -0
  356. package/dist/services/storage/database/upload-operations.d.ts +71 -0
  357. package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
  358. package/dist/services/storage/database/upload-operations.js +124 -0
  359. package/dist/services/storage/database/upload-operations.js.map +1 -0
  360. package/dist/services/storage/database/user-operations.d.ts +102 -0
  361. package/dist/services/storage/database/user-operations.d.ts.map +1 -0
  362. package/dist/services/storage/database/user-operations.js +151 -0
  363. package/dist/services/storage/database/user-operations.js.map +1 -0
  364. package/dist/services/storage/database/workflow-operations.d.ts +98 -0
  365. package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
  366. package/dist/services/storage/database/workflow-operations.js +157 -0
  367. package/dist/services/storage/database/workflow-operations.js.map +1 -0
  368. package/dist/services/storage/database.d.ts +16 -0
  369. package/dist/services/storage/database.d.ts.map +1 -0
  370. package/dist/services/storage/database.js +15 -0
  371. package/dist/services/storage/database.js.map +1 -0
  372. package/dist/services/storage/index.d.ts +10 -0
  373. package/dist/services/storage/index.d.ts.map +1 -0
  374. package/dist/services/storage/index.js +10 -0
  375. package/dist/services/storage/index.js.map +1 -0
  376. package/dist/services/storage/migrations/index.d.ts +16 -0
  377. package/dist/services/storage/migrations/index.d.ts.map +1 -0
  378. package/dist/services/storage/migrations/index.js +20 -0
  379. package/dist/services/storage/migrations/index.js.map +1 -0
  380. package/dist/services/storage/migrations/operations.d.ts +40 -0
  381. package/dist/services/storage/migrations/operations.d.ts.map +1 -0
  382. package/dist/services/storage/migrations/operations.js +2910 -0
  383. package/dist/services/storage/migrations/operations.js.map +1 -0
  384. package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
  385. package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
  386. package/dist/services/storage/migrations/schema-definitions.js +1006 -0
  387. package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
  388. package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
  389. package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
  390. package/dist/services/storage/migrations/schema-helpers.js +176 -0
  391. package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
  392. package/dist/services/storage/migrations/types.d.ts +15 -0
  393. package/dist/services/storage/migrations/types.d.ts.map +1 -0
  394. package/dist/services/storage/migrations/types.js +21 -0
  395. package/dist/services/storage/migrations/types.js.map +1 -0
  396. package/dist/services/storage/migrations/verification.d.ts +20 -0
  397. package/dist/services/storage/migrations/verification.d.ts.map +1 -0
  398. package/dist/services/storage/migrations/verification.js +78 -0
  399. package/dist/services/storage/migrations/verification.js.map +1 -0
  400. package/dist/services/storage/migrations.d.ts +16 -0
  401. package/dist/services/storage/migrations.d.ts.map +1 -0
  402. package/dist/services/storage/migrations.js +17 -0
  403. package/dist/services/storage/migrations.js.map +1 -0
  404. package/dist/services/storage/types.d.ts +12 -0
  405. package/dist/services/storage/types.d.ts.map +1 -0
  406. package/dist/services/storage/types.js +5 -0
  407. package/dist/services/storage/types.js.map +1 -0
  408. package/dist/services/storage/vector.d.ts +208 -0
  409. package/dist/services/storage/vector.d.ts.map +1 -0
  410. package/dist/services/storage/vector.js +526 -0
  411. package/dist/services/storage/vector.js.map +1 -0
  412. package/dist/services/vlm/pipeline.d.ts +194 -0
  413. package/dist/services/vlm/pipeline.d.ts.map +1 -0
  414. package/dist/services/vlm/pipeline.js +800 -0
  415. package/dist/services/vlm/pipeline.js.map +1 -0
  416. package/dist/services/vlm/prompts.d.ts +171 -0
  417. package/dist/services/vlm/prompts.d.ts.map +1 -0
  418. package/dist/services/vlm/prompts.js +229 -0
  419. package/dist/services/vlm/prompts.js.map +1 -0
  420. package/dist/services/vlm/service.d.ts +174 -0
  421. package/dist/services/vlm/service.d.ts.map +1 -0
  422. package/dist/services/vlm/service.js +256 -0
  423. package/dist/services/vlm/service.js.map +1 -0
  424. package/dist/services/webhook-delivery.d.ts +4 -0
  425. package/dist/services/webhook-delivery.d.ts.map +1 -0
  426. package/dist/services/webhook-delivery.js +140 -0
  427. package/dist/services/webhook-delivery.js.map +1 -0
  428. package/dist/tools/chunks.d.ts +19 -0
  429. package/dist/tools/chunks.d.ts.map +1 -0
  430. package/dist/tools/chunks.js +392 -0
  431. package/dist/tools/chunks.js.map +1 -0
  432. package/dist/tools/clm.d.ts +16 -0
  433. package/dist/tools/clm.d.ts.map +1 -0
  434. package/dist/tools/clm.js +668 -0
  435. package/dist/tools/clm.js.map +1 -0
  436. package/dist/tools/clustering.d.ts +13 -0
  437. package/dist/tools/clustering.d.ts.map +1 -0
  438. package/dist/tools/clustering.js +498 -0
  439. package/dist/tools/clustering.js.map +1 -0
  440. package/dist/tools/collaboration.d.ts +15 -0
  441. package/dist/tools/collaboration.d.ts.map +1 -0
  442. package/dist/tools/collaboration.js +516 -0
  443. package/dist/tools/collaboration.js.map +1 -0
  444. package/dist/tools/comparison.d.ts +13 -0
  445. package/dist/tools/comparison.d.ts.map +1 -0
  446. package/dist/tools/comparison.js +735 -0
  447. package/dist/tools/comparison.js.map +1 -0
  448. package/dist/tools/compliance.d.ts +15 -0
  449. package/dist/tools/compliance.d.ts.map +1 -0
  450. package/dist/tools/compliance.js +640 -0
  451. package/dist/tools/compliance.js.map +1 -0
  452. package/dist/tools/config.d.ts +19 -0
  453. package/dist/tools/config.d.ts.map +1 -0
  454. package/dist/tools/config.js +213 -0
  455. package/dist/tools/config.js.map +1 -0
  456. package/dist/tools/database.d.ts +62 -0
  457. package/dist/tools/database.d.ts.map +1 -0
  458. package/dist/tools/database.js +288 -0
  459. package/dist/tools/database.js.map +1 -0
  460. package/dist/tools/documents.d.ts +61 -0
  461. package/dist/tools/documents.d.ts.map +1 -0
  462. package/dist/tools/documents.js +1624 -0
  463. package/dist/tools/documents.js.map +1 -0
  464. package/dist/tools/embeddings.d.ts +14 -0
  465. package/dist/tools/embeddings.d.ts.map +1 -0
  466. package/dist/tools/embeddings.js +626 -0
  467. package/dist/tools/embeddings.js.map +1 -0
  468. package/dist/tools/evaluation.d.ts +25 -0
  469. package/dist/tools/evaluation.d.ts.map +1 -0
  470. package/dist/tools/evaluation.js +523 -0
  471. package/dist/tools/evaluation.js.map +1 -0
  472. package/dist/tools/events.d.ts +16 -0
  473. package/dist/tools/events.d.ts.map +1 -0
  474. package/dist/tools/events.js +493 -0
  475. package/dist/tools/events.js.map +1 -0
  476. package/dist/tools/extraction-structured.d.ts +13 -0
  477. package/dist/tools/extraction-structured.d.ts.map +1 -0
  478. package/dist/tools/extraction-structured.js +390 -0
  479. package/dist/tools/extraction-structured.js.map +1 -0
  480. package/dist/tools/extraction.d.ts +24 -0
  481. package/dist/tools/extraction.d.ts.map +1 -0
  482. package/dist/tools/extraction.js +424 -0
  483. package/dist/tools/extraction.js.map +1 -0
  484. package/dist/tools/file-management.d.ts +14 -0
  485. package/dist/tools/file-management.d.ts.map +1 -0
  486. package/dist/tools/file-management.js +523 -0
  487. package/dist/tools/file-management.js.map +1 -0
  488. package/dist/tools/form-fill.d.ts +13 -0
  489. package/dist/tools/form-fill.d.ts.map +1 -0
  490. package/dist/tools/form-fill.js +250 -0
  491. package/dist/tools/form-fill.js.map +1 -0
  492. package/dist/tools/health.d.ts +19 -0
  493. package/dist/tools/health.d.ts.map +1 -0
  494. package/dist/tools/health.js +229 -0
  495. package/dist/tools/health.js.map +1 -0
  496. package/dist/tools/images.d.ts +54 -0
  497. package/dist/tools/images.d.ts.map +1 -0
  498. package/dist/tools/images.js +787 -0
  499. package/dist/tools/images.js.map +1 -0
  500. package/dist/tools/ingestion.d.ts +94 -0
  501. package/dist/tools/ingestion.d.ts.map +1 -0
  502. package/dist/tools/ingestion.js +1659 -0
  503. package/dist/tools/ingestion.js.map +1 -0
  504. package/dist/tools/intelligence.d.ts +18 -0
  505. package/dist/tools/intelligence.d.ts.map +1 -0
  506. package/dist/tools/intelligence.js +1039 -0
  507. package/dist/tools/intelligence.js.map +1 -0
  508. package/dist/tools/provenance.d.ts +51 -0
  509. package/dist/tools/provenance.d.ts.map +1 -0
  510. package/dist/tools/provenance.js +691 -0
  511. package/dist/tools/provenance.js.map +1 -0
  512. package/dist/tools/reports.d.ts +41 -0
  513. package/dist/tools/reports.d.ts.map +1 -0
  514. package/dist/tools/reports.js +1394 -0
  515. package/dist/tools/reports.js.map +1 -0
  516. package/dist/tools/search.d.ts +35 -0
  517. package/dist/tools/search.d.ts.map +1 -0
  518. package/dist/tools/search.js +2528 -0
  519. package/dist/tools/search.js.map +1 -0
  520. package/dist/tools/shared.d.ts +52 -0
  521. package/dist/tools/shared.d.ts.map +1 -0
  522. package/dist/tools/shared.js +54 -0
  523. package/dist/tools/shared.js.map +1 -0
  524. package/dist/tools/tags.d.ts +15 -0
  525. package/dist/tools/tags.d.ts.map +1 -0
  526. package/dist/tools/tags.js +287 -0
  527. package/dist/tools/tags.js.map +1 -0
  528. package/dist/tools/timeline.d.ts +15 -0
  529. package/dist/tools/timeline.d.ts.map +1 -0
  530. package/dist/tools/timeline.js +14 -0
  531. package/dist/tools/timeline.js.map +1 -0
  532. package/dist/tools/users.d.ts +14 -0
  533. package/dist/tools/users.d.ts.map +1 -0
  534. package/dist/tools/users.js +257 -0
  535. package/dist/tools/users.js.map +1 -0
  536. package/dist/tools/vlm.d.ts +40 -0
  537. package/dist/tools/vlm.d.ts.map +1 -0
  538. package/dist/tools/vlm.js +475 -0
  539. package/dist/tools/vlm.js.map +1 -0
  540. package/dist/tools/workflow.d.ts +16 -0
  541. package/dist/tools/workflow.d.ts.map +1 -0
  542. package/dist/tools/workflow.js +495 -0
  543. package/dist/tools/workflow.js.map +1 -0
  544. package/dist/utils/backoff.d.ts +53 -0
  545. package/dist/utils/backoff.d.ts.map +1 -0
  546. package/dist/utils/backoff.js +78 -0
  547. package/dist/utils/backoff.js.map +1 -0
  548. package/dist/utils/config-persistence.d.ts +33 -0
  549. package/dist/utils/config-persistence.d.ts.map +1 -0
  550. package/dist/utils/config-persistence.js +61 -0
  551. package/dist/utils/config-persistence.js.map +1 -0
  552. package/dist/utils/hash.d.ts +65 -0
  553. package/dist/utils/hash.d.ts.map +1 -0
  554. package/dist/utils/hash.js +146 -0
  555. package/dist/utils/hash.js.map +1 -0
  556. package/dist/utils/math.d.ts +21 -0
  557. package/dist/utils/math.d.ts.map +1 -0
  558. package/dist/utils/math.js +39 -0
  559. package/dist/utils/math.js.map +1 -0
  560. package/dist/utils/validation.d.ts +697 -0
  561. package/dist/utils/validation.d.ts.map +1 -0
  562. package/dist/utils/validation.js +529 -0
  563. package/dist/utils/validation.js.map +1 -0
  564. package/package.json +96 -0
  565. package/python/.gitkeep +0 -0
  566. package/python/__init__.py +104 -0
  567. package/python/clustering_worker.py +440 -0
  568. package/python/docx_image_extractor.py +524 -0
  569. package/python/embedding_worker.py +552 -0
  570. package/python/file_manager_worker.py +564 -0
  571. package/python/form_fill_worker.py +399 -0
  572. package/python/gpu_utils.py +582 -0
  573. package/python/image_extractor.py +317 -0
  574. package/python/image_optimizer.py +444 -0
  575. package/python/ocr_worker.py +712 -0
  576. package/python/pyproject.toml +76 -0
  577. package/python/requirements.txt +51 -0
  578. package/python/reranker_worker.py +87 -0
@@ -0,0 +1,800 @@
1
+ /**
2
+ * VLM Pipeline - Batch Image Processing with Embedding Integration
3
+ *
4
+ * Orchestrates the full VLM processing pipeline:
5
+ * 1. Fetch pending images from database
6
+ * 2. Analyze with Gemini VLM
7
+ * 3. Generate embeddings for descriptions
8
+ * 4. Track provenance
9
+ * 5. Update database records
10
+ *
11
+ * @module services/vlm/pipeline
12
+ */
13
+ import { existsSync, unlinkSync } from 'fs';
14
+ import { v4 as uuidv4 } from 'uuid';
15
+ import { getVLMService, } from './service.js';
16
+ import { getImage, getImagesByDocument, getPendingImages, setImageProcessing, updateImageVLMResult, setImageVLMFailed, setImageVLMSkipped, getImageStats, findByContentHash, copyVLMResult, resetProcessingImages, } from '../storage/database/image-operations.js';
17
+ import { getEmbeddingClient, MODEL_NAME as EMBEDDING_MODEL, } from '../embedding/nomic.js';
18
+ import { computeHash } from '../../utils/hash.js';
19
+ import { ProvenanceType } from '../../models/provenance.js';
20
+ import { getImageOptimizer } from '../images/optimizer.js';
21
// Defaults for the image relevance/resize stage of the pipeline.
const DEFAULT_IMAGE_OPTIMIZATION = {
    enabled: true,
    ocrMaxWidth: 4800,
    // Images whose largest side exceeds this are resized before the VLM call.
    vlmMaxDimension: 2048,
    // Images whose largest side is below this (px) are skipped.
    vlmSkipBelowSize: 50,
    // Forwarded to the image optimizer as its minimum relevance score.
    vlmMinRelevance: 0.3,
    // When true, the file-based relevance analysis is run before the VLM call.
    vlmSkipLogosIcons: true,
};
// Pipeline-wide defaults; caller-supplied config is layered on top of these.
const DEFAULT_CONFIG = {
    batchSize: 10,
    concurrency: 5,
    // Results below this confidence are logged as low confidence.
    minConfidence: 0.5,
    useUniversalPrompt: true,
    skipEmbeddings: false,
    skipProvenance: false,
    imageOptimization: DEFAULT_IMAGE_OPTIMIZATION,
};
37
/**
 * VLMPipeline - Orchestrates image processing workflow
 *
 * Integrates VLM analysis with:
 * - Database operations (image records)
 * - Embedding generation (Nomic)
 * - Vector storage (sqlite-vec)
 * - Provenance tracking
 * - Image relevance filtering (logos, icons, decorative elements)
 */
export class VLMPipeline {
    vlm;
    embeddingClient;
    config;
    db;
    dbService;
    vectorService;
    optimizer;
    /**
     * @param db - Database connection passed to the image-operations helpers
     * @param options - Optional collaborators (`vlmService`, `embeddingClient`,
     *   `dbService`, `vectorService`, `optimizer`) and a `config` override object.
     *   Missing collaborators fall back to the module-level factories where one exists.
     */
    constructor(db, options = {}) {
        this.db = db;
        this.vlm = options.vlmService ?? getVLMService();
        this.embeddingClient = options.embeddingClient ?? getEmbeddingClient();
        // Merge the nested imageOptimization object separately: a plain shallow
        // spread would let a partial override (e.g. { enabled: false }) wipe out
        // the remaining defaults, and would share the default object between
        // every pipeline instance.
        const overrides = options.config ?? {};
        this.config = {
            ...DEFAULT_CONFIG,
            ...overrides,
            imageOptimization: {
                ...DEFAULT_CONFIG.imageOptimization,
                ...overrides.imageOptimization,
            },
        };
        this.dbService = options.dbService ?? null;
        this.vectorService = options.vectorService ?? null;
        this.optimizer =
            options.optimizer ??
                getImageOptimizer({
                    vlmMaxDimension: this.config.imageOptimization.vlmMaxDimension,
                    vlmSkipBelowSize: this.config.imageOptimization.vlmSkipBelowSize,
                    minRelevanceScore: this.config.imageOptimization.vlmMinRelevance,
                });
    }
    /**
     * Process all images in a document.
     *
     * @param documentId - Document UUID
     * @returns BatchResult with processing summary
     */
    async processDocument(documentId) {
        // Reset any stuck 'processing' images back to pending (crash recovery)
        const stuckCount = resetProcessingImages(this.db, documentId);
        if (stuckCount > 0) {
            console.error(`[VLMPipeline] Reset ${stuckCount} stuck processing images for document ${documentId}`);
        }
        const pending = getImagesByDocument(this.db, documentId, { vlmStatus: 'pending' }).filter((img) => !img.is_header_footer);
        if (pending.length === 0) {
            return {
                total: 0,
                successful: 0,
                failed: 0,
                skipped: 0,
                totalTokens: 0,
                totalTimeMs: 0,
                results: [],
            };
        }
        return this.processImages(pending);
    }
    /**
     * Process all pending images in the database.
     *
     * @param limit - Maximum images to fetch; defaults to ten batches' worth
     * @returns BatchResult with processing summary
     */
    async processPending(limit) {
        const images = getPendingImages(this.db, limit ?? this.config.batchSize * 10).filter((img) => !img.is_header_footer);
        return this.processImages(images);
    }
    /**
     * Process a single image by ID.
     *
     * @param imageId - Image UUID
     * @returns ProcessingResult; a failure result when the image does not exist
     *   or could not be claimed for processing
     */
    async processOne(imageId) {
        const image = getImage(this.db, imageId);
        if (!image) {
            return {
                imageId,
                success: false,
                error: 'Image not found',
                processingTimeMs: 0,
            };
        }
        const [result] = await this.processBatch([image]);
        if (!result) {
            // processBatch drops images it could not claim, which previously made
            // this method resolve to undefined instead of a ProcessingResult.
            return {
                imageId,
                success: false,
                error: 'Image is not pending - it could not be claimed for processing',
                processingTimeMs: 0,
            };
        }
        return result;
    }
    /**
     * Process array of images in batches of `config.batchSize`.
     *
     * @param images - Image records to process
     * @returns BatchResult with counts, token total, elapsed time and per-image results
     */
    async processImages(images) {
        const startTime = Date.now();
        // A zero/negative/non-finite batch size would never advance the loop
        // below, so fall back to the default in that case.
        const configuredSize = this.config.batchSize;
        const batchSize = Number.isFinite(configuredSize) && configuredSize >= 1
            ? Math.floor(configuredSize)
            : DEFAULT_CONFIG.batchSize;
        const results = [];
        for (let i = 0; i < images.length; i += batchSize) {
            const batch = images.slice(i, i + batchSize);
            const batchResults = await this.processBatch(batch);
            results.push(...batchResults);
        }
        // Count successful (processed), skipped (relevance filtered), and failed
        const successful = results.filter((r) => r.success && r.description);
        const skipped = results.filter((r) => r.success && !r.description && r.error?.startsWith('Skipped:'));
        const failed = results.filter((r) => !r.success);
        return {
            total: results.length,
            successful: successful.length,
            failed: failed.length,
            skipped: skipped.length,
            totalTokens: successful.reduce((sum, r) => sum + (r.tokensUsed || 0), 0),
            totalTimeMs: Date.now() - startTime,
            results,
        };
    }
    /**
     * Process a batch of images sequentially with a delay between requests.
     *
     * The delay starts at 100ms, doubles after every failed result or thrown
     * error (capped at 32s) and resets to 100ms after a success.
     * Aborts the batch after 5 consecutive failures; the remaining images are
     * marked failed and reported as failed results.
     *
     * @param images - Image records to claim and process
     * @returns One ProcessingResult per claimed image (unclaimed images are omitted)
     */
    async processBatch(images) {
        const BASE_DELAY_MS = 100; // 100ms courtesy delay; rate limiter handles throttling (FIX-P0-2)
        const MAX_DELAY_MS = 32000; // 32 second max backoff
        const MAX_CONSECUTIVE_FAILURES = 5; // Abort batch after this many consecutive failures
        // Mark all as processing; a falsy return means the image could not be
        // claimed (the original code treats that as "no longer pending").
        const claimedImages = [];
        for (const img of images) {
            const claimed = setImageProcessing(this.db, img.id);
            if (!claimed) {
                console.error(`[WARN] Image ${img.id} is no longer pending, skipping`);
                continue;
            }
            claimedImages.push(img);
        }
        // Process SEQUENTIALLY with rate limiting (no concurrency)
        const results = [];
        let currentDelay = BASE_DELAY_MS;
        let consecutiveFailures = 0;
        for (let i = 0; i < claimedImages.length; i++) {
            const img = claimedImages[i];
            // Abort batch if too many consecutive failures (likely API outage)
            if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
                console.error(`[VLMPipeline] Aborting batch: ${MAX_CONSECUTIVE_FAILURES} consecutive failures. ` +
                    `Processed ${results.length}/${claimedImages.length} images.`);
                // Mark remaining as failed so they can be retried later
                for (let j = i; j < claimedImages.length; j++) {
                    try {
                        setImageVLMFailed(this.db, claimedImages[j].id, 'Batch aborted: too many consecutive failures');
                    }
                    catch (error) {
                        console.error(`[VLMPipeline] Failed to mark image ${claimedImages[j].id} as failed during batch abort: ${String(error)}`);
                    }
                    results.push({
                        imageId: claimedImages[j].id,
                        success: false,
                        error: 'Batch aborted: too many consecutive failures',
                        processingTimeMs: 0,
                    });
                }
                break;
            }
            // Rate limit: wait between requests (skip for first request)
            if (i > 0) {
                console.error(`[VLMPipeline] Rate limiting: waiting ${currentDelay / 1000}s before next request...`);
                await new Promise((resolve) => setTimeout(resolve, currentDelay));
            }
            console.error(`[VLMPipeline] Processing image ${i + 1}/${claimedImages.length}: ${img.id}`);
            try {
                const result = await this.processImage(img);
                results.push(result);
                if (result.success) {
                    console.error(`[VLMPipeline] Success: ${img.id} (confidence: ${result.confidence?.toFixed(2)})`);
                    // Reset backoff on success
                    currentDelay = BASE_DELAY_MS;
                    consecutiveFailures = 0;
                }
                else {
                    console.error(`[VLMPipeline] Failed: ${img.id} - ${result.error}`);
                    consecutiveFailures++;
                    // Apply exponential backoff on failure
                    currentDelay = Math.min(currentDelay * 2, MAX_DELAY_MS);
                }
            }
            catch (error) {
                const errorMessage = error instanceof Error ? error.message : String(error);
                console.error(`[VLMPipeline] Error: ${img.id} - ${errorMessage}`);
                results.push({
                    imageId: img.id,
                    success: false,
                    error: errorMessage,
                    processingTimeMs: 0,
                });
                consecutiveFailures++;
                // Apply exponential backoff on error
                currentDelay = Math.min(currentDelay * 2, MAX_DELAY_MS);
            }
        }
        return results;
    }
    /**
     * Process a single image through the full pipeline.
     * Includes relevance filtering to skip logos, icons, and decorative elements.
     *
     * Any error thrown along the way is caught, recorded on the image via
     * setImageVLMFailed, and returned as a failure result.
     *
     * @param image - Image record to process
     * @returns ProcessingResult for the image
     */
    async processImage(image) {
        const start = Date.now();
        try {
            // Validate image has extracted file
            if (!image.extracted_path) {
                const error = 'No extracted image file';
                setImageVLMFailed(this.db, image.id, error);
                return {
                    imageId: image.id,
                    success: false,
                    error,
                    processingTimeMs: Date.now() - start,
                };
            }
            // Check image relevance if optimization enabled
            if (this.config.imageOptimization.enabled) {
                const shouldProcess = await this.checkImageRelevance(image);
                if (!shouldProcess.process) {
                    const skipReason = `Skipped: ${shouldProcess.reason}`;
                    console.error(`[VLMPipeline] ${skipReason} - ${image.id}`);
                    // Dedup copies are already marked 'complete' by copyVLMResult — don't re-mark
                    if (shouldProcess.dedupSource) {
                        // Create VLM_DESCRIPTION provenance for the dedup copy
                        this.trackDedupProvenance(image, shouldProcess.dedupSource);
                    }
                    else {
                        // Mark as 'complete' (not 'failed') so retry_failed won't reprocess intentionally-skipped images
                        setImageVLMSkipped(this.db, image.id, skipReason);
                    }
                    return {
                        imageId: image.id,
                        success: true, // Not a failure, intentionally skipped
                        error: skipReason,
                        processingTimeMs: Date.now() - start,
                    };
                }
            }
            // Verify image file exists on disk before processing
            if (!existsSync(image.extracted_path)) {
                const error = `Image file not found on disk: ${image.extracted_path} (image_id: ${image.id}). The database record exists but the file has been deleted.`;
                setImageVLMFailed(this.db, image.id, error);
                return {
                    imageId: image.id,
                    success: false,
                    error,
                    processingTimeMs: Date.now() - start,
                };
            }
            // Optionally resize large images for VLM
            let imagePath = image.extracted_path;
            if (this.config.imageOptimization.enabled) {
                const resized = await this.maybeResizeForVLM(image);
                if (resized) {
                    imagePath = resized;
                }
            }
            try {
                // Run VLM analysis
                const vlmResult = await this.vlm.describeImage(imagePath, {
                    contextText: image.context_text ?? undefined,
                    useUniversalPrompt: this.config.useUniversalPrompt,
                });
                // Low confidence is only logged; the result is still stored
                if (vlmResult.analysis.confidence < this.config.minConfidence) {
                    console.error(`[VLMPipeline] Low confidence (${vlmResult.analysis.confidence}) for image ${image.id}`);
                }
                // Track VLM_DESCRIPTION provenance FIRST (returns provenance ID for embedding chain)
                let vlmProvId;
                if (!this.config.skipProvenance && this.dbService) {
                    vlmProvId = this.trackProvenance(image, vlmResult);
                }
                // Generate embedding for description with VLM provenance ID
                // T2.10: Include VLM extracted text in embedding for FTS searchability
                let embeddingId = null;
                if (!this.config.skipEmbeddings && vlmResult.description) {
                    let textForEmbedding = vlmResult.description;
                    if (vlmResult.analysis?.extractedText?.length > 0) {
                        textForEmbedding += '\n\nExtracted text: ' + vlmResult.analysis.extractedText.join(', ');
                    }
                    embeddingId = await this.generateAndStoreEmbedding(textForEmbedding, image, vlmProvId);
                }
                // Build VLM result for database
                const dbResult = {
                    description: vlmResult.description,
                    structuredData: this.convertToStructuredData(vlmResult.analysis),
                    embeddingId: embeddingId || '',
                    model: vlmResult.model,
                    confidence: vlmResult.analysis.confidence,
                    tokensUsed: vlmResult.tokensUsed,
                };
                // Update database record
                updateImageVLMResult(this.db, image.id, dbResult);
                return {
                    imageId: image.id,
                    success: true,
                    description: vlmResult.description,
                    embeddingId: embeddingId ?? undefined,
                    tokensUsed: vlmResult.tokensUsed,
                    confidence: vlmResult.analysis.confidence,
                    processingTimeMs: Date.now() - start,
                };
            }
            finally {
                // Clean up temp resized file if it differs from the original
                if (imagePath !== image.extracted_path) {
                    try {
                        unlinkSync(imagePath);
                    }
                    catch (cleanupErr) {
                        // Cleanup is best-effort: log and carry on
                        console.error('[VLMPipeline] Failed to clean up temp resized file:', cleanupErr instanceof Error ? cleanupErr.message : String(cleanupErr));
                    }
                }
            }
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : String(error);
            // Mark as failed in database
            try {
                setImageVLMFailed(this.db, image.id, errorMessage);
            }
            catch (secondaryError) {
                console.error('[VLMPipeline] Failed to mark image as failed:', image.id, secondaryError instanceof Error ? secondaryError.message : String(secondaryError));
            }
            return {
                imageId: image.id,
                success: false,
                error: errorMessage,
                processingTimeMs: Date.now() - start,
            };
        }
    }
    /**
     * Check if an image should be processed by VLM based on relevance analysis.
     *
     * Layers, evaluated in order:
     * 1. Header/footer images are skipped
     * 2. Figure / FigureGroup blocks are always processed
     * 3. Images whose content hash matches another image get that image's
     *    VLM result copied and are skipped
     * 4. Dimension checks: tiny images and extreme aspect ratios are skipped
     * 5. File-based analysis via the optimizer (when vlmSkipLogosIcons is set);
     *    if that analysis throws, the image is skipped
     *
     * @param image - Image reference with dimensions
     * @returns Object with `process` flag, `reason`, and `dedupSource` for duplicates
     */
    async checkImageRelevance(image) {
        const { imageOptimization } = this.config;
        // LAYER 1: Header/footer block classification (from Datalab JSON)
        if (image.is_header_footer) {
            return {
                process: false,
                reason: `Header/footer decorative: block_type=${image.block_type ?? 'unknown'}`,
            };
        }
        // LAYER 2: Figure blocks are always content — skip further checks
        if (image.block_type === 'Figure' || image.block_type === 'FigureGroup') {
            return { process: true, reason: 'Figure block — content image' };
        }
        // LAYER 3: Content hash deduplication
        if (image.content_hash) {
            const duplicate = findByContentHash(this.db, image.content_hash, image.id);
            if (duplicate) {
                // Copy VLM results from the existing processed image
                copyVLMResult(this.db, image.id, duplicate);
                return {
                    process: false,
                    reason: `Duplicate of image ${duplicate.id} — VLM results copied, 0 tokens used`,
                    dedupSource: duplicate,
                };
            }
        }
        // LAYER 4: Quick dimension check (no file I/O needed)
        const width = image.dimensions?.width ?? 0;
        const height = image.dimensions?.height ?? 0;
        if (width > 0 && height > 0) {
            const largestSide = Math.max(width, height);
            if (largestSide < imageOptimization.vlmSkipBelowSize) {
                return {
                    process: false,
                    reason: `Too small: ${width}x${height} < ${imageOptimization.vlmSkipBelowSize}px`,
                };
            }
            if (largestSide < 100) {
                return {
                    process: false,
                    reason: `Likely icon: ${width}x${height} (largest dim < 100px)`,
                };
            }
            const aspectRatio = largestSide / Math.min(width, height);
            if (aspectRatio > 6) {
                return {
                    process: false,
                    reason: `Extreme aspect ratio: ${aspectRatio.toFixed(1)}:1 (likely banner/separator)`,
                };
            }
        }
        // LAYER 5: Full file-based analysis (existing Python optimizer)
        if (imageOptimization.vlmSkipLogosIcons && image.extracted_path) {
            try {
                const analysis = await this.optimizer.analyzeImage(image.extracted_path);
                if (analysis.success && !analysis.should_vlm) {
                    return {
                        process: false,
                        reason: analysis.skip_reason ?? `Low relevance: ${analysis.overall_relevance}`,
                    };
                }
            }
            catch (error) {
                const errMsg = error instanceof Error ? error.message : String(error);
                console.error(`[VLMPipeline] Relevance analysis failed for ${image.id}, skipping to avoid processing potentially irrelevant images: ${errMsg}`);
                return {
                    process: false,
                    reason: `Relevance analysis failed: ${errMsg}. Skipping to avoid processing potentially irrelevant images.`,
                };
            }
        }
        return { process: true, reason: 'Passed all relevance checks' };
    }
    /**
     * Resize an image for VLM if it exceeds the max dimension.
     *
     * @param image - Image reference
     * @returns Path reported by the optimizer, or null if no resize was needed
     *   or the resize attempt failed (the caller then uses the original file)
     */
    async maybeResizeForVLM(image) {
        if (!image.extracted_path)
            return null;
        const { vlmMaxDimension } = this.config.imageOptimization;
        const width = image.dimensions?.width ?? 0;
        const height = image.dimensions?.height ?? 0;
        const maxDim = Math.max(width, height);
        // Unknown dimensions (Datalab images) - skip resize
        if (maxDim === 0) {
            return null;
        }
        // Dimensions known but within limit - no resize needed
        if (maxDim <= vlmMaxDimension) {
            return null;
        }
        // Try to resize
        try {
            const result = await this.optimizer.resizeForVLM(image.extracted_path);
            if (result.success && 'output_path' in result) {
                if (result.resized) {
                    console.error(`[VLMPipeline] Resized image for VLM: ${result.original_width}x${result.original_height} -> ${result.output_width}x${result.output_height}`);
                }
                return result.output_path;
            }
        }
        catch (error) {
            console.error(`[VLMPipeline] Failed to resize image ${image.id}, using original: ${error}`);
        }
        return null;
    }
    /**
     * Generate embedding and store in vector database.
     * Creates EMBEDDING provenance at depth 4 (from VLM_DESCRIPTION) when the
     * VLM_DESCRIPTION provenance record can be found.
     *
     * @param description - VLM description text to embed
     * @param image - Source image reference
     * @param vlmDescriptionProvId - VLM_DESCRIPTION provenance ID for chain tracking
     * @returns ID of the stored embedding
     * @throws Error when dbService or vectorService is missing, or when the
     *   embedding client returns no vectors
     */
    async generateAndStoreEmbedding(description, image, vlmDescriptionProvId) {
        // Validate storage prerequisites before doing any work: otherwise the
        // embedding is computed for nothing, or an embedding row is inserted and
        // the vector write then crashes, leaving an orphan record.
        if (!this.dbService) {
            throw new Error('VLM embedding storage requires dbService - pipeline was created without database service');
        }
        if (!this.vectorService) {
            throw new Error('VLM embedding storage requires vectorService - pipeline was created without vector service');
        }
        // Generate embedding vector
        const vectors = await this.embeddingClient.embedChunks([description], 1);
        if (vectors.length === 0) {
            throw new Error('Embedding generation returned empty result');
        }
        const vector = vectors[0];
        const embeddingId = uuidv4();
        // Default: use embedding ID as provenance ID
        let embeddingProvId = embeddingId;
        if (vlmDescriptionProvId) {
            const vlmProv = this.dbService.getProvenance(vlmDescriptionProvId);
            if (vlmProv) {
                embeddingProvId = uuidv4();
                // Build parent_ids: ... + VLM_DESCRIPTION
                const parentIds = JSON.parse(vlmProv.parent_ids);
                parentIds.push(vlmDescriptionProvId);
                const now = new Date().toISOString();
                const embeddingProvRecord = {
                    id: embeddingProvId,
                    type: ProvenanceType.EMBEDDING,
                    created_at: now,
                    processed_at: now,
                    source_file_created_at: null,
                    source_file_modified_at: null,
                    source_type: 'EMBEDDING',
                    source_path: null,
                    source_id: vlmDescriptionProvId, // Parent is VLM_DESCRIPTION
                    root_document_id: vlmProv.root_document_id,
                    location: {
                        page_number: image.page_number,
                        chunk_index: image.image_index,
                    },
                    content_hash: computeHash(description),
                    input_hash: vlmProv.content_hash,
                    file_hash: vlmProv.file_hash,
                    processor: EMBEDDING_MODEL,
                    processor_version: '1.5.0',
                    processing_params: { task_type: 'search_document', dimensions: 768 },
                    processing_duration_ms: null,
                    processing_quality_score: null,
                    parent_id: vlmDescriptionProvId,
                    parent_ids: JSON.stringify(parentIds),
                    chain_depth: 4, // EMBEDDING from VLM_DESCRIPTION is depth 4
                    chain_path: JSON.stringify([
                        'DOCUMENT',
                        'OCR_RESULT',
                        'IMAGE',
                        'VLM_DESCRIPTION',
                        'EMBEDDING',
                    ]),
                };
                this.dbService.insertProvenance(embeddingProvRecord);
            }
            else {
                // vlmDescriptionProvId was set but provenance not found - fall back
                console.error(`[VLMPipeline] VLM description provenance ${vlmDescriptionProvId} not found, using embedding ID as provenance`);
            }
        }
        // Create embedding record (VLM description embeddings use image_id, not chunk_id)
        this.dbService.insertEmbedding({
            id: embeddingId,
            chunk_id: null, // VLM embeddings don't have a chunk
            image_id: image.id, // Use image ID for VLM embeddings
            extraction_id: null, // VLM embeddings don't have an extraction
            document_id: image.document_id,
            original_text: description,
            original_text_length: description.length,
            source_file_path: image.extracted_path ?? 'unknown',
            source_file_name: image.extracted_path?.split('/').pop() ?? 'vlm_description',
            source_file_hash: 'vlm_generated',
            page_number: image.page_number,
            page_range: null,
            character_start: 0,
            character_end: description.length,
            chunk_index: image.image_index,
            total_chunks: 1,
            model_name: EMBEDDING_MODEL,
            model_version: '1.5.0',
            task_type: 'search_document',
            inference_mode: 'local',
            gpu_device: 'cuda:0',
            provenance_id: embeddingProvId, // Use embedding provenance ID
            content_hash: computeHash(description),
            generation_duration_ms: null,
        });
        // Store vector
        this.vectorService.storeVector(embeddingId, vector);
        return embeddingId;
    }
    /**
     * Convert ImageAnalysis to VLMStructuredData format.
     *
     * @param analysis - Analysis object returned by the VLM service
     * @returns Object holding only the structured fields copied from `analysis`
     */
    convertToStructuredData(analysis) {
        return {
            imageType: analysis.imageType,
            primarySubject: analysis.primarySubject,
            extractedText: analysis.extractedText,
            dates: analysis.dates,
            names: analysis.names,
            numbers: analysis.numbers,
            paragraph1: analysis.paragraph1,
            paragraph2: analysis.paragraph2,
            paragraph3: analysis.paragraph3,
        };
    }
    /**
     * Track VLM_DESCRIPTION provenance for VLM processing output.
     * Chain: DOCUMENT (0) -> OCR_RESULT (1) -> IMAGE (2) -> VLM_DESCRIPTION (3)
     *
     * @param image - Source image reference with provenance_id
     * @param vlmResult - VLM analysis result
     * @returns Provenance ID for the VLM_DESCRIPTION record (used for embedding chain)
     * @throws Error when dbService is missing, the image has no provenance_id,
     *   or the image's provenance record cannot be found
     */
    trackProvenance(image, vlmResult) {
        if (!this.dbService) {
            throw new Error('DatabaseService required for provenance tracking');
        }
        const provenanceId = uuidv4();
        const now = new Date().toISOString();
        // Get IMAGE provenance to build parent chain
        if (!image.provenance_id) {
            throw new Error(`Image ${image.id} has no provenance_id - cannot track VLM provenance`);
        }
        const imageProv = this.dbService.getProvenance(image.provenance_id);
        if (!imageProv) {
            throw new Error(`Image provenance not found: ${image.provenance_id}`);
        }
        // Build parent_ids: document + OCR + IMAGE
        const parentIds = JSON.parse(imageProv.parent_ids);
        parentIds.push(image.provenance_id);
        const record = {
            id: provenanceId,
            type: ProvenanceType.VLM_DESCRIPTION,
            created_at: now,
            processed_at: now,
            source_file_created_at: null,
            source_file_modified_at: null,
            source_type: 'VLM',
            source_path: image.extracted_path,
            source_id: image.provenance_id, // Parent is IMAGE
            root_document_id: imageProv.root_document_id,
            location: {
                page_number: image.page_number,
                chunk_index: image.image_index,
            },
            content_hash: computeHash(vlmResult.description),
            input_hash: imageProv.content_hash, // Input was the image
            file_hash: imageProv.file_hash,
            processor: `gemini-vlm:${vlmResult.model}`,
            processor_version: '3.0',
            processing_params: {
                type: 'vlm_description',
                confidence: vlmResult.analysis.confidence,
                tokensUsed: vlmResult.tokensUsed,
            },
            processing_duration_ms: vlmResult.processingTimeMs,
            processing_quality_score: vlmResult.analysis.confidence,
            parent_id: image.provenance_id,
            parent_ids: JSON.stringify(parentIds),
            chain_depth: 3, // VLM_DESCRIPTION is depth 3
            chain_path: JSON.stringify(['DOCUMENT', 'OCR_RESULT', 'IMAGE', 'VLM_DESCRIPTION']),
        };
        this.dbService.insertProvenance(record);
        return provenanceId; // Return the ID so we can use it for embedding provenance
    }
    /**
     * Track VLM_DESCRIPTION provenance for a deduplicated image.
     * Creates provenance record documenting that VLM results were copied from a source image
     * with identical content hash, preserving full chain: DOCUMENT(0) -> OCR_RESULT(1) -> IMAGE(2) -> VLM_DESCRIPTION(3).
     * When the source has an embedding, an EMBEDDING (depth 4) record is added too.
     *
     * Unlike trackProvenance, missing prerequisites are logged and the method
     * returns without throwing.
     *
     * @param image - The dedup copy image that received copied VLM results
     * @param source - The source image whose VLM results were copied
     */
    trackDedupProvenance(image, source) {
        if (!this.dbService || this.config.skipProvenance)
            return;
        if (!image.provenance_id) {
            console.error(`[VLMPipeline] Cannot track dedup provenance: image ${image.id} has no provenance_id`);
            return;
        }
        const imageProv = this.dbService.getProvenance(image.provenance_id);
        if (!imageProv) {
            console.error(`[VLMPipeline] Image provenance not found: ${image.provenance_id}`);
            return;
        }
        const provenanceId = uuidv4();
        const now = new Date().toISOString();
        const parentIds = JSON.parse(imageProv.parent_ids);
        parentIds.push(image.provenance_id);
        if (!source.vlm_description) {
            console.error(`[VLMPipeline] Cannot create dedup provenance: source image ${source.id} has null vlm_description despite vlm_status=complete`);
            return;
        }
        const record = {
            id: provenanceId,
            type: ProvenanceType.VLM_DESCRIPTION,
            created_at: now,
            processed_at: now,
            source_file_created_at: null,
            source_file_modified_at: null,
            source_type: 'VLM_DEDUP',
            source_path: image.extracted_path,
            source_id: image.provenance_id,
            root_document_id: imageProv.root_document_id,
            location: {
                page_number: image.page_number,
                chunk_index: image.image_index,
            },
            content_hash: computeHash(source.vlm_description),
            input_hash: imageProv.content_hash,
            file_hash: imageProv.file_hash,
            processor: 'dedup-copy',
            processor_version: '1.0.0',
            processing_params: {
                type: 'vlm_dedup_copy',
                source_image_id: source.id,
                content_hash: image.content_hash,
            },
            processing_duration_ms: 0,
            processing_quality_score: source.vlm_confidence,
            parent_id: image.provenance_id,
            parent_ids: JSON.stringify(parentIds),
            chain_depth: 3,
            chain_path: JSON.stringify(['DOCUMENT', 'OCR_RESULT', 'IMAGE', 'VLM_DESCRIPTION']),
        };
        this.dbService.insertProvenance(record);
        console.error(`[VLMPipeline] Created dedup VLM_DESCRIPTION provenance: ${provenanceId} (source: ${source.id})`);
        // If source has an embedding, create EMBEDDING provenance linking to target's chain
        // This ensures the dedup target has a complete provenance chain including the shared embedding
        if (source.vlm_embedding_id) {
            const embProvId = uuidv4();
            const embParentIds = [...parentIds, provenanceId];
            const embRecord = {
                id: embProvId,
                type: ProvenanceType.EMBEDDING,
                created_at: now,
                processed_at: now,
                source_file_created_at: null,
                source_file_modified_at: null,
                source_type: 'EMBEDDING',
                source_path: null,
                source_id: provenanceId, // Parent is the VLM_DESCRIPTION we just created
                root_document_id: imageProv.root_document_id,
                location: {
                    page_number: image.page_number,
                    chunk_index: image.image_index,
                },
                content_hash: record.content_hash, // Same content as VLM description
                input_hash: record.content_hash,
                file_hash: imageProv.file_hash,
                processor: 'vlm-dedup-embedding-link',
                processor_version: '1.0.0',
                processing_params: {
                    source_image_id: source.id,
                    source_embedding_id: source.vlm_embedding_id,
                    dedup_reason: 'content_hash_match',
                },
                processing_duration_ms: 0,
                processing_quality_score: null,
                parent_id: provenanceId,
                parent_ids: JSON.stringify(embParentIds),
                chain_depth: 4, // EMBEDDING from VLM_DESCRIPTION is depth 4
                chain_path: JSON.stringify([
                    'DOCUMENT',
                    'OCR_RESULT',
                    'IMAGE',
                    'VLM_DESCRIPTION',
                    'EMBEDDING',
                ]),
            };
            this.dbService.insertProvenance(embRecord);
            console.error(`[VLMPipeline] Created dedup EMBEDDING provenance: ${embProvId} (source embedding: ${source.vlm_embedding_id})`);
        }
    }
    /**
     * Get processing statistics.
     *
     * @returns Object with `images` (image stats from the database) and `vlm`
     *   (status reported by the VLM service)
     */
    getStats() {
        return {
            images: getImageStats(this.db),
            vlm: this.vlm.getStatus(),
        };
    }
}
790
/**
 * Build a VLMPipeline wired to the given services.
 *
 * The database connection is obtained from `dbService.getConnection()`; the
 * services and `config` are forwarded to the pipeline as its options.
 *
 * @param dbService - Database service (must expose `getConnection()`)
 * @param vectorService - Vector storage service
 * @param config - Optional pipeline configuration overrides
 * @returns A new VLMPipeline instance
 */
export function createVLMPipeline(dbService, vectorService, config) {
    const connection = dbService.getConnection();
    const pipelineOptions = { config, dbService, vectorService };
    return new VLMPipeline(connection, pipelineOptions);
}
800
+ //# sourceMappingURL=pipeline.js.map