ocr-provenance-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocr-provenance-mcp might be problematic. Click here for more details.

Files changed (578)
  1. package/.env.example +55 -0
  2. package/LICENSE +78 -0
  3. package/README.md +1154 -0
  4. package/dist/bin-http.d.ts +24 -0
  5. package/dist/bin-http.d.ts.map +1 -0
  6. package/dist/bin-http.js +275 -0
  7. package/dist/bin-http.js.map +1 -0
  8. package/dist/bin-setup.d.ts +11 -0
  9. package/dist/bin-setup.d.ts.map +1 -0
  10. package/dist/bin-setup.js +610 -0
  11. package/dist/bin-setup.js.map +1 -0
  12. package/dist/bin.d.ts +16 -0
  13. package/dist/bin.d.ts.map +1 -0
  14. package/dist/bin.js +16 -0
  15. package/dist/bin.js.map +1 -0
  16. package/dist/index.d.ts +13 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +90 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/models/chunk.d.ts +136 -0
  21. package/dist/models/chunk.d.ts.map +1 -0
  22. package/dist/models/chunk.js +27 -0
  23. package/dist/models/chunk.js.map +1 -0
  24. package/dist/models/cluster.d.ts +79 -0
  25. package/dist/models/cluster.d.ts.map +1 -0
  26. package/dist/models/cluster.js +10 -0
  27. package/dist/models/cluster.js.map +1 -0
  28. package/dist/models/comparison.d.ts +62 -0
  29. package/dist/models/comparison.d.ts.map +1 -0
  30. package/dist/models/comparison.js +8 -0
  31. package/dist/models/comparison.js.map +1 -0
  32. package/dist/models/document.d.ts +104 -0
  33. package/dist/models/document.d.ts.map +1 -0
  34. package/dist/models/document.js +15 -0
  35. package/dist/models/document.js.map +1 -0
  36. package/dist/models/embedding.d.ts +87 -0
  37. package/dist/models/embedding.d.ts.map +1 -0
  38. package/dist/models/embedding.js +23 -0
  39. package/dist/models/embedding.js.map +1 -0
  40. package/dist/models/extraction.d.ts +15 -0
  41. package/dist/models/extraction.d.ts.map +1 -0
  42. package/dist/models/extraction.js +2 -0
  43. package/dist/models/extraction.js.map +1 -0
  44. package/dist/models/form-fill.d.ts +23 -0
  45. package/dist/models/form-fill.d.ts.map +1 -0
  46. package/dist/models/form-fill.js +2 -0
  47. package/dist/models/form-fill.js.map +1 -0
  48. package/dist/models/image.d.ts +177 -0
  49. package/dist/models/image.d.ts.map +1 -0
  50. package/dist/models/image.js +8 -0
  51. package/dist/models/image.js.map +1 -0
  52. package/dist/models/index.d.ts +14 -0
  53. package/dist/models/index.d.ts.map +1 -0
  54. package/dist/models/index.js +22 -0
  55. package/dist/models/index.js.map +1 -0
  56. package/dist/models/provenance.d.ts +174 -0
  57. package/dist/models/provenance.d.ts.map +1 -0
  58. package/dist/models/provenance.js +53 -0
  59. package/dist/models/provenance.js.map +1 -0
  60. package/dist/models/uploaded-file.d.ts +20 -0
  61. package/dist/models/uploaded-file.d.ts.map +1 -0
  62. package/dist/models/uploaded-file.js +2 -0
  63. package/dist/models/uploaded-file.js.map +1 -0
  64. package/dist/server/errors.d.ts +93 -0
  65. package/dist/server/errors.d.ts.map +1 -0
  66. package/dist/server/errors.js +256 -0
  67. package/dist/server/errors.js.map +1 -0
  68. package/dist/server/events.d.ts +36 -0
  69. package/dist/server/events.d.ts.map +1 -0
  70. package/dist/server/events.js +48 -0
  71. package/dist/server/events.js.map +1 -0
  72. package/dist/server/permissions.d.ts +26 -0
  73. package/dist/server/permissions.d.ts.map +1 -0
  74. package/dist/server/permissions.js +194 -0
  75. package/dist/server/permissions.js.map +1 -0
  76. package/dist/server/register-tools.d.ts +25 -0
  77. package/dist/server/register-tools.d.ts.map +1 -0
  78. package/dist/server/register-tools.js +102 -0
  79. package/dist/server/register-tools.js.map +1 -0
  80. package/dist/server/startup.d.ts +16 -0
  81. package/dist/server/startup.d.ts.map +1 -0
  82. package/dist/server/startup.js +37 -0
  83. package/dist/server/startup.js.map +1 -0
  84. package/dist/server/state.d.ts +166 -0
  85. package/dist/server/state.d.ts.map +1 -0
  86. package/dist/server/state.js +424 -0
  87. package/dist/server/state.js.map +1 -0
  88. package/dist/server/transports/http-transport.d.ts +37 -0
  89. package/dist/server/transports/http-transport.d.ts.map +1 -0
  90. package/dist/server/transports/http-transport.js +204 -0
  91. package/dist/server/transports/http-transport.js.map +1 -0
  92. package/dist/server/transports/index.d.ts +9 -0
  93. package/dist/server/transports/index.d.ts.map +1 -0
  94. package/dist/server/transports/index.js +9 -0
  95. package/dist/server/transports/index.js.map +1 -0
  96. package/dist/server/transports/session-manager.d.ts +40 -0
  97. package/dist/server/transports/session-manager.d.ts.map +1 -0
  98. package/dist/server/transports/session-manager.js +74 -0
  99. package/dist/server/transports/session-manager.js.map +1 -0
  100. package/dist/server/types.d.ts +82 -0
  101. package/dist/server/types.d.ts.map +1 -0
  102. package/dist/server/types.js +14 -0
  103. package/dist/server/types.js.map +1 -0
  104. package/dist/services/audit.d.ts +26 -0
  105. package/dist/services/audit.d.ts.map +1 -0
  106. package/dist/services/audit.js +43 -0
  107. package/dist/services/audit.js.map +1 -0
  108. package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
  109. package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
  110. package/dist/services/chunking/chunk-deduplicator.js +46 -0
  111. package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
  112. package/dist/services/chunking/chunk-merger.d.ts +26 -0
  113. package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
  114. package/dist/services/chunking/chunk-merger.js +94 -0
  115. package/dist/services/chunking/chunk-merger.js.map +1 -0
  116. package/dist/services/chunking/chunker.d.ts +62 -0
  117. package/dist/services/chunking/chunker.d.ts.map +1 -0
  118. package/dist/services/chunking/chunker.js +566 -0
  119. package/dist/services/chunking/chunker.js.map +1 -0
  120. package/dist/services/chunking/heading-normalizer.d.ts +33 -0
  121. package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
  122. package/dist/services/chunking/heading-normalizer.js +101 -0
  123. package/dist/services/chunking/heading-normalizer.js.map +1 -0
  124. package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
  125. package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
  126. package/dist/services/chunking/json-block-analyzer.js +1033 -0
  127. package/dist/services/chunking/json-block-analyzer.js.map +1 -0
  128. package/dist/services/chunking/markdown-parser.d.ts +75 -0
  129. package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
  130. package/dist/services/chunking/markdown-parser.js +428 -0
  131. package/dist/services/chunking/markdown-parser.js.map +1 -0
  132. package/dist/services/chunking/text-normalizer.d.ts +20 -0
  133. package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
  134. package/dist/services/chunking/text-normalizer.js +36 -0
  135. package/dist/services/chunking/text-normalizer.js.map +1 -0
  136. package/dist/services/clm/contract-schemas.d.ts +36 -0
  137. package/dist/services/clm/contract-schemas.d.ts.map +1 -0
  138. package/dist/services/clm/contract-schemas.js +92 -0
  139. package/dist/services/clm/contract-schemas.js.map +1 -0
  140. package/dist/services/clm/summarization.d.ts +46 -0
  141. package/dist/services/clm/summarization.d.ts.map +1 -0
  142. package/dist/services/clm/summarization.js +61 -0
  143. package/dist/services/clm/summarization.js.map +1 -0
  144. package/dist/services/clustering/clustering-service.d.ts +58 -0
  145. package/dist/services/clustering/clustering-service.d.ts.map +1 -0
  146. package/dist/services/clustering/clustering-service.js +467 -0
  147. package/dist/services/clustering/clustering-service.js.map +1 -0
  148. package/dist/services/comparison/diff-service.d.ts +41 -0
  149. package/dist/services/comparison/diff-service.d.ts.map +1 -0
  150. package/dist/services/comparison/diff-service.js +120 -0
  151. package/dist/services/comparison/diff-service.js.map +1 -0
  152. package/dist/services/embedding/embedder.d.ts +55 -0
  153. package/dist/services/embedding/embedder.d.ts.map +1 -0
  154. package/dist/services/embedding/embedder.js +202 -0
  155. package/dist/services/embedding/embedder.js.map +1 -0
  156. package/dist/services/embedding/nomic.d.ts +67 -0
  157. package/dist/services/embedding/nomic.d.ts.map +1 -0
  158. package/dist/services/embedding/nomic.js +280 -0
  159. package/dist/services/embedding/nomic.js.map +1 -0
  160. package/dist/services/gemini/circuit-breaker.d.ts +106 -0
  161. package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
  162. package/dist/services/gemini/circuit-breaker.js +237 -0
  163. package/dist/services/gemini/circuit-breaker.js.map +1 -0
  164. package/dist/services/gemini/client.d.ts +173 -0
  165. package/dist/services/gemini/client.d.ts.map +1 -0
  166. package/dist/services/gemini/client.js +483 -0
  167. package/dist/services/gemini/client.js.map +1 -0
  168. package/dist/services/gemini/config.d.ts +116 -0
  169. package/dist/services/gemini/config.d.ts.map +1 -0
  170. package/dist/services/gemini/config.js +118 -0
  171. package/dist/services/gemini/config.js.map +1 -0
  172. package/dist/services/gemini/index.d.ts +9 -0
  173. package/dist/services/gemini/index.d.ts.map +1 -0
  174. package/dist/services/gemini/index.js +13 -0
  175. package/dist/services/gemini/index.js.map +1 -0
  176. package/dist/services/gemini/rate-limiter.d.ts +62 -0
  177. package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
  178. package/dist/services/gemini/rate-limiter.js +120 -0
  179. package/dist/services/gemini/rate-limiter.js.map +1 -0
  180. package/dist/services/images/extractor.d.ts +88 -0
  181. package/dist/services/images/extractor.d.ts.map +1 -0
  182. package/dist/services/images/extractor.js +340 -0
  183. package/dist/services/images/extractor.js.map +1 -0
  184. package/dist/services/images/optimizer.d.ts +130 -0
  185. package/dist/services/images/optimizer.d.ts.map +1 -0
  186. package/dist/services/images/optimizer.js +228 -0
  187. package/dist/services/images/optimizer.js.map +1 -0
  188. package/dist/services/ocr/datalab.d.ts +64 -0
  189. package/dist/services/ocr/datalab.d.ts.map +1 -0
  190. package/dist/services/ocr/datalab.js +425 -0
  191. package/dist/services/ocr/datalab.js.map +1 -0
  192. package/dist/services/ocr/errors.d.ts +38 -0
  193. package/dist/services/ocr/errors.d.ts.map +1 -0
  194. package/dist/services/ocr/errors.js +83 -0
  195. package/dist/services/ocr/errors.js.map +1 -0
  196. package/dist/services/ocr/file-manager.d.ts +76 -0
  197. package/dist/services/ocr/file-manager.d.ts.map +1 -0
  198. package/dist/services/ocr/file-manager.js +238 -0
  199. package/dist/services/ocr/file-manager.js.map +1 -0
  200. package/dist/services/ocr/form-fill.d.ts +48 -0
  201. package/dist/services/ocr/form-fill.d.ts.map +1 -0
  202. package/dist/services/ocr/form-fill.js +213 -0
  203. package/dist/services/ocr/form-fill.js.map +1 -0
  204. package/dist/services/ocr/processor.d.ts +95 -0
  205. package/dist/services/ocr/processor.d.ts.map +1 -0
  206. package/dist/services/ocr/processor.js +259 -0
  207. package/dist/services/ocr/processor.js.map +1 -0
  208. package/dist/services/provenance/agent-metadata.d.ts +82 -0
  209. package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
  210. package/dist/services/provenance/agent-metadata.js +106 -0
  211. package/dist/services/provenance/agent-metadata.js.map +1 -0
  212. package/dist/services/provenance/chain-hash.d.ts +57 -0
  213. package/dist/services/provenance/chain-hash.d.ts.map +1 -0
  214. package/dist/services/provenance/chain-hash.js +131 -0
  215. package/dist/services/provenance/chain-hash.js.map +1 -0
  216. package/dist/services/provenance/exporter.d.ts +202 -0
  217. package/dist/services/provenance/exporter.d.ts.map +1 -0
  218. package/dist/services/provenance/exporter.js +457 -0
  219. package/dist/services/provenance/exporter.js.map +1 -0
  220. package/dist/services/provenance/index.d.ts +15 -0
  221. package/dist/services/provenance/index.d.ts.map +1 -0
  222. package/dist/services/provenance/index.js +17 -0
  223. package/dist/services/provenance/index.js.map +1 -0
  224. package/dist/services/provenance/tracker.d.ts +138 -0
  225. package/dist/services/provenance/tracker.d.ts.map +1 -0
  226. package/dist/services/provenance/tracker.js +293 -0
  227. package/dist/services/provenance/tracker.js.map +1 -0
  228. package/dist/services/provenance/verifier.d.ts +153 -0
  229. package/dist/services/provenance/verifier.d.ts.map +1 -0
  230. package/dist/services/provenance/verifier.js +536 -0
  231. package/dist/services/provenance/verifier.js.map +1 -0
  232. package/dist/services/python-pool.d.ts +70 -0
  233. package/dist/services/python-pool.d.ts.map +1 -0
  234. package/dist/services/python-pool.js +265 -0
  235. package/dist/services/python-pool.js.map +1 -0
  236. package/dist/services/search/bm25.d.ts +180 -0
  237. package/dist/services/search/bm25.d.ts.map +1 -0
  238. package/dist/services/search/bm25.js +656 -0
  239. package/dist/services/search/bm25.js.map +1 -0
  240. package/dist/services/search/fusion.d.ts +103 -0
  241. package/dist/services/search/fusion.d.ts.map +1 -0
  242. package/dist/services/search/fusion.js +122 -0
  243. package/dist/services/search/fusion.js.map +1 -0
  244. package/dist/services/search/local-reranker.d.ts +30 -0
  245. package/dist/services/search/local-reranker.d.ts.map +1 -0
  246. package/dist/services/search/local-reranker.js +123 -0
  247. package/dist/services/search/local-reranker.js.map +1 -0
  248. package/dist/services/search/quality.d.ts +11 -0
  249. package/dist/services/search/quality.d.ts.map +1 -0
  250. package/dist/services/search/quality.js +17 -0
  251. package/dist/services/search/quality.js.map +1 -0
  252. package/dist/services/search/query-classifier.d.ts +34 -0
  253. package/dist/services/search/query-classifier.d.ts.map +1 -0
  254. package/dist/services/search/query-classifier.js +114 -0
  255. package/dist/services/search/query-classifier.js.map +1 -0
  256. package/dist/services/search/query-expander.d.ts +73 -0
  257. package/dist/services/search/query-expander.d.ts.map +1 -0
  258. package/dist/services/search/query-expander.js +281 -0
  259. package/dist/services/search/query-expander.js.map +1 -0
  260. package/dist/services/search/reranker.d.ts +44 -0
  261. package/dist/services/search/reranker.d.ts.map +1 -0
  262. package/dist/services/search/reranker.js +101 -0
  263. package/dist/services/search/reranker.js.map +1 -0
  264. package/dist/services/storage/database/annotation-operations.d.ts +113 -0
  265. package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
  266. package/dist/services/storage/database/annotation-operations.js +177 -0
  267. package/dist/services/storage/database/annotation-operations.js.map +1 -0
  268. package/dist/services/storage/database/approval-operations.d.ts +132 -0
  269. package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
  270. package/dist/services/storage/database/approval-operations.js +206 -0
  271. package/dist/services/storage/database/approval-operations.js.map +1 -0
  272. package/dist/services/storage/database/chunk-operations.d.ts +132 -0
  273. package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
  274. package/dist/services/storage/database/chunk-operations.js +306 -0
  275. package/dist/services/storage/database/chunk-operations.js.map +1 -0
  276. package/dist/services/storage/database/cluster-operations.d.ts +97 -0
  277. package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
  278. package/dist/services/storage/database/cluster-operations.js +258 -0
  279. package/dist/services/storage/database/cluster-operations.js.map +1 -0
  280. package/dist/services/storage/database/comparison-operations.d.ts +41 -0
  281. package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
  282. package/dist/services/storage/database/comparison-operations.js +65 -0
  283. package/dist/services/storage/database/comparison-operations.js.map +1 -0
  284. package/dist/services/storage/database/converters.d.ts +36 -0
  285. package/dist/services/storage/database/converters.d.ts.map +1 -0
  286. package/dist/services/storage/database/converters.js +244 -0
  287. package/dist/services/storage/database/converters.js.map +1 -0
  288. package/dist/services/storage/database/document-operations.d.ts +145 -0
  289. package/dist/services/storage/database/document-operations.d.ts.map +1 -0
  290. package/dist/services/storage/database/document-operations.js +498 -0
  291. package/dist/services/storage/database/document-operations.js.map +1 -0
  292. package/dist/services/storage/database/embedding-operations.d.ts +130 -0
  293. package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
  294. package/dist/services/storage/database/embedding-operations.js +315 -0
  295. package/dist/services/storage/database/embedding-operations.js.map +1 -0
  296. package/dist/services/storage/database/extraction-operations.d.ts +47 -0
  297. package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
  298. package/dist/services/storage/database/extraction-operations.js +85 -0
  299. package/dist/services/storage/database/extraction-operations.js.map +1 -0
  300. package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
  301. package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
  302. package/dist/services/storage/database/form-fill-operations.js +116 -0
  303. package/dist/services/storage/database/form-fill-operations.js.map +1 -0
  304. package/dist/services/storage/database/helpers.d.ts +29 -0
  305. package/dist/services/storage/database/helpers.d.ts.map +1 -0
  306. package/dist/services/storage/database/helpers.js +55 -0
  307. package/dist/services/storage/database/helpers.js.map +1 -0
  308. package/dist/services/storage/database/image-operations.d.ts +202 -0
  309. package/dist/services/storage/database/image-operations.d.ts.map +1 -0
  310. package/dist/services/storage/database/image-operations.js +484 -0
  311. package/dist/services/storage/database/image-operations.js.map +1 -0
  312. package/dist/services/storage/database/index.d.ts +13 -0
  313. package/dist/services/storage/database/index.d.ts.map +1 -0
  314. package/dist/services/storage/database/index.js +16 -0
  315. package/dist/services/storage/database/index.js.map +1 -0
  316. package/dist/services/storage/database/lock-operations.d.ts +59 -0
  317. package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
  318. package/dist/services/storage/database/lock-operations.js +89 -0
  319. package/dist/services/storage/database/lock-operations.js.map +1 -0
  320. package/dist/services/storage/database/obligation-operations.d.ts +88 -0
  321. package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
  322. package/dist/services/storage/database/obligation-operations.js +206 -0
  323. package/dist/services/storage/database/obligation-operations.js.map +1 -0
  324. package/dist/services/storage/database/ocr-operations.d.ts +33 -0
  325. package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
  326. package/dist/services/storage/database/ocr-operations.js +70 -0
  327. package/dist/services/storage/database/ocr-operations.js.map +1 -0
  328. package/dist/services/storage/database/playbook-operations.d.ts +72 -0
  329. package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
  330. package/dist/services/storage/database/playbook-operations.js +247 -0
  331. package/dist/services/storage/database/playbook-operations.js.map +1 -0
  332. package/dist/services/storage/database/provenance-operations.d.ts +112 -0
  333. package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
  334. package/dist/services/storage/database/provenance-operations.js +251 -0
  335. package/dist/services/storage/database/provenance-operations.js.map +1 -0
  336. package/dist/services/storage/database/service.d.ts +142 -0
  337. package/dist/services/storage/database/service.d.ts.map +1 -0
  338. package/dist/services/storage/database/service.js +310 -0
  339. package/dist/services/storage/database/service.js.map +1 -0
  340. package/dist/services/storage/database/static-operations.d.ts +30 -0
  341. package/dist/services/storage/database/static-operations.d.ts.map +1 -0
  342. package/dist/services/storage/database/static-operations.js +218 -0
  343. package/dist/services/storage/database/static-operations.js.map +1 -0
  344. package/dist/services/storage/database/stats-operations.d.ts +101 -0
  345. package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
  346. package/dist/services/storage/database/stats-operations.js +394 -0
  347. package/dist/services/storage/database/stats-operations.js.map +1 -0
  348. package/dist/services/storage/database/tag-operations.d.ts +76 -0
  349. package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
  350. package/dist/services/storage/database/tag-operations.js +178 -0
  351. package/dist/services/storage/database/tag-operations.js.map +1 -0
  352. package/dist/services/storage/database/types.d.ts +286 -0
  353. package/dist/services/storage/database/types.d.ts.map +1 -0
  354. package/dist/services/storage/database/types.js +39 -0
  355. package/dist/services/storage/database/types.js.map +1 -0
  356. package/dist/services/storage/database/upload-operations.d.ts +71 -0
  357. package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
  358. package/dist/services/storage/database/upload-operations.js +124 -0
  359. package/dist/services/storage/database/upload-operations.js.map +1 -0
  360. package/dist/services/storage/database/user-operations.d.ts +102 -0
  361. package/dist/services/storage/database/user-operations.d.ts.map +1 -0
  362. package/dist/services/storage/database/user-operations.js +151 -0
  363. package/dist/services/storage/database/user-operations.js.map +1 -0
  364. package/dist/services/storage/database/workflow-operations.d.ts +98 -0
  365. package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
  366. package/dist/services/storage/database/workflow-operations.js +157 -0
  367. package/dist/services/storage/database/workflow-operations.js.map +1 -0
  368. package/dist/services/storage/database.d.ts +16 -0
  369. package/dist/services/storage/database.d.ts.map +1 -0
  370. package/dist/services/storage/database.js +15 -0
  371. package/dist/services/storage/database.js.map +1 -0
  372. package/dist/services/storage/index.d.ts +10 -0
  373. package/dist/services/storage/index.d.ts.map +1 -0
  374. package/dist/services/storage/index.js +10 -0
  375. package/dist/services/storage/index.js.map +1 -0
  376. package/dist/services/storage/migrations/index.d.ts +16 -0
  377. package/dist/services/storage/migrations/index.d.ts.map +1 -0
  378. package/dist/services/storage/migrations/index.js +20 -0
  379. package/dist/services/storage/migrations/index.js.map +1 -0
  380. package/dist/services/storage/migrations/operations.d.ts +40 -0
  381. package/dist/services/storage/migrations/operations.d.ts.map +1 -0
  382. package/dist/services/storage/migrations/operations.js +2910 -0
  383. package/dist/services/storage/migrations/operations.js.map +1 -0
  384. package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
  385. package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
  386. package/dist/services/storage/migrations/schema-definitions.js +1006 -0
  387. package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
  388. package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
  389. package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
  390. package/dist/services/storage/migrations/schema-helpers.js +176 -0
  391. package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
  392. package/dist/services/storage/migrations/types.d.ts +15 -0
  393. package/dist/services/storage/migrations/types.d.ts.map +1 -0
  394. package/dist/services/storage/migrations/types.js +21 -0
  395. package/dist/services/storage/migrations/types.js.map +1 -0
  396. package/dist/services/storage/migrations/verification.d.ts +20 -0
  397. package/dist/services/storage/migrations/verification.d.ts.map +1 -0
  398. package/dist/services/storage/migrations/verification.js +78 -0
  399. package/dist/services/storage/migrations/verification.js.map +1 -0
  400. package/dist/services/storage/migrations.d.ts +16 -0
  401. package/dist/services/storage/migrations.d.ts.map +1 -0
  402. package/dist/services/storage/migrations.js +17 -0
  403. package/dist/services/storage/migrations.js.map +1 -0
  404. package/dist/services/storage/types.d.ts +12 -0
  405. package/dist/services/storage/types.d.ts.map +1 -0
  406. package/dist/services/storage/types.js +5 -0
  407. package/dist/services/storage/types.js.map +1 -0
  408. package/dist/services/storage/vector.d.ts +208 -0
  409. package/dist/services/storage/vector.d.ts.map +1 -0
  410. package/dist/services/storage/vector.js +526 -0
  411. package/dist/services/storage/vector.js.map +1 -0
  412. package/dist/services/vlm/pipeline.d.ts +194 -0
  413. package/dist/services/vlm/pipeline.d.ts.map +1 -0
  414. package/dist/services/vlm/pipeline.js +800 -0
  415. package/dist/services/vlm/pipeline.js.map +1 -0
  416. package/dist/services/vlm/prompts.d.ts +171 -0
  417. package/dist/services/vlm/prompts.d.ts.map +1 -0
  418. package/dist/services/vlm/prompts.js +229 -0
  419. package/dist/services/vlm/prompts.js.map +1 -0
  420. package/dist/services/vlm/service.d.ts +174 -0
  421. package/dist/services/vlm/service.d.ts.map +1 -0
  422. package/dist/services/vlm/service.js +256 -0
  423. package/dist/services/vlm/service.js.map +1 -0
  424. package/dist/services/webhook-delivery.d.ts +4 -0
  425. package/dist/services/webhook-delivery.d.ts.map +1 -0
  426. package/dist/services/webhook-delivery.js +140 -0
  427. package/dist/services/webhook-delivery.js.map +1 -0
  428. package/dist/tools/chunks.d.ts +19 -0
  429. package/dist/tools/chunks.d.ts.map +1 -0
  430. package/dist/tools/chunks.js +392 -0
  431. package/dist/tools/chunks.js.map +1 -0
  432. package/dist/tools/clm.d.ts +16 -0
  433. package/dist/tools/clm.d.ts.map +1 -0
  434. package/dist/tools/clm.js +668 -0
  435. package/dist/tools/clm.js.map +1 -0
  436. package/dist/tools/clustering.d.ts +13 -0
  437. package/dist/tools/clustering.d.ts.map +1 -0
  438. package/dist/tools/clustering.js +498 -0
  439. package/dist/tools/clustering.js.map +1 -0
  440. package/dist/tools/collaboration.d.ts +15 -0
  441. package/dist/tools/collaboration.d.ts.map +1 -0
  442. package/dist/tools/collaboration.js +516 -0
  443. package/dist/tools/collaboration.js.map +1 -0
  444. package/dist/tools/comparison.d.ts +13 -0
  445. package/dist/tools/comparison.d.ts.map +1 -0
  446. package/dist/tools/comparison.js +735 -0
  447. package/dist/tools/comparison.js.map +1 -0
  448. package/dist/tools/compliance.d.ts +15 -0
  449. package/dist/tools/compliance.d.ts.map +1 -0
  450. package/dist/tools/compliance.js +640 -0
  451. package/dist/tools/compliance.js.map +1 -0
  452. package/dist/tools/config.d.ts +19 -0
  453. package/dist/tools/config.d.ts.map +1 -0
  454. package/dist/tools/config.js +213 -0
  455. package/dist/tools/config.js.map +1 -0
  456. package/dist/tools/database.d.ts +62 -0
  457. package/dist/tools/database.d.ts.map +1 -0
  458. package/dist/tools/database.js +288 -0
  459. package/dist/tools/database.js.map +1 -0
  460. package/dist/tools/documents.d.ts +61 -0
  461. package/dist/tools/documents.d.ts.map +1 -0
  462. package/dist/tools/documents.js +1624 -0
  463. package/dist/tools/documents.js.map +1 -0
  464. package/dist/tools/embeddings.d.ts +14 -0
  465. package/dist/tools/embeddings.d.ts.map +1 -0
  466. package/dist/tools/embeddings.js +626 -0
  467. package/dist/tools/embeddings.js.map +1 -0
  468. package/dist/tools/evaluation.d.ts +25 -0
  469. package/dist/tools/evaluation.d.ts.map +1 -0
  470. package/dist/tools/evaluation.js +523 -0
  471. package/dist/tools/evaluation.js.map +1 -0
  472. package/dist/tools/events.d.ts +16 -0
  473. package/dist/tools/events.d.ts.map +1 -0
  474. package/dist/tools/events.js +493 -0
  475. package/dist/tools/events.js.map +1 -0
  476. package/dist/tools/extraction-structured.d.ts +13 -0
  477. package/dist/tools/extraction-structured.d.ts.map +1 -0
  478. package/dist/tools/extraction-structured.js +390 -0
  479. package/dist/tools/extraction-structured.js.map +1 -0
  480. package/dist/tools/extraction.d.ts +24 -0
  481. package/dist/tools/extraction.d.ts.map +1 -0
  482. package/dist/tools/extraction.js +424 -0
  483. package/dist/tools/extraction.js.map +1 -0
  484. package/dist/tools/file-management.d.ts +14 -0
  485. package/dist/tools/file-management.d.ts.map +1 -0
  486. package/dist/tools/file-management.js +523 -0
  487. package/dist/tools/file-management.js.map +1 -0
  488. package/dist/tools/form-fill.d.ts +13 -0
  489. package/dist/tools/form-fill.d.ts.map +1 -0
  490. package/dist/tools/form-fill.js +250 -0
  491. package/dist/tools/form-fill.js.map +1 -0
  492. package/dist/tools/health.d.ts +19 -0
  493. package/dist/tools/health.d.ts.map +1 -0
  494. package/dist/tools/health.js +229 -0
  495. package/dist/tools/health.js.map +1 -0
  496. package/dist/tools/images.d.ts +54 -0
  497. package/dist/tools/images.d.ts.map +1 -0
  498. package/dist/tools/images.js +787 -0
  499. package/dist/tools/images.js.map +1 -0
  500. package/dist/tools/ingestion.d.ts +94 -0
  501. package/dist/tools/ingestion.d.ts.map +1 -0
  502. package/dist/tools/ingestion.js +1659 -0
  503. package/dist/tools/ingestion.js.map +1 -0
  504. package/dist/tools/intelligence.d.ts +18 -0
  505. package/dist/tools/intelligence.d.ts.map +1 -0
  506. package/dist/tools/intelligence.js +1039 -0
  507. package/dist/tools/intelligence.js.map +1 -0
  508. package/dist/tools/provenance.d.ts +51 -0
  509. package/dist/tools/provenance.d.ts.map +1 -0
  510. package/dist/tools/provenance.js +691 -0
  511. package/dist/tools/provenance.js.map +1 -0
  512. package/dist/tools/reports.d.ts +41 -0
  513. package/dist/tools/reports.d.ts.map +1 -0
  514. package/dist/tools/reports.js +1394 -0
  515. package/dist/tools/reports.js.map +1 -0
  516. package/dist/tools/search.d.ts +35 -0
  517. package/dist/tools/search.d.ts.map +1 -0
  518. package/dist/tools/search.js +2528 -0
  519. package/dist/tools/search.js.map +1 -0
  520. package/dist/tools/shared.d.ts +52 -0
  521. package/dist/tools/shared.d.ts.map +1 -0
  522. package/dist/tools/shared.js +54 -0
  523. package/dist/tools/shared.js.map +1 -0
  524. package/dist/tools/tags.d.ts +15 -0
  525. package/dist/tools/tags.d.ts.map +1 -0
  526. package/dist/tools/tags.js +287 -0
  527. package/dist/tools/tags.js.map +1 -0
  528. package/dist/tools/timeline.d.ts +15 -0
  529. package/dist/tools/timeline.d.ts.map +1 -0
  530. package/dist/tools/timeline.js +14 -0
  531. package/dist/tools/timeline.js.map +1 -0
  532. package/dist/tools/users.d.ts +14 -0
  533. package/dist/tools/users.d.ts.map +1 -0
  534. package/dist/tools/users.js +257 -0
  535. package/dist/tools/users.js.map +1 -0
  536. package/dist/tools/vlm.d.ts +40 -0
  537. package/dist/tools/vlm.d.ts.map +1 -0
  538. package/dist/tools/vlm.js +475 -0
  539. package/dist/tools/vlm.js.map +1 -0
  540. package/dist/tools/workflow.d.ts +16 -0
  541. package/dist/tools/workflow.d.ts.map +1 -0
  542. package/dist/tools/workflow.js +495 -0
  543. package/dist/tools/workflow.js.map +1 -0
  544. package/dist/utils/backoff.d.ts +53 -0
  545. package/dist/utils/backoff.d.ts.map +1 -0
  546. package/dist/utils/backoff.js +78 -0
  547. package/dist/utils/backoff.js.map +1 -0
  548. package/dist/utils/config-persistence.d.ts +33 -0
  549. package/dist/utils/config-persistence.d.ts.map +1 -0
  550. package/dist/utils/config-persistence.js +61 -0
  551. package/dist/utils/config-persistence.js.map +1 -0
  552. package/dist/utils/hash.d.ts +65 -0
  553. package/dist/utils/hash.d.ts.map +1 -0
  554. package/dist/utils/hash.js +146 -0
  555. package/dist/utils/hash.js.map +1 -0
  556. package/dist/utils/math.d.ts +21 -0
  557. package/dist/utils/math.d.ts.map +1 -0
  558. package/dist/utils/math.js +39 -0
  559. package/dist/utils/math.js.map +1 -0
  560. package/dist/utils/validation.d.ts +697 -0
  561. package/dist/utils/validation.d.ts.map +1 -0
  562. package/dist/utils/validation.js +529 -0
  563. package/dist/utils/validation.js.map +1 -0
  564. package/package.json +96 -0
  565. package/python/.gitkeep +0 -0
  566. package/python/__init__.py +104 -0
  567. package/python/clustering_worker.py +440 -0
  568. package/python/docx_image_extractor.py +524 -0
  569. package/python/embedding_worker.py +552 -0
  570. package/python/file_manager_worker.py +564 -0
  571. package/python/form_fill_worker.py +399 -0
  572. package/python/gpu_utils.py +582 -0
  573. package/python/image_extractor.py +317 -0
  574. package/python/image_optimizer.py +444 -0
  575. package/python/ocr_worker.py +712 -0
  576. package/python/pyproject.toml +76 -0
  577. package/python/requirements.txt +51 -0
  578. package/python/reranker_worker.py +87 -0
@@ -0,0 +1,524 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Extract images from DOCX documents using stdlib zipfile + Pillow.
4
+
5
+ DOCX files are ZIP archives containing images in word/media/. This module
6
+ extracts those images and maps them to estimated page positions by parsing
7
+ word/document.xml for image references (a:blip elements).
8
+
9
+ This is a parallel extractor to image_extractor.py (PDF) for the OCR
10
+ Provenance MCP system, enabling VLM analysis of DOCX document images.
11
+
12
+ Usage:
13
+ python docx_image_extractor.py --input /path/to/doc.docx --output /path/to/images/
14
+ python docx_image_extractor.py -i doc.docx -o ./images --min-size 100 --max-images 50
15
+
16
+ Output:
17
+ JSON to stdout with extraction results:
18
+ {
19
+ "success": true,
20
+ "count": 5,
21
+ "images": [
22
+ {
23
+ "page": 1,
24
+ "index": 0,
25
+ "format": "png",
26
+ "width": 800,
27
+ "height": 600,
28
+ "bbox": {"x": 0, "y": 0, "width": 800, "height": 600},
29
+ "path": "/path/to/images/p001_i000.png",
30
+ "size": 12345
31
+ },
32
+ ...
33
+ ]
34
+ }
35
+ """
36
+
37
+ import argparse
38
+ import io
39
+ import json
40
+ import os
41
+ import shutil
42
+ import subprocess
43
+ import sys
44
+ import tempfile
45
+ import xml.etree.ElementTree as ET
46
+ import zipfile
47
+ from pathlib import Path
48
+ from typing import Any
49
+
50
+ # Check for Pillow
51
+ try:
52
+ from PIL import Image
53
+ except ImportError:
54
+ print(
55
+ json.dumps(
56
+ {
57
+ "success": False,
58
+ "error": "Pillow not installed. Run: pip install Pillow",
59
+ "images": [],
60
+ }
61
+ )
62
+ )
63
+ sys.exit(1)
64
+
65
+
66
+ # OOXML namespaces used in word/document.xml
67
+ NSMAP = {
68
+ "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
69
+ "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
70
+ "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
71
+ "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
72
+ "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
73
+ "v": "urn:schemas-microsoft-com:vml",
74
+ }
75
+
76
+ # Relationship namespace for .rels files
77
+ RELS_NS = "http://schemas.openxmlformats.org/package/2006/relationships"
78
+
79
+ # Paragraphs per estimated page
80
+ PARAGRAPHS_PER_PAGE = 40
81
+
82
+ # Formats accepted by Gemini VLM - anything else must be converted to PNG
83
+ GEMINI_NATIVE_FORMATS = {"png", "jpg", "jpeg", "gif", "webp"}
84
+
85
+ # Cache inkscape availability check
86
+ _INKSCAPE_PATH: str | None = shutil.which("inkscape")
87
+
88
+
89
def _convert_with_inkscape(img_bytes: bytes, ext: str, filename: str) -> tuple[bool, bytes]:
    """Rasterize an EMF/WMF payload to PNG by invoking the inkscape CLI.

    Args:
        img_bytes: Raw bytes of the vector image.
        ext: Extension used for the temporary input file (e.g. "emf").
        filename: Original media filename; only used in warning messages.

    Returns:
        (True, png_bytes) when inkscape produced an output file, otherwise
        (False, img_bytes) with the input bytes returned unchanged.
    """
    if _INKSCAPE_PATH is None:
        return False, img_bytes

    workdir = tempfile.mkdtemp(prefix="docx_img_")
    try:
        source_path = os.path.join(workdir, f"input.{ext}")
        png_path = os.path.join(workdir, "output.png")
        with open(source_path, "wb") as handle:
            handle.write(img_bytes)

        command = [
            _INKSCAPE_PATH,
            source_path,
            "--export-type=png",
            f"--export-filename={png_path}",
        ]
        proc = subprocess.run(command, capture_output=True, text=True, timeout=30)

        produced_png = proc.returncode == 0 and os.path.exists(png_path)
        if not produced_png:
            print(
                f"WARNING: inkscape failed for '{filename}': {proc.stderr[:200]}",
                file=sys.stderr,
            )
            return False, img_bytes

        with open(png_path, "rb") as handle:
            return True, handle.read()
    except Exception as exc:
        # A timeout gets its own message; every other failure reports the error text.
        if isinstance(exc, subprocess.TimeoutExpired):
            warning = f"WARNING: inkscape timed out converting '{filename}'"
        else:
            warning = f"WARNING: inkscape error for '{filename}': {exc}"
        print(warning, file=sys.stderr)
        return False, img_bytes
    finally:
        # Always remove the scratch directory, whatever the outcome.
        shutil.rmtree(workdir, ignore_errors=True)
133
+
134
+
135
def _parse_relationships(zf: zipfile.ZipFile) -> dict[str, str]:
    """
    Read word/_rels/document.xml.rels and map each relationship Id to its Target.

    Returns:
        Dictionary of relationship IDs (e.g. "rId5") to target paths
        (e.g. "media/image1.png"). Empty when the part is absent from the
        archive or is not well-formed XML (a warning is printed to stderr
        in the latter case).
    """
    rels_path = "word/_rels/document.xml.rels"

    try:
        with zf.open(rels_path) as handle:
            root = ET.parse(handle).getroot()  # noqa: S314 - parsing trusted DOCX internal XML
    except KeyError:
        # The archive has no relationships part.
        return {}
    except ET.ParseError as exc:
        print(
            f"WARNING: Failed to parse {rels_path}: {exc}",
            file=sys.stderr,
        )
        return {}

    relationship_tag = f"{{{RELS_NS}}}Relationship"
    id_target_pairs = (
        (node.get("Id", ""), node.get("Target", ""))
        for node in root.iter(relationship_tag)
    )
    # Entries missing either attribute are skipped; a repeated Id keeps the last Target.
    return {rid: target for rid, target in id_target_pairs if rid and target}
166
+
167
+
168
def _parse_image_positions(
    zf: zipfile.ZipFile,
    rid_to_target: dict[str, str],
) -> list[dict[str, Any]]:
    """
    Locate image references in word/document.xml and note which paragraph holds each.

    Every <w:p> element is visited in document order and numbered from 0.
    Each a:blip inside a paragraph whose r:embed id is a key of
    rid_to_target yields one entry.

    Returns:
        List of dicts: {"paragraph_index": int, "media_file": str}
        where media_file is the last path component of the relationship
        target. Empty when document.xml is absent or not well-formed XML.
    """
    doc_path = "word/document.xml"

    try:
        with zf.open(doc_path) as handle:
            root = ET.parse(handle).getroot()  # noqa: S314 - parsing trusted DOCX internal XML
    except KeyError:
        return []
    except ET.ParseError as exc:
        print(
            f"WARNING: Failed to parse {doc_path}: {exc}",
            file=sys.stderr,
        )
        return []

    paragraph_tag = f"{{{NSMAP['w']}}}p"
    blip_tag = f"{{{NSMAP['a']}}}blip"
    embed_attr = f"{{{NSMAP['r']}}}embed"

    found: list[dict[str, Any]] = []
    for paragraph_index, paragraph in enumerate(root.iter(paragraph_tag)):
        for blip in paragraph.iter(blip_tag):
            rid = blip.get(embed_attr, "")
            if not rid or rid not in rid_to_target:
                continue
            # Targets look like "media/image1.png"; keep only the filename.
            media_file = rid_to_target[rid].rsplit("/", 1)[-1]
            found.append(
                {
                    "paragraph_index": paragraph_index,
                    "media_file": media_file,
                }
            )

    return found
219
+
220
+
221
def _estimate_page(paragraph_index: int) -> int:
    """Map a 0-based paragraph index to an estimated 1-based page number."""
    full_pages_before = paragraph_index // PARAGRAPHS_PER_PAGE
    return full_pages_before + 1
224
+
225
+
226
+ def extract_images(
227
+ docx_path: str,
228
+ output_dir: str,
229
+ min_size: int = 50,
230
+ max_images: int = 100,
231
+ formats: list[str] | None = None,
232
+ ) -> dict[str, Any]:
233
+ """
234
+ Extract images from a DOCX document.
235
+
236
+ Args:
237
+ docx_path: Path to the DOCX file
238
+ output_dir: Directory to save extracted images
239
+ min_size: Minimum dimension (width or height) to include an image
240
+ max_images: Maximum number of images to extract
241
+ formats: List of formats to include (default: all)
242
+
243
+ Returns:
244
+ Dictionary with success status and list of extracted images
245
+ """
246
+ output = Path(output_dir)
247
+ output.mkdir(parents=True, exist_ok=True)
248
+
249
+ images: list[dict[str, Any]] = []
250
+ errors: list[str] = []
251
+
252
+ # Open DOCX as ZIP - fail fast if it cannot be opened
253
+ try:
254
+ zf = zipfile.ZipFile(docx_path, "r")
255
+ except zipfile.BadZipFile:
256
+ return {
257
+ "success": False,
258
+ "error": (
259
+ f"Cannot open as ZIP archive: {docx_path}. "
260
+ "The file may be corrupted or not a valid DOCX. "
261
+ "Verify the file opens in Microsoft Word or LibreOffice."
262
+ ),
263
+ "images": [],
264
+ }
265
+ except FileNotFoundError:
266
+ return {
267
+ "success": False,
268
+ "error": f"DOCX file not found: {docx_path}",
269
+ "images": [],
270
+ }
271
+ except PermissionError:
272
+ return {
273
+ "success": False,
274
+ "error": (
275
+ f"Permission denied reading: {docx_path}. "
276
+ "Check file permissions with: ls -la '{docx_path}'"
277
+ ),
278
+ "images": [],
279
+ }
280
+ except Exception as e:
281
+ return {
282
+ "success": False,
283
+ "error": f"Failed to open DOCX file '{docx_path}': {type(e).__name__}: {e}",
284
+ "images": [],
285
+ }
286
+
287
+ with zf:
288
+ # List all files in word/media/
289
+ media_files = [
290
+ name
291
+ for name in zf.namelist()
292
+ if name.startswith("word/media/") and not name.endswith("/")
293
+ ]
294
+
295
+ # No images directory - valid DOCX with no embedded images
296
+ if not media_files:
297
+ return {
298
+ "success": True,
299
+ "count": 0,
300
+ "images": [],
301
+ }
302
+
303
+ # Parse relationships and document.xml for position mapping
304
+ rid_to_target = _parse_relationships(zf)
305
+ image_positions = _parse_image_positions(zf, rid_to_target)
306
+
307
+ # Build a lookup: media filename -> paragraph index
308
+ media_to_paragraph: dict[str, int] = {}
309
+ for pos in image_positions:
310
+ fname = pos["media_file"]
311
+ if fname not in media_to_paragraph:
312
+ media_to_paragraph[fname] = pos["paragraph_index"]
313
+
314
+ # Sort media files for deterministic output
315
+ media_files.sort()
316
+
317
+ count = 0
318
+ # Per-page image index tracking (matches PDF extractor pattern)
319
+ page_image_counts: dict[int, int] = {}
320
+
321
+ for zip_entry in media_files:
322
+ if count >= max_images:
323
+ break
324
+
325
+ media_filename = zip_entry.split("/")[-1]
326
+ ext = media_filename.rsplit(".", 1)[-1].lower() if "." in media_filename else ""
327
+
328
+ # Filter by format if specified
329
+ if formats and ext not in [f.lower() for f in formats]:
330
+ continue
331
+
332
+ # Read image bytes from ZIP
333
+ try:
334
+ img_bytes = zf.read(zip_entry)
335
+ except Exception as e:
336
+ errors.append(
337
+ f"File '{zip_entry}': Failed to read from ZIP: "
338
+ f"{type(e).__name__}: {e}. The DOCX archive may be corrupted."
339
+ )
340
+ continue
341
+
342
+ # Get dimensions using PIL (C-1: close pil_img after use)
343
+ try:
344
+ pil_img = Image.open(io.BytesIO(img_bytes))
345
+ width, height = pil_img.size
346
+ except Exception as e:
347
+ errors.append(
348
+ f"File '{zip_entry}': Failed to read image dimensions with Pillow: "
349
+ f"{type(e).__name__}: {e}. The image data may be corrupted or in "
350
+ f"an unsupported format."
351
+ )
352
+ continue
353
+
354
+ # Skip images smaller than min_size
355
+ if width < min_size or height < min_size:
356
+ pil_img.close()
357
+ continue
358
+
359
+ # Estimate page from paragraph position
360
+ paragraph_idx = media_to_paragraph.get(media_filename, 0)
361
+ page = _estimate_page(paragraph_idx)
362
+
363
+ bbox = {
364
+ "x": 0,
365
+ "y": 0,
366
+ "width": width,
367
+ "height": height,
368
+ }
369
+
370
+ # Convert non-native formats (EMF, WMF, BMP, TIFF) to PNG
371
+ # so the VLM pipeline (Gemini) can process them.
372
+ save_ext = ext
373
+ if ext not in GEMINI_NATIVE_FORMATS:
374
+ converted = False
375
+ # For EMF/WMF: use inkscape (best Linux EMF rasterizer)
376
+ if not converted and ext in ("emf", "wmf"):
377
+ converted, img_bytes = _convert_with_inkscape(img_bytes, ext, media_filename)
378
+ if converted:
379
+ save_ext = "png"
380
+ # Fallback to Pillow for simpler formats (BMP, TIFF)
381
+ # M-6: close RGBA intermediate and BytesIO buffer
382
+ if not converted:
383
+ try:
384
+ buf = io.BytesIO()
385
+ rgba_img = pil_img.convert("RGBA")
386
+ rgba_img.save(buf, format="PNG")
387
+ rgba_img.close()
388
+ img_bytes = buf.getvalue()
389
+ buf.close()
390
+ save_ext = "png"
391
+ converted = True
392
+ except Exception as e:
393
+ print(f"WARNING: Failed to convert {ext} to PNG: {e}", file=sys.stderr)
394
+ if not converted:
395
+ errors.append(
396
+ f"File '{media_filename}': Cannot convert {ext.upper()} to "
397
+ f"Gemini-compatible format (png/jpg/gif/webp). Saving as "
398
+ f"{ext.upper()}. VLM processing will skip this image. "
399
+ f"Install inkscape to enable conversion: "
400
+ f"sudo apt install inkscape"
401
+ )
402
+
403
+ if converted:
404
+ # Re-read dimensions from converted image
405
+ try:
406
+ with Image.open(io.BytesIO(img_bytes)) as converted_img:
407
+ width, height = converted_img.size
408
+ except Exception as e:
409
+ print(
410
+ f"WARNING: Failed to read converted image dimensions: {e}",
411
+ file=sys.stderr,
412
+ )
413
+
414
+ # C-1: close pil_img now that dimensions and conversion are done
415
+ pil_img.close()
416
+
417
+ # Per-page image index (matches PDF extractor pattern)
418
+ img_idx = page_image_counts.get(page, 0)
419
+ page_image_counts[page] = img_idx + 1
420
+
421
+ # Generate filename matching PDF extractor pattern
422
+ filename = f"p{page:03d}_i{img_idx:03d}.{save_ext}"
423
+ filepath = output / filename
424
+
425
+ # Save image
426
+ try:
427
+ with open(filepath, "wb") as f:
428
+ f.write(img_bytes)
429
+ except Exception as e:
430
+ errors.append(
431
+ f"File '{zip_entry}': Failed to save to '{filepath}': "
432
+ f"{type(e).__name__}: {e}. Check that the output directory "
433
+ f"'{output_dir}' is writable."
434
+ )
435
+ continue
436
+
437
+ img_size = len(img_bytes)
438
+ # M-7: free img_bytes after writing to disk
439
+ del img_bytes
440
+
441
+ images.append(
442
+ {
443
+ "page": page,
444
+ "index": img_idx,
445
+ "format": save_ext,
446
+ "width": width,
447
+ "height": height,
448
+ "bbox": bbox,
449
+ "path": str(filepath.absolute()),
450
+ "size": img_size,
451
+ }
452
+ )
453
+ count += 1
454
+
455
+ result: dict[str, Any] = {
456
+ "success": True,
457
+ "count": len(images),
458
+ "images": images,
459
+ }
460
+
461
+ if errors:
462
+ result["warnings"] = errors
463
+
464
+ return result
465
+
466
+
467
def main():
    """Parse CLI arguments, run the extraction, print the JSON result, and exit.

    The process exits with status 0 when the result reports success and 1
    otherwise (including when the input path is not an existing file).
    """
    parser = argparse.ArgumentParser(
        description="Extract images from DOCX documents for VLM analysis"
    )
    parser.add_argument("--input", "-i", required=True, help="Path to input DOCX file")
    parser.add_argument(
        "--output", "-o", required=True, help="Output directory for extracted images"
    )
    parser.add_argument(
        "--min-size",
        type=int,
        default=50,
        help="Minimum image dimension in pixels (default: 50)",
    )
    parser.add_argument(
        "--max-images",
        type=int,
        default=100,
        help="Maximum images to extract (default: 100)",
    )
    args = parser.parse_args()

    if os.path.isfile(args.input):
        result = extract_images(
            docx_path=args.input,
            output_dir=args.output,
            min_size=args.min_size,
            max_images=args.max_images,
        )
        exit_code = 0 if result["success"] else 1
    else:
        # Reject a missing input before touching the output directory.
        result = {
            "success": False,
            "error": f"Input file does not exist: {args.input}",
            "images": [],
        }
        exit_code = 1

    print(json.dumps(result))
    sys.exit(exit_code)
521
+
522
+
523
+ if __name__ == "__main__":
524
+ main()