ocr-provenance-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocr-provenance-mcp might be problematic. Click here for more details.

Files changed (578) hide show
  1. package/.env.example +55 -0
  2. package/LICENSE +78 -0
  3. package/README.md +1154 -0
  4. package/dist/bin-http.d.ts +24 -0
  5. package/dist/bin-http.d.ts.map +1 -0
  6. package/dist/bin-http.js +275 -0
  7. package/dist/bin-http.js.map +1 -0
  8. package/dist/bin-setup.d.ts +11 -0
  9. package/dist/bin-setup.d.ts.map +1 -0
  10. package/dist/bin-setup.js +610 -0
  11. package/dist/bin-setup.js.map +1 -0
  12. package/dist/bin.d.ts +16 -0
  13. package/dist/bin.d.ts.map +1 -0
  14. package/dist/bin.js +16 -0
  15. package/dist/bin.js.map +1 -0
  16. package/dist/index.d.ts +13 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +90 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/models/chunk.d.ts +136 -0
  21. package/dist/models/chunk.d.ts.map +1 -0
  22. package/dist/models/chunk.js +27 -0
  23. package/dist/models/chunk.js.map +1 -0
  24. package/dist/models/cluster.d.ts +79 -0
  25. package/dist/models/cluster.d.ts.map +1 -0
  26. package/dist/models/cluster.js +10 -0
  27. package/dist/models/cluster.js.map +1 -0
  28. package/dist/models/comparison.d.ts +62 -0
  29. package/dist/models/comparison.d.ts.map +1 -0
  30. package/dist/models/comparison.js +8 -0
  31. package/dist/models/comparison.js.map +1 -0
  32. package/dist/models/document.d.ts +104 -0
  33. package/dist/models/document.d.ts.map +1 -0
  34. package/dist/models/document.js +15 -0
  35. package/dist/models/document.js.map +1 -0
  36. package/dist/models/embedding.d.ts +87 -0
  37. package/dist/models/embedding.d.ts.map +1 -0
  38. package/dist/models/embedding.js +23 -0
  39. package/dist/models/embedding.js.map +1 -0
  40. package/dist/models/extraction.d.ts +15 -0
  41. package/dist/models/extraction.d.ts.map +1 -0
  42. package/dist/models/extraction.js +2 -0
  43. package/dist/models/extraction.js.map +1 -0
  44. package/dist/models/form-fill.d.ts +23 -0
  45. package/dist/models/form-fill.d.ts.map +1 -0
  46. package/dist/models/form-fill.js +2 -0
  47. package/dist/models/form-fill.js.map +1 -0
  48. package/dist/models/image.d.ts +177 -0
  49. package/dist/models/image.d.ts.map +1 -0
  50. package/dist/models/image.js +8 -0
  51. package/dist/models/image.js.map +1 -0
  52. package/dist/models/index.d.ts +14 -0
  53. package/dist/models/index.d.ts.map +1 -0
  54. package/dist/models/index.js +22 -0
  55. package/dist/models/index.js.map +1 -0
  56. package/dist/models/provenance.d.ts +174 -0
  57. package/dist/models/provenance.d.ts.map +1 -0
  58. package/dist/models/provenance.js +53 -0
  59. package/dist/models/provenance.js.map +1 -0
  60. package/dist/models/uploaded-file.d.ts +20 -0
  61. package/dist/models/uploaded-file.d.ts.map +1 -0
  62. package/dist/models/uploaded-file.js +2 -0
  63. package/dist/models/uploaded-file.js.map +1 -0
  64. package/dist/server/errors.d.ts +93 -0
  65. package/dist/server/errors.d.ts.map +1 -0
  66. package/dist/server/errors.js +256 -0
  67. package/dist/server/errors.js.map +1 -0
  68. package/dist/server/events.d.ts +36 -0
  69. package/dist/server/events.d.ts.map +1 -0
  70. package/dist/server/events.js +48 -0
  71. package/dist/server/events.js.map +1 -0
  72. package/dist/server/permissions.d.ts +26 -0
  73. package/dist/server/permissions.d.ts.map +1 -0
  74. package/dist/server/permissions.js +194 -0
  75. package/dist/server/permissions.js.map +1 -0
  76. package/dist/server/register-tools.d.ts +25 -0
  77. package/dist/server/register-tools.d.ts.map +1 -0
  78. package/dist/server/register-tools.js +102 -0
  79. package/dist/server/register-tools.js.map +1 -0
  80. package/dist/server/startup.d.ts +16 -0
  81. package/dist/server/startup.d.ts.map +1 -0
  82. package/dist/server/startup.js +37 -0
  83. package/dist/server/startup.js.map +1 -0
  84. package/dist/server/state.d.ts +166 -0
  85. package/dist/server/state.d.ts.map +1 -0
  86. package/dist/server/state.js +424 -0
  87. package/dist/server/state.js.map +1 -0
  88. package/dist/server/transports/http-transport.d.ts +37 -0
  89. package/dist/server/transports/http-transport.d.ts.map +1 -0
  90. package/dist/server/transports/http-transport.js +204 -0
  91. package/dist/server/transports/http-transport.js.map +1 -0
  92. package/dist/server/transports/index.d.ts +9 -0
  93. package/dist/server/transports/index.d.ts.map +1 -0
  94. package/dist/server/transports/index.js +9 -0
  95. package/dist/server/transports/index.js.map +1 -0
  96. package/dist/server/transports/session-manager.d.ts +40 -0
  97. package/dist/server/transports/session-manager.d.ts.map +1 -0
  98. package/dist/server/transports/session-manager.js +74 -0
  99. package/dist/server/transports/session-manager.js.map +1 -0
  100. package/dist/server/types.d.ts +82 -0
  101. package/dist/server/types.d.ts.map +1 -0
  102. package/dist/server/types.js +14 -0
  103. package/dist/server/types.js.map +1 -0
  104. package/dist/services/audit.d.ts +26 -0
  105. package/dist/services/audit.d.ts.map +1 -0
  106. package/dist/services/audit.js +43 -0
  107. package/dist/services/audit.js.map +1 -0
  108. package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
  109. package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
  110. package/dist/services/chunking/chunk-deduplicator.js +46 -0
  111. package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
  112. package/dist/services/chunking/chunk-merger.d.ts +26 -0
  113. package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
  114. package/dist/services/chunking/chunk-merger.js +94 -0
  115. package/dist/services/chunking/chunk-merger.js.map +1 -0
  116. package/dist/services/chunking/chunker.d.ts +62 -0
  117. package/dist/services/chunking/chunker.d.ts.map +1 -0
  118. package/dist/services/chunking/chunker.js +566 -0
  119. package/dist/services/chunking/chunker.js.map +1 -0
  120. package/dist/services/chunking/heading-normalizer.d.ts +33 -0
  121. package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
  122. package/dist/services/chunking/heading-normalizer.js +101 -0
  123. package/dist/services/chunking/heading-normalizer.js.map +1 -0
  124. package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
  125. package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
  126. package/dist/services/chunking/json-block-analyzer.js +1033 -0
  127. package/dist/services/chunking/json-block-analyzer.js.map +1 -0
  128. package/dist/services/chunking/markdown-parser.d.ts +75 -0
  129. package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
  130. package/dist/services/chunking/markdown-parser.js +428 -0
  131. package/dist/services/chunking/markdown-parser.js.map +1 -0
  132. package/dist/services/chunking/text-normalizer.d.ts +20 -0
  133. package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
  134. package/dist/services/chunking/text-normalizer.js +36 -0
  135. package/dist/services/chunking/text-normalizer.js.map +1 -0
  136. package/dist/services/clm/contract-schemas.d.ts +36 -0
  137. package/dist/services/clm/contract-schemas.d.ts.map +1 -0
  138. package/dist/services/clm/contract-schemas.js +92 -0
  139. package/dist/services/clm/contract-schemas.js.map +1 -0
  140. package/dist/services/clm/summarization.d.ts +46 -0
  141. package/dist/services/clm/summarization.d.ts.map +1 -0
  142. package/dist/services/clm/summarization.js +61 -0
  143. package/dist/services/clm/summarization.js.map +1 -0
  144. package/dist/services/clustering/clustering-service.d.ts +58 -0
  145. package/dist/services/clustering/clustering-service.d.ts.map +1 -0
  146. package/dist/services/clustering/clustering-service.js +467 -0
  147. package/dist/services/clustering/clustering-service.js.map +1 -0
  148. package/dist/services/comparison/diff-service.d.ts +41 -0
  149. package/dist/services/comparison/diff-service.d.ts.map +1 -0
  150. package/dist/services/comparison/diff-service.js +120 -0
  151. package/dist/services/comparison/diff-service.js.map +1 -0
  152. package/dist/services/embedding/embedder.d.ts +55 -0
  153. package/dist/services/embedding/embedder.d.ts.map +1 -0
  154. package/dist/services/embedding/embedder.js +202 -0
  155. package/dist/services/embedding/embedder.js.map +1 -0
  156. package/dist/services/embedding/nomic.d.ts +67 -0
  157. package/dist/services/embedding/nomic.d.ts.map +1 -0
  158. package/dist/services/embedding/nomic.js +280 -0
  159. package/dist/services/embedding/nomic.js.map +1 -0
  160. package/dist/services/gemini/circuit-breaker.d.ts +106 -0
  161. package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
  162. package/dist/services/gemini/circuit-breaker.js +237 -0
  163. package/dist/services/gemini/circuit-breaker.js.map +1 -0
  164. package/dist/services/gemini/client.d.ts +173 -0
  165. package/dist/services/gemini/client.d.ts.map +1 -0
  166. package/dist/services/gemini/client.js +483 -0
  167. package/dist/services/gemini/client.js.map +1 -0
  168. package/dist/services/gemini/config.d.ts +116 -0
  169. package/dist/services/gemini/config.d.ts.map +1 -0
  170. package/dist/services/gemini/config.js +118 -0
  171. package/dist/services/gemini/config.js.map +1 -0
  172. package/dist/services/gemini/index.d.ts +9 -0
  173. package/dist/services/gemini/index.d.ts.map +1 -0
  174. package/dist/services/gemini/index.js +13 -0
  175. package/dist/services/gemini/index.js.map +1 -0
  176. package/dist/services/gemini/rate-limiter.d.ts +62 -0
  177. package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
  178. package/dist/services/gemini/rate-limiter.js +120 -0
  179. package/dist/services/gemini/rate-limiter.js.map +1 -0
  180. package/dist/services/images/extractor.d.ts +88 -0
  181. package/dist/services/images/extractor.d.ts.map +1 -0
  182. package/dist/services/images/extractor.js +340 -0
  183. package/dist/services/images/extractor.js.map +1 -0
  184. package/dist/services/images/optimizer.d.ts +130 -0
  185. package/dist/services/images/optimizer.d.ts.map +1 -0
  186. package/dist/services/images/optimizer.js +228 -0
  187. package/dist/services/images/optimizer.js.map +1 -0
  188. package/dist/services/ocr/datalab.d.ts +64 -0
  189. package/dist/services/ocr/datalab.d.ts.map +1 -0
  190. package/dist/services/ocr/datalab.js +425 -0
  191. package/dist/services/ocr/datalab.js.map +1 -0
  192. package/dist/services/ocr/errors.d.ts +38 -0
  193. package/dist/services/ocr/errors.d.ts.map +1 -0
  194. package/dist/services/ocr/errors.js +83 -0
  195. package/dist/services/ocr/errors.js.map +1 -0
  196. package/dist/services/ocr/file-manager.d.ts +76 -0
  197. package/dist/services/ocr/file-manager.d.ts.map +1 -0
  198. package/dist/services/ocr/file-manager.js +238 -0
  199. package/dist/services/ocr/file-manager.js.map +1 -0
  200. package/dist/services/ocr/form-fill.d.ts +48 -0
  201. package/dist/services/ocr/form-fill.d.ts.map +1 -0
  202. package/dist/services/ocr/form-fill.js +213 -0
  203. package/dist/services/ocr/form-fill.js.map +1 -0
  204. package/dist/services/ocr/processor.d.ts +95 -0
  205. package/dist/services/ocr/processor.d.ts.map +1 -0
  206. package/dist/services/ocr/processor.js +259 -0
  207. package/dist/services/ocr/processor.js.map +1 -0
  208. package/dist/services/provenance/agent-metadata.d.ts +82 -0
  209. package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
  210. package/dist/services/provenance/agent-metadata.js +106 -0
  211. package/dist/services/provenance/agent-metadata.js.map +1 -0
  212. package/dist/services/provenance/chain-hash.d.ts +57 -0
  213. package/dist/services/provenance/chain-hash.d.ts.map +1 -0
  214. package/dist/services/provenance/chain-hash.js +131 -0
  215. package/dist/services/provenance/chain-hash.js.map +1 -0
  216. package/dist/services/provenance/exporter.d.ts +202 -0
  217. package/dist/services/provenance/exporter.d.ts.map +1 -0
  218. package/dist/services/provenance/exporter.js +457 -0
  219. package/dist/services/provenance/exporter.js.map +1 -0
  220. package/dist/services/provenance/index.d.ts +15 -0
  221. package/dist/services/provenance/index.d.ts.map +1 -0
  222. package/dist/services/provenance/index.js +17 -0
  223. package/dist/services/provenance/index.js.map +1 -0
  224. package/dist/services/provenance/tracker.d.ts +138 -0
  225. package/dist/services/provenance/tracker.d.ts.map +1 -0
  226. package/dist/services/provenance/tracker.js +293 -0
  227. package/dist/services/provenance/tracker.js.map +1 -0
  228. package/dist/services/provenance/verifier.d.ts +153 -0
  229. package/dist/services/provenance/verifier.d.ts.map +1 -0
  230. package/dist/services/provenance/verifier.js +536 -0
  231. package/dist/services/provenance/verifier.js.map +1 -0
  232. package/dist/services/python-pool.d.ts +70 -0
  233. package/dist/services/python-pool.d.ts.map +1 -0
  234. package/dist/services/python-pool.js +265 -0
  235. package/dist/services/python-pool.js.map +1 -0
  236. package/dist/services/search/bm25.d.ts +180 -0
  237. package/dist/services/search/bm25.d.ts.map +1 -0
  238. package/dist/services/search/bm25.js +656 -0
  239. package/dist/services/search/bm25.js.map +1 -0
  240. package/dist/services/search/fusion.d.ts +103 -0
  241. package/dist/services/search/fusion.d.ts.map +1 -0
  242. package/dist/services/search/fusion.js +122 -0
  243. package/dist/services/search/fusion.js.map +1 -0
  244. package/dist/services/search/local-reranker.d.ts +30 -0
  245. package/dist/services/search/local-reranker.d.ts.map +1 -0
  246. package/dist/services/search/local-reranker.js +123 -0
  247. package/dist/services/search/local-reranker.js.map +1 -0
  248. package/dist/services/search/quality.d.ts +11 -0
  249. package/dist/services/search/quality.d.ts.map +1 -0
  250. package/dist/services/search/quality.js +17 -0
  251. package/dist/services/search/quality.js.map +1 -0
  252. package/dist/services/search/query-classifier.d.ts +34 -0
  253. package/dist/services/search/query-classifier.d.ts.map +1 -0
  254. package/dist/services/search/query-classifier.js +114 -0
  255. package/dist/services/search/query-classifier.js.map +1 -0
  256. package/dist/services/search/query-expander.d.ts +73 -0
  257. package/dist/services/search/query-expander.d.ts.map +1 -0
  258. package/dist/services/search/query-expander.js +281 -0
  259. package/dist/services/search/query-expander.js.map +1 -0
  260. package/dist/services/search/reranker.d.ts +44 -0
  261. package/dist/services/search/reranker.d.ts.map +1 -0
  262. package/dist/services/search/reranker.js +101 -0
  263. package/dist/services/search/reranker.js.map +1 -0
  264. package/dist/services/storage/database/annotation-operations.d.ts +113 -0
  265. package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
  266. package/dist/services/storage/database/annotation-operations.js +177 -0
  267. package/dist/services/storage/database/annotation-operations.js.map +1 -0
  268. package/dist/services/storage/database/approval-operations.d.ts +132 -0
  269. package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
  270. package/dist/services/storage/database/approval-operations.js +206 -0
  271. package/dist/services/storage/database/approval-operations.js.map +1 -0
  272. package/dist/services/storage/database/chunk-operations.d.ts +132 -0
  273. package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
  274. package/dist/services/storage/database/chunk-operations.js +306 -0
  275. package/dist/services/storage/database/chunk-operations.js.map +1 -0
  276. package/dist/services/storage/database/cluster-operations.d.ts +97 -0
  277. package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
  278. package/dist/services/storage/database/cluster-operations.js +258 -0
  279. package/dist/services/storage/database/cluster-operations.js.map +1 -0
  280. package/dist/services/storage/database/comparison-operations.d.ts +41 -0
  281. package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
  282. package/dist/services/storage/database/comparison-operations.js +65 -0
  283. package/dist/services/storage/database/comparison-operations.js.map +1 -0
  284. package/dist/services/storage/database/converters.d.ts +36 -0
  285. package/dist/services/storage/database/converters.d.ts.map +1 -0
  286. package/dist/services/storage/database/converters.js +244 -0
  287. package/dist/services/storage/database/converters.js.map +1 -0
  288. package/dist/services/storage/database/document-operations.d.ts +145 -0
  289. package/dist/services/storage/database/document-operations.d.ts.map +1 -0
  290. package/dist/services/storage/database/document-operations.js +498 -0
  291. package/dist/services/storage/database/document-operations.js.map +1 -0
  292. package/dist/services/storage/database/embedding-operations.d.ts +130 -0
  293. package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
  294. package/dist/services/storage/database/embedding-operations.js +315 -0
  295. package/dist/services/storage/database/embedding-operations.js.map +1 -0
  296. package/dist/services/storage/database/extraction-operations.d.ts +47 -0
  297. package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
  298. package/dist/services/storage/database/extraction-operations.js +85 -0
  299. package/dist/services/storage/database/extraction-operations.js.map +1 -0
  300. package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
  301. package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
  302. package/dist/services/storage/database/form-fill-operations.js +116 -0
  303. package/dist/services/storage/database/form-fill-operations.js.map +1 -0
  304. package/dist/services/storage/database/helpers.d.ts +29 -0
  305. package/dist/services/storage/database/helpers.d.ts.map +1 -0
  306. package/dist/services/storage/database/helpers.js +55 -0
  307. package/dist/services/storage/database/helpers.js.map +1 -0
  308. package/dist/services/storage/database/image-operations.d.ts +202 -0
  309. package/dist/services/storage/database/image-operations.d.ts.map +1 -0
  310. package/dist/services/storage/database/image-operations.js +484 -0
  311. package/dist/services/storage/database/image-operations.js.map +1 -0
  312. package/dist/services/storage/database/index.d.ts +13 -0
  313. package/dist/services/storage/database/index.d.ts.map +1 -0
  314. package/dist/services/storage/database/index.js +16 -0
  315. package/dist/services/storage/database/index.js.map +1 -0
  316. package/dist/services/storage/database/lock-operations.d.ts +59 -0
  317. package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
  318. package/dist/services/storage/database/lock-operations.js +89 -0
  319. package/dist/services/storage/database/lock-operations.js.map +1 -0
  320. package/dist/services/storage/database/obligation-operations.d.ts +88 -0
  321. package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
  322. package/dist/services/storage/database/obligation-operations.js +206 -0
  323. package/dist/services/storage/database/obligation-operations.js.map +1 -0
  324. package/dist/services/storage/database/ocr-operations.d.ts +33 -0
  325. package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
  326. package/dist/services/storage/database/ocr-operations.js +70 -0
  327. package/dist/services/storage/database/ocr-operations.js.map +1 -0
  328. package/dist/services/storage/database/playbook-operations.d.ts +72 -0
  329. package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
  330. package/dist/services/storage/database/playbook-operations.js +247 -0
  331. package/dist/services/storage/database/playbook-operations.js.map +1 -0
  332. package/dist/services/storage/database/provenance-operations.d.ts +112 -0
  333. package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
  334. package/dist/services/storage/database/provenance-operations.js +251 -0
  335. package/dist/services/storage/database/provenance-operations.js.map +1 -0
  336. package/dist/services/storage/database/service.d.ts +142 -0
  337. package/dist/services/storage/database/service.d.ts.map +1 -0
  338. package/dist/services/storage/database/service.js +310 -0
  339. package/dist/services/storage/database/service.js.map +1 -0
  340. package/dist/services/storage/database/static-operations.d.ts +30 -0
  341. package/dist/services/storage/database/static-operations.d.ts.map +1 -0
  342. package/dist/services/storage/database/static-operations.js +218 -0
  343. package/dist/services/storage/database/static-operations.js.map +1 -0
  344. package/dist/services/storage/database/stats-operations.d.ts +101 -0
  345. package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
  346. package/dist/services/storage/database/stats-operations.js +394 -0
  347. package/dist/services/storage/database/stats-operations.js.map +1 -0
  348. package/dist/services/storage/database/tag-operations.d.ts +76 -0
  349. package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
  350. package/dist/services/storage/database/tag-operations.js +178 -0
  351. package/dist/services/storage/database/tag-operations.js.map +1 -0
  352. package/dist/services/storage/database/types.d.ts +286 -0
  353. package/dist/services/storage/database/types.d.ts.map +1 -0
  354. package/dist/services/storage/database/types.js +39 -0
  355. package/dist/services/storage/database/types.js.map +1 -0
  356. package/dist/services/storage/database/upload-operations.d.ts +71 -0
  357. package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
  358. package/dist/services/storage/database/upload-operations.js +124 -0
  359. package/dist/services/storage/database/upload-operations.js.map +1 -0
  360. package/dist/services/storage/database/user-operations.d.ts +102 -0
  361. package/dist/services/storage/database/user-operations.d.ts.map +1 -0
  362. package/dist/services/storage/database/user-operations.js +151 -0
  363. package/dist/services/storage/database/user-operations.js.map +1 -0
  364. package/dist/services/storage/database/workflow-operations.d.ts +98 -0
  365. package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
  366. package/dist/services/storage/database/workflow-operations.js +157 -0
  367. package/dist/services/storage/database/workflow-operations.js.map +1 -0
  368. package/dist/services/storage/database.d.ts +16 -0
  369. package/dist/services/storage/database.d.ts.map +1 -0
  370. package/dist/services/storage/database.js +15 -0
  371. package/dist/services/storage/database.js.map +1 -0
  372. package/dist/services/storage/index.d.ts +10 -0
  373. package/dist/services/storage/index.d.ts.map +1 -0
  374. package/dist/services/storage/index.js +10 -0
  375. package/dist/services/storage/index.js.map +1 -0
  376. package/dist/services/storage/migrations/index.d.ts +16 -0
  377. package/dist/services/storage/migrations/index.d.ts.map +1 -0
  378. package/dist/services/storage/migrations/index.js +20 -0
  379. package/dist/services/storage/migrations/index.js.map +1 -0
  380. package/dist/services/storage/migrations/operations.d.ts +40 -0
  381. package/dist/services/storage/migrations/operations.d.ts.map +1 -0
  382. package/dist/services/storage/migrations/operations.js +2910 -0
  383. package/dist/services/storage/migrations/operations.js.map +1 -0
  384. package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
  385. package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
  386. package/dist/services/storage/migrations/schema-definitions.js +1006 -0
  387. package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
  388. package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
  389. package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
  390. package/dist/services/storage/migrations/schema-helpers.js +176 -0
  391. package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
  392. package/dist/services/storage/migrations/types.d.ts +15 -0
  393. package/dist/services/storage/migrations/types.d.ts.map +1 -0
  394. package/dist/services/storage/migrations/types.js +21 -0
  395. package/dist/services/storage/migrations/types.js.map +1 -0
  396. package/dist/services/storage/migrations/verification.d.ts +20 -0
  397. package/dist/services/storage/migrations/verification.d.ts.map +1 -0
  398. package/dist/services/storage/migrations/verification.js +78 -0
  399. package/dist/services/storage/migrations/verification.js.map +1 -0
  400. package/dist/services/storage/migrations.d.ts +16 -0
  401. package/dist/services/storage/migrations.d.ts.map +1 -0
  402. package/dist/services/storage/migrations.js +17 -0
  403. package/dist/services/storage/migrations.js.map +1 -0
  404. package/dist/services/storage/types.d.ts +12 -0
  405. package/dist/services/storage/types.d.ts.map +1 -0
  406. package/dist/services/storage/types.js +5 -0
  407. package/dist/services/storage/types.js.map +1 -0
  408. package/dist/services/storage/vector.d.ts +208 -0
  409. package/dist/services/storage/vector.d.ts.map +1 -0
  410. package/dist/services/storage/vector.js +526 -0
  411. package/dist/services/storage/vector.js.map +1 -0
  412. package/dist/services/vlm/pipeline.d.ts +194 -0
  413. package/dist/services/vlm/pipeline.d.ts.map +1 -0
  414. package/dist/services/vlm/pipeline.js +800 -0
  415. package/dist/services/vlm/pipeline.js.map +1 -0
  416. package/dist/services/vlm/prompts.d.ts +171 -0
  417. package/dist/services/vlm/prompts.d.ts.map +1 -0
  418. package/dist/services/vlm/prompts.js +229 -0
  419. package/dist/services/vlm/prompts.js.map +1 -0
  420. package/dist/services/vlm/service.d.ts +174 -0
  421. package/dist/services/vlm/service.d.ts.map +1 -0
  422. package/dist/services/vlm/service.js +256 -0
  423. package/dist/services/vlm/service.js.map +1 -0
  424. package/dist/services/webhook-delivery.d.ts +4 -0
  425. package/dist/services/webhook-delivery.d.ts.map +1 -0
  426. package/dist/services/webhook-delivery.js +140 -0
  427. package/dist/services/webhook-delivery.js.map +1 -0
  428. package/dist/tools/chunks.d.ts +19 -0
  429. package/dist/tools/chunks.d.ts.map +1 -0
  430. package/dist/tools/chunks.js +392 -0
  431. package/dist/tools/chunks.js.map +1 -0
  432. package/dist/tools/clm.d.ts +16 -0
  433. package/dist/tools/clm.d.ts.map +1 -0
  434. package/dist/tools/clm.js +668 -0
  435. package/dist/tools/clm.js.map +1 -0
  436. package/dist/tools/clustering.d.ts +13 -0
  437. package/dist/tools/clustering.d.ts.map +1 -0
  438. package/dist/tools/clustering.js +498 -0
  439. package/dist/tools/clustering.js.map +1 -0
  440. package/dist/tools/collaboration.d.ts +15 -0
  441. package/dist/tools/collaboration.d.ts.map +1 -0
  442. package/dist/tools/collaboration.js +516 -0
  443. package/dist/tools/collaboration.js.map +1 -0
  444. package/dist/tools/comparison.d.ts +13 -0
  445. package/dist/tools/comparison.d.ts.map +1 -0
  446. package/dist/tools/comparison.js +735 -0
  447. package/dist/tools/comparison.js.map +1 -0
  448. package/dist/tools/compliance.d.ts +15 -0
  449. package/dist/tools/compliance.d.ts.map +1 -0
  450. package/dist/tools/compliance.js +640 -0
  451. package/dist/tools/compliance.js.map +1 -0
  452. package/dist/tools/config.d.ts +19 -0
  453. package/dist/tools/config.d.ts.map +1 -0
  454. package/dist/tools/config.js +213 -0
  455. package/dist/tools/config.js.map +1 -0
  456. package/dist/tools/database.d.ts +62 -0
  457. package/dist/tools/database.d.ts.map +1 -0
  458. package/dist/tools/database.js +288 -0
  459. package/dist/tools/database.js.map +1 -0
  460. package/dist/tools/documents.d.ts +61 -0
  461. package/dist/tools/documents.d.ts.map +1 -0
  462. package/dist/tools/documents.js +1624 -0
  463. package/dist/tools/documents.js.map +1 -0
  464. package/dist/tools/embeddings.d.ts +14 -0
  465. package/dist/tools/embeddings.d.ts.map +1 -0
  466. package/dist/tools/embeddings.js +626 -0
  467. package/dist/tools/embeddings.js.map +1 -0
  468. package/dist/tools/evaluation.d.ts +25 -0
  469. package/dist/tools/evaluation.d.ts.map +1 -0
  470. package/dist/tools/evaluation.js +523 -0
  471. package/dist/tools/evaluation.js.map +1 -0
  472. package/dist/tools/events.d.ts +16 -0
  473. package/dist/tools/events.d.ts.map +1 -0
  474. package/dist/tools/events.js +493 -0
  475. package/dist/tools/events.js.map +1 -0
  476. package/dist/tools/extraction-structured.d.ts +13 -0
  477. package/dist/tools/extraction-structured.d.ts.map +1 -0
  478. package/dist/tools/extraction-structured.js +390 -0
  479. package/dist/tools/extraction-structured.js.map +1 -0
  480. package/dist/tools/extraction.d.ts +24 -0
  481. package/dist/tools/extraction.d.ts.map +1 -0
  482. package/dist/tools/extraction.js +424 -0
  483. package/dist/tools/extraction.js.map +1 -0
  484. package/dist/tools/file-management.d.ts +14 -0
  485. package/dist/tools/file-management.d.ts.map +1 -0
  486. package/dist/tools/file-management.js +523 -0
  487. package/dist/tools/file-management.js.map +1 -0
  488. package/dist/tools/form-fill.d.ts +13 -0
  489. package/dist/tools/form-fill.d.ts.map +1 -0
  490. package/dist/tools/form-fill.js +250 -0
  491. package/dist/tools/form-fill.js.map +1 -0
  492. package/dist/tools/health.d.ts +19 -0
  493. package/dist/tools/health.d.ts.map +1 -0
  494. package/dist/tools/health.js +229 -0
  495. package/dist/tools/health.js.map +1 -0
  496. package/dist/tools/images.d.ts +54 -0
  497. package/dist/tools/images.d.ts.map +1 -0
  498. package/dist/tools/images.js +787 -0
  499. package/dist/tools/images.js.map +1 -0
  500. package/dist/tools/ingestion.d.ts +94 -0
  501. package/dist/tools/ingestion.d.ts.map +1 -0
  502. package/dist/tools/ingestion.js +1659 -0
  503. package/dist/tools/ingestion.js.map +1 -0
  504. package/dist/tools/intelligence.d.ts +18 -0
  505. package/dist/tools/intelligence.d.ts.map +1 -0
  506. package/dist/tools/intelligence.js +1039 -0
  507. package/dist/tools/intelligence.js.map +1 -0
  508. package/dist/tools/provenance.d.ts +51 -0
  509. package/dist/tools/provenance.d.ts.map +1 -0
  510. package/dist/tools/provenance.js +691 -0
  511. package/dist/tools/provenance.js.map +1 -0
  512. package/dist/tools/reports.d.ts +41 -0
  513. package/dist/tools/reports.d.ts.map +1 -0
  514. package/dist/tools/reports.js +1394 -0
  515. package/dist/tools/reports.js.map +1 -0
  516. package/dist/tools/search.d.ts +35 -0
  517. package/dist/tools/search.d.ts.map +1 -0
  518. package/dist/tools/search.js +2528 -0
  519. package/dist/tools/search.js.map +1 -0
  520. package/dist/tools/shared.d.ts +52 -0
  521. package/dist/tools/shared.d.ts.map +1 -0
  522. package/dist/tools/shared.js +54 -0
  523. package/dist/tools/shared.js.map +1 -0
  524. package/dist/tools/tags.d.ts +15 -0
  525. package/dist/tools/tags.d.ts.map +1 -0
  526. package/dist/tools/tags.js +287 -0
  527. package/dist/tools/tags.js.map +1 -0
  528. package/dist/tools/timeline.d.ts +15 -0
  529. package/dist/tools/timeline.d.ts.map +1 -0
  530. package/dist/tools/timeline.js +14 -0
  531. package/dist/tools/timeline.js.map +1 -0
  532. package/dist/tools/users.d.ts +14 -0
  533. package/dist/tools/users.d.ts.map +1 -0
  534. package/dist/tools/users.js +257 -0
  535. package/dist/tools/users.js.map +1 -0
  536. package/dist/tools/vlm.d.ts +40 -0
  537. package/dist/tools/vlm.d.ts.map +1 -0
  538. package/dist/tools/vlm.js +475 -0
  539. package/dist/tools/vlm.js.map +1 -0
  540. package/dist/tools/workflow.d.ts +16 -0
  541. package/dist/tools/workflow.d.ts.map +1 -0
  542. package/dist/tools/workflow.js +495 -0
  543. package/dist/tools/workflow.js.map +1 -0
  544. package/dist/utils/backoff.d.ts +53 -0
  545. package/dist/utils/backoff.d.ts.map +1 -0
  546. package/dist/utils/backoff.js +78 -0
  547. package/dist/utils/backoff.js.map +1 -0
  548. package/dist/utils/config-persistence.d.ts +33 -0
  549. package/dist/utils/config-persistence.d.ts.map +1 -0
  550. package/dist/utils/config-persistence.js +61 -0
  551. package/dist/utils/config-persistence.js.map +1 -0
  552. package/dist/utils/hash.d.ts +65 -0
  553. package/dist/utils/hash.d.ts.map +1 -0
  554. package/dist/utils/hash.js +146 -0
  555. package/dist/utils/hash.js.map +1 -0
  556. package/dist/utils/math.d.ts +21 -0
  557. package/dist/utils/math.d.ts.map +1 -0
  558. package/dist/utils/math.js +39 -0
  559. package/dist/utils/math.js.map +1 -0
  560. package/dist/utils/validation.d.ts +697 -0
  561. package/dist/utils/validation.d.ts.map +1 -0
  562. package/dist/utils/validation.js +529 -0
  563. package/dist/utils/validation.js.map +1 -0
  564. package/package.json +96 -0
  565. package/python/.gitkeep +0 -0
  566. package/python/__init__.py +104 -0
  567. package/python/clustering_worker.py +440 -0
  568. package/python/docx_image_extractor.py +524 -0
  569. package/python/embedding_worker.py +552 -0
  570. package/python/file_manager_worker.py +564 -0
  571. package/python/form_fill_worker.py +399 -0
  572. package/python/gpu_utils.py +582 -0
  573. package/python/image_extractor.py +317 -0
  574. package/python/image_optimizer.py +444 -0
  575. package/python/ocr_worker.py +712 -0
  576. package/python/pyproject.toml +76 -0
  577. package/python/requirements.txt +51 -0
  578. package/python/reranker_worker.py +87 -0
@@ -0,0 +1,712 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Datalab OCR Worker for OCR Provenance MCP System
4
+
5
+ Extracts text from documents using Datalab API.
6
+ FAIL-FAST: No fallbacks, no mocks. Errors propagate immediately.
7
+ """
8
+
9
+ import argparse
10
+ import hashlib
11
+ import json
12
+ import logging
13
+ import os
14
+ import re
15
+ import sys
16
+ import time
17
+ import uuid
18
+ from dataclasses import asdict, dataclass
19
+ from pathlib import Path
20
+ from typing import Literal
21
+
22
+ # Configure logging FIRST
23
# Send log records to stderr; the CLI below prints its results on stdout.
logging.basicConfig(
    stream=sys.stderr,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger shared by every function in this worker.
logger = logging.getLogger(__name__)
29
+
30
+
31
+ # =============================================================================
32
+ # ERROR CLASSES (CS-ERR-001 compliant - inline, no separate module)
33
+ # =============================================================================
34
+
35
+
36
+ class OCRError(Exception):
37
+ """Base OCR error with category for error handling."""
38
+
39
+ def __init__(self, message: str, category: str, request_id: str | None = None):
40
+ super().__init__(message)
41
+ self.category = category
42
+ self.request_id = request_id
43
+
44
+
45
class OCRAPIError(OCRError):
    """Error reported for a failed API call (4xx/5xx style status codes).

    The category is derived from the status code: 500 and above map to
    "OCR_SERVER_ERROR", everything else to "OCR_API_ERROR".
    """

    def __init__(self, message: str, status_code: int, request_id: str | None = None):
        is_server_fault = status_code >= 500
        super().__init__(
            message,
            "OCR_SERVER_ERROR" if is_server_fault else "OCR_API_ERROR",
            request_id,
        )
        self.status_code = status_code
52
+
53
+
54
class OCRRateLimitError(OCRError):
    """Raised when the API reports that the rate limit was exceeded (429).

    Attributes:
        retry_after: Value supplied by the raiser (default 60); presumably
            the number of seconds to wait before retrying — confirm with callers.
    """

    def __init__(self, message: str = "Rate limit exceeded", retry_after: int = 60):
        # No request id is recorded for this error type.
        super().__init__(message, category="OCR_RATE_LIMIT")
        self.retry_after = retry_after
60
+
61
+
62
class OCRTimeoutError(OCRError):
    """Raised when OCR processing does not finish in time.

    Always carries the category "OCR_TIMEOUT".
    """

    def __init__(self, message: str, request_id: str | None = None):
        super().__init__(message, category="OCR_TIMEOUT", request_id=request_id)
67
+
68
+
69
class OCRFileError(OCRError):
    """Raised when the input file cannot be used (missing, not a file, unsupported).

    Attributes:
        file_path: The path (or URL) the error refers to, as given by the raiser.
    """

    def __init__(self, message: str, file_path: str):
        super().__init__(message, category="OCR_FILE_ERROR")
        self.file_path = file_path
75
+
76
+
77
class OCRAuthenticationError(OCRError):
    """Authentication or subscription failure (HTTP 401/403).

    The stored message is the API's message wrapped in an actionable hint:
    subscription problems point to the settings page, plain 401s point to
    the DATALAB_API_KEY variable, anything else is reported as access denied.
    """

    def __init__(self, message: str, status_code: int):
        lowered = message.lower()
        # A 403, or any message mentioning a subscription/expiry, is treated
        # as an inactive subscription regardless of the other details.
        subscription_problem = (
            status_code == 403 or "subscription" in lowered or "expired" in lowered
        )

        if subscription_problem:
            detailed_msg = (
                f"Datalab API subscription inactive (HTTP {status_code}). {message} "
                "Action: Renew subscription at https://www.datalab.to/settings"
            )
        elif status_code == 401:
            detailed_msg = (
                f"Datalab API authentication failed. {message} "
                "Action: Verify DATALAB_API_KEY is correct"
            )
        else:
            detailed_msg = f"Datalab API access denied (HTTP {status_code}). {message}"

        super().__init__(detailed_msg, category="OCR_AUTHENTICATION_ERROR")
        self.status_code = status_code
96
+
97
+
98
+ # =============================================================================
99
+ # DATA STRUCTURES (match src/models/document.ts exactly)
100
+ # =============================================================================
101
+
102
+
103
@dataclass
class PageOffset:
    """Span of one page inside the full extracted text.

    Intended to mirror the PageOffset interface in src/models/document.ts;
    the TypeScript side uses camelCase (charStart) where this class uses
    snake_case (char_start).
    """

    # 1-indexed page number
    page: int
    # Offset in the full text where this page's content begins
    char_start: int
    # Offset in the full text where this page's content ends
    char_end: int
114
+
115
+
116
@dataclass
class OCRResult:
    """Everything produced by one OCR run over a single document.

    The first group of fields is intended to mirror the OCRResult interface
    in src/models/document.ts; the remaining fields are extras kept for
    provenance and downstream storage. Field order is part of the
    constructor signature and must not be changed.
    """

    # --- Fields shared with the TypeScript interface -----------------------
    id: str  # UUID generated for this result
    provenance_id: str  # UUID supplied by the caller
    document_id: str  # UUID supplied by the caller
    extracted_text: str  # Markdown text returned by Datalab
    text_length: int  # len(extracted_text)
    datalab_request_id: str  # Unique ID for this request
    datalab_mode: Literal["fast", "balanced", "accurate"]
    parse_quality_score: float | None
    page_count: int
    cost_cents: float | None
    content_hash: str  # "sha256:..." digest of extracted_text
    processing_started_at: str  # ISO 8601 timestamp
    processing_completed_at: str  # ISO 8601 timestamp
    processing_duration_ms: int

    # --- Provenance extras (not part of the TypeScript interface) ----------
    page_offsets: list[PageOffset]  # Character span of each page
    error: str | None = None

    # Images extracted by Datalab, keyed by filename, values base64 data
    images: dict[str, str] | None = None

    # JSON block hierarchy from Datalab (when the output format includes json)
    json_blocks: dict | None = None

    # Datalab metadata (page_stats, block_counts, etc.)
    metadata: dict | None = None

    # Structured extraction result (when a page_schema was provided)
    extraction_json: dict | list | None = None

    # Complete cost breakdown dict as returned by Datalab
    cost_breakdown_full: dict | None = None

    # Document-level metadata taken from the Datalab metadata
    doc_title: str | None = None
    doc_author: str | None = None
    doc_subject: str | None = None
162
+
163
+
164
+ # =============================================================================
165
+ # SUPPORTED FILE TYPES (match src/models/document.ts)
166
+ # =============================================================================
167
+
168
# Lower-case file suffixes (including the leading dot) that the worker accepts.
SUPPORTED_EXTENSIONS = frozenset(
    (
        # PDF and raster images
        ".pdf",
        ".png",
        ".jpg",
        ".jpeg",
        ".tiff",
        ".tif",
        ".bmp",
        ".gif",
        ".webp",
        # Office documents
        ".docx",
        ".doc",
        ".pptx",
        ".ppt",
        ".xlsx",
        ".xls",
        # Plain-text formats
        ".txt",
        ".csv",
        ".md",
    )
)
190
+
191
+
192
+ # =============================================================================
193
+ # MAIN IMPLEMENTATION
194
+ # =============================================================================
195
+
196
+
197
def get_api_key() -> str:
    """Return the Datalab API key stored in the DATALAB_API_KEY variable.

    Raises:
        ValueError: If the variable is unset or empty, or still holds the
            "your_api_key_here" placeholder.
    """
    key = os.getenv("DATALAB_API_KEY")

    # The placeholder is a non-empty string, so it needs its own check.
    if key == "your_api_key_here":
        raise ValueError(
            "DATALAB_API_KEY is set to placeholder value. Update .env with your actual API key."
        )

    if key:
        return key

    raise ValueError(
        "DATALAB_API_KEY environment variable is required. "
        "Get your key from https://www.datalab.to/settings"
    )
213
+
214
+
215
def validate_file(file_path: str) -> Path:
    """Resolve *file_path* and confirm it is an existing file of a supported type.

    Args:
        file_path: Path to the document as supplied by the caller.

    Returns:
        The resolved path.

    Raises:
        OCRFileError: If the path does not exist, is not a regular file, or
            has a suffix outside SUPPORTED_EXTENSIONS (compared case-insensitively).
    """
    resolved = Path(file_path).resolve()
    resolved_str = str(resolved)

    if not resolved.exists():
        raise OCRFileError(f"File not found: {file_path}", resolved_str)

    if not resolved.is_file():
        raise OCRFileError(f"Not a file: {file_path}", resolved_str)

    suffix = resolved.suffix
    if suffix.lower() in SUPPORTED_EXTENSIONS:
        return resolved

    supported = ", ".join(sorted(SUPPORTED_EXTENSIONS))
    raise OCRFileError(
        f"Unsupported file type: {suffix}. Supported: {supported}",
        resolved_str,
    )
236
+
237
+
238
def compute_content_hash(content: str) -> str:
    """Return the SHA-256 digest of *content* in the "sha256:<hex>" format.

    The text is encoded as UTF-8 before hashing; the result is the literal
    prefix "sha256:" followed by 64 lowercase hexadecimal characters.
    """
    hasher = hashlib.sha256()
    hasher.update(content.encode("utf-8"))
    return "sha256:" + hasher.hexdigest()
246
+
247
+
248
def parse_page_offsets(markdown: str) -> list[PageOffset]:
    """Locate each page's character span in paginated markdown.

    Page boundaries are recognised by the marker

        ---
        <!-- Page N -->

    surrounded by newlines. Text before the first marker is reported as
    page 1; text after a marker is reported under that marker's page number.

    Args:
        markdown: The full extracted text, with or without page markers.

    Returns:
        One PageOffset per page, in document order. The offsets index into
        *markdown* itself and exclude the marker text. A text without any
        marker yields a single entry covering the whole string.
    """
    marker_re = re.compile(r"\n---\n<!-- Page (\d+) -->\n")

    offsets: list[PageOffset] = []
    page_num = 1
    content_start = 0

    # Use the match positions directly instead of rebuilding the marker from
    # int(N): the rebuilt text can be shorter than what was matched (e.g.
    # "Page 02"), which would shift every later offset.
    for marker in marker_re.finditer(markdown):
        offsets.append(
            PageOffset(page=page_num, char_start=content_start, char_end=marker.start())
        )
        page_num = int(marker.group(1))
        content_start = marker.end()

    # Whatever follows the last marker (or the whole text if there was none).
    offsets.append(
        PageOffset(page=page_num, char_start=content_start, char_end=len(markdown))
    )
    return offsets
291
+
292
+
293
def _as_plain_dict(value: object) -> dict | None:
    """Return a shallow dict copy of a dict or attribute-bearing object, else None."""
    if value is None:
        return None
    if isinstance(value, dict):
        return dict(value)
    if hasattr(value, "__dict__"):
        return dict(vars(value))
    return None


def process_document(
    file_path: str,
    document_id: str,
    provenance_id: str,
    mode: Literal["fast", "balanced", "accurate"] = "balanced",
    timeout: int = 300,
    # New Datalab API parameters
    max_pages: int | None = None,
    page_range: str | None = None,
    skip_cache: bool = False,
    disable_image_extraction: bool = False,
    extras: list[str] | None = None,
    page_schema: str | None = None,
    additional_config: dict | None = None,
    file_url: str | None = None,
) -> OCRResult:
    """
    Process a document through Datalab OCR.

    This is the MAIN function. Everything else supports this.

    Args:
        file_path: Path to document (PDF, image, or Office file). Ignored
            when file_url is given.
        document_id: UUID of the document record in database
        provenance_id: UUID for the OCR_RESULT provenance record
        mode: OCR quality mode (accurate costs more but better quality)
        timeout: Maximum wait time in seconds. It is converted to a poll
            budget of max(timeout // 3, 30) polls at a poll interval of 3,
            so values below 90 still allow 30 polls.
        max_pages: Maximum pages to process (Datalab limit: 7000)
        page_range: Specific pages to process, 0-indexed (e.g. "0-5,10")
        skip_cache: Force reprocessing, skip Datalab cache
        disable_image_extraction: Skip image extraction for text-only processing
        extras: Extra Datalab features (e.g. ["track_changes", "chart_understanding"])
        page_schema: JSON schema string for structured data extraction per page
        additional_config: Additional Datalab config dict
        file_url: URL of file to process (instead of local file, passed to Datalab as file_url)

    Returns:
        OCRResult with extracted text and metadata. Its datalab_request_id is
        a UUID generated locally by this function for tracking.

    Raises:
        OCRAPIError: On API errors, unsuccessful results, invalid input, and
            any unexpected exception raised while processing
        OCRRateLimitError: On 429 (wait and retry)
        OCRAuthenticationError: On 401/403
        OCRTimeoutError: On timeout
        OCRFileError: On file access issues
        ValueError: On missing API key
    """
    from datalab_sdk import ConvertOptions, DatalabClient
    from datalab_sdk.exceptions import (
        DatalabAPIError,
        DatalabFileError,
        DatalabTimeoutError,
        DatalabValidationError,
    )

    # Validate inputs
    if file_url:
        validated_path = None  # No local file when using URL
        logger.info(f"Processing document from URL: {file_url} (mode={mode})")
    else:
        validated_path = validate_file(file_path)
        logger.info(f"Processing document: {validated_path} (mode={mode})")
    api_key = get_api_key()

    # Record timing
    start_time = time.time()
    start_timestamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    # Generate unique request ID for tracking
    request_id = str(uuid.uuid4())

    try:
        # Initialize client
        client = DatalabClient(api_key=api_key)

        # Configure options - paginate=True for page offset tracking
        options = ConvertOptions(output_format="markdown,json", mode=mode, paginate=True)
        # Only set optional Datalab API params if provided
        if max_pages is not None:
            options.max_pages = max_pages
        if page_range is not None:
            options.page_range = page_range
        if skip_cache:
            options.skip_cache = True
        if disable_image_extraction:
            options.disable_image_extraction = True
        if extras:
            # SDK expects comma-separated string, not list
            options.extras = ",".join(extras) if isinstance(extras, list) else extras
        if page_schema:
            options.page_schema = page_schema
        if additional_config:
            options.additional_config = additional_config

        # Calculate max_polls based on timeout (3 second poll interval) (FIX-P2-1)
        max_polls = max(timeout // 3, 30)

        # Call Datalab API
        if file_url:
            result = client.convert(
                file_url=file_url, options=options, max_polls=max_polls, poll_interval=3
            )
        else:
            result = client.convert(
                file_path=str(validated_path),
                options=options,
                max_polls=max_polls,
                poll_interval=3,
            )

        # Record completion
        end_time = time.time()
        end_timestamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        duration_ms = int((end_time - start_time) * 1000)

        # Check for errors in result
        if not result.success:
            error_msg = result.error or "Unknown error during OCR processing"
            logger.error(f"OCR failed: {error_msg}")
            raise OCRAPIError(error_msg, status_code=500, request_id=request_id)

        # Extract data from result
        markdown = result.markdown or ""
        page_count = result.page_count or 1
        quality_score = result.parse_quality_score

        # Get cost from response
        # SDK v0.2.1 returns: {"list_cost_cents": N, "final_cost_cents": N}
        # final_cost_cents is the actual charge after any discounts
        cost_breakdown = result.cost_breakdown or {}
        cost_cents = cost_breakdown.get("final_cost_cents")
        if cost_cents is None:
            cost_cents = cost_breakdown.get("total_cost_cents")
        if cost_breakdown and cost_cents is None:
            logger.warning(
                "cost_breakdown present but no cost key found. Keys: %s",
                list(cost_breakdown.keys()),
            )

        # Capture images from Datalab response (filename -> base64 data)
        # Images are returned as a dict with filename keys and base64-encoded image data
        images = getattr(result, "images", None) or {}
        if images:
            logger.info(f"Captured {len(images)} images from Datalab response")

        # Capture JSON block hierarchy (from output_format="markdown,json")
        raw_json = getattr(result, "json", None)
        json_blocks = _as_plain_dict(raw_json)
        if raw_json is not None and json_blocks is None:
            logger.warning(f"JSON output requested but got unexpected type: {type(raw_json)}")
        if json_blocks is not None:
            children = json_blocks.get("children", json_blocks.get("blocks", []))
            logger.info(
                f"Captured JSON block hierarchy with {len(children) if isinstance(children, list) else 0} top-level blocks"
            )

        # Capture metadata (page_stats, block_counts, etc.).
        # A copy is taken because "extras_features" is added below; writing
        # into the SDK's own dict/__dict__ would alter the result object.
        metadata_dict = _as_plain_dict(getattr(result, "metadata", None))

        # Capture structured extraction result (when page_schema provided)
        extraction_json = None
        raw_extraction = getattr(result, "extraction_schema_json", None)
        if raw_extraction is not None:
            if isinstance(raw_extraction, str):
                extraction_json = json.loads(raw_extraction)
            elif isinstance(raw_extraction, (dict, list)):
                extraction_json = raw_extraction
            if extraction_json is not None:
                logger.info("Captured structured extraction data")

        # Capture extras feature data (when extras params are enabled)
        # These are returned as top-level attributes on the result object
        extras_features: dict = {}
        for extras_key in (
            "links",
            "charts",
            "tracked_changes",
            "table_row_bboxes",
            "infographics",
        ):
            val = getattr(result, extras_key, None)
            if val is not None:
                extras_features[extras_key] = val
        if extras_features:
            # Merge extras features into metadata dict for downstream storage
            if metadata_dict is None:
                metadata_dict = {}
            metadata_dict["extras_features"] = extras_features
            logger.info(f"Captured extras features: {list(extras_features.keys())}")

        # Extract document metadata fields from Datalab metadata
        doc_title = None
        doc_author = None
        doc_subject = None
        if metadata_dict:
            doc_title = metadata_dict.get("title")
            doc_author = metadata_dict.get("author")
            doc_subject = metadata_dict.get("subject")

        # Parse page offsets for provenance tracking
        page_offsets = parse_page_offsets(markdown)

        # Compute content hash (matching src/utils/hash.ts format)
        content_hash = compute_content_hash(markdown)

        ocr_result = OCRResult(
            id=str(uuid.uuid4()),
            provenance_id=provenance_id,
            document_id=document_id,
            extracted_text=markdown,
            text_length=len(markdown),
            datalab_request_id=request_id,
            datalab_mode=mode,
            parse_quality_score=quality_score,
            page_count=page_count,
            cost_cents=cost_cents,
            content_hash=content_hash,
            processing_started_at=start_timestamp,
            processing_completed_at=end_timestamp,
            processing_duration_ms=duration_ms,
            page_offsets=page_offsets,
            images=images if images else None,
            json_blocks=json_blocks,
            metadata=metadata_dict,
            extraction_json=extraction_json,
            cost_breakdown_full=cost_breakdown if cost_breakdown else None,
            doc_title=doc_title,
            doc_author=doc_author,
            doc_subject=doc_subject,
        )

        logger.info(
            f"OCR complete: {page_count} pages, {len(markdown)} chars, "
            f"{duration_ms}ms, cost=${(cost_cents or 0) / 100:.4f}"
        )

        return ocr_result

    except OCRError:
        # Already one of this worker's own errors (e.g. the unsuccessful-result
        # case above): let it through instead of wrapping it a second time.
        raise

    except DatalabAPIError as e:
        status = getattr(e, "status_code", 500)
        error_msg = str(e)
        if status == 429 or "rate limit" in error_msg.lower():
            logger.error(f"Rate limit exceeded: {e}")
            raise OCRRateLimitError(error_msg) from e
        elif status in (401, 403):
            logger.error(f"Authentication error ({status}): {e}")
            raise OCRAuthenticationError(error_msg, status) from e
        else:
            logger.error(f"API error ({status}): {e}")
            raise OCRAPIError(error_msg, status, request_id) from e

    except DatalabTimeoutError as e:
        logger.error(f"Timeout after {timeout}s: {e}")
        raise OCRTimeoutError(str(e), request_id) from e

    except DatalabFileError as e:
        logger.error(f"File error: {e}")
        raise OCRFileError(str(e), file_url or str(validated_path)) from e

    except DatalabValidationError as e:
        logger.error(f"Validation error: {e}")
        raise OCRAPIError(f"Invalid input: {e}", 400, request_id) from e

    except Exception as e:
        # Catch-all for unexpected errors - still fail fast
        logger.error(f"Unexpected error during OCR: {e}")
        raise OCRAPIError(str(e), 500, request_id) from e
564
+
565
+
566
+ # =============================================================================
567
+ # CLI INTERFACE (for manual testing)
568
+ # =============================================================================
569
+
570
+
571
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser used by main()."""
    parser = argparse.ArgumentParser(
        description="Datalab OCR Worker - Extract text from documents",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Process single PDF
  python ocr_worker.py --file ./data/bench/doc_0005.pdf --mode accurate

  # Process with JSON output
  python ocr_worker.py --file ./data/bench/doc_0005.pdf --json
        """,
    )
    parser.add_argument("--file", "-f", type=str, help="Single file to process")
    parser.add_argument(
        "--file-url", type=str, help="URL of file to process (instead of local file)"
    )
    parser.add_argument(
        "--mode",
        "-m",
        choices=["fast", "balanced", "accurate"],
        default="balanced",
        help="OCR mode (default: balanced)",
    )
    parser.add_argument(
        "--doc-id", type=str, help="Document ID (UUID) - auto-generated if not provided"
    )
    parser.add_argument(
        "--prov-id", type=str, help="Provenance ID (UUID) - auto-generated if not provided"
    )
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    # Datalab API parameters
    parser.add_argument("--max-pages", type=int, help="Max pages to process (Datalab limit: 7000)")
    parser.add_argument("--page-range", type=str, help='Page range, 0-indexed (e.g. "0-5,10")')
    parser.add_argument(
        "--skip-cache", action="store_true", help="Force reprocessing, skip Datalab cache"
    )
    parser.add_argument(
        "--disable-image-extraction", action="store_true", help="Skip image extraction"
    )
    parser.add_argument(
        "--extras",
        type=str,
        help='Comma-separated extras (e.g. "track_changes,chart_understanding")',
    )
    parser.add_argument(
        "--page-schema", type=str, help="JSON schema string for structured extraction per page"
    )
    parser.add_argument(
        "--additional-config", type=str, help="JSON string of additional Datalab config"
    )
    return parser


def main() -> None:
    """CLI entry point for manual testing.

    Parses the command line, runs process_document() once and prints the
    result to stdout, either as a single-line JSON document (--json) or as a
    short human-readable summary. On any failure the error is logged, a JSON
    error object is printed when --json is active, and the process exits
    with status 1.
    """
    # Load .env file if present
    try:
        from dotenv import load_dotenv

        env_path = Path(__file__).parent.parent / ".env"
        if env_path.exists():
            load_dotenv(env_path)
            logger.debug(f"Loaded environment from {env_path}")
    except ImportError:
        pass  # python-dotenv not installed, skip

    parser = _build_arg_parser()
    args = parser.parse_args()

    if args.json:
        # Suppress logging in JSON mode for clean output
        logging.getLogger().setLevel(logging.CRITICAL)

    if not args.file and not args.file_url:
        parser.error("Either --file or --file-url is required")

    try:
        # Use provided IDs or generate new ones
        doc_id = args.doc_id or str(uuid.uuid4())
        prov_id = args.prov_id or str(uuid.uuid4())

        # Parse extras list from comma-separated string. Surrounding
        # whitespace and empty items ("a, b" or "a,,b") are dropped so only
        # clean feature names are forwarded.
        extras_list = None
        if args.extras:
            extras_list = [item.strip() for item in args.extras.split(",") if item.strip()]
            if not extras_list:
                extras_list = None

        # Parse additional config JSON; it must decode to a JSON object
        # because process_document() expects a dict.
        additional_config = None
        if args.additional_config:
            additional_config = json.loads(args.additional_config)
            if not isinstance(additional_config, dict):
                raise ValueError("--additional-config must be a JSON object")

        result = process_document(
            args.file or "",
            document_id=doc_id,
            provenance_id=prov_id,
            mode=args.mode,
            max_pages=args.max_pages,
            page_range=args.page_range,
            skip_cache=args.skip_cache,
            disable_image_extraction=args.disable_image_extraction,
            extras=extras_list,
            page_schema=args.page_schema,
            additional_config=additional_config,
            file_url=args.file_url,
        )

        if args.json:
            # asdict() recursively converts nested dataclasses
            # Use compact format (no indent) for python-shell compatibility
            print(json.dumps(asdict(result)))
        else:
            print("=== OCR Result ===")
            print(f"Pages: {result.page_count}")
            print(f"Characters: {result.text_length}")
            print(f"Duration: {result.processing_duration_ms}ms")
            print(f"Cost: ${(result.cost_cents or 0) / 100:.4f}")
            print(f"Quality: {result.parse_quality_score}")
            print(f"Hash: {result.content_hash[:40]}...")
            print("\n=== Extracted Text (first 500 chars) ===")
            print(result.extracted_text[:500])

    except Exception as e:
        # In --json mode the root logger level is CRITICAL to keep the output
        # clean, so fatal errors are logged at CRITICAL to still reach stderr.
        if args.json:
            logger.critical(f"Fatal error: {e}", exc_info=True)
        else:
            logger.exception(f"Fatal error: {e}")
        if args.json:
            # Copy over whichever diagnostic attributes the exception carries.
            details = {}
            if hasattr(e, "status_code"):
                details["status_code"] = e.status_code
            if hasattr(e, "request_id"):
                details["request_id"] = e.request_id
            if hasattr(e, "file_path"):
                details["file_path"] = e.file_path
            print(
                json.dumps(
                    {
                        "error": str(e),
                        "category": getattr(e, "category", "OCR_API_ERROR"),
                        "details": details,
                    }
                )
            )
        sys.exit(1)
709
+
710
+
711
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()