ocr-provenance-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocr-provenance-mcp might be problematic. Click here for more details.

Files changed (578) hide show
  1. package/.env.example +55 -0
  2. package/LICENSE +78 -0
  3. package/README.md +1154 -0
  4. package/dist/bin-http.d.ts +24 -0
  5. package/dist/bin-http.d.ts.map +1 -0
  6. package/dist/bin-http.js +275 -0
  7. package/dist/bin-http.js.map +1 -0
  8. package/dist/bin-setup.d.ts +11 -0
  9. package/dist/bin-setup.d.ts.map +1 -0
  10. package/dist/bin-setup.js +610 -0
  11. package/dist/bin-setup.js.map +1 -0
  12. package/dist/bin.d.ts +16 -0
  13. package/dist/bin.d.ts.map +1 -0
  14. package/dist/bin.js +16 -0
  15. package/dist/bin.js.map +1 -0
  16. package/dist/index.d.ts +13 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +90 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/models/chunk.d.ts +136 -0
  21. package/dist/models/chunk.d.ts.map +1 -0
  22. package/dist/models/chunk.js +27 -0
  23. package/dist/models/chunk.js.map +1 -0
  24. package/dist/models/cluster.d.ts +79 -0
  25. package/dist/models/cluster.d.ts.map +1 -0
  26. package/dist/models/cluster.js +10 -0
  27. package/dist/models/cluster.js.map +1 -0
  28. package/dist/models/comparison.d.ts +62 -0
  29. package/dist/models/comparison.d.ts.map +1 -0
  30. package/dist/models/comparison.js +8 -0
  31. package/dist/models/comparison.js.map +1 -0
  32. package/dist/models/document.d.ts +104 -0
  33. package/dist/models/document.d.ts.map +1 -0
  34. package/dist/models/document.js +15 -0
  35. package/dist/models/document.js.map +1 -0
  36. package/dist/models/embedding.d.ts +87 -0
  37. package/dist/models/embedding.d.ts.map +1 -0
  38. package/dist/models/embedding.js +23 -0
  39. package/dist/models/embedding.js.map +1 -0
  40. package/dist/models/extraction.d.ts +15 -0
  41. package/dist/models/extraction.d.ts.map +1 -0
  42. package/dist/models/extraction.js +2 -0
  43. package/dist/models/extraction.js.map +1 -0
  44. package/dist/models/form-fill.d.ts +23 -0
  45. package/dist/models/form-fill.d.ts.map +1 -0
  46. package/dist/models/form-fill.js +2 -0
  47. package/dist/models/form-fill.js.map +1 -0
  48. package/dist/models/image.d.ts +177 -0
  49. package/dist/models/image.d.ts.map +1 -0
  50. package/dist/models/image.js +8 -0
  51. package/dist/models/image.js.map +1 -0
  52. package/dist/models/index.d.ts +14 -0
  53. package/dist/models/index.d.ts.map +1 -0
  54. package/dist/models/index.js +22 -0
  55. package/dist/models/index.js.map +1 -0
  56. package/dist/models/provenance.d.ts +174 -0
  57. package/dist/models/provenance.d.ts.map +1 -0
  58. package/dist/models/provenance.js +53 -0
  59. package/dist/models/provenance.js.map +1 -0
  60. package/dist/models/uploaded-file.d.ts +20 -0
  61. package/dist/models/uploaded-file.d.ts.map +1 -0
  62. package/dist/models/uploaded-file.js +2 -0
  63. package/dist/models/uploaded-file.js.map +1 -0
  64. package/dist/server/errors.d.ts +93 -0
  65. package/dist/server/errors.d.ts.map +1 -0
  66. package/dist/server/errors.js +256 -0
  67. package/dist/server/errors.js.map +1 -0
  68. package/dist/server/events.d.ts +36 -0
  69. package/dist/server/events.d.ts.map +1 -0
  70. package/dist/server/events.js +48 -0
  71. package/dist/server/events.js.map +1 -0
  72. package/dist/server/permissions.d.ts +26 -0
  73. package/dist/server/permissions.d.ts.map +1 -0
  74. package/dist/server/permissions.js +194 -0
  75. package/dist/server/permissions.js.map +1 -0
  76. package/dist/server/register-tools.d.ts +25 -0
  77. package/dist/server/register-tools.d.ts.map +1 -0
  78. package/dist/server/register-tools.js +102 -0
  79. package/dist/server/register-tools.js.map +1 -0
  80. package/dist/server/startup.d.ts +16 -0
  81. package/dist/server/startup.d.ts.map +1 -0
  82. package/dist/server/startup.js +37 -0
  83. package/dist/server/startup.js.map +1 -0
  84. package/dist/server/state.d.ts +166 -0
  85. package/dist/server/state.d.ts.map +1 -0
  86. package/dist/server/state.js +424 -0
  87. package/dist/server/state.js.map +1 -0
  88. package/dist/server/transports/http-transport.d.ts +37 -0
  89. package/dist/server/transports/http-transport.d.ts.map +1 -0
  90. package/dist/server/transports/http-transport.js +204 -0
  91. package/dist/server/transports/http-transport.js.map +1 -0
  92. package/dist/server/transports/index.d.ts +9 -0
  93. package/dist/server/transports/index.d.ts.map +1 -0
  94. package/dist/server/transports/index.js +9 -0
  95. package/dist/server/transports/index.js.map +1 -0
  96. package/dist/server/transports/session-manager.d.ts +40 -0
  97. package/dist/server/transports/session-manager.d.ts.map +1 -0
  98. package/dist/server/transports/session-manager.js +74 -0
  99. package/dist/server/transports/session-manager.js.map +1 -0
  100. package/dist/server/types.d.ts +82 -0
  101. package/dist/server/types.d.ts.map +1 -0
  102. package/dist/server/types.js +14 -0
  103. package/dist/server/types.js.map +1 -0
  104. package/dist/services/audit.d.ts +26 -0
  105. package/dist/services/audit.d.ts.map +1 -0
  106. package/dist/services/audit.js +43 -0
  107. package/dist/services/audit.js.map +1 -0
  108. package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
  109. package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
  110. package/dist/services/chunking/chunk-deduplicator.js +46 -0
  111. package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
  112. package/dist/services/chunking/chunk-merger.d.ts +26 -0
  113. package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
  114. package/dist/services/chunking/chunk-merger.js +94 -0
  115. package/dist/services/chunking/chunk-merger.js.map +1 -0
  116. package/dist/services/chunking/chunker.d.ts +62 -0
  117. package/dist/services/chunking/chunker.d.ts.map +1 -0
  118. package/dist/services/chunking/chunker.js +566 -0
  119. package/dist/services/chunking/chunker.js.map +1 -0
  120. package/dist/services/chunking/heading-normalizer.d.ts +33 -0
  121. package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
  122. package/dist/services/chunking/heading-normalizer.js +101 -0
  123. package/dist/services/chunking/heading-normalizer.js.map +1 -0
  124. package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
  125. package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
  126. package/dist/services/chunking/json-block-analyzer.js +1033 -0
  127. package/dist/services/chunking/json-block-analyzer.js.map +1 -0
  128. package/dist/services/chunking/markdown-parser.d.ts +75 -0
  129. package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
  130. package/dist/services/chunking/markdown-parser.js +428 -0
  131. package/dist/services/chunking/markdown-parser.js.map +1 -0
  132. package/dist/services/chunking/text-normalizer.d.ts +20 -0
  133. package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
  134. package/dist/services/chunking/text-normalizer.js +36 -0
  135. package/dist/services/chunking/text-normalizer.js.map +1 -0
  136. package/dist/services/clm/contract-schemas.d.ts +36 -0
  137. package/dist/services/clm/contract-schemas.d.ts.map +1 -0
  138. package/dist/services/clm/contract-schemas.js +92 -0
  139. package/dist/services/clm/contract-schemas.js.map +1 -0
  140. package/dist/services/clm/summarization.d.ts +46 -0
  141. package/dist/services/clm/summarization.d.ts.map +1 -0
  142. package/dist/services/clm/summarization.js +61 -0
  143. package/dist/services/clm/summarization.js.map +1 -0
  144. package/dist/services/clustering/clustering-service.d.ts +58 -0
  145. package/dist/services/clustering/clustering-service.d.ts.map +1 -0
  146. package/dist/services/clustering/clustering-service.js +467 -0
  147. package/dist/services/clustering/clustering-service.js.map +1 -0
  148. package/dist/services/comparison/diff-service.d.ts +41 -0
  149. package/dist/services/comparison/diff-service.d.ts.map +1 -0
  150. package/dist/services/comparison/diff-service.js +120 -0
  151. package/dist/services/comparison/diff-service.js.map +1 -0
  152. package/dist/services/embedding/embedder.d.ts +55 -0
  153. package/dist/services/embedding/embedder.d.ts.map +1 -0
  154. package/dist/services/embedding/embedder.js +202 -0
  155. package/dist/services/embedding/embedder.js.map +1 -0
  156. package/dist/services/embedding/nomic.d.ts +67 -0
  157. package/dist/services/embedding/nomic.d.ts.map +1 -0
  158. package/dist/services/embedding/nomic.js +280 -0
  159. package/dist/services/embedding/nomic.js.map +1 -0
  160. package/dist/services/gemini/circuit-breaker.d.ts +106 -0
  161. package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
  162. package/dist/services/gemini/circuit-breaker.js +237 -0
  163. package/dist/services/gemini/circuit-breaker.js.map +1 -0
  164. package/dist/services/gemini/client.d.ts +173 -0
  165. package/dist/services/gemini/client.d.ts.map +1 -0
  166. package/dist/services/gemini/client.js +483 -0
  167. package/dist/services/gemini/client.js.map +1 -0
  168. package/dist/services/gemini/config.d.ts +116 -0
  169. package/dist/services/gemini/config.d.ts.map +1 -0
  170. package/dist/services/gemini/config.js +118 -0
  171. package/dist/services/gemini/config.js.map +1 -0
  172. package/dist/services/gemini/index.d.ts +9 -0
  173. package/dist/services/gemini/index.d.ts.map +1 -0
  174. package/dist/services/gemini/index.js +13 -0
  175. package/dist/services/gemini/index.js.map +1 -0
  176. package/dist/services/gemini/rate-limiter.d.ts +62 -0
  177. package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
  178. package/dist/services/gemini/rate-limiter.js +120 -0
  179. package/dist/services/gemini/rate-limiter.js.map +1 -0
  180. package/dist/services/images/extractor.d.ts +88 -0
  181. package/dist/services/images/extractor.d.ts.map +1 -0
  182. package/dist/services/images/extractor.js +340 -0
  183. package/dist/services/images/extractor.js.map +1 -0
  184. package/dist/services/images/optimizer.d.ts +130 -0
  185. package/dist/services/images/optimizer.d.ts.map +1 -0
  186. package/dist/services/images/optimizer.js +228 -0
  187. package/dist/services/images/optimizer.js.map +1 -0
  188. package/dist/services/ocr/datalab.d.ts +64 -0
  189. package/dist/services/ocr/datalab.d.ts.map +1 -0
  190. package/dist/services/ocr/datalab.js +425 -0
  191. package/dist/services/ocr/datalab.js.map +1 -0
  192. package/dist/services/ocr/errors.d.ts +38 -0
  193. package/dist/services/ocr/errors.d.ts.map +1 -0
  194. package/dist/services/ocr/errors.js +83 -0
  195. package/dist/services/ocr/errors.js.map +1 -0
  196. package/dist/services/ocr/file-manager.d.ts +76 -0
  197. package/dist/services/ocr/file-manager.d.ts.map +1 -0
  198. package/dist/services/ocr/file-manager.js +238 -0
  199. package/dist/services/ocr/file-manager.js.map +1 -0
  200. package/dist/services/ocr/form-fill.d.ts +48 -0
  201. package/dist/services/ocr/form-fill.d.ts.map +1 -0
  202. package/dist/services/ocr/form-fill.js +213 -0
  203. package/dist/services/ocr/form-fill.js.map +1 -0
  204. package/dist/services/ocr/processor.d.ts +95 -0
  205. package/dist/services/ocr/processor.d.ts.map +1 -0
  206. package/dist/services/ocr/processor.js +259 -0
  207. package/dist/services/ocr/processor.js.map +1 -0
  208. package/dist/services/provenance/agent-metadata.d.ts +82 -0
  209. package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
  210. package/dist/services/provenance/agent-metadata.js +106 -0
  211. package/dist/services/provenance/agent-metadata.js.map +1 -0
  212. package/dist/services/provenance/chain-hash.d.ts +57 -0
  213. package/dist/services/provenance/chain-hash.d.ts.map +1 -0
  214. package/dist/services/provenance/chain-hash.js +131 -0
  215. package/dist/services/provenance/chain-hash.js.map +1 -0
  216. package/dist/services/provenance/exporter.d.ts +202 -0
  217. package/dist/services/provenance/exporter.d.ts.map +1 -0
  218. package/dist/services/provenance/exporter.js +457 -0
  219. package/dist/services/provenance/exporter.js.map +1 -0
  220. package/dist/services/provenance/index.d.ts +15 -0
  221. package/dist/services/provenance/index.d.ts.map +1 -0
  222. package/dist/services/provenance/index.js +17 -0
  223. package/dist/services/provenance/index.js.map +1 -0
  224. package/dist/services/provenance/tracker.d.ts +138 -0
  225. package/dist/services/provenance/tracker.d.ts.map +1 -0
  226. package/dist/services/provenance/tracker.js +293 -0
  227. package/dist/services/provenance/tracker.js.map +1 -0
  228. package/dist/services/provenance/verifier.d.ts +153 -0
  229. package/dist/services/provenance/verifier.d.ts.map +1 -0
  230. package/dist/services/provenance/verifier.js +536 -0
  231. package/dist/services/provenance/verifier.js.map +1 -0
  232. package/dist/services/python-pool.d.ts +70 -0
  233. package/dist/services/python-pool.d.ts.map +1 -0
  234. package/dist/services/python-pool.js +265 -0
  235. package/dist/services/python-pool.js.map +1 -0
  236. package/dist/services/search/bm25.d.ts +180 -0
  237. package/dist/services/search/bm25.d.ts.map +1 -0
  238. package/dist/services/search/bm25.js +656 -0
  239. package/dist/services/search/bm25.js.map +1 -0
  240. package/dist/services/search/fusion.d.ts +103 -0
  241. package/dist/services/search/fusion.d.ts.map +1 -0
  242. package/dist/services/search/fusion.js +122 -0
  243. package/dist/services/search/fusion.js.map +1 -0
  244. package/dist/services/search/local-reranker.d.ts +30 -0
  245. package/dist/services/search/local-reranker.d.ts.map +1 -0
  246. package/dist/services/search/local-reranker.js +123 -0
  247. package/dist/services/search/local-reranker.js.map +1 -0
  248. package/dist/services/search/quality.d.ts +11 -0
  249. package/dist/services/search/quality.d.ts.map +1 -0
  250. package/dist/services/search/quality.js +17 -0
  251. package/dist/services/search/quality.js.map +1 -0
  252. package/dist/services/search/query-classifier.d.ts +34 -0
  253. package/dist/services/search/query-classifier.d.ts.map +1 -0
  254. package/dist/services/search/query-classifier.js +114 -0
  255. package/dist/services/search/query-classifier.js.map +1 -0
  256. package/dist/services/search/query-expander.d.ts +73 -0
  257. package/dist/services/search/query-expander.d.ts.map +1 -0
  258. package/dist/services/search/query-expander.js +281 -0
  259. package/dist/services/search/query-expander.js.map +1 -0
  260. package/dist/services/search/reranker.d.ts +44 -0
  261. package/dist/services/search/reranker.d.ts.map +1 -0
  262. package/dist/services/search/reranker.js +101 -0
  263. package/dist/services/search/reranker.js.map +1 -0
  264. package/dist/services/storage/database/annotation-operations.d.ts +113 -0
  265. package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
  266. package/dist/services/storage/database/annotation-operations.js +177 -0
  267. package/dist/services/storage/database/annotation-operations.js.map +1 -0
  268. package/dist/services/storage/database/approval-operations.d.ts +132 -0
  269. package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
  270. package/dist/services/storage/database/approval-operations.js +206 -0
  271. package/dist/services/storage/database/approval-operations.js.map +1 -0
  272. package/dist/services/storage/database/chunk-operations.d.ts +132 -0
  273. package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
  274. package/dist/services/storage/database/chunk-operations.js +306 -0
  275. package/dist/services/storage/database/chunk-operations.js.map +1 -0
  276. package/dist/services/storage/database/cluster-operations.d.ts +97 -0
  277. package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
  278. package/dist/services/storage/database/cluster-operations.js +258 -0
  279. package/dist/services/storage/database/cluster-operations.js.map +1 -0
  280. package/dist/services/storage/database/comparison-operations.d.ts +41 -0
  281. package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
  282. package/dist/services/storage/database/comparison-operations.js +65 -0
  283. package/dist/services/storage/database/comparison-operations.js.map +1 -0
  284. package/dist/services/storage/database/converters.d.ts +36 -0
  285. package/dist/services/storage/database/converters.d.ts.map +1 -0
  286. package/dist/services/storage/database/converters.js +244 -0
  287. package/dist/services/storage/database/converters.js.map +1 -0
  288. package/dist/services/storage/database/document-operations.d.ts +145 -0
  289. package/dist/services/storage/database/document-operations.d.ts.map +1 -0
  290. package/dist/services/storage/database/document-operations.js +498 -0
  291. package/dist/services/storage/database/document-operations.js.map +1 -0
  292. package/dist/services/storage/database/embedding-operations.d.ts +130 -0
  293. package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
  294. package/dist/services/storage/database/embedding-operations.js +315 -0
  295. package/dist/services/storage/database/embedding-operations.js.map +1 -0
  296. package/dist/services/storage/database/extraction-operations.d.ts +47 -0
  297. package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
  298. package/dist/services/storage/database/extraction-operations.js +85 -0
  299. package/dist/services/storage/database/extraction-operations.js.map +1 -0
  300. package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
  301. package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
  302. package/dist/services/storage/database/form-fill-operations.js +116 -0
  303. package/dist/services/storage/database/form-fill-operations.js.map +1 -0
  304. package/dist/services/storage/database/helpers.d.ts +29 -0
  305. package/dist/services/storage/database/helpers.d.ts.map +1 -0
  306. package/dist/services/storage/database/helpers.js +55 -0
  307. package/dist/services/storage/database/helpers.js.map +1 -0
  308. package/dist/services/storage/database/image-operations.d.ts +202 -0
  309. package/dist/services/storage/database/image-operations.d.ts.map +1 -0
  310. package/dist/services/storage/database/image-operations.js +484 -0
  311. package/dist/services/storage/database/image-operations.js.map +1 -0
  312. package/dist/services/storage/database/index.d.ts +13 -0
  313. package/dist/services/storage/database/index.d.ts.map +1 -0
  314. package/dist/services/storage/database/index.js +16 -0
  315. package/dist/services/storage/database/index.js.map +1 -0
  316. package/dist/services/storage/database/lock-operations.d.ts +59 -0
  317. package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
  318. package/dist/services/storage/database/lock-operations.js +89 -0
  319. package/dist/services/storage/database/lock-operations.js.map +1 -0
  320. package/dist/services/storage/database/obligation-operations.d.ts +88 -0
  321. package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
  322. package/dist/services/storage/database/obligation-operations.js +206 -0
  323. package/dist/services/storage/database/obligation-operations.js.map +1 -0
  324. package/dist/services/storage/database/ocr-operations.d.ts +33 -0
  325. package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
  326. package/dist/services/storage/database/ocr-operations.js +70 -0
  327. package/dist/services/storage/database/ocr-operations.js.map +1 -0
  328. package/dist/services/storage/database/playbook-operations.d.ts +72 -0
  329. package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
  330. package/dist/services/storage/database/playbook-operations.js +247 -0
  331. package/dist/services/storage/database/playbook-operations.js.map +1 -0
  332. package/dist/services/storage/database/provenance-operations.d.ts +112 -0
  333. package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
  334. package/dist/services/storage/database/provenance-operations.js +251 -0
  335. package/dist/services/storage/database/provenance-operations.js.map +1 -0
  336. package/dist/services/storage/database/service.d.ts +142 -0
  337. package/dist/services/storage/database/service.d.ts.map +1 -0
  338. package/dist/services/storage/database/service.js +310 -0
  339. package/dist/services/storage/database/service.js.map +1 -0
  340. package/dist/services/storage/database/static-operations.d.ts +30 -0
  341. package/dist/services/storage/database/static-operations.d.ts.map +1 -0
  342. package/dist/services/storage/database/static-operations.js +218 -0
  343. package/dist/services/storage/database/static-operations.js.map +1 -0
  344. package/dist/services/storage/database/stats-operations.d.ts +101 -0
  345. package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
  346. package/dist/services/storage/database/stats-operations.js +394 -0
  347. package/dist/services/storage/database/stats-operations.js.map +1 -0
  348. package/dist/services/storage/database/tag-operations.d.ts +76 -0
  349. package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
  350. package/dist/services/storage/database/tag-operations.js +178 -0
  351. package/dist/services/storage/database/tag-operations.js.map +1 -0
  352. package/dist/services/storage/database/types.d.ts +286 -0
  353. package/dist/services/storage/database/types.d.ts.map +1 -0
  354. package/dist/services/storage/database/types.js +39 -0
  355. package/dist/services/storage/database/types.js.map +1 -0
  356. package/dist/services/storage/database/upload-operations.d.ts +71 -0
  357. package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
  358. package/dist/services/storage/database/upload-operations.js +124 -0
  359. package/dist/services/storage/database/upload-operations.js.map +1 -0
  360. package/dist/services/storage/database/user-operations.d.ts +102 -0
  361. package/dist/services/storage/database/user-operations.d.ts.map +1 -0
  362. package/dist/services/storage/database/user-operations.js +151 -0
  363. package/dist/services/storage/database/user-operations.js.map +1 -0
  364. package/dist/services/storage/database/workflow-operations.d.ts +98 -0
  365. package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
  366. package/dist/services/storage/database/workflow-operations.js +157 -0
  367. package/dist/services/storage/database/workflow-operations.js.map +1 -0
  368. package/dist/services/storage/database.d.ts +16 -0
  369. package/dist/services/storage/database.d.ts.map +1 -0
  370. package/dist/services/storage/database.js +15 -0
  371. package/dist/services/storage/database.js.map +1 -0
  372. package/dist/services/storage/index.d.ts +10 -0
  373. package/dist/services/storage/index.d.ts.map +1 -0
  374. package/dist/services/storage/index.js +10 -0
  375. package/dist/services/storage/index.js.map +1 -0
  376. package/dist/services/storage/migrations/index.d.ts +16 -0
  377. package/dist/services/storage/migrations/index.d.ts.map +1 -0
  378. package/dist/services/storage/migrations/index.js +20 -0
  379. package/dist/services/storage/migrations/index.js.map +1 -0
  380. package/dist/services/storage/migrations/operations.d.ts +40 -0
  381. package/dist/services/storage/migrations/operations.d.ts.map +1 -0
  382. package/dist/services/storage/migrations/operations.js +2910 -0
  383. package/dist/services/storage/migrations/operations.js.map +1 -0
  384. package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
  385. package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
  386. package/dist/services/storage/migrations/schema-definitions.js +1006 -0
  387. package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
  388. package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
  389. package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
  390. package/dist/services/storage/migrations/schema-helpers.js +176 -0
  391. package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
  392. package/dist/services/storage/migrations/types.d.ts +15 -0
  393. package/dist/services/storage/migrations/types.d.ts.map +1 -0
  394. package/dist/services/storage/migrations/types.js +21 -0
  395. package/dist/services/storage/migrations/types.js.map +1 -0
  396. package/dist/services/storage/migrations/verification.d.ts +20 -0
  397. package/dist/services/storage/migrations/verification.d.ts.map +1 -0
  398. package/dist/services/storage/migrations/verification.js +78 -0
  399. package/dist/services/storage/migrations/verification.js.map +1 -0
  400. package/dist/services/storage/migrations.d.ts +16 -0
  401. package/dist/services/storage/migrations.d.ts.map +1 -0
  402. package/dist/services/storage/migrations.js +17 -0
  403. package/dist/services/storage/migrations.js.map +1 -0
  404. package/dist/services/storage/types.d.ts +12 -0
  405. package/dist/services/storage/types.d.ts.map +1 -0
  406. package/dist/services/storage/types.js +5 -0
  407. package/dist/services/storage/types.js.map +1 -0
  408. package/dist/services/storage/vector.d.ts +208 -0
  409. package/dist/services/storage/vector.d.ts.map +1 -0
  410. package/dist/services/storage/vector.js +526 -0
  411. package/dist/services/storage/vector.js.map +1 -0
  412. package/dist/services/vlm/pipeline.d.ts +194 -0
  413. package/dist/services/vlm/pipeline.d.ts.map +1 -0
  414. package/dist/services/vlm/pipeline.js +800 -0
  415. package/dist/services/vlm/pipeline.js.map +1 -0
  416. package/dist/services/vlm/prompts.d.ts +171 -0
  417. package/dist/services/vlm/prompts.d.ts.map +1 -0
  418. package/dist/services/vlm/prompts.js +229 -0
  419. package/dist/services/vlm/prompts.js.map +1 -0
  420. package/dist/services/vlm/service.d.ts +174 -0
  421. package/dist/services/vlm/service.d.ts.map +1 -0
  422. package/dist/services/vlm/service.js +256 -0
  423. package/dist/services/vlm/service.js.map +1 -0
  424. package/dist/services/webhook-delivery.d.ts +4 -0
  425. package/dist/services/webhook-delivery.d.ts.map +1 -0
  426. package/dist/services/webhook-delivery.js +140 -0
  427. package/dist/services/webhook-delivery.js.map +1 -0
  428. package/dist/tools/chunks.d.ts +19 -0
  429. package/dist/tools/chunks.d.ts.map +1 -0
  430. package/dist/tools/chunks.js +392 -0
  431. package/dist/tools/chunks.js.map +1 -0
  432. package/dist/tools/clm.d.ts +16 -0
  433. package/dist/tools/clm.d.ts.map +1 -0
  434. package/dist/tools/clm.js +668 -0
  435. package/dist/tools/clm.js.map +1 -0
  436. package/dist/tools/clustering.d.ts +13 -0
  437. package/dist/tools/clustering.d.ts.map +1 -0
  438. package/dist/tools/clustering.js +498 -0
  439. package/dist/tools/clustering.js.map +1 -0
  440. package/dist/tools/collaboration.d.ts +15 -0
  441. package/dist/tools/collaboration.d.ts.map +1 -0
  442. package/dist/tools/collaboration.js +516 -0
  443. package/dist/tools/collaboration.js.map +1 -0
  444. package/dist/tools/comparison.d.ts +13 -0
  445. package/dist/tools/comparison.d.ts.map +1 -0
  446. package/dist/tools/comparison.js +735 -0
  447. package/dist/tools/comparison.js.map +1 -0
  448. package/dist/tools/compliance.d.ts +15 -0
  449. package/dist/tools/compliance.d.ts.map +1 -0
  450. package/dist/tools/compliance.js +640 -0
  451. package/dist/tools/compliance.js.map +1 -0
  452. package/dist/tools/config.d.ts +19 -0
  453. package/dist/tools/config.d.ts.map +1 -0
  454. package/dist/tools/config.js +213 -0
  455. package/dist/tools/config.js.map +1 -0
  456. package/dist/tools/database.d.ts +62 -0
  457. package/dist/tools/database.d.ts.map +1 -0
  458. package/dist/tools/database.js +288 -0
  459. package/dist/tools/database.js.map +1 -0
  460. package/dist/tools/documents.d.ts +61 -0
  461. package/dist/tools/documents.d.ts.map +1 -0
  462. package/dist/tools/documents.js +1624 -0
  463. package/dist/tools/documents.js.map +1 -0
  464. package/dist/tools/embeddings.d.ts +14 -0
  465. package/dist/tools/embeddings.d.ts.map +1 -0
  466. package/dist/tools/embeddings.js +626 -0
  467. package/dist/tools/embeddings.js.map +1 -0
  468. package/dist/tools/evaluation.d.ts +25 -0
  469. package/dist/tools/evaluation.d.ts.map +1 -0
  470. package/dist/tools/evaluation.js +523 -0
  471. package/dist/tools/evaluation.js.map +1 -0
  472. package/dist/tools/events.d.ts +16 -0
  473. package/dist/tools/events.d.ts.map +1 -0
  474. package/dist/tools/events.js +493 -0
  475. package/dist/tools/events.js.map +1 -0
  476. package/dist/tools/extraction-structured.d.ts +13 -0
  477. package/dist/tools/extraction-structured.d.ts.map +1 -0
  478. package/dist/tools/extraction-structured.js +390 -0
  479. package/dist/tools/extraction-structured.js.map +1 -0
  480. package/dist/tools/extraction.d.ts +24 -0
  481. package/dist/tools/extraction.d.ts.map +1 -0
  482. package/dist/tools/extraction.js +424 -0
  483. package/dist/tools/extraction.js.map +1 -0
  484. package/dist/tools/file-management.d.ts +14 -0
  485. package/dist/tools/file-management.d.ts.map +1 -0
  486. package/dist/tools/file-management.js +523 -0
  487. package/dist/tools/file-management.js.map +1 -0
  488. package/dist/tools/form-fill.d.ts +13 -0
  489. package/dist/tools/form-fill.d.ts.map +1 -0
  490. package/dist/tools/form-fill.js +250 -0
  491. package/dist/tools/form-fill.js.map +1 -0
  492. package/dist/tools/health.d.ts +19 -0
  493. package/dist/tools/health.d.ts.map +1 -0
  494. package/dist/tools/health.js +229 -0
  495. package/dist/tools/health.js.map +1 -0
  496. package/dist/tools/images.d.ts +54 -0
  497. package/dist/tools/images.d.ts.map +1 -0
  498. package/dist/tools/images.js +787 -0
  499. package/dist/tools/images.js.map +1 -0
  500. package/dist/tools/ingestion.d.ts +94 -0
  501. package/dist/tools/ingestion.d.ts.map +1 -0
  502. package/dist/tools/ingestion.js +1659 -0
  503. package/dist/tools/ingestion.js.map +1 -0
  504. package/dist/tools/intelligence.d.ts +18 -0
  505. package/dist/tools/intelligence.d.ts.map +1 -0
  506. package/dist/tools/intelligence.js +1039 -0
  507. package/dist/tools/intelligence.js.map +1 -0
  508. package/dist/tools/provenance.d.ts +51 -0
  509. package/dist/tools/provenance.d.ts.map +1 -0
  510. package/dist/tools/provenance.js +691 -0
  511. package/dist/tools/provenance.js.map +1 -0
  512. package/dist/tools/reports.d.ts +41 -0
  513. package/dist/tools/reports.d.ts.map +1 -0
  514. package/dist/tools/reports.js +1394 -0
  515. package/dist/tools/reports.js.map +1 -0
  516. package/dist/tools/search.d.ts +35 -0
  517. package/dist/tools/search.d.ts.map +1 -0
  518. package/dist/tools/search.js +2528 -0
  519. package/dist/tools/search.js.map +1 -0
  520. package/dist/tools/shared.d.ts +52 -0
  521. package/dist/tools/shared.d.ts.map +1 -0
  522. package/dist/tools/shared.js +54 -0
  523. package/dist/tools/shared.js.map +1 -0
  524. package/dist/tools/tags.d.ts +15 -0
  525. package/dist/tools/tags.d.ts.map +1 -0
  526. package/dist/tools/tags.js +287 -0
  527. package/dist/tools/tags.js.map +1 -0
  528. package/dist/tools/timeline.d.ts +15 -0
  529. package/dist/tools/timeline.d.ts.map +1 -0
  530. package/dist/tools/timeline.js +14 -0
  531. package/dist/tools/timeline.js.map +1 -0
  532. package/dist/tools/users.d.ts +14 -0
  533. package/dist/tools/users.d.ts.map +1 -0
  534. package/dist/tools/users.js +257 -0
  535. package/dist/tools/users.js.map +1 -0
  536. package/dist/tools/vlm.d.ts +40 -0
  537. package/dist/tools/vlm.d.ts.map +1 -0
  538. package/dist/tools/vlm.js +475 -0
  539. package/dist/tools/vlm.js.map +1 -0
  540. package/dist/tools/workflow.d.ts +16 -0
  541. package/dist/tools/workflow.d.ts.map +1 -0
  542. package/dist/tools/workflow.js +495 -0
  543. package/dist/tools/workflow.js.map +1 -0
  544. package/dist/utils/backoff.d.ts +53 -0
  545. package/dist/utils/backoff.d.ts.map +1 -0
  546. package/dist/utils/backoff.js +78 -0
  547. package/dist/utils/backoff.js.map +1 -0
  548. package/dist/utils/config-persistence.d.ts +33 -0
  549. package/dist/utils/config-persistence.d.ts.map +1 -0
  550. package/dist/utils/config-persistence.js +61 -0
  551. package/dist/utils/config-persistence.js.map +1 -0
  552. package/dist/utils/hash.d.ts +65 -0
  553. package/dist/utils/hash.d.ts.map +1 -0
  554. package/dist/utils/hash.js +146 -0
  555. package/dist/utils/hash.js.map +1 -0
  556. package/dist/utils/math.d.ts +21 -0
  557. package/dist/utils/math.d.ts.map +1 -0
  558. package/dist/utils/math.js +39 -0
  559. package/dist/utils/math.js.map +1 -0
  560. package/dist/utils/validation.d.ts +697 -0
  561. package/dist/utils/validation.d.ts.map +1 -0
  562. package/dist/utils/validation.js +529 -0
  563. package/dist/utils/validation.js.map +1 -0
  564. package/package.json +96 -0
  565. package/python/.gitkeep +0 -0
  566. package/python/__init__.py +104 -0
  567. package/python/clustering_worker.py +440 -0
  568. package/python/docx_image_extractor.py +524 -0
  569. package/python/embedding_worker.py +552 -0
  570. package/python/file_manager_worker.py +564 -0
  571. package/python/form_fill_worker.py +399 -0
  572. package/python/gpu_utils.py +582 -0
  573. package/python/image_extractor.py +317 -0
  574. package/python/image_optimizer.py +444 -0
  575. package/python/ocr_worker.py +712 -0
  576. package/python/pyproject.toml +76 -0
  577. package/python/requirements.txt +51 -0
  578. package/python/reranker_worker.py +87 -0
@@ -0,0 +1,101 @@
1
+ /**
2
+ * Heading Level Normalizer for Section-Aware Chunking
3
+ *
4
+ * Fixes inconsistent heading levels from Datalab OCR by detecting
5
+ * repeating heading patterns (e.g., "ARTICLE N") and normalizing
6
+ * their heading levels to the mode (most common) level within each group.
7
+ *
8
+ * @module services/chunking/heading-normalizer
9
+ */
10
+ /**
11
+ * Patterns that identify structural heading groups in legal/organizational documents. Patterns are tried in array order and a heading joins the first group whose regex matches.
12
+ * Each regex is anchored at the start of the heading text (not the markdown # prefix) and is case-insensitive.
13
+ * Bold-wrapped text (e.g., **ARTICLE 1**) is stripped before matching. NOTE(review): APPENDIX, SCHEDULE and EXHIBIT accept any single letter or digit after the whitespace, so headings such as "Schedule of Fees" also match.
14
+ */
15
+ const HEADING_PATTERNS = [
16
+ { name: 'ARTICLE', regex: /^ARTICLE\s+\d+/i },
17
+ { name: 'SECTION', regex: /^SECTION\s+\d+(\.\d+)*/i },
18
+ { name: 'CHAPTER', regex: /^CHAPTER\s+\d+/i },
19
+ { name: 'PART', regex: /^PART\s+\d+/i },
20
+ { name: 'TITLE', regex: /^TITLE\s+\d+/i },
21
+ { name: 'APPENDIX', regex: /^APPENDIX\s+[A-Z0-9]/i },
22
+ { name: 'SCHEDULE', regex: /^SCHEDULE\s+[A-Z0-9]/i },
23
+ { name: 'EXHIBIT', regex: /^EXHIBIT\s+[A-Z0-9]/i },
24
+ ];
25
/**
 * Strip a bold wrapper (**text**) from heading text for pattern matching.
 *
 * Surrounding whitespace is removed BEFORE the anchored replace so that a
 * bold-wrapped heading that arrives padded (e.g. "  **ARTICLE 1**  ") is still
 * recognized; previously the trim ran only afterwards, so such input kept its
 * asterisks and could never match a structural heading pattern.
 * The result is trimmed once more to drop whitespace just inside the markers.
 * Text that is not entirely wrapped in bold markers is returned trimmed but
 * otherwise unchanged.
 *
 * @param text - Raw heading text (without the markdown # prefix)
 * @returns The heading text without its enclosing bold markers
 */
function stripBold(text) {
    return text.trim().replace(/^\*\*(.+)\*\*$/, '$1').trim();
}
31
+ /**
32
+ * Compute the mode (most frequent value) of a number array by tallying occurrences in a Map.
33
+ * Ties are broken by preferring the smaller value. Returns undefined when the array is empty.
34
+ */
35
+ function computeMode(values) {
36
+ const counts = new Map();
37
+ for (const v of values) {
38
+ counts.set(v, (counts.get(v) ?? 0) + 1);
39
+ }
40
+ let modeValue = values[0];
41
+ let modeCount = 0;
42
+ for (const [val, count] of counts) {
43
+ if (count > modeCount || (count === modeCount && val < modeValue)) {
44
+ modeValue = val;
45
+ modeCount = count;
46
+ }
47
+ }
48
+ return modeValue;
49
+ }
50
+ /**
51
+ * Normalize heading levels in-place for consistent section hierarchy.
52
+ *
53
+ * Groups headings by structural patterns (ARTICLE N, Section N.N, etc.),
54
+ * then normalizes each group to use the mode heading level (ties resolve to the smaller level). This fixes
55
+ * Datalab OCR inconsistencies where identical structural headings get
56
+ * assigned different levels (e.g., ARTICLE 1 as H1 but ARTICLE 5 as H3).
57
+ *
58
+ * Only mutates `block.headingLevel` - never modifies `block.text`. Blocks that are not headings, or whose `headingLevel` or `headingText` is null, are skipped.
59
+ *
60
+ * @param blocks - Parsed markdown blocks (mutated in-place); each is read for `type`, `headingLevel` and `headingText`
61
+ * @param config - Normalization configuration: `enabled` (when falsy, `blocks` is returned untouched) and optional `minPatternCount` (minimum number of headings a group needs before it is normalized; defaults to 3)
62
+ * @returns The same blocks array (for chaining convenience)
63
+ */
64
+ export function normalizeHeadingLevels(blocks, config) {
65
+ if (!config.enabled) {
66
+ return blocks;
67
+ }
68
+ const minCount = config.minPatternCount ?? 3;
69
+ // One accumulator per pattern, index-aligned with HEADING_PATTERNS
70
+ const groups = HEADING_PATTERNS.map(p => ({
71
+ name: p.name,
72
+ blockIndices: [],
73
+ levels: [],
74
+ }));
75
+ for (let i = 0; i < blocks.length; i++) {
76
+ const block = blocks[i];
77
+ if (block.type !== 'heading' || block.headingLevel === null || block.headingText === null) {
78
+ continue;
79
+ }
80
+ const cleanText = stripBold(block.headingText);
81
+ for (let g = 0; g < HEADING_PATTERNS.length; g++) {
82
+ if (HEADING_PATTERNS[g].regex.test(cleanText)) {
83
+ groups[g].blockIndices.push(i);
84
+ groups[g].levels.push(block.headingLevel);
85
+ break; // First matching pattern wins: a heading belongs to at most one pattern group
86
+ }
87
+ }
88
+ }
89
+ // Normalize only groups with at least minCount headings; smaller groups keep their original levels
90
+ for (const group of groups) {
91
+ if (group.blockIndices.length < minCount) {
92
+ continue;
93
+ }
94
+ const targetLevel = computeMode(group.levels);
95
+ for (const blockIdx of group.blockIndices) {
96
+ blocks[blockIdx].headingLevel = targetLevel;
97
+ }
98
+ }
99
+ return blocks;
100
+ }
101
+ //# sourceMappingURL=heading-normalizer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"heading-normalizer.js","sourceRoot":"","sources":["../../../src/services/chunking/heading-normalizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAsBH;;;;GAIG;AACH,MAAM,gBAAgB,GAA2C;IAC/D,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,iBAAiB,EAAE;IAC7C,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,yBAAyB,EAAE;IACrD,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,iBAAiB,EAAE;IAC7C,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,cAAc,EAAE;IACvC,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,eAAe,EAAE;IACzC,EAAE,IAAI,EAAE,UAAU,EAAE,KAAK,EAAE,uBAAuB,EAAE;IACpD,EAAE,IAAI,EAAE,UAAU,EAAE,KAAK,EAAE,uBAAuB,EAAE;IACpD,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,sBAAsB,EAAE;CACnD,CAAC;AAEF;;GAEG;AACH,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,IAAI,CAAC,OAAO,CAAC,gBAAgB,EAAE,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;AACrD,CAAC;AAED;;;GAGG;AACH,SAAS,WAAW,CAAC,MAAgB;IACnC,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAC;IACzC,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;QACvB,MAAM,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1C,CAAC;IAED,IAAI,SAAS,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;IAC1B,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,EAAE,CAAC;QAClC,IAAI,KAAK,GAAG,SAAS,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,GAAG,GAAG,SAAS,CAAC,EAAE,CAAC;YAClE,SAAS,GAAG,GAAG,CAAC;YAChB,SAAS,GAAG,KAAK,CAAC;QACpB,CAAC;IACH,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAED;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,sBAAsB,CACpC,MAAuB,EACvB,MAAkC;IAElC,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;QACpB,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,MAAM,QAAQ,GAAG,MAAM,CAAC,eAAe,IAAI,CAAC,CAAC;IAE7C,uBAAuB;IACvB,MAAM,MAAM,GAAmB,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACxD,IAAI,EAAE,CAAC,CAAC,IAAI;QACZ,YAAY,EAAE,EAAE;QAChB,MAAM,EAAE,EAAE;KACX,CAAC,CAAC,CAAC;IAEJ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;QACxB,IAAI,KAAK,CAAC,IAAI,KAAK,SAAS,IAAI,KAAK,CAAC,YAAY,KAAK,IAAI,IAAI,KAAK,CAAC,WAAW,KAAK,IAAI,EAAE,CAAC;YAC1F,SAAS;QACX,CAAC;QAED,MAAM,SAAS,GAAG,SAAS,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC;
QAE/C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,gBAAgB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjD,IAAI,gBAAgB,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;gBAC9C,MAAM,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;gBAC/B,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;gBAC1C,MAAM,CAAC,iDAAiD;YAC1D,CAAC;QACH,CAAC;IACH,CAAC;IAED,yDAAyD;IACzD,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,IAAI,KAAK,CAAC,YAAY,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;YACzC,SAAS;QACX,CAAC;QAED,MAAM,WAAW,GAAG,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QAE9C,KAAK,MAAM,QAAQ,IAAI,KAAK,CAAC,YAAY,EAAE,CAAC;YAC1C,MAAM,CAAC,QAAQ,CAAC,CAAC,YAAY,GAAG,WAAW,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,163 @@
1
+ /**
2
+ * JSON Block Analyzer for Section-Aware Chunking
3
+ *
4
+ * Analyzes Datalab JSON block hierarchy to identify atomic (unsplittable)
5
+ * regions such as tables, figures, and code blocks. These regions inform
6
+ * the hybrid chunker where it must NOT split text.
7
+ *
8
+ * @module services/chunking/json-block-analyzer
9
+ */
10
+ import { PageOffset } from '../../models/document.js';
11
+ /** A region in the markdown text that should not be split */
12
+ export interface AtomicRegion {
13
+ startOffset: number;
14
+ endOffset: number;
15
+ blockType: string;
16
+ pageNumber: number | null;
17
+ }
18
+ /**
19
+ * Find atomic (unsplittable) regions in the markdown text by analyzing JSON blocks.
20
+ *
21
+ * Walks the Datalab JSON block tree, locates Table, TableGroup, Figure, FigureGroup,
22
+ * and Code blocks, then finds their approximate positions in the markdown text using
23
+ * fuzzy text matching. Returns sorted, non-overlapping regions.
24
+ *
25
+ * @param jsonBlocks - The JSON block hierarchy from Datalab OCR (may be null)
26
+ * @param markdownText - The full markdown text to search within
27
+ * @param pageOffsets - Page offset information for page number assignment
28
+ * @returns Sorted array of AtomicRegion representing unsplittable text spans
29
+ */
30
+ export declare function findAtomicRegions(jsonBlocks: Record<string, unknown> | null, markdownText: string, pageOffsets: PageOffset[]): AtomicRegion[];
31
+ /**
32
+ * Check if a character offset falls within an atomic region.
33
+ *
34
+ * Uses binary search on the sorted regions array for efficient lookup.
35
+ *
36
+ * @param offset - The character offset to check
37
+ * @param regions - Sorted array of AtomicRegion (from findAtomicRegions)
38
+ * @returns The containing AtomicRegion, or null if offset is not in any region
39
+ */
40
+ export declare function isOffsetInAtomicRegion(offset: number, regions: AtomicRegion[]): AtomicRegion | null;
41
+ /** Statistics about block types found in the JSON block hierarchy */
42
+ export interface BlockTypeStats {
43
+ total_blocks: number;
44
+ text_blocks: number;
45
+ table_blocks: number;
46
+ figure_blocks: number;
47
+ code_blocks: number;
48
+ list_blocks: number;
49
+ header_blocks: number;
50
+ footer_blocks: number;
51
+ heading_blocks: number;
52
+ page_count: number;
53
+ tables_per_page: number;
54
+ figures_per_page: number;
55
+ text_density: number;
56
+ }
57
+ /**
58
+ * Walk the JSON block tree and count block types to produce statistics.
59
+ *
60
+ * Recognizes: Text, Table, TableGroup, Figure, FigureGroup, Code,
61
+ * ListItem, List, PageHeader, PageFooter, SectionHeader, Title, Page.
62
+ *
63
+ * @param jsonBlocks - The JSON block hierarchy from Datalab OCR (may be null)
64
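+ * @returns BlockTypeStats with counts and derived ratios, or null (the declared return type is nullable; presumably when jsonBlocks is null - confirm against the implementation)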
65
+ */
66
+ export declare function computeBlockTypeStats(jsonBlocks: Record<string, unknown> | null): BlockTypeStats | null;
67
+ /**
68
+ * Confidence scores for block types, used to compute chunk quality from
69
+ * the block types present in a chunk. Higher values indicate more structured
70
+ * and typically more reliable content.
71
+ */
72
+ export declare const BLOCK_TYPE_CONFIDENCE: Record<string, number>;
73
+ /**
74
+ * Compute a confidence score for a chunk based on the block types it contains.
75
+ *
76
+ * Returns the average confidence across all content types in the chunk.
77
+ * Unknown block types default to 0.7. An empty content types array also
78
+ * defaults to 0.7.
79
+ *
80
+ * @param contentTypes - Array of block type strings from the chunk
81
+ * @returns Confidence score between 0 and 1
82
+ */
83
+ export declare function computeBlockConfidence(contentTypes: string[]): number;
84
+ /** Information about headers and footers detected in the JSON block tree */
85
+ export interface HeaderFooterInfo {
86
+ headerTexts: string[];
87
+ footerTexts: string[];
88
+ repeatedHeaders: string[];
89
+ repeatedFooters: string[];
90
+ }
91
+ /**
92
+ * Detect repeated headers and footers from the JSON block tree.
93
+ *
94
+ * Walks the block tree for each page, collecting PageHeader and PageFooter
95
+ * block texts. A text is considered "repeated" if it appears on >50% of pages
96
+ * with at least 2 occurrences.
97
+ *
98
+ * @param jsonBlocks - The JSON block hierarchy from Datalab OCR (may be null)
99
+ * @returns HeaderFooterInfo with all and repeated header/footer texts
100
+ */
101
+ export declare function detectRepeatedHeadersFooters(jsonBlocks: Record<string, unknown> | null): HeaderFooterInfo;
102
+ /**
103
+ * Check if a chunk's text closely matches any of the repeated header/footer texts.
104
+ * Uses normalized comparison (lowercased, whitespace-collapsed).
105
+ *
106
+ * @param chunkText - The chunk text to check
107
+ * @param repeatedTexts - Array of repeated header/footer texts
108
+ * @returns true if the chunk text matches a repeated header/footer
109
+ */
110
+ export declare function isRepeatedHeaderFooter(chunkText: string, repeatedTexts: string[]): boolean;
111
+ /** Extracted structure information for a table block */
112
+ export interface TableStructure {
113
+ startOffset: number;
114
+ endOffset: number;
115
+ columnHeaders: string[];
116
+ rowCount: number;
117
+ columnCount: number;
118
+ pageNumber: number | null;
119
+ /** Human-readable summary of table content */
120
+ summary: string;
121
+ /** Values from the first data row (for summary generation) */
122
+ firstRowValues: string[];
123
+ /** Caption text from preceding block (e.g., "Table 1: Budget Summary") */
124
+ caption?: string;
125
+ /** Index of a prior table this continues (cross-page table detection) */
126
+ continuationOf?: number;
127
+ }
128
+ /**
129
+ * Extract table structures from the JSON block tree.
130
+ *
131
+ * Walks json_blocks for Table/TableGroup blocks, extracts column headers
132
+ * from the first row, and maps to markdown text offsets.
133
+ *
134
+ * @param jsonBlocks - The JSON block hierarchy from Datalab OCR (may be null)
135
+ * @param markdownText - The full markdown text to search within
136
+ * @param pageOffsets - Page offset information for page number assignment
137
+ * @returns Array of TableStructure with column headers and position info
138
+ */
139
+ export declare function extractTableStructures(jsonBlocks: Record<string, unknown> | null, markdownText: string, pageOffsets: PageOffset[]): TableStructure[];
140
+ /**
141
+ * Extract column headers from the first pipe-delimited row of markdown table text.
142
+ * Fallback when JSON block children don't contain TableRow elements.
143
+ */
144
+ export declare function extractHeadersFromMarkdown(tableMarkdown: string): string[];
145
+ /**
146
+ * Count table dimensions from markdown pipe-delimited text.
147
+ * Counts data rows (excludes header and separator rows).
148
+ */
149
+ export declare function countTableDimensionsFromMarkdown(tableMarkdown: string): {
150
+ rowCount: number;
151
+ columnCount: number;
152
+ };
153
+ /**
154
+ * Extract values from the first data row (after header and separator) of markdown table.
155
+ */
156
+ export declare function extractFirstDataRow(tableMarkdown: string): string[];
157
+ /**
158
+ * Generate a human-readable summary of table content.
159
+ * Format: "Table with N rows and columns: col1, col2. Sample: val1, val2"
160
+ * Max 200 chars.
161
+ */
162
+ export declare function generateTableSummary(columnHeaders: string[], rowCount: number, firstRowValues: string[], caption?: string): string;
163
+ //# sourceMappingURL=json-block-analyzer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"json-block-analyzer.d.ts","sourceRoot":"","sources":["../../../src/services/chunking/json-block-analyzer.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AAEtD,6DAA6D;AAC7D,MAAM,WAAW,YAAY;IAC3B,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B;AAWD;;;;;;;;;;;GAWG;AACH,wBAAgB,iBAAiB,CAC/B,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,EAC1C,YAAY,EAAE,MAAM,EACpB,WAAW,EAAE,UAAU,EAAE,GACxB,YAAY,EAAE,CA6BhB;AAED;;;;;;;;GAQG;AACH,wBAAgB,sBAAsB,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,YAAY,EAAE,GAAG,YAAY,GAAG,IAAI,CAuBnG;AA8cD,qEAAqE;AACrE,MAAM,WAAW,cAAc;IAC7B,YAAY,EAAE,MAAM,CAAC;IACrB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,aAAa,EAAE,MAAM,CAAC;IACtB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,aAAa,EAAE,MAAM,CAAC;IACtB,aAAa,EAAE,MAAM,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;IACvB,UAAU,EAAE,MAAM,CAAC;IACnB,eAAe,EAAE,MAAM,CAAC;IACxB,gBAAgB,EAAE,MAAM,CAAC;IACzB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED;;;;;;;;GAQG;AACH,wBAAgB,qBAAqB,CACnC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,GACzC,cAAc,GAAG,IAAI,CAyFvB;AAMD;;;;GAIG;AACH,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAYxD,CAAC;AAEF;;;;;;;;;GASG;AACH,wBAAgB,sBAAsB,CAAC,YAAY,EAAE,MAAM,EAAE,GAAG,MAAM,CAIrE;AAMD,4EAA4E;AAC5E,MAAM,WAAW,gBAAgB;IAC/B,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,eAAe,EAAE,MAAM,EAAE,CAAC;CAC3B;AAmCD;;;;;;;;;GASG;AACH,wBAAgB,4BAA4B,CAC1C,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,GACzC,gBAAgB,CA4DlB;AAED;;;;;;;GAOG;AACH,wBAAgB,sBAAsB,CACpC,SAAS,EAAE,MAAM,EACjB,aAAa,EAAE,MAAM,EAAE,GACtB,OAAO,CAmBT;AAMD,wDAAwD;AACxD,MAAM,WAAW,cAAc;IAC7B,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,8CAA8C;IAC9C,OAAO,EAAE,MAAM,CAAC;IAChB,8DAA8D;IAC9D,cAAc,EAAE,MAAM,EAAE,CAAC;IACzB,0EAA0E;IAC1E,OAAO,CAAC,EAAE,MAAM,
CAAC;IACjB,yEAAyE;IACzE,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,sBAAsB,CACpC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,EAC1C,YAAY,EAAE,MAAM,EACpB,WAAW,EAAE,UAAU,EAAE,GACxB,cAAc,EAAE,CAgFlB;AAED;;;GAGG;AACH,wBAAgB,0BAA0B,CAAC,aAAa,EAAE,MAAM,GAAG,MAAM,EAAE,CAc1E;AAED;;;GAGG;AACH,wBAAgB,gCAAgC,CAAC,aAAa,EAAE,MAAM,GAAG;IAAE,QAAQ,EAAE,MAAM,CAAC;IAAC,WAAW,EAAE,MAAM,CAAA;CAAE,CAyBjH;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,aAAa,EAAE,MAAM,GAAG,MAAM,EAAE,CAiBnE;AAED;;;;GAIG;AACH,wBAAgB,oBAAoB,CAClC,aAAa,EAAE,MAAM,EAAE,EACvB,QAAQ,EAAE,MAAM,EAChB,cAAc,EAAE,MAAM,EAAE,EACxB,OAAO,CAAC,EAAE,MAAM,GACf,MAAM,CAuBR"}