ocr-provenance-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocr-provenance-mcp might be problematic. Click here for more details.

Files changed (578) hide show
  1. package/.env.example +55 -0
  2. package/LICENSE +78 -0
  3. package/README.md +1154 -0
  4. package/dist/bin-http.d.ts +24 -0
  5. package/dist/bin-http.d.ts.map +1 -0
  6. package/dist/bin-http.js +275 -0
  7. package/dist/bin-http.js.map +1 -0
  8. package/dist/bin-setup.d.ts +11 -0
  9. package/dist/bin-setup.d.ts.map +1 -0
  10. package/dist/bin-setup.js +610 -0
  11. package/dist/bin-setup.js.map +1 -0
  12. package/dist/bin.d.ts +16 -0
  13. package/dist/bin.d.ts.map +1 -0
  14. package/dist/bin.js +16 -0
  15. package/dist/bin.js.map +1 -0
  16. package/dist/index.d.ts +13 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +90 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/models/chunk.d.ts +136 -0
  21. package/dist/models/chunk.d.ts.map +1 -0
  22. package/dist/models/chunk.js +27 -0
  23. package/dist/models/chunk.js.map +1 -0
  24. package/dist/models/cluster.d.ts +79 -0
  25. package/dist/models/cluster.d.ts.map +1 -0
  26. package/dist/models/cluster.js +10 -0
  27. package/dist/models/cluster.js.map +1 -0
  28. package/dist/models/comparison.d.ts +62 -0
  29. package/dist/models/comparison.d.ts.map +1 -0
  30. package/dist/models/comparison.js +8 -0
  31. package/dist/models/comparison.js.map +1 -0
  32. package/dist/models/document.d.ts +104 -0
  33. package/dist/models/document.d.ts.map +1 -0
  34. package/dist/models/document.js +15 -0
  35. package/dist/models/document.js.map +1 -0
  36. package/dist/models/embedding.d.ts +87 -0
  37. package/dist/models/embedding.d.ts.map +1 -0
  38. package/dist/models/embedding.js +23 -0
  39. package/dist/models/embedding.js.map +1 -0
  40. package/dist/models/extraction.d.ts +15 -0
  41. package/dist/models/extraction.d.ts.map +1 -0
  42. package/dist/models/extraction.js +2 -0
  43. package/dist/models/extraction.js.map +1 -0
  44. package/dist/models/form-fill.d.ts +23 -0
  45. package/dist/models/form-fill.d.ts.map +1 -0
  46. package/dist/models/form-fill.js +2 -0
  47. package/dist/models/form-fill.js.map +1 -0
  48. package/dist/models/image.d.ts +177 -0
  49. package/dist/models/image.d.ts.map +1 -0
  50. package/dist/models/image.js +8 -0
  51. package/dist/models/image.js.map +1 -0
  52. package/dist/models/index.d.ts +14 -0
  53. package/dist/models/index.d.ts.map +1 -0
  54. package/dist/models/index.js +22 -0
  55. package/dist/models/index.js.map +1 -0
  56. package/dist/models/provenance.d.ts +174 -0
  57. package/dist/models/provenance.d.ts.map +1 -0
  58. package/dist/models/provenance.js +53 -0
  59. package/dist/models/provenance.js.map +1 -0
  60. package/dist/models/uploaded-file.d.ts +20 -0
  61. package/dist/models/uploaded-file.d.ts.map +1 -0
  62. package/dist/models/uploaded-file.js +2 -0
  63. package/dist/models/uploaded-file.js.map +1 -0
  64. package/dist/server/errors.d.ts +93 -0
  65. package/dist/server/errors.d.ts.map +1 -0
  66. package/dist/server/errors.js +256 -0
  67. package/dist/server/errors.js.map +1 -0
  68. package/dist/server/events.d.ts +36 -0
  69. package/dist/server/events.d.ts.map +1 -0
  70. package/dist/server/events.js +48 -0
  71. package/dist/server/events.js.map +1 -0
  72. package/dist/server/permissions.d.ts +26 -0
  73. package/dist/server/permissions.d.ts.map +1 -0
  74. package/dist/server/permissions.js +194 -0
  75. package/dist/server/permissions.js.map +1 -0
  76. package/dist/server/register-tools.d.ts +25 -0
  77. package/dist/server/register-tools.d.ts.map +1 -0
  78. package/dist/server/register-tools.js +102 -0
  79. package/dist/server/register-tools.js.map +1 -0
  80. package/dist/server/startup.d.ts +16 -0
  81. package/dist/server/startup.d.ts.map +1 -0
  82. package/dist/server/startup.js +37 -0
  83. package/dist/server/startup.js.map +1 -0
  84. package/dist/server/state.d.ts +166 -0
  85. package/dist/server/state.d.ts.map +1 -0
  86. package/dist/server/state.js +424 -0
  87. package/dist/server/state.js.map +1 -0
  88. package/dist/server/transports/http-transport.d.ts +37 -0
  89. package/dist/server/transports/http-transport.d.ts.map +1 -0
  90. package/dist/server/transports/http-transport.js +204 -0
  91. package/dist/server/transports/http-transport.js.map +1 -0
  92. package/dist/server/transports/index.d.ts +9 -0
  93. package/dist/server/transports/index.d.ts.map +1 -0
  94. package/dist/server/transports/index.js +9 -0
  95. package/dist/server/transports/index.js.map +1 -0
  96. package/dist/server/transports/session-manager.d.ts +40 -0
  97. package/dist/server/transports/session-manager.d.ts.map +1 -0
  98. package/dist/server/transports/session-manager.js +74 -0
  99. package/dist/server/transports/session-manager.js.map +1 -0
  100. package/dist/server/types.d.ts +82 -0
  101. package/dist/server/types.d.ts.map +1 -0
  102. package/dist/server/types.js +14 -0
  103. package/dist/server/types.js.map +1 -0
  104. package/dist/services/audit.d.ts +26 -0
  105. package/dist/services/audit.d.ts.map +1 -0
  106. package/dist/services/audit.js +43 -0
  107. package/dist/services/audit.js.map +1 -0
  108. package/dist/services/chunking/chunk-deduplicator.d.ts +33 -0
  109. package/dist/services/chunking/chunk-deduplicator.d.ts.map +1 -0
  110. package/dist/services/chunking/chunk-deduplicator.js +46 -0
  111. package/dist/services/chunking/chunk-deduplicator.js.map +1 -0
  112. package/dist/services/chunking/chunk-merger.d.ts +26 -0
  113. package/dist/services/chunking/chunk-merger.d.ts.map +1 -0
  114. package/dist/services/chunking/chunk-merger.js +94 -0
  115. package/dist/services/chunking/chunk-merger.js.map +1 -0
  116. package/dist/services/chunking/chunker.d.ts +62 -0
  117. package/dist/services/chunking/chunker.d.ts.map +1 -0
  118. package/dist/services/chunking/chunker.js +566 -0
  119. package/dist/services/chunking/chunker.js.map +1 -0
  120. package/dist/services/chunking/heading-normalizer.d.ts +33 -0
  121. package/dist/services/chunking/heading-normalizer.d.ts.map +1 -0
  122. package/dist/services/chunking/heading-normalizer.js +101 -0
  123. package/dist/services/chunking/heading-normalizer.js.map +1 -0
  124. package/dist/services/chunking/json-block-analyzer.d.ts +163 -0
  125. package/dist/services/chunking/json-block-analyzer.d.ts.map +1 -0
  126. package/dist/services/chunking/json-block-analyzer.js +1033 -0
  127. package/dist/services/chunking/json-block-analyzer.js.map +1 -0
  128. package/dist/services/chunking/markdown-parser.d.ts +75 -0
  129. package/dist/services/chunking/markdown-parser.d.ts.map +1 -0
  130. package/dist/services/chunking/markdown-parser.js +428 -0
  131. package/dist/services/chunking/markdown-parser.js.map +1 -0
  132. package/dist/services/chunking/text-normalizer.d.ts +20 -0
  133. package/dist/services/chunking/text-normalizer.d.ts.map +1 -0
  134. package/dist/services/chunking/text-normalizer.js +36 -0
  135. package/dist/services/chunking/text-normalizer.js.map +1 -0
  136. package/dist/services/clm/contract-schemas.d.ts +36 -0
  137. package/dist/services/clm/contract-schemas.d.ts.map +1 -0
  138. package/dist/services/clm/contract-schemas.js +92 -0
  139. package/dist/services/clm/contract-schemas.js.map +1 -0
  140. package/dist/services/clm/summarization.d.ts +46 -0
  141. package/dist/services/clm/summarization.d.ts.map +1 -0
  142. package/dist/services/clm/summarization.js +61 -0
  143. package/dist/services/clm/summarization.js.map +1 -0
  144. package/dist/services/clustering/clustering-service.d.ts +58 -0
  145. package/dist/services/clustering/clustering-service.d.ts.map +1 -0
  146. package/dist/services/clustering/clustering-service.js +467 -0
  147. package/dist/services/clustering/clustering-service.js.map +1 -0
  148. package/dist/services/comparison/diff-service.d.ts +41 -0
  149. package/dist/services/comparison/diff-service.d.ts.map +1 -0
  150. package/dist/services/comparison/diff-service.js +120 -0
  151. package/dist/services/comparison/diff-service.js.map +1 -0
  152. package/dist/services/embedding/embedder.d.ts +55 -0
  153. package/dist/services/embedding/embedder.d.ts.map +1 -0
  154. package/dist/services/embedding/embedder.js +202 -0
  155. package/dist/services/embedding/embedder.js.map +1 -0
  156. package/dist/services/embedding/nomic.d.ts +67 -0
  157. package/dist/services/embedding/nomic.d.ts.map +1 -0
  158. package/dist/services/embedding/nomic.js +280 -0
  159. package/dist/services/embedding/nomic.js.map +1 -0
  160. package/dist/services/gemini/circuit-breaker.d.ts +106 -0
  161. package/dist/services/gemini/circuit-breaker.d.ts.map +1 -0
  162. package/dist/services/gemini/circuit-breaker.js +237 -0
  163. package/dist/services/gemini/circuit-breaker.js.map +1 -0
  164. package/dist/services/gemini/client.d.ts +173 -0
  165. package/dist/services/gemini/client.d.ts.map +1 -0
  166. package/dist/services/gemini/client.js +483 -0
  167. package/dist/services/gemini/client.js.map +1 -0
  168. package/dist/services/gemini/config.d.ts +116 -0
  169. package/dist/services/gemini/config.d.ts.map +1 -0
  170. package/dist/services/gemini/config.js +118 -0
  171. package/dist/services/gemini/config.js.map +1 -0
  172. package/dist/services/gemini/index.d.ts +9 -0
  173. package/dist/services/gemini/index.d.ts.map +1 -0
  174. package/dist/services/gemini/index.js +13 -0
  175. package/dist/services/gemini/index.js.map +1 -0
  176. package/dist/services/gemini/rate-limiter.d.ts +62 -0
  177. package/dist/services/gemini/rate-limiter.d.ts.map +1 -0
  178. package/dist/services/gemini/rate-limiter.js +120 -0
  179. package/dist/services/gemini/rate-limiter.js.map +1 -0
  180. package/dist/services/images/extractor.d.ts +88 -0
  181. package/dist/services/images/extractor.d.ts.map +1 -0
  182. package/dist/services/images/extractor.js +340 -0
  183. package/dist/services/images/extractor.js.map +1 -0
  184. package/dist/services/images/optimizer.d.ts +130 -0
  185. package/dist/services/images/optimizer.d.ts.map +1 -0
  186. package/dist/services/images/optimizer.js +228 -0
  187. package/dist/services/images/optimizer.js.map +1 -0
  188. package/dist/services/ocr/datalab.d.ts +64 -0
  189. package/dist/services/ocr/datalab.d.ts.map +1 -0
  190. package/dist/services/ocr/datalab.js +425 -0
  191. package/dist/services/ocr/datalab.js.map +1 -0
  192. package/dist/services/ocr/errors.d.ts +38 -0
  193. package/dist/services/ocr/errors.d.ts.map +1 -0
  194. package/dist/services/ocr/errors.js +83 -0
  195. package/dist/services/ocr/errors.js.map +1 -0
  196. package/dist/services/ocr/file-manager.d.ts +76 -0
  197. package/dist/services/ocr/file-manager.d.ts.map +1 -0
  198. package/dist/services/ocr/file-manager.js +238 -0
  199. package/dist/services/ocr/file-manager.js.map +1 -0
  200. package/dist/services/ocr/form-fill.d.ts +48 -0
  201. package/dist/services/ocr/form-fill.d.ts.map +1 -0
  202. package/dist/services/ocr/form-fill.js +213 -0
  203. package/dist/services/ocr/form-fill.js.map +1 -0
  204. package/dist/services/ocr/processor.d.ts +95 -0
  205. package/dist/services/ocr/processor.d.ts.map +1 -0
  206. package/dist/services/ocr/processor.js +259 -0
  207. package/dist/services/ocr/processor.js.map +1 -0
  208. package/dist/services/provenance/agent-metadata.d.ts +82 -0
  209. package/dist/services/provenance/agent-metadata.d.ts.map +1 -0
  210. package/dist/services/provenance/agent-metadata.js +106 -0
  211. package/dist/services/provenance/agent-metadata.js.map +1 -0
  212. package/dist/services/provenance/chain-hash.d.ts +57 -0
  213. package/dist/services/provenance/chain-hash.d.ts.map +1 -0
  214. package/dist/services/provenance/chain-hash.js +131 -0
  215. package/dist/services/provenance/chain-hash.js.map +1 -0
  216. package/dist/services/provenance/exporter.d.ts +202 -0
  217. package/dist/services/provenance/exporter.d.ts.map +1 -0
  218. package/dist/services/provenance/exporter.js +457 -0
  219. package/dist/services/provenance/exporter.js.map +1 -0
  220. package/dist/services/provenance/index.d.ts +15 -0
  221. package/dist/services/provenance/index.d.ts.map +1 -0
  222. package/dist/services/provenance/index.js +17 -0
  223. package/dist/services/provenance/index.js.map +1 -0
  224. package/dist/services/provenance/tracker.d.ts +138 -0
  225. package/dist/services/provenance/tracker.d.ts.map +1 -0
  226. package/dist/services/provenance/tracker.js +293 -0
  227. package/dist/services/provenance/tracker.js.map +1 -0
  228. package/dist/services/provenance/verifier.d.ts +153 -0
  229. package/dist/services/provenance/verifier.d.ts.map +1 -0
  230. package/dist/services/provenance/verifier.js +536 -0
  231. package/dist/services/provenance/verifier.js.map +1 -0
  232. package/dist/services/python-pool.d.ts +70 -0
  233. package/dist/services/python-pool.d.ts.map +1 -0
  234. package/dist/services/python-pool.js +265 -0
  235. package/dist/services/python-pool.js.map +1 -0
  236. package/dist/services/search/bm25.d.ts +180 -0
  237. package/dist/services/search/bm25.d.ts.map +1 -0
  238. package/dist/services/search/bm25.js +656 -0
  239. package/dist/services/search/bm25.js.map +1 -0
  240. package/dist/services/search/fusion.d.ts +103 -0
  241. package/dist/services/search/fusion.d.ts.map +1 -0
  242. package/dist/services/search/fusion.js +122 -0
  243. package/dist/services/search/fusion.js.map +1 -0
  244. package/dist/services/search/local-reranker.d.ts +30 -0
  245. package/dist/services/search/local-reranker.d.ts.map +1 -0
  246. package/dist/services/search/local-reranker.js +123 -0
  247. package/dist/services/search/local-reranker.js.map +1 -0
  248. package/dist/services/search/quality.d.ts +11 -0
  249. package/dist/services/search/quality.d.ts.map +1 -0
  250. package/dist/services/search/quality.js +17 -0
  251. package/dist/services/search/quality.js.map +1 -0
  252. package/dist/services/search/query-classifier.d.ts +34 -0
  253. package/dist/services/search/query-classifier.d.ts.map +1 -0
  254. package/dist/services/search/query-classifier.js +114 -0
  255. package/dist/services/search/query-classifier.js.map +1 -0
  256. package/dist/services/search/query-expander.d.ts +73 -0
  257. package/dist/services/search/query-expander.d.ts.map +1 -0
  258. package/dist/services/search/query-expander.js +281 -0
  259. package/dist/services/search/query-expander.js.map +1 -0
  260. package/dist/services/search/reranker.d.ts +44 -0
  261. package/dist/services/search/reranker.d.ts.map +1 -0
  262. package/dist/services/search/reranker.js +101 -0
  263. package/dist/services/search/reranker.js.map +1 -0
  264. package/dist/services/storage/database/annotation-operations.d.ts +113 -0
  265. package/dist/services/storage/database/annotation-operations.d.ts.map +1 -0
  266. package/dist/services/storage/database/annotation-operations.js +177 -0
  267. package/dist/services/storage/database/annotation-operations.js.map +1 -0
  268. package/dist/services/storage/database/approval-operations.d.ts +132 -0
  269. package/dist/services/storage/database/approval-operations.d.ts.map +1 -0
  270. package/dist/services/storage/database/approval-operations.js +206 -0
  271. package/dist/services/storage/database/approval-operations.js.map +1 -0
  272. package/dist/services/storage/database/chunk-operations.d.ts +132 -0
  273. package/dist/services/storage/database/chunk-operations.d.ts.map +1 -0
  274. package/dist/services/storage/database/chunk-operations.js +306 -0
  275. package/dist/services/storage/database/chunk-operations.js.map +1 -0
  276. package/dist/services/storage/database/cluster-operations.d.ts +97 -0
  277. package/dist/services/storage/database/cluster-operations.d.ts.map +1 -0
  278. package/dist/services/storage/database/cluster-operations.js +258 -0
  279. package/dist/services/storage/database/cluster-operations.js.map +1 -0
  280. package/dist/services/storage/database/comparison-operations.d.ts +41 -0
  281. package/dist/services/storage/database/comparison-operations.d.ts.map +1 -0
  282. package/dist/services/storage/database/comparison-operations.js +65 -0
  283. package/dist/services/storage/database/comparison-operations.js.map +1 -0
  284. package/dist/services/storage/database/converters.d.ts +36 -0
  285. package/dist/services/storage/database/converters.d.ts.map +1 -0
  286. package/dist/services/storage/database/converters.js +244 -0
  287. package/dist/services/storage/database/converters.js.map +1 -0
  288. package/dist/services/storage/database/document-operations.d.ts +145 -0
  289. package/dist/services/storage/database/document-operations.d.ts.map +1 -0
  290. package/dist/services/storage/database/document-operations.js +498 -0
  291. package/dist/services/storage/database/document-operations.js.map +1 -0
  292. package/dist/services/storage/database/embedding-operations.d.ts +130 -0
  293. package/dist/services/storage/database/embedding-operations.d.ts.map +1 -0
  294. package/dist/services/storage/database/embedding-operations.js +315 -0
  295. package/dist/services/storage/database/embedding-operations.js.map +1 -0
  296. package/dist/services/storage/database/extraction-operations.d.ts +47 -0
  297. package/dist/services/storage/database/extraction-operations.d.ts.map +1 -0
  298. package/dist/services/storage/database/extraction-operations.js +85 -0
  299. package/dist/services/storage/database/extraction-operations.js.map +1 -0
  300. package/dist/services/storage/database/form-fill-operations.d.ts +58 -0
  301. package/dist/services/storage/database/form-fill-operations.d.ts.map +1 -0
  302. package/dist/services/storage/database/form-fill-operations.js +116 -0
  303. package/dist/services/storage/database/form-fill-operations.js.map +1 -0
  304. package/dist/services/storage/database/helpers.d.ts +29 -0
  305. package/dist/services/storage/database/helpers.d.ts.map +1 -0
  306. package/dist/services/storage/database/helpers.js +55 -0
  307. package/dist/services/storage/database/helpers.js.map +1 -0
  308. package/dist/services/storage/database/image-operations.d.ts +202 -0
  309. package/dist/services/storage/database/image-operations.d.ts.map +1 -0
  310. package/dist/services/storage/database/image-operations.js +484 -0
  311. package/dist/services/storage/database/image-operations.js.map +1 -0
  312. package/dist/services/storage/database/index.d.ts +13 -0
  313. package/dist/services/storage/database/index.d.ts.map +1 -0
  314. package/dist/services/storage/database/index.js +16 -0
  315. package/dist/services/storage/database/index.js.map +1 -0
  316. package/dist/services/storage/database/lock-operations.d.ts +59 -0
  317. package/dist/services/storage/database/lock-operations.d.ts.map +1 -0
  318. package/dist/services/storage/database/lock-operations.js +89 -0
  319. package/dist/services/storage/database/lock-operations.js.map +1 -0
  320. package/dist/services/storage/database/obligation-operations.d.ts +88 -0
  321. package/dist/services/storage/database/obligation-operations.d.ts.map +1 -0
  322. package/dist/services/storage/database/obligation-operations.js +206 -0
  323. package/dist/services/storage/database/obligation-operations.js.map +1 -0
  324. package/dist/services/storage/database/ocr-operations.d.ts +33 -0
  325. package/dist/services/storage/database/ocr-operations.d.ts.map +1 -0
  326. package/dist/services/storage/database/ocr-operations.js +70 -0
  327. package/dist/services/storage/database/ocr-operations.js.map +1 -0
  328. package/dist/services/storage/database/playbook-operations.d.ts +72 -0
  329. package/dist/services/storage/database/playbook-operations.d.ts.map +1 -0
  330. package/dist/services/storage/database/playbook-operations.js +247 -0
  331. package/dist/services/storage/database/playbook-operations.js.map +1 -0
  332. package/dist/services/storage/database/provenance-operations.d.ts +112 -0
  333. package/dist/services/storage/database/provenance-operations.d.ts.map +1 -0
  334. package/dist/services/storage/database/provenance-operations.js +251 -0
  335. package/dist/services/storage/database/provenance-operations.js.map +1 -0
  336. package/dist/services/storage/database/service.d.ts +142 -0
  337. package/dist/services/storage/database/service.d.ts.map +1 -0
  338. package/dist/services/storage/database/service.js +310 -0
  339. package/dist/services/storage/database/service.js.map +1 -0
  340. package/dist/services/storage/database/static-operations.d.ts +30 -0
  341. package/dist/services/storage/database/static-operations.d.ts.map +1 -0
  342. package/dist/services/storage/database/static-operations.js +218 -0
  343. package/dist/services/storage/database/static-operations.js.map +1 -0
  344. package/dist/services/storage/database/stats-operations.d.ts +101 -0
  345. package/dist/services/storage/database/stats-operations.d.ts.map +1 -0
  346. package/dist/services/storage/database/stats-operations.js +394 -0
  347. package/dist/services/storage/database/stats-operations.js.map +1 -0
  348. package/dist/services/storage/database/tag-operations.d.ts +76 -0
  349. package/dist/services/storage/database/tag-operations.d.ts.map +1 -0
  350. package/dist/services/storage/database/tag-operations.js +178 -0
  351. package/dist/services/storage/database/tag-operations.js.map +1 -0
  352. package/dist/services/storage/database/types.d.ts +286 -0
  353. package/dist/services/storage/database/types.d.ts.map +1 -0
  354. package/dist/services/storage/database/types.js +39 -0
  355. package/dist/services/storage/database/types.js.map +1 -0
  356. package/dist/services/storage/database/upload-operations.d.ts +71 -0
  357. package/dist/services/storage/database/upload-operations.d.ts.map +1 -0
  358. package/dist/services/storage/database/upload-operations.js +124 -0
  359. package/dist/services/storage/database/upload-operations.js.map +1 -0
  360. package/dist/services/storage/database/user-operations.d.ts +102 -0
  361. package/dist/services/storage/database/user-operations.d.ts.map +1 -0
  362. package/dist/services/storage/database/user-operations.js +151 -0
  363. package/dist/services/storage/database/user-operations.js.map +1 -0
  364. package/dist/services/storage/database/workflow-operations.d.ts +98 -0
  365. package/dist/services/storage/database/workflow-operations.d.ts.map +1 -0
  366. package/dist/services/storage/database/workflow-operations.js +157 -0
  367. package/dist/services/storage/database/workflow-operations.js.map +1 -0
  368. package/dist/services/storage/database.d.ts +16 -0
  369. package/dist/services/storage/database.d.ts.map +1 -0
  370. package/dist/services/storage/database.js +15 -0
  371. package/dist/services/storage/database.js.map +1 -0
  372. package/dist/services/storage/index.d.ts +10 -0
  373. package/dist/services/storage/index.d.ts.map +1 -0
  374. package/dist/services/storage/index.js +10 -0
  375. package/dist/services/storage/index.js.map +1 -0
  376. package/dist/services/storage/migrations/index.d.ts +16 -0
  377. package/dist/services/storage/migrations/index.d.ts.map +1 -0
  378. package/dist/services/storage/migrations/index.js +20 -0
  379. package/dist/services/storage/migrations/index.js.map +1 -0
  380. package/dist/services/storage/migrations/operations.d.ts +40 -0
  381. package/dist/services/storage/migrations/operations.d.ts.map +1 -0
  382. package/dist/services/storage/migrations/operations.js +2910 -0
  383. package/dist/services/storage/migrations/operations.js.map +1 -0
  384. package/dist/services/storage/migrations/schema-definitions.d.ts +306 -0
  385. package/dist/services/storage/migrations/schema-definitions.d.ts.map +1 -0
  386. package/dist/services/storage/migrations/schema-definitions.js +1006 -0
  387. package/dist/services/storage/migrations/schema-definitions.js.map +1 -0
  388. package/dist/services/storage/migrations/schema-helpers.d.ts +50 -0
  389. package/dist/services/storage/migrations/schema-helpers.d.ts.map +1 -0
  390. package/dist/services/storage/migrations/schema-helpers.js +176 -0
  391. package/dist/services/storage/migrations/schema-helpers.js.map +1 -0
  392. package/dist/services/storage/migrations/types.d.ts +15 -0
  393. package/dist/services/storage/migrations/types.d.ts.map +1 -0
  394. package/dist/services/storage/migrations/types.js +21 -0
  395. package/dist/services/storage/migrations/types.js.map +1 -0
  396. package/dist/services/storage/migrations/verification.d.ts +20 -0
  397. package/dist/services/storage/migrations/verification.d.ts.map +1 -0
  398. package/dist/services/storage/migrations/verification.js +78 -0
  399. package/dist/services/storage/migrations/verification.js.map +1 -0
  400. package/dist/services/storage/migrations.d.ts +16 -0
  401. package/dist/services/storage/migrations.d.ts.map +1 -0
  402. package/dist/services/storage/migrations.js +17 -0
  403. package/dist/services/storage/migrations.js.map +1 -0
  404. package/dist/services/storage/types.d.ts +12 -0
  405. package/dist/services/storage/types.d.ts.map +1 -0
  406. package/dist/services/storage/types.js +5 -0
  407. package/dist/services/storage/types.js.map +1 -0
  408. package/dist/services/storage/vector.d.ts +208 -0
  409. package/dist/services/storage/vector.d.ts.map +1 -0
  410. package/dist/services/storage/vector.js +526 -0
  411. package/dist/services/storage/vector.js.map +1 -0
  412. package/dist/services/vlm/pipeline.d.ts +194 -0
  413. package/dist/services/vlm/pipeline.d.ts.map +1 -0
  414. package/dist/services/vlm/pipeline.js +800 -0
  415. package/dist/services/vlm/pipeline.js.map +1 -0
  416. package/dist/services/vlm/prompts.d.ts +171 -0
  417. package/dist/services/vlm/prompts.d.ts.map +1 -0
  418. package/dist/services/vlm/prompts.js +229 -0
  419. package/dist/services/vlm/prompts.js.map +1 -0
  420. package/dist/services/vlm/service.d.ts +174 -0
  421. package/dist/services/vlm/service.d.ts.map +1 -0
  422. package/dist/services/vlm/service.js +256 -0
  423. package/dist/services/vlm/service.js.map +1 -0
  424. package/dist/services/webhook-delivery.d.ts +4 -0
  425. package/dist/services/webhook-delivery.d.ts.map +1 -0
  426. package/dist/services/webhook-delivery.js +140 -0
  427. package/dist/services/webhook-delivery.js.map +1 -0
  428. package/dist/tools/chunks.d.ts +19 -0
  429. package/dist/tools/chunks.d.ts.map +1 -0
  430. package/dist/tools/chunks.js +392 -0
  431. package/dist/tools/chunks.js.map +1 -0
  432. package/dist/tools/clm.d.ts +16 -0
  433. package/dist/tools/clm.d.ts.map +1 -0
  434. package/dist/tools/clm.js +668 -0
  435. package/dist/tools/clm.js.map +1 -0
  436. package/dist/tools/clustering.d.ts +13 -0
  437. package/dist/tools/clustering.d.ts.map +1 -0
  438. package/dist/tools/clustering.js +498 -0
  439. package/dist/tools/clustering.js.map +1 -0
  440. package/dist/tools/collaboration.d.ts +15 -0
  441. package/dist/tools/collaboration.d.ts.map +1 -0
  442. package/dist/tools/collaboration.js +516 -0
  443. package/dist/tools/collaboration.js.map +1 -0
  444. package/dist/tools/comparison.d.ts +13 -0
  445. package/dist/tools/comparison.d.ts.map +1 -0
  446. package/dist/tools/comparison.js +735 -0
  447. package/dist/tools/comparison.js.map +1 -0
  448. package/dist/tools/compliance.d.ts +15 -0
  449. package/dist/tools/compliance.d.ts.map +1 -0
  450. package/dist/tools/compliance.js +640 -0
  451. package/dist/tools/compliance.js.map +1 -0
  452. package/dist/tools/config.d.ts +19 -0
  453. package/dist/tools/config.d.ts.map +1 -0
  454. package/dist/tools/config.js +213 -0
  455. package/dist/tools/config.js.map +1 -0
  456. package/dist/tools/database.d.ts +62 -0
  457. package/dist/tools/database.d.ts.map +1 -0
  458. package/dist/tools/database.js +288 -0
  459. package/dist/tools/database.js.map +1 -0
  460. package/dist/tools/documents.d.ts +61 -0
  461. package/dist/tools/documents.d.ts.map +1 -0
  462. package/dist/tools/documents.js +1624 -0
  463. package/dist/tools/documents.js.map +1 -0
  464. package/dist/tools/embeddings.d.ts +14 -0
  465. package/dist/tools/embeddings.d.ts.map +1 -0
  466. package/dist/tools/embeddings.js +626 -0
  467. package/dist/tools/embeddings.js.map +1 -0
  468. package/dist/tools/evaluation.d.ts +25 -0
  469. package/dist/tools/evaluation.d.ts.map +1 -0
  470. package/dist/tools/evaluation.js +523 -0
  471. package/dist/tools/evaluation.js.map +1 -0
  472. package/dist/tools/events.d.ts +16 -0
  473. package/dist/tools/events.d.ts.map +1 -0
  474. package/dist/tools/events.js +493 -0
  475. package/dist/tools/events.js.map +1 -0
  476. package/dist/tools/extraction-structured.d.ts +13 -0
  477. package/dist/tools/extraction-structured.d.ts.map +1 -0
  478. package/dist/tools/extraction-structured.js +390 -0
  479. package/dist/tools/extraction-structured.js.map +1 -0
  480. package/dist/tools/extraction.d.ts +24 -0
  481. package/dist/tools/extraction.d.ts.map +1 -0
  482. package/dist/tools/extraction.js +424 -0
  483. package/dist/tools/extraction.js.map +1 -0
  484. package/dist/tools/file-management.d.ts +14 -0
  485. package/dist/tools/file-management.d.ts.map +1 -0
  486. package/dist/tools/file-management.js +523 -0
  487. package/dist/tools/file-management.js.map +1 -0
  488. package/dist/tools/form-fill.d.ts +13 -0
  489. package/dist/tools/form-fill.d.ts.map +1 -0
  490. package/dist/tools/form-fill.js +250 -0
  491. package/dist/tools/form-fill.js.map +1 -0
  492. package/dist/tools/health.d.ts +19 -0
  493. package/dist/tools/health.d.ts.map +1 -0
  494. package/dist/tools/health.js +229 -0
  495. package/dist/tools/health.js.map +1 -0
  496. package/dist/tools/images.d.ts +54 -0
  497. package/dist/tools/images.d.ts.map +1 -0
  498. package/dist/tools/images.js +787 -0
  499. package/dist/tools/images.js.map +1 -0
  500. package/dist/tools/ingestion.d.ts +94 -0
  501. package/dist/tools/ingestion.d.ts.map +1 -0
  502. package/dist/tools/ingestion.js +1659 -0
  503. package/dist/tools/ingestion.js.map +1 -0
  504. package/dist/tools/intelligence.d.ts +18 -0
  505. package/dist/tools/intelligence.d.ts.map +1 -0
  506. package/dist/tools/intelligence.js +1039 -0
  507. package/dist/tools/intelligence.js.map +1 -0
  508. package/dist/tools/provenance.d.ts +51 -0
  509. package/dist/tools/provenance.d.ts.map +1 -0
  510. package/dist/tools/provenance.js +691 -0
  511. package/dist/tools/provenance.js.map +1 -0
  512. package/dist/tools/reports.d.ts +41 -0
  513. package/dist/tools/reports.d.ts.map +1 -0
  514. package/dist/tools/reports.js +1394 -0
  515. package/dist/tools/reports.js.map +1 -0
  516. package/dist/tools/search.d.ts +35 -0
  517. package/dist/tools/search.d.ts.map +1 -0
  518. package/dist/tools/search.js +2528 -0
  519. package/dist/tools/search.js.map +1 -0
  520. package/dist/tools/shared.d.ts +52 -0
  521. package/dist/tools/shared.d.ts.map +1 -0
  522. package/dist/tools/shared.js +54 -0
  523. package/dist/tools/shared.js.map +1 -0
  524. package/dist/tools/tags.d.ts +15 -0
  525. package/dist/tools/tags.d.ts.map +1 -0
  526. package/dist/tools/tags.js +287 -0
  527. package/dist/tools/tags.js.map +1 -0
  528. package/dist/tools/timeline.d.ts +15 -0
  529. package/dist/tools/timeline.d.ts.map +1 -0
  530. package/dist/tools/timeline.js +14 -0
  531. package/dist/tools/timeline.js.map +1 -0
  532. package/dist/tools/users.d.ts +14 -0
  533. package/dist/tools/users.d.ts.map +1 -0
  534. package/dist/tools/users.js +257 -0
  535. package/dist/tools/users.js.map +1 -0
  536. package/dist/tools/vlm.d.ts +40 -0
  537. package/dist/tools/vlm.d.ts.map +1 -0
  538. package/dist/tools/vlm.js +475 -0
  539. package/dist/tools/vlm.js.map +1 -0
  540. package/dist/tools/workflow.d.ts +16 -0
  541. package/dist/tools/workflow.d.ts.map +1 -0
  542. package/dist/tools/workflow.js +495 -0
  543. package/dist/tools/workflow.js.map +1 -0
  544. package/dist/utils/backoff.d.ts +53 -0
  545. package/dist/utils/backoff.d.ts.map +1 -0
  546. package/dist/utils/backoff.js +78 -0
  547. package/dist/utils/backoff.js.map +1 -0
  548. package/dist/utils/config-persistence.d.ts +33 -0
  549. package/dist/utils/config-persistence.d.ts.map +1 -0
  550. package/dist/utils/config-persistence.js +61 -0
  551. package/dist/utils/config-persistence.js.map +1 -0
  552. package/dist/utils/hash.d.ts +65 -0
  553. package/dist/utils/hash.d.ts.map +1 -0
  554. package/dist/utils/hash.js +146 -0
  555. package/dist/utils/hash.js.map +1 -0
  556. package/dist/utils/math.d.ts +21 -0
  557. package/dist/utils/math.d.ts.map +1 -0
  558. package/dist/utils/math.js +39 -0
  559. package/dist/utils/math.js.map +1 -0
  560. package/dist/utils/validation.d.ts +697 -0
  561. package/dist/utils/validation.d.ts.map +1 -0
  562. package/dist/utils/validation.js +529 -0
  563. package/dist/utils/validation.js.map +1 -0
  564. package/package.json +96 -0
  565. package/python/.gitkeep +0 -0
  566. package/python/__init__.py +104 -0
  567. package/python/clustering_worker.py +440 -0
  568. package/python/docx_image_extractor.py +524 -0
  569. package/python/embedding_worker.py +552 -0
  570. package/python/file_manager_worker.py +564 -0
  571. package/python/form_fill_worker.py +399 -0
  572. package/python/gpu_utils.py +582 -0
  573. package/python/image_extractor.py +317 -0
  574. package/python/image_optimizer.py +444 -0
  575. package/python/ocr_worker.py +712 -0
  576. package/python/pyproject.toml +76 -0
  577. package/python/requirements.txt +51 -0
  578. package/python/reranker_worker.py +87 -0
@@ -0,0 +1,1033 @@
1
+ /**
2
+ * JSON Block Analyzer for Section-Aware Chunking
3
+ *
4
+ * Analyzes Datalab JSON block hierarchy to identify atomic (unsplittable)
5
+ * regions such as tables, figures, and code blocks. These regions inform
6
+ * the hybrid chunker where it must NOT split text.
7
+ *
8
+ * @module services/chunking/json-block-analyzer
9
+ */
10
/**
 * Names of the block types that are treated as atomic, i.e. whose markdown
 * rendering must be kept in one piece by the chunker.
 */
const ATOMIC_BLOCK_TYPES = new Set(['Table', 'TableGroup', 'Figure', 'FigureGroup', 'Code']);
18
/**
 * Collect the atomic (unsplittable) regions of a markdown document.
 *
 * Every block in the JSON tree whose `block_type` is listed in
 * ATOMIC_BLOCK_TYPES is handed to `locateBlockInMarkdown`; the regions that
 * could be located are sorted by start offset and then passed through
 * `mergeOverlappingRegions`.
 *
 * @param jsonBlocks - Root of the JSON block hierarchy (a falsy value yields [])
 * @param markdownText - Markdown text to search (an empty string yields [])
 * @param pageOffsets - Page offset records forwarded to the locator
 * @returns Regions sorted by `startOffset`, after overlap merging
 */
export function findAtomicRegions(jsonBlocks, markdownText, pageOffsets) {
    if (!jsonBlocks || markdownText.length === 0) {
        return [];
    }
    const located = [];
    const collectAtomic = (block, pageNum) => {
        const type = block.block_type;
        if (type && ATOMIC_BLOCK_TYPES.has(type)) {
            const found = locateBlockInMarkdown(block, type, pageNum, markdownText, pageOffsets);
            if (found) {
                located.push(found);
            }
        }
    };
    walkBlocks(jsonBlocks, collectAtomic, 0);
    located.sort((left, right) => left.startOffset - right.startOffset);
    return mergeOverlappingRegions(located);
}
54
/**
 * Look up the atomic region that contains a character offset.
 *
 * Performs a binary search, so `regions` is expected to be sorted by
 * `startOffset` and non-overlapping (the shape returned by
 * findAtomicRegions). A region covers the half-open range
 * [startOffset, endOffset).
 *
 * @param offset - Character offset to test
 * @param regions - Sorted array of atomic regions
 * @returns The region containing `offset`, or null when there is none
 */
export function isOffsetInAtomicRegion(offset, regions) {
    let lo = 0;
    let hi = regions.length - 1;
    while (lo <= hi) {
        const middle = lo + Math.floor((hi - lo) / 2);
        const candidate = regions[middle];
        if (offset < candidate.startOffset) {
            // Target lies left of this region.
            hi = middle - 1;
            continue;
        }
        if (offset >= candidate.endOffset) {
            // Target lies right of this region (endOffset is exclusive).
            lo = middle + 1;
            continue;
        }
        return candidate;
    }
    return null;
}
85
+ // ---------------------------------------------------------------------------
86
+ // Internal helpers
87
+ // ---------------------------------------------------------------------------
88
/**
 * Remove HTML tags from a string and decode a small set of entities
 * (&amp;amp; &amp;lt; &amp;gt; &amp;quot; &amp;#39; &amp;nbsp;).
 *
 * Entities are decoded in a single pass so that already-decoded output is
 * never decoded a second time: "&amp;amp;lt;" becomes "&amp;lt;", not "<".
 *
 * @param html - HTML fragment
 * @returns The text with tags removed and the listed entities decoded
 */
function stripHtmlTags(html) {
    const ENTITY_TEXT = {
        amp: '&',
        lt: '<',
        gt: '>',
        quot: '"',
        '#39': "'",
        nbsp: ' ',
    };
    return html
        .replace(/<[^>]*>/g, '')
        .replace(/&(amp|lt|gt|quot|#39|nbsp);/g, (_match, name) => ENTITY_TEXT[name]);
}
103
/**
 * Depth-first, pre-order traversal of a JSON block tree.
 *
 * The callback is invoked for `block` itself and then for every descendant
 * found under `children` (or `blocks` when `children` is null/undefined).
 * Among siblings, the page number handed to the callback is incremented
 * after each child whose `block_type` is 'Page'.
 *
 * @param block - Node to start from
 * @param callback - Called as callback(block, pageNum) for each node
 * @param pageNum - Page number to report for `block`
 */
function walkBlocks(block, callback, pageNum) {
    callback(block, pageNum);
    const kids = block.children ?? block.blocks;
    if (!Array.isArray(kids)) {
        return;
    }
    let currentPage = pageNum;
    kids.forEach((kid) => {
        const kidType = kid.block_type;
        walkBlocks(kid, callback, currentPage);
        if (kidType === 'Page') {
            // Siblings following a Page belong to the next page.
            currentPage += 1;
        }
    });
}
123
/**
 * Dispatch to the locator that matches the block type: Table and TableGroup
 * go to `locateTableInMarkdown`, everything else to `locateByHtmlContent`.
 * The `_pageNum` argument is accepted but not used.
 *
 * @returns Whatever the selected locator returns (a region or null)
 */
function locateBlockInMarkdown(block, blockType, _pageNum, markdownText, pageOffsets) {
    const isTableLike = blockType === 'Table' || blockType === 'TableGroup';
    if (isTableLike) {
        return locateTableInMarkdown(block, blockType, markdownText, pageOffsets);
    }
    return locateByHtmlContent(block, blockType, markdownText, pageOffsets);
}
135
/**
 * Locate a table block in the markdown text.
 *
 * The search key is the first non-blank line of the block's tag-stripped
 * HTML, truncated to 60 characters. Keys shorter than 5 characters are
 * delegated to `locateTableByPattern`. Otherwise the key is searched with
 * `findFuzzyMatch` and the surrounding table is measured with
 * `findTableExtent`.
 *
 * @returns { startOffset, endOffset, blockType, pageNumber } or null
 */
function locateTableInMarkdown(block, blockType, markdownText, pageOffsets) {
    const plainText = stripHtmlTags(block.html ?? '').trim();
    const firstLine = plainText.split('\n').find((line) => line.trim().length > 0);
    const searchKey = firstLine === undefined ? '' : firstLine.trim().slice(0, 60);
    if (searchKey.length < 5) {
        // Too little text to search for reliably.
        return locateTableByPattern(blockType, markdownText, pageOffsets);
    }
    const anchor = findFuzzyMatch(searchKey, markdownText);
    if (anchor === -1) {
        console.error(`[json-block-analyzer] Could not locate ${blockType} block with search key: "${searchKey.slice(0, 40)}..."`);
        return null;
    }
    const extent = findTableExtent(markdownText, anchor);
    if (!extent) {
        return null;
    }
    const { start, end } = extent;
    validateRegionOffsets(start, end);
    return {
        startOffset: start,
        endOffset: end,
        blockType,
        pageNumber: getPageNumberForOffset(start, pageOffsets),
    };
}
176
/**
 * Locate a non-table block by searching for the start of its HTML text.
 *
 * The first 50 characters of the tag-stripped HTML form the search key
 * (keys shorter than 3 characters are rejected). For 'Code' blocks the end
 * offset comes from `findCodeBlockEnd`; for any other type it is the match
 * position plus max(stripped text length, 20), clamped to the text length.
 *
 * @returns { startOffset, endOffset, blockType, pageNumber } or null
 */
function locateByHtmlContent(block, blockType, markdownText, pageOffsets) {
    const rawHtml = block.html ?? '';
    if (rawHtml.length === 0) {
        return null;
    }
    const plainText = stripHtmlTags(rawHtml).trim();
    if (plainText.length === 0) {
        return null;
    }
    const searchKey = plainText.slice(0, 50).trim();
    if (searchKey.length < 3) {
        return null;
    }
    const startOffset = findFuzzyMatch(searchKey, markdownText);
    if (startOffset === -1) {
        console.error(`[json-block-analyzer] Could not locate ${blockType} block with content: "${searchKey.slice(0, 40)}..."`);
        return null;
    }
    const endOffset = blockType === 'Code'
        ? findCodeBlockEnd(markdownText, startOffset)
        : Math.min(startOffset + Math.max(plainText.length, 20), markdownText.length);
    validateRegionOffsets(startOffset, endOffset);
    return {
        startOffset,
        endOffset,
        blockType,
        pageNumber: getPageNumberForOffset(startOffset, pageOffsets),
    };
}
220
/**
 * Find where `searchKey` occurs in `markdownText`.
 *
 * An exact substring search is tried first. If that fails, both strings are
 * normalized with `normalizeForSearch` and searched again; a hit in the
 * normalized text is translated back to an index in the original text via
 * `mapNormalizedIndexToOriginal`. Normalized keys shorter than 3 characters
 * are not searched.
 *
 * @returns Index into `markdownText`, or -1 when nothing matches
 */
function findFuzzyMatch(searchKey, markdownText) {
    const direct = markdownText.indexOf(searchKey);
    if (direct !== -1) {
        return direct;
    }
    const needle = normalizeForSearch(searchKey);
    if (needle.length < 3) {
        return -1;
    }
    const hit = normalizeForSearch(markdownText).indexOf(needle);
    return hit === -1 ? -1 : mapNormalizedIndexToOriginal(markdownText, hit);
}
247
/**
 * Canonical form used for fuzzy matching: lowercased, with every run of
 * whitespace reduced to a single space and no leading/trailing whitespace.
 */
function normalizeForSearch(text) {
    const words = text.toLowerCase().split(/\s+/);
    return words.filter((word) => word.length > 0).join(' ');
}
253
+ /**
254
+ * Map a character index in normalized text back to the original text position.
255
+ */
256
+ function mapNormalizedIndexToOriginal(originalText, normalizedIdx) {
257
+ let normalizedPos = 0;
258
+ let inWhitespace = false;
259
+ let started = false;
260
+ for (let i = 0; i < originalText.length; i++) {
261
+ const ch = originalText[i];
262
+ const isWs = /\s/.test(ch);
263
+ if (!started && isWs) {
264
+ // Skip leading whitespace
265
+ continue;
266
+ }
267
+ started = true;
268
+ if (isWs) {
269
+ if (!inWhitespace) {
270
+ // First whitespace char after non-whitespace counts as one space
271
+ if (normalizedPos === normalizedIdx) {
272
+ return i;
273
+ }
274
+ normalizedPos++;
275
+ inWhitespace = true;
276
+ }
277
+ // Additional whitespace chars are collapsed, don't increment
278
+ }
279
+ else {
280
+ if (normalizedPos === normalizedIdx) {
281
+ return i;
282
+ }
283
+ normalizedPos++;
284
+ inWhitespace = false;
285
+ }
286
+ }
287
+ // If we reach here, return the end of the text
288
+ return originalText.length;
289
+ }
290
/**
 * Find the full extent of the pipe-delimited markdown table that contains
 * the position `nearIdx`.
 *
 * A "table row" is a line whose trimmed text starts with '|'. The extent
 * begins at the start of the first row of the contiguous run of table rows
 * around `nearIdx` and ends after the last row; one blank line directly
 * following the table is included in the extent.
 *
 * Both scans work on whole lines, anchored at the start of the line that
 * contains `nearIdx`. This matters because the search key is usually cell
 * text, so the match position is normally in the middle of a row, after the
 * leading '|'. A blank line above the table ends the backward scan, so two
 * tables separated by a blank line are not merged.
 *
 * @param markdownText - Full markdown text
 * @param nearIdx - Any index inside one of the table's rows
 * @returns { start, end } (end exclusive), or null when the line containing
 *          `nearIdx` is not a table row
 */
function findTableExtent(markdownText, nearIdx) {
    // Index of the first character of the line containing `pos`.
    const startOfLine = (pos) => (pos > 0 ? markdownText.lastIndexOf('\n', pos - 1) + 1 : 0);
    // Index of the newline ending the line that starts at `pos`, or the text length.
    const endOfLine = (pos) => {
        const newline = markdownText.indexOf('\n', pos);
        return newline === -1 ? markdownText.length : newline;
    };
    const isTableRow = (from, to) => markdownText.slice(from, to).trim().startsWith('|');

    const anchorStart = startOfLine(nearIdx);
    if (!isTableRow(anchorStart, endOfLine(anchorStart))) {
        // The match is not inside a pipe-delimited row; there is no table here.
        return null;
    }

    // Extend upward while the preceding line is a table row.
    let tableStart = anchorStart;
    while (tableStart > 0) {
        const prevLineEnd = tableStart - 1; // newline terminating the previous line
        const prevLineStart = startOfLine(prevLineEnd);
        if (!isTableRow(prevLineStart, prevLineEnd)) {
            break;
        }
        tableStart = prevLineStart;
    }

    // Extend downward, one whole line at a time, starting with the anchor row.
    let tableEnd = anchorStart;
    while (tableEnd < markdownText.length) {
        const lineEnd = endOfLine(tableEnd);
        const line = markdownText.slice(tableEnd, lineEnd).trim();
        if (line.startsWith('|')) {
            tableEnd = lineEnd + 1;
            continue;
        }
        if (line.length === 0) {
            // Keep the blank line that closes the table inside the region.
            tableEnd = lineEnd + 1;
        }
        break;
    }
    tableEnd = Math.min(tableEnd, markdownText.length);
    if (tableEnd <= tableStart) {
        return null;
    }
    return { start: tableStart, end: tableEnd };
}
352
/**
 * Find the end offset of the fenced code block around `startIdx`.
 *
 * The opening fence is taken to be the last "```" at or before `startIdx`
 * (or `startIdx` itself when there is none). Scanning then proceeds line by
 * line after the opening fence line; the first line whose text, ignoring
 * leading whitespace, starts with "```" is the closing fence.
 *
 * Each candidate line is inspected on its own. Testing the whole remainder
 * of the text would let leading-whitespace trimming skip over blank lines
 * and report the closing fence one line early, leaving the fence outside
 * the region.
 *
 * @param markdownText - Full markdown text
 * @param startIdx - Index inside (or at the start of) the code block
 * @returns Offset just past the closing fence line, clamped to the text
 *          length; the text length when no closing fence exists
 */
function findCodeBlockEnd(markdownText, startIdx) {
    const FENCE = '```';
    // Index of the next newline at or after `pos`, or the text length.
    const endOfLine = (pos) => {
        const newline = markdownText.indexOf('\n', pos);
        return newline === -1 ? markdownText.length : newline;
    };
    let fenceStart = markdownText.lastIndexOf(FENCE, startIdx);
    if (fenceStart === -1) {
        fenceStart = startIdx;
    }
    // First character of the line following the opening fence line.
    let pos = endOfLine(fenceStart + FENCE.length) + 1;
    while (pos < markdownText.length) {
        const lineEnd = endOfLine(pos);
        if (markdownText.slice(pos, lineEnd).trimStart().startsWith(FENCE)) {
            return Math.min(lineEnd + 1, markdownText.length);
        }
        pos = lineEnd + 1;
    }
    // No closing fence: the block runs to the end of the text.
    return markdownText.length;
}
388
/**
 * Fallback used when a table block offers no usable search text.
 *
 * No lookup is attempted: the failure is reported on stderr and null is
 * returned. The markdown text and page offsets are accepted but unused.
 *
 * @returns Always null
 */
function locateTableByPattern(blockType, _markdownText, _pageOffsets) {
    const message = `[json-block-analyzer] Could not locate ${blockType} block: no searchable content in HTML`;
    console.error(message);
    return null;
}
398
/**
 * Resolve the page number for a character offset.
 *
 * Returns the `page` of the first entry whose [charStart, charEnd) range
 * contains the offset. When no entry contains it, the last entry's page is
 * returned if the offset is at or beyond that entry's `charEnd`; otherwise
 * the first entry's page is returned.
 *
 * @param charOffset - Character offset to look up
 * @param pageOffsets - Entries with `page`, `charStart` and `charEnd`
 * @returns The page value, or null when `pageOffsets` is empty
 */
function getPageNumberForOffset(charOffset, pageOffsets) {
    if (pageOffsets.length === 0) {
        return null;
    }
    const containing = pageOffsets.find((entry) => charOffset >= entry.charStart && charOffset < entry.charEnd);
    if (containing !== undefined) {
        return containing.page;
    }
    const finalEntry = pageOffsets[pageOffsets.length - 1];
    return charOffset >= finalEntry.charEnd ? finalEntry.page : pageOffsets[0].page;
}
416
/**
 * Merge overlapping or touching regions of an array sorted by `startOffset`.
 *
 * A region whose `startOffset` is at or before the end of the previous
 * merged region is folded into it. The merged region keeps the block type
 * of whichever is larger: the region accumulated so far, or the incoming
 * one. The sizes are compared before the accumulated region is extended,
 * because afterwards the accumulated region is never the smaller one.
 *
 * Merged entries are copies, so the objects passed in are not modified.
 * Arrays of length 0 or 1 are returned as-is.
 *
 * @param regions - Regions sorted by `startOffset`
 * @returns Sorted, non-overlapping regions
 */
function mergeOverlappingRegions(regions) {
    if (regions.length <= 1) {
        return regions;
    }
    const merged = [];
    for (const region of regions) {
        const last = merged[merged.length - 1];
        if (last === undefined || region.startOffset > last.endOffset) {
            merged.push({ ...region });
            continue;
        }
        const lastSize = last.endOffset - last.startOffset;
        const regionSize = region.endOffset - region.startOffset;
        if (regionSize > lastSize) {
            last.blockType = region.blockType;
        }
        last.endOffset = Math.max(last.endOffset, region.endOffset);
    }
    return merged;
}
441
/**
 * Guard against malformed region bounds.
 *
 * @param start - Region start offset
 * @param end - Region end offset
 * @throws Error when `start` is negative, or when `end` is less than `start`
 */
function validateRegionOffsets(start, end) {
    let problem = null;
    if (start < 0) {
        problem = `Invalid negative startOffset in atomic region: ${start}`;
    }
    else if (end < start) {
        problem = `endOffset (${end}) is less than startOffset (${start}) in atomic region`;
    }
    if (problem !== null) {
        throw new Error(problem);
    }
}
452
/**
 * Count the blocks of a JSON block tree by category and derive a few ratios.
 *
 * Every node with a truthy `block_type` counts toward `total_blocks`.
 * Categories: Text; Table/TableGroup; Figure/FigureGroup; Code;
 * ListItem/List; PageHeader; PageFooter; SectionHeader/Title; Page.
 * Per-page ratios divide by max(page count, 1). `text_density` is the share
 * of Text blocks among blocks that are not Page, PageHeader or PageFooter.
 * Ratios are rounded to two decimals.
 *
 * @param jsonBlocks - Root of the JSON block hierarchy (may be null)
 * @returns The statistics object, or null when `jsonBlocks` is falsy
 */
export function computeBlockTypeStats(jsonBlocks) {
    if (!jsonBlocks) {
        return null;
    }
    const bucketOf = new Map([
        ['Text', 'text'],
        ['Table', 'table'],
        ['TableGroup', 'table'],
        ['Figure', 'figure'],
        ['FigureGroup', 'figure'],
        ['Code', 'code'],
        ['ListItem', 'list'],
        ['List', 'list'],
        ['PageHeader', 'header'],
        ['PageFooter', 'footer'],
        ['SectionHeader', 'heading'],
        ['Title', 'heading'],
        ['Page', 'page'],
    ]);
    const tally = {
        total: 0,
        text: 0,
        table: 0,
        figure: 0,
        code: 0,
        list: 0,
        header: 0,
        footer: 0,
        heading: 0,
        page: 0,
    };
    // Explicit stack instead of recursion; visiting order does not affect counts.
    const pending = [jsonBlocks];
    while (pending.length > 0) {
        const node = pending.pop();
        const type = node.block_type;
        if (type) {
            tally.total += 1;
            const bucket = bucketOf.get(type);
            if (bucket !== undefined) {
                tally[bucket] += 1;
            }
        }
        const kids = node.children ?? node.blocks;
        if (Array.isArray(kids)) {
            for (const kid of kids) {
                pending.push(kid);
            }
        }
    }
    const roundTo2 = (value) => Math.round(value * 100) / 100;
    const pagesForRatio = Math.max(tally.page, 1);
    const contentBlocks = tally.total - tally.page - tally.header - tally.footer;
    return {
        total_blocks: tally.total,
        text_blocks: tally.text,
        table_blocks: tally.table,
        figure_blocks: tally.figure,
        code_blocks: tally.code,
        list_blocks: tally.list,
        header_blocks: tally.header,
        footer_blocks: tally.footer,
        heading_blocks: tally.heading,
        page_count: tally.page,
        tables_per_page: roundTo2(tally.table / pagesForRatio),
        figures_per_page: roundTo2(tally.figure / pagesForRatio),
        text_density: contentBlocks > 0 ? roundTo2(tally.text / contentBlocks) : 0,
    };
}
545
+ // ---------------------------------------------------------------------------
546
+ // Block-Type Confidence Scoring (ME-8 / Task 4.3)
547
+ // ---------------------------------------------------------------------------
548
/**
 * Confidence score per block type, used to derive a quality score for a
 * chunk from the block types it contains.
 */
export const BLOCK_TYPE_CONFIDENCE = {
    Table: 0.9,
    TableGroup: 0.9,
    Code: 0.9,
    SectionHeader: 0.85,
    Title: 0.85,
    ListItem: 0.8,
    List: 0.8,
    Text: 0.7,
    Figure: 0.6,
    PageHeader: 0.5,
    PageFooter: 0.5,
};
/**
 * Average the confidence scores of the block types present in a chunk.
 *
 * Types without an entry in BLOCK_TYPE_CONFIDENCE score 0.7, and an empty
 * array yields 0.7. Only the table's own properties are consulted, so a type
 * string that happens to name an inherited Object member (for example
 * "constructor" or "toString") is treated as unknown instead of producing
 * NaN.
 *
 * @param contentTypes - Block type names found in the chunk
 * @returns Mean confidence score
 */
export function computeBlockConfidence(contentTypes) {
    const DEFAULT_CONFIDENCE = 0.7;
    if (contentTypes.length === 0) {
        return DEFAULT_CONFIDENCE;
    }
    let sum = 0;
    for (const type of contentTypes) {
        const known = Object.prototype.hasOwnProperty.call(BLOCK_TYPE_CONFIDENCE, type);
        sum += known ? BLOCK_TYPE_CONFIDENCE[type] : DEFAULT_CONFIDENCE;
    }
    return sum / contentTypes.length;
}
582
/**
 * Produce the plain text of a block.
 *
 * Sources are tried in order: the block's `html` (tags stripped via
 * `stripHtmlTags`), then its `text`, then the texts of its children
 * (`children`, or `blocks` when `children` is null/undefined) joined with
 * single spaces. The result is trimmed; an empty string is returned when
 * nothing is available.
 */
function extractBlockText(block) {
    const markup = block.html ?? '';
    if (markup.length > 0) {
        return stripHtmlTags(markup).trim();
    }
    const plain = block.text ?? '';
    if (plain.length > 0) {
        return plain.trim();
    }
    const kids = block.children ?? block.blocks;
    if (!Array.isArray(kids)) {
        return '';
    }
    return kids
        .map((kid) => extractBlockText(kid))
        .filter((piece) => piece.length > 0)
        .join(' ')
        .trim();
}
611
/**
 * Collect PageHeader / PageFooter texts and flag the ones that repeat.
 *
 * The tree is walked with `walkBlocks`; Page blocks are counted, and the
 * non-empty text of every PageHeader / PageFooter block (via
 * `extractBlockText`) is recorded. A text is reported as repeated when it
 * occurs at least twice and more often than half of max(page count, 1).
 *
 * @param jsonBlocks - Root of the JSON block hierarchy (may be null)
 * @returns { headerTexts, footerTexts, repeatedHeaders, repeatedFooters };
 *          all arrays are empty when `jsonBlocks` is falsy
 */
export function detectRepeatedHeadersFooters(jsonBlocks) {
    const info = {
        headerTexts: [],
        footerTexts: [],
        repeatedHeaders: [],
        repeatedFooters: [],
    };
    if (!jsonBlocks) {
        return info;
    }
    const headerTally = new Map();
    const footerTally = new Map();
    let pages = 0;
    // Record one header/footer occurrence in both the flat list and the tally.
    const record = (block, texts, tally) => {
        const text = extractBlockText(block);
        if (text.length === 0) {
            return;
        }
        texts.push(text);
        tally.set(text, (tally.get(text) ?? 0) + 1);
    };
    walkBlocks(jsonBlocks, (block) => {
        switch (block.block_type) {
            case 'Page':
                pages += 1;
                break;
            case 'PageHeader':
                record(block, info.headerTexts, headerTally);
                break;
            case 'PageFooter':
                record(block, info.footerTexts, footerTally);
                break;
            default:
                break;
        }
    }, 0);
    const halfOfPages = Math.max(pages, 1) / 2;
    const isRepeated = (count) => count >= 2 && count > halfOfPages;
    headerTally.forEach((count, text) => {
        if (isRepeated(count)) {
            info.repeatedHeaders.push(text);
        }
    });
    footerTally.forEach((count, text) => {
        if (isRepeated(count)) {
            info.repeatedFooters.push(text);
        }
    });
    return info;
}
675
/**
 * Decide whether a chunk is essentially one of the repeated header/footer
 * texts.
 *
 * Both sides are lowercased and whitespace-collapsed before comparison. A
 * chunk matches a repeated text when they are equal, when the chunk is a
 * substring of the repeated text and at most 1.2x its length, or when the
 * repeated text is a substring of the chunk and the chunk is at most 1.5x
 * its length.
 *
 * @param chunkText - Chunk text to test
 * @param repeatedTexts - Repeated header/footer texts
 * @returns true when the chunk matches any repeated text
 */
export function isRepeatedHeaderFooter(chunkText, repeatedTexts) {
    if (repeatedTexts.length === 0) {
        return false;
    }
    const squash = (value) => value.toLowerCase().replace(/\s+/g, ' ').trim();
    const chunk = squash(chunkText);
    if (chunk.length === 0) {
        return false;
    }
    return repeatedTexts.some((candidate) => {
        const reference = squash(candidate);
        if (chunk === reference) {
            return true;
        }
        const chunkInsideReference = chunk.length <= reference.length * 1.2 && reference.includes(chunk);
        if (chunkInsideReference) {
            return true;
        }
        return chunk.length <= reference.length * 1.5 && chunk.includes(reference);
    });
}
705
/**
 * Build a structure record for every Table / TableGroup block that can be
 * located in the markdown text.
 *
 * For each located table the record holds its offsets, column headers
 * (from `extractTableColumnHeaders`, falling back to the markdown), row and
 * column counts (from `countTableDimensions`, falling back to the markdown
 * when the row count is 0), the first data row, an optional caption and a
 * summary. The caption is the text of the most recent preceding non-table
 * block when it looks like "Table 1." / "Figure 2:", truncated to 200
 * characters. Finally `detectTableContinuations` links related tables.
 *
 * @param jsonBlocks - Root of the JSON block hierarchy (may be null)
 * @param markdownText - Markdown text to search
 * @param pageOffsets - Page offset records forwarded to the locator
 * @returns Array of table structure records ([] when there is no input)
 */
export function extractTableStructures(jsonBlocks, markdownText, pageOffsets) {
    if (!jsonBlocks || markdownText.length === 0) {
        return [];
    }
    const structures = [];
    const isTableType = (type) => type === 'Table' || type === 'TableGroup';
    // Text of the latest non-table block, candidate caption for the next table.
    let precedingText = '';
    walkBlocks(jsonBlocks, (block, pageNum) => {
        const type = block.block_type;
        if (!isTableType(type)) {
            if (type) {
                const text = extractBlockText(block);
                if (text.length > 0) {
                    precedingText = text;
                }
            }
            return;
        }
        const region = locateBlockInMarkdown(block, type, pageNum, markdownText, pageOffsets);
        if (!region) {
            precedingText = '';
            return;
        }
        const tableMarkdown = markdownText.slice(region.startOffset, region.endOffset);
        const headersFromBlock = extractTableColumnHeaders(block);
        const columnHeaders = headersFromBlock.length === 0
            ? extractHeadersFromMarkdown(tableMarkdown)
            : headersFromBlock;
        const blockDims = countTableDimensions(block, columnHeaders.length);
        let rowCount = blockDims.rowCount;
        let columnCount = blockDims.columnCount;
        if (rowCount === 0) {
            const markdownDims = countTableDimensionsFromMarkdown(tableMarkdown);
            rowCount = markdownDims.rowCount;
            if (columnCount === 0) {
                columnCount = markdownDims.columnCount;
            }
        }
        const firstRowValues = extractFirstDataRow(tableMarkdown);
        const looksLikeCaption = precedingText.length > 0 && /^(Table|Figure)\s+\d+[.:]/i.test(precedingText);
        const caption = looksLikeCaption ? precedingText.slice(0, 200) : undefined;
        const summary = generateTableSummary(columnHeaders, rowCount, firstRowValues, caption);
        structures.push({
            startOffset: region.startOffset,
            endOffset: region.endOffset,
            columnHeaders,
            rowCount,
            columnCount,
            pageNumber: region.pageNumber,
            summary,
            firstRowValues,
            caption,
        });
        precedingText = '';
    }, 0);
    detectTableContinuations(structures);
    return structures;
}
783
/**
 * Read the column headers of a markdown table from its text.
 *
 * Returns the non-empty, trimmed cells of the first line that contains a
 * pipe, is not a separator row (such as |---|---|) and has at least one
 * non-empty cell.
 *
 * @param tableMarkdown - Markdown text of the table
 * @returns Header cell texts, or [] when no such line exists
 */
export function extractHeadersFromMarkdown(tableMarkdown) {
    const separatorRow = /^\|?[\s-:|]+\|?$/;
    for (const rawLine of tableMarkdown.split('\n')) {
        const line = rawLine.trim();
        if (line.length === 0 || !line.includes('|') || separatorRow.test(line)) {
            continue;
        }
        const cells = line
            .split('|')
            .map((cell) => cell.trim())
            .filter((cell) => cell.length > 0);
        if (cells.length > 0) {
            return cells;
        }
    }
    return [];
}
805
/**
 * Measure a markdown table from its text.
 *
 * Only non-blank lines containing a pipe are considered, and separator rows
 * are ignored. The first remaining row is the header; every later row counts
 * as a data row. The column count is the largest number of non-empty cells
 * found in any remaining row.
 *
 * @param tableMarkdown - Markdown text of the table
 * @returns { rowCount, columnCount }
 */
export function countTableDimensionsFromMarkdown(tableMarkdown) {
    const separatorRow = /^\|?[\s-:|]+\|?$/;
    const contentRows = tableMarkdown
        .split('\n')
        .map((line) => line.trim())
        .filter((line) => line.length > 0 && line.includes('|') && !separatorRow.test(line));
    const widths = contentRows.map((row) => row.split('|').map((cell) => cell.trim()).filter((cell) => cell.length > 0).length);
    return {
        rowCount: Math.max(contentRows.length - 1, 0),
        columnCount: widths.reduce((widest, width) => Math.max(widest, width), 0),
    };
}
834
/**
 * Return the cells of the first data row of a markdown table.
 *
 * Among the non-blank, pipe-containing lines that are not separator rows,
 * the first is taken as the header and the second as the first data row.
 *
 * @param tableMarkdown - Markdown text of the table
 * @returns Non-empty trimmed cells of the first data row, or [] if none
 */
export function extractFirstDataRow(tableMarkdown) {
    const separatorRow = /^\|?[\s-:|]+\|?$/;
    const contentRows = tableMarkdown
        .split('\n')
        .map((line) => line.trim())
        .filter((line) => line.length > 0 && line.includes('|') && !separatorRow.test(line));
    if (contentRows.length < 2) {
        return [];
    }
    return contentRows[1]
        .split('|')
        .map((cell) => cell.trim())
        .filter((cell) => cell.length > 0);
}
854
/**
 * Compose a one-line description of a table.
 *
 * Parts, joined with ". ": the caption (when truthy), then
 * "Table with N rows" (just "rows" when the row count is not positive)
 * followed by " and columns: ..." when headers exist, then
 * "Sample: ..." when first-row values exist. Results longer than 200
 * characters are cut to 197 characters plus "...".
 *
 * @param columnHeaders - Column header texts
 * @param rowCount - Number of data rows
 * @param firstRowValues - Cell values of the first data row
 * @param caption - Optional caption text
 * @returns Summary string of at most 200 characters
 */
export function generateTableSummary(columnHeaders, rowCount, firstRowValues, caption) {
    const MAX_LENGTH = 200;
    const rowDesc = rowCount > 0 ? `${rowCount} rows` : 'rows';
    const headline = columnHeaders.length > 0
        ? `Table with ${rowDesc} and columns: ${columnHeaders.join(', ')}`
        : `Table with ${rowDesc}`;
    const pieces = caption ? [caption, headline] : [headline];
    if (firstRowValues.length > 0) {
        pieces.push(`Sample: ${firstRowValues.join(', ')}`);
    }
    const summary = pieces.join('. ');
    return summary.length > MAX_LENGTH ? summary.slice(0, 197) + '...' : summary;
}
880
/**
 * Mark tables that appear to continue the table listed just before them.
 *
 * Each structure is compared with its predecessor in the array. Pairs are
 * skipped when either has no column headers, or when both page numbers are
 * known and the current page is more than one page after the previous one.
 * When `columnHeaderOverlap` of the two header lists is at least 0.8, the
 * current structure's `continuationOf` is set to the predecessor's index.
 *
 * @param structures - Table structures, modified in place
 */
function detectTableContinuations(structures) {
    structures.forEach((curr, index) => {
        if (index === 0) {
            return;
        }
        const prev = structures[index - 1];
        const bothHaveHeaders = prev.columnHeaders.length > 0 && curr.columnHeaders.length > 0;
        if (!bothHaveHeaders) {
            return;
        }
        const pagesKnown = prev.pageNumber !== null && curr.pageNumber !== null;
        if (pagesKnown && curr.pageNumber - prev.pageNumber > 1) {
            return;
        }
        if (columnHeaderOverlap(prev.columnHeaders, curr.columnHeaders) >= 0.8) {
            curr.continuationOf = index - 1;
        }
    });
}
905
/**
 * Sorensen-Dice similarity of two column header lists.
 *
 * Headers are lower-cased, trimmed and de-duplicated before comparison.
 * Two empty lists score 1; exactly one empty list scores 0.
 *
 * @param {string[]} a - First header list.
 * @param {string[]} b - Second header list.
 * @returns {number} 2 * |A ∩ B| / (|A| + |B|) over the normalized header sets.
 */
function columnHeaderOverlap(a, b) {
    const firstEmpty = a.length === 0;
    const secondEmpty = b.length === 0;
    if (firstEmpty && secondEmpty) {
        return 1;
    }
    if (firstEmpty || secondEmpty) {
        return 0;
    }
    const normalize = (header) => header.toLowerCase().trim();
    const left = new Set(a.map((header) => normalize(header)));
    const right = new Set(b.map((header) => normalize(header)));
    const shared = [...left].filter((header) => right.has(header)).length;
    return (2 * shared) / (left.size + right.size);
}
922
/**
 * Extract column headers for a table block.
 *
 * Scans the block's children (`children`, else `blocks`) in order. The first
 * TableRow/Row/TableHeader child that yields at least one non-empty cell text
 * supplies the headers. A child of type Table is searched recursively (the
 * TableGroup case). If there are no children, or none produce headers, the
 * block's HTML is parsed instead.
 *
 * @param {object} block - Table-like block.
 * @returns {string[]} Header texts; empty when none could be found.
 */
function extractTableColumnHeaders(block) {
    const rowLikeTypes = ['TableRow', 'Row', 'TableHeader'];
    const kids = block.children ?? block.blocks;
    const hasKids = Array.isArray(kids) && kids.length !== 0;
    if (hasKids) {
        for (const kid of kids) {
            const kind = kid.block_type;
            if (rowLikeTypes.includes(kind)) {
                const cells = kid.children ?? kid.blocks;
                if (Array.isArray(cells)) {
                    // Empty cell texts are dropped rather than kept as blank headers.
                    const texts = Array.from(cells, (cell) => extractBlockText(cell))
                        .filter((text) => text.length > 0);
                    if (texts.length > 0) {
                        return texts;
                    }
                }
            }
            else if (kind === 'Table') {
                // A group wrapping real tables: take headers from the first one that has any.
                const nestedHeaders = extractTableColumnHeaders(kid);
                if (nestedHeaders.length > 0) {
                    return nestedHeaders;
                }
            }
        }
    }
    // Nothing usable in the block tree: fall back to the HTML rendering.
    return extractHeadersFromHtml(block);
}
961
/**
 * Extract table headers from a block's HTML content (fallback path).
 *
 * Prefers the text of every `<th>` element in the HTML. When there are none,
 * the `<td>` cells of the first `<tr>` are used instead. Cells whose text is
 * empty after tag stripping and trimming are dropped.
 *
 * Matching spans newlines, so pretty-printed / multi-line markup is handled,
 * and the tag names are word-bounded so `<thead>` is not mistaken for `<th>`.
 *
 * @param {object} block - Block whose `html` property holds the table markup.
 * @returns {string[]} Header texts; empty when `html` is missing/empty or has no cells.
 */
function extractHeadersFromHtml(block) {
    const html = block.html ?? '';
    if (html.length === 0) {
        return [];
    }
    // Strip markup from each matched cell and discard the ones left blank.
    const toCellTexts = (cells) => cells
        .map((cell) => stripHtmlTags(cell).trim())
        .filter((text) => text.length > 0);
    // Explicit header cells win. [\s\S] (not .) lets a cell span several lines.
    const thMatches = html.match(/<th\b[^>]*>[\s\S]*?<\/th>/gi);
    if (thMatches && thMatches.length > 0) {
        return toCellTexts(thMatches);
    }
    // No <th>: treat the first row's <td> cells as the header row.
    const firstRowMatch = html.match(/<tr\b[^>]*>([\s\S]*?)<\/tr>/i);
    if (firstRowMatch) {
        const tdMatches = firstRowMatch[1].match(/<td\b[^>]*>[\s\S]*?<\/td>/gi);
        if (tdMatches && tdMatches.length > 0) {
            return toCellTexts(tdMatches);
        }
    }
    return [];
}
984
/**
 * Count rows and columns of a table block, falling back to its HTML.
 *
 * Every TableRow/Row/TableHeader child counts as one row, and its cell count
 * can raise the column count. Children of type Table are counted recursively
 * and added in. If no rows are found that way, `<tr>` tags in `block.html`
 * are counted minus one (the first row is taken to be the header), and the
 * column count is read from the first HTML row only when it is still 0.
 *
 * @param {object} block - Table-like block with `children`/`blocks` and/or `html`.
 * @param {number} headerColumnCount - Starting (minimum) column count.
 * @returns {{rowCount: number, columnCount: number}} The counted dimensions.
 */
function countTableDimensions(block, headerColumnCount) {
    const rowLikeTypes = ['TableRow', 'Row', 'TableHeader'];
    const kids = block.children ?? block.blocks;
    let rows = 0;
    let widest = headerColumnCount;
    for (const kid of Array.isArray(kids) ? kids : []) {
        const kind = kid.block_type;
        if (rowLikeTypes.includes(kind)) {
            rows += 1;
            const cells = kid.children ?? kid.blocks;
            if (Array.isArray(cells) && cells.length > widest) {
                widest = cells.length;
            }
            continue;
        }
        if (kind === 'Table') {
            // Table nested inside a group: fold its dimensions into ours.
            const inner = countTableDimensions(kid, headerColumnCount);
            rows += inner.rowCount;
            if (inner.columnCount > widest) {
                widest = inner.columnCount;
            }
        }
    }
    if (rows !== 0) {
        return { rowCount: rows, columnCount: widest };
    }
    // The block tree gave no rows, so inspect the HTML rendering instead.
    const markup = block.html ?? '';
    if (markup.length > 0) {
        const rowTags = markup.match(/<tr[^>]*>/gi);
        if (rowTags) {
            // The first <tr> is treated as the header; report data rows only.
            rows = Math.max(0, rowTags.length - 1);
        }
        if (widest === 0) {
            const leadingRow = markup.match(/<tr[^>]*>([\s\S]*?)<\/tr>/i);
            const cellTags = leadingRow ? leadingRow[1].match(/<t[dh][^>]*>/gi) ?? [] : [];
            if (cellTags.length > widest) {
                widest = cellTags.length;
            }
        }
    }
    return { rowCount: rows, columnCount: widest };
}
1033
+ //# sourceMappingURL=json-block-analyzer.js.map