@llamaindex/liteparse 1.5.2 → 2.0.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (578) hide show
  1. package/README.md +50 -373
  2. package/dist/cli.d.ts +3 -0
  3. package/dist/cli.d.ts.map +1 -0
  4. package/dist/cli.js +87 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/lib.d.ts +58 -0
  7. package/dist/lib.d.ts.map +1 -0
  8. package/dist/lib.js +88 -0
  9. package/dist/lib.js.map +1 -0
  10. package/dist/native.d.ts +54 -0
  11. package/dist/native.d.ts.map +1 -0
  12. package/dist/native.js +70 -0
  13. package/dist/native.js.map +1 -0
  14. package/libpdfium.so +0 -0
  15. package/liteparse.linux-x64-gnu.node +0 -0
  16. package/package.json +36 -50
  17. package/LICENSE +0 -201
  18. package/dist/cli/parse.d.ts +0 -4
  19. package/dist/cli/parse.d.ts.map +0 -1
  20. package/dist/cli/parse.js +0 -450
  21. package/dist/cli/parse.js.map +0 -1
  22. package/dist/package.json +0 -90
  23. package/dist/src/conversion/convertToPdf.d.ts +0 -65
  24. package/dist/src/conversion/convertToPdf.d.ts.map +0 -1
  25. package/dist/src/conversion/convertToPdf.js +0 -405
  26. package/dist/src/conversion/convertToPdf.js.map +0 -1
  27. package/dist/src/conversion/convertToPdf.test.d.ts +0 -2
  28. package/dist/src/conversion/convertToPdf.test.d.ts.map +0 -1
  29. package/dist/src/conversion/convertToPdf.test.js +0 -327
  30. package/dist/src/conversion/convertToPdf.test.js.map +0 -1
  31. package/dist/src/core/config.d.ts +0 -4
  32. package/dist/src/core/config.d.ts.map +0 -1
  33. package/dist/src/core/config.js +0 -26
  34. package/dist/src/core/config.js.map +0 -1
  35. package/dist/src/core/config.test.d.ts +0 -2
  36. package/dist/src/core/config.test.d.ts.map +0 -1
  37. package/dist/src/core/config.test.js +0 -21
  38. package/dist/src/core/config.test.js.map +0 -1
  39. package/dist/src/core/parser.d.ts +0 -92
  40. package/dist/src/core/parser.d.ts.map +0 -1
  41. package/dist/src/core/parser.js +0 -401
  42. package/dist/src/core/parser.js.map +0 -1
  43. package/dist/src/core/parser.test.d.ts +0 -2
  44. package/dist/src/core/parser.test.d.ts.map +0 -1
  45. package/dist/src/core/parser.test.js +0 -541
  46. package/dist/src/core/parser.test.js.map +0 -1
  47. package/dist/src/core/types.d.ts +0 -370
  48. package/dist/src/core/types.d.ts.map +0 -1
  49. package/dist/src/core/types.js +0 -2
  50. package/dist/src/core/types.js.map +0 -1
  51. package/dist/src/engines/ocr/http-simple.d.ts +0 -19
  52. package/dist/src/engines/ocr/http-simple.d.ts.map +0 -1
  53. package/dist/src/engines/ocr/http-simple.js +0 -69
  54. package/dist/src/engines/ocr/http-simple.js.map +0 -1
  55. package/dist/src/engines/ocr/http-simple.test.d.ts +0 -2
  56. package/dist/src/engines/ocr/http-simple.test.d.ts.map +0 -1
  57. package/dist/src/engines/ocr/http-simple.test.js +0 -108
  58. package/dist/src/engines/ocr/http-simple.test.js.map +0 -1
  59. package/dist/src/engines/ocr/interface.d.ts +0 -15
  60. package/dist/src/engines/ocr/interface.d.ts.map +0 -1
  61. package/dist/src/engines/ocr/interface.js +0 -2
  62. package/dist/src/engines/ocr/interface.js.map +0 -1
  63. package/dist/src/engines/ocr/tesseract.d.ts +0 -20
  64. package/dist/src/engines/ocr/tesseract.d.ts.map +0 -1
  65. package/dist/src/engines/ocr/tesseract.js +0 -161
  66. package/dist/src/engines/ocr/tesseract.js.map +0 -1
  67. package/dist/src/engines/ocr/tesseract.test.d.ts +0 -2
  68. package/dist/src/engines/ocr/tesseract.test.d.ts.map +0 -1
  69. package/dist/src/engines/ocr/tesseract.test.js +0 -94
  70. package/dist/src/engines/ocr/tesseract.test.js.map +0 -1
  71. package/dist/src/engines/pdf/interface.d.ts +0 -84
  72. package/dist/src/engines/pdf/interface.d.ts.map +0 -1
  73. package/dist/src/engines/pdf/interface.js +0 -2
  74. package/dist/src/engines/pdf/interface.js.map +0 -1
  75. package/dist/src/engines/pdf/pdfium-renderer.d.ts +0 -31
  76. package/dist/src/engines/pdf/pdfium-renderer.d.ts.map +0 -1
  77. package/dist/src/engines/pdf/pdfium-renderer.js +0 -145
  78. package/dist/src/engines/pdf/pdfium-renderer.js.map +0 -1
  79. package/dist/src/engines/pdf/pdfium-renderer.test.d.ts +0 -2
  80. package/dist/src/engines/pdf/pdfium-renderer.test.d.ts.map +0 -1
  81. package/dist/src/engines/pdf/pdfium-renderer.test.js +0 -109
  82. package/dist/src/engines/pdf/pdfium-renderer.test.js.map +0 -1
  83. package/dist/src/engines/pdf/pdfjs.d.ts +0 -14
  84. package/dist/src/engines/pdf/pdfjs.d.ts.map +0 -1
  85. package/dist/src/engines/pdf/pdfjs.js +0 -799
  86. package/dist/src/engines/pdf/pdfjs.js.map +0 -1
  87. package/dist/src/engines/pdf/pdfjs.test.d.ts +0 -2
  88. package/dist/src/engines/pdf/pdfjs.test.d.ts.map +0 -1
  89. package/dist/src/engines/pdf/pdfjs.test.js +0 -225
  90. package/dist/src/engines/pdf/pdfjs.test.js.map +0 -1
  91. package/dist/src/engines/pdf/pdfjsImporter.d.ts +0 -5
  92. package/dist/src/engines/pdf/pdfjsImporter.d.ts.map +0 -1
  93. package/dist/src/engines/pdf/pdfjsImporter.js +0 -45
  94. package/dist/src/engines/pdf/pdfjsImporter.js.map +0 -1
  95. package/dist/src/index.d.ts +0 -3
  96. package/dist/src/index.d.ts.map +0 -1
  97. package/dist/src/index.js +0 -5
  98. package/dist/src/index.js.map +0 -1
  99. package/dist/src/lib.d.ts +0 -19
  100. package/dist/src/lib.d.ts.map +0 -1
  101. package/dist/src/lib.js +0 -17
  102. package/dist/src/lib.js.map +0 -1
  103. package/dist/src/output/json.d.ts +0 -10
  104. package/dist/src/output/json.d.ts.map +0 -1
  105. package/dist/src/output/json.js +0 -32
  106. package/dist/src/output/json.js.map +0 -1
  107. package/dist/src/output/json.test.d.ts +0 -2
  108. package/dist/src/output/json.test.d.ts.map +0 -1
  109. package/dist/src/output/json.test.js +0 -199
  110. package/dist/src/output/json.test.js.map +0 -1
  111. package/dist/src/output/text.d.ts +0 -10
  112. package/dist/src/output/text.d.ts.map +0 -1
  113. package/dist/src/output/text.js +0 -17
  114. package/dist/src/output/text.js.map +0 -1
  115. package/dist/src/output/text.test.d.ts +0 -2
  116. package/dist/src/output/text.test.d.ts.map +0 -1
  117. package/dist/src/output/text.test.js +0 -65
  118. package/dist/src/output/text.test.js.map +0 -1
  119. package/dist/src/processing/bbox.d.ts +0 -20
  120. package/dist/src/processing/bbox.d.ts.map +0 -1
  121. package/dist/src/processing/bbox.js +0 -258
  122. package/dist/src/processing/bbox.js.map +0 -1
  123. package/dist/src/processing/bbox.test.d.ts +0 -2
  124. package/dist/src/processing/bbox.test.d.ts.map +0 -1
  125. package/dist/src/processing/bbox.test.js +0 -334
  126. package/dist/src/processing/bbox.test.js.map +0 -1
  127. package/dist/src/processing/cleanText.d.ts +0 -6
  128. package/dist/src/processing/cleanText.d.ts.map +0 -1
  129. package/dist/src/processing/cleanText.js +0 -73
  130. package/dist/src/processing/cleanText.js.map +0 -1
  131. package/dist/src/processing/cleanText.test.d.ts +0 -2
  132. package/dist/src/processing/cleanText.test.d.ts.map +0 -1
  133. package/dist/src/processing/cleanText.test.js +0 -46
  134. package/dist/src/processing/cleanText.test.js.map +0 -1
  135. package/dist/src/processing/grid.d.ts +0 -7
  136. package/dist/src/processing/grid.d.ts.map +0 -1
  137. package/dist/src/processing/grid.js +0 -13
  138. package/dist/src/processing/grid.js.map +0 -1
  139. package/dist/src/processing/gridDebugLogger.d.ts +0 -206
  140. package/dist/src/processing/gridDebugLogger.d.ts.map +0 -1
  141. package/dist/src/processing/gridDebugLogger.js +0 -446
  142. package/dist/src/processing/gridDebugLogger.js.map +0 -1
  143. package/dist/src/processing/gridProjection.d.ts +0 -19
  144. package/dist/src/processing/gridProjection.d.ts.map +0 -1
  145. package/dist/src/processing/gridProjection.js +0 -1813
  146. package/dist/src/processing/gridProjection.js.map +0 -1
  147. package/dist/src/processing/gridProjection.test.d.ts +0 -2
  148. package/dist/src/processing/gridProjection.test.d.ts.map +0 -1
  149. package/dist/src/processing/gridProjection.test.js +0 -495
  150. package/dist/src/processing/gridProjection.test.js.map +0 -1
  151. package/dist/src/processing/gridVisualizer.d.ts +0 -14
  152. package/dist/src/processing/gridVisualizer.d.ts.map +0 -1
  153. package/dist/src/processing/gridVisualizer.js +0 -166
  154. package/dist/src/processing/gridVisualizer.js.map +0 -1
  155. package/dist/src/processing/markupUtils.d.ts +0 -7
  156. package/dist/src/processing/markupUtils.d.ts.map +0 -1
  157. package/dist/src/processing/markupUtils.js +0 -25
  158. package/dist/src/processing/markupUtils.js.map +0 -1
  159. package/dist/src/processing/markupUtils.test.d.ts +0 -2
  160. package/dist/src/processing/markupUtils.test.d.ts.map +0 -1
  161. package/dist/src/processing/markupUtils.test.js +0 -26
  162. package/dist/src/processing/markupUtils.test.js.map +0 -1
  163. package/dist/src/processing/ocrUtils.d.ts +0 -24
  164. package/dist/src/processing/ocrUtils.d.ts.map +0 -1
  165. package/dist/src/processing/ocrUtils.js +0 -79
  166. package/dist/src/processing/ocrUtils.js.map +0 -1
  167. package/dist/src/processing/octUtils.test.d.ts +0 -2
  168. package/dist/src/processing/octUtils.test.d.ts.map +0 -1
  169. package/dist/src/processing/octUtils.test.js +0 -72
  170. package/dist/src/processing/octUtils.test.js.map +0 -1
  171. package/dist/src/processing/searchItems.d.ts +0 -26
  172. package/dist/src/processing/searchItems.d.ts.map +0 -1
  173. package/dist/src/processing/searchItems.js +0 -93
  174. package/dist/src/processing/searchItems.js.map +0 -1
  175. package/dist/src/processing/searchItems.test.d.ts +0 -2
  176. package/dist/src/processing/searchItems.test.d.ts.map +0 -1
  177. package/dist/src/processing/searchItems.test.js +0 -84
  178. package/dist/src/processing/searchItems.test.js.map +0 -1
  179. package/dist/src/processing/textUtils.d.ts +0 -20
  180. package/dist/src/processing/textUtils.d.ts.map +0 -1
  181. package/dist/src/processing/textUtils.js +0 -142
  182. package/dist/src/processing/textUtils.js.map +0 -1
  183. package/dist/src/processing/textUtils.test.d.ts +0 -2
  184. package/dist/src/processing/textUtils.test.d.ts.map +0 -1
  185. package/dist/src/processing/textUtils.test.js +0 -45
  186. package/dist/src/processing/textUtils.test.js.map +0 -1
  187. package/dist/src/vendor/pdfjs/LICENSE +0 -177
  188. package/dist/src/vendor/pdfjs/README.md +0 -0
  189. package/dist/src/vendor/pdfjs/cmaps/78-EUC-H.bcmap +0 -0
  190. package/dist/src/vendor/pdfjs/cmaps/78-EUC-V.bcmap +0 -0
  191. package/dist/src/vendor/pdfjs/cmaps/78-H.bcmap +0 -0
  192. package/dist/src/vendor/pdfjs/cmaps/78-RKSJ-H.bcmap +0 -0
  193. package/dist/src/vendor/pdfjs/cmaps/78-RKSJ-V.bcmap +0 -0
  194. package/dist/src/vendor/pdfjs/cmaps/78-V.bcmap +0 -0
  195. package/dist/src/vendor/pdfjs/cmaps/78ms-RKSJ-H.bcmap +0 -0
  196. package/dist/src/vendor/pdfjs/cmaps/78ms-RKSJ-V.bcmap +0 -0
  197. package/dist/src/vendor/pdfjs/cmaps/83pv-RKSJ-H.bcmap +0 -0
  198. package/dist/src/vendor/pdfjs/cmaps/90ms-RKSJ-H.bcmap +0 -0
  199. package/dist/src/vendor/pdfjs/cmaps/90ms-RKSJ-V.bcmap +0 -0
  200. package/dist/src/vendor/pdfjs/cmaps/90msp-RKSJ-H.bcmap +0 -0
  201. package/dist/src/vendor/pdfjs/cmaps/90msp-RKSJ-V.bcmap +0 -0
  202. package/dist/src/vendor/pdfjs/cmaps/90pv-RKSJ-H.bcmap +0 -0
  203. package/dist/src/vendor/pdfjs/cmaps/90pv-RKSJ-V.bcmap +0 -0
  204. package/dist/src/vendor/pdfjs/cmaps/Add-H.bcmap +0 -0
  205. package/dist/src/vendor/pdfjs/cmaps/Add-RKSJ-H.bcmap +0 -0
  206. package/dist/src/vendor/pdfjs/cmaps/Add-RKSJ-V.bcmap +0 -0
  207. package/dist/src/vendor/pdfjs/cmaps/Add-V.bcmap +0 -0
  208. package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-0.bcmap +0 -0
  209. package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-1.bcmap +0 -0
  210. package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-2.bcmap +0 -0
  211. package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-3.bcmap +0 -0
  212. package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-4.bcmap +0 -0
  213. package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-5.bcmap +0 -0
  214. package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-6.bcmap +0 -0
  215. package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-UCS2.bcmap +0 -0
  216. package/dist/src/vendor/pdfjs/cmaps/Adobe-GB1-0.bcmap +0 -0
  217. package/dist/src/vendor/pdfjs/cmaps/Adobe-GB1-1.bcmap +0 -0
  218. package/dist/src/vendor/pdfjs/cmaps/Adobe-GB1-2.bcmap +0 -0
  219. package/dist/src/vendor/pdfjs/cmaps/Adobe-GB1-3.bcmap +0 -0
  220. package/dist/src/vendor/pdfjs/cmaps/Adobe-GB1-4.bcmap +0 -0
  221. package/dist/src/vendor/pdfjs/cmaps/Adobe-GB1-5.bcmap +0 -0
  222. package/dist/src/vendor/pdfjs/cmaps/Adobe-GB1-UCS2.bcmap +0 -0
  223. package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-0.bcmap +0 -0
  224. package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-1.bcmap +0 -0
  225. package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-2.bcmap +0 -0
  226. package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-3.bcmap +0 -0
  227. package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-4.bcmap +0 -0
  228. package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-5.bcmap +0 -0
  229. package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-6.bcmap +0 -0
  230. package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-UCS2.bcmap +0 -0
  231. package/dist/src/vendor/pdfjs/cmaps/Adobe-Korea1-0.bcmap +0 -0
  232. package/dist/src/vendor/pdfjs/cmaps/Adobe-Korea1-1.bcmap +0 -0
  233. package/dist/src/vendor/pdfjs/cmaps/Adobe-Korea1-2.bcmap +0 -0
  234. package/dist/src/vendor/pdfjs/cmaps/Adobe-Korea1-UCS2.bcmap +0 -0
  235. package/dist/src/vendor/pdfjs/cmaps/B5-H.bcmap +0 -0
  236. package/dist/src/vendor/pdfjs/cmaps/B5-V.bcmap +0 -0
  237. package/dist/src/vendor/pdfjs/cmaps/B5pc-H.bcmap +0 -0
  238. package/dist/src/vendor/pdfjs/cmaps/B5pc-V.bcmap +0 -0
  239. package/dist/src/vendor/pdfjs/cmaps/CNS-EUC-H.bcmap +0 -0
  240. package/dist/src/vendor/pdfjs/cmaps/CNS-EUC-V.bcmap +0 -0
  241. package/dist/src/vendor/pdfjs/cmaps/CNS1-H.bcmap +0 -0
  242. package/dist/src/vendor/pdfjs/cmaps/CNS1-V.bcmap +0 -0
  243. package/dist/src/vendor/pdfjs/cmaps/CNS2-H.bcmap +0 -0
  244. package/dist/src/vendor/pdfjs/cmaps/CNS2-V.bcmap +0 -3
  245. package/dist/src/vendor/pdfjs/cmaps/ETHK-B5-H.bcmap +0 -0
  246. package/dist/src/vendor/pdfjs/cmaps/ETHK-B5-V.bcmap +0 -0
  247. package/dist/src/vendor/pdfjs/cmaps/ETen-B5-H.bcmap +0 -0
  248. package/dist/src/vendor/pdfjs/cmaps/ETen-B5-V.bcmap +0 -0
  249. package/dist/src/vendor/pdfjs/cmaps/ETenms-B5-H.bcmap +0 -3
  250. package/dist/src/vendor/pdfjs/cmaps/ETenms-B5-V.bcmap +0 -0
  251. package/dist/src/vendor/pdfjs/cmaps/EUC-H.bcmap +0 -0
  252. package/dist/src/vendor/pdfjs/cmaps/EUC-V.bcmap +0 -0
  253. package/dist/src/vendor/pdfjs/cmaps/Ext-H.bcmap +0 -0
  254. package/dist/src/vendor/pdfjs/cmaps/Ext-RKSJ-H.bcmap +0 -0
  255. package/dist/src/vendor/pdfjs/cmaps/Ext-RKSJ-V.bcmap +0 -0
  256. package/dist/src/vendor/pdfjs/cmaps/Ext-V.bcmap +0 -0
  257. package/dist/src/vendor/pdfjs/cmaps/GB-EUC-H.bcmap +0 -0
  258. package/dist/src/vendor/pdfjs/cmaps/GB-EUC-V.bcmap +0 -0
  259. package/dist/src/vendor/pdfjs/cmaps/GB-H.bcmap +0 -4
  260. package/dist/src/vendor/pdfjs/cmaps/GB-V.bcmap +0 -0
  261. package/dist/src/vendor/pdfjs/cmaps/GBK-EUC-H.bcmap +0 -0
  262. package/dist/src/vendor/pdfjs/cmaps/GBK-EUC-V.bcmap +0 -0
  263. package/dist/src/vendor/pdfjs/cmaps/GBK2K-H.bcmap +0 -0
  264. package/dist/src/vendor/pdfjs/cmaps/GBK2K-V.bcmap +0 -0
  265. package/dist/src/vendor/pdfjs/cmaps/GBKp-EUC-H.bcmap +0 -0
  266. package/dist/src/vendor/pdfjs/cmaps/GBKp-EUC-V.bcmap +0 -0
  267. package/dist/src/vendor/pdfjs/cmaps/GBT-EUC-H.bcmap +0 -0
  268. package/dist/src/vendor/pdfjs/cmaps/GBT-EUC-V.bcmap +0 -0
  269. package/dist/src/vendor/pdfjs/cmaps/GBT-H.bcmap +0 -0
  270. package/dist/src/vendor/pdfjs/cmaps/GBT-V.bcmap +0 -0
  271. package/dist/src/vendor/pdfjs/cmaps/GBTpc-EUC-H.bcmap +0 -0
  272. package/dist/src/vendor/pdfjs/cmaps/GBTpc-EUC-V.bcmap +0 -0
  273. package/dist/src/vendor/pdfjs/cmaps/GBpc-EUC-H.bcmap +0 -0
  274. package/dist/src/vendor/pdfjs/cmaps/GBpc-EUC-V.bcmap +0 -0
  275. package/dist/src/vendor/pdfjs/cmaps/H.bcmap +0 -0
  276. package/dist/src/vendor/pdfjs/cmaps/HKdla-B5-H.bcmap +0 -0
  277. package/dist/src/vendor/pdfjs/cmaps/HKdla-B5-V.bcmap +0 -0
  278. package/dist/src/vendor/pdfjs/cmaps/HKdlb-B5-H.bcmap +0 -0
  279. package/dist/src/vendor/pdfjs/cmaps/HKdlb-B5-V.bcmap +0 -0
  280. package/dist/src/vendor/pdfjs/cmaps/HKgccs-B5-H.bcmap +0 -0
  281. package/dist/src/vendor/pdfjs/cmaps/HKgccs-B5-V.bcmap +0 -0
  282. package/dist/src/vendor/pdfjs/cmaps/HKm314-B5-H.bcmap +0 -0
  283. package/dist/src/vendor/pdfjs/cmaps/HKm314-B5-V.bcmap +0 -0
  284. package/dist/src/vendor/pdfjs/cmaps/HKm471-B5-H.bcmap +0 -0
  285. package/dist/src/vendor/pdfjs/cmaps/HKm471-B5-V.bcmap +0 -0
  286. package/dist/src/vendor/pdfjs/cmaps/HKscs-B5-H.bcmap +0 -0
  287. package/dist/src/vendor/pdfjs/cmaps/HKscs-B5-V.bcmap +0 -0
  288. package/dist/src/vendor/pdfjs/cmaps/Hankaku.bcmap +0 -0
  289. package/dist/src/vendor/pdfjs/cmaps/Hiragana.bcmap +0 -0
  290. package/dist/src/vendor/pdfjs/cmaps/KSC-EUC-H.bcmap +0 -0
  291. package/dist/src/vendor/pdfjs/cmaps/KSC-EUC-V.bcmap +0 -0
  292. package/dist/src/vendor/pdfjs/cmaps/KSC-H.bcmap +0 -0
  293. package/dist/src/vendor/pdfjs/cmaps/KSC-Johab-H.bcmap +0 -0
  294. package/dist/src/vendor/pdfjs/cmaps/KSC-Johab-V.bcmap +0 -0
  295. package/dist/src/vendor/pdfjs/cmaps/KSC-V.bcmap +0 -0
  296. package/dist/src/vendor/pdfjs/cmaps/KSCms-UHC-H.bcmap +0 -0
  297. package/dist/src/vendor/pdfjs/cmaps/KSCms-UHC-HW-H.bcmap +0 -0
  298. package/dist/src/vendor/pdfjs/cmaps/KSCms-UHC-HW-V.bcmap +0 -0
  299. package/dist/src/vendor/pdfjs/cmaps/KSCms-UHC-V.bcmap +0 -0
  300. package/dist/src/vendor/pdfjs/cmaps/KSCpc-EUC-H.bcmap +0 -0
  301. package/dist/src/vendor/pdfjs/cmaps/KSCpc-EUC-V.bcmap +0 -0
  302. package/dist/src/vendor/pdfjs/cmaps/Katakana.bcmap +0 -0
  303. package/dist/src/vendor/pdfjs/cmaps/LICENSE +0 -36
  304. package/dist/src/vendor/pdfjs/cmaps/NWP-H.bcmap +0 -0
  305. package/dist/src/vendor/pdfjs/cmaps/NWP-V.bcmap +0 -0
  306. package/dist/src/vendor/pdfjs/cmaps/RKSJ-H.bcmap +0 -0
  307. package/dist/src/vendor/pdfjs/cmaps/RKSJ-V.bcmap +0 -0
  308. package/dist/src/vendor/pdfjs/cmaps/Roman.bcmap +0 -0
  309. package/dist/src/vendor/pdfjs/cmaps/UniCNS-UCS2-H.bcmap +0 -0
  310. package/dist/src/vendor/pdfjs/cmaps/UniCNS-UCS2-V.bcmap +0 -0
  311. package/dist/src/vendor/pdfjs/cmaps/UniCNS-UTF16-H.bcmap +0 -0
  312. package/dist/src/vendor/pdfjs/cmaps/UniCNS-UTF16-V.bcmap +0 -0
  313. package/dist/src/vendor/pdfjs/cmaps/UniCNS-UTF32-H.bcmap +0 -0
  314. package/dist/src/vendor/pdfjs/cmaps/UniCNS-UTF32-V.bcmap +0 -0
  315. package/dist/src/vendor/pdfjs/cmaps/UniCNS-UTF8-H.bcmap +0 -0
  316. package/dist/src/vendor/pdfjs/cmaps/UniCNS-UTF8-V.bcmap +0 -0
  317. package/dist/src/vendor/pdfjs/cmaps/UniGB-UCS2-H.bcmap +0 -0
  318. package/dist/src/vendor/pdfjs/cmaps/UniGB-UCS2-V.bcmap +0 -0
  319. package/dist/src/vendor/pdfjs/cmaps/UniGB-UTF16-H.bcmap +0 -0
  320. package/dist/src/vendor/pdfjs/cmaps/UniGB-UTF16-V.bcmap +0 -0
  321. package/dist/src/vendor/pdfjs/cmaps/UniGB-UTF32-H.bcmap +0 -0
  322. package/dist/src/vendor/pdfjs/cmaps/UniGB-UTF32-V.bcmap +0 -0
  323. package/dist/src/vendor/pdfjs/cmaps/UniGB-UTF8-H.bcmap +0 -0
  324. package/dist/src/vendor/pdfjs/cmaps/UniGB-UTF8-V.bcmap +0 -0
  325. package/dist/src/vendor/pdfjs/cmaps/UniJIS-UCS2-H.bcmap +0 -0
  326. package/dist/src/vendor/pdfjs/cmaps/UniJIS-UCS2-HW-H.bcmap +0 -0
  327. package/dist/src/vendor/pdfjs/cmaps/UniJIS-UCS2-HW-V.bcmap +0 -0
  328. package/dist/src/vendor/pdfjs/cmaps/UniJIS-UCS2-V.bcmap +0 -0
  329. package/dist/src/vendor/pdfjs/cmaps/UniJIS-UTF16-H.bcmap +0 -0
  330. package/dist/src/vendor/pdfjs/cmaps/UniJIS-UTF16-V.bcmap +0 -0
  331. package/dist/src/vendor/pdfjs/cmaps/UniJIS-UTF32-H.bcmap +0 -0
  332. package/dist/src/vendor/pdfjs/cmaps/UniJIS-UTF32-V.bcmap +0 -0
  333. package/dist/src/vendor/pdfjs/cmaps/UniJIS-UTF8-H.bcmap +0 -0
  334. package/dist/src/vendor/pdfjs/cmaps/UniJIS-UTF8-V.bcmap +0 -0
  335. package/dist/src/vendor/pdfjs/cmaps/UniJIS2004-UTF16-H.bcmap +0 -0
  336. package/dist/src/vendor/pdfjs/cmaps/UniJIS2004-UTF16-V.bcmap +0 -0
  337. package/dist/src/vendor/pdfjs/cmaps/UniJIS2004-UTF32-H.bcmap +0 -0
  338. package/dist/src/vendor/pdfjs/cmaps/UniJIS2004-UTF32-V.bcmap +0 -0
  339. package/dist/src/vendor/pdfjs/cmaps/UniJIS2004-UTF8-H.bcmap +0 -0
  340. package/dist/src/vendor/pdfjs/cmaps/UniJIS2004-UTF8-V.bcmap +0 -0
  341. package/dist/src/vendor/pdfjs/cmaps/UniJISPro-UCS2-HW-V.bcmap +0 -0
  342. package/dist/src/vendor/pdfjs/cmaps/UniJISPro-UCS2-V.bcmap +0 -0
  343. package/dist/src/vendor/pdfjs/cmaps/UniJISPro-UTF8-V.bcmap +0 -0
  344. package/dist/src/vendor/pdfjs/cmaps/UniJISX0213-UTF32-H.bcmap +0 -0
  345. package/dist/src/vendor/pdfjs/cmaps/UniJISX0213-UTF32-V.bcmap +0 -0
  346. package/dist/src/vendor/pdfjs/cmaps/UniJISX02132004-UTF32-H.bcmap +0 -0
  347. package/dist/src/vendor/pdfjs/cmaps/UniJISX02132004-UTF32-V.bcmap +0 -0
  348. package/dist/src/vendor/pdfjs/cmaps/UniKS-UCS2-H.bcmap +0 -0
  349. package/dist/src/vendor/pdfjs/cmaps/UniKS-UCS2-V.bcmap +0 -0
  350. package/dist/src/vendor/pdfjs/cmaps/UniKS-UTF16-H.bcmap +0 -0
  351. package/dist/src/vendor/pdfjs/cmaps/UniKS-UTF16-V.bcmap +0 -0
  352. package/dist/src/vendor/pdfjs/cmaps/UniKS-UTF32-H.bcmap +0 -0
  353. package/dist/src/vendor/pdfjs/cmaps/UniKS-UTF32-V.bcmap +0 -0
  354. package/dist/src/vendor/pdfjs/cmaps/UniKS-UTF8-H.bcmap +0 -0
  355. package/dist/src/vendor/pdfjs/cmaps/UniKS-UTF8-V.bcmap +0 -0
  356. package/dist/src/vendor/pdfjs/cmaps/V.bcmap +0 -0
  357. package/dist/src/vendor/pdfjs/cmaps/WP-Symbol.bcmap +0 -0
  358. package/dist/src/vendor/pdfjs/jbig2.wasm +0 -0
  359. package/dist/src/vendor/pdfjs/openjpeg.wasm +0 -0
  360. package/dist/src/vendor/pdfjs/pdf.mjs +0 -33603
  361. package/dist/src/vendor/pdfjs/pdf.mjs.map +0 -1
  362. package/dist/src/vendor/pdfjs/pdf.sandbox.mjs +0 -4936
  363. package/dist/src/vendor/pdfjs/pdf.sandbox.mjs.map +0 -1
  364. package/dist/src/vendor/pdfjs/pdf.worker.mjs +0 -70100
  365. package/dist/src/vendor/pdfjs/pdf.worker.mjs.map +0 -1
  366. package/dist/src/vendor/pdfjs/qcms_bg.wasm +0 -0
  367. package/dist/src/vendor/pdfjs/standard_fonts/FoxitDingbats.pfb +0 -0
  368. package/dist/src/vendor/pdfjs/standard_fonts/FoxitFixed.pfb +0 -0
  369. package/dist/src/vendor/pdfjs/standard_fonts/FoxitFixedBold.pfb +0 -0
  370. package/dist/src/vendor/pdfjs/standard_fonts/FoxitFixedBoldItalic.pfb +0 -0
  371. package/dist/src/vendor/pdfjs/standard_fonts/FoxitFixedItalic.pfb +0 -0
  372. package/dist/src/vendor/pdfjs/standard_fonts/FoxitSerif.pfb +0 -0
  373. package/dist/src/vendor/pdfjs/standard_fonts/FoxitSerifBold.pfb +0 -0
  374. package/dist/src/vendor/pdfjs/standard_fonts/FoxitSerifBoldItalic.pfb +0 -0
  375. package/dist/src/vendor/pdfjs/standard_fonts/FoxitSerifItalic.pfb +0 -0
  376. package/dist/src/vendor/pdfjs/standard_fonts/FoxitSymbol.pfb +0 -0
  377. package/dist/src/vendor/pdfjs/standard_fonts/LICENSE_FOXIT +0 -27
  378. package/dist/src/vendor/pdfjs/standard_fonts/LICENSE_LIBERATION +0 -102
  379. package/dist/src/vendor/pdfjs/standard_fonts/LiberationSans-Bold.ttf +0 -0
  380. package/dist/src/vendor/pdfjs/standard_fonts/LiberationSans-BoldItalic.ttf +0 -0
  381. package/dist/src/vendor/pdfjs/standard_fonts/LiberationSans-Italic.ttf +0 -0
  382. package/dist/src/vendor/pdfjs/standard_fonts/LiberationSans-Regular.ttf +0 -0
  383. package/src/vendor/pdfjs/LICENSE +0 -177
  384. package/src/vendor/pdfjs/README.md +0 -0
  385. package/src/vendor/pdfjs/cmaps/78-EUC-H.bcmap +0 -0
  386. package/src/vendor/pdfjs/cmaps/78-EUC-V.bcmap +0 -0
  387. package/src/vendor/pdfjs/cmaps/78-H.bcmap +0 -0
  388. package/src/vendor/pdfjs/cmaps/78-RKSJ-H.bcmap +0 -0
  389. package/src/vendor/pdfjs/cmaps/78-RKSJ-V.bcmap +0 -0
  390. package/src/vendor/pdfjs/cmaps/78-V.bcmap +0 -0
  391. package/src/vendor/pdfjs/cmaps/78ms-RKSJ-H.bcmap +0 -0
  392. package/src/vendor/pdfjs/cmaps/78ms-RKSJ-V.bcmap +0 -0
  393. package/src/vendor/pdfjs/cmaps/83pv-RKSJ-H.bcmap +0 -0
  394. package/src/vendor/pdfjs/cmaps/90ms-RKSJ-H.bcmap +0 -0
  395. package/src/vendor/pdfjs/cmaps/90ms-RKSJ-V.bcmap +0 -0
  396. package/src/vendor/pdfjs/cmaps/90msp-RKSJ-H.bcmap +0 -0
  397. package/src/vendor/pdfjs/cmaps/90msp-RKSJ-V.bcmap +0 -0
  398. package/src/vendor/pdfjs/cmaps/90pv-RKSJ-H.bcmap +0 -0
  399. package/src/vendor/pdfjs/cmaps/90pv-RKSJ-V.bcmap +0 -0
  400. package/src/vendor/pdfjs/cmaps/Add-H.bcmap +0 -0
  401. package/src/vendor/pdfjs/cmaps/Add-RKSJ-H.bcmap +0 -0
  402. package/src/vendor/pdfjs/cmaps/Add-RKSJ-V.bcmap +0 -0
  403. package/src/vendor/pdfjs/cmaps/Add-V.bcmap +0 -0
  404. package/src/vendor/pdfjs/cmaps/Adobe-CNS1-0.bcmap +0 -0
  405. package/src/vendor/pdfjs/cmaps/Adobe-CNS1-1.bcmap +0 -0
  406. package/src/vendor/pdfjs/cmaps/Adobe-CNS1-2.bcmap +0 -0
  407. package/src/vendor/pdfjs/cmaps/Adobe-CNS1-3.bcmap +0 -0
  408. package/src/vendor/pdfjs/cmaps/Adobe-CNS1-4.bcmap +0 -0
  409. package/src/vendor/pdfjs/cmaps/Adobe-CNS1-5.bcmap +0 -0
  410. package/src/vendor/pdfjs/cmaps/Adobe-CNS1-6.bcmap +0 -0
  411. package/src/vendor/pdfjs/cmaps/Adobe-CNS1-UCS2.bcmap +0 -0
  412. package/src/vendor/pdfjs/cmaps/Adobe-GB1-0.bcmap +0 -0
  413. package/src/vendor/pdfjs/cmaps/Adobe-GB1-1.bcmap +0 -0
  414. package/src/vendor/pdfjs/cmaps/Adobe-GB1-2.bcmap +0 -0
  415. package/src/vendor/pdfjs/cmaps/Adobe-GB1-3.bcmap +0 -0
  416. package/src/vendor/pdfjs/cmaps/Adobe-GB1-4.bcmap +0 -0
  417. package/src/vendor/pdfjs/cmaps/Adobe-GB1-5.bcmap +0 -0
  418. package/src/vendor/pdfjs/cmaps/Adobe-GB1-UCS2.bcmap +0 -0
  419. package/src/vendor/pdfjs/cmaps/Adobe-Japan1-0.bcmap +0 -0
  420. package/src/vendor/pdfjs/cmaps/Adobe-Japan1-1.bcmap +0 -0
  421. package/src/vendor/pdfjs/cmaps/Adobe-Japan1-2.bcmap +0 -0
  422. package/src/vendor/pdfjs/cmaps/Adobe-Japan1-3.bcmap +0 -0
  423. package/src/vendor/pdfjs/cmaps/Adobe-Japan1-4.bcmap +0 -0
  424. package/src/vendor/pdfjs/cmaps/Adobe-Japan1-5.bcmap +0 -0
  425. package/src/vendor/pdfjs/cmaps/Adobe-Japan1-6.bcmap +0 -0
  426. package/src/vendor/pdfjs/cmaps/Adobe-Japan1-UCS2.bcmap +0 -0
  427. package/src/vendor/pdfjs/cmaps/Adobe-Korea1-0.bcmap +0 -0
  428. package/src/vendor/pdfjs/cmaps/Adobe-Korea1-1.bcmap +0 -0
  429. package/src/vendor/pdfjs/cmaps/Adobe-Korea1-2.bcmap +0 -0
  430. package/src/vendor/pdfjs/cmaps/Adobe-Korea1-UCS2.bcmap +0 -0
  431. package/src/vendor/pdfjs/cmaps/B5-H.bcmap +0 -0
  432. package/src/vendor/pdfjs/cmaps/B5-V.bcmap +0 -0
  433. package/src/vendor/pdfjs/cmaps/B5pc-H.bcmap +0 -0
  434. package/src/vendor/pdfjs/cmaps/B5pc-V.bcmap +0 -0
  435. package/src/vendor/pdfjs/cmaps/CNS-EUC-H.bcmap +0 -0
  436. package/src/vendor/pdfjs/cmaps/CNS-EUC-V.bcmap +0 -0
  437. package/src/vendor/pdfjs/cmaps/CNS1-H.bcmap +0 -0
  438. package/src/vendor/pdfjs/cmaps/CNS1-V.bcmap +0 -0
  439. package/src/vendor/pdfjs/cmaps/CNS2-H.bcmap +0 -0
  440. package/src/vendor/pdfjs/cmaps/CNS2-V.bcmap +0 -3
  441. package/src/vendor/pdfjs/cmaps/ETHK-B5-H.bcmap +0 -0
  442. package/src/vendor/pdfjs/cmaps/ETHK-B5-V.bcmap +0 -0
  443. package/src/vendor/pdfjs/cmaps/ETen-B5-H.bcmap +0 -0
  444. package/src/vendor/pdfjs/cmaps/ETen-B5-V.bcmap +0 -0
  445. package/src/vendor/pdfjs/cmaps/ETenms-B5-H.bcmap +0 -3
  446. package/src/vendor/pdfjs/cmaps/ETenms-B5-V.bcmap +0 -0
  447. package/src/vendor/pdfjs/cmaps/EUC-H.bcmap +0 -0
  448. package/src/vendor/pdfjs/cmaps/EUC-V.bcmap +0 -0
  449. package/src/vendor/pdfjs/cmaps/Ext-H.bcmap +0 -0
  450. package/src/vendor/pdfjs/cmaps/Ext-RKSJ-H.bcmap +0 -0
  451. package/src/vendor/pdfjs/cmaps/Ext-RKSJ-V.bcmap +0 -0
  452. package/src/vendor/pdfjs/cmaps/Ext-V.bcmap +0 -0
  453. package/src/vendor/pdfjs/cmaps/GB-EUC-H.bcmap +0 -0
  454. package/src/vendor/pdfjs/cmaps/GB-EUC-V.bcmap +0 -0
  455. package/src/vendor/pdfjs/cmaps/GB-H.bcmap +0 -4
  456. package/src/vendor/pdfjs/cmaps/GB-V.bcmap +0 -0
  457. package/src/vendor/pdfjs/cmaps/GBK-EUC-H.bcmap +0 -0
  458. package/src/vendor/pdfjs/cmaps/GBK-EUC-V.bcmap +0 -0
  459. package/src/vendor/pdfjs/cmaps/GBK2K-H.bcmap +0 -0
  460. package/src/vendor/pdfjs/cmaps/GBK2K-V.bcmap +0 -0
  461. package/src/vendor/pdfjs/cmaps/GBKp-EUC-H.bcmap +0 -0
  462. package/src/vendor/pdfjs/cmaps/GBKp-EUC-V.bcmap +0 -0
  463. package/src/vendor/pdfjs/cmaps/GBT-EUC-H.bcmap +0 -0
  464. package/src/vendor/pdfjs/cmaps/GBT-EUC-V.bcmap +0 -0
  465. package/src/vendor/pdfjs/cmaps/GBT-H.bcmap +0 -0
  466. package/src/vendor/pdfjs/cmaps/GBT-V.bcmap +0 -0
  467. package/src/vendor/pdfjs/cmaps/GBTpc-EUC-H.bcmap +0 -0
  468. package/src/vendor/pdfjs/cmaps/GBTpc-EUC-V.bcmap +0 -0
  469. package/src/vendor/pdfjs/cmaps/GBpc-EUC-H.bcmap +0 -0
  470. package/src/vendor/pdfjs/cmaps/GBpc-EUC-V.bcmap +0 -0
  471. package/src/vendor/pdfjs/cmaps/H.bcmap +0 -0
  472. package/src/vendor/pdfjs/cmaps/HKdla-B5-H.bcmap +0 -0
  473. package/src/vendor/pdfjs/cmaps/HKdla-B5-V.bcmap +0 -0
  474. package/src/vendor/pdfjs/cmaps/HKdlb-B5-H.bcmap +0 -0
  475. package/src/vendor/pdfjs/cmaps/HKdlb-B5-V.bcmap +0 -0
  476. package/src/vendor/pdfjs/cmaps/HKgccs-B5-H.bcmap +0 -0
  477. package/src/vendor/pdfjs/cmaps/HKgccs-B5-V.bcmap +0 -0
  478. package/src/vendor/pdfjs/cmaps/HKm314-B5-H.bcmap +0 -0
  479. package/src/vendor/pdfjs/cmaps/HKm314-B5-V.bcmap +0 -0
  480. package/src/vendor/pdfjs/cmaps/HKm471-B5-H.bcmap +0 -0
  481. package/src/vendor/pdfjs/cmaps/HKm471-B5-V.bcmap +0 -0
  482. package/src/vendor/pdfjs/cmaps/HKscs-B5-H.bcmap +0 -0
  483. package/src/vendor/pdfjs/cmaps/HKscs-B5-V.bcmap +0 -0
  484. package/src/vendor/pdfjs/cmaps/Hankaku.bcmap +0 -0
  485. package/src/vendor/pdfjs/cmaps/Hiragana.bcmap +0 -0
  486. package/src/vendor/pdfjs/cmaps/KSC-EUC-H.bcmap +0 -0
  487. package/src/vendor/pdfjs/cmaps/KSC-EUC-V.bcmap +0 -0
  488. package/src/vendor/pdfjs/cmaps/KSC-H.bcmap +0 -0
  489. package/src/vendor/pdfjs/cmaps/KSC-Johab-H.bcmap +0 -0
  490. package/src/vendor/pdfjs/cmaps/KSC-Johab-V.bcmap +0 -0
  491. package/src/vendor/pdfjs/cmaps/KSC-V.bcmap +0 -0
  492. package/src/vendor/pdfjs/cmaps/KSCms-UHC-H.bcmap +0 -0
  493. package/src/vendor/pdfjs/cmaps/KSCms-UHC-HW-H.bcmap +0 -0
  494. package/src/vendor/pdfjs/cmaps/KSCms-UHC-HW-V.bcmap +0 -0
  495. package/src/vendor/pdfjs/cmaps/KSCms-UHC-V.bcmap +0 -0
  496. package/src/vendor/pdfjs/cmaps/KSCpc-EUC-H.bcmap +0 -0
  497. package/src/vendor/pdfjs/cmaps/KSCpc-EUC-V.bcmap +0 -0
  498. package/src/vendor/pdfjs/cmaps/Katakana.bcmap +0 -0
  499. package/src/vendor/pdfjs/cmaps/LICENSE +0 -36
  500. package/src/vendor/pdfjs/cmaps/NWP-H.bcmap +0 -0
  501. package/src/vendor/pdfjs/cmaps/NWP-V.bcmap +0 -0
  502. package/src/vendor/pdfjs/cmaps/RKSJ-H.bcmap +0 -0
  503. package/src/vendor/pdfjs/cmaps/RKSJ-V.bcmap +0 -0
  504. package/src/vendor/pdfjs/cmaps/Roman.bcmap +0 -0
  505. package/src/vendor/pdfjs/cmaps/UniCNS-UCS2-H.bcmap +0 -0
  506. package/src/vendor/pdfjs/cmaps/UniCNS-UCS2-V.bcmap +0 -0
  507. package/src/vendor/pdfjs/cmaps/UniCNS-UTF16-H.bcmap +0 -0
  508. package/src/vendor/pdfjs/cmaps/UniCNS-UTF16-V.bcmap +0 -0
  509. package/src/vendor/pdfjs/cmaps/UniCNS-UTF32-H.bcmap +0 -0
  510. package/src/vendor/pdfjs/cmaps/UniCNS-UTF32-V.bcmap +0 -0
  511. package/src/vendor/pdfjs/cmaps/UniCNS-UTF8-H.bcmap +0 -0
  512. package/src/vendor/pdfjs/cmaps/UniCNS-UTF8-V.bcmap +0 -0
  513. package/src/vendor/pdfjs/cmaps/UniGB-UCS2-H.bcmap +0 -0
  514. package/src/vendor/pdfjs/cmaps/UniGB-UCS2-V.bcmap +0 -0
  515. package/src/vendor/pdfjs/cmaps/UniGB-UTF16-H.bcmap +0 -0
  516. package/src/vendor/pdfjs/cmaps/UniGB-UTF16-V.bcmap +0 -0
  517. package/src/vendor/pdfjs/cmaps/UniGB-UTF32-H.bcmap +0 -0
  518. package/src/vendor/pdfjs/cmaps/UniGB-UTF32-V.bcmap +0 -0
  519. package/src/vendor/pdfjs/cmaps/UniGB-UTF8-H.bcmap +0 -0
  520. package/src/vendor/pdfjs/cmaps/UniGB-UTF8-V.bcmap +0 -0
  521. package/src/vendor/pdfjs/cmaps/UniJIS-UCS2-H.bcmap +0 -0
  522. package/src/vendor/pdfjs/cmaps/UniJIS-UCS2-HW-H.bcmap +0 -0
  523. package/src/vendor/pdfjs/cmaps/UniJIS-UCS2-HW-V.bcmap +0 -0
  524. package/src/vendor/pdfjs/cmaps/UniJIS-UCS2-V.bcmap +0 -0
  525. package/src/vendor/pdfjs/cmaps/UniJIS-UTF16-H.bcmap +0 -0
  526. package/src/vendor/pdfjs/cmaps/UniJIS-UTF16-V.bcmap +0 -0
  527. package/src/vendor/pdfjs/cmaps/UniJIS-UTF32-H.bcmap +0 -0
  528. package/src/vendor/pdfjs/cmaps/UniJIS-UTF32-V.bcmap +0 -0
  529. package/src/vendor/pdfjs/cmaps/UniJIS-UTF8-H.bcmap +0 -0
  530. package/src/vendor/pdfjs/cmaps/UniJIS-UTF8-V.bcmap +0 -0
  531. package/src/vendor/pdfjs/cmaps/UniJIS2004-UTF16-H.bcmap +0 -0
  532. package/src/vendor/pdfjs/cmaps/UniJIS2004-UTF16-V.bcmap +0 -0
  533. package/src/vendor/pdfjs/cmaps/UniJIS2004-UTF32-H.bcmap +0 -0
  534. package/src/vendor/pdfjs/cmaps/UniJIS2004-UTF32-V.bcmap +0 -0
  535. package/src/vendor/pdfjs/cmaps/UniJIS2004-UTF8-H.bcmap +0 -0
  536. package/src/vendor/pdfjs/cmaps/UniJIS2004-UTF8-V.bcmap +0 -0
  537. package/src/vendor/pdfjs/cmaps/UniJISPro-UCS2-HW-V.bcmap +0 -0
  538. package/src/vendor/pdfjs/cmaps/UniJISPro-UCS2-V.bcmap +0 -0
  539. package/src/vendor/pdfjs/cmaps/UniJISPro-UTF8-V.bcmap +0 -0
  540. package/src/vendor/pdfjs/cmaps/UniJISX0213-UTF32-H.bcmap +0 -0
  541. package/src/vendor/pdfjs/cmaps/UniJISX0213-UTF32-V.bcmap +0 -0
  542. package/src/vendor/pdfjs/cmaps/UniJISX02132004-UTF32-H.bcmap +0 -0
  543. package/src/vendor/pdfjs/cmaps/UniJISX02132004-UTF32-V.bcmap +0 -0
  544. package/src/vendor/pdfjs/cmaps/UniKS-UCS2-H.bcmap +0 -0
  545. package/src/vendor/pdfjs/cmaps/UniKS-UCS2-V.bcmap +0 -0
  546. package/src/vendor/pdfjs/cmaps/UniKS-UTF16-H.bcmap +0 -0
  547. package/src/vendor/pdfjs/cmaps/UniKS-UTF16-V.bcmap +0 -0
  548. package/src/vendor/pdfjs/cmaps/UniKS-UTF32-H.bcmap +0 -0
  549. package/src/vendor/pdfjs/cmaps/UniKS-UTF32-V.bcmap +0 -0
  550. package/src/vendor/pdfjs/cmaps/UniKS-UTF8-H.bcmap +0 -0
  551. package/src/vendor/pdfjs/cmaps/UniKS-UTF8-V.bcmap +0 -0
  552. package/src/vendor/pdfjs/cmaps/V.bcmap +0 -0
  553. package/src/vendor/pdfjs/cmaps/WP-Symbol.bcmap +0 -0
  554. package/src/vendor/pdfjs/jbig2.wasm +0 -0
  555. package/src/vendor/pdfjs/openjpeg.wasm +0 -0
  556. package/src/vendor/pdfjs/pdf.mjs +0 -33603
  557. package/src/vendor/pdfjs/pdf.mjs.map +0 -1
  558. package/src/vendor/pdfjs/pdf.sandbox.mjs +0 -4936
  559. package/src/vendor/pdfjs/pdf.sandbox.mjs.map +0 -1
  560. package/src/vendor/pdfjs/pdf.worker.mjs +0 -70100
  561. package/src/vendor/pdfjs/pdf.worker.mjs.map +0 -1
  562. package/src/vendor/pdfjs/qcms_bg.wasm +0 -0
  563. package/src/vendor/pdfjs/standard_fonts/FoxitDingbats.pfb +0 -0
  564. package/src/vendor/pdfjs/standard_fonts/FoxitFixed.pfb +0 -0
  565. package/src/vendor/pdfjs/standard_fonts/FoxitFixedBold.pfb +0 -0
  566. package/src/vendor/pdfjs/standard_fonts/FoxitFixedBoldItalic.pfb +0 -0
  567. package/src/vendor/pdfjs/standard_fonts/FoxitFixedItalic.pfb +0 -0
  568. package/src/vendor/pdfjs/standard_fonts/FoxitSerif.pfb +0 -0
  569. package/src/vendor/pdfjs/standard_fonts/FoxitSerifBold.pfb +0 -0
  570. package/src/vendor/pdfjs/standard_fonts/FoxitSerifBoldItalic.pfb +0 -0
  571. package/src/vendor/pdfjs/standard_fonts/FoxitSerifItalic.pfb +0 -0
  572. package/src/vendor/pdfjs/standard_fonts/FoxitSymbol.pfb +0 -0
  573. package/src/vendor/pdfjs/standard_fonts/LICENSE_FOXIT +0 -27
  574. package/src/vendor/pdfjs/standard_fonts/LICENSE_LIBERATION +0 -102
  575. package/src/vendor/pdfjs/standard_fonts/LiberationSans-Bold.ttf +0 -0
  576. package/src/vendor/pdfjs/standard_fonts/LiberationSans-BoldItalic.ttf +0 -0
  577. package/src/vendor/pdfjs/standard_fonts/LiberationSans-Italic.ttf +0 -0
  578. package/src/vendor/pdfjs/standard_fonts/LiberationSans-Regular.ttf +0 -0
@@ -1,1813 +0,0 @@
1
- import { strToSubscriptString, strToPostScript } from "./textUtils.js";
2
- import { buildBbox } from "./bbox.js";
3
- import { cleanRawText } from "./cleanText.js";
4
- import { applyMarkupTags } from "./markupUtils.js";
5
- import { createGridDebugLogger } from "./gridDebugLogger.js";
6
- import { renderAllVisualizations } from "./gridVisualizer.js";
7
// Minimum spaces between unsnapped bboxes (likely justified text)
const FLOATING_SPACES = 2;
// Minimum spaces between snapped columns
const COLUMN_SPACES = 4;
// --- Flowing text detection thresholds ---
// Max total anchors (left + right + center) before a block is considered structured
const FLOWING_MAX_TOTAL_ANCHORS = 4;
// Max left anchors before a block is considered structured
const FLOWING_MAX_LEFT_ANCHORS = 3;
// Minimum non-empty lines required to classify a block
const FLOWING_MIN_LINES = 3;
// Fraction of page width a line must span to count as "wide"
const FLOWING_WIDE_LINE_RATIO = 0.5;
// Fraction of lines that must be wide for a block to be flowing
const FLOWING_WIDE_LINE_THRESHOLD = 0.6;
// Multiplier on median char width for column gap detection
const FLOWING_COLUMN_GAP_MULTIPLIER = 4;
// Minimum items on a line to be classified as flowing in per-line detection
const FLOWING_MIN_LINE_ITEMS = 3;
// Height multiplier for word-break space threshold in flowing text
const FLOWING_SPACE_HEIGHT_RATIO = 0.15;
// Minimum absolute space threshold in flowing text
const FLOWING_SPACE_MIN_THRESHOLD = 0.3;
// Maximum indent (in character widths) for flowing text
const FLOWING_MAX_INDENT = 8;
32
/**
 * Snap an x-coordinate to the nearest quarter unit so that almost-identical
 * positions end up sharing the same anchor key.
 *
 * @param {number} anchor - Raw x-coordinate.
 * @returns {number} The coordinate rounded to the nearest multiple of 0.25.
 */
function roundAnchor(anchor) {
    const stepsPerUnit = 4;
    const quarterSteps = Math.round(anchor * stepsPerUnit);
    return quarterSteps / stepsPerUnit;
}
36
/**
 * Measure the vertical extent of a line using only its "representative" items.
 *
 * Items shorter than half of `globalMedianHeight` are ignored unless that
 * would leave nothing to measure, in which case every item on the line is used.
 *
 * @param {Array<{y: number, h: number}>} line - Bounding boxes on the line.
 * @param {number} globalMedianHeight - Median item height used as the yardstick.
 * @returns {{top: number, bottom: number, height: number}} Smallest `y`,
 *   largest `y + h`, and their difference. An empty line yields
 *   `Infinity` / `-Infinity` / `-Infinity`.
 */
function getRepresentativeLineMetrics(line, globalMedianHeight) {
    const heightFloor = globalMedianHeight * 0.5;
    const tallEnough = [];
    for (const bbox of line) {
        if (bbox.h >= heightFloor) {
            tallEnough.push(bbox);
        }
    }
    const measured = tallEnough.length > 0 ? tallEnough : line;
    // Fold with Math.min/Math.max so NaN and empty-input behaviour match a
    // variadic Math.min(...)/Math.max(...) call.
    let top = Infinity;
    let bottom = -Infinity;
    for (const bbox of measured) {
        top = Math.min(top, bbox.y);
        bottom = Math.max(bottom, bbox.y + bbox.h);
    }
    return { top, bottom, height: bottom - top };
}
48
// 2pt @ PDF 72 DPI -> roughly 8px @ 300 DPI
const SMALL_FONT_SIZE_THRESHOLD = 2;
/**
 * Tell whether a line is dominated by very small text.
 *
 * @param {Array<{h: number}>} line - Items on the line.
 * @returns {boolean} `true` when strictly more than half of the items are
 *   shorter than `SMALL_FONT_SIZE_THRESHOLD`; `false` otherwise, including
 *   for an empty line.
 */
function isSmallTextLine(line) {
    let smallCount = 0;
    for (const item of line) {
        if (item.h < SMALL_FONT_SIZE_THRESHOLD) {
            smallCount += 1;
        }
    }
    // For an empty line this is 0 / 0 = NaN, and NaN > 0.5 is false.
    return smallCount / line.length > 0.5;
}
58
/**
 * Drop text items that would break projection (currently: very small text).
 *
 * @param {{preserveVerySmallText?: boolean}} config - Parser configuration.
 * @param {Array<{h: number}>} line - Items on the line.
 * @returns {Array} The same `line` array when nothing is filtered (empty line,
 *   `preserveVerySmallText` set, or the line is not mostly small text);
 *   otherwise a new array holding only the items whose height is at least
 *   `SMALL_FONT_SIZE_THRESHOLD`.
 */
function filterUnprojectableText(config, line) {
    const shouldDropSmallText =
        line.length > 0 && !config.preserveVerySmallText && isSmallTextLine(line);
    return shouldDropSmallText
        ? line.filter((item) => item.h >= SMALL_FONT_SIZE_THRESHOLD)
        : line;
}
70
/**
 * Decide whether a line may be snapped to column anchors.
 *
 * Lines that are likely to break projection are forced to stay unsnapped
 * floating text. At the moment that means lines flagged by
 * `isSmallTextLine` while `config.preserveVerySmallText` is off.
 *
 * NOTE: this assumes undesirable text has already been removed before
 * projection (parse-mode based removal belongs in filterUnprojectableText()).
 *
 * @param {{preserveVerySmallText?: boolean}} config - Parser configuration.
 * @param {Array<{h: number}>} line - Items on the line.
 * @returns {boolean} `false` only for a non-empty small-text line when small
 *   text is not preserved; `true` otherwise.
 */
function canSnapLine(config, line) {
    const mostlySmallText =
        line.length > 0 && !config.preserveVerySmallText && isSmallTextLine(line);
    return !mostlySmallText;
}
85
/**
 * Compress whitespace in blocks made of very sparse lines (>80% whitespace).
 *
 * Every non-empty line inside a block's `[start, end)` range is right-trimmed
 * in place. When the block then totals at least 500 characters and more than
 * 80% of them are whitespace, each run of `COLUMN_SPACES` or more whitespace
 * characters is collapsed to `FLOATING_SPACES` spaces.
 *
 * @param {Array<{start: number, end: number}>} blocks - Line-index ranges.
 * @param {string[]} rawLines - Rendered lines; modified in place.
 * @returns {void}
 */
function fixSparseBlocks(blocks, rawLines) {
    const wideGapPattern = new RegExp(`\\s{${COLUMN_SPACES},}`, "g");
    const compactGap = " ".repeat(FLOATING_SPACES);
    for (const { start, end } of blocks) {
        let charCount = 0;
        let whitespaceCount = 0;
        // Pass 1: trim trailing whitespace and measure how sparse the block is.
        for (let idx = start; idx < end; idx += 1) {
            if (!rawLines[idx]) {
                continue;
            }
            const trimmed = rawLines[idx].trimEnd();
            rawLines[idx] = trimmed;
            if (trimmed.length === 0) {
                continue;
            }
            charCount += trimmed.length;
            whitespaceCount += (trimmed.match(/\s/g) ?? []).length;
        }
        const isSparse = charCount >= 500 && whitespaceCount / charCount > 0.8;
        if (!isSparse) {
            continue;
        }
        // Pass 2: collapse the wide gaps of every non-empty line in the block.
        for (let idx = start; idx < end; idx += 1) {
            const current = rawLines[idx];
            if (!current || current.length === 0) {
                continue;
            }
            rawLines[idx] = current.replace(wideGapPattern, compactGap);
        }
    }
}
114
/**
 * Build left / right / center alignment anchors from the bounding boxes of a
 * page and decide, per bbox, which kind of anchor it snaps to.
 *
 * Steps, in order:
 *   1. Bucket every bbox by its rounded left edge, rounded right edge and
 *      integer-rounded center.
 *   2. Merge anchors that lie within 2 units of each other (`group`).
 *   3. Drop bboxes that have no vertically close neighbour on the same anchor
 *      (`deltaMin`), then drop anchors for which every consecutive pair is
 *      crossed by other text (`intercept`).
 *   4. Attach floating bboxes to a nearby left anchor used on an adjacent line.
 *   5. Resolve bboxes claimed by more than one anchor kind and delete anchors
 *      left with fewer than two bboxes.
 *
 * Side effects: sets `snap`, `leftAnchor`, `rightAnchor`, `centerAnchor` and
 * `isDup` on the bbox objects contained in `lines`.
 *
 * @param {Array<Array<object>>} lines - Lines of bboxes; each bbox is read for
 *   `x`, `y` and `w`.
 * @param {{height: number}} page - Only `page.height` is read.
 * @returns {{anchorLeft: object, anchorRight: object, anchorCenter: object}}
 *   Maps from anchor coordinate (as an object key) to the bboxes on it.
 */
function extractAnchorsPointsFromLines(lines, page) {
    const pageHeight = page.height;
    const anchorLeft = {};
    const anchorRight = {};
    const anchorCenter = {};
    // Bucket each bbox by left edge, right edge (both on a quarter-unit grid)
    // and center (on an integer grid).
    for (const line of lines) {
        for (const bbox of line) {
            let anchor = roundAnchor(bbox.x);
            if (!anchorLeft[anchor]) {
                anchorLeft[anchor] = [];
            }
            anchorLeft[anchor].push(bbox);
            anchor = roundAnchor(bbox.x + bbox.w);
            if (!anchorRight[anchor]) {
                anchorRight[anchor] = [];
            }
            anchorRight[anchor].push(bbox);
            const center = Math.round(bbox.x + bbox.w / 2);
            if (!anchorCenter[center]) {
                anchorCenter[center] = [];
            }
            anchorCenter[center].push(bbox);
        }
    }
    // Remove from each anchor the bboxes whose previous and next entries are
    // both at least `pageHeight * delta` further down.
    // NOTE(review): the y differences are signed, and the lists are only
    // sorted by y later (sortAnchor), so after group() has appended merged
    // entries an out-of-order neighbour always counts as "close" — confirm
    // this is intended.
    function deltaMin(collection, delta) {
        for (const anchor in collection) {
            const maxDelta = pageHeight * delta;
            for (let i = 0; i < collection[anchor].length; i++) {
                let shouldKeep = false;
                if (i > 0) {
                    if (collection[anchor][i].y - collection[anchor][i - 1].y < maxDelta) {
                        shouldKeep = true;
                    }
                }
                if (i < collection[anchor].length - 1) {
                    if (collection[anchor][i + 1].y - collection[anchor][i].y < maxDelta) {
                        shouldKeep = true;
                    }
                }
                if (!shouldKeep) {
                    // Step the index back so the element that slides into
                    // slot i is examined on the next iteration.
                    collection[anchor].splice(i--, 1);
                }
            }
        }
    }
    // Group nearby anchors FIRST to merge items at similar positions
    // This ensures deltaMin operates on combined groups, not isolated items
    // (`group` is a function declaration further down; it is hoisted.)
    group(anchorLeft);
    group(anchorRight);
    group(anchorCenter);
    deltaMin(anchorRight, 0.17);
    deltaMin(anchorLeft, 0.2);
    deltaMin(anchorCenter, 0.05);
    // Delete an anchor unless at least one pair of consecutive bboxes on it
    // has no line between them (judged by the y of the line's first item)
    // containing an item that spans the anchor's x position. Anchors holding
    // fewer than two bboxes never reach the check and are deleted as well.
    function intercept(collection) {
        for (const anchor in collection) {
            let shouldKeep = false;
            for (let i = 0; i < collection[anchor].length; i++) {
                if (i > 0) {
                    let intercept = false;
                    // check intercept
                    const a1 = collection[anchor][i - 1];
                    const a2 = collection[anchor][i];
                    for (const line of lines) {
                        if (line.length > 0 && line[0].y > a1.y && line[0].y < a2.y) {
                            for (const item of line) {
                                // NOTE(review): left/right anchor keys are multiples of 0.25,
                                // but parseInt() truncates them to whole units (and is called
                                // without a radix); parseFloat() may be what was meant.
                                if (item.x < parseInt(anchor) && item.x + item.w > parseInt(anchor)) {
                                    intercept = true;
                                    break;
                                }
                            }
                            if (intercept) {
                                break;
                            }
                        }
                    }
                    if (!intercept) {
                        shouldKeep = true;
                        break;
                    }
                }
            }
            if (!shouldKeep) {
                delete collection[anchor];
            }
        }
    }
    intercept(anchorRight);
    intercept(anchorLeft);
    intercept(anchorCenter);
    // Merge anchors that are within MERGE_TOLERANCE of each other; the anchor
    // holding fewer bboxes is folded into the one holding more (ties keep the
    // lower coordinate).
    function group(collection) {
        // Sort anchors to process them in order
        const sortedAnchors = Object.keys(collection)
            .map((a) => parseFloat(a))
            .sort((a, b) => a - b);
        // Merge nearby anchors within a tolerance
        // Use 2 units as tolerance - this catches columns that are close but not exactly aligned
        const MERGE_TOLERANCE = 2;
        for (let i = 0; i < sortedAnchors.length; i++) {
            const anchor = sortedAnchors[i];
            if (!(anchor in collection))
                continue; // Already merged
            // Look for nearby anchors to merge into this one or merge this into
            for (let j = i + 1; j < sortedAnchors.length; j++) {
                const nextAnchor = sortedAnchors[j];
                if (!(nextAnchor in collection))
                    continue;
                // Stop if we're beyond the tolerance
                if (nextAnchor - anchor > MERGE_TOLERANCE)
                    break;
                // Merge the smaller anchor into the larger one
                if (collection[nextAnchor].length > collection[anchor].length) {
                    collection[nextAnchor].push(...collection[anchor]);
                    delete collection[anchor];
                    break; // This anchor is gone, move to next
                }
                else {
                    collection[anchor].push(...collection[nextAnchor]);
                    delete collection[nextAnchor];
                }
            }
        }
    }
    // True when the key computed from the bbox's own geometry still exists in
    // any of the three maps. A bbox whose anchor was merged into a neighbouring
    // key therefore reports false here even though it is still listed there.
    function anyAnchorSurvived(bbox) {
        return (roundAnchor(bbox.x) in anchorLeft ||
            roundAnchor(bbox.x + bbox.w) in anchorRight ||
            Math.round(bbox.x + bbox.w / 2) in anchorCenter);
    }
    // Try seeing if a floating bbox would align well with a surviving anchor on a line immediately above or below it
    function tryAlignFloating(collection, ANCHOR_MARGIN, refXFromBbox, anchorValFromBbox) {
        for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
            const line = lines[lineIndex];
            for (const bbox of line) {
                // Only consider floating bboxes
                if (anyAnchorSurvived(bbox)) {
                    continue;
                }
                // Check the lines before and after
                const candidateLines = [];
                if (lineIndex > 0) {
                    candidateLines.push(lines[lineIndex - 1]);
                }
                if (lineIndex < lines.length - 1) {
                    candidateLines.push(lines[lineIndex + 1]);
                }
                // Check candidate lines for:
                //  Possible alignment
                //  Being within the margin
                //  Being the closest of the candidates
                let candidateAnchor = "";
                // Start just above the margin so any in-margin candidate wins.
                let prevDiff = ANCHOR_MARGIN + 1;
                for (const candLine of candidateLines) {
                    for (const candBBox of candLine) {
                        const candAnchorVal = anchorValFromBbox(candBBox);
                        if (!(candAnchorVal in collection)) {
                            continue;
                        }
                        const xDiff = Math.abs(candAnchorVal - refXFromBbox(bbox));
                        if (xDiff <= ANCHOR_MARGIN && xDiff < prevDiff) {
                            candidateAnchor = candAnchorVal.toString();
                            prevDiff = xDiff;
                        }
                    }
                }
                // No candidate found
                if (candidateAnchor.length == 0) {
                    continue;
                }
                // Candidate found - update the anchor's bbox list
                // But first check if the bbox is already in this anchor (could happen after merging)
                const targetAnchor = collection[parseFloat(candidateAnchor)];
                if (!targetAnchor.includes(bbox)) {
                    targetAnchor.push(bbox);
                }
            }
        }
    }
    // Try to left-align floating bboxes
    tryAlignFloating(anchorLeft, 2, (bbox) => bbox.x, (bbox) => roundAnchor(bbox.x));
    // Sort the anchors' lists of bboxes by y-value
    function sortAnchor(collection) {
        for (const anchor in collection) {
            collection[anchor].sort((a, b) => a.y - b.y);
        }
    }
    sortAnchor(anchorLeft);
    sortAnchor(anchorRight);
    sortAnchor(anchorCenter);
    // deduplicate
    // First tag every bbox with each anchor kind that lists it; a bbox that
    // already carries a `snap` when a later kind reaches it is a duplicate.
    const duplicates = [];
    for (const anchor in anchorLeft) {
        for (const item of anchorLeft[anchor]) {
            item.snap = "left";
            item.leftAnchor = anchor;
        }
    }
    for (const anchor in anchorRight) {
        for (const item of anchorRight[anchor]) {
            if (item.snap) {
                item.isDup = true;
                duplicates.push(item);
            }
            item.snap = "right";
            item.rightAnchor = anchor;
        }
    }
    for (const anchor in anchorCenter) {
        for (const item of anchorCenter[anchor]) {
            // `!item.isDup` keeps an item from being queued twice.
            if (item.snap && !item.isDup) {
                item.isDup = true;
                duplicates.push(item);
            }
            item.snap = "center";
            item.centerAnchor = anchor;
        }
    }
    // Current number of bboxes on the left / right / center anchor recorded
    // on `item` (0 when the item has no such anchor or the anchor is gone).
    function anchorCounts(item) {
        let leftCount = 0;
        if (item.leftAnchor) {
            const key = parseFloat(item.leftAnchor);
            leftCount = anchorLeft[key] ? anchorLeft[key].length : 0;
        }
        let rightCount = 0;
        if (item.rightAnchor) {
            const key = parseFloat(item.rightAnchor);
            rightCount = anchorRight[key] ? anchorRight[key].length : 0;
        }
        let centerCount = 0;
        if (item.centerAnchor) {
            const key = parseFloat(item.centerAnchor);
            centerCount = anchorCenter[key] ? anchorCenter[key].length : 0;
        }
        return [leftCount, rightCount, centerCount];
    }
    // find all left aligned blocks, all right aligned blocks, all centered blocks, in that order
    // we cannot check all 3 at once since we may end up double counting potential anchor matches
    // (i.e. we need to exclude block that we know are left/right aligned before counting possible
    // matching centered blocks)
    // find all lefts
    // Each pass can shrink other anchors' lists and so change later counts;
    // repeat until a pass removes nothing from any list.
    let hasChanged = true;
    while (hasChanged && duplicates.length > 0) {
        hasChanged = false;
        // Walk backwards so splicing resolved items does not skip entries.
        for (let i = duplicates.length - 1; i >= 0; --i) {
            const item = duplicates[i];
            const [leftCount, rightCount, centerCount] = anchorCounts(item);
            if (leftCount >= rightCount && leftCount >= centerCount) {
                item.snap = "left";
                if (item.rightAnchor) {
                    const key = parseFloat(item.rightAnchor);
                    if (anchorRight[key]) {
                        const idx = anchorRight[key].indexOf(item);
                        if (idx >= 0) {
                            anchorRight[key].splice(idx, 1);
                            hasChanged = true;
                        }
                    }
                }
                if (item.centerAnchor) {
                    const key = parseFloat(item.centerAnchor);
                    if (anchorCenter[key]) {
                        const idx = anchorCenter[key].indexOf(item);
                        if (idx >= 0) {
                            anchorCenter[key].splice(idx, 1);
                            hasChanged = true;
                        }
                    }
                }
                duplicates.splice(i, 1);
            }
        }
    }
    // find all rights
    hasChanged = true;
    while (hasChanged && duplicates.length > 0) {
        hasChanged = false;
        for (let i = duplicates.length - 1; i >= 0; --i) {
            const item = duplicates[i];
            const [leftCount, rightCount, centerCount] = anchorCounts(item);
            if (rightCount >= leftCount && rightCount >= centerCount) {
                item.snap = "right";
                if (item.leftAnchor) {
                    const key = parseFloat(item.leftAnchor);
                    if (anchorLeft[key]) {
                        const idx = anchorLeft[key].indexOf(item);
                        if (idx >= 0) {
                            anchorLeft[key].splice(idx, 1);
                            hasChanged = true;
                        }
                    }
                }
                if (item.centerAnchor) {
                    const key = parseFloat(item.centerAnchor);
                    if (anchorCenter[key]) {
                        const idx = anchorCenter[key].indexOf(item);
                        if (idx >= 0) {
                            anchorCenter[key].splice(idx, 1);
                            hasChanged = true;
                        }
                    }
                }
                duplicates.splice(i, 1);
            }
        }
    }
    // remaining duplicates are centered
    for (const item of duplicates) {
        item.snap = "center";
        if (item.leftAnchor) {
            const key = parseFloat(item.leftAnchor);
            if (anchorLeft[key]) {
                const idx = anchorLeft[key].indexOf(item);
                if (idx >= 0) {
                    anchorLeft[key].splice(idx, 1);
                }
            }
        }
        if (item.rightAnchor) {
            const key = parseFloat(item.rightAnchor);
            if (anchorRight[key]) {
                const idx = anchorRight[key].indexOf(item);
                if (idx >= 0) {
                    anchorRight[key].splice(idx, 1);
                }
            }
        }
    }
    // filter anchors
    // delete singleton
    // An anchor needs at least two bboxes; a lone bbox loses its `snap` and
    // the anchor entry is removed.
    for (const anchor in anchorLeft) {
        if (anchorLeft[anchor].length < 2) {
            if (anchorLeft[anchor].length) {
                delete anchorLeft[anchor][0].snap;
            }
            delete anchorLeft[anchor];
        }
    }
    for (const anchor in anchorRight) {
        if (anchorRight[anchor].length < 2) {
            if (anchorRight[anchor].length) {
                delete anchorRight[anchor][0].snap;
            }
            delete anchorRight[anchor];
        }
    }
    for (const anchor in anchorCenter) {
        if (anchorCenter[anchor].length < 2) {
            if (anchorCenter[anchor].length) {
                delete anchorCenter[anchor][0].snap;
            }
            delete anchorCenter[anchor];
        }
    }
    return {
        anchorLeft,
        anchorRight,
        anchorCenter,
    };
}
471
/**
 * Rewrite the coordinates of rotated text so it can be laid out as ordinary
 * horizontal text, keeping each rotation group together in reading order.
 *
 * Bboxes are grouped by their `r` value. For 90/270-degree groups:
 *   - if a bbox with a different rotation overlaps the group in both X and Y,
 *     the group is only flagged as horizontal (`r = 0`, `rotated = true`);
 *   - otherwise x/y and w/h are swapped, the group is placed below the
 *     previous group, and every later group is pushed further down.
 * 180-degree groups are handled last, after `textBbox` has been sorted by y.
 *
 * Everything is done in place: bbox fields `x`, `y`, `w`, `h`, `r`, `d` and
 * `rotated` are modified and `textBbox` itself is re-sorted.
 *
 * @param {Array<object>} textBbox - Bboxes; read for `x`, `y`, `w`, `h`, `r`,
 *   and optionally `d`, `rx`, `ry`.
 * @param {number} pageHeight - Used as a spacing offset between groups.
 * @returns {void}
 */
function handleRotationReadingOrder(textBbox, pageHeight) {
    // if no bbox is rotated (.r is set), return
    // NOTE(review): the comparison is a loose `!= 0`, so a bbox whose `r` is
    // undefined also counts as rotated here and prevents the early return.
    if (!textBbox.find((b) => b.r != 0)) {
        return;
    }
    // Group ALL items by rotation value (not by consecutive items)
    // This ensures rotated text blocks stay together even when their X coordinates
    // overlap with non-rotated content (e.g., rotated table + footer at same X positions)
    const groupsByRotation = {};
    for (const bbox of textBbox) {
        // A missing rotation is filed under 0.
        const r = bbox.r || 0;
        if (!groupsByRotation[r]) {
            groupsByRotation[r] = [];
        }
        groupsByRotation[r].push(bbox);
    }
    // Build bboxGroup array from rotation groups, sorted by X position of group
    const bboxGroup = [];
    for (const rotation in groupsByRotation) {
        const group = groupsByRotation[rotation];
        // Sort each group by Y for proper reading order
        group.sort((a, b) => a.y - b.y);
        bboxGroup.push(group);
    }
    // Sort groups by their minimum X position to maintain left-to-right order
    bboxGroup.sort((a, b) => {
        const minXA = Math.min(...a.map((item) => item.x));
        const minXB = Math.min(...b.map((item) => item.x));
        return minXA - minXB;
    });
    // NOTE/ WARNING: height and width of bbox are NOT rotated beforehand!
    for (const [index, group] of bboxGroup.entries()) {
        if (group[0].r == 90 || group[0].r == 270) {
            // Check if there are non-rotated items that actually overlap visually (both X and Y)
            // with the rotated group. X-only overlap is not sufficient because items could
            // be in completely different parts of the page (e.g., rotated table + footer).
            let globalOverlap = false;
            for (const bbox of textBbox) {
                if (bbox.r != group[0].r) {
                    const overlap = group.find((b) => 
                    // Check X overlap
                    b.x >= bbox.x &&
                        b.x <= bbox.x + bbox.w &&
                        // Also check Y overlap - items must actually be near each other vertically
                        b.y < bbox.y + bbox.h &&
                        b.y + b.h > bbox.y &&
                        bbox.r != b.r);
                    if (overlap) {
                        globalOverlap = true;
                    }
                }
            }
            if (globalOverlap) {
                // rotate bbox to be horizontal
                for (const bbox of group) {
                    // Apply any vertical shift deferred by an earlier group
                    // (accumulated in `d` below), then clear it.
                    if (bbox.d) {
                        bbox.y += bbox.d;
                        bbox.d = 0;
                    }
                    bbox.r = 0;
                    bbox.rotated = true;
                }
            }
            else {
                // insert the bbox group in the Y axis after previous group and before next group.
                // move Next group by current group height (width as not rotated yet).
                // Computed before the group's coordinates are rewritten below.
                const groupMaxX = Math.max(...group.map((v) => v.x + v.w));
                let deltaY = 0;
                if (index != 0) {
                    const previousGroup = bboxGroup[index - 1];
                    const previousGroupMaxY = Math.max(...previousGroup.map((v) => v.y + v.h));
                    // Use pageHeight offset to guarantee no alignment issues with other groups
                    deltaY = previousGroupMaxY + pageHeight;
                }
                // clockwise rotation (90 degrees)
                // - Text reads top-to-bottom in page space
                // - Y position in page space -> X position after de-rotation
                // - X position in page space -> Y position after de-rotation (row)
                if (group[0].r == 90) {
                    for (const bbox of group) {
                        const newX = Math.round(bbox.y);
                        const newY = bbox.x + deltaY;
                        // Swap width and height since text orientation changes
                        const newW = bbox.h;
                        const newH = bbox.w;
                        bbox.x = newX;
                        bbox.y = newY;
                        bbox.w = newW;
                        bbox.h = newH;
                        bbox.r = 0;
                        bbox.rotated = true;
                    }
                }
                // counter-clockwise rotation (text reads bottom-to-top)
                // For 270-degree rotation, text at higher Y positions should be
                // at lower X positions after de-rotation (left-to-right reading order)
                // (If the branch above ran, group[0].r is now 0, so this one is skipped.)
                if (group[0].r == 270) {
                    // For 270-degree counter-clockwise rotation:
                    // - Text reads bottom-to-top in page space
                    // - Y position in page space -> X position after de-rotation (inverted)
                    // - X position in page space -> Y position after de-rotation (row)
                    // - w and h need to be swapped since they represent visual dimensions
                    // For 270-degree rotation: h is the extent along reading direction (string width)
                    const maxY = Math.max(...group.map((b) => b.y + b.h));
                    for (const bbox of group) {
                        // Transform coordinates:
                        // - new_x = distance from right edge of rotated block (inverted Y)
                        //   Use h (string width in original coords) for the extent
                        // - new_y = row position (from original X)
                        const newX = Math.round(maxY - bbox.y - bbox.h);
                        // Use exact X for Y (will be grouped by bboxToLine's Y_SORT_TOLERANCE)
                        const newY = bbox.x + deltaY;
                        // Swap width and height since text orientation changes
                        const newW = bbox.h;
                        const newH = bbox.w;
                        bbox.x = newX;
                        bbox.y = newY;
                        bbox.w = newW;
                        bbox.h = newH;
                        bbox.r = 0;
                        bbox.rotated = true;
                    }
                }
                // Use pageHeight offset to guarantee no alignment issues
                const globalDelta = deltaY + groupMaxX + pageHeight;
                // Push every later group down by the space this group now occupies.
                for (const [otherGroupIndex, other] of bboxGroup.entries()) {
                    if (otherGroupIndex <= index) {
                        continue;
                    }
                    for (const bbox of other) {
                        // Still-rotated bboxes only accumulate the shift in `d`
                        // instead of having `y` changed right away.
                        if (bbox.r == 90 || bbox.r == 270) {
                            bbox.d = (bbox.d ? bbox.d : 0) + globalDelta;
                            continue;
                        }
                        bbox.y += globalDelta;
                    }
                }
            }
        }
    }
    textBbox.sort((a, b) => {
        return a.y - b.y;
    });
    // Handle 180-degree rotated text (upside down)
    // Since we already grouped by rotation, we can iterate the existing groups
    for (const group of bboxGroup) {
        if (group[0].r == 180) {
            // Sort by X for proper reading order
            group.sort((a, b) => a.x - b.x);
            // Switch upside down
            for (const bbox of group) {
                bbox.x = Math.round(bbox.ry ?? bbox.y);
                // NOTE(review): when `rx` is missing this falls back to `bbox.x`,
                // which the previous line has already overwritten — confirm the
                // original x was not the intended fallback.
                bbox.y = bbox.rx ?? bbox.x;
                bbox.r = 0;
                bbox.rotated = true;
            }
        }
    }
}
630
- export function bboxToLine(textBbox, medianWidth, medianHeight, pageWidth) {
631
- // Y-tolerance for sorting: items within this threshold are considered same line
632
- // This handles:
633
- // 1. Floating point precision issues between columns (e.g., 334.7400 vs 334.7399)
634
- // 2. Subscripts/superscripts which are typically offset by 3-5 units from their base characters
635
- // Using a fraction of medianHeight to scale with document font size.
636
- const Y_SORT_TOLERANCE = Math.max(medianHeight * 0.5, 5.0);
637
- // Note: We keep whitespace items as they may be needed for proper word separation.
638
- // The spacing calculation handles gaps between items.
639
- // For two-column documents, detect and mark margin line numbers
640
- // These are short numeric items positioned between columns (near the page midpoint)
641
- // They should not be merged with column content
642
- if (pageWidth) {
643
- const midpoint = pageWidth * 0.5;
644
- const marginZoneLeft = midpoint - 5;
645
- const marginZoneRight = midpoint + 20;
646
- for (const bbox of textBbox) {
647
- const bboxCenter = bbox.x + bbox.w / 2;
648
- // Check if item is in the margin zone and looks like a line number
649
- if (bboxCenter > marginZoneLeft &&
650
- bboxCenter < marginZoneRight &&
651
- bbox.str.trim().match(/^\d{1,2}[O]?$/) && // 1-2 digits, possibly with O (OCR error for 0)
652
- bbox.w < 15 // Line numbers are narrow
653
- ) {
654
- // Mark as margin item - will be placed on its own line
655
- bbox.isMarginLineNumber = true;
656
- }
657
- }
658
- }
659
- // sort lines on first y axis then x axis (top - left)
660
- // Use Y tolerance so items on same visual line sort by x regardless of tiny y differences
661
- textBbox.sort((a, b) => {
662
- if (Math.abs(a.y - b.y) < Y_SORT_TOLERANCE) {
663
- return a.x - b.x;
664
- }
665
- return a.y - b.y;
666
- });
667
- function canMergeMarkup(previousBbox, bbox) {
668
- if (!previousBbox.markup && !bbox.markup) {
669
- return true;
670
- }
671
- if (previousBbox.markup &&
672
- bbox.markup &&
673
- previousBbox.markup.highlight === bbox.markup.highlight &&
674
- previousBbox.markup.underline === bbox.markup.underline &&
675
- previousBbox.markup.squiggly === bbox.markup.squiggly &&
676
- previousBbox.markup.strikeout === bbox.markup.strikeout) {
677
- return true;
678
- }
679
- return false;
680
- }
681
- function canMerge(previousBbox, bbox) {
682
- if (bbox.y == previousBbox.y && bbox.h == previousBbox.h) {
683
- // Use raw pageBbox width for sub-pixel accurate gap calculation.
684
- // The rounded `.w` field can cause a true −0.02px overlap to appear as +0.12px,
685
- // and a −0.86px overlap to appear as −0.72px — both outside the old tolerance.
686
- // PDFs sometimes encode a single value (e.g. "119:12") as a sequence of adjacent
687
- // text runs whose bounding boxes touch or slightly overlap (up to ~1px) due to
688
- // character spacing / kerning. We must merge these rather than treat them as
689
- // separate tokens.
690
- const prevRawWidth = previousBbox.pageBbox?.w ?? previousBbox.w;
691
- const xDelta = bbox.x - previousBbox.x - prevRawWidth;
692
- if (((xDelta < 0 && xDelta > -1.0) || (xDelta >= 0 && xDelta < 0.1)) &&
693
- canMergeMarkup(previousBbox, bbox)) {
694
- return true;
695
- }
696
- }
697
- return false;
698
- }
699
- function mergePageBbox(a, b) {
700
- const aBbox = a.pageBbox || { x: a.x, y: a.y, w: a.w, h: a.h };
701
- const bBbox = b.pageBbox || { x: b.x, y: b.y, w: b.w, h: b.h };
702
- const left = Math.min(aBbox.x, bBbox.x);
703
- const top = Math.min(aBbox.y, bBbox.y);
704
- const right = Math.max(aBbox.x + aBbox.w, bBbox.x + bBbox.w);
705
- const bottom = Math.max(aBbox.y + aBbox.h, bBbox.y + bBbox.h);
706
- return { x: left, y: top, w: right - left, h: bottom - top };
707
- }
708
- // merge Continuous bbox
709
- for (let i = 1; i < textBbox.length; i++) {
710
- const bbox = textBbox[i];
711
- const previousBbox = textBbox[i - 1];
712
- if (canMerge(previousBbox, bbox)) {
713
- previousBbox.w = bbox.x + bbox.w - previousBbox.x;
714
- previousBbox.str += bbox.str;
715
- previousBbox.strLength += bbox.strLength;
716
- previousBbox.pageBbox = mergePageBbox(previousBbox, bbox);
717
- textBbox.splice(i, 1);
718
- i--;
719
- }
720
- }
721
- // try to find the bounding box that align as line and group them by line
722
- const lines = [];
723
- let currentLine = [];
724
- let previousBbox = null;
725
- for (const bbox of textBbox) {
726
- if (!previousBbox) {
727
- currentLine.push(bbox);
728
- }
729
- // This is where we define how line are build. to be improved
730
- else {
731
- const lineMinY = Math.min(...currentLine.map((v) => v.y));
732
- const lineMaxY = Math.max(...currentLine.map((v) => v.y + v.h));
733
- let lineCollide = false;
734
- for (const currentLineItemBbox of currentLine) {
735
- const overlapLenght = Math.min(currentLineItemBbox.x + currentLineItemBbox.w, bbox.x + bbox.w) -
736
- Math.max(currentLineItemBbox.x, bbox.x);
737
- // Use a minimum threshold to tolerate small overlaps common in PDFs due to:
738
- // - character spacing/kerning
739
- // - floating-point precision issues
740
- // - adjacent items with slightly overlapping bounding boxes
741
- // We want to detect true collisions (same text rendered twice) not adjacent text.
742
- // Some PDFs (e.g. scanned documents with text overlays) split lines into narrow
743
- // vertical strips whose bounding boxes overlap by a few pixels. These are NOT
744
- // duplicates — they contain different text fragments. True duplicates overlap by
745
- // close to 100% of the item width. Use a proportional check: only treat as
746
- // collision if overlap exceeds 50% of the smaller item's width.
747
- const minItemWidth = Math.min(currentLineItemBbox.w, bbox.w);
748
- if (overlapLenght > Math.max(medianWidth / 3, 5) && overlapLenght > minItemWidth * 0.5) {
749
- lineCollide = true;
750
- break;
751
- }
752
- }
753
- // Don't merge margin line numbers with regular content
754
- const currentLineHasMargin = currentLine.some((b) => b.isMarginLineNumber === true);
755
- const bboxIsMargin = bbox.isMarginLineNumber === true;
756
- const marginMismatch = currentLineHasMargin !== bboxIsMargin;
757
- // For rotated text, use Y-tolerance based merging since heights may be inconsistent
758
- const yTolerance = bbox.rotated ? Math.max(medianHeight * 2, 20) : 0;
759
- const yWithinTolerance = bbox.rotated && Math.abs(bbox.y - lineMinY) < yTolerance;
760
- if (!lineCollide &&
761
- !marginMismatch &&
762
- (yWithinTolerance ||
763
- (bbox.y + bbox.h * 0.5 >= lineMinY && bbox.y + bbox.h * 0.5 <= lineMaxY) ||
764
- (bbox.y >= lineMinY && bbox.y <= lineMaxY))) {
765
- currentLine.push(bbox);
766
- }
767
- else {
768
- if (currentLine.length) {
769
- lines.push(currentLine);
770
- }
771
- currentLine = [bbox];
772
- }
773
- }
774
- previousBbox = bbox;
775
- }
776
- if (currentLine.length) {
777
- lines.push(currentLine);
778
- }
779
- // sort each line by x
780
- for (const line of lines) {
781
- line.sort((a, b) => a.x - b.x);
782
- }
783
- // sort lines by y
784
- lines.sort((a, b) => a[0].y - b[0].y);
785
- // merge 'words'
786
- const mergeThreshold = 1;
787
- // Pattern to detect standalone numeric values (financial table numbers)
788
- // Matches: numbers with optional commas, decimal points, dollar signs, percentages, negatives
789
- const numericPattern = /^[$]?-?[\d,]+\.?\d*%?$/;
790
- function looksLikeTableNumber(str) {
791
- const trimmed = str.trim();
792
- // Must be at least 2 chars to be a table number (avoid merging single digits)
793
- return trimmed.length >= 2 && numericPattern.test(trimmed);
794
- }
795
- for (const line of lines) {
796
- for (let i = 1; i < line.length; ++i) {
797
- // merge box in word if:
798
- // - same height
799
- // - less than 2 in space
800
- // if (line[i].h == line[i-1].h) {
801
- const currentLine = line[i];
802
- const previousLine = line[i - 1];
803
- if (canMergeMarkup(previousLine, currentLine)) {
804
- // Don't merge adjacent numbers in tables - they're separate columns
805
- const bothAreNumbers = looksLikeTableNumber(previousLine.str) && looksLikeTableNumber(currentLine.str);
806
- // Check gap with BOTH rounded width (used elsewhere) and raw width from pageBbox.
807
- // Only merge without space if both agree the gap is small enough. This prevents
808
- // rounding artifacts from causing word fusions (e.g., "of" + "our" → "ofour"
809
- // when Math.round(w) reduces the gap from 1.34 to 1.0)
810
- const roundedGap = currentLine.x - previousLine.x - previousLine.w;
811
- const rawGap = currentLine.x - previousLine.x - (previousLine.pageBbox?.w ?? previousLine.w);
812
- if (!bothAreNumbers && roundedGap <= mergeThreshold && rawGap <= mergeThreshold) {
813
- // if same word but less than .7 of prev line
814
- if (currentLine.h != 0 && currentLine.h < previousLine.h * 0.7) {
815
- // and not starting with space
816
- if (currentLine.str[0] == " ") {
817
- break;
818
- }
819
- if (currentLine.y > previousLine.y + previousLine.h * 0.2) {
820
- currentLine.str = strToSubscriptString(currentLine.str);
821
- }
822
- else {
823
- currentLine.str = strToPostScript(currentLine.str);
824
- }
825
- }
826
- // When items overlap (negative gap), the overlap zone often contains
827
- // duplicated characters (common in strip-fragmented PDFs from scans).
828
- // Detect and strip the overlapping characters to avoid doubled text.
829
- // Try multiple overlap estimates (floor/ceil with both char widths) since
830
- // average character width is approximate and rounding can be off by one.
831
- let textToAppend = currentLine.str;
832
- let lengthToAdd = currentLine.strLength;
833
- if (roundedGap < 0) {
834
- const prevCharWidth = previousLine.strLength > 0 ? previousLine.w / previousLine.strLength : 1;
835
- const currCharWidth = currentLine.strLength > 0 ? currentLine.w / currentLine.strLength : 1;
836
- const rawOverlap = -roundedGap;
837
- // Generate candidate overlap counts from both char widths, floored and ceiled
838
- const candidates = new Set([
839
- Math.floor(rawOverlap / prevCharWidth),
840
- Math.ceil(rawOverlap / prevCharWidth),
841
- Math.floor(rawOverlap / currCharWidth),
842
- Math.ceil(rawOverlap / currCharWidth),
843
- ]);
844
- for (const overlapChars of candidates) {
845
- if (overlapChars > 0 &&
846
- overlapChars < currentLine.strLength &&
847
- overlapChars <= previousLine.strLength) {
848
- const prevEnd = previousLine.str.slice(-overlapChars).toLowerCase();
849
- const currStart = currentLine.str.slice(0, overlapChars).toLowerCase();
850
- if (prevEnd === currStart) {
851
- textToAppend = currentLine.str.slice(overlapChars);
852
- lengthToAdd = currentLine.strLength - overlapChars;
853
- break;
854
- }
855
- }
856
- }
857
- }
858
- previousLine.w = currentLine.x + currentLine.w - previousLine.x;
859
- previousLine.str += textToAppend;
860
- previousLine.strLength += lengthToAdd;
861
- previousLine.pageBbox = mergePageBbox(previousLine, currentLine);
862
- line.splice(i, 1);
863
- i--;
864
- }
865
- else if (!bothAreNumbers &&
866
- currentLine.x - previousLine.x - previousLine.w < previousLine.w / previousLine.strLength) {
867
- // merge if space between this word and previous is less than average
868
- // character width (using previous word font size)
869
- // But don't merge adjacent numbers - they're likely table columns
870
- // Now extend the width
871
- previousLine.w = currentLine.x + currentLine.w - previousLine.x;
872
- // Add space between merged items unless the previous already ends with space
873
- if (!previousLine.str.endsWith(" ")) {
874
- previousLine.str += " ";
875
- previousLine.strLength += 1;
876
- }
877
- previousLine.str += currentLine.str;
878
- previousLine.strLength += currentLine.strLength;
879
- previousLine.pageBbox = mergePageBbox(previousLine, currentLine);
880
- line.splice(i, 1);
881
- i--;
882
- }
883
- }
884
- // }
885
- }
886
- }
887
- // check if we can merge the lines together
888
- for (let i = 1; i < lines.length - 1; i++) {
889
- const currentLine = lines[i];
890
- const previousLine = lines[i - 1];
891
- const previousLineMinY = Math.min(...previousLine.map((v) => v.y));
892
- const previousLineMaxY = Math.max(...previousLine.map((v) => v.y + v.h));
893
- const currentLineMinY = Math.min(...currentLine.map((v) => v.y));
894
- const currentLineMaxY = Math.max(...currentLine.map((v) => v.y + v.h));
895
- // does the 2 line overlap?
896
- if (previousLineMaxY > currentLineMinY && previousLineMinY < currentLineMaxY) {
897
- // check the bboxes of current line and prevline do not overlap
898
- let bboxOverlap = false;
899
- for (const bbox of currentLine) {
900
- for (const prevBbox of previousLine) {
901
- if (bbox.x >= prevBbox.x && bbox.x <= prevBbox.x + prevBbox.w) {
902
- bboxOverlap = true;
903
- break;
904
- }
905
- if (prevBbox.x >= bbox.x && prevBbox.x <= bbox.x + bbox.w) {
906
- bboxOverlap = true;
907
- break;
908
- }
909
- }
910
- }
911
- // merge if no overlap
912
- if (!bboxOverlap) {
913
- previousLine.push(...currentLine);
914
- previousLine.sort((a, b) => a.x - b.x);
915
- lines.splice(i--, 1);
916
- }
917
- }
918
- }
919
- for (let i = 1; i < lines.length; i++) {
920
- const previousLineMetrics = getRepresentativeLineMetrics(lines[i - 1], medianHeight);
921
- const currentLineMetrics = getRepresentativeLineMetrics(lines[i], medianHeight);
922
- const yDelta = currentLineMetrics.top - previousLineMetrics.bottom;
923
- const referenceHeight = Math.max(medianHeight, Math.min(previousLineMetrics.height, currentLineMetrics.height));
924
- // Calculate the number of blank lines to insert based on vertical spacing
925
- // Use medianHeight as a reference for one line spacing
926
- if (yDelta > referenceHeight) {
927
- // Calculate how many blank lines should be inserted
928
- // Round to nearest integer to get approximate number of lines
929
- const numBlankLines = Math.round(yDelta / referenceHeight) - 1;
930
- // Cap at a reasonable maximum (e.g., 10 blank lines) to avoid extreme cases
931
- const linesToInsert = Math.min(Math.max(numBlankLines, 1), 10);
932
- // Insert the calculated number of blank lines
933
- const blankLines = Array(linesToInsert).fill([]);
934
- lines.splice(i, 0, ...blankLines);
935
- i += linesToInsert;
936
- }
937
- }
938
- return lines;
939
- }
940
/**
 * Tell whether `bbox` may be rendered now, i.e. whether every item that
 * precedes it in `line` already has a truthy `rendered` flag.
 *
 * @param {Array<object>} line - Items of one line, scanned from index 0.
 * @param {object} bbox - The item being considered for rendering.
 * @returns {boolean} `true` when `bbox` is reached before any unrendered item;
 *   `false` when an unrendered item comes first or `bbox` is not in `line`.
 */
function canRenderBbox(line, bbox) {
    let position = 0;
    while (position < line.length) {
        const candidate = line[position];
        // Reaching the target means everything before it was already rendered.
        if (candidate == bbox) {
            return true;
        }
        // An unrendered predecessor blocks the target.
        if (!candidate.rendered) {
            return false;
        }
        position += 1;
    }
    return false;
}
951
/**
 * Raise forward-anchor targets for every snap position at or to the right of
 * `rightBound`.
 *
 * `snapMap` is walked from its last entry backwards. For each position that is
 * not left of `rightBound`, the entry in `forwardAnchor` is set to
 * `anchorTarget` when it is unset/falsy or smaller. Positions immediately
 * before it in `snapMap` that lie within a small tolerance are raised the same
 * way, to absorb slight position variations between rows. The walk stops at
 * the first position left of `rightBound` (this presumes `snapMap` is sorted
 * ascending).
 *
 * @param {number[]} snapMap - Snap positions.
 * @param {Object<number, number>} forwardAnchor - Position -> minimum target length; mutated.
 * @param {number} rightBound - Right edge of the item that triggered the update.
 * @param {number} anchorTarget - Minimum target to enforce.
 * @param {object} logger - Must provide `logForwardAnchorMutation`.
 * @param {string} triggerText - Forwarded to the logger.
 * @param {number} triggerLineIndex - Forwarded to the logger.
 * @param {string} snapType - Forwarded to the logger.
 * @returns {void}
 */
function updateForwardAnchorRightBound(snapMap, forwardAnchor, rightBound, anchorTarget, logger, triggerText, triggerLineIndex, snapType) {
    const POSITION_TOLERANCE = 2;
    // Bump one position up to anchorTarget (never down) and log the mutation.
    const raiseTo = (position) => {
        const before = forwardAnchor[position];
        if (before && !(anchorTarget > before)) {
            return;
        }
        forwardAnchor[position] = anchorTarget;
        logger.logForwardAnchorMutation(triggerText, triggerLineIndex, snapType, position, before, anchorTarget, rightBound);
    };
    let cursor = snapMap.length;
    while (cursor-- > 0) {
        const anchor = snapMap[cursor];
        if (!(rightBound <= anchor)) {
            // Everything further left is before the bound; nothing more to do.
            return;
        }
        raiseTo(anchor);
        // Also raise neighbouring positions within tolerance of this anchor.
        for (let near = cursor - 1; near >= 0; --near) {
            const neighbour = snapMap[near];
            if (anchor - neighbour > POSITION_TOLERANCE) {
                break;
            }
            raiseTo(neighbour);
        }
    }
}
980
/**
 * After an item has been written to a line, push forward anchors to the right
 * of it out to at least the current line length.
 *
 * The target is `lineLength`, plus the next item's `shouldSpace` when that is
 * positive. Left, right and floating anchors are updated; center anchors are
 * deliberately left alone because centered text may span between snapped
 * columns.
 *
 * @param {object} bbox - Item just rendered; reads `x`, `w` and `str`.
 * @param {object|null} nextBbox - Following item on the same line, if any.
 * @param {object} snapMaps - Snap positions keyed by `left`/`right`/`floating`.
 * @param {object} forwardAnchors - Anchor targets keyed the same way; mutated.
 * @param {number} lineLength - Current length of the rendered line.
 * @param {object} logger - Forwarded to `updateForwardAnchorRightBound`.
 * @param {number} lineIndex - Index of the line being rendered.
 * @returns {void}
 */
function updateForwardAnchors(bbox, nextBbox, snapMaps, forwardAnchors, lineLength, logger, lineIndex) {
    const rightBound = bbox.x + bbox.w;
    const upcomingSpaces = nextBbox ? (nextBbox.shouldSpace ?? 0) : 0;
    const targetLength = upcomingSpaces > 0 ? lineLength + upcomingSpaces : lineLength;
    const triggerText = bbox.str;
    // "center" is intentionally absent from this list.
    for (const snapType of ["left", "right", "floating"]) {
        updateForwardAnchorRightBound(snapMaps[snapType], forwardAnchors[snapType], rightBound, targetLength, logger, triggerText, lineIndex, snapType);
    }
}
992
/**
 * Find the widest horizontal gap between consecutive items of a line.
 *
 * A gap is the next item's `x` minus the previous item's right edge
 * (`x + w`). Overlapping items produce negative gaps, which never exceed the
 * initial value of 0.
 *
 * @param {Array<{x: number, w: number}>} line - Items in left-to-right order.
 * @returns {number} The largest gap found, or 0 for lines with fewer than two items.
 */
function lineMaxGap(line) {
    let widest = 0;
    let isFirst = true;
    let before;
    for (const item of line) {
        if (!isFirst) {
            const gap = item.x - (before.x + before.w);
            widest = gap > widest ? gap : widest;
        }
        before = item;
        isFirst = false;
    }
    return widest;
}
1004
/**
 * Render one line as flowing text.
 *
 * The line is indented by the first item's offset from `minX`, measured in
 * `medianWidth` units, clamped to the range 0..FLOWING_MAX_INDENT. Items are
 * then concatenated; a single space is inserted before an item when the gap to
 * its predecessor exceeds a height-based threshold and the output does not
 * already end with a space. Each item gets `rendered = true`.
 *
 * @param {Array<object>} line - Non-empty list of items (reads `x`, `w`, `h`, `str`).
 * @param {number} minX - Left margin used as the indent origin.
 * @param {number} medianWidth - Character width used to convert offset to columns.
 * @returns {string} The rendered line.
 */
function renderLineAsFlowingText(line, minX, medianWidth) {
    const rawIndent = Math.round((line[0].x - minX) / medianWidth);
    const indent = Math.min(Math.max(rawIndent, 0), FLOWING_MAX_INDENT);
    let text = " ".repeat(indent);
    for (const [position, item] of line.entries()) {
        if (position > 0) {
            const before = line[position - 1];
            const gap = item.x - (before.x + before.w);
            const threshold = Math.max(item.h * FLOWING_SPACE_HEIGHT_RATIO, FLOWING_SPACE_MIN_THRESHOLD);
            const needsSpace = gap > threshold && !text.endsWith(" ");
            if (needsSpace) {
                text += " ";
            }
        }
        text += item.str;
        item.rendered = true;
    }
    return text;
}
1026
/**
 * Decide whether a block of lines is flowing paragraph text rather than
 * structured/tabular content, so it can take the simpler rendering path.
 *
 * The block is rejected when it has too many anchors in total, too many left
 * anchors, or too few non-empty lines. Otherwise it qualifies when the share
 * of non-empty lines spanning most of the page width exceeds
 * FLOWING_WIDE_LINE_THRESHOLD.
 *
 * @param {Array<Array<object>>} blockLines - Lines of the block; each line is a list of items.
 * @param {object} anchorLeft - Left anchors (only the key count is used).
 * @param {object} anchorRight - Right anchors (only the key count is used).
 * @param {object} anchorCenter - Center anchors (only the key count is used).
 * @param {number} pageWidth - Page width in the same units as item `x`/`w`.
 * @returns {boolean} `true` when the block should be rendered as flowing text.
 */
function isFlowingTextBlock(blockLines, anchorLeft, anchorRight, anchorCenter, pageWidth) {
    const [leftTotal, rightTotal, centerTotal] = [anchorLeft, anchorRight, anchorCenter].map((anchors) => Object.keys(anchors).length);
    // Many column anchors point to structured/tabular content.
    const looksTabular = leftTotal + rightTotal + centerTotal > FLOWING_MAX_TOTAL_ANCHORS ||
        leftTotal > FLOWING_MAX_LEFT_ANCHORS;
    if (looksTabular) {
        return false;
    }
    const populated = blockLines.filter((line) => line.length !== 0);
    const wideTotal = populated.filter((line) => {
        const last = line[line.length - 1];
        const span = last.x + last.w - line[0].x;
        return span > pageWidth * FLOWING_WIDE_LINE_RATIO;
    }).length;
    // Too few lines to classify with confidence.
    if (populated.length < FLOWING_MIN_LINES) {
        return false;
    }
    return wideTotal / populated.length > FLOWING_WIDE_LINE_THRESHOLD;
}
1057
/**
 * Render every line of a flowing-text block into `rawLines`.
 *
 * The block's left margin is the smallest first-item `x` among its non-empty
 * lines (0 when the block has none). Each non-empty line is rendered with
 * `renderLineAsFlowingText`; empty lines get an empty string when nothing was
 * stored for them yet. This bypasses the column-alignment logic, which
 * produces excessive whitespace and fused words on paragraph text.
 *
 * @param {Array<Array<object>>} lines - All lines of the page.
 * @param {{start: number, end: number}} block - Half-open line range of the block.
 * @param {string[]} rawLines - Output lines, indexed like `lines`; mutated.
 * @param {number} medianWidth - Character width used for indentation.
 * @returns {void}
 */
function renderFlowingBlock(lines, block, rawLines, medianWidth) {
    const { start, end } = block;
    // Left margin of the block.
    let leftMargin = Infinity;
    for (let row = start; row < end; row++) {
        const items = lines[row];
        if (items.length > 0) {
            leftMargin = Math.min(leftMargin, items[0].x);
        }
    }
    if (leftMargin === Infinity) {
        leftMargin = 0;
    }
    for (let row = start; row < end; row++) {
        if (!rawLines[row]) {
            rawLines[row] = "";
        }
        const items = lines[row];
        if (items.length !== 0) {
            rawLines[row] = renderLineAsFlowingText(items, leftMargin, medianWidth);
        }
    }
}
1082
/**
 * Compute the median per-character width and the median height of a set of
 * text boxes.
 *
 * Width samples are `w / strLength` for boxes with positive `w`; height
 * samples are `h` for boxes with positive `h`. Boxes whose per-character width
 * is not a positive finite number (for example `strLength` of 0 or a missing
 * `strLength`) are ignored. Otherwise Infinity would skew the median, and NaN
 * would make the sort comparator inconsistent.
 *
 * The median is the element at index `floor(n / 2)` of the sorted samples, so
 * for an even count it is the upper of the two middle values.
 *
 * @param {Array<{w: number, h: number, strLength: number}>} lines - Text boxes
 *   (a flat list of items despite the parameter name).
 * @returns {{width: (number|undefined), height: (number|undefined)}} Medians;
 *   a field is `undefined` when no box contributed a sample for it.
 */
function getMedianTextBoxSize(lines) {
    const ascending = (a, b) => a - b;
    const upperMedian = (samples) => samples.sort(ascending)[Math.floor(samples.length / 2)];
    // Per-character widths.
    const widthList = [];
    for (const bbox of lines) {
        if (bbox.w > 0) {
            const perChar = bbox.w / bbox.strLength;
            // Skip Infinity/NaN from a zero or missing strLength.
            if (Number.isFinite(perChar) && perChar > 0) {
                widthList.push(perChar);
            }
        }
    }
    // Box heights.
    const heightList = [];
    for (const bbox of lines) {
        if (bbox.h > 0) {
            heightList.push(bbox.h);
        }
    }
    return { width: upperMedian(widthList), height: upperMedian(heightList) };
}
1101
- export function projectToGrid(config, page, projectionBoxes, prevAnchors, totalPages, logger) {
1102
- // detect '.' garbage in the lines
1103
- let dotCount = 0;
1104
- for (const bbox of projectionBoxes) {
1105
- // check if bbox.str contains only dots
1106
- if (bbox.str.match(/^\.+$/)) {
1107
- dotCount++;
1108
- }
1109
- }
1110
- if (dotCount > 100 && dotCount > projectionBoxes.length * 0.05) {
1111
- // remove all dots and splice them from lines
1112
- const newLines = [];
1113
- for (const bbox of projectionBoxes) {
1114
- if (bbox.str.match(/^\.+$/)) {
1115
- continue;
1116
- }
1117
- if (bbox.str.match(/^·+$/)) {
1118
- continue;
1119
- }
1120
- if (bbox.str.match(/^"+$/)) {
1121
- continue;
1122
- }
1123
- newLines.push(bbox);
1124
- }
1125
- projectionBoxes = newLines;
1126
- }
1127
- // calculate median textBox width/height
1128
- const pageMedianSizes = getMedianTextBoxSize(projectionBoxes);
1129
- let medianWidth = pageMedianSizes.width;
1130
- const medianHeight = pageMedianSizes.height;
1131
- // Save original bboxes (including OCR) for text attribution
1132
- const attributionBboxes = [];
1133
- for (const bbox of projectionBoxes) {
1134
- if (!bbox || !bbox.str || bbox.vgap || bbox.isPlaceholder) {
1135
- continue;
1136
- }
1137
- attributionBboxes.push({
1138
- str: bbox.str,
1139
- x: bbox.x,
1140
- y: bbox.y,
1141
- w: bbox.w,
1142
- h: bbox.h,
1143
- r: bbox.r,
1144
- strLength: bbox.str.length,
1145
- });
1146
- }
1147
- handleRotationReadingOrder(projectionBoxes, page.height);
1148
- const lines = bboxToLine(projectionBoxes, medianWidth, medianHeight, page.width);
1149
- // Log line composition
1150
- for (let i = 0; i < lines.length; i++) {
1151
- logger.logLineComposition(i, lines[i]);
1152
- }
1153
- // remove unprojectable text and apply markup to final lines
1154
- for (let i = 0; i < lines.length; ++i) {
1155
- const line = filterUnprojectableText(config, lines[i]);
1156
- for (const bbox of line) {
1157
- // With the way our grid projection currently works, we have to output
1158
- // tags before raw line projection to avoid breaking the projection alignment.
1159
- // The tags get replaced with MD as needed in output formatting, this does
1160
- // result in output text containing the ~~ strikeout markup, but this is
1161
- // mitigated since we skip markup entirely when we are not outputting markdown
1162
- if (bbox.str.trim().length != 0 && bbox.markup) {
1163
- bbox.str = applyMarkupTags(bbox.markup, bbox.str);
1164
- }
1165
- }
1166
- lines[i] = line;
1167
- }
1168
- const forwardAnchors = {
1169
- left: {},
1170
- right: {},
1171
- center: {},
1172
- floating: {},
1173
- };
1174
- const rawLines = [];
1175
- const rawLinesDelta = [];
1176
- const blocks = [];
1177
- if (config.preserveLayoutAlignmentAcrossPages && totalPages > 1) {
1178
- blocks.push({ start: 0, end: lines.length });
1179
- }
1180
- else {
1181
- let emptyCount = 0;
1182
- let start = -1;
1183
- for (const [lineIndex, line] of lines.entries()) {
1184
- if (line.length === 0) {
1185
- emptyCount++;
1186
- if (emptyCount > 1) {
1187
- if (start >= 0) {
1188
- // ignore completely empty blocks, include the double blank
1189
- // line at the end of valid blocks
1190
- blocks.push({ start: start, end: lineIndex + 1 });
1191
- }
1192
- start = -1;
1193
- }
1194
- }
1195
- else {
1196
- emptyCount = 0;
1197
- if (start < 0) {
1198
- start = lineIndex;
1199
- }
1200
- }
1201
- }
1202
- if (start > -1) {
1203
- blocks.push({ start: start, end: lines.length });
1204
- }
1205
- }
1206
- // Log block assignments
1207
- for (let bi = 0; bi < blocks.length; bi++) {
1208
- logger.logBlock(bi, blocks[bi].start, blocks[bi].end);
1209
- }
1210
- for (const block of blocks) {
1211
- const blockLines = lines.slice(block.start, block.end);
1212
- const { anchorLeft, anchorRight, anchorCenter } = extractAnchorsPointsFromLines(blockLines, page);
1213
- logger.logAnchors(anchorLeft, anchorRight, anchorCenter);
1214
- // Block-level classification: if entire block is clearly flowing text,
1215
- // render it simply and skip grid projection
1216
- if (isFlowingTextBlock(blockLines, anchorLeft, anchorRight, anchorCenter, page.width)) {
1217
- logger.logFlowingBlock(block.start, block.end);
1218
- if (!config.preserveLayoutAlignmentAcrossPages) {
1219
- const sizes = getMedianTextBoxSize(blockLines.flat());
1220
- medianWidth = sizes.width;
1221
- }
1222
- renderFlowingBlock(lines, block, rawLines, medianWidth);
1223
- for (let li = block.start; li < block.end; li++) {
1224
- if (rawLines[li])
1225
- logger.captureRender(li, 0, rawLines[li], "flowing");
1226
- }
1227
- continue;
1228
- }
1229
- logger.logStructuredBlock(block.start, block.end);
1230
- const snapMaps = {
1231
- left: [],
1232
- right: [],
1233
- center: [],
1234
- floating: [],
1235
- };
1236
- const uniqueSnaps = new Set();
1237
- for (const snap in anchorLeft) {
1238
- uniqueSnaps.add(parseFloat(snap));
1239
- }
1240
- snapMaps.left.push(...uniqueSnaps);
1241
- uniqueSnaps.clear();
1242
- for (const snap in anchorRight) {
1243
- uniqueSnaps.add(parseFloat(snap));
1244
- }
1245
- snapMaps.right.push(...uniqueSnaps);
1246
- uniqueSnaps.clear();
1247
- for (const snap in anchorCenter) {
1248
- uniqueSnaps.add(parseFloat(snap));
1249
- }
1250
- snapMaps.center.push(...uniqueSnaps);
1251
- uniqueSnaps.clear();
1252
- let hasChanged = true;
1253
- const leftSnap = [];
1254
- const rightSnap = [];
1255
- const centerSnap = [];
1256
- if (!config.preserveLayoutAlignmentAcrossPages) {
1257
- const sizes = getMedianTextBoxSize(lines.slice(block.start, block.end).flat());
1258
- medianWidth = sizes.width;
1259
- // medianHeight updated but not currently used per-block - reserved for future use
1260
- void sizes.height;
1261
- }
1262
- logger.logBlockContext(block.start, block.end, medianWidth, new Set());
1263
- // compute snaps
1264
- for (let lineIndex = block.start; lineIndex < block.end; ++lineIndex) {
1265
- const line = lines[lineIndex];
1266
- const forceUnsnapped = !canSnapLine(config, line);
1267
- let prevBbox = null;
1268
- for (let boxIndex = 0; boxIndex < line.length; ++boxIndex) {
1269
- const bbox = line[boxIndex];
1270
- bbox.forceUnsnapped = forceUnsnapped;
1271
- const spaceThreshold = 2;
1272
- // should we add a space between the two bbox?
1273
- // TODO RTL
1274
- if (prevBbox && bbox.x - (prevBbox.x + prevBbox.w) > spaceThreshold) {
1275
- const xDelta = bbox.x - (prevBbox.x + prevBbox.w);
1276
- const prevCharWidth = prevBbox.w / prevBbox.strLength;
1277
- // add a space
1278
- bbox.shouldSpace = 1;
1279
- if (xDelta > prevCharWidth * 2) {
1280
- // Check if both items are in the same column based on gap size
1281
- // If gap is less than 10% of page width, treat as same column
1282
- // This works for any number of columns
1283
- const columnGapThreshold = page.width * 0.1;
1284
- const bothInSameColumn = xDelta < columnGapThreshold;
1285
- // insert column spacing if any of:
1286
- // - gap is more than an approximate tab (8x average char width)
1287
- // - previous bbox is right snap
1288
- // - this bbox is left snap
1289
- // - both previous and this bbox are snaps
1290
- // otherwise insert floating spacing
1291
- if ((!bbox.forceUnsnapped && xDelta > prevCharWidth * 8) ||
1292
- (bbox.snap && bbox.snap === "left") ||
1293
- (prevBbox.snap && prevBbox.snap === "right") ||
1294
- (bbox.snap && prevBbox.snap)) {
1295
- // If both items are in the same column, limit spacing to avoid
1296
- // preserving justified text gaps from PDFs
1297
- bbox.shouldSpace = bothInSameColumn ? FLOATING_SPACES : COLUMN_SPACES;
1298
- }
1299
- else {
1300
- // For items in the same column, use minimal spacing
1301
- bbox.shouldSpace = bothInSameColumn ? 1 : FLOATING_SPACES;
1302
- }
1303
- }
1304
- }
1305
- else {
1306
- bbox.shouldSpace = 0;
1307
- }
1308
- prevBbox = bbox;
1309
- logger.logSnapAssignment(bbox, lineIndex, boxIndex);
1310
- if (!bbox.snap) {
1311
- uniqueSnaps.add(Math.round(bbox.x));
1312
- }
1313
- else if (bbox.snap == "left") {
1314
- leftSnap.push({ bbox, lineIndex, boxIndex });
1315
- }
1316
- else if (bbox.snap == "right") {
1317
- rightSnap.push({ bbox, lineIndex, boxIndex });
1318
- }
1319
- else if (bbox.snap == "center") {
1320
- centerSnap.push({ bbox, lineIndex, boxIndex });
1321
- }
1322
- }
1323
- }
1324
- snapMaps.floating.push(...uniqueSnaps);
1325
- uniqueSnaps.clear();
1326
- snapMaps.floating.sort((a, b) => a - b);
1327
- snapMaps.center.sort((a, b) => a - b);
1328
- snapMaps.right.sort((a, b) => a - b);
1329
- snapMaps.left.sort((a, b) => a - b);
1330
- // Per-line flowing text detection: pre-render lines that are clearly paragraph text
1331
- // (spans page width, no large column gaps) with simple single-space joining.
1332
- // This avoids grid projection artifacts on flowing text within mixed blocks.
1333
- const flowingLines = new Set();
1334
- {
1335
- // Find block's left margin for indent calculation
1336
- let blockMinX = Infinity;
1337
- for (let li = block.start; li < block.end; li++) {
1338
- if (lines[li].length > 0) {
1339
- blockMinX = Math.min(blockMinX, lines[li][0].x);
1340
- }
1341
- }
1342
- if (blockMinX === Infinity)
1343
- blockMinX = 0;
1344
- const columnGapThreshold = medianWidth * FLOWING_COLUMN_GAP_MULTIPLIER;
1345
- // Helper to mark and render a line as flowing
1346
- function markFlowing(lineIndex, reason) {
1347
- const line = lines[lineIndex];
1348
- if (!rawLines[lineIndex]) {
1349
- rawLines[lineIndex] = "";
1350
- rawLinesDelta[lineIndex] = 0;
1351
- }
1352
- rawLines[lineIndex] = renderLineAsFlowingText(line, blockMinX, medianWidth);
1353
- flowingLines.add(lineIndex);
1354
- logger.logFlowingLine(lineIndex, reason);
1355
- logger.captureRender(lineIndex, 0, rawLines[lineIndex], "flowing");
1356
- }
1357
- // First pass: detect clearly flowing lines (wide, no column gaps, enough items)
1358
- for (let lineIndex = block.start; lineIndex < block.end; lineIndex++) {
1359
- const line = lines[lineIndex];
1360
- if (line.length < FLOWING_MIN_LINE_ITEMS)
1361
- continue;
1362
- const lineStart = line[0].x;
1363
- const lineEnd = line[line.length - 1].x + line[line.length - 1].w;
1364
- const lineSpan = lineEnd - lineStart;
1365
- if (lineSpan > page.width * FLOWING_WIDE_LINE_RATIO &&
1366
- lineMaxGap(line) < columnGapThreshold) {
1367
- markFlowing(lineIndex, "wide span, no column gaps");
1368
- }
1369
- }
1370
- // Second pass: extend flowing to adjacent continuation lines using
1371
- // forward + backward sweeps (O(n) instead of iterating until convergence)
1372
- // Forward sweep: propagate flowing status downward
1373
- for (let lineIndex = block.start; lineIndex < block.end; lineIndex++) {
1374
- if (flowingLines.has(lineIndex))
1375
- continue;
1376
- const line = lines[lineIndex];
1377
- if (line.length === 0)
1378
- continue;
1379
- if (flowingLines.has(lineIndex - 1) && lineMaxGap(line) < columnGapThreshold) {
1380
- markFlowing(lineIndex, `forward propagation from line ${lineIndex - 1}`);
1381
- }
1382
- }
1383
- // Backward sweep: propagate flowing status upward
1384
- for (let lineIndex = block.end - 1; lineIndex >= block.start; lineIndex--) {
1385
- if (flowingLines.has(lineIndex))
1386
- continue;
1387
- const line = lines[lineIndex];
1388
- if (line.length === 0)
1389
- continue;
1390
- if (flowingLines.has(lineIndex + 1) && lineMaxGap(line) < columnGapThreshold) {
1391
- markFlowing(lineIndex, `backward propagation from line ${lineIndex + 1}`);
1392
- }
1393
- }
1394
- }
1395
- while (hasChanged || snapMaps.right.length || snapMaps.left.length || snapMaps.center.length) {
1396
- hasChanged = false;
1397
- for (let lineIndex = block.start; lineIndex < block.end; ++lineIndex) {
1398
- const line = lines[lineIndex];
1399
- if (!rawLines[lineIndex]) {
1400
- rawLines[lineIndex] = "";
1401
- rawLinesDelta[lineIndex] = 0;
1402
- }
1403
- for (let boxIndex = 0; boxIndex < line.length; ++boxIndex) {
1404
- const bbox = line[boxIndex];
1405
- if (bbox.rendered) {
1406
- continue;
1407
- }
1408
- if (!bbox.forceUnsnapped) {
1409
- if (bbox.snap) {
1410
- continue;
1411
- }
1412
- if ((snapMaps.left.length && snapMaps.left[0] < bbox.x) ||
1413
- (snapMaps.right.length && snapMaps.right[0] < bbox.x) ||
1414
- (snapMaps.center.length && snapMaps.center[0] < Math.round(bbox.x + bbox.w / 2))) {
1415
- continue;
1416
- }
1417
- }
1418
- if (!canRenderBbox(line, bbox)) {
1419
- break;
1420
- }
1421
- const initialTargetX = Math.round(bbox.x / medianWidth);
1422
- let targetX = Math.min(initialTargetX, COLUMN_SPACES);
1423
- let lastSnapLeft = 0;
1424
- let lastSnapLeftKey;
1425
- for (const key in forwardAnchors.left) {
1426
- // Use parseFloat to preserve decimal precision from anchor keys
1427
- if (parseFloat(key) <= bbox.x) {
1428
- if (forwardAnchors.left[key] > lastSnapLeft) {
1429
- lastSnapLeft = forwardAnchors.left[key];
1430
- lastSnapLeftKey = parseFloat(key);
1431
- }
1432
- }
1433
- }
1434
- const rawLineTrimLength = rawLines[lineIndex].trimEnd().length;
1435
- const lineMax = Math.max(lastSnapLeft, rawLineTrimLength + (bbox.shouldSpace ?? 0));
1436
- let bindingConstraint = "COLUMN_SPACES";
1437
- if (targetX < lineMax) {
1438
- targetX = lineMax;
1439
- bindingConstraint =
1440
- lastSnapLeft >= rawLineTrimLength + (bbox.shouldSpace ?? 0)
1441
- ? "lastSnapLeft"
1442
- : "lineMax";
1443
- }
1444
- let floatingAnchorBump;
1445
- if (!bbox.forceUnsnapped) {
1446
- const floatingAnchor = forwardAnchors.floating[Math.round(bbox.x)];
1447
- if (floatingAnchor && targetX < floatingAnchor) {
1448
- // Limit floating anchor adjustment to avoid excessive gaps in justified text
1449
- // Use a small max gap to prevent large spacing within columns
1450
- const maxFloatingGap = 4;
1451
- const adjustedAnchor = Math.min(floatingAnchor, targetX + maxFloatingGap);
1452
- if (adjustedAnchor > targetX) {
1453
- floatingAnchorBump = adjustedAnchor - targetX;
1454
- targetX = adjustedAnchor;
1455
- bindingConstraint = "floatingAnchor";
1456
- }
1457
- }
1458
- }
1459
- logger.logRenderTrace(bbox, lineIndex, {
1460
- snapType: "floating",
1461
- initialTargetX,
1462
- medianWidth,
1463
- lineMax,
1464
- lastSnapLeft: lastSnapLeft > 0 ? lastSnapLeft : undefined,
1465
- lastSnapLeftKey,
1466
- rawLineTrimLength,
1467
- shouldSpace: bbox.shouldSpace ?? 0,
1468
- floatingAnchorBump,
1469
- finalTargetX: targetX,
1470
- bindingConstraint,
1471
- });
1472
- rawLines[lineIndex] = rawLines[lineIndex].trimEnd();
1473
- if (targetX > rawLines[lineIndex].length) {
1474
- rawLines[lineIndex] += " ".repeat(targetX - rawLines[lineIndex].length);
1475
- }
1476
- rawLines[lineIndex] += bbox.str;
1477
- bbox.rendered = true;
1478
- hasChanged = true;
1479
- logger.logRender(bbox, lineIndex, targetX, "floating");
1480
- logger.captureRender(lineIndex, targetX, bbox.str, "floating");
1481
- let nextBbox = null;
1482
- if (line.length > boxIndex + 1) {
1483
- nextBbox = line[boxIndex + 1];
1484
- }
1485
- if (!bbox.forceUnsnapped) {
1486
- updateForwardAnchors(bbox, nextBbox, snapMaps, forwardAnchors, rawLines[lineIndex].length, logger, lineIndex);
1487
- }
1488
- }
1489
- }
1490
- if (snapMaps.left.length &&
1491
- (!snapMaps.right.length || snapMaps.left[0] <= snapMaps.right[0]) &&
1492
- (!snapMaps.center.length || snapMaps.left[0] <= snapMaps.center[0])) {
1493
- const thisTurnSnap = [];
1494
- for (const item of leftSnap) {
1495
- if (item.bbox.leftAnchor &&
1496
- parseFloat(item.bbox.leftAnchor) == snapMaps.left[0] &&
1497
- !flowingLines.has(item.lineIndex)) {
1498
- thisTurnSnap.push(item);
1499
- }
1500
- }
1501
- hasChanged = true;
1502
- if (!thisTurnSnap.length) {
1503
- snapMaps.left.shift();
1504
- continue;
1505
- }
1506
- const leftInitialTargetX = Math.round(snapMaps.left[0] / medianWidth);
1507
- let targetX = Math.min(leftInitialTargetX, COLUMN_SPACES);
1508
- const lineMax = Math.max(...thisTurnSnap.map((v) => {
1509
- let spaceEnd = 0;
1510
- if (!rawLines[v.lineIndex].endsWith(" ")) {
1511
- spaceEnd = v.bbox.shouldSpace ?? 0;
1512
- }
1513
- if ((v.bbox.shouldSpace ?? 0) > 1) {
1514
- const trailingSpaces = rawLines[v.lineIndex].length - rawLines[v.lineIndex].trimEnd().length;
1515
- if (trailingSpaces < (v.bbox.shouldSpace ?? 0)) {
1516
- spaceEnd = (v.bbox.shouldSpace ?? 0) - trailingSpaces;
1517
- }
1518
- }
1519
- return rawLines[v.lineIndex].length + spaceEnd + 1;
1520
- }));
1521
- let leftBindingConstraint = "COLUMN_SPACES";
1522
- if (targetX < lineMax) {
1523
- targetX = lineMax;
1524
- leftBindingConstraint = "lineMax";
1525
- }
1526
- const leftForwardAnchorValue = forwardAnchors.left[snapMaps.left[0]];
1527
- if (leftForwardAnchorValue && targetX < leftForwardAnchorValue) {
1528
- targetX = leftForwardAnchorValue;
1529
- leftBindingConstraint = "forwardAnchor";
1530
- }
1531
- const leftPrevAnchorValue = prevAnchors.forwardAnchorLeft[snapMaps.left[0]];
1532
- if (leftPrevAnchorValue && targetX < leftPrevAnchorValue) {
1533
- targetX = leftPrevAnchorValue;
1534
- leftBindingConstraint = "prevAnchor";
1535
- }
1536
- forwardAnchors.left[snapMaps.left[0]] = targetX;
1537
- logger.logForwardAnchor("left", snapMaps.left[0], targetX);
1538
- for (const currentLeftSnapBox of thisTurnSnap) {
1539
- const lineIndex = currentLeftSnapBox.lineIndex;
1540
- if (flowingLines.has(lineIndex))
1541
- continue;
1542
- logger.logRenderTrace(currentLeftSnapBox.bbox, lineIndex, {
1543
- snapType: "left",
1544
- initialTargetX: leftInitialTargetX,
1545
- medianWidth,
1546
- lineMax,
1547
- rawLineTrimLength: rawLines[lineIndex].trimEnd().length,
1548
- shouldSpace: currentLeftSnapBox.bbox.shouldSpace ?? 0,
1549
- forwardAnchorValue: leftForwardAnchorValue || undefined,
1550
- prevAnchorValue: leftPrevAnchorValue || undefined,
1551
- finalTargetX: targetX,
1552
- bindingConstraint: leftBindingConstraint,
1553
- });
1554
- if (targetX > rawLines[lineIndex].length) {
1555
- rawLines[lineIndex] += " ".repeat(targetX - rawLines[lineIndex].length);
1556
- }
1557
- rawLines[lineIndex] += currentLeftSnapBox.bbox.str;
1558
- currentLeftSnapBox.bbox.rendered = true;
1559
- logger.logRender(currentLeftSnapBox.bbox, lineIndex, targetX, "left-snap");
1560
- logger.captureRender(lineIndex, targetX, currentLeftSnapBox.bbox.str, "left");
1561
- let nextBbox = null;
1562
- if (lines[lineIndex].length > currentLeftSnapBox.boxIndex + 1) {
1563
- nextBbox = lines[lineIndex][currentLeftSnapBox.boxIndex + 1];
1564
- }
1565
- updateForwardAnchors(currentLeftSnapBox.bbox, nextBbox, snapMaps, forwardAnchors, rawLines[lineIndex].length, logger, lineIndex);
1566
- }
1567
- for (let index = block.start; index < block.end; ++index) {
1568
- if (flowingLines.has(index))
1569
- continue;
1570
- const line = rawLines[index];
1571
- if (line.length < targetX) {
1572
- rawLines[index] += " ".repeat(targetX - line.length);
1573
- }
1574
- }
1575
- snapMaps.left.shift();
1576
- }
1577
- else if (snapMaps.right.length &&
1578
- (!snapMaps.left.length || snapMaps.right[0] <= snapMaps.left[0]) &&
1579
- (!snapMaps.center.length || snapMaps.right[0] <= snapMaps.center[0])) {
1580
- const thisTurnSnap = [];
1581
- hasChanged = true;
1582
- for (const item of rightSnap) {
1583
- if (item.bbox.rightAnchor &&
1584
- parseFloat(item.bbox.rightAnchor) == snapMaps.right[0] &&
1585
- !flowingLines.has(item.lineIndex)) {
1586
- thisTurnSnap.push(item);
1587
- }
1588
- }
1589
- if (!thisTurnSnap.length) {
1590
- snapMaps.right.shift();
1591
- continue;
1592
- }
1593
- const rightInitialTargetX = Math.round(snapMaps.right[0] / medianWidth);
1594
- let targetX = Math.min(rightInitialTargetX, COLUMN_SPACES);
1595
- const allRightCandidates = thisTurnSnap.map((v) => {
1596
- let lastSnapLeft = 0;
1597
- for (const key in forwardAnchors.left) {
1598
- if (parseInt(key) <= v.bbox.x) {
1599
- lastSnapLeft = Math.max(lastSnapLeft, forwardAnchors.left[key]);
1600
- }
1601
- }
1602
- const value = Math.max(lastSnapLeft, rawLines[v.lineIndex].trimEnd().length + (v.bbox.shouldSpace ?? 0)) + v.bbox.strLength;
1603
- return {
1604
- text: v.bbox.str.substring(0, 30),
1605
- lineIndex: v.lineIndex,
1606
- lineItemCount: lines[v.lineIndex].length,
1607
- filtered: false,
1608
- value,
1609
- };
1610
- });
1611
- const lineMax = Math.max(...allRightCandidates.map((c) => c.value));
1612
- let rightBindingConstraint = "COLUMN_SPACES";
1613
- if (targetX < lineMax) {
1614
- targetX = lineMax;
1615
- rightBindingConstraint = "lineMax";
1616
- }
1617
- const rightForwardAnchorValue = forwardAnchors.right[snapMaps.right[0]];
1618
- if (rightForwardAnchorValue && targetX < rightForwardAnchorValue) {
1619
- targetX = rightForwardAnchorValue;
1620
- rightBindingConstraint = "forwardAnchor";
1621
- }
1622
- const rightPrevAnchorValue = prevAnchors.forwardAnchorRight[snapMaps.right[0]];
1623
- if (rightPrevAnchorValue && targetX < rightPrevAnchorValue) {
1624
- targetX = rightPrevAnchorValue;
1625
- rightBindingConstraint = "prevAnchor";
1626
- }
1627
- forwardAnchors.right[snapMaps.right[0]] = targetX;
1628
- logger.logForwardAnchor("right", snapMaps.right[0], targetX);
1629
- for (const currentRightSnapBox of thisTurnSnap) {
1630
- const lineIndex = currentRightSnapBox.lineIndex;
1631
- if (flowingLines.has(lineIndex))
1632
- continue;
1633
- logger.logRenderTrace(currentRightSnapBox.bbox, lineIndex, {
1634
- snapType: "right",
1635
- initialTargetX: rightInitialTargetX,
1636
- medianWidth,
1637
- lineMax,
1638
- rawLineTrimLength: rawLines[lineIndex].trimEnd().length,
1639
- shouldSpace: currentRightSnapBox.bbox.shouldSpace ?? 0,
1640
- lineMaxCandidates: allRightCandidates,
1641
- forwardAnchorValue: rightForwardAnchorValue || undefined,
1642
- prevAnchorValue: rightPrevAnchorValue || undefined,
1643
- finalTargetX: targetX,
1644
- bindingConstraint: rightBindingConstraint,
1645
- });
1646
- rawLines[lineIndex] = rawLines[lineIndex].trimEnd();
1647
- if (targetX > rawLines[lineIndex].trimEnd().length + currentRightSnapBox.bbox.strLength) {
1648
- rawLines[lineIndex] += " ".repeat(targetX - rawLines[lineIndex].length - currentRightSnapBox.bbox.strLength);
1649
- }
1650
- rawLines[lineIndex] += currentRightSnapBox.bbox.str;
1651
- currentRightSnapBox.bbox.rendered = true;
1652
- logger.logRender(currentRightSnapBox.bbox, lineIndex, targetX, "right-snap");
1653
- logger.captureRender(lineIndex, targetX - currentRightSnapBox.bbox.strLength, currentRightSnapBox.bbox.str, "right");
1654
- let nextBbox = null;
1655
- if (lines[lineIndex].length > currentRightSnapBox.boxIndex + 1) {
1656
- nextBbox = lines[lineIndex][currentRightSnapBox.boxIndex + 1];
1657
- }
1658
- updateForwardAnchors(currentRightSnapBox.bbox, nextBbox, snapMaps, forwardAnchors, rawLines[lineIndex].length, logger, lineIndex);
1659
- }
1660
- for (let index = block.start; index < block.end; ++index) {
1661
- if (flowingLines.has(index))
1662
- continue;
1663
- const line = rawLines[index];
1664
- if (line.length < targetX) {
1665
- rawLines[index] += " ".repeat(targetX - line.length);
1666
- }
1667
- }
1668
- snapMaps.right.shift();
1669
- }
1670
- else if (snapMaps.center.length &&
1671
- (!snapMaps.left.length || snapMaps.center[0] <= snapMaps.left[0]) &&
1672
- (!snapMaps.right.length || snapMaps.center[0] <= snapMaps.right[0])) {
1673
- const thisTurnSnap = [];
1674
- hasChanged = true;
1675
- for (const item of centerSnap) {
1676
- if (item.bbox.centerAnchor &&
1677
- parseFloat(item.bbox.centerAnchor) == snapMaps.center[0] &&
1678
- !flowingLines.has(item.lineIndex)) {
1679
- thisTurnSnap.push(item);
1680
- }
1681
- }
1682
- if (!thisTurnSnap.length) {
1683
- snapMaps.center.shift();
1684
- continue;
1685
- }
1686
- const centerInitialTargetX = Math.round(snapMaps.center[0] / medianWidth);
1687
- let targetX = Math.min(centerInitialTargetX, COLUMN_SPACES);
1688
- const lineMax = Math.max(...thisTurnSnap.map((v) => {
1689
- let spaceEnd = 0;
1690
- if (!rawLines[v.lineIndex].endsWith(" ")) {
1691
- spaceEnd = v.bbox.shouldSpace ?? 0;
1692
- }
1693
- if ((v.bbox.shouldSpace ?? 0) > 1) {
1694
- const trailingSpaces = rawLines[v.lineIndex].length - rawLines[v.lineIndex].trimEnd().length;
1695
- if (trailingSpaces < (v.bbox.shouldSpace ?? 0)) {
1696
- spaceEnd = (v.bbox.shouldSpace ?? 0) - trailingSpaces;
1697
- }
1698
- }
1699
- return rawLines[v.lineIndex].length + Math.round(v.bbox.strLength / 2) + spaceEnd;
1700
- }));
1701
- let centerBindingConstraint = "COLUMN_SPACES";
1702
- if (targetX < lineMax) {
1703
- targetX = lineMax;
1704
- centerBindingConstraint = "lineMax";
1705
- }
1706
- const centerForwardAnchorValue = forwardAnchors.center[snapMaps.center[0]];
1707
- if (centerForwardAnchorValue && targetX < centerForwardAnchorValue) {
1708
- targetX = centerForwardAnchorValue;
1709
- centerBindingConstraint = "forwardAnchor";
1710
- }
1711
- const centerPrevAnchorValue = prevAnchors.forwardAnchorCenter[snapMaps.center[0]];
1712
- if (centerPrevAnchorValue && targetX < centerPrevAnchorValue) {
1713
- targetX = centerPrevAnchorValue;
1714
- centerBindingConstraint = "prevAnchor";
1715
- }
1716
- forwardAnchors.center[snapMaps.center[0]] = targetX;
1717
- logger.logForwardAnchor("center", snapMaps.center[0], targetX);
1718
- for (const currentCenterSnapBox of thisTurnSnap) {
1719
- if (flowingLines.has(currentCenterSnapBox.lineIndex))
1720
- continue;
1721
- logger.logRenderTrace(currentCenterSnapBox.bbox, currentCenterSnapBox.lineIndex, {
1722
- snapType: "center",
1723
- initialTargetX: centerInitialTargetX,
1724
- medianWidth,
1725
- lineMax,
1726
- rawLineTrimLength: rawLines[currentCenterSnapBox.lineIndex].trimEnd().length,
1727
- shouldSpace: currentCenterSnapBox.bbox.shouldSpace ?? 0,
1728
- forwardAnchorValue: centerForwardAnchorValue || undefined,
1729
- prevAnchorValue: centerPrevAnchorValue || undefined,
1730
- finalTargetX: targetX,
1731
- bindingConstraint: centerBindingConstraint,
1732
- });
1733
- if (targetX >
1734
- rawLines[currentCenterSnapBox.lineIndex].length +
1735
- Math.round(currentCenterSnapBox.bbox.strLength / 2)) {
1736
- rawLines[currentCenterSnapBox.lineIndex] += " ".repeat(targetX -
1737
- rawLines[currentCenterSnapBox.lineIndex].length -
1738
- Math.round(currentCenterSnapBox.bbox.strLength / 2));
1739
- }
1740
- rawLines[currentCenterSnapBox.lineIndex] += currentCenterSnapBox.bbox.str;
1741
- currentCenterSnapBox.bbox.rendered = true;
1742
- logger.logRender(currentCenterSnapBox.bbox, currentCenterSnapBox.lineIndex, targetX, "center-snap");
1743
- logger.captureRender(currentCenterSnapBox.lineIndex, targetX - Math.round(currentCenterSnapBox.bbox.strLength / 2), currentCenterSnapBox.bbox.str, "center");
1744
- }
1745
- snapMaps.center.shift();
1746
- }
1747
- }
1748
- }
1749
- fixSparseBlocks(blocks, rawLines);
1750
- logger.captureRawLines(rawLines);
1751
- const text = rawLines.join("\n");
1752
- // OSS: Return text instead of mutating page object
1753
- return {
1754
- text,
1755
- prevAnchors: {
1756
- forwardAnchorLeft: forwardAnchors.left,
1757
- forwardAnchorRight: forwardAnchors.right,
1758
- forwardAnchorCenter: forwardAnchors.center,
1759
- },
1760
- };
1761
- }
1762
/**
 * Projects each page onto the text grid and returns one result record per page.
 *
 * Pages are processed in order. The anchors returned by `projectToGrid` for a
 * page are merged into the accumulator handed to the following pages only when
 * `config.preserveLayoutAlignmentAcrossPages` is truthy; otherwise every page
 * receives the same (initially empty) accumulator.
 *
 * After all pages are projected, `cleanRawText` is invoked on the collected
 * results, and, when the debug logger reports itself enabled, the log is
 * flushed and any requested visualizations are rendered, with each returned
 * path written to stderr.
 *
 * @param {Iterable<object>} pages - Pages to project; `pageNum`, `width`,
 *   `height` and `textItems` are read from each one. `pages.length` is
 *   forwarded to `projectToGrid`.
 * @param {object} config - Projection settings; `debug` and
 *   `preserveLayoutAlignmentAcrossPages` are read here and the whole object is
 *   forwarded to the helper functions.
 * @returns {Promise<Array<object>>} One record per page with `pageNum`,
 *   `width`, `height`, `text`, `textItems` and an empty `boundingBoxes` array.
 */
export async function projectPagesToGrid(pages, config) {
    const debugLogger = createGridDebugLogger(config.debug);
    // Accumulator for anchors carried from one page to the next.
    const carriedAnchors = {
        forwardAnchorLeft: {},
        forwardAnchorRight: {},
        forwardAnchorCenter: {},
    };
    // Merge order matches the accumulator layout: left, right, center.
    const anchorKinds = ["forwardAnchorLeft", "forwardAnchorRight", "forwardAnchorCenter"];
    const projectedPages = [];

    for (const page of pages) {
        debugLogger.setPage(page.pageNum, page.width, page.height);

        // Turn the page's text items into projection boxes, then lay them out.
        const boxes = buildBbox(page, config);
        const projection = projectToGrid(config, page, boxes, carriedAnchors, pages.length, debugLogger);
        const pageAnchors = projection.prevAnchors;

        // Carry anchors forward only when cross-page alignment is requested.
        if (config.preserveLayoutAlignmentAcrossPages) {
            for (const kind of anchorKinds) {
                const source = pageAnchors[kind];
                for (const anchor in source) {
                    carriedAnchors[kind][anchor] = source[anchor];
                }
            }
        }

        projectedPages.push({
            pageNum: page.pageNum,
            width: page.width,
            height: page.height,
            text: projection.text,
            textItems: page.textItems,
            boundingBoxes: [],
        });
    }

    // Post-process the raw text of all pages together (margin detection, etc).
    cleanRawText(projectedPages, config);

    // Nothing more to do unless debugging is switched on.
    if (!debugLogger.enabled) {
        return projectedPages;
    }

    await debugLogger.flush();
    if (debugLogger.shouldVisualize) {
        const outputDir = debugLogger.debugConfig.visualizePath ?? "./debug-output";
        const writtenPaths = await renderAllVisualizations(debugLogger.visualizerPages, outputDir);
        for (const p of writtenPaths) {
            process.stderr.write(`[grid-debug] Visualization: ${p}\n`);
        }
    }
    return projectedPages;
}
1813
- //# sourceMappingURL=gridProjection.js.map