@llamaindex/liteparse 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +339 -0
- package/dist/cli/parse.d.ts +4 -0
- package/dist/cli/parse.d.ts.map +1 -0
- package/dist/cli/parse.js +401 -0
- package/dist/cli/parse.js.map +1 -0
- package/dist/src/conversion/convertToPdf.d.ts +47 -0
- package/dist/src/conversion/convertToPdf.d.ts.map +1 -0
- package/dist/src/conversion/convertToPdf.js +337 -0
- package/dist/src/conversion/convertToPdf.js.map +1 -0
- package/dist/src/conversion/convertToPdf.test.d.ts +2 -0
- package/dist/src/conversion/convertToPdf.test.d.ts.map +1 -0
- package/dist/src/conversion/convertToPdf.test.js +208 -0
- package/dist/src/conversion/convertToPdf.test.js.map +1 -0
- package/dist/src/core/config.d.ts +4 -0
- package/dist/src/core/config.d.ts.map +1 -0
- package/dist/src/core/config.js +25 -0
- package/dist/src/core/config.js.map +1 -0
- package/dist/src/core/config.test.d.ts +2 -0
- package/dist/src/core/config.test.d.ts.map +1 -0
- package/dist/src/core/config.test.js +21 -0
- package/dist/src/core/config.test.js.map +1 -0
- package/dist/src/core/parser.d.ts +83 -0
- package/dist/src/core/parser.d.ts.map +1 -0
- package/dist/src/core/parser.js +333 -0
- package/dist/src/core/parser.js.map +1 -0
- package/dist/src/core/parser.test.d.ts +2 -0
- package/dist/src/core/parser.test.d.ts.map +1 -0
- package/dist/src/core/parser.test.js +537 -0
- package/dist/src/core/parser.test.js.map +1 -0
- package/dist/src/core/types.d.ts +287 -0
- package/dist/src/core/types.d.ts.map +1 -0
- package/dist/src/core/types.js +2 -0
- package/dist/src/core/types.js.map +1 -0
- package/dist/src/engines/ocr/http-simple.d.ts +19 -0
- package/dist/src/engines/ocr/http-simple.d.ts.map +1 -0
- package/dist/src/engines/ocr/http-simple.js +63 -0
- package/dist/src/engines/ocr/http-simple.js.map +1 -0
- package/dist/src/engines/ocr/http-simple.test.d.ts +2 -0
- package/dist/src/engines/ocr/http-simple.test.d.ts.map +1 -0
- package/dist/src/engines/ocr/http-simple.test.js +108 -0
- package/dist/src/engines/ocr/http-simple.test.js.map +1 -0
- package/dist/src/engines/ocr/interface.d.ts +15 -0
- package/dist/src/engines/ocr/interface.d.ts.map +1 -0
- package/dist/src/engines/ocr/interface.js +2 -0
- package/dist/src/engines/ocr/interface.js.map +1 -0
- package/dist/src/engines/ocr/tesseract.d.ts +19 -0
- package/dist/src/engines/ocr/tesseract.d.ts.map +1 -0
- package/dist/src/engines/ocr/tesseract.js +112 -0
- package/dist/src/engines/ocr/tesseract.js.map +1 -0
- package/dist/src/engines/ocr/tesseract.test.d.ts +2 -0
- package/dist/src/engines/ocr/tesseract.test.d.ts.map +1 -0
- package/dist/src/engines/ocr/tesseract.test.js +84 -0
- package/dist/src/engines/ocr/tesseract.test.js.map +1 -0
- package/dist/src/engines/pdf/interface.d.ts +79 -0
- package/dist/src/engines/pdf/interface.d.ts.map +1 -0
- package/dist/src/engines/pdf/interface.js +2 -0
- package/dist/src/engines/pdf/interface.js.map +1 -0
- package/dist/src/engines/pdf/pdfium-renderer.d.ts +11 -0
- package/dist/src/engines/pdf/pdfium-renderer.d.ts.map +1 -0
- package/dist/src/engines/pdf/pdfium-renderer.js +64 -0
- package/dist/src/engines/pdf/pdfium-renderer.js.map +1 -0
- package/dist/src/engines/pdf/pdfium-renderer.test.d.ts +2 -0
- package/dist/src/engines/pdf/pdfium-renderer.test.d.ts.map +1 -0
- package/dist/src/engines/pdf/pdfium-renderer.test.js +76 -0
- package/dist/src/engines/pdf/pdfium-renderer.test.js.map +1 -0
- package/dist/src/engines/pdf/pdfjs.d.ts +13 -0
- package/dist/src/engines/pdf/pdfjs.d.ts.map +1 -0
- package/dist/src/engines/pdf/pdfjs.js +538 -0
- package/dist/src/engines/pdf/pdfjs.js.map +1 -0
- package/dist/src/engines/pdf/pdfjs.test.d.ts +2 -0
- package/dist/src/engines/pdf/pdfjs.test.d.ts.map +1 -0
- package/dist/src/engines/pdf/pdfjs.test.js +220 -0
- package/dist/src/engines/pdf/pdfjs.test.js.map +1 -0
- package/dist/src/engines/pdf/pdfjsImporter.d.ts +5 -0
- package/dist/src/engines/pdf/pdfjsImporter.d.ts.map +1 -0
- package/dist/src/engines/pdf/pdfjsImporter.js +9 -0
- package/dist/src/engines/pdf/pdfjsImporter.js.map +1 -0
- package/dist/src/index.d.ts +3 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +5 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/lib.d.ts +17 -0
- package/dist/src/lib.d.ts.map +1 -0
- package/dist/src/lib.js +16 -0
- package/dist/src/lib.js.map +1 -0
- package/dist/src/output/json.d.ts +10 -0
- package/dist/src/output/json.d.ts.map +1 -0
- package/dist/src/output/json.js +31 -0
- package/dist/src/output/json.js.map +1 -0
- package/dist/src/output/json.test.d.ts +2 -0
- package/dist/src/output/json.test.d.ts.map +1 -0
- package/dist/src/output/json.test.js +136 -0
- package/dist/src/output/json.test.js.map +1 -0
- package/dist/src/output/text.d.ts +10 -0
- package/dist/src/output/text.d.ts.map +1 -0
- package/dist/src/output/text.js +17 -0
- package/dist/src/output/text.js.map +1 -0
- package/dist/src/output/text.test.d.ts +2 -0
- package/dist/src/output/text.test.d.ts.map +1 -0
- package/dist/src/output/text.test.js +65 -0
- package/dist/src/output/text.test.js.map +1 -0
- package/dist/src/processing/bbox.d.ts +20 -0
- package/dist/src/processing/bbox.d.ts.map +1 -0
- package/dist/src/processing/bbox.js +258 -0
- package/dist/src/processing/bbox.js.map +1 -0
- package/dist/src/processing/bbox.test.d.ts +2 -0
- package/dist/src/processing/bbox.test.d.ts.map +1 -0
- package/dist/src/processing/bbox.test.js +334 -0
- package/dist/src/processing/bbox.test.js.map +1 -0
- package/dist/src/processing/cleanText.d.ts +6 -0
- package/dist/src/processing/cleanText.d.ts.map +1 -0
- package/dist/src/processing/cleanText.js +73 -0
- package/dist/src/processing/cleanText.js.map +1 -0
- package/dist/src/processing/cleanText.test.d.ts +2 -0
- package/dist/src/processing/cleanText.test.d.ts.map +1 -0
- package/dist/src/processing/cleanText.test.js +46 -0
- package/dist/src/processing/cleanText.test.js.map +1 -0
- package/dist/src/processing/grid.d.ts +7 -0
- package/dist/src/processing/grid.d.ts.map +1 -0
- package/dist/src/processing/grid.js +13 -0
- package/dist/src/processing/grid.js.map +1 -0
- package/dist/src/processing/gridProjection.d.ts +18 -0
- package/dist/src/processing/gridProjection.d.ts.map +1 -0
- package/dist/src/processing/gridProjection.js +1392 -0
- package/dist/src/processing/gridProjection.js.map +1 -0
- package/dist/src/processing/gridProjection.test.d.ts +2 -0
- package/dist/src/processing/gridProjection.test.d.ts.map +1 -0
- package/dist/src/processing/gridProjection.test.js +464 -0
- package/dist/src/processing/gridProjection.test.js.map +1 -0
- package/dist/src/processing/markupUtils.d.ts +7 -0
- package/dist/src/processing/markupUtils.d.ts.map +1 -0
- package/dist/src/processing/markupUtils.js +25 -0
- package/dist/src/processing/markupUtils.js.map +1 -0
- package/dist/src/processing/markupUtils.test.d.ts +2 -0
- package/dist/src/processing/markupUtils.test.d.ts.map +1 -0
- package/dist/src/processing/markupUtils.test.js +26 -0
- package/dist/src/processing/markupUtils.test.js.map +1 -0
- package/dist/src/processing/ocrUtils.d.ts +24 -0
- package/dist/src/processing/ocrUtils.d.ts.map +1 -0
- package/dist/src/processing/ocrUtils.js +79 -0
- package/dist/src/processing/ocrUtils.js.map +1 -0
- package/dist/src/processing/octUtils.test.d.ts +2 -0
- package/dist/src/processing/octUtils.test.d.ts.map +1 -0
- package/dist/src/processing/octUtils.test.js +72 -0
- package/dist/src/processing/octUtils.test.js.map +1 -0
- package/dist/src/processing/textUtils.d.ts +20 -0
- package/dist/src/processing/textUtils.d.ts.map +1 -0
- package/dist/src/processing/textUtils.js +142 -0
- package/dist/src/processing/textUtils.js.map +1 -0
- package/dist/src/processing/textUtils.test.d.ts +2 -0
- package/dist/src/processing/textUtils.test.d.ts.map +1 -0
- package/dist/src/processing/textUtils.test.js +45 -0
- package/dist/src/processing/textUtils.test.js.map +1 -0
- package/dist/src/vendor/pdfjs/LICENSE +177 -0
- package/dist/src/vendor/pdfjs/README.md +0 -0
- package/dist/src/vendor/pdfjs/cmaps/78-EUC-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/78-EUC-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/78-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/78-RKSJ-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/78-RKSJ-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/78-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/78ms-RKSJ-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/78ms-RKSJ-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/83pv-RKSJ-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/90ms-RKSJ-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/90ms-RKSJ-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/90msp-RKSJ-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/90msp-RKSJ-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/90pv-RKSJ-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/90pv-RKSJ-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Add-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Add-RKSJ-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Add-RKSJ-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Add-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-0.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-1.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-2.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-3.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-4.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-5.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-6.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-CNS1-UCS2.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-GB1-0.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-GB1-1.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-GB1-2.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-GB1-3.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-GB1-4.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-GB1-5.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-GB1-UCS2.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-0.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-1.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-2.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-3.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-4.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-5.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-6.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-Japan1-UCS2.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-Korea1-0.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-Korea1-1.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-Korea1-2.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Adobe-Korea1-UCS2.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/B5-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/B5-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/B5pc-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/B5pc-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/CNS-EUC-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/CNS-EUC-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/CNS1-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/CNS1-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/CNS2-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/CNS2-V.bcmap +3 -0
- package/dist/src/vendor/pdfjs/cmaps/ETHK-B5-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/ETHK-B5-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/ETen-B5-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/ETen-B5-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/ETenms-B5-H.bcmap +3 -0
- package/dist/src/vendor/pdfjs/cmaps/ETenms-B5-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/EUC-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/EUC-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Ext-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Ext-RKSJ-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Ext-RKSJ-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Ext-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GB-EUC-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GB-EUC-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GB-H.bcmap +4 -0
- package/dist/src/vendor/pdfjs/cmaps/GB-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GBK-EUC-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GBK-EUC-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GBK2K-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GBK2K-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GBKp-EUC-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GBKp-EUC-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GBT-EUC-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GBT-EUC-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GBT-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GBT-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GBTpc-EUC-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GBTpc-EUC-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GBpc-EUC-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/GBpc-EUC-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/HKdla-B5-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/HKdla-B5-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/HKdlb-B5-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/HKdlb-B5-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/HKgccs-B5-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/HKgccs-B5-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/HKm314-B5-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/HKm314-B5-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/HKm471-B5-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/HKm471-B5-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/HKscs-B5-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/HKscs-B5-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Hankaku.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Hiragana.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/KSC-EUC-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/KSC-EUC-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/KSC-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/KSC-Johab-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/KSC-Johab-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/KSC-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/KSCms-UHC-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/KSCms-UHC-HW-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/KSCms-UHC-HW-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/KSCms-UHC-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/KSCpc-EUC-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/KSCpc-EUC-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Katakana.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/LICENSE +36 -0
- package/dist/src/vendor/pdfjs/cmaps/NWP-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/NWP-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/RKSJ-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/RKSJ-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/Roman.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniCNS-UCS2-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniCNS-UCS2-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniCNS-UTF16-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniCNS-UTF16-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniCNS-UTF32-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniCNS-UTF32-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniCNS-UTF8-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniCNS-UTF8-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniGB-UCS2-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniGB-UCS2-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniGB-UTF16-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniGB-UTF16-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniGB-UTF32-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniGB-UTF32-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniGB-UTF8-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniGB-UTF8-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS-UCS2-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS-UCS2-HW-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS-UCS2-HW-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS-UCS2-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS-UTF16-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS-UTF16-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS-UTF32-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS-UTF32-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS-UTF8-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS-UTF8-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS2004-UTF16-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS2004-UTF16-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS2004-UTF32-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS2004-UTF32-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS2004-UTF8-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJIS2004-UTF8-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJISPro-UCS2-HW-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJISPro-UCS2-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJISPro-UTF8-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJISX0213-UTF32-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJISX0213-UTF32-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJISX02132004-UTF32-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniJISX02132004-UTF32-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniKS-UCS2-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniKS-UCS2-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniKS-UTF16-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniKS-UTF16-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniKS-UTF32-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniKS-UTF32-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniKS-UTF8-H.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/UniKS-UTF8-V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/V.bcmap +0 -0
- package/dist/src/vendor/pdfjs/cmaps/WP-Symbol.bcmap +0 -0
- package/dist/src/vendor/pdfjs/pdf.mjs +19481 -0
- package/dist/src/vendor/pdfjs/pdf.mjs.map +1 -0
- package/dist/src/vendor/pdfjs/pdf.sandbox.mjs +210 -0
- package/dist/src/vendor/pdfjs/pdf.sandbox.mjs.map +1 -0
- package/dist/src/vendor/pdfjs/pdf.worker.mjs +56001 -0
- package/dist/src/vendor/pdfjs/pdf.worker.mjs.map +1 -0
- package/dist/src/vendor/pdfjs/standard_fonts/FoxitDingbats.pfb +0 -0
- package/dist/src/vendor/pdfjs/standard_fonts/FoxitFixed.pfb +0 -0
- package/dist/src/vendor/pdfjs/standard_fonts/FoxitFixedBold.pfb +0 -0
- package/dist/src/vendor/pdfjs/standard_fonts/FoxitFixedBoldItalic.pfb +0 -0
- package/dist/src/vendor/pdfjs/standard_fonts/FoxitFixedItalic.pfb +0 -0
- package/dist/src/vendor/pdfjs/standard_fonts/FoxitSerif.pfb +0 -0
- package/dist/src/vendor/pdfjs/standard_fonts/FoxitSerifBold.pfb +0 -0
- package/dist/src/vendor/pdfjs/standard_fonts/FoxitSerifBoldItalic.pfb +0 -0
- package/dist/src/vendor/pdfjs/standard_fonts/FoxitSerifItalic.pfb +0 -0
- package/dist/src/vendor/pdfjs/standard_fonts/FoxitSymbol.pfb +0 -0
- package/dist/src/vendor/pdfjs/standard_fonts/LICENSE_FOXIT +27 -0
- package/dist/src/vendor/pdfjs/standard_fonts/LICENSE_LIBERATION +102 -0
- package/dist/src/vendor/pdfjs/standard_fonts/LiberationSans-Bold.ttf +0 -0
- package/dist/src/vendor/pdfjs/standard_fonts/LiberationSans-BoldItalic.ttf +0 -0
- package/dist/src/vendor/pdfjs/standard_fonts/LiberationSans-Italic.ttf +0 -0
- package/dist/src/vendor/pdfjs/standard_fonts/LiberationSans-Regular.ttf +0 -0
- package/package.json +89 -0
- package/src/vendor/pdfjs/LICENSE +177 -0
- package/src/vendor/pdfjs/README.md +0 -0
- package/src/vendor/pdfjs/cmaps/78-EUC-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/78-EUC-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/78-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/78-RKSJ-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/78-RKSJ-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/78-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/78ms-RKSJ-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/78ms-RKSJ-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/83pv-RKSJ-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/90ms-RKSJ-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/90ms-RKSJ-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/90msp-RKSJ-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/90msp-RKSJ-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/90pv-RKSJ-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/90pv-RKSJ-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Add-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Add-RKSJ-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Add-RKSJ-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Add-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-CNS1-0.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-CNS1-1.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-CNS1-2.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-CNS1-3.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-CNS1-4.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-CNS1-5.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-CNS1-6.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-CNS1-UCS2.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-GB1-0.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-GB1-1.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-GB1-2.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-GB1-3.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-GB1-4.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-GB1-5.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-GB1-UCS2.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-Japan1-0.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-Japan1-1.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-Japan1-2.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-Japan1-3.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-Japan1-4.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-Japan1-5.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-Japan1-6.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-Japan1-UCS2.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-Korea1-0.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-Korea1-1.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-Korea1-2.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Adobe-Korea1-UCS2.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/B5-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/B5-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/B5pc-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/B5pc-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/CNS-EUC-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/CNS-EUC-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/CNS1-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/CNS1-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/CNS2-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/CNS2-V.bcmap +3 -0
- package/src/vendor/pdfjs/cmaps/ETHK-B5-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/ETHK-B5-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/ETen-B5-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/ETen-B5-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/ETenms-B5-H.bcmap +3 -0
- package/src/vendor/pdfjs/cmaps/ETenms-B5-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/EUC-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/EUC-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Ext-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Ext-RKSJ-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Ext-RKSJ-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Ext-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GB-EUC-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GB-EUC-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GB-H.bcmap +4 -0
- package/src/vendor/pdfjs/cmaps/GB-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GBK-EUC-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GBK-EUC-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GBK2K-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GBK2K-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GBKp-EUC-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GBKp-EUC-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GBT-EUC-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GBT-EUC-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GBT-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GBT-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GBTpc-EUC-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GBTpc-EUC-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GBpc-EUC-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/GBpc-EUC-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/HKdla-B5-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/HKdla-B5-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/HKdlb-B5-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/HKdlb-B5-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/HKgccs-B5-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/HKgccs-B5-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/HKm314-B5-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/HKm314-B5-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/HKm471-B5-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/HKm471-B5-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/HKscs-B5-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/HKscs-B5-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Hankaku.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Hiragana.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/KSC-EUC-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/KSC-EUC-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/KSC-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/KSC-Johab-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/KSC-Johab-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/KSC-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/KSCms-UHC-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/KSCms-UHC-HW-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/KSCms-UHC-HW-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/KSCms-UHC-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/KSCpc-EUC-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/KSCpc-EUC-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Katakana.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/LICENSE +36 -0
- package/src/vendor/pdfjs/cmaps/NWP-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/NWP-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/RKSJ-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/RKSJ-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/Roman.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniCNS-UCS2-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniCNS-UCS2-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniCNS-UTF16-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniCNS-UTF16-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniCNS-UTF32-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniCNS-UTF32-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniCNS-UTF8-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniCNS-UTF8-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniGB-UCS2-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniGB-UCS2-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniGB-UTF16-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniGB-UTF16-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniGB-UTF32-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniGB-UTF32-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniGB-UTF8-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniGB-UTF8-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS-UCS2-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS-UCS2-HW-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS-UCS2-HW-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS-UCS2-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS-UTF16-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS-UTF16-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS-UTF32-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS-UTF32-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS-UTF8-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS-UTF8-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS2004-UTF16-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS2004-UTF16-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS2004-UTF32-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS2004-UTF32-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS2004-UTF8-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJIS2004-UTF8-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJISPro-UCS2-HW-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJISPro-UCS2-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJISPro-UTF8-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJISX0213-UTF32-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJISX0213-UTF32-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJISX02132004-UTF32-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniJISX02132004-UTF32-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniKS-UCS2-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniKS-UCS2-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniKS-UTF16-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniKS-UTF16-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniKS-UTF32-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniKS-UTF32-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniKS-UTF8-H.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/UniKS-UTF8-V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/V.bcmap +0 -0
- package/src/vendor/pdfjs/cmaps/WP-Symbol.bcmap +0 -0
- package/src/vendor/pdfjs/pdf.mjs +19481 -0
- package/src/vendor/pdfjs/pdf.mjs.map +1 -0
- package/src/vendor/pdfjs/pdf.sandbox.mjs +210 -0
- package/src/vendor/pdfjs/pdf.sandbox.mjs.map +1 -0
- package/src/vendor/pdfjs/pdf.worker.mjs +56001 -0
- package/src/vendor/pdfjs/pdf.worker.mjs.map +1 -0
- package/src/vendor/pdfjs/standard_fonts/FoxitDingbats.pfb +0 -0
- package/src/vendor/pdfjs/standard_fonts/FoxitFixed.pfb +0 -0
- package/src/vendor/pdfjs/standard_fonts/FoxitFixedBold.pfb +0 -0
- package/src/vendor/pdfjs/standard_fonts/FoxitFixedBoldItalic.pfb +0 -0
- package/src/vendor/pdfjs/standard_fonts/FoxitFixedItalic.pfb +0 -0
- package/src/vendor/pdfjs/standard_fonts/FoxitSerif.pfb +0 -0
- package/src/vendor/pdfjs/standard_fonts/FoxitSerifBold.pfb +0 -0
- package/src/vendor/pdfjs/standard_fonts/FoxitSerifBoldItalic.pfb +0 -0
- package/src/vendor/pdfjs/standard_fonts/FoxitSerifItalic.pfb +0 -0
- package/src/vendor/pdfjs/standard_fonts/FoxitSymbol.pfb +0 -0
- package/src/vendor/pdfjs/standard_fonts/LICENSE_FOXIT +27 -0
- package/src/vendor/pdfjs/standard_fonts/LICENSE_LIBERATION +102 -0
- package/src/vendor/pdfjs/standard_fonts/LiberationSans-Bold.ttf +0 -0
- package/src/vendor/pdfjs/standard_fonts/LiberationSans-BoldItalic.ttf +0 -0
- package/src/vendor/pdfjs/standard_fonts/LiberationSans-Italic.ttf +0 -0
- package/src/vendor/pdfjs/standard_fonts/LiberationSans-Regular.ttf +0 -0
|
@@ -0,0 +1,1392 @@
|
|
|
1
|
+
import { strToSubscriptString, strToPostScript } from "./textUtils.js";
|
|
2
|
+
import { buildBbox } from "./bbox.js";
|
|
3
|
+
import { cleanRawText } from "./cleanText.js";
|
|
4
|
+
import { applyMarkupTags } from "./markupUtils.js";
|
|
5
|
+
// Minimum spaces between unsnapped bboxes (likely justified text)
|
|
6
|
+
const FLOATING_SPACES = 2;
|
|
7
|
+
// Minimum spaces between snapped columns
|
|
8
|
+
const COLUMN_SPACES = 4;
|
|
9
|
+
/**
 * Round an anchor x-coordinate to the nearest 1/4 unit so anchors can be
 * grouped by that rounded value.
 *
 * @param {number} anchor - Anchor x-coordinate.
 * @returns {number} The coordinate rounded to the nearest 0.25.
 */
function roundAnchor(anchor) {
  return Math.round(anchor * 4) / 4;
}
|
|
13
|
+
// Heights strictly below this value count as "very small" text.
// 2pt @ PDF 72 DPI -> 8px @ 300DPI
const SMALL_FONT_SIZE_THRESHOLD = 2;
/**
 * Tell whether strictly more than half of the items on a line are very small.
 *
 * @param {Array<{h: number}>} line - items of one line; only `h` is read.
 * @returns {boolean} true when the share of items with `h` below
 *   SMALL_FONT_SIZE_THRESHOLD exceeds 50%. An empty line yields false
 *   (0 / 0 is NaN, which is not greater than 0.5).
 */
function isSmallTextLine(line) {
    let tinyCount = 0;
    for (const entry of line) {
        if (entry.h < SMALL_FONT_SIZE_THRESHOLD) {
            tinyCount += 1;
        }
    }
    return tinyCount / line.length > 0.5;
}
|
|
23
|
+
/**
 * Remove items that would break projection from a line.
 *
 * Currently the only rule: when `config.preserveVerySmallText` is falsy and the
 * line is mostly very small text (see isSmallTextLine), every item whose height
 * is below SMALL_FONT_SIZE_THRESHOLD is dropped.
 *
 * @param {{preserveVerySmallText?: boolean}} config - parse options; only `preserveVerySmallText` is read.
 * @param {Array<{h: number}>} line - items of one line.
 * @returns {Array} The same array when nothing is filtered (including the empty
 *   line), otherwise a new filtered array.
 */
function filterUnprojectableText(config, line) {
    if (line.length === 0) {
        return line;
    }
    const shouldDropTinyItems = !config.preserveVerySmallText && isSmallTextLine(line);
    if (!shouldDropTinyItems) {
        return line;
    }
    return line.filter(({ h }) => h >= SMALL_FONT_SIZE_THRESHOLD);
}
|
|
35
|
+
/**
 * Decide whether a line may take part in anchor snapping.
 *
 * Lines that would likely break projection are forced to stay unsnapped
 * floating text. At the moment that means lines made mostly of very small text,
 * unless `config.preserveVerySmallText` is set.
 *
 * NOTE: this assumes undesirable text has already been removed before
 * projection (parse-mode based removal belongs in filterUnprojectableText()).
 *
 * @param {{preserveVerySmallText?: boolean}} config - parse options; only `preserveVerySmallText` is read.
 * @param {Array<{h: number}>} line - items of one line.
 * @returns {boolean} false only for a non-empty, mostly-small-text line while
 *   small text is not being preserved; true otherwise.
 */
function canSnapLine(config, line) {
    const hasNoItems = line.length === 0;
    if (hasNoItems || config.preserveVerySmallText) {
        return true;
    }
    return !isSmallTextLine(line);
}
|
|
50
|
+
/**
 * Compress whitespace inside blocks whose lines are very sparse.
 *
 * For every block the lines in [block.start, block.end) are right-trimmed in
 * place and measured. When the block holds at least 500 characters and more
 * than 80% of them are whitespace, every run of COLUMN_SPACES or more
 * whitespace characters in those lines is replaced by FLOATING_SPACES spaces.
 *
 * @param {Array<{start: number, end: number}>} blocks - half-open index ranges into `rawLines`.
 * @param {Array<string>} rawLines - rendered lines; mutated in place. Falsy entries are skipped.
 * @returns {void}
 */
function fixSparseBlocks(blocks, rawLines) {
    const wideGap = new RegExp(`\\s{${COLUMN_SPACES},}`, "g");
    const narrowGap = " ".repeat(FLOATING_SPACES);
    for (const block of blocks) {
        let charCount = 0;
        let blankCount = 0;
        // First pass: trim trailing whitespace and measure how sparse the block is.
        for (let row = block.start; row < block.end; ++row) {
            if (!rawLines[row]) {
                continue;
            }
            const trimmed = rawLines[row].trimEnd();
            rawLines[row] = trimmed;
            if (trimmed.length === 0) {
                continue;
            }
            charCount += trimmed.length;
            blankCount += trimmed.match(/\s/g)?.length || 0;
        }
        const isSparse = charCount >= 500 && blankCount / charCount > 0.8;
        if (!isSparse) {
            continue;
        }
        // Second pass: shrink the wide gaps of a sparse block.
        for (let row = block.start; row < block.end; ++row) {
            const text = rawLines[row];
            if (!text || text.length === 0) {
                continue;
            }
            rawLines[row] = text.replace(wideGap, narrowGap);
        }
    }
}
|
|
79
|
+
/**
 * Find the x-positions ("anchors") that text boxes line up on and decide, for
 * every box, whether it snaps to a left edge, a right edge or a center line.
 *
 * Steps, in this order:
 *  1. Index every bbox by its left edge and right edge (quantized with
 *     roundAnchor) and by its center (rounded to an integer).
 *  2. Merge anchors lying within 2 units of each other (group).
 *  3. Drop items that have no list neighbour close enough in y (deltaMin).
 *  4. Drop anchors for which every consecutive pair of items is separated by a
 *     line holding an item that crosses the anchor's x (intercept).
 *  5. Attach floating bboxes to a surviving left anchor found on the line
 *     directly above or below (tryAlignFloating).
 *  6. Sort every anchor list by y, tag items with their snap side and resolve
 *     items claimed by several sides (left first, then right, rest center).
 *  7. Delete anchors left with fewer than two items.
 *
 * Side effects: bbox objects inside `lines` are mutated; the function sets or
 * deletes `snap` and sets `leftAnchor`, `rightAnchor`, `centerAnchor`, `isDup`.
 *
 * @param {Array<Array<object>>} lines - lines of bboxes; `x`, `y` and `w` are read from each bbox.
 * @param {{height: number}} page - only `page.height` is read (scales the deltaMin thresholds).
 * @returns {{anchorLeft: object, anchorRight: object, anchorCenter: object}}
 *   Maps from anchor key to the list of bboxes snapped to it.
 */
function extractAnchorsPointsFromLines(lines, page) {
    const pageHeight = page.height;
    const anchorLeft = {};
    const anchorRight = {};
    const anchorCenter = {};

    // Append bbox to the list stored under key, creating the list on first use.
    function addToAnchor(collection, key, bbox) {
        if (!collection[key]) {
            collection[key] = [];
        }
        collection[key].push(bbox);
    }

    // Remove item from list when present. Guarding the index matters:
    // splice(-1, 1) would silently delete the LAST element instead.
    function removeItem(list, item) {
        const idx = list.indexOf(item);
        if (idx < 0) {
            return false;
        }
        list.splice(idx, 1);
        return true;
    }

    // Remove item from the list stored under anchorKey (a string key or undefined).
    // Returns true only when something was actually removed.
    function detach(collection, anchorKey, item) {
        if (!anchorKey) {
            return false;
        }
        const list = collection[parseFloat(anchorKey)];
        return list ? removeItem(list, item) : false;
    }

    for (const line of lines) {
        for (const bbox of line) {
            addToAnchor(anchorLeft, roundAnchor(bbox.x), bbox);
            addToAnchor(anchorRight, roundAnchor(bbox.x + bbox.w), bbox);
            addToAnchor(anchorCenter, Math.round(bbox.x + bbox.w / 2), bbox);
        }
    }

    // Merge anchors whose keys lie within MERGE_TOLERANCE of each other.
    function group(collection) {
        // Sort anchors to process them in order
        const sortedAnchors = Object.keys(collection)
            .map((a) => parseFloat(a))
            .sort((a, b) => a - b);
        // Use 2 units as tolerance - this catches columns that are close but not exactly aligned
        const MERGE_TOLERANCE = 2;
        for (let i = 0; i < sortedAnchors.length; i++) {
            const anchor = sortedAnchors[i];
            if (!(anchor in collection)) {
                continue; // Already merged
            }
            for (let j = i + 1; j < sortedAnchors.length; j++) {
                const nextAnchor = sortedAnchors[j];
                if (!(nextAnchor in collection)) {
                    continue;
                }
                // Stop if we're beyond the tolerance
                if (nextAnchor - anchor > MERGE_TOLERANCE) {
                    break;
                }
                // Merge the smaller list into the larger one
                if (collection[nextAnchor].length > collection[anchor].length) {
                    collection[nextAnchor].push(...collection[anchor]);
                    delete collection[anchor];
                    break; // This anchor is gone, move to next
                }
                collection[anchor].push(...collection[nextAnchor]);
                delete collection[nextAnchor];
            }
        }
    }

    // Drop every item that has neither a previous nor a next list neighbour
    // closer than pageHeight * delta in y.
    // NOTE(review): the comparison is signed and relies on list order, but lists
    // merged by group() are appended rather than re-sorted by y - confirm whether
    // a sort is expected before this step.
    function deltaMin(collection, delta) {
        const maxDelta = pageHeight * delta;
        for (const anchor in collection) {
            const items = collection[anchor];
            for (let i = 0; i < items.length; i++) {
                const nearPrevious = i > 0 && items[i].y - items[i - 1].y < maxDelta;
                const nearNext = i < items.length - 1 && items[i + 1].y - items[i].y < maxDelta;
                if (!nearPrevious && !nearNext) {
                    items.splice(i--, 1);
                }
            }
        }
    }

    // Delete anchors that have no pair of consecutive items free of crossing
    // text. Anchors with fewer than two items have no pair and are deleted too.
    function intercept(collection) {
        for (const anchor in collection) {
            // Keys may be quarter-unit floats ("72.75"); parseFloat keeps the
            // fraction, which parseInt would truncate.
            const anchorX = Number.parseFloat(anchor);
            const items = collection[anchor];
            let shouldKeep = false;
            for (let i = 1; i < items.length; i++) {
                const a1 = items[i - 1];
                const a2 = items[i];
                // A pair is intercepted when some line strictly between the two
                // items (by the y of its first bbox) holds an item spanning anchorX.
                const isIntercepted = lines.some((line) => line.length > 0 &&
                    line[0].y > a1.y &&
                    line[0].y < a2.y &&
                    line.some((item) => item.x < anchorX && item.x + item.w > anchorX));
                if (!isIntercepted) {
                    shouldKeep = true;
                    break;
                }
            }
            if (!shouldKeep) {
                delete collection[anchor];
            }
        }
    }

    // Group nearby anchors FIRST to merge items at similar positions
    // This ensures deltaMin operates on combined groups, not isolated items
    group(anchorLeft);
    group(anchorRight);
    group(anchorCenter);
    deltaMin(anchorRight, 0.17);
    deltaMin(anchorLeft, 0.2);
    deltaMin(anchorCenter, 0.05);
    intercept(anchorRight);
    intercept(anchorLeft);
    intercept(anchorCenter);

    // True when any of the bbox's own anchor KEYS still exists. This checks key
    // presence only, not membership of the bbox in that list.
    function anyAnchorSurvived(bbox) {
        return (roundAnchor(bbox.x) in anchorLeft ||
            roundAnchor(bbox.x + bbox.w) in anchorRight ||
            Math.round(bbox.x + bbox.w / 2) in anchorCenter);
    }

    // Try seeing if a floating bbox would align well with a surviving anchor on a line immediately above or below it
    function tryAlignFloating(collection, ANCHOR_MARGIN, refXFromBbox, anchorValFromBbox) {
        for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
            const line = lines[lineIndex];
            for (const bbox of line) {
                // Only consider floating bboxes
                if (anyAnchorSurvived(bbox)) {
                    continue;
                }
                // Check the lines before and after
                const candidateLines = [];
                if (lineIndex > 0) {
                    candidateLines.push(lines[lineIndex - 1]);
                }
                if (lineIndex < lines.length - 1) {
                    candidateLines.push(lines[lineIndex + 1]);
                }
                // Pick the surviving anchor that is within the margin and closest in x.
                let candidateAnchor = null;
                let prevDiff = ANCHOR_MARGIN + 1;
                for (const candLine of candidateLines) {
                    for (const candBBox of candLine) {
                        const candAnchorVal = anchorValFromBbox(candBBox);
                        if (!(candAnchorVal in collection)) {
                            continue;
                        }
                        const xDiff = Math.abs(candAnchorVal - refXFromBbox(bbox));
                        if (xDiff <= ANCHOR_MARGIN && xDiff < prevDiff) {
                            candidateAnchor = candAnchorVal;
                            prevDiff = xDiff;
                        }
                    }
                }
                // No candidate found
                if (candidateAnchor === null) {
                    continue;
                }
                // Candidate found - add the bbox unless it is already in this
                // anchor's list (could happen after merging)
                const targetAnchor = collection[candidateAnchor];
                if (!targetAnchor.includes(bbox)) {
                    targetAnchor.push(bbox);
                }
            }
        }
    }
    // Try to left-align floating bboxes
    tryAlignFloating(anchorLeft, 2, (bbox) => bbox.x, (bbox) => roundAnchor(bbox.x));

    // Sort the anchors' lists of bboxes by y-value
    function sortAnchor(collection) {
        for (const anchor in collection) {
            collection[anchor].sort((a, b) => a.y - b.y);
        }
    }
    sortAnchor(anchorLeft);
    sortAnchor(anchorRight);
    sortAnchor(anchorCenter);

    // Tag every item with the side(s) claiming it. Items claimed by more than
    // one side are collected in `duplicates` (once each) and resolved below.
    const duplicates = [];
    for (const anchor in anchorLeft) {
        for (const item of anchorLeft[anchor]) {
            item.snap = "left";
            item.leftAnchor = anchor;
        }
    }
    for (const anchor in anchorRight) {
        for (const item of anchorRight[anchor]) {
            if (item.snap) {
                item.isDup = true;
                duplicates.push(item);
            }
            item.snap = "right";
            item.rightAnchor = anchor;
        }
    }
    for (const anchor in anchorCenter) {
        for (const item of anchorCenter[anchor]) {
            if (item.snap && !item.isDup) {
                item.isDup = true;
                duplicates.push(item);
            }
            item.snap = "center";
            item.centerAnchor = anchor;
        }
    }

    // Current sizes of the anchor lists the item was tagged with, as
    // [left, right, center]; 0 for a side that did not claim it or whose anchor is gone.
    function anchorCounts(item) {
        const countFor = (collection, anchorKey) => {
            if (!anchorKey) {
                return 0;
            }
            const list = collection[parseFloat(anchorKey)];
            return list ? list.length : 0;
        };
        return [
            countFor(anchorLeft, item.leftAnchor),
            countFor(anchorRight, item.rightAnchor),
            countFor(anchorCenter, item.centerAnchor),
        ];
    }

    // Give `snap` to every duplicate for which isWinner(counts) holds, removing
    // it from the losing sides. Repeats while removals keep happening, because
    // each removal shrinks a list and can change the outcome for other items.
    function resolveDuplicates(snap, isWinner, losers) {
        let hasChanged = true;
        while (hasChanged && duplicates.length > 0) {
            hasChanged = false;
            for (let i = duplicates.length - 1; i >= 0; --i) {
                const item = duplicates[i];
                if (!isWinner(anchorCounts(item))) {
                    continue;
                }
                item.snap = snap;
                for (const [collection, anchorProp] of losers) {
                    if (detach(collection, item[anchorProp], item)) {
                        hasChanged = true;
                    }
                }
                duplicates.splice(i, 1);
            }
        }
    }

    // find all left aligned blocks, all right aligned blocks, all centered blocks, in that order
    // we cannot check all 3 at once since we may end up double counting potential anchor matches
    // (i.e. we need to exclude blocks that we know are left/right aligned before counting possible
    // matching centered blocks)
    resolveDuplicates("left", ([left, right, center]) => left >= right && left >= center, [
        [anchorRight, "rightAnchor"],
        [anchorCenter, "centerAnchor"],
    ]);
    resolveDuplicates("right", ([left, right, center]) => right >= left && right >= center, [
        [anchorLeft, "leftAnchor"],
        [anchorCenter, "centerAnchor"],
    ]);
    // remaining duplicates are centered
    for (const item of duplicates) {
        item.snap = "center";
        detach(anchorLeft, item.leftAnchor, item);
        detach(anchorRight, item.rightAnchor, item);
    }

    // An anchor needs at least two items: delete smaller ones and un-snap
    // their sole item, if any.
    function dropSingletons(collection) {
        for (const anchor in collection) {
            const items = collection[anchor];
            if (items.length < 2) {
                if (items.length) {
                    delete items[0].snap;
                }
                delete collection[anchor];
            }
        }
    }
    dropSingletons(anchorLeft);
    dropSingletons(anchorRight);
    dropSingletons(anchorCenter);

    return {
        anchorLeft,
        anchorRight,
        anchorCenter,
    };
}
|
|
421
|
+
/**
 * Rewrite the coordinates of rotated text boxes, in place, so that they can be
 * laid out together with horizontal text.
 *
 * Boxes are grouped by their `r` value (a missing `r` counts as 0) and the
 * groups are ordered by their smallest x.
 *  - 90 / 270 groups that visually overlap a box of another rotation simply
 *    become horizontal: a pending `d` offset is added to y, `r` is cleared.
 *  - 90 / 270 groups without such overlap are de-rotated: x and y are exchanged
 *    (y mirrored inside the group for 270), w and h are swapped, and every
 *    later group is pushed down so that it cannot share rows with this one.
 *  - 180 groups take x from `ry` (rounded) and y from `rx`, falling back to the
 *    box's original y and x when those fields are absent.
 * Every box that is rewritten gets `r = 0` and `rotated = true`, and `textBbox`
 * is sorted by y before the 180 groups are handled.
 *
 * NOTE: the early return uses a loose `r != 0` test, so a box without `r`
 * does not trigger it; the function then still runs and sorts `textBbox` by y.
 *
 * @param {Array<object>} textBbox - boxes carrying `x`, `y`, `w`, `h` and optionally
 *   `r`, `d`, `rx`, `ry`; the array and its boxes are mutated.
 * @param {number} pageHeight - added to the y offsets that separate groups.
 * @returns {void}
 */
function handleRotationReadingOrder(textBbox, pageHeight) {
    // if no bbox is rotated (.r is set), return
    if (!textBbox.some((b) => b.r != 0)) {
        return;
    }
    // Group ALL items by rotation value (not by consecutive items)
    // This ensures rotated text blocks stay together even when their X coordinates
    // overlap with non-rotated content (e.g., rotated table + footer at same X positions)
    const groupsByRotation = {};
    for (const bbox of textBbox) {
        const r = bbox.r || 0;
        if (!groupsByRotation[r]) {
            groupsByRotation[r] = [];
        }
        groupsByRotation[r].push(bbox);
    }
    const bboxGroup = [];
    for (const rotation in groupsByRotation) {
        const group = groupsByRotation[rotation];
        // Sort each group by Y for proper reading order
        group.sort((a, b) => a.y - b.y);
        bboxGroup.push(group);
    }
    // Sort groups by their minimum X position to maintain left-to-right order
    bboxGroup.sort((a, b) => {
        const minXA = Math.min(...a.map((item) => item.x));
        const minXB = Math.min(...b.map((item) => item.x));
        return minXA - minXB;
    });
    // NOTE/ WARNING: height and width of bbox are NOT rotated beforehand!
    for (const [index, group] of bboxGroup.entries()) {
        // Captured before any box of the group is rewritten (rewriting clears r).
        const rotation = group[0].r;
        if (!(rotation == 90 || rotation == 270)) {
            continue;
        }
        // Check if there are items of another rotation that actually overlap visually
        // (both X and Y) with the rotated group. X-only overlap is not sufficient because
        // items could be in completely different parts of the page (e.g., rotated table + footer).
        const globalOverlap = textBbox.some((bbox) => bbox.r != rotation &&
            group.some((b) => b.x >= bbox.x &&
                b.x <= bbox.x + bbox.w &&
                b.y < bbox.y + bbox.h &&
                b.y + b.h > bbox.y &&
                bbox.r != b.r));
        if (globalOverlap) {
            // treat the boxes as horizontal, applying any offset deferred by an earlier group
            for (const bbox of group) {
                if (bbox.d) {
                    bbox.y += bbox.d;
                    bbox.d = 0;
                }
                bbox.r = 0;
                bbox.rotated = true;
            }
            continue;
        }
        // insert the bbox group in the Y axis after previous group and before next group.
        // move Next group by current group height (width as not rotated yet).
        const groupMaxX = Math.max(...group.map((v) => v.x + v.w));
        let deltaY = 0;
        if (index != 0) {
            const previousGroup = bboxGroup[index - 1];
            const previousGroupMaxY = Math.max(...previousGroup.map((v) => v.y + v.h));
            // Use pageHeight offset to guarantee no alignment issues with other groups
            deltaY = previousGroupMaxY + pageHeight;
        }
        if (rotation == 90) {
            // clockwise rotation (90 degrees)
            // - Text reads top-to-bottom in page space
            // - Y position in page space -> X position after de-rotation
            // - X position in page space -> Y position after de-rotation (row)
            for (const bbox of group) {
                const newX = Math.round(bbox.y);
                const newY = bbox.x + deltaY;
                // Swap width and height since text orientation changes
                const newW = bbox.h;
                const newH = bbox.w;
                bbox.x = newX;
                bbox.y = newY;
                bbox.w = newW;
                bbox.h = newH;
                bbox.r = 0;
                bbox.rotated = true;
            }
        }
        else {
            // counter clockwise rotation (270 degrees)
            // - Text reads bottom-to-top in page space
            // - Y position in page space -> X position after de-rotation (inverted):
            //   text at higher Y ends up at lower X
            // - X position in page space -> Y position after de-rotation (row)
            // - h is the extent along the reading direction, so w and h are swapped
            const maxY = Math.max(...group.map((b) => b.y + b.h));
            for (const bbox of group) {
                // new x = distance from the far edge of the rotated block (inverted Y)
                const newX = Math.round(maxY - bbox.y - bbox.h);
                // Use exact X for Y (will be grouped by bboxToLine's Y_SORT_TOLERANCE)
                const newY = bbox.x + deltaY;
                const newW = bbox.h;
                const newH = bbox.w;
                bbox.x = newX;
                bbox.y = newY;
                bbox.w = newW;
                bbox.h = newH;
                bbox.r = 0;
                bbox.rotated = true;
            }
        }
        // Push every later group below this one.
        // Use pageHeight offset to guarantee no alignment issues
        const globalDelta = deltaY + groupMaxX + pageHeight;
        for (const [otherGroupIndex, other] of bboxGroup.entries()) {
            if (otherGroupIndex <= index) {
                continue;
            }
            for (const bbox of other) {
                if (bbox.r == 90 || bbox.r == 270) {
                    // still rotated: remember the offset in d instead of moving y now
                    bbox.d = (bbox.d ? bbox.d : 0) + globalDelta;
                    continue;
                }
                bbox.y += globalDelta;
            }
        }
    }
    textBbox.sort((a, b) => a.y - b.y);
    // Handle 180-degree rotated text (upside down)
    // Since we already grouped by rotation, we can iterate the existing groups
    for (const group of bboxGroup) {
        if (group[0].r == 180) {
            // Sort by X for proper reading order
            group.sort((a, b) => a.x - b.x);
            for (const bbox of group) {
                // Read both original coordinates BEFORE assigning: the y fallback
                // must use the original x, not the x that was just overwritten.
                const originalX = bbox.x;
                const originalY = bbox.y;
                bbox.x = Math.round(bbox.ry ?? originalY);
                bbox.y = bbox.rx ?? originalX;
                bbox.r = 0;
                bbox.rotated = true;
            }
        }
    }
}
|
|
580
|
+
export function bboxToLine(textBbox, medianWidth, medianHeight, pageWidth) {
|
|
581
|
+
// Y-tolerance for sorting: items within this threshold are considered same line
|
|
582
|
+
// This handles:
|
|
583
|
+
// 1. Floating point precision issues between columns (e.g., 334.7400 vs 334.7399)
|
|
584
|
+
// 2. Subscripts/superscripts which are typically offset by 3-5 units from their base characters
|
|
585
|
+
// Using a fraction of medianHeight to scale with document font size.
|
|
586
|
+
const Y_SORT_TOLERANCE = Math.max(medianHeight * 0.5, 5.0);
|
|
587
|
+
// Note: We keep whitespace items as they may be needed for proper word separation.
|
|
588
|
+
// The spacing calculation handles gaps between items.
|
|
589
|
+
// For two-column documents, detect and mark margin line numbers
|
|
590
|
+
// These are short numeric items positioned between columns (near the page midpoint)
|
|
591
|
+
// They should not be merged with column content
|
|
592
|
+
if (pageWidth) {
|
|
593
|
+
const midpoint = pageWidth * 0.5;
|
|
594
|
+
const marginZoneLeft = midpoint - 5;
|
|
595
|
+
const marginZoneRight = midpoint + 20;
|
|
596
|
+
for (const bbox of textBbox) {
|
|
597
|
+
const bboxCenter = bbox.x + bbox.w / 2;
|
|
598
|
+
// Check if item is in the margin zone and looks like a line number
|
|
599
|
+
if (bboxCenter > marginZoneLeft &&
|
|
600
|
+
bboxCenter < marginZoneRight &&
|
|
601
|
+
bbox.str.trim().match(/^\d{1,2}[O]?$/) && // 1-2 digits, possibly with O (OCR error for 0)
|
|
602
|
+
bbox.w < 15 // Line numbers are narrow
|
|
603
|
+
) {
|
|
604
|
+
// Mark as margin item - will be placed on its own line
|
|
605
|
+
bbox.isMarginLineNumber = true;
|
|
606
|
+
}
|
|
607
|
+
}
|
|
608
|
+
}
|
|
609
|
+
// sort lines on first y axis then x axis (top - left)
|
|
610
|
+
// Use Y tolerance so items on same visual line sort by x regardless of tiny y differences
|
|
611
|
+
textBbox.sort((a, b) => {
|
|
612
|
+
if (Math.abs(a.y - b.y) < Y_SORT_TOLERANCE) {
|
|
613
|
+
return a.x - b.x;
|
|
614
|
+
}
|
|
615
|
+
return a.y - b.y;
|
|
616
|
+
});
|
|
617
|
+
function canMergeMarkup(previousBbox, bbox) {
|
|
618
|
+
if (!previousBbox.markup && !bbox.markup) {
|
|
619
|
+
return true;
|
|
620
|
+
}
|
|
621
|
+
if (previousBbox.markup &&
|
|
622
|
+
bbox.markup &&
|
|
623
|
+
previousBbox.markup.highlight === bbox.markup.highlight &&
|
|
624
|
+
previousBbox.markup.underline === bbox.markup.underline &&
|
|
625
|
+
previousBbox.markup.squiggly === bbox.markup.squiggly &&
|
|
626
|
+
previousBbox.markup.strikeout === bbox.markup.strikeout) {
|
|
627
|
+
return true;
|
|
628
|
+
}
|
|
629
|
+
return false;
|
|
630
|
+
}
|
|
631
|
+
function canMerge(previousBbox, bbox) {
|
|
632
|
+
if (bbox.y == previousBbox.y && bbox.h == previousBbox.h) {
|
|
633
|
+
const xDelta = bbox.x - previousBbox.x - previousBbox.w;
|
|
634
|
+
if (((xDelta < 0 && xDelta > -0.5) || (xDelta >= 0 && xDelta < 0.1)) &&
|
|
635
|
+
canMergeMarkup(previousBbox, bbox)) {
|
|
636
|
+
return true;
|
|
637
|
+
}
|
|
638
|
+
}
|
|
639
|
+
return false;
|
|
640
|
+
}
|
|
641
|
+
function mergePageBbox(a, b) {
|
|
642
|
+
const aBbox = a.pageBbox || { x: a.x, y: a.y, w: a.w, h: a.h };
|
|
643
|
+
const bBbox = b.pageBbox || { x: b.x, y: b.y, w: b.w, h: b.h };
|
|
644
|
+
const left = Math.min(aBbox.x, bBbox.x);
|
|
645
|
+
const top = Math.min(aBbox.y, bBbox.y);
|
|
646
|
+
const right = Math.max(aBbox.x + aBbox.w, bBbox.x + bBbox.w);
|
|
647
|
+
const bottom = Math.max(aBbox.y + aBbox.h, bBbox.y + bBbox.h);
|
|
648
|
+
return { x: left, y: top, w: right - left, h: bottom - top };
|
|
649
|
+
}
|
|
650
|
+
// merge Continuous bbox
|
|
651
|
+
for (let i = 1; i < textBbox.length; i++) {
|
|
652
|
+
const bbox = textBbox[i];
|
|
653
|
+
const previousBbox = textBbox[i - 1];
|
|
654
|
+
if (canMerge(previousBbox, bbox)) {
|
|
655
|
+
previousBbox.w = bbox.x + bbox.w - previousBbox.x;
|
|
656
|
+
previousBbox.str += bbox.str;
|
|
657
|
+
previousBbox.strLength += bbox.strLength;
|
|
658
|
+
previousBbox.pageBbox = mergePageBbox(previousBbox, bbox);
|
|
659
|
+
textBbox.splice(i, 1);
|
|
660
|
+
i--;
|
|
661
|
+
}
|
|
662
|
+
}
|
|
663
|
+
// try to find the bounding box that align as line and group them by line
|
|
664
|
+
const lines = [];
|
|
665
|
+
let currentLine = [];
|
|
666
|
+
let previousBbox = null;
|
|
667
|
+
for (const bbox of textBbox) {
|
|
668
|
+
if (!previousBbox) {
|
|
669
|
+
currentLine.push(bbox);
|
|
670
|
+
}
|
|
671
|
+
// This is where we define how line are build. to be improved
|
|
672
|
+
else {
|
|
673
|
+
const lineMinY = Math.min(...currentLine.map((v) => v.y));
|
|
674
|
+
const lineMaxY = Math.max(...currentLine.map((v) => v.y + v.h));
|
|
675
|
+
let lineCollide = false;
|
|
676
|
+
for (const currentLineItemBbox of currentLine) {
|
|
677
|
+
const overlapLenght = Math.min(currentLineItemBbox.x + currentLineItemBbox.w, bbox.x + bbox.w) -
|
|
678
|
+
Math.max(currentLineItemBbox.x, bbox.x);
|
|
679
|
+
// Use a minimum threshold to tolerate small overlaps common in PDFs due to:
|
|
680
|
+
// - character spacing/kerning
|
|
681
|
+
// - floating-point precision issues
|
|
682
|
+
// - adjacent items with slightly overlapping bounding boxes
|
|
683
|
+
// We want to detect true collisions (same text rendered twice) not adjacent text
|
|
684
|
+
if (overlapLenght > Math.max(medianWidth / 3, 5)) {
|
|
685
|
+
lineCollide = true;
|
|
686
|
+
break;
|
|
687
|
+
}
|
|
688
|
+
}
|
|
689
|
+
// Don't merge margin line numbers with regular content
|
|
690
|
+
const currentLineHasMargin = currentLine.some((b) => b.isMarginLineNumber === true);
|
|
691
|
+
const bboxIsMargin = bbox.isMarginLineNumber === true;
|
|
692
|
+
const marginMismatch = currentLineHasMargin !== bboxIsMargin;
|
|
693
|
+
// For rotated text, use Y-tolerance based merging since heights may be inconsistent
|
|
694
|
+
const yTolerance = bbox.rotated ? Math.max(medianHeight * 2, 20) : 0;
|
|
695
|
+
const yWithinTolerance = bbox.rotated && Math.abs(bbox.y - lineMinY) < yTolerance;
|
|
696
|
+
if (!lineCollide &&
|
|
697
|
+
!marginMismatch &&
|
|
698
|
+
(yWithinTolerance ||
|
|
699
|
+
(bbox.y + bbox.h * 0.5 >= lineMinY && bbox.y + bbox.h * 0.5 <= lineMaxY) ||
|
|
700
|
+
(bbox.y >= lineMinY && bbox.y <= lineMaxY))) {
|
|
701
|
+
currentLine.push(bbox);
|
|
702
|
+
}
|
|
703
|
+
else {
|
|
704
|
+
if (currentLine.length) {
|
|
705
|
+
lines.push(currentLine);
|
|
706
|
+
}
|
|
707
|
+
currentLine = [bbox];
|
|
708
|
+
}
|
|
709
|
+
}
|
|
710
|
+
previousBbox = bbox;
|
|
711
|
+
}
|
|
712
|
+
if (currentLine.length) {
|
|
713
|
+
lines.push(currentLine);
|
|
714
|
+
}
|
|
715
|
+
// sort each line by x
|
|
716
|
+
for (const line of lines) {
|
|
717
|
+
line.sort((a, b) => a.x - b.x);
|
|
718
|
+
}
|
|
719
|
+
// sort lines by y
|
|
720
|
+
lines.sort((a, b) => a[0].y - b[0].y);
|
|
721
|
+
// merge 'words'
|
|
722
|
+
const mergeThreshold = 1;
|
|
723
|
+
// Pattern to detect standalone numeric values (financial table numbers)
|
|
724
|
+
// Matches: numbers with optional commas, decimal points, dollar signs, percentages, negatives
|
|
725
|
+
const numericPattern = /^[$]?-?[\d,]+\.?\d*%?$/;
|
|
726
|
+
function looksLikeTableNumber(str) {
|
|
727
|
+
const trimmed = str.trim();
|
|
728
|
+
// Must be at least 2 chars to be a table number (avoid merging single digits)
|
|
729
|
+
return trimmed.length >= 2 && numericPattern.test(trimmed);
|
|
730
|
+
}
|
|
731
|
+
for (const line of lines) {
|
|
732
|
+
for (let i = 1; i < line.length; ++i) {
|
|
733
|
+
// merge box in word if:
|
|
734
|
+
// - same height
|
|
735
|
+
// - less than 2 in space
|
|
736
|
+
// if (line[i].h == line[i-1].h) {
|
|
737
|
+
const currentLine = line[i];
|
|
738
|
+
const previousLine = line[i - 1];
|
|
739
|
+
if (canMergeMarkup(previousLine, currentLine)) {
|
|
740
|
+
// Don't merge adjacent numbers in tables - they're separate columns
|
|
741
|
+
const bothAreNumbers = looksLikeTableNumber(previousLine.str) && looksLikeTableNumber(currentLine.str);
|
|
742
|
+
if (!bothAreNumbers && currentLine.x - previousLine.x - previousLine.w <= mergeThreshold) {
|
|
743
|
+
// if same word but less than .7 of prev line
|
|
744
|
+
if (currentLine.h != 0 && currentLine.h < previousLine.h * 0.7) {
|
|
745
|
+
// and not starting with space
|
|
746
|
+
if (currentLine.str[0] == " ") {
|
|
747
|
+
break;
|
|
748
|
+
}
|
|
749
|
+
if (currentLine.y > previousLine.y + previousLine.h * 0.2) {
|
|
750
|
+
currentLine.str = strToSubscriptString(currentLine.str);
|
|
751
|
+
}
|
|
752
|
+
else {
|
|
753
|
+
currentLine.str = strToPostScript(currentLine.str);
|
|
754
|
+
}
|
|
755
|
+
}
|
|
756
|
+
previousLine.w = currentLine.x + currentLine.w - previousLine.x;
|
|
757
|
+
previousLine.str += currentLine.str;
|
|
758
|
+
previousLine.strLength += currentLine.strLength;
|
|
759
|
+
previousLine.pageBbox = mergePageBbox(previousLine, currentLine);
|
|
760
|
+
line.splice(i, 1);
|
|
761
|
+
i--;
|
|
762
|
+
}
|
|
763
|
+
else if (!bothAreNumbers &&
|
|
764
|
+
currentLine.x - previousLine.x - previousLine.w < previousLine.w / previousLine.strLength) {
|
|
765
|
+
// merge if space between this word and previous is less than average
|
|
766
|
+
// character width (using previous word font size)
|
|
767
|
+
// But don't merge adjacent numbers - they're likely table columns
|
|
768
|
+
// Now extend the width
|
|
769
|
+
previousLine.w = currentLine.x + currentLine.w - previousLine.x;
|
|
770
|
+
// Add space between merged items unless the previous already ends with space
|
|
771
|
+
if (!previousLine.str.endsWith(" ")) {
|
|
772
|
+
previousLine.str += " ";
|
|
773
|
+
previousLine.strLength += 1;
|
|
774
|
+
}
|
|
775
|
+
previousLine.str += currentLine.str;
|
|
776
|
+
previousLine.strLength += currentLine.strLength;
|
|
777
|
+
previousLine.pageBbox = mergePageBbox(previousLine, currentLine);
|
|
778
|
+
line.splice(i, 1);
|
|
779
|
+
i--;
|
|
780
|
+
}
|
|
781
|
+
}
|
|
782
|
+
// }
|
|
783
|
+
}
|
|
784
|
+
}
|
|
785
|
+
// check if we can merge the lines together
|
|
786
|
+
for (let i = 1; i < lines.length - 1; i++) {
|
|
787
|
+
const currentLine = lines[i];
|
|
788
|
+
const previousLine = lines[i - 1];
|
|
789
|
+
const previousLineMinY = Math.min(...previousLine.map((v) => v.y));
|
|
790
|
+
const previousLineMaxY = Math.max(...previousLine.map((v) => v.y + v.h));
|
|
791
|
+
const currentLineMinY = Math.min(...currentLine.map((v) => v.y));
|
|
792
|
+
const currentLineMaxY = Math.max(...currentLine.map((v) => v.y + v.h));
|
|
793
|
+
// does the 2 line overlap?
|
|
794
|
+
if (previousLineMaxY > currentLineMinY && previousLineMinY < currentLineMaxY) {
|
|
795
|
+
// check the bboxes of current line and prevline do not overlap
|
|
796
|
+
let bboxOverlap = false;
|
|
797
|
+
for (const bbox of currentLine) {
|
|
798
|
+
for (const prevBbox of previousLine) {
|
|
799
|
+
if (bbox.x >= prevBbox.x && bbox.x <= prevBbox.x + prevBbox.w) {
|
|
800
|
+
bboxOverlap = true;
|
|
801
|
+
break;
|
|
802
|
+
}
|
|
803
|
+
if (prevBbox.x >= bbox.x && prevBbox.x <= bbox.x + bbox.w) {
|
|
804
|
+
bboxOverlap = true;
|
|
805
|
+
break;
|
|
806
|
+
}
|
|
807
|
+
}
|
|
808
|
+
}
|
|
809
|
+
// merge if no overlap
|
|
810
|
+
if (!bboxOverlap) {
|
|
811
|
+
previousLine.push(...currentLine);
|
|
812
|
+
previousLine.sort((a, b) => a.x - b.x);
|
|
813
|
+
lines.splice(i--, 1);
|
|
814
|
+
}
|
|
815
|
+
}
|
|
816
|
+
}
|
|
817
|
+
for (let i = 1; i < lines.length; i++) {
|
|
818
|
+
const yDelta = lines[i][0].y - lines[i - 1][0].y - lines[i - 1][0].h;
|
|
819
|
+
// Calculate the number of blank lines to insert based on vertical spacing
|
|
820
|
+
// Use medianHeight as a reference for one line spacing
|
|
821
|
+
if (yDelta > medianHeight) {
|
|
822
|
+
// Calculate how many blank lines should be inserted
|
|
823
|
+
// Round to nearest integer to get approximate number of lines
|
|
824
|
+
const numBlankLines = Math.round(yDelta / medianHeight) - 1;
|
|
825
|
+
// Cap at a reasonable maximum (e.g., 10 blank lines) to avoid extreme cases
|
|
826
|
+
const linesToInsert = Math.min(Math.max(numBlankLines, 1), 10);
|
|
827
|
+
// Insert the calculated number of blank lines
|
|
828
|
+
const blankLines = Array(linesToInsert).fill([]);
|
|
829
|
+
lines.splice(i, 0, ...blankLines);
|
|
830
|
+
i += linesToInsert;
|
|
831
|
+
}
|
|
832
|
+
}
|
|
833
|
+
return lines;
|
|
834
|
+
}
|
|
835
|
+
/**
 * Tell whether `bbox` may be emitted now: every box that precedes it in
 * `line` must already be flagged as rendered.
 *
 * @param {Array<object>} line - Boxes of one line, in output order.
 * @param {object} bbox - The box we would like to render.
 * @returns {boolean} True when `bbox` is reached before any unrendered box;
 *   false when an unrendered box comes first or `bbox` is not in `line`.
 */
function canRenderBbox(line, bbox) {
    // The first entry that is either the target itself or still unrendered
    // decides the answer.
    const decidingIndex = line.findIndex((candidate) => candidate == bbox || !candidate.rendered);
    if (decidingIndex === -1) {
        return false;
    }
    return line[decidingIndex] == bbox;
}
|
|
846
|
+
/**
 * Raise the forward anchor of every snap position at or beyond `rightBound`
 * to at least `anchorTarget`, so later content snapped there starts no
 * earlier than that grid column.
 *
 * `snapMap` is walked from its last entry backwards and the walk stops at
 * the first entry smaller than `rightBound`. For each qualifying entry, the
 * entries just before it that lie within a small tolerance are raised too,
 * to absorb slight position variations between rows.
 *
 * @param {number[]} snapMap - Snap positions (callers pass them sorted ascending).
 * @param {Object<number, number>} forwardAnchor - Position -> minimum column; mutated in place.
 * @param {number} rightBound - Right edge of the box that was just rendered.
 * @param {number} anchorTarget - Minimum column to record.
 * @returns {void}
 */
function updateForwardAnchorRightBound(snapMap, forwardAnchor, rightBound, anchorTarget) {
    const POSITION_TOLERANCE = 2;
    // Store anchorTarget unless a larger value is already recorded.
    const raise = (position) => {
        const recorded = forwardAnchor[position];
        if (!recorded || anchorTarget > recorded) {
            forwardAnchor[position] = anchorTarget;
        }
    };
    let idx = snapMap.length - 1;
    while (idx >= 0 && rightBound <= snapMap[idx]) {
        const anchor = snapMap[idx];
        raise(anchor);
        // Also raise the neighbours sitting within tolerance just before it.
        let near = idx - 1;
        while (near >= 0 && !(anchor - snapMap[near] > POSITION_TOLERANCE)) {
            raise(snapMap[near]);
            near -= 1;
        }
        idx -= 1;
    }
}
|
|
871
|
+
/**
 * After a box has been written to its line, push the forward anchors that
 * lie to the right of it out to the current line length (plus the spacing
 * requested by the next box, when it asks for any).
 *
 * @param {object} bbox - Box just rendered; its `x` and `w` give the right edge.
 * @param {?object} nextBbox - Following box on the same line, or null.
 * @param {{left: number[], right: number[], center: number[], floating: number[]}} snapMaps
 * @param {{left: object, right: object, center: object, floating: object}} forwardAnchors - Mutated in place.
 * @param {number} lineLength - Current length of the rendered line.
 * @returns {void}
 */
function updateForwardAnchors(bbox, nextBbox, snapMaps, forwardAnchors, lineLength) {
    const boxRightEdge = bbox.x + bbox.w;
    const requestedSpaces = nextBbox ? (nextBbox.shouldSpace ?? 0) : 0;
    const targetLength = requestedSpaces > 0 ? lineLength + requestedSpaces : lineLength;
    // we do not update center anchors since centered text may span between snapped columns
    for (const kind of ["left", "right", "floating"]) {
        updateForwardAnchorRightBound(snapMaps[kind], forwardAnchors[kind], boxRightEdge, targetLength);
    }
}
|
|
882
|
+
/**
 * Compute the median per-character width and the median height of a set of
 * text boxes.
 *
 * Boxes with a non-positive width are ignored for the width median and boxes
 * with a non-positive height are ignored for the height median. For an even
 * number of samples the upper of the two middle values is returned; with no
 * samples the corresponding field is `undefined`.
 *
 * @param {Array<{w: number, h: number, strLength: number}>} lines - Flat list of boxes.
 * @returns {{width: (number|undefined), height: (number|undefined)}}
 */
function getMedianTextBoxSize(lines) {
    const charWidths = [];
    const heights = [];
    for (const box of lines) {
        if (box.w > 0) {
            // average width of one character in this box
            charWidths.push(box.w / box.strLength);
        }
        if (box.h > 0) {
            heights.push(box.h);
        }
    }
    const ascending = (a, b) => a - b;
    const middleOf = (samples) => {
        samples.sort(ascending);
        return samples[Math.floor(samples.length / 2)];
    };
    return { width: middleOf(charWidths), height: middleOf(heights) };
}
|
|
901
|
+
/**
 * Project the text boxes of one page onto a fixed-width character grid and
 * return the resulting plain text.
 *
 * Steps performed here:
 *  1. Drop dot / middle-dot / quote-only boxes when the page is flooded with
 *     dot-only boxes.
 *  2. Group the boxes into lines (`bboxToLine`), filter unprojectable text
 *     and apply markup tags.
 *  3. Split the lines into blocks (the whole page when alignment is preserved
 *     across several pages, otherwise runs separated by two or more
 *     consecutive empty lines).
 *  4. For each block, decide the spacing in front of every box, then render:
 *     unsnapped boxes are written as soon as no pending snap position lies to
 *     their left, and left / right / center snapped boxes are written one
 *     snap position at a time, smallest position first.
 *
 * The boxes are mutated: `forceUnsnapped`, `shouldSpace` and `rendered` are
 * set, and `str` is rewritten when markup is applied.
 *
 * @param {object} config - Reads `preserveLayoutAlignmentAcrossPages`; also
 *   forwarded to `filterUnprojectableText` and `canSnapLine`.
 * @param {{width: number, height: number}} page - Page being projected.
 * @param {Array<object>} projectionBoxes - Text boxes of the page.
 * @param {{forwardAnchorLeft: object, forwardAnchorRight: object, forwardAnchorCenter: object}} prevAnchors
 *   Forward anchors carried over from earlier pages; used as lower bounds
 *   for the columns of snapped content.
 * @param {number} totalPages - Number of pages in the document.
 * @returns {{text: string, prevAnchors: {forwardAnchorLeft: object, forwardAnchorRight: object, forwardAnchorCenter: object}}}
 *   The projected text and the forward anchors computed for this page.
 */
export function projectToGrid(config, page, projectionBoxes, prevAnchors, totalPages) {
    const DOT_ONLY = /^\.+$/;
    const MIDDLE_DOT_ONLY = /^·+$/;
    const QUOTE_ONLY = /^"+$/;

    // detect '.' garbage in the lines
    let boxes = projectionBoxes;
    let dotCount = 0;
    for (const bbox of boxes) {
        if (bbox.str.match(DOT_ONLY)) {
            dotCount++;
        }
    }
    if (dotCount > 100 && dotCount > boxes.length * 0.05) {
        // remove all dot-like filler boxes
        boxes = boxes.filter((bbox) => !bbox.str.match(DOT_ONLY) &&
            !bbox.str.match(MIDDLE_DOT_ONLY) &&
            !bbox.str.match(QUOTE_ONLY));
    }

    // calculate median textBox width/height
    const pageMedianSizes = getMedianTextBoxSize(boxes);
    let medianWidth = pageMedianSizes.width;
    const medianHeight = pageMedianSizes.height;

    handleRotationReadingOrder(boxes, page.height);
    const lines = bboxToLine(boxes, medianWidth, medianHeight, page.width);

    // remove unprojectable text and apply markup to final lines
    for (let i = 0; i < lines.length; ++i) {
        const line = filterUnprojectableText(config, lines[i]);
        for (const bbox of line) {
            // With the way our grid projection currently works, we have to output
            // tags before raw line projection to avoid breaking the projection alignment.
            // The tags get replaced with MD as needed in output formatting, this does
            // result in output text containing the ~~ strikeout markup, but this is
            // mitigated since we skip markup entirely when we are not outputting markdown
            if (bbox.str.trim().length !== 0 && bbox.markup) {
                bbox.str = applyMarkupTags(bbox.markup, bbox.str);
            }
        }
        lines[i] = line;
    }

    const forwardAnchors = {
        left: {},
        right: {},
        center: {},
        floating: {},
    };
    const rawLines = [];
    const blocks = [];

    if (config.preserveLayoutAlignmentAcrossPages && totalPages > 1) {
        blocks.push({ start: 0, end: lines.length });
    }
    else {
        let emptyCount = 0;
        let start = -1;
        for (const [lineIndex, line] of lines.entries()) {
            if (line.length === 0) {
                emptyCount++;
                if (emptyCount > 1) {
                    if (start >= 0) {
                        // ignore completely empty blocks, include the double blank
                        // line at the end of valid blocks
                        blocks.push({ start: start, end: lineIndex + 1 });
                    }
                    start = -1;
                }
            }
            else {
                emptyCount = 0;
                if (start < 0) {
                    start = lineIndex;
                }
            }
        }
        if (start > -1) {
            blocks.push({ start: start, end: lines.length });
        }
    }

    // Largest column recorded for a left anchor positioned at or left of x.
    // Keys are parsed with parseFloat so decimal anchor positions are not
    // truncated (a key of "10.7" must not count as left of x = 10.5).
    const lastLeftAnchorBefore = (x) => {
        let lastSnapLeft = 0;
        for (const key in forwardAnchors.left) {
            if (parseFloat(key) <= x) {
                lastSnapLeft = Math.max(lastSnapLeft, forwardAnchors.left[key]);
            }
        }
        return lastSnapLeft;
    };

    // Spaces still owed in front of a snapped box, given what its line already ends with.
    const pendingSpacing = ({ bbox, lineIndex }) => {
        const wanted = bbox.shouldSpace ?? 0;
        const current = rawLines[lineIndex];
        let spaceEnd = current.endsWith(" ") ? 0 : wanted;
        if (wanted > 1) {
            const trailingSpaces = current.length - current.trimEnd().length;
            if (trailingSpaces < wanted) {
                spaceEnd = wanted - trailingSpaces;
            }
        }
        return spaceEnd;
    };

    // Box following (lineIndex, boxIndex) on the same line, or null.
    const followingBox = (lineIndex, boxIndex) => (lines[lineIndex].length > boxIndex + 1 ? lines[lineIndex][boxIndex + 1] : null);

    // Right-pad every rendered line of the block with spaces up to `width`.
    const padBlockLines = (block, width) => {
        for (let index = block.start; index < block.end; ++index) {
            const line = rawLines[index];
            if (line.length < width) {
                rawLines[index] += " ".repeat(width - line.length);
            }
        }
    };

    // Raise targetX to a previously recorded anchor column when that one is larger.
    const atLeast = (targetX, anchorColumn) => (anchorColumn && targetX < anchorColumn ? anchorColumn : targetX);

    // Snapped boxes whose anchor field equals the given snap position.
    const boxesAt = (snapped, anchorField, position) => snapped.filter((item) => item.bbox[anchorField] && parseFloat(item.bbox[anchorField]) === position);

    for (const block of blocks) {
        const { anchorLeft, anchorRight, anchorCenter } = extractAnchorsPointsFromLines(lines.slice(block.start, block.end), page);
        const snapMaps = {
            left: [],
            right: [],
            center: [],
            floating: [],
        };
        const uniqueSnaps = new Set();
        for (const snap in anchorLeft) {
            uniqueSnaps.add(parseFloat(snap));
        }
        snapMaps.left.push(...uniqueSnaps);
        uniqueSnaps.clear();
        for (const snap in anchorRight) {
            uniqueSnaps.add(parseFloat(snap));
        }
        snapMaps.right.push(...uniqueSnaps);
        uniqueSnaps.clear();
        for (const snap in anchorCenter) {
            uniqueSnaps.add(parseFloat(snap));
        }
        snapMaps.center.push(...uniqueSnaps);
        uniqueSnaps.clear();

        let hasChanged = true;
        const leftSnap = [];
        const rightSnap = [];
        const centerSnap = [];

        if (!config.preserveLayoutAlignmentAcrossPages) {
            // Use the block's own median character width; the block median
            // height is not used.
            medianWidth = getMedianTextBoxSize(lines.slice(block.start, block.end).flat()).width;
        }

        // compute snaps
        for (let lineIndex = block.start; lineIndex < block.end; ++lineIndex) {
            const line = lines[lineIndex];
            const forceUnsnapped = !canSnapLine(config, line);
            let prevBbox = null;
            for (let boxIndex = 0; boxIndex < line.length; ++boxIndex) {
                const bbox = line[boxIndex];
                bbox.forceUnsnapped = forceUnsnapped;
                const spaceThreshold = 2;
                // should we add a space between the two bbox?
                // TODO RTL
                if (prevBbox && bbox.x - (prevBbox.x + prevBbox.w) > spaceThreshold) {
                    const xDelta = bbox.x - (prevBbox.x + prevBbox.w);
                    const prevCharWidth = prevBbox.w / prevBbox.strLength;
                    // add a space
                    bbox.shouldSpace = 1;
                    if (xDelta > prevCharWidth * 2) {
                        // Check if both items are in the same column based on gap size
                        // If gap is less than 10% of page width, treat as same column
                        // This works for any number of columns
                        const columnGapThreshold = page.width * 0.1;
                        const bothInSameColumn = xDelta < columnGapThreshold;
                        // insert column spacing if any of:
                        // - gap is more than an approximate tab (8x average char width)
                        // - previous bbox is right snap
                        // - this bbox is left snap
                        // - both previous and this bbox are snaps
                        // otherwise insert floating spacing
                        if ((!bbox.forceUnsnapped && xDelta > prevCharWidth * 8) ||
                            (bbox.snap && bbox.snap === "left") ||
                            (prevBbox.snap && prevBbox.snap === "right") ||
                            (bbox.snap && prevBbox.snap)) {
                            // If both items are in the same column, limit spacing to avoid
                            // preserving justified text gaps from PDFs
                            bbox.shouldSpace = bothInSameColumn ? FLOATING_SPACES : COLUMN_SPACES;
                        }
                        else {
                            // For items in the same column, use minimal spacing
                            bbox.shouldSpace = bothInSameColumn ? 1 : FLOATING_SPACES;
                        }
                    }
                }
                else {
                    bbox.shouldSpace = 0;
                }
                prevBbox = bbox;
                if (!bbox.snap) {
                    uniqueSnaps.add(Math.round(bbox.x));
                }
                else if (bbox.snap === "left") {
                    leftSnap.push({ bbox, lineIndex, boxIndex });
                }
                else if (bbox.snap === "right") {
                    rightSnap.push({ bbox, lineIndex, boxIndex });
                }
                else if (bbox.snap === "center") {
                    centerSnap.push({ bbox, lineIndex, boxIndex });
                }
            }
        }
        snapMaps.floating.push(...uniqueSnaps);
        uniqueSnaps.clear();
        snapMaps.floating.sort((a, b) => a - b);
        snapMaps.center.sort((a, b) => a - b);
        snapMaps.right.sort((a, b) => a - b);
        snapMaps.left.sort((a, b) => a - b);

        while (hasChanged || snapMaps.right.length || snapMaps.left.length || snapMaps.center.length) {
            hasChanged = false;

            // Pass 1: write every unsnapped box that has no pending snap
            // position to its left.
            for (let lineIndex = block.start; lineIndex < block.end; ++lineIndex) {
                const line = lines[lineIndex];
                if (!rawLines[lineIndex]) {
                    rawLines[lineIndex] = "";
                }
                for (let boxIndex = 0; boxIndex < line.length; ++boxIndex) {
                    const bbox = line[boxIndex];
                    if (bbox.rendered) {
                        continue;
                    }
                    if (!bbox.forceUnsnapped) {
                        if (bbox.snap) {
                            continue;
                        }
                        if ((snapMaps.left.length && snapMaps.left[0] < bbox.x) ||
                            (snapMaps.right.length && snapMaps.right[0] < bbox.x) ||
                            (snapMaps.center.length && snapMaps.center[0] < Math.round(bbox.x + bbox.w / 2))) {
                            continue;
                        }
                    }
                    if (!canRenderBbox(line, bbox)) {
                        break;
                    }
                    let targetX = Math.min(Math.round(bbox.x / medianWidth), COLUMN_SPACES);
                    const lineMax = Math.max(lastLeftAnchorBefore(bbox.x), rawLines[lineIndex].trimEnd().length + (bbox.shouldSpace ?? 0));
                    if (targetX < lineMax) {
                        targetX = lineMax;
                    }
                    if (!bbox.forceUnsnapped) {
                        const floatingAnchor = forwardAnchors.floating[Math.round(bbox.x)];
                        if (floatingAnchor && targetX < floatingAnchor) {
                            // Limit floating anchor adjustment to avoid excessive gaps in justified text
                            // Use a small max gap to prevent large spacing within columns
                            const maxFloatingGap = 4;
                            const adjustedAnchor = Math.min(floatingAnchor, targetX + maxFloatingGap);
                            if (adjustedAnchor > targetX) {
                                targetX = adjustedAnchor;
                            }
                        }
                    }
                    rawLines[lineIndex] = rawLines[lineIndex].trimEnd();
                    if (targetX > rawLines[lineIndex].length) {
                        rawLines[lineIndex] += " ".repeat(targetX - rawLines[lineIndex].length);
                    }
                    rawLines[lineIndex] += bbox.str;
                    bbox.rendered = true;
                    hasChanged = true;
                    if (!bbox.forceUnsnapped) {
                        updateForwardAnchors(bbox, followingBox(lineIndex, boxIndex), snapMaps, forwardAnchors, rawLines[lineIndex].length);
                    }
                }
            }

            // Pass 2: consume the smallest pending snap position (left wins
            // ties, then right, then center).
            if (snapMaps.left.length &&
                (!snapMaps.right.length || snapMaps.left[0] <= snapMaps.right[0]) &&
                (!snapMaps.center.length || snapMaps.left[0] <= snapMaps.center[0])) {
                const position = snapMaps.left[0];
                const thisTurnSnap = boxesAt(leftSnap, "leftAnchor", position);
                hasChanged = true;
                if (!thisTurnSnap.length) {
                    snapMaps.left.shift();
                    continue;
                }
                let targetX = Math.min(Math.round(position / medianWidth), COLUMN_SPACES);
                const lineMax = Math.max(...thisTurnSnap.map((v) => rawLines[v.lineIndex].length + pendingSpacing(v) + 1));
                if (targetX < lineMax) {
                    targetX = lineMax;
                }
                targetX = atLeast(targetX, forwardAnchors.left[position]);
                targetX = atLeast(targetX, prevAnchors.forwardAnchorLeft[position]);
                forwardAnchors.left[position] = targetX;
                for (const currentLeftSnapBox of thisTurnSnap) {
                    const lineIndex = currentLeftSnapBox.lineIndex;
                    if (targetX > rawLines[lineIndex].length) {
                        rawLines[lineIndex] += " ".repeat(targetX - rawLines[lineIndex].length);
                    }
                    rawLines[lineIndex] += currentLeftSnapBox.bbox.str;
                    currentLeftSnapBox.bbox.rendered = true;
                    updateForwardAnchors(currentLeftSnapBox.bbox, followingBox(lineIndex, currentLeftSnapBox.boxIndex), snapMaps, forwardAnchors, rawLines[lineIndex].length);
                }
                padBlockLines(block, targetX);
                snapMaps.left.shift();
            }
            else if (snapMaps.right.length &&
                (!snapMaps.left.length || snapMaps.right[0] <= snapMaps.left[0]) &&
                (!snapMaps.center.length || snapMaps.right[0] <= snapMaps.center[0])) {
                const position = snapMaps.right[0];
                hasChanged = true;
                const thisTurnSnap = boxesAt(rightSnap, "rightAnchor", position);
                if (!thisTurnSnap.length) {
                    snapMaps.right.shift();
                    continue;
                }
                // For right-aligned content targetX is the column where the text ends.
                let targetX = Math.min(Math.round(position / medianWidth), COLUMN_SPACES);
                const lineMax = Math.max(...thisTurnSnap.map((v) => Math.max(lastLeftAnchorBefore(v.bbox.x), rawLines[v.lineIndex].trimEnd().length + (v.bbox.shouldSpace ?? 0)) + v.bbox.strLength));
                if (targetX < lineMax) {
                    targetX = lineMax;
                }
                targetX = atLeast(targetX, forwardAnchors.right[position]);
                targetX = atLeast(targetX, prevAnchors.forwardAnchorRight[position]);
                forwardAnchors.right[position] = targetX;
                for (const currentRightSnapBox of thisTurnSnap) {
                    const lineIndex = currentRightSnapBox.lineIndex;
                    const strLength = currentRightSnapBox.bbox.strLength;
                    rawLines[lineIndex] = rawLines[lineIndex].trimEnd();
                    if (targetX > rawLines[lineIndex].length + strLength) {
                        rawLines[lineIndex] += " ".repeat(targetX - rawLines[lineIndex].length - strLength);
                    }
                    rawLines[lineIndex] += currentRightSnapBox.bbox.str;
                    currentRightSnapBox.bbox.rendered = true;
                    updateForwardAnchors(currentRightSnapBox.bbox, followingBox(lineIndex, currentRightSnapBox.boxIndex), snapMaps, forwardAnchors, rawLines[lineIndex].length);
                }
                padBlockLines(block, targetX);
                snapMaps.right.shift();
            }
            else if (snapMaps.center.length &&
                (!snapMaps.left.length || snapMaps.center[0] <= snapMaps.left[0]) &&
                (!snapMaps.right.length || snapMaps.center[0] <= snapMaps.right[0])) {
                const position = snapMaps.center[0];
                hasChanged = true;
                const thisTurnSnap = boxesAt(centerSnap, "centerAnchor", position);
                if (!thisTurnSnap.length) {
                    snapMaps.center.shift();
                    continue;
                }
                // For centered content targetX is the column of the text's midpoint.
                let targetX = Math.min(Math.round(position / medianWidth), COLUMN_SPACES);
                const lineMax = Math.max(...thisTurnSnap.map((v) => rawLines[v.lineIndex].length + Math.round(v.bbox.strLength / 2) + pendingSpacing(v)));
                if (targetX < lineMax) {
                    targetX = lineMax;
                }
                targetX = atLeast(targetX, forwardAnchors.center[position]);
                targetX = atLeast(targetX, prevAnchors.forwardAnchorCenter[position]);
                forwardAnchors.center[position] = targetX;
                for (const currentCenterSnapBox of thisTurnSnap) {
                    const lineIndex = currentCenterSnapBox.lineIndex;
                    const halfLength = Math.round(currentCenterSnapBox.bbox.strLength / 2);
                    if (targetX > rawLines[lineIndex].length + halfLength) {
                        rawLines[lineIndex] += " ".repeat(targetX - rawLines[lineIndex].length - halfLength);
                    }
                    rawLines[lineIndex] += currentCenterSnapBox.bbox.str;
                    currentCenterSnapBox.bbox.rendered = true;
                }
                snapMaps.center.shift();
            }
        }
    }

    fixSparseBlocks(blocks, rawLines);
    const text = rawLines.join("\n");
    // OSS: Return text instead of mutating page object
    return {
        text,
        prevAnchors: {
            forwardAnchorLeft: forwardAnchors.left,
            forwardAnchorRight: forwardAnchors.right,
            forwardAnchorCenter: forwardAnchors.center,
        },
    };
}
|
|
1354
|
+
/**
 * Run the grid projection over every page and collect the per-page results.
 *
 * When `config.preserveLayoutAlignmentAcrossPages` is set, the forward
 * anchors produced by each page are merged into the anchors handed to the
 * following pages; otherwise every page starts from empty anchors.
 *
 * @param {Array<object>} pages - Pages exposing `pageNum`, `width`, `height` and `textItems`.
 * @param {object} config - Projection configuration, forwarded to the helpers.
 * @returns {Array<{pageNum: *, width: *, height: *, text: string, textItems: *, boundingBoxes: Array}>}
 *   One entry per page, after `cleanRawText` has been applied to the list.
 */
export function projectPagesToGrid(pages, config) {
    const ANCHOR_FIELDS = ["forwardAnchorLeft", "forwardAnchorRight", "forwardAnchorCenter"];
    const carriedAnchors = {
        forwardAnchorLeft: {},
        forwardAnchorRight: {},
        forwardAnchorCenter: {},
    };
    const results = [];
    for (const page of pages) {
        // Build projection boxes from the page's text items, then project them.
        const pageBoxes = buildBbox(page, config);
        const projected = projectToGrid(config, page, pageBoxes, carriedAnchors, pages.length);
        const text = projected.text;
        const pageAnchors = projected.prevAnchors;
        if (config.preserveLayoutAlignmentAcrossPages) {
            // Carry this page's anchors forward to the pages that follow.
            for (const field of ANCHOR_FIELDS) {
                for (const anchor in pageAnchors[field]) {
                    carriedAnchors[field][anchor] = pageAnchors[field][anchor];
                }
            }
        }
        const { pageNum, width, height, textItems } = page;
        results.push({ pageNum, width, height, text, textItems, boundingBoxes: [] });
    }
    // Clean raw text (margin detection, etc)
    cleanRawText(results, config);
    return results;
}
|
|
1392
|
+
//# sourceMappingURL=gridProjection.js.map
|