playa-pdf 0.8.0__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/.gitignore +1 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/CHANGELOG.md +12 -1
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/PKG-INFO +2 -2
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/README.md +1 -1
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/_version.py +2 -2
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/ccitt.py +2 -1
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cli.py +1 -2
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/content.py +97 -42
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/data/metadata.py +2 -2
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/document.py +4 -3
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/font.py +8 -8
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/image.py +3 -3
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/interp.py +29 -13
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/miner.py +86 -67
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/page.py +1 -2
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/parser.py +4 -6
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/pdftypes.py +1 -3
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/security.py +1 -1
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/utils.py +4 -8
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/pyproject.toml +4 -1
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_document.py +4 -2
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_miner.py +24 -2
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_pdftypes.py +4 -4
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_xref.py +2 -7
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/.flake8 +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/.gitattributes +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/.gitmodules +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/LICENSE +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/MANIFEST.in +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/benchmark.sh +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/ccitt_decode.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/converter.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/marked_content.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/miner.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/objects.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/parallel.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/parser.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/png_predict.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/structure.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/text.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/tiff_predict.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/type3_charproc.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/docs/adobe-spiderman.jpg +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/docs/cli.md +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/docs/data.md +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/docs/index.md +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/docs/reference.md +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/docs/working-in-the-pdf-mine.md +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/mkdocs.yml +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/__init__.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/_saslprep.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/arcfour.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/ascii85.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78ms-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78ms-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/83pv-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/83pv-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/90ms-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/90ms-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/90msp-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/90msp-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/90pv-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/90pv-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Add-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Add-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Add-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Add-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/B5pc-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/B5pc-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/CNS-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/CNS-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/CNS1-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/CNS1-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/CNS2-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/CNS2-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/ETHK-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/ETHK-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/ETen-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/ETen-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/ETenms-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/ETenms-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Ext-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Ext-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Ext-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Ext-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GB-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GB-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GB-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GB-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBK-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBK-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBK2K-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBK2K-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBKp-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBKp-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBT-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBT-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBT-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBT-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBTpc-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBTpc-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBpc-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBpc-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKdla-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKdla-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKdlb-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKdlb-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKgccs-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKgccs-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKm314-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKm314-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKm471-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKm471-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKscs-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKscs-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Hankaku-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Hankaku-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Hiragana-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Hiragana-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSC-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSC-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSC-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSC-Johab-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSC-Johab-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSC-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSCms-UHC-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSCms-UHC-HW-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSCms-UHC-HW-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSCms-UHC-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSCpc-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSCpc-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Katakana-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Katakana-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Makefile +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/NWP-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/NWP-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/README.txt +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Roman-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Roman-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniAKR-UTF16-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniAKR-UTF16-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniAKR-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniAKR-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniAKR-UTF8-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniAKR-UTF8-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UCS2-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UCS2-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UTF16-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UTF16-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UTF8-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UTF8-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UCS2-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UCS2-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UTF16-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UTF16-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UTF8-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UTF8-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UCS2-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UCS2-HW-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UCS2-HW-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UCS2-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UTF16-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UTF16-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UTF8-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UTF8-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS2004-UTF16-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS2004-UTF16-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS2004-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS2004-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS2004-UTF8-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS2004-UTF8-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJISX0213-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJISX0213-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJISX02132004-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJISX02132004-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UCS2-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UCS2-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UTF16-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UTF16-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UTF8-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UTF8-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniManga-UTF16-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniManga-UTF16-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniManga-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniManga-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniManga-UTF8-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniManga-UTF8-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/WP-Symbol-H.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/WP-Symbol-V.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/to-unicode-Adobe-CNS1.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/to-unicode-Adobe-GB1.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/to-unicode-Adobe-Japan1.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/to-unicode-Adobe-KR.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/to-unicode-Adobe-Korea1.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/to-unicode-Adobe-Manga1.pickle.gz +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmapdb.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/color.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/data/__init__.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/data/_asobj.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/data/content.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/data_structures.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/encodingdb.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/encodings.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/exceptions.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/fontmetrics.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/fontprogram.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/glyphlist.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/lzw.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/outline.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/py.typed +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/runlength.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/structure.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/worker.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/xref.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/README +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/acroform/AcroForm_TEST.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/acroform/AcroForm_TEST_compiled.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/actualtext.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/ascii_tounicode.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/broken_xobjects.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/character_spacing.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/character_spacing_glyphs.json +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/character_spacing_texts.json +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/core_font_encodings.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/duplicate_encoding_tounicode.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/aes-128-m.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/aes-128.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/aes-256-m.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/aes-256-r6.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/aes-256.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/base.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/encrypted_doc_no_id.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/rc4-128.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/rc4-40.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/evil_cmap.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/evil_xobjects.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/extgstate.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/font-size-test.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/graphics_state_in_text_object.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/hello_structure.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/image_structure.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/jo.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/junk_before_header.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/missing_rolemap.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/multi-xrefs.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/pdf_structure.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotated.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotated_type3_fonts.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/0.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/0mb.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/180.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/180mb.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/270.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/270mb.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/90.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/90mb.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/sampleOneByteIdentityEncode.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/scancode/patchelf.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/simple1.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/simple2.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/simple3.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/simple3_glyphs.json +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/simple3_texts.json +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/simple4.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/simple5.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/structure_xobjects.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/structure_xobjects_2.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/test_pdf_with_tiff_predictor.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/text_displacement.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/text_side_effects.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/text_space.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/type3_fonts.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/uncoloured-tiling-pattern.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/utf16_tounicode.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/utf8_tounicode.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/vertical_writing.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/vertical_writing_glyphs.json +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/vertical_writing_offset.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/vertical_writing_offset_glyphs.json +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/vertical_writing_offset_texts.json +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/vertical_writing_texts.json +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/xobject_graphicstate.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/zen_of_python_corrupted.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/__init__.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/bad_operators.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/bad_pages.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/bad_resources.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/cmap-encoding.txt +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/cmap-onebyte-encoding.txt +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/cmap-tounicode.txt +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/data.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/fallback-xref.pdf +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/issue18117-encoding.txt +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/issue18117-tounicode.txt +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/issue9367-tounicode.txt +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_cli.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_cmapdb.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_crypto.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_data.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_encodingdb.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_fonts.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_indirect_objects.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_interp.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_lazy_api.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_lexer.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_open.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_outline.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_page.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_parallel.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_parser.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_pdfminer_ccitt.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_structure.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_text.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_utils.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tools/conv_afm.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tools/conv_cmap.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tools/conv_glyphlist.py +0 -0
- {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tools/create_json_schema.py +0 -0
|
@@ -1,10 +1,21 @@
|
|
|
1
|
+
## PLAYA 0.9.0: Unreleased
|
|
2
|
+
|
|
3
|
+
- Refactor and add convenience methods to text objects
|
|
4
|
+
- Insert blank pages for missing object references in page tree
|
|
5
|
+
- Clean up type annotations (breaking change: PDFObject can no longer
|
|
6
|
+
be `str`, as the parser will never create this)
|
|
7
|
+
|
|
8
|
+
## PLAYA 0.8.1: 2025-12-22
|
|
9
|
+
|
|
10
|
+
- Correct subtle issues with mypyc-compiled pdfminer.six code
|
|
11
|
+
|
|
1
12
|
## PLAYA 0.8.0: 2025-12-17
|
|
2
13
|
|
|
3
14
|
- Optionally accelerate image decoding with mypyc
|
|
4
15
|
- Correct explicit string positioning in vertical text
|
|
5
16
|
- Restore caching in text decoding under Python 3.8
|
|
6
17
|
- Bring back pdfminer.six layout analysis algorithm
|
|
7
|
-
-
|
|
18
|
+
- Optionally accelerate pdfminer.six compatibility with mypyc
|
|
8
19
|
|
|
9
20
|
## PLAYA 0.7.2: 2025-11-09
|
|
10
21
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: playa-pdf
|
|
3
|
-
Version: 0.8.0
|
|
3
|
+
Version: 0.9.0
|
|
4
4
|
Summary: Parallel and LazY Analyzer for PDFs
|
|
5
5
|
Project-URL: Homepage, https://dhdaines.github.io/playa
|
|
6
6
|
Author-email: David Huggins-Daines <dhd@ecolingui.ca>
|
|
@@ -52,7 +52,7 @@ analysis](https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_t
|
|
|
52
52
|
algorithm from
|
|
53
53
|
[pdfminer.six](https://github.com/pdfminer/pdfminer.six) anyways. See
|
|
54
54
|
[the
|
|
55
|
-
documentation](https://dhdaines.github.io/playa/working-in-the-pdf-mine)
|
|
55
|
+
documentation](https://dhdaines.github.io/playa/latest/working-in-the-pdf-mine)
|
|
56
56
|
for more information on how to migrate your code. You may be
|
|
57
57
|
interested to know that PLAYA's implementation is also 15-50% faster,
|
|
58
58
|
depending on how many CPUs you use.
|
|
@@ -21,7 +21,7 @@ analysis](https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_t
|
|
|
21
21
|
algorithm from
|
|
22
22
|
[pdfminer.six](https://github.com/pdfminer/pdfminer.six) anyways. See
|
|
23
23
|
[the
|
|
24
|
-
documentation](https://dhdaines.github.io/playa/working-in-the-pdf-mine)
|
|
24
|
+
documentation](https://dhdaines.github.io/playa/latest/working-in-the-pdf-mine)
|
|
25
25
|
for more information on how to migrate your code. You may be
|
|
26
26
|
interested to know that PLAYA's implementation is also 15-50% faster,
|
|
27
27
|
depending on how many CPUs you use.
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.8.0'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 8, 0)
|
|
31
|
+
__version__ = version = '0.9.0'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 9, 0)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -492,7 +492,8 @@ class CCITTG4Parser(BitParser):
|
|
|
492
492
|
print(y, "".join(str(b) for b in bits))
|
|
493
493
|
|
|
494
494
|
def _reset_line(self) -> None:
|
|
495
|
-
#
|
|
495
|
+
# We could just swap them, like in PNG prediction, though it's
|
|
496
|
+
# not clear that would be much faster.
|
|
496
497
|
self._refline = self._curline
|
|
497
498
|
self._curline = [1] * self.width
|
|
498
499
|
self._curpos = -1
|
|
@@ -105,7 +105,6 @@ from playa.pdftypes import (
|
|
|
105
105
|
from playa.structure import ContentItem
|
|
106
106
|
from playa.structure import ContentObject as StructContentObject
|
|
107
107
|
from playa.structure import Element
|
|
108
|
-
from playa.utils import decode_text
|
|
109
108
|
|
|
110
109
|
LOG = logging.getLogger(__name__)
|
|
111
110
|
|
|
@@ -486,7 +485,7 @@ def _extract_outline_item(item: Outline, indent: int, outfh: TextIO) -> bool:
|
|
|
486
485
|
|
|
487
486
|
print(f"{ws}{{", file=outfh)
|
|
488
487
|
if item.title is not None:
|
|
489
|
-
format_attr("title", decode_text(item.title))
|
|
488
|
+
format_attr("title", item.title)
|
|
490
489
|
if item.destination is not None:
|
|
491
490
|
format_attr("destination", asobj(item.destination))
|
|
492
491
|
if s:
|
|
@@ -4,6 +4,7 @@ PDF content objects created by the interpreter.
|
|
|
4
4
|
|
|
5
5
|
import itertools
|
|
6
6
|
import logging
|
|
7
|
+
from abc import abstractmethod
|
|
7
8
|
from copy import copy
|
|
8
9
|
from dataclasses import dataclass
|
|
9
10
|
from typing import (
|
|
@@ -800,30 +801,95 @@ class PathObject(ContentObject):
|
|
|
800
801
|
return transform_bbox(self.ctm, bbox)
|
|
801
802
|
|
|
802
803
|
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
# dx, dy = apply_matrix_norm(self.matrix, (1, 0))
|
|
806
|
-
dx, dy, _, _, _, _ = matrix
|
|
807
|
-
else:
|
|
808
|
-
# dx, dy = apply_matrix_norm(self.matrix, (0, 1))
|
|
809
|
-
_, _, dx, dy, _, _ = matrix
|
|
810
|
-
if dx == 0: # Nearly always true
|
|
811
|
-
return abs(dy)
|
|
812
|
-
elif dy == 0:
|
|
813
|
-
return abs(dx)
|
|
814
|
-
else:
|
|
815
|
-
import math
|
|
804
|
+
class TextBase(ContentObject):
|
|
805
|
+
"""Common properties for text and glyph objects."""
|
|
816
806
|
|
|
817
|
-
|
|
807
|
+
@property
|
|
808
|
+
@abstractmethod
|
|
809
|
+
def matrix(self) -> Matrix: ...
|
|
810
|
+
|
|
811
|
+
@property
|
|
812
|
+
def font(self) -> Font:
|
|
813
|
+
"""Font for this text object."""
|
|
814
|
+
font = self.gstate.font
|
|
815
|
+
assert font is not None
|
|
816
|
+
return font
|
|
817
|
+
|
|
818
|
+
@property
|
|
819
|
+
def size(self) -> float:
|
|
820
|
+
"""Font size for this text object.
|
|
821
|
+
|
|
822
|
+
This is the actual font size in device space, which is **not**
|
|
823
|
+
the same as `GraphicState.fontsize`. That's the font size in
|
|
824
|
+
text space which is not a very useful number (it's usually 1).
|
|
825
|
+
"""
|
|
826
|
+
vert = False if self.gstate.font is None else self.gstate.font.vertical
|
|
827
|
+
if vert:
|
|
828
|
+
# dx, dy = apply_matrix_norm(self.matrix, (1, 0))
|
|
829
|
+
dx, dy, _, _, _, _ = self.matrix
|
|
830
|
+
else:
|
|
831
|
+
# dx, dy = apply_matrix_norm(self.matrix, (0, 1))
|
|
832
|
+
_, _, dx, dy, _, _ = self.matrix
|
|
833
|
+
if dx == 0: # Nearly always true
|
|
834
|
+
return abs(dy)
|
|
835
|
+
elif dy == 0:
|
|
836
|
+
return abs(dx)
|
|
837
|
+
else:
|
|
838
|
+
import math
|
|
839
|
+
|
|
840
|
+
return math.sqrt(dx * dx + dy * dy)
|
|
841
|
+
|
|
842
|
+
@property
|
|
843
|
+
def fontname(self) -> str:
|
|
844
|
+
"""Font name for this text object"""
|
|
845
|
+
return self.font.fontname
|
|
846
|
+
|
|
847
|
+
@property
|
|
848
|
+
def fontbase(self) -> str:
|
|
849
|
+
"""Original font name for this text object.
|
|
850
|
+
|
|
851
|
+
Fonts in PDF files are usually "subsetted", meaning only the
|
|
852
|
+
glyphs actually used in the document are included. In this
|
|
853
|
+
case the font's `fontname` property usually consists of an
|
|
854
|
+
arbitrary "tag", plus (literally, a `+`) and the original
|
|
855
|
+
name. This is a convenience property to get that original
|
|
856
|
+
name.
|
|
857
|
+
|
|
858
|
+
This is not the same as `GraphicState.font.basefont` which
|
|
859
|
+
usually also includes the subset tag.
|
|
860
|
+
|
|
861
|
+
"""
|
|
862
|
+
fontname = self.fontname
|
|
863
|
+
subset, _, base = fontname.partition("+")
|
|
864
|
+
if base:
|
|
865
|
+
return base
|
|
866
|
+
return fontname
|
|
867
|
+
|
|
868
|
+
@property
def textfont(self) -> str:
    """Compact "name size" description of the font.

    Combines `fontbase` with `size` rounded to the nearest integer,
    e.g. "Helvetica 12".
    """
    base_name = self.fontbase
    rounded_size = round(self.size)
    return f"{base_name} {rounded_size}"
|
|
875
|
+
|
|
876
|
+
@property
def origin(self) -> Point:
    """Device-space origin of this text object.

    This is the translation part of `self.matrix`, i.e. its last two
    entries.
    """
    _a, _b, _c, _d, x, y = self.matrix
    return (x, y)
|
|
818
881
|
|
|
819
882
|
|
|
820
883
|
@dataclass
|
|
821
|
-
class GlyphObject(
|
|
884
|
+
class GlyphObject(TextBase):
|
|
822
885
|
"""Individual glyph on the page.
|
|
823
886
|
|
|
824
887
|
Attributes:
|
|
825
888
|
font: Font for this glyph.
|
|
826
889
|
size: Effective font size for this glyph.
|
|
890
|
+
fontname: Font name.
|
|
891
|
+
fontbase: Short (non-subset) font name.
|
|
892
|
+
textfont: Combined short name and size for the font.
|
|
827
893
|
cid: Character ID for this glyph.
|
|
828
894
|
text: Unicode mapping of this glyph, if any.
|
|
829
895
|
matrix: Rendering matrix `T_rm` for this glyph, which transforms
|
|
@@ -837,7 +903,7 @@ class GlyphObject(ContentObject):
|
|
|
837
903
|
|
|
838
904
|
cid: int
|
|
839
905
|
text: Union[str, None]
|
|
840
|
-
|
|
906
|
+
_matrix: Matrix
|
|
841
907
|
_displacement: float
|
|
842
908
|
_corners: bool
|
|
843
909
|
|
|
@@ -881,20 +947,12 @@ class GlyphObject(ContentObject):
|
|
|
881
947
|
return itor
|
|
882
948
|
|
|
883
949
|
@property
|
|
884
|
-
def
|
|
885
|
-
|
|
886
|
-
assert font is not None
|
|
887
|
-
return font
|
|
888
|
-
|
|
889
|
-
@property
|
|
890
|
-
def size(self) -> float:
|
|
891
|
-
vert = False if self.gstate.font is None else self.gstate.font.vertical
|
|
892
|
-
return _font_size(self.matrix, vert)
|
|
950
|
+
def matrix(self) -> Matrix:
    """Rendering matrix `T_rm` for this glyph, as stored at creation."""
    return self._matrix
|
|
893
952
|
|
|
894
953
|
@property
|
|
895
|
-
def
|
|
896
|
-
|
|
897
|
-
return dx, dy
|
|
954
|
+
def chars(self) -> str:
    """Text of this glyph, or the empty string when `text` is unset."""
    mapped = self.text
    return mapped if mapped else ""
|
|
898
956
|
|
|
899
957
|
@property
|
|
900
958
|
def displacement(self) -> Point:
|
|
@@ -932,7 +990,7 @@ class GlyphObject(ContentObject):
|
|
|
932
990
|
|
|
933
991
|
|
|
934
992
|
@dataclass
|
|
935
|
-
class TextObject(
|
|
993
|
+
class TextObject(TextBase):
|
|
936
994
|
"""Text object (contains one or more glyphs).
|
|
937
995
|
|
|
938
996
|
Attributes:
|
|
@@ -943,7 +1001,11 @@ class TextObject(ContentObject):
|
|
|
943
1001
|
origin: Origin of this text object in device space.
|
|
944
1002
|
displacement: Vector to the origin of the next text object in
|
|
945
1003
|
device space.
|
|
1004
|
+
font: Font for this text object.
|
|
946
1005
|
size: Effective font size for this text object.
|
|
1006
|
+
fontname: Font name.
|
|
1007
|
+
fontbase: Short (non-subset) font name.
|
|
1008
|
+
textfont: Combined short name and size for the font.
|
|
947
1009
|
text_matrix: Text matrix `T_m` for this text object, which
|
|
948
1010
|
transforms text space coordinates to user space.
|
|
949
1011
|
line_matrix: Text line matrix `T_lm` for this text object, which
|
|
@@ -966,6 +1028,7 @@ class TextObject(ContentObject):
|
|
|
966
1028
|
_chars: Union[List[str], None] = None
|
|
967
1029
|
_bbox: Union[Rect, None] = None
|
|
968
1030
|
_next_glyph_offset: Union[Point, None] = None
|
|
1031
|
+
_displacement: Union[Point, None] = None
|
|
969
1032
|
|
|
970
1033
|
def __iter__(self) -> Iterator[GlyphObject]:
|
|
971
1034
|
"""Generate glyphs for this text object"""
|
|
@@ -1048,7 +1111,7 @@ class TextObject(ContentObject):
|
|
|
1048
1111
|
mcstack=self.mcstack,
|
|
1049
1112
|
cid=cid,
|
|
1050
1113
|
text=text,
|
|
1051
|
-
|
|
1114
|
+
_matrix=matrix,
|
|
1052
1115
|
_displacement=disp,
|
|
1053
1116
|
_corners=corners,
|
|
1054
1117
|
)
|
|
@@ -1091,11 +1154,6 @@ class TextObject(ContentObject):
|
|
|
1091
1154
|
)
|
|
1092
1155
|
return self._matrix
|
|
1093
1156
|
|
|
1094
|
-
@property
|
|
1095
|
-
def size(self) -> float:
|
|
1096
|
-
vert = False if self.gstate.font is None else self.gstate.font.vertical
|
|
1097
|
-
return _font_size(self.matrix, vert)
|
|
1098
|
-
|
|
1099
1157
|
@property
|
|
1100
1158
|
def scaling_matrix(self) -> Matrix:
|
|
1101
1159
|
horizontal_scaling = self.gstate.scaling * 0.01
|
|
@@ -1113,15 +1171,11 @@ class TextObject(ContentObject):
|
|
|
1113
1171
|
def text_matrix(self) -> Matrix:
|
|
1114
1172
|
return translate_matrix(self.line_matrix, self._glyph_offset)
|
|
1115
1173
|
|
|
1116
|
-
@property
|
|
1117
|
-
def origin(self) -> Point:
|
|
1118
|
-
_, _, _, _, dx, dy = self.matrix
|
|
1119
|
-
return dx, dy
|
|
1120
|
-
|
|
1121
1174
|
@property
|
|
1122
1175
|
def displacement(self) -> Point:
|
|
1176
|
+
if self._displacement is not None:
|
|
1177
|
+
return self._displacement
|
|
1123
1178
|
matrix = self.matrix
|
|
1124
|
-
# FIXME: This should be either cached or optimized
|
|
1125
1179
|
next_matrix = mult_matrix(
|
|
1126
1180
|
self.scaling_matrix,
|
|
1127
1181
|
mult_matrix(
|
|
@@ -1129,7 +1183,8 @@ class TextObject(ContentObject):
|
|
|
1129
1183
|
self.ctm,
|
|
1130
1184
|
),
|
|
1131
1185
|
)
|
|
1132
|
-
|
|
1186
|
+
self._displacement = next_matrix[-2] - matrix[-2], next_matrix[-1] - matrix[-1]
|
|
1187
|
+
return self._displacement
|
|
1133
1188
|
|
|
1134
1189
|
@property
|
|
1135
1190
|
def bbox(self) -> Rect:
|
|
@@ -39,7 +39,7 @@ from playa.structure import ContentItem as _StructContentItem
|
|
|
39
39
|
from playa.structure import ContentObject as _StructContentObject
|
|
40
40
|
from playa.structure import Element as _Element
|
|
41
41
|
from playa.structure import Tree as _Tree
|
|
42
|
-
from playa.utils import Matrix, Rect
|
|
42
|
+
from playa.utils import Matrix, Rect
|
|
43
43
|
|
|
44
44
|
log = logging.getLogger(__name__)
|
|
45
45
|
|
|
@@ -556,7 +556,7 @@ def asobj_stream(obj: _ContentStream) -> Dict:
|
|
|
556
556
|
def asobj_outline(obj: _Outline, recurse: bool = True) -> Outline:
|
|
557
557
|
out = Outline()
|
|
558
558
|
if obj.title is not None:
|
|
559
|
-
out["title"] =
|
|
559
|
+
out["title"] = obj.title
|
|
560
560
|
if obj.destination is not None:
|
|
561
561
|
out["destination"] = asobj(obj.destination)
|
|
562
562
|
if recurse:
|
|
@@ -447,7 +447,7 @@ class Document:
|
|
|
447
447
|
if m is None:
|
|
448
448
|
raise PDFSyntaxError(
|
|
449
449
|
f"Not an indirect object at position {pos}: "
|
|
450
|
-
f"{self.buffer[pos:pos+8]!r}"
|
|
450
|
+
f"{self.buffer[pos : pos + 8]!r}"
|
|
451
451
|
)
|
|
452
452
|
_, obj = next(self.parser)
|
|
453
453
|
if obj.objid != objid:
|
|
@@ -695,8 +695,9 @@ class Document:
|
|
|
695
695
|
try:
|
|
696
696
|
page_object = dict_value(self[object_id])
|
|
697
697
|
except IndexError as e:
|
|
698
|
-
log.warning("
|
|
699
|
-
|
|
698
|
+
log.warning("Missing page object: %s", e)
|
|
699
|
+
# Create an empty page to match what pdfium does
|
|
700
|
+
page_object = {"Type": LIT("Page")}
|
|
700
701
|
|
|
701
702
|
# Avoid recursion errors by keeping track of visited nodes
|
|
702
703
|
# (again, this should never actually happen in a valid PDF)
|
|
@@ -88,7 +88,7 @@ class Font:
|
|
|
88
88
|
fontname = resolve1(descriptor.get("FontName"))
|
|
89
89
|
if isinstance(fontname, PSLiteral):
|
|
90
90
|
self.fontname = literal_name(fontname)
|
|
91
|
-
elif isinstance(fontname,
|
|
91
|
+
elif isinstance(fontname, bytes):
|
|
92
92
|
self.fontname = decode_text(fontname)
|
|
93
93
|
else:
|
|
94
94
|
self.fontname = "unknown"
|
|
@@ -532,16 +532,16 @@ class CIDFont(Font):
|
|
|
532
532
|
# These are *supposed* to be ASCII (PDF 1.7 section 9.7.3),
|
|
533
533
|
# but for whatever reason they are sometimes UTF-16BE
|
|
534
534
|
cid_registry = resolve1(self.cidsysteminfo.get("Registry"))
|
|
535
|
-
if isinstance(cid_registry,
|
|
536
|
-
|
|
535
|
+
if isinstance(cid_registry, bytes):
|
|
536
|
+
regstr = decode_text(cid_registry).strip()
|
|
537
537
|
else:
|
|
538
|
-
|
|
538
|
+
regstr = "unknown"
|
|
539
539
|
cid_ordering = resolve1(self.cidsysteminfo.get("Ordering"))
|
|
540
|
-
if isinstance(cid_ordering,
|
|
541
|
-
|
|
540
|
+
if isinstance(cid_ordering, bytes):
|
|
541
|
+
ordstr = decode_text(cid_ordering).strip()
|
|
542
542
|
else:
|
|
543
|
-
|
|
544
|
-
self.cidcoding = f"{
|
|
543
|
+
ordstr = "unknown"
|
|
544
|
+
self.cidcoding = f"{regstr}-{ordstr}"
|
|
545
545
|
self.cmap: CMapBase = self.get_cmap_from_spec(spec)
|
|
546
546
|
|
|
547
547
|
try:
|
|
@@ -337,7 +337,7 @@ def write_cmyk_tiff(
|
|
|
337
337
|
|
|
338
338
|
# 6. --- Write the Actual Pixel Data ---
|
|
339
339
|
# The current file position should now match `offset_image_data`
|
|
340
|
-
assert (
|
|
341
|
-
outfh.tell()
|
|
342
|
-
)
|
|
340
|
+
assert outfh.tell() == offset_image_data, (
|
|
341
|
+
f"File position mismatch: at {outfh.tell()}, expected {offset_image_data}"
|
|
342
|
+
)
|
|
343
343
|
outfh.write(data)
|
|
@@ -19,7 +19,6 @@ from typing import (
|
|
|
19
19
|
Tuple,
|
|
20
20
|
Union,
|
|
21
21
|
Sequence,
|
|
22
|
-
cast,
|
|
23
22
|
)
|
|
24
23
|
|
|
25
24
|
from playa.color import PREDEFINED_COLORSPACE, ColorSpace, get_colorspace
|
|
@@ -58,7 +57,7 @@ from playa.pdftypes import (
|
|
|
58
57
|
resolve1,
|
|
59
58
|
stream_value,
|
|
60
59
|
)
|
|
61
|
-
from playa.utils import
|
|
60
|
+
from playa.utils import mult_matrix
|
|
62
61
|
from playa.worker import _deref_document
|
|
63
62
|
|
|
64
63
|
if TYPE_CHECKING:
|
|
@@ -440,7 +439,7 @@ class LazyInterpreter:
|
|
|
440
439
|
# Inline images are not XObjects, have no xobjid
|
|
441
440
|
return self.render_image(None, obj)
|
|
442
441
|
else:
|
|
443
|
-
|
|
442
|
+
log.warning("EI has unknown argument type: %r", obj)
|
|
444
443
|
return None
|
|
445
444
|
|
|
446
445
|
def do_Do(self, xobjid_arg: PDFObject) -> Union[ContentObject, None]:
|
|
@@ -452,8 +451,7 @@ class LazyInterpreter:
|
|
|
452
451
|
log.debug("Undefined xobject id: %r", xobjid)
|
|
453
452
|
return None
|
|
454
453
|
except TypeError as e:
|
|
455
|
-
|
|
456
|
-
return None
|
|
454
|
+
raise TypeError(f"Empty or invalid xobject with id {xobjid!r}") from e
|
|
457
455
|
subtype = xobj.get("Subtype")
|
|
458
456
|
if subtype is LITERAL_FORM:
|
|
459
457
|
# PDF Ref 1.7, # 4.9
|
|
@@ -530,7 +528,15 @@ class LazyInterpreter:
|
|
|
530
528
|
f1: PDFObject,
|
|
531
529
|
) -> None:
|
|
532
530
|
"""Concatenate matrix to current transformation matrix"""
|
|
533
|
-
|
|
531
|
+
cm = (
|
|
532
|
+
num_value(a1),
|
|
533
|
+
num_value(b1),
|
|
534
|
+
num_value(c1),
|
|
535
|
+
num_value(d1),
|
|
536
|
+
num_value(e1),
|
|
537
|
+
num_value(f1),
|
|
538
|
+
)
|
|
539
|
+
self.ctm = mult_matrix(cm, self.ctm)
|
|
534
540
|
|
|
535
541
|
def do_w(self, linewidth: PDFObject) -> None:
|
|
536
542
|
"""Set line width"""
|
|
@@ -557,8 +563,11 @@ class LazyInterpreter:
|
|
|
557
563
|
"""Set color rendering intent"""
|
|
558
564
|
if self.ignore_colours:
|
|
559
565
|
return
|
|
560
|
-
|
|
561
|
-
|
|
566
|
+
if isinstance(intent, PSLiteral):
|
|
567
|
+
# Should possibly check that it is a valid intent
|
|
568
|
+
self.graphicstate.intent = intent
|
|
569
|
+
else:
|
|
570
|
+
raise TypeError(f"Not a name: {intent!r}")
|
|
562
571
|
|
|
563
572
|
def do_i(self, flatness: PDFObject) -> None:
|
|
564
573
|
"""Set flatness tolerance"""
|
|
@@ -600,7 +609,12 @@ class LazyInterpreter:
|
|
|
600
609
|
if isinstance(bm, PSLiteral):
|
|
601
610
|
self.graphicstate.blend_mode = bm
|
|
602
611
|
else:
|
|
603
|
-
|
|
612
|
+
# The blend mode is not a single name here, so treat it as a list and
# require every element to be a name.
bml: List[PSLiteral] = []
for x in list_value(bm):
    # isinstance() takes (object, classinfo).  The previous call had the
    # arguments swapped and the test inverted, so it raised for every
    # element, including valid names.
    if not isinstance(x, PSLiteral):
        raise TypeError(f"Not a name: {x!r}")
    bml.append(x)
self.graphicstate.blend_mode = bml
|
|
604
618
|
if "SMask" in extgstate:
|
|
605
619
|
smask = extgstate["SMask"]
|
|
606
620
|
if isinstance(smask, PSLiteral):
|
|
@@ -883,8 +897,8 @@ class LazyInterpreter:
|
|
|
883
897
|
e_new = tx * a + ty * c + e
|
|
884
898
|
f_new = tx * b + ty * d + f
|
|
885
899
|
self.textstate.line_matrix = (a, b, c, d, e_new, f_new)
|
|
886
|
-
except TypeError:
|
|
887
|
-
|
|
900
|
+
except TypeError as e:
|
|
901
|
+
raise TypeError(f"Invalid offset ({tx!r}, {ty!r})") from e
|
|
888
902
|
self.textstate.glyph_offset = (0, 0)
|
|
889
903
|
|
|
890
904
|
def do_TD(self, tx: PDFObject, ty: PDFObject) -> None:
|
|
@@ -969,12 +983,14 @@ class LazyInterpreter:
|
|
|
969
983
|
def begin_tag(self, tag: PDFObject, props: Dict[str, PDFObject]) -> None:
    """Open a marked-content section for `tag`.

    The section records the name of `tag`, the given `props`, and an
    MCID taken from `props["MCID"]` when that key is present (`None`
    otherwise).  It is appended to `self.mcstack`.
    """
    assert isinstance(tag, PSLiteral)
    mcid = int_value(props["MCID"]) if "MCID" in props else None
    section = MarkedContent(mcid=mcid, tag=tag.name, props=props)
    # Rebind to a new tuple instead of modifying the existing stack.
    self.mcstack = (*self.mcstack, section)
|
|
978
994
|
|
|
979
995
|
def do_BMC(self, tag: PDFObject) -> None:
|
|
980
996
|
"""Begin marked-content sequence"""
|