playa-pdf 0.8.1__tar.gz → 0.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/CHANGELOG.md +24 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/PKG-INFO +117 -103
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/README.md +115 -100
- playa_pdf-0.10.0/benchmarks/benchmark.sh +28 -0
- playa_pdf-0.10.0/benchmarks/latency.py +54 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/structure.py +1 -1
- playa_pdf-0.10.0/latency_stats.txt +3585 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/mkdocs.yml +3 -1
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/_version.py +2 -2
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/arcfour.py +3 -5
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/ccitt.py +2 -1
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cli.py +1 -2
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmapdb.py +3 -2
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/content.py +214 -109
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/data/metadata.py +2 -2
- playa_pdf-0.10.0/playa/data_structures.py +114 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/document.py +563 -458
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/font.py +9 -8
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/image.py +3 -3
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/interp.py +57 -41
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/miner.py +21 -27
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/page.py +28 -60
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/parser.py +30 -13
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/pdftypes.py +10 -10
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/security.py +7 -7
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/structure.py +37 -13
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/utils.py +6 -10
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/xref.py +94 -57
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/pyproject.toml +15 -5
- playa_pdf-0.10.0/tests/latency_stats.py +68 -0
- playa_pdf-0.10.0/tests/test_data_structures.py +114 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_document.py +17 -8
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_interp.py +5 -5
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_miner.py +1 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_parser.py +1 -1
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_pdftypes.py +4 -77
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_structure.py +14 -9
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_utils.py +5 -11
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_xref.py +44 -40
- playa_pdf-0.8.1/benchmarks/benchmark.sh +0 -25
- playa_pdf-0.8.1/playa/data_structures.py +0 -88
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/.flake8 +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/.gitattributes +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/.gitignore +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/.gitmodules +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/LICENSE +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/MANIFEST.in +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/ccitt_decode.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/converter.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/marked_content.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/miner.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/objects.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/parallel.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/parser.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/png_predict.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/text.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/tiff_predict.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/type3_charproc.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/docs/adobe-spiderman.jpg +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/docs/cli.md +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/docs/data.md +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/docs/index.md +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/docs/reference.md +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/docs/working-in-the-pdf-mine.md +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/__init__.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/_saslprep.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/ascii85.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78ms-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78ms-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/83pv-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/83pv-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/90ms-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/90ms-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/90msp-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/90msp-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/90pv-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/90pv-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Add-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Add-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Add-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Add-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/B5pc-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/B5pc-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/CNS-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/CNS-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/CNS1-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/CNS1-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/CNS2-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/CNS2-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/ETHK-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/ETHK-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/ETen-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/ETen-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/ETenms-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/ETenms-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Ext-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Ext-RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Ext-RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Ext-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GB-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GB-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GB-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GB-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBK-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBK-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBK2K-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBK2K-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBKp-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBKp-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBT-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBT-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBT-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBT-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBTpc-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBTpc-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBpc-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBpc-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKdla-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKdla-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKdlb-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKdlb-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKgccs-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKgccs-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKm314-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKm314-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKm471-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKm471-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKscs-B5-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKscs-B5-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Hankaku-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Hankaku-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Hiragana-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Hiragana-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSC-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSC-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSC-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSC-Johab-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSC-Johab-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSC-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSCms-UHC-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSCms-UHC-HW-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSCms-UHC-HW-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSCms-UHC-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSCpc-EUC-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSCpc-EUC-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Katakana-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Katakana-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Makefile +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/NWP-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/NWP-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/README.txt +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/RKSJ-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/RKSJ-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Roman-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Roman-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniAKR-UTF16-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniAKR-UTF16-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniAKR-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniAKR-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniAKR-UTF8-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniAKR-UTF8-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UCS2-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UCS2-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UTF16-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UTF16-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UTF8-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UTF8-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UCS2-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UCS2-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UTF16-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UTF16-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UTF8-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UTF8-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UCS2-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UCS2-HW-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UCS2-HW-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UCS2-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UTF16-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UTF16-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UTF8-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UTF8-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS2004-UTF16-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS2004-UTF16-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS2004-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS2004-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS2004-UTF8-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS2004-UTF8-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJISX0213-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJISX0213-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJISX02132004-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJISX02132004-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UCS2-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UCS2-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UTF16-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UTF16-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UTF8-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UTF8-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniManga-UTF16-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniManga-UTF16-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniManga-UTF32-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniManga-UTF32-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniManga-UTF8-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniManga-UTF8-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/WP-Symbol-H.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/WP-Symbol-V.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/to-unicode-Adobe-CNS1.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/to-unicode-Adobe-GB1.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/to-unicode-Adobe-Japan1.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/to-unicode-Adobe-KR.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/to-unicode-Adobe-Korea1.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/to-unicode-Adobe-Manga1.pickle.gz +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/color.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/data/__init__.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/data/_asobj.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/data/content.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/encodingdb.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/encodings.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/exceptions.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/fontmetrics.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/fontprogram.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/glyphlist.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/lzw.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/outline.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/py.typed +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/runlength.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/worker.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/README +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/acroform/AcroForm_TEST.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/acroform/AcroForm_TEST_compiled.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/actualtext.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/ascii_tounicode.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/broken_xobjects.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/character_spacing.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/character_spacing_glyphs.json +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/character_spacing_texts.json +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/core_font_encodings.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/duplicate_encoding_tounicode.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/aes-128-m.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/aes-128.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/aes-256-m.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/aes-256-r6.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/aes-256.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/base.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/encrypted_doc_no_id.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/rc4-128.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/rc4-40.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/evil_cmap.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/evil_xobjects.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/extgstate.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/font-size-test.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/graphics_state_in_text_object.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/hello_structure.pdf +6 -6
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/image_structure.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/jo.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/junk_before_header.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/missing_rolemap.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/multi-xrefs.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/pdf_structure.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotated.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotated_type3_fonts.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/0.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/0mb.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/180.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/180mb.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/270.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/270mb.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/90.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/90mb.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/sampleOneByteIdentityEncode.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/scancode/patchelf.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/simple1.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/simple2.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/simple3.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/simple3_glyphs.json +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/simple3_texts.json +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/simple4.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/simple5.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/structure_xobjects.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/structure_xobjects_2.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/test_pdf_with_tiff_predictor.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/text_displacement.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/text_side_effects.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/text_space.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/type3_fonts.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/uncoloured-tiling-pattern.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/utf16_tounicode.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/utf8_tounicode.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/vertical_writing.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/vertical_writing_glyphs.json +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/vertical_writing_offset.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/vertical_writing_offset_glyphs.json +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/vertical_writing_offset_texts.json +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/vertical_writing_texts.json +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/xobject_graphicstate.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/zen_of_python_corrupted.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/__init__.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/bad_operators.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/bad_pages.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/bad_resources.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/cmap-encoding.txt +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/cmap-onebyte-encoding.txt +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/cmap-tounicode.txt +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/data.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/fallback-xref.pdf +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/issue18117-encoding.txt +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/issue18117-tounicode.txt +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/issue9367-tounicode.txt +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_cli.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_cmapdb.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_crypto.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_data.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_encodingdb.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_fonts.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_indirect_objects.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_lazy_api.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_lexer.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_open.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_outline.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_page.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_parallel.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_pdfminer_ccitt.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_text.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tools/conv_afm.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tools/conv_cmap.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tools/conv_glyphlist.py +0 -0
- {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tools/create_json_schema.py +0 -0
|
@@ -1,3 +1,27 @@
|
|
|
1
|
+
## PLAYA 0.10.0: 2026-02-04
|
|
2
|
+
|
|
3
|
+
- Load xref tables lazily
|
|
4
|
+
- Iterate lazily over page lists
|
|
5
|
+
- Support iterating over marked content sections in logical or page
|
|
6
|
+
order
|
|
7
|
+
- Insert blank pages for all invalid entries in page tree
|
|
8
|
+
- Restore Python 3.8 compatibility
|
|
9
|
+
- BREAKING CHANGE: `Document`, `NameTree` and `NumberTree` are now
|
|
10
|
+
proper `collections.abc` Mappings, so you need `items()` to get
|
|
11
|
+
`(key, value)` pairs
|
|
12
|
+
- BREAKING CHANGE: Undefined object IDs now raise `KeyError` and not
|
|
13
|
+
`IndexError`
|
|
14
|
+
- BREAKING CHANGE: `marked_contents` now contain empty iterables
|
|
15
|
+
instead of `None` for empty marked content sections
|
|
16
|
+
- BREAKING CHANGE: `mcid_texts` no longer exists
|
|
17
|
+
|
|
18
|
+
## PLAYA 0.9.0: 2026-01-08
|
|
19
|
+
|
|
20
|
+
- Refactor and add convenience methods to text objects
|
|
21
|
+
- Insert blank pages for missing object references in page tree
|
|
22
|
+
- Clean up type annotations (breaking change: PDFObject can no longer
|
|
23
|
+
be `str`, as the parser will never create this)
|
|
24
|
+
|
|
1
25
|
## PLAYA 0.8.1: 2025-12-22
|
|
2
26
|
|
|
3
27
|
- Correct subtle issues with mypyc-compiled pdfminer.six code
|
|
@@ -1,17 +1,16 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: playa-pdf
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.10.0
|
|
4
4
|
Summary: Parallel and LazY Analyzer for PDFs
|
|
5
5
|
Project-URL: Homepage, https://dhdaines.github.io/playa
|
|
6
6
|
Author-email: David Huggins-Daines <dhd@ecolingui.ca>
|
|
7
|
-
License: MIT
|
|
7
|
+
License-Expression: MIT
|
|
8
8
|
License-File: LICENSE
|
|
9
9
|
Keywords: pdf parser,text mining
|
|
10
10
|
Classifier: Development Status :: 4 - Beta
|
|
11
11
|
Classifier: Environment :: Console
|
|
12
12
|
Classifier: Intended Audience :: Developers
|
|
13
13
|
Classifier: Intended Audience :: Science/Research
|
|
14
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
15
14
|
Classifier: Programming Language :: Python
|
|
16
15
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.8
|
|
@@ -131,63 +130,56 @@ place! Let's open up a PDF and see what's in it:
|
|
|
131
130
|
pdf = playa.open("my_awesome_document.pdf")
|
|
132
131
|
raw_byte_stream = pdf.buffer
|
|
133
132
|
a_bunch_of_tokens = list(pdf.tokens)
|
|
134
|
-
|
|
133
|
+
a_bunch_of_indirect_object_ids = list(pdf.keys())
|
|
134
|
+
a_bunch_of_indirect_objects = list(pdf.values())
|
|
135
|
+
a_bunch_of_pages = list(pdf.pages)
|
|
135
136
|
```
|
|
136
137
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
138
|
+
Yes, a [`Document`](https://dhdaines.github.io/playa/latest/reference#playa.document.Document) is fundamentally a
|
|
139
|
+
[`Mapping`](https://docs.python.org/3/library/collections.abc.html#collections.abc.Mapping) of object IDs to objects, which
|
|
140
|
+
are represented to the extent possible by native Python objects.
|
|
141
|
+
These may not be terribly useful to you, but you might find them
|
|
142
|
+
interesting. Note that these are "indirect objects" where the actual
|
|
143
|
+
object is accompanied by an object number and "generation number". If
|
|
144
|
+
you wish to find **all** the objects in a PDF file, then you will need
|
|
145
|
+
to iterate over the [`objects`](https://dhdaines.github.io/playa/latest/reference#playa.document.Document.objects) property:
|
|
141
146
|
|
|
142
147
|
```python
|
|
143
|
-
for
|
|
144
|
-
|
|
145
|
-
# or also
|
|
146
|
-
for obj in pdf:
|
|
147
|
-
obj.objid, obj.genno, obj.obj
|
|
148
|
-
```
|
|
149
|
-
|
|
150
|
-
Also, these will only be the top-level objects and not those found
|
|
151
|
-
inside object streams (the streams are themselves indirect objects).
|
|
152
|
-
You can iterate over all indirect objects including object streams
|
|
153
|
-
using the `objects` property:
|
|
154
|
-
|
|
155
|
-
```python
|
|
156
|
-
for obj in pdf.objects:
|
|
157
|
-
obj.objid, obj.genno, obj.obj
|
|
148
|
+
for indobj in pdf.objects:
|
|
149
|
+
objid, genno, obj = indobj
|
|
158
150
|
```
|
|
159
151
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
You can also access indirect objects by number (this will return the
|
|
166
|
-
object with most recent generation number):
|
|
152
|
+
It is possible you will encounter multiple objects with the same
|
|
153
|
+
`objid` due to the "incremental updates" feature of PDF. As expected,
|
|
154
|
+
you can subscript the document to access indirect objects by number
|
|
155
|
+
(this will return the object with most recent generation number):
|
|
167
156
|
|
|
168
157
|
```python
|
|
169
158
|
a_particular_object = pdf[42]
|
|
170
159
|
```
|
|
171
160
|
|
|
172
|
-
Your PDF document probably has some pages.
|
|
173
|
-
numbers/labels? They could be things like
|
|
174
|
-
"gzvee"), 'a", or "42", for instance!
|
|
161
|
+
Your PDF document probably has some [pages](https://dhdaines.github.io/playa/latest/reference#playa.document.PageList).
|
|
162
|
+
How many? What are their numbers/labels? They could be things like
|
|
163
|
+
"xvi" (pronounced "gzvee"), "a", or "42", for instance!
|
|
175
164
|
|
|
176
165
|
```python
|
|
177
166
|
npages = len(pdf.pages)
|
|
178
167
|
page_numbers = [page.label for page in pdf.pages]
|
|
179
168
|
```
|
|
180
169
|
|
|
181
|
-
You can also subscript `
|
|
182
|
-
slice or an iterable of `int`, which
|
|
183
|
-
|
|
184
|
-
back to their document (using weak reference
|
|
185
|
-
leaks) with
|
|
170
|
+
You can also subscript [`pages`](https://dhdaines.github.io/playa/latest/reference#playa.document.Document.pages) in
|
|
171
|
+
various other ways, using a slice or an iterable of `int`, which
|
|
172
|
+
will give you a new page list object that behaves similarly. Pages
|
|
173
|
+
and page lists can refer back to their document (using weak reference
|
|
174
|
+
magic to avoid memory leaks) with their
|
|
175
|
+
[`doc`](https://dhdaines.github.io/playa/latest/reference#playa.document.PageList.doc) property.
|
|
186
176
|
|
|
187
177
|
## Some (by no means all) helpful metadata
|
|
188
178
|
|
|
189
|
-
A PDF often contains a "document outline"
|
|
190
|
-
representing the coarse-grained logical
|
|
179
|
+
A PDF often contains a ["document outline"](https://dhdaines.github.io/playa/latest/reference#playa.outline.Outline)
|
|
180
|
+
which is a sequence of trees representing the coarse-grained logical
|
|
181
|
+
structure of the document, accessible via the
|
|
182
|
+
[`outline`](https://dhdaines.github.io/playa/latest/reference#playa.document.Document.outline) property:
|
|
191
183
|
|
|
192
184
|
```python
|
|
193
185
|
for entry in pdf.outline:
|
|
@@ -197,9 +189,10 @@ for entry in pdf.outline:
|
|
|
197
189
|
...
|
|
198
190
|
```
|
|
199
191
|
|
|
200
|
-
If you are lucky it has a "logical structure
|
|
201
|
-
might even be
|
|
202
|
-
|
|
192
|
+
If you are lucky it has a ["logical structure
|
|
193
|
+
tree"](https://dhdaines.github.io/playa/latest/reference#playa.structure.Tree). The elements here might even be
|
|
194
|
+
referenced from the [`outline`](https://dhdaines.github.io/playa/latest/reference#playa.document.Document.outline)
|
|
195
|
+
above! (or, they might not... with PDF you never know).
|
|
203
196
|
|
|
204
197
|
```python
|
|
205
198
|
for element in pdf.structure:
|
|
@@ -213,16 +206,18 @@ Now perhaps we want to look at a specific page. Okay! You can also
|
|
|
213
206
|
look at its contents, more on that in a bit:
|
|
214
207
|
|
|
215
208
|
```python
|
|
216
|
-
page = pdf.pages
|
|
217
|
-
page = pdf.pages[
|
|
218
|
-
page = pdf.pages["
|
|
209
|
+
page = next(iter(pdf.pages)) # Fast and lazy way to get the first page
|
|
210
|
+
page = pdf.pages[0] # they are numbered from 0
|
|
211
|
+
page = pdf.pages["xviii"] # but you can get them by label (a string)
|
|
212
|
+
page = pdf.pages["42"] # or "logical" page number (also a string)
|
|
219
213
|
print(f"Page {page.label} is {page.width} x {page.height}")
|
|
220
214
|
```
|
|
221
215
|
|
|
222
216
|
Since PDF is at heart a page-oriented, presentation format, many types
|
|
223
217
|
of metadata are mostly accessible via the page objects. For instance
|
|
224
|
-
you can access the fonts used in page with, obviously, the
|
|
225
|
-
property, or the annotations via the
|
|
218
|
+
you can access the fonts used in a page with, obviously, the
|
|
219
|
+
[`fonts`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.fonts) property, or the annotations via the
|
|
220
|
+
[`annotations`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.annotations) property.
|
|
226
221
|
|
|
227
222
|
For example, annotations (internal or external links) are defined on
|
|
228
223
|
pages (since their position would not make any sense otherwise).
|
|
@@ -235,17 +230,19 @@ for annot in page.annotations:
|
|
|
235
230
|
```
|
|
236
231
|
|
|
237
232
|
The set of possible entries in annotation dictionaries (PDF 1.7 sect
|
|
238
|
-
12.5.2) is vast and confusing and inconsistently implemented
|
|
239
|
-
|
|
240
|
-
|
|
233
|
+
12.5.2) is vast and confusing and inconsistently implemented. You can
|
|
234
|
+
access the raw annotation dictionary via `props` in the
|
|
235
|
+
[`Annotation`](https://dhdaines.github.io/playa/latest/reference#playa.page.Annotation) object.
|
|
241
236
|
|
|
242
237
|
If the document has logical structure, then the pages will also have a
|
|
243
|
-
slightly different form of logical structure. You can use the
|
|
244
|
-
`find` and
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
238
|
+
slightly different form of logical structure. You can use the
|
|
239
|
+
[`find`](https://dhdaines.github.io/playa/latest/reference#playa.structure.PageStructure.find) and
|
|
240
|
+
[`find_all`](https://dhdaines.github.io/playa/latest/reference#playa.structure.PageStructure.find_all) methods to get
|
|
241
|
+
all of the enclosing structure elements of a given type (actually a
|
|
242
|
+
role) for a page. So for instance if you wanted to get the text
|
|
243
|
+
contents for all the cells in all the tables on a page, assuming the
|
|
244
|
+
creator of said page was kind enough to check the "PDF/UA" box, you
|
|
245
|
+
can do:
|
|
249
246
|
|
|
250
247
|
```python
|
|
251
248
|
for table in page.structure.find_all("Table"):
|
|
@@ -286,8 +283,9 @@ PLAYA allows you to take advantage of multiple CPUs, which can greatly
|
|
|
286
283
|
speed up some operations on large documents. This parallelism
|
|
287
284
|
currently operates at the page level since this is the most logical
|
|
288
285
|
way to split up a PDF. To enable it, pass the `max_workers` argument
|
|
289
|
-
to `playa.open`
|
|
290
|
-
|
|
286
|
+
to [`playa.open`](https://dhdaines.github.io/playa/latest/reference/#playa.open)
|
|
287
|
+
with the number of cores you wish to use (you can also explicitly pass
|
|
288
|
+
`None` to use the maximum):
|
|
291
289
|
|
|
292
290
|
```python
|
|
293
291
|
with playa.open(path, max_workers=4) as pdf:
|
|
@@ -295,7 +293,8 @@ with playa.open(path, max_workers=4) as pdf:
|
|
|
295
293
|
```
|
|
296
294
|
|
|
297
295
|
Now, you can apply a function across the pages of the PDF in parallel
|
|
298
|
-
using the `map` method of
|
|
296
|
+
using the [`map`](https://dhdaines.github.io/playa/latest/reference#playa.document.PageList.map) method of
|
|
297
|
+
[`pdf.pages`](https://dhdaines.github.io/playa/latest/reference#playa.document.Document.pages), for example:
|
|
299
298
|
|
|
300
299
|
```python
|
|
301
300
|
def get_page_size(page: Page) -> Tuple[int, int]:
|
|
@@ -305,20 +304,20 @@ page_sizes = pdf.pages.map(get_page_size)
|
|
|
305
304
|
```
|
|
306
305
|
|
|
307
306
|
You could also just do this for certain pages by subscripting
|
|
308
|
-
`pdf.pages` (this can be a slice, an
|
|
309
|
-
generator expression over `int` and/or `str`):
|
|
307
|
+
[`pdf.pages`](https://dhdaines.github.io/playa/latest/reference#playa.document.Document.pages) (this can be a slice, an
|
|
308
|
+
iterable of `int`, or a generator expression over `int` and/or `str`):
|
|
310
309
|
|
|
311
310
|
```python
|
|
312
311
|
some_page_sizes = pdf.pages[2:5].map(get_page_size)
|
|
313
312
|
```
|
|
314
313
|
|
|
315
314
|
There are some limitations to this, because it uses `multiprocessing`.
|
|
316
|
-
The function you pass to `map` must be serializable by `pickle`,
|
|
317
|
-
in practice means that an inner function or lambda generally
|
|
318
|
-
work. You can get around this in a very Java-like way by
|
|
319
|
-
callable object that encapsulates the necessary state. If
|
|
320
|
-
avoid traumatising readers of your code, then use
|
|
321
|
-
instead:
|
|
315
|
+
The function you pass to `map` must be serializable by `pickle`,
|
|
316
|
+
which in practice means that an inner function or lambda generally
|
|
317
|
+
doesn't work. You can get around this in a very Java-like way by
|
|
318
|
+
passing a callable object that encapsulates the necessary state. If
|
|
319
|
+
you wish to avoid traumatising readers of your code, then use
|
|
320
|
+
`functools.partial` instead:
|
|
322
321
|
|
|
323
322
|
```python
|
|
324
323
|
pdf.pages.map(partial(myfunc, arg1=value1, arg2=value2))
|
|
@@ -327,9 +326,9 @@ pdf.pages.map(partial(myfunc, arg1=value1, arg2=value2))
|
|
|
327
326
|
Also, any value returned by your function must also be serializable.
|
|
328
327
|
There is a bit of magic that enables this to work for PDF objects
|
|
329
328
|
containing indirect object references, so you should be able to, for
|
|
330
|
-
instance, get the `
|
|
331
|
-
trouble. But if you have your own complex
|
|
332
|
-
may encounter problems (or slowness).
|
|
329
|
+
instance, get the [`annotations`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.annotations) from
|
|
330
|
+
every page without any trouble. But if you have your own complex
|
|
331
|
+
objects that you return, you may encounter problems (or slowness).
|
|
333
332
|
|
|
334
333
|
## An important note about coordinate spaces
|
|
335
334
|
|
|
@@ -357,7 +356,7 @@ device space, specifically:
|
|
|
357
356
|
the bottom-right corner.
|
|
358
357
|
|
|
359
358
|
However, for compatibility with `pdfminer.six`, you can also pass
|
|
360
|
-
`space="page"` to `playa.open
|
|
359
|
+
`space="page"` to [`playa.open`](https://dhdaines.github.io/playa/latest/reference/#playa.open). In this case, `(0, 0)` is the
|
|
361
360
|
bottom-left corner of the page as defined by the `MediaBox`, after
|
|
362
361
|
rotation, and coordinates increase from the bottom-left corner of the
|
|
363
362
|
page towards the top-right, as they do in PDF user space.
|
|
@@ -430,20 +429,28 @@ Note that though it's called a "stack", it's actually a tuple. This
|
|
|
430
429
|
means that it is immutable, and you can check if it has changed from
|
|
431
430
|
one object to the next using the `is` operator.
|
|
432
431
|
|
|
433
|
-
All content objects can also refer back to their containing
|
|
434
|
-
from the `page` property. This uses weak
|
|
435
|
-
avoid causing memory leaks.
|
|
432
|
+
All content objects can also refer back to their containing
|
|
433
|
+
[`Page`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page) from the `page` property. This uses weak
|
|
434
|
+
reference magic in order to avoid causing memory leaks.
|
|
436
435
|
|
|
437
436
|
### Form XObjects
|
|
438
437
|
|
|
439
438
|
A PDF page may also contain "Form XObjects" which are like tiny
|
|
440
439
|
embedded PDF documents (they have nothing to do with fillable forms).
|
|
441
|
-
Simply iterating over a
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
440
|
+
Simply iterating over a
|
|
441
|
+
[`Page`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page)
|
|
442
|
+
**will not expand these for you** which may be a source of surprise,
|
|
443
|
+
but you can recurse into them with the
|
|
444
|
+
[`flatten`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.flatten)
|
|
445
|
+
method, or with the convenience properties
|
|
446
|
+
[`paths`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.paths),
|
|
447
|
+
[`images`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.images),
|
|
448
|
+
[`texts`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.texts)
|
|
449
|
+
and
|
|
450
|
+
[`glyphs`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.glyphs).
|
|
451
|
+
You can also identify them in iteration because they have `object_type
|
|
452
|
+
== "xobject"`. The layout objects inside are accessible by iteration,
|
|
453
|
+
as with pages:
|
|
447
454
|
|
|
448
455
|
```python
|
|
449
456
|
for obj in page:
|
|
@@ -453,8 +460,9 @@ for obj in page:
|
|
|
453
460
|
```
|
|
454
461
|
|
|
455
462
|
You can also iterate over them in the page context with
|
|
456
|
-
`page.xobjects` (this will also find Form
|
|
457
|
-
other Form XObjects, which is unfortunately
|
|
463
|
+
[`page.xobjects`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.xobjects) (this will also find Form
|
|
464
|
+
XObjects contained inside other Form XObjects, which is unfortunately
|
|
465
|
+
a thing):
|
|
458
466
|
|
|
459
467
|
```python
|
|
460
468
|
for xobj in page.xobjects:
|
|
@@ -463,9 +471,9 @@ for xobj in page.xobjects:
|
|
|
463
471
|
```
|
|
464
472
|
|
|
465
473
|
Exceptionally, these have a few more features than the ordinary
|
|
466
|
-
`ContentObject` - you can look at their
|
|
467
|
-
the tokens, and you can also see raw,
|
|
468
|
-
`contents`.
|
|
474
|
+
[`ContentObject`](https://dhdaines.github.io/playa/latest/reference#playa.content.ContentObject) - you can look at their
|
|
475
|
+
raw stream contents as well as the tokens, and you can also see raw,
|
|
476
|
+
mysterious PDF objects with `contents`.
|
|
469
477
|
|
|
470
478
|
### Graphics state
|
|
471
479
|
|
|
@@ -474,9 +482,10 @@ of what PDF refers to as the *graphics state*, which is accessible
|
|
|
474
482
|
through `obj.gstate`. This is a mutable object, and since there are
|
|
475
483
|
quite a few parameters in the graphics state, PLAYA does not create a
|
|
476
484
|
copy of it for every object in the layout. If you wish to reuse these
|
|
477
|
-
objects, you should call
|
|
478
|
-
|
|
479
|
-
|
|
485
|
+
objects, you should call
|
|
486
|
+
[`finalize`](https://dhdaines.github.io/playa/latest/reference#playa.content.ContentObject.finalize) on them, which will
|
|
487
|
+
freeze the graphics state and any other necessary context, allowing
|
|
488
|
+
the object to be stored and reused *as long as the document exists*:
|
|
480
489
|
|
|
481
490
|
```python
|
|
482
491
|
for obj in page:
|
|
@@ -537,15 +546,18 @@ individual glyphs (which might or might not correspond to characters),
|
|
|
537
546
|
this is not always what you want, and moreover it is computationally
|
|
538
547
|
quite expensive. So PLAYA, by default, does not do this. If you
|
|
539
548
|
don't need to know the actual bounding box of a text object, then
|
|
540
|
-
don't access `obj.bbox` and it
|
|
541
|
-
to know the position of each
|
|
542
|
-
characters, then just look at
|
|
549
|
+
don't access [`obj.bbox`](https://dhdaines.github.io/playa/latest/reference#playa.content.ContentObject.bbox) and it
|
|
550
|
+
won't be computed. If you don't need to know the position of each
|
|
551
|
+
glyph but simply want the Unicode characters, then just look at
|
|
552
|
+
[`obj.chars`](https://dhdaines.github.io/playa/latest/reference#playa.content.TextObject.chars).
|
|
543
553
|
|
|
544
|
-
It is also important to understand that
|
|
554
|
+
It is also important to understand that
|
|
555
|
+
[`obj.chars`](https://dhdaines.github.io/playa/latest/reference#playa.content.TextObject.chars) may or may not
|
|
545
556
|
correspond to the actual text that a human will read on the page. To
|
|
546
557
|
actually extract *text* from a PDF necessarily involves Heuristics or
|
|
547
|
-
Machine Learning
|
|
548
|
-
|
|
558
|
+
Machine Learning. PLAYA has [some simple
|
|
559
|
+
heuristics](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.extract_text) to do this, which will work
|
|
560
|
+
better with tagged and accessible PDFs, but don't expect miracles.
|
|
549
561
|
|
|
550
562
|
This is because PDFs, especially ones produced by OCR, don't organize
|
|
551
563
|
text objects in any meaningful fashion, so you will want to actually
|
|
@@ -566,8 +578,9 @@ to ignore glyphs with `glyph.gstate.render_mode == 3` (which means
|
|
|
566
578
|
For text extraction you really don't care about the `bbox`, but you
|
|
567
579
|
probably *do* care about the origin of each glyph relative to its
|
|
568
580
|
neighbours. For this reason PLAYA provides you with two convenience
|
|
569
|
-
properties, `origin` and
|
|
570
|
-
|
|
581
|
+
properties, [`origin`](https://dhdaines.github.io/playa/latest/reference#playa.content.TextBase.origin) and
|
|
582
|
+
[`displacement`](https://dhdaines.github.io/playa/latest/reference#playa.content.TextBase.displacement), which are
|
|
583
|
+
considerably faster to compute than the `bbox`.
|
|
571
584
|
|
|
572
585
|
PLAYA doesn't guarantee that text objects come at you in anything
|
|
573
586
|
other than the order they occur in the file (but it does guarantee
|
|
@@ -606,12 +619,13 @@ to eradicate the [numerous inconsistencies, contradictions, and
|
|
|
606
619
|
ambiguities](https://github.com/pdf-association/pdf-issues) of the
|
|
607
620
|
previous standard)
|
|
608
621
|
|
|
609
|
-
In particular, we care **a lot** about marked content operators,
|
|
610
|
-
of the abovementioned `ActualText` property. For this reason
|
|
611
|
-
`TextObject` in PLAYA **does not** and
|
|
612
|
-
PDF text object as defined by the `BT`
|
|
613
|
-
moment, every text-showing operator
|
|
614
|
-
|
|
622
|
+
In particular, we care **a lot** about marked content operators,
|
|
623
|
+
because of the abovementioned `ActualText` property. For this reason
|
|
624
|
+
a [`TextObject`](https://dhdaines.github.io/playa/latest/reference#playa.content.TextObject) in PLAYA **does not** and
|
|
625
|
+
**will never** correspond to a PDF text object as defined by the `BT`
|
|
626
|
+
and `ET` operators. For the moment, every text-showing operator
|
|
627
|
+
triggers a new [`TextObject`](https://dhdaines.github.io/playa/latest/reference#playa.content.TextObject). It is
|
|
628
|
+
possible (though unlikely) that in the future, only changes in marked
|
|
615
629
|
content or graphics state will do this.
|
|
616
630
|
|
|
617
631
|
## Conclusion
|