playa-pdf 0.8.0__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (340) hide show
  1. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/.gitignore +1 -0
  2. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/CHANGELOG.md +12 -1
  3. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/PKG-INFO +2 -2
  4. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/README.md +1 -1
  5. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/_version.py +2 -2
  6. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/ccitt.py +2 -1
  7. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cli.py +1 -2
  8. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/content.py +97 -42
  9. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/data/metadata.py +2 -2
  10. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/document.py +4 -3
  11. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/font.py +8 -8
  12. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/image.py +3 -3
  13. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/interp.py +29 -13
  14. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/miner.py +86 -67
  15. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/page.py +1 -2
  16. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/parser.py +4 -6
  17. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/pdftypes.py +1 -3
  18. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/security.py +1 -1
  19. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/utils.py +4 -8
  20. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/pyproject.toml +4 -1
  21. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_document.py +4 -2
  22. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_miner.py +24 -2
  23. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_pdftypes.py +4 -4
  24. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_xref.py +2 -7
  25. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/.flake8 +0 -0
  26. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/.gitattributes +0 -0
  27. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/.gitmodules +0 -0
  28. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/LICENSE +0 -0
  29. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/MANIFEST.in +0 -0
  30. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/benchmark.sh +0 -0
  31. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/ccitt_decode.py +0 -0
  32. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/converter.py +0 -0
  33. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/marked_content.py +0 -0
  34. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/miner.py +0 -0
  35. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/objects.py +0 -0
  36. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/parallel.py +0 -0
  37. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/parser.py +0 -0
  38. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/png_predict.py +0 -0
  39. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/structure.py +0 -0
  40. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/text.py +0 -0
  41. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/tiff_predict.py +0 -0
  42. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/benchmarks/type3_charproc.py +0 -0
  43. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/docs/adobe-spiderman.jpg +0 -0
  44. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/docs/cli.md +0 -0
  45. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/docs/data.md +0 -0
  46. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/docs/index.md +0 -0
  47. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/docs/reference.md +0 -0
  48. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/docs/working-in-the-pdf-mine.md +0 -0
  49. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/mkdocs.yml +0 -0
  50. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/__init__.py +0 -0
  51. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/_saslprep.py +0 -0
  52. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/arcfour.py +0 -0
  53. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/ascii85.py +0 -0
  54. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78-EUC-H.pickle.gz +0 -0
  55. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78-EUC-V.pickle.gz +0 -0
  56. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78-H.pickle.gz +0 -0
  57. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78-RKSJ-H.pickle.gz +0 -0
  58. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78-RKSJ-V.pickle.gz +0 -0
  59. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78-V.pickle.gz +0 -0
  60. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78ms-RKSJ-H.pickle.gz +0 -0
  61. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/78ms-RKSJ-V.pickle.gz +0 -0
  62. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/83pv-RKSJ-H.pickle.gz +0 -0
  63. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/83pv-RKSJ-V.pickle.gz +0 -0
  64. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/90ms-RKSJ-H.pickle.gz +0 -0
  65. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/90ms-RKSJ-V.pickle.gz +0 -0
  66. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/90msp-RKSJ-H.pickle.gz +0 -0
  67. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/90msp-RKSJ-V.pickle.gz +0 -0
  68. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/90pv-RKSJ-H.pickle.gz +0 -0
  69. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/90pv-RKSJ-V.pickle.gz +0 -0
  70. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Add-H.pickle.gz +0 -0
  71. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Add-RKSJ-H.pickle.gz +0 -0
  72. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Add-RKSJ-V.pickle.gz +0 -0
  73. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Add-V.pickle.gz +0 -0
  74. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/B5-H.pickle.gz +0 -0
  75. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/B5-V.pickle.gz +0 -0
  76. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/B5pc-H.pickle.gz +0 -0
  77. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/B5pc-V.pickle.gz +0 -0
  78. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/CNS-EUC-H.pickle.gz +0 -0
  79. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/CNS-EUC-V.pickle.gz +0 -0
  80. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/CNS1-H.pickle.gz +0 -0
  81. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/CNS1-V.pickle.gz +0 -0
  82. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/CNS2-H.pickle.gz +0 -0
  83. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/CNS2-V.pickle.gz +0 -0
  84. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/ETHK-B5-H.pickle.gz +0 -0
  85. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/ETHK-B5-V.pickle.gz +0 -0
  86. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/ETen-B5-H.pickle.gz +0 -0
  87. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/ETen-B5-V.pickle.gz +0 -0
  88. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/ETenms-B5-H.pickle.gz +0 -0
  89. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/ETenms-B5-V.pickle.gz +0 -0
  90. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/EUC-H.pickle.gz +0 -0
  91. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/EUC-V.pickle.gz +0 -0
  92. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Ext-H.pickle.gz +0 -0
  93. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Ext-RKSJ-H.pickle.gz +0 -0
  94. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Ext-RKSJ-V.pickle.gz +0 -0
  95. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Ext-V.pickle.gz +0 -0
  96. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GB-EUC-H.pickle.gz +0 -0
  97. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GB-EUC-V.pickle.gz +0 -0
  98. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GB-H.pickle.gz +0 -0
  99. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GB-V.pickle.gz +0 -0
  100. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBK-EUC-H.pickle.gz +0 -0
  101. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBK-EUC-V.pickle.gz +0 -0
  102. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBK2K-H.pickle.gz +0 -0
  103. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBK2K-V.pickle.gz +0 -0
  104. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBKp-EUC-H.pickle.gz +0 -0
  105. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBKp-EUC-V.pickle.gz +0 -0
  106. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBT-EUC-H.pickle.gz +0 -0
  107. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBT-EUC-V.pickle.gz +0 -0
  108. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBT-H.pickle.gz +0 -0
  109. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBT-V.pickle.gz +0 -0
  110. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBTpc-EUC-H.pickle.gz +0 -0
  111. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBTpc-EUC-V.pickle.gz +0 -0
  112. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBpc-EUC-H.pickle.gz +0 -0
  113. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/GBpc-EUC-V.pickle.gz +0 -0
  114. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/H.pickle.gz +0 -0
  115. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKdla-B5-H.pickle.gz +0 -0
  116. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKdla-B5-V.pickle.gz +0 -0
  117. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKdlb-B5-H.pickle.gz +0 -0
  118. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKdlb-B5-V.pickle.gz +0 -0
  119. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKgccs-B5-H.pickle.gz +0 -0
  120. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKgccs-B5-V.pickle.gz +0 -0
  121. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKm314-B5-H.pickle.gz +0 -0
  122. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKm314-B5-V.pickle.gz +0 -0
  123. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKm471-B5-H.pickle.gz +0 -0
  124. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKm471-B5-V.pickle.gz +0 -0
  125. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKscs-B5-H.pickle.gz +0 -0
  126. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/HKscs-B5-V.pickle.gz +0 -0
  127. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Hankaku-H.pickle.gz +0 -0
  128. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Hankaku-V.pickle.gz +0 -0
  129. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Hiragana-H.pickle.gz +0 -0
  130. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Hiragana-V.pickle.gz +0 -0
  131. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSC-EUC-H.pickle.gz +0 -0
  132. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSC-EUC-V.pickle.gz +0 -0
  133. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSC-H.pickle.gz +0 -0
  134. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSC-Johab-H.pickle.gz +0 -0
  135. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSC-Johab-V.pickle.gz +0 -0
  136. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSC-V.pickle.gz +0 -0
  137. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSCms-UHC-H.pickle.gz +0 -0
  138. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSCms-UHC-HW-H.pickle.gz +0 -0
  139. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSCms-UHC-HW-V.pickle.gz +0 -0
  140. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSCms-UHC-V.pickle.gz +0 -0
  141. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSCpc-EUC-H.pickle.gz +0 -0
  142. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/KSCpc-EUC-V.pickle.gz +0 -0
  143. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Katakana-H.pickle.gz +0 -0
  144. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Katakana-V.pickle.gz +0 -0
  145. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Makefile +0 -0
  146. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/NWP-H.pickle.gz +0 -0
  147. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/NWP-V.pickle.gz +0 -0
  148. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/README.txt +0 -0
  149. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/RKSJ-H.pickle.gz +0 -0
  150. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/RKSJ-V.pickle.gz +0 -0
  151. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Roman-H.pickle.gz +0 -0
  152. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/Roman-V.pickle.gz +0 -0
  153. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniAKR-UTF16-H.pickle.gz +0 -0
  154. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniAKR-UTF16-V.pickle.gz +0 -0
  155. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniAKR-UTF32-H.pickle.gz +0 -0
  156. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniAKR-UTF32-V.pickle.gz +0 -0
  157. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniAKR-UTF8-H.pickle.gz +0 -0
  158. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniAKR-UTF8-V.pickle.gz +0 -0
  159. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UCS2-H.pickle.gz +0 -0
  160. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UCS2-V.pickle.gz +0 -0
  161. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UTF16-H.pickle.gz +0 -0
  162. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UTF16-V.pickle.gz +0 -0
  163. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UTF32-H.pickle.gz +0 -0
  164. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UTF32-V.pickle.gz +0 -0
  165. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UTF8-H.pickle.gz +0 -0
  166. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniCNS-UTF8-V.pickle.gz +0 -0
  167. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UCS2-H.pickle.gz +0 -0
  168. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UCS2-V.pickle.gz +0 -0
  169. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UTF16-H.pickle.gz +0 -0
  170. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UTF16-V.pickle.gz +0 -0
  171. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UTF32-H.pickle.gz +0 -0
  172. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UTF32-V.pickle.gz +0 -0
  173. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UTF8-H.pickle.gz +0 -0
  174. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniGB-UTF8-V.pickle.gz +0 -0
  175. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UCS2-H.pickle.gz +0 -0
  176. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UCS2-HW-H.pickle.gz +0 -0
  177. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UCS2-HW-V.pickle.gz +0 -0
  178. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UCS2-V.pickle.gz +0 -0
  179. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UTF16-H.pickle.gz +0 -0
  180. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UTF16-V.pickle.gz +0 -0
  181. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UTF32-H.pickle.gz +0 -0
  182. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UTF32-V.pickle.gz +0 -0
  183. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UTF8-H.pickle.gz +0 -0
  184. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS-UTF8-V.pickle.gz +0 -0
  185. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS2004-UTF16-H.pickle.gz +0 -0
  186. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS2004-UTF16-V.pickle.gz +0 -0
  187. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS2004-UTF32-H.pickle.gz +0 -0
  188. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS2004-UTF32-V.pickle.gz +0 -0
  189. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS2004-UTF8-H.pickle.gz +0 -0
  190. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJIS2004-UTF8-V.pickle.gz +0 -0
  191. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJISX0213-UTF32-H.pickle.gz +0 -0
  192. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJISX0213-UTF32-V.pickle.gz +0 -0
  193. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJISX02132004-UTF32-H.pickle.gz +0 -0
  194. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniJISX02132004-UTF32-V.pickle.gz +0 -0
  195. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UCS2-H.pickle.gz +0 -0
  196. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UCS2-V.pickle.gz +0 -0
  197. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UTF16-H.pickle.gz +0 -0
  198. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UTF16-V.pickle.gz +0 -0
  199. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UTF32-H.pickle.gz +0 -0
  200. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UTF32-V.pickle.gz +0 -0
  201. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UTF8-H.pickle.gz +0 -0
  202. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniKS-UTF8-V.pickle.gz +0 -0
  203. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniManga-UTF16-H.pickle.gz +0 -0
  204. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniManga-UTF16-V.pickle.gz +0 -0
  205. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniManga-UTF32-H.pickle.gz +0 -0
  206. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniManga-UTF32-V.pickle.gz +0 -0
  207. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniManga-UTF8-H.pickle.gz +0 -0
  208. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/UniManga-UTF8-V.pickle.gz +0 -0
  209. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/V.pickle.gz +0 -0
  210. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/WP-Symbol-H.pickle.gz +0 -0
  211. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/WP-Symbol-V.pickle.gz +0 -0
  212. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/to-unicode-Adobe-CNS1.pickle.gz +0 -0
  213. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/to-unicode-Adobe-GB1.pickle.gz +0 -0
  214. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/to-unicode-Adobe-Japan1.pickle.gz +0 -0
  215. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/to-unicode-Adobe-KR.pickle.gz +0 -0
  216. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/to-unicode-Adobe-Korea1.pickle.gz +0 -0
  217. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmap/to-unicode-Adobe-Manga1.pickle.gz +0 -0
  218. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/cmapdb.py +0 -0
  219. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/color.py +0 -0
  220. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/data/__init__.py +0 -0
  221. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/data/_asobj.py +0 -0
  222. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/data/content.py +0 -0
  223. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/data_structures.py +0 -0
  224. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/encodingdb.py +0 -0
  225. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/encodings.py +0 -0
  226. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/exceptions.py +0 -0
  227. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/fontmetrics.py +0 -0
  228. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/fontprogram.py +0 -0
  229. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/glyphlist.py +0 -0
  230. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/lzw.py +0 -0
  231. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/outline.py +0 -0
  232. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/py.typed +0 -0
  233. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/runlength.py +0 -0
  234. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/structure.py +0 -0
  235. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/worker.py +0 -0
  236. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/playa/xref.py +0 -0
  237. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/README +0 -0
  238. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/acroform/AcroForm_TEST.pdf +0 -0
  239. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/acroform/AcroForm_TEST_compiled.pdf +0 -0
  240. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/actualtext.pdf +0 -0
  241. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/ascii_tounicode.pdf +0 -0
  242. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/broken_xobjects.pdf +0 -0
  243. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/character_spacing.pdf +0 -0
  244. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/character_spacing_glyphs.json +0 -0
  245. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/character_spacing_texts.json +0 -0
  246. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/core_font_encodings.pdf +0 -0
  247. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/duplicate_encoding_tounicode.pdf +0 -0
  248. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/aes-128-m.pdf +0 -0
  249. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/aes-128.pdf +0 -0
  250. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/aes-256-m.pdf +0 -0
  251. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/aes-256-r6.pdf +0 -0
  252. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/aes-256.pdf +0 -0
  253. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/base.pdf +0 -0
  254. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/encrypted_doc_no_id.pdf +0 -0
  255. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/rc4-128.pdf +0 -0
  256. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/encryption/rc4-40.pdf +0 -0
  257. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/evil_cmap.pdf +0 -0
  258. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/evil_xobjects.pdf +0 -0
  259. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/extgstate.pdf +0 -0
  260. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/font-size-test.pdf +0 -0
  261. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/graphics_state_in_text_object.pdf +0 -0
  262. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/hello_structure.pdf +0 -0
  263. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/image_structure.pdf +0 -0
  264. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/jo.pdf +0 -0
  265. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/junk_before_header.pdf +0 -0
  266. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/missing_rolemap.pdf +0 -0
  267. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/multi-xrefs.pdf +0 -0
  268. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/pdf_structure.pdf +0 -0
  269. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotated.pdf +0 -0
  270. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotated_type3_fonts.pdf +0 -0
  271. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/0.pdf +0 -0
  272. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/0mb.pdf +0 -0
  273. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/180.pdf +0 -0
  274. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/180mb.pdf +0 -0
  275. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/270.pdf +0 -0
  276. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/270mb.pdf +0 -0
  277. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/90.pdf +0 -0
  278. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/rotation/90mb.pdf +0 -0
  279. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/sampleOneByteIdentityEncode.pdf +0 -0
  280. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/scancode/patchelf.pdf +0 -0
  281. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/simple1.pdf +0 -0
  282. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/simple2.pdf +0 -0
  283. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/simple3.pdf +0 -0
  284. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/simple3_glyphs.json +0 -0
  285. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/simple3_texts.json +0 -0
  286. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/simple4.pdf +0 -0
  287. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/simple5.pdf +0 -0
  288. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/structure_xobjects.pdf +0 -0
  289. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/structure_xobjects_2.pdf +0 -0
  290. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/test_pdf_with_tiff_predictor.pdf +0 -0
  291. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/text_displacement.pdf +0 -0
  292. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/text_side_effects.pdf +0 -0
  293. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/text_space.pdf +0 -0
  294. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/type3_fonts.pdf +0 -0
  295. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/uncoloured-tiling-pattern.pdf +0 -0
  296. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/utf16_tounicode.pdf +0 -0
  297. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/utf8_tounicode.pdf +0 -0
  298. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/vertical_writing.pdf +0 -0
  299. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/vertical_writing_glyphs.json +0 -0
  300. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/vertical_writing_offset.pdf +0 -0
  301. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/vertical_writing_offset_glyphs.json +0 -0
  302. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/vertical_writing_offset_texts.json +0 -0
  303. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/vertical_writing_texts.json +0 -0
  304. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/xobject_graphicstate.pdf +0 -0
  305. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/samples/zen_of_python_corrupted.pdf +0 -0
  306. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/__init__.py +0 -0
  307. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/bad_operators.pdf +0 -0
  308. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/bad_pages.pdf +0 -0
  309. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/bad_resources.pdf +0 -0
  310. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/cmap-encoding.txt +0 -0
  311. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/cmap-onebyte-encoding.txt +0 -0
  312. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/cmap-tounicode.txt +0 -0
  313. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/data.py +0 -0
  314. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/fallback-xref.pdf +0 -0
  315. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/issue18117-encoding.txt +0 -0
  316. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/issue18117-tounicode.txt +0 -0
  317. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/issue9367-tounicode.txt +0 -0
  318. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_cli.py +0 -0
  319. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_cmapdb.py +0 -0
  320. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_crypto.py +0 -0
  321. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_data.py +0 -0
  322. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_encodingdb.py +0 -0
  323. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_fonts.py +0 -0
  324. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_indirect_objects.py +0 -0
  325. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_interp.py +0 -0
  326. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_lazy_api.py +0 -0
  327. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_lexer.py +0 -0
  328. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_open.py +0 -0
  329. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_outline.py +0 -0
  330. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_page.py +0 -0
  331. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_parallel.py +0 -0
  332. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_parser.py +0 -0
  333. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_pdfminer_ccitt.py +0 -0
  334. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_structure.py +0 -0
  335. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_text.py +0 -0
  336. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tests/test_utils.py +0 -0
  337. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tools/conv_afm.py +0 -0
  338. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tools/conv_cmap.py +0 -0
  339. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tools/conv_glyphlist.py +0 -0
  340. {playa_pdf-0.8.0 → playa_pdf-0.9.0}/tools/create_json_schema.py +0 -0
@@ -29,3 +29,4 @@ poetry.lock
29
29
  /playa/_version.py
30
30
  .venv
31
31
  *.so
32
+ .coverage
@@ -1,10 +1,21 @@
1
+ ## PLAYA 0.9.0: Unreleased
2
+
3
+ - Refactor and add convenience methods to text objects
4
+ - Insert blank pages for missing object references in page tree
5
+ - Clean up type annotations (breaking change: PDFObject can no longer
6
+ be `str`, as the parser will never create this)
7
+
8
+ ## PLAYA 0.8.1: 2025-12-22
9
+
10
+ - Correct subtle issues with mypyc-compiled pdfminer.six code
11
+
1
12
  ## PLAYA 0.8.0: 2025-12-17
2
13
 
3
14
  - Optionally accelerate image decoding with mypyc
4
15
  - Correct explicit string positioning in vertical text
5
16
  - Restore caching in text decoding under Python 3.8
6
17
  - Bring back pdfminer.six layout analysis algorithm
7
- - TODO: refactor and accelerate text extraction with mypyc
18
+ - Optionally accelerate pdfminer.six compatibility with mypyc
8
19
 
9
20
  ## PLAYA 0.7.2: 2025-11-09
10
21
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: playa-pdf
3
- Version: 0.8.0
3
+ Version: 0.9.0
4
4
  Summary: Parallel and LazY Analyzer for PDFs
5
5
  Project-URL: Homepage, https://dhdaines.github.io/playa
6
6
  Author-email: David Huggins-Daines <dhd@ecolingui.ca>
@@ -52,7 +52,7 @@ analysis](https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_t
52
52
  algorithm from
53
53
  [pdfminer.six](https://github.com/pdfminer/pdfminer.six) anyways. See
54
54
  [the
55
- documentation](https://dhdaines.github.io/playa/working-in-the-pdf-mine)
55
+ documentation](https://dhdaines.github.io/playa/latest/working-in-the-pdf-mine)
56
56
  for more information on how to migrate your code. You may be
57
57
  interested to know that PLAYA's implementation is also 15-50% faster,
58
58
  depending on how many CPUs you use.
@@ -21,7 +21,7 @@ analysis](https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_t
21
21
  algorithm from
22
22
  [pdfminer.six](https://github.com/pdfminer/pdfminer.six) anyways. See
23
23
  [the
24
- documentation](https://dhdaines.github.io/playa/working-in-the-pdf-mine)
24
+ documentation](https://dhdaines.github.io/playa/latest/working-in-the-pdf-mine)
25
25
  for more information on how to migrate your code. You may be
26
26
  interested to know that PLAYA's implementation is also 15-50% faster,
27
27
  depending on how many CPUs you use.
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.8.0'
32
- __version_tuple__ = version_tuple = (0, 8, 0)
31
+ __version__ = version = '0.9.0'
32
+ __version_tuple__ = version_tuple = (0, 9, 0)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -492,7 +492,8 @@ class CCITTG4Parser(BitParser):
492
492
  print(y, "".join(str(b) for b in bits))
493
493
 
494
494
  def _reset_line(self) -> None:
495
- # FIXME: probably, we could just swap them, like in PNG prediction
495
+ # We could just swap them, like in PNG prediction, though it's
496
+ # not clear that would be much faster.
496
497
  self._refline = self._curline
497
498
  self._curline = [1] * self.width
498
499
  self._curpos = -1
@@ -105,7 +105,6 @@ from playa.pdftypes import (
105
105
  from playa.structure import ContentItem
106
106
  from playa.structure import ContentObject as StructContentObject
107
107
  from playa.structure import Element
108
- from playa.utils import decode_text
109
108
 
110
109
  LOG = logging.getLogger(__name__)
111
110
 
@@ -486,7 +485,7 @@ def _extract_outline_item(item: Outline, indent: int, outfh: TextIO) -> bool:
486
485
 
487
486
  print(f"{ws}{{", file=outfh)
488
487
  if item.title is not None:
489
- format_attr("title", decode_text(item.title))
488
+ format_attr("title", item.title)
490
489
  if item.destination is not None:
491
490
  format_attr("destination", asobj(item.destination))
492
491
  if s:
@@ -4,6 +4,7 @@ PDF content objects created by the interpreter.
4
4
 
5
5
  import itertools
6
6
  import logging
7
+ from abc import abstractmethod
7
8
  from copy import copy
8
9
  from dataclasses import dataclass
9
10
  from typing import (
@@ -800,30 +801,95 @@ class PathObject(ContentObject):
800
801
  return transform_bbox(self.ctm, bbox)
801
802
 
802
803
 
803
- def _font_size(matrix: Matrix, vert: bool = False) -> float:
804
- if vert:
805
- # dx, dy = apply_matrix_norm(self.matrix, (1, 0))
806
- dx, dy, _, _, _, _ = matrix
807
- else:
808
- # dx, dy = apply_matrix_norm(self.matrix, (0, 1))
809
- _, _, dx, dy, _, _ = matrix
810
- if dx == 0: # Nearly always true
811
- return abs(dy)
812
- elif dy == 0:
813
- return abs(dx)
814
- else:
815
- import math
804
+ class TextBase(ContentObject):
805
+ """Common properties for text and glyph objects."""
816
806
 
817
- return math.sqrt(dx * dx + dy * dy)
807
+ @property
808
+ @abstractmethod
809
+ def matrix(self) -> Matrix: ...
810
+
811
+ @property
812
+ def font(self) -> Font:
813
+ """Font for this text object."""
814
+ font = self.gstate.font
815
+ assert font is not None
816
+ return font
817
+
818
+ @property
819
+ def size(self) -> float:
820
+ """Font size for this text object.
821
+
822
+ This is the actual font size in device space, which is **not**
823
+ the same as `GraphicState.fontsize`. That's the font size in
824
+ text space which is not a very useful number (it's usually 1).
825
+ """
826
+ vert = False if self.gstate.font is None else self.gstate.font.vertical
827
+ if vert:
828
+ # dx, dy = apply_matrix_norm(self.matrix, (1, 0))
829
+ dx, dy, _, _, _, _ = self.matrix
830
+ else:
831
+ # dx, dy = apply_matrix_norm(self.matrix, (0, 1))
832
+ _, _, dx, dy, _, _ = self.matrix
833
+ if dx == 0: # Nearly always true
834
+ return abs(dy)
835
+ elif dy == 0:
836
+ return abs(dx)
837
+ else:
838
+ import math
839
+
840
+ return math.sqrt(dx * dx + dy * dy)
841
+
842
+ @property
843
+ def fontname(self) -> str:
844
+ """Font name for this text object"""
845
+ return self.font.fontname
846
+
847
+ @property
848
+ def fontbase(self) -> str:
849
+ """Original font name for this text object.
850
+
851
+ Fonts in PDF files are usually "subsetted", meaning only the
852
+ glyphs actually used in the document are included. In this
853
+ case the font's `fontname` property usually consists of an
854
+ arbitrary "tag", plus (literally, a `+`) and the original
855
+ name. This is a convenience property to get that original
856
+ name.
857
+
858
+ This is not the same as `GraphicState.font.basefont` which
859
+ usually also includes the subset tag.
860
+
861
+ """
862
+ fontname = self.fontname
863
+ subset, _, base = fontname.partition("+")
864
+ if base:
865
+ return base
866
+ return fontname
867
+
868
+ @property
869
+ def textfont(self) -> str:
870
+ """Convenient short form of the font name and size.
871
+
872
+ For example, "Helvetica 12".
873
+ """
874
+ return f"{self.fontbase} {round(self.size)}"
875
+
876
+ @property
877
+ def origin(self) -> Point:
878
+ """Origin of this text object in device space."""
879
+ _, _, _, _, dx, dy = self.matrix
880
+ return dx, dy
818
881
 
819
882
 
820
883
  @dataclass
821
- class GlyphObject(ContentObject):
884
+ class GlyphObject(TextBase):
822
885
  """Individual glyph on the page.
823
886
 
824
887
  Attributes:
825
888
  font: Font for this glyph.
826
889
  size: Effective font size for this glyph.
890
+ fontname: Font name.
891
+ fontbase: Short (non-subset) font name.
892
+ textfont: Combined short name and size for the font.
827
893
  cid: Character ID for this glyph.
828
894
  text: Unicode mapping of this glyph, if any.
829
895
  matrix: Rendering matrix `T_rm` for this glyph, which transforms
@@ -837,7 +903,7 @@ class GlyphObject(ContentObject):
837
903
 
838
904
  cid: int
839
905
  text: Union[str, None]
840
- matrix: Matrix
906
+ _matrix: Matrix
841
907
  _displacement: float
842
908
  _corners: bool
843
909
 
@@ -881,20 +947,12 @@ class GlyphObject(ContentObject):
881
947
  return itor
882
948
 
883
949
  @property
884
- def font(self) -> Font:
885
- font = self.gstate.font
886
- assert font is not None
887
- return font
888
-
889
- @property
890
- def size(self) -> float:
891
- vert = False if self.gstate.font is None else self.gstate.font.vertical
892
- return _font_size(self.matrix, vert)
950
+ def matrix(self) -> Matrix:
951
+ return self._matrix
893
952
 
894
953
  @property
895
- def origin(self) -> Point:
896
- _, _, _, _, dx, dy = self.matrix
897
- return dx, dy
954
+ def chars(self) -> str:
955
+ return self.text or ""
898
956
 
899
957
  @property
900
958
  def displacement(self) -> Point:
@@ -932,7 +990,7 @@ class GlyphObject(ContentObject):
932
990
 
933
991
 
934
992
  @dataclass
935
- class TextObject(ContentObject):
993
+ class TextObject(TextBase):
936
994
  """Text object (contains one or more glyphs).
937
995
 
938
996
  Attributes:
@@ -943,7 +1001,11 @@ class TextObject(ContentObject):
943
1001
  origin: Origin of this text object in device space.
944
1002
  displacement: Vector to the origin of the next text object in
945
1003
  device space.
1004
+ font: Font for this text object.
946
1005
  size: Effective font size for this text object.
1006
+ fontname: Font name.
1007
+ fontbase: Short (non-subset) font name.
1008
+ textfont: Combined short name and size for the font.
947
1009
  text_matrix: Text matrix `T_m` for this text object, which
948
1010
  transforms text space coordinates to user space.
949
1011
  line_matrix: Text line matrix `T_lm` for this text object, which
@@ -966,6 +1028,7 @@ class TextObject(ContentObject):
966
1028
  _chars: Union[List[str], None] = None
967
1029
  _bbox: Union[Rect, None] = None
968
1030
  _next_glyph_offset: Union[Point, None] = None
1031
+ _displacement: Union[Point, None] = None
969
1032
 
970
1033
  def __iter__(self) -> Iterator[GlyphObject]:
971
1034
  """Generate glyphs for this text object"""
@@ -1048,7 +1111,7 @@ class TextObject(ContentObject):
1048
1111
  mcstack=self.mcstack,
1049
1112
  cid=cid,
1050
1113
  text=text,
1051
- matrix=matrix,
1114
+ _matrix=matrix,
1052
1115
  _displacement=disp,
1053
1116
  _corners=corners,
1054
1117
  )
@@ -1091,11 +1154,6 @@ class TextObject(ContentObject):
1091
1154
  )
1092
1155
  return self._matrix
1093
1156
 
1094
- @property
1095
- def size(self) -> float:
1096
- vert = False if self.gstate.font is None else self.gstate.font.vertical
1097
- return _font_size(self.matrix, vert)
1098
-
1099
1157
  @property
1100
1158
  def scaling_matrix(self) -> Matrix:
1101
1159
  horizontal_scaling = self.gstate.scaling * 0.01
@@ -1113,15 +1171,11 @@ class TextObject(ContentObject):
1113
1171
  def text_matrix(self) -> Matrix:
1114
1172
  return translate_matrix(self.line_matrix, self._glyph_offset)
1115
1173
 
1116
- @property
1117
- def origin(self) -> Point:
1118
- _, _, _, _, dx, dy = self.matrix
1119
- return dx, dy
1120
-
1121
1174
  @property
1122
1175
  def displacement(self) -> Point:
1176
+ if self._displacement is not None:
1177
+ return self._displacement
1123
1178
  matrix = self.matrix
1124
- # FIXME: This should be either cached or optimized
1125
1179
  next_matrix = mult_matrix(
1126
1180
  self.scaling_matrix,
1127
1181
  mult_matrix(
@@ -1129,7 +1183,8 @@ class TextObject(ContentObject):
1129
1183
  self.ctm,
1130
1184
  ),
1131
1185
  )
1132
- return next_matrix[-2] - matrix[-2], next_matrix[-1] - matrix[-1]
1186
+ self._displacement = next_matrix[-2] - matrix[-2], next_matrix[-1] - matrix[-1]
1187
+ return self._displacement
1133
1188
 
1134
1189
  @property
1135
1190
  def bbox(self) -> Rect:
@@ -39,7 +39,7 @@ from playa.structure import ContentItem as _StructContentItem
39
39
  from playa.structure import ContentObject as _StructContentObject
40
40
  from playa.structure import Element as _Element
41
41
  from playa.structure import Tree as _Tree
42
- from playa.utils import Matrix, Rect, decode_text
42
+ from playa.utils import Matrix, Rect
43
43
 
44
44
  log = logging.getLogger(__name__)
45
45
 
@@ -556,7 +556,7 @@ def asobj_stream(obj: _ContentStream) -> Dict:
556
556
  def asobj_outline(obj: _Outline, recurse: bool = True) -> Outline:
557
557
  out = Outline()
558
558
  if obj.title is not None:
559
- out["title"] = decode_text(obj.title)
559
+ out["title"] = obj.title
560
560
  if obj.destination is not None:
561
561
  out["destination"] = asobj(obj.destination)
562
562
  if recurse:
@@ -447,7 +447,7 @@ class Document:
447
447
  if m is None:
448
448
  raise PDFSyntaxError(
449
449
  f"Not an indirect object at position {pos}: "
450
- f"{self.buffer[pos:pos+8]!r}"
450
+ f"{self.buffer[pos : pos + 8]!r}"
451
451
  )
452
452
  _, obj = next(self.parser)
453
453
  if obj.objid != objid:
@@ -695,8 +695,9 @@ class Document:
695
695
  try:
696
696
  page_object = dict_value(self[object_id])
697
697
  except IndexError as e:
698
- log.warning("Skipping missing page object: %s", e)
699
- continue
698
+ log.warning("Missing page object: %s", e)
699
+ # Create an empty page to match what pdfium does
700
+ page_object = {"Type": LIT("Page")}
700
701
 
701
702
  # Avoid recursion errors by keeping track of visited nodes
702
703
  # (again, this should never actually happen in a valid PDF)
@@ -88,7 +88,7 @@ class Font:
88
88
  fontname = resolve1(descriptor.get("FontName"))
89
89
  if isinstance(fontname, PSLiteral):
90
90
  self.fontname = literal_name(fontname)
91
- elif isinstance(fontname, (bytes, str)):
91
+ elif isinstance(fontname, bytes):
92
92
  self.fontname = decode_text(fontname)
93
93
  else:
94
94
  self.fontname = "unknown"
@@ -532,16 +532,16 @@ class CIDFont(Font):
532
532
  # These are *supposed* to be ASCII (PDF 1.7 section 9.7.3),
533
533
  # but for whatever reason they are sometimes UTF-16BE
534
534
  cid_registry = resolve1(self.cidsysteminfo.get("Registry"))
535
- if isinstance(cid_registry, (str, bytes)):
536
- cid_registry = decode_text(cid_registry)
535
+ if isinstance(cid_registry, bytes):
536
+ regstr = decode_text(cid_registry).strip()
537
537
  else:
538
- cid_registry = "unknown"
538
+ regstr = "unknown"
539
539
  cid_ordering = resolve1(self.cidsysteminfo.get("Ordering"))
540
- if isinstance(cid_ordering, (str, bytes)):
541
- cid_ordering = decode_text(cid_ordering)
540
+ if isinstance(cid_ordering, bytes):
541
+ ordstr = decode_text(cid_ordering).strip()
542
542
  else:
543
- cid_ordering = "unknown"
544
- self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
543
+ ordstr = "unknown"
544
+ self.cidcoding = f"{regstr}-{ordstr}"
545
545
  self.cmap: CMapBase = self.get_cmap_from_spec(spec)
546
546
 
547
547
  try:
@@ -337,7 +337,7 @@ def write_cmyk_tiff(
337
337
 
338
338
  # 6. --- Write the Actual Pixel Data ---
339
339
  # The current file position should now match `offset_image_data`
340
- assert (
341
- outfh.tell() == offset_image_data
342
- ), f"File position mismatch: at {outfh.tell()}, expected {offset_image_data}"
340
+ assert outfh.tell() == offset_image_data, (
341
+ f"File position mismatch: at {outfh.tell()}, expected {offset_image_data}"
342
+ )
343
343
  outfh.write(data)
@@ -19,7 +19,6 @@ from typing import (
19
19
  Tuple,
20
20
  Union,
21
21
  Sequence,
22
- cast,
23
22
  )
24
23
 
25
24
  from playa.color import PREDEFINED_COLORSPACE, ColorSpace, get_colorspace
@@ -58,7 +57,7 @@ from playa.pdftypes import (
58
57
  resolve1,
59
58
  stream_value,
60
59
  )
61
- from playa.utils import decode_text, mult_matrix
60
+ from playa.utils import mult_matrix
62
61
  from playa.worker import _deref_document
63
62
 
64
63
  if TYPE_CHECKING:
@@ -440,7 +439,7 @@ class LazyInterpreter:
440
439
  # Inline images are not XObjects, have no xobjid
441
440
  return self.render_image(None, obj)
442
441
  else:
443
- # FIXME: Do... something?
442
+ log.warning("EI has unknown argument type: %r", obj)
444
443
  return None
445
444
 
446
445
  def do_Do(self, xobjid_arg: PDFObject) -> Union[ContentObject, None]:
@@ -452,8 +451,7 @@ class LazyInterpreter:
452
451
  log.debug("Undefined xobject id: %r", xobjid)
453
452
  return None
454
453
  except TypeError as e:
455
- log.debug("Empty or invalid xobject with id %r: %s", xobjid, e)
456
- return None
454
+ raise TypeError(f"Empty or invalid xobject with id {xobjid!r}") from e
457
455
  subtype = xobj.get("Subtype")
458
456
  if subtype is LITERAL_FORM:
459
457
  # PDF Ref 1.7, # 4.9
@@ -530,7 +528,15 @@ class LazyInterpreter:
530
528
  f1: PDFObject,
531
529
  ) -> None:
532
530
  """Concatenate matrix to current transformation matrix"""
533
- self.ctm = mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm)
531
+ cm = (
532
+ num_value(a1),
533
+ num_value(b1),
534
+ num_value(c1),
535
+ num_value(d1),
536
+ num_value(e1),
537
+ num_value(f1),
538
+ )
539
+ self.ctm = mult_matrix(cm, self.ctm)
534
540
 
535
541
  def do_w(self, linewidth: PDFObject) -> None:
536
542
  """Set line width"""
@@ -557,8 +563,11 @@ class LazyInterpreter:
557
563
  """Set color rendering intent"""
558
564
  if self.ignore_colours:
559
565
  return
560
- # FIXME: Should actually be a (runtime checked) enum
561
- self.graphicstate.intent = cast(PSLiteral, intent)
566
+ if isinstance(intent, PSLiteral):
567
+ # Should possibly check that it is a valid intent
568
+ self.graphicstate.intent = intent
569
+ else:
570
+ raise TypeError(f"Not a name: {intent!r}")
562
571
 
563
572
  def do_i(self, flatness: PDFObject) -> None:
564
573
  """Set flatness tolerance"""
@@ -600,7 +609,12 @@ class LazyInterpreter:
600
609
  if isinstance(bm, PSLiteral):
601
610
  self.graphicstate.blend_mode = bm
602
611
  else:
603
- self.graphicstate.blend_mode = cast(List[PSLiteral], list_value(bm))
612
+ bml: List[PSLiteral] = []
613
+ for x in list_value(bm):
614
+ if isinstance(PSLiteral, x):
615
+ raise TypeError(f"Not a name: {x!r}")
616
+ bml.append(x)
617
+ self.graphicstate.blend_mode = bml
604
618
  if "SMask" in extgstate:
605
619
  smask = extgstate["SMask"]
606
620
  if isinstance(smask, PSLiteral):
@@ -883,8 +897,8 @@ class LazyInterpreter:
883
897
  e_new = tx * a + ty * c + e
884
898
  f_new = tx * b + ty * d + f
885
899
  self.textstate.line_matrix = (a, b, c, d, e_new, f_new)
886
- except TypeError:
887
- log.warning("Invalid offset (%r, %r) for Td", tx, ty)
900
+ except TypeError as e:
901
+ raise TypeError(f"Invalid offset ({tx!r}, {ty!r})") from e
888
902
  self.textstate.glyph_offset = (0, 0)
889
903
 
890
904
  def do_TD(self, tx: PDFObject, ty: PDFObject) -> None:
@@ -969,12 +983,14 @@ class LazyInterpreter:
969
983
  def begin_tag(self, tag: PDFObject, props: Dict[str, PDFObject]) -> None:
970
984
  """Handle beginning of tag, setting current MCID if any."""
971
985
  assert isinstance(tag, PSLiteral)
972
- tag = decode_text(tag.name)
973
986
  if "MCID" in props:
974
987
  mcid = int_value(props["MCID"])
975
988
  else:
976
989
  mcid = None
977
- self.mcstack = (*self.mcstack, MarkedContent(mcid=mcid, tag=tag, props=props))
990
+ self.mcstack = (
991
+ *self.mcstack,
992
+ MarkedContent(mcid=mcid, tag=tag.name, props=props),
993
+ )
978
994
 
979
995
  def do_BMC(self, tag: PDFObject) -> None:
980
996
  """Begin marked-content sequence"""