playa-pdf 0.8.1__tar.gz → 0.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (346) hide show
  1. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/CHANGELOG.md +24 -0
  2. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/PKG-INFO +117 -103
  3. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/README.md +115 -100
  4. playa_pdf-0.10.0/benchmarks/benchmark.sh +28 -0
  5. playa_pdf-0.10.0/benchmarks/latency.py +54 -0
  6. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/structure.py +1 -1
  7. playa_pdf-0.10.0/latency_stats.txt +3585 -0
  8. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/mkdocs.yml +3 -1
  9. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/_version.py +2 -2
  10. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/arcfour.py +3 -5
  11. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/ccitt.py +2 -1
  12. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cli.py +1 -2
  13. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmapdb.py +3 -2
  14. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/content.py +214 -109
  15. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/data/metadata.py +2 -2
  16. playa_pdf-0.10.0/playa/data_structures.py +114 -0
  17. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/document.py +563 -458
  18. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/font.py +9 -8
  19. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/image.py +3 -3
  20. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/interp.py +57 -41
  21. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/miner.py +21 -27
  22. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/page.py +28 -60
  23. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/parser.py +30 -13
  24. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/pdftypes.py +10 -10
  25. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/security.py +7 -7
  26. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/structure.py +37 -13
  27. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/utils.py +6 -10
  28. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/xref.py +94 -57
  29. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/pyproject.toml +15 -5
  30. playa_pdf-0.10.0/tests/latency_stats.py +68 -0
  31. playa_pdf-0.10.0/tests/test_data_structures.py +114 -0
  32. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_document.py +17 -8
  33. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_interp.py +5 -5
  34. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_miner.py +1 -0
  35. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_parser.py +1 -1
  36. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_pdftypes.py +4 -77
  37. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_structure.py +14 -9
  38. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_utils.py +5 -11
  39. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_xref.py +44 -40
  40. playa_pdf-0.8.1/benchmarks/benchmark.sh +0 -25
  41. playa_pdf-0.8.1/playa/data_structures.py +0 -88
  42. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/.flake8 +0 -0
  43. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/.gitattributes +0 -0
  44. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/.gitignore +0 -0
  45. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/.gitmodules +0 -0
  46. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/LICENSE +0 -0
  47. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/MANIFEST.in +0 -0
  48. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/ccitt_decode.py +0 -0
  49. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/converter.py +0 -0
  50. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/marked_content.py +0 -0
  51. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/miner.py +0 -0
  52. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/objects.py +0 -0
  53. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/parallel.py +0 -0
  54. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/parser.py +0 -0
  55. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/png_predict.py +0 -0
  56. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/text.py +0 -0
  57. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/tiff_predict.py +0 -0
  58. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/benchmarks/type3_charproc.py +0 -0
  59. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/docs/adobe-spiderman.jpg +0 -0
  60. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/docs/cli.md +0 -0
  61. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/docs/data.md +0 -0
  62. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/docs/index.md +0 -0
  63. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/docs/reference.md +0 -0
  64. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/docs/working-in-the-pdf-mine.md +0 -0
  65. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/__init__.py +0 -0
  66. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/_saslprep.py +0 -0
  67. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/ascii85.py +0 -0
  68. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78-EUC-H.pickle.gz +0 -0
  69. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78-EUC-V.pickle.gz +0 -0
  70. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78-H.pickle.gz +0 -0
  71. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78-RKSJ-H.pickle.gz +0 -0
  72. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78-RKSJ-V.pickle.gz +0 -0
  73. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78-V.pickle.gz +0 -0
  74. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78ms-RKSJ-H.pickle.gz +0 -0
  75. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/78ms-RKSJ-V.pickle.gz +0 -0
  76. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/83pv-RKSJ-H.pickle.gz +0 -0
  77. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/83pv-RKSJ-V.pickle.gz +0 -0
  78. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/90ms-RKSJ-H.pickle.gz +0 -0
  79. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/90ms-RKSJ-V.pickle.gz +0 -0
  80. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/90msp-RKSJ-H.pickle.gz +0 -0
  81. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/90msp-RKSJ-V.pickle.gz +0 -0
  82. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/90pv-RKSJ-H.pickle.gz +0 -0
  83. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/90pv-RKSJ-V.pickle.gz +0 -0
  84. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Add-H.pickle.gz +0 -0
  85. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Add-RKSJ-H.pickle.gz +0 -0
  86. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Add-RKSJ-V.pickle.gz +0 -0
  87. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Add-V.pickle.gz +0 -0
  88. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/B5-H.pickle.gz +0 -0
  89. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/B5-V.pickle.gz +0 -0
  90. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/B5pc-H.pickle.gz +0 -0
  91. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/B5pc-V.pickle.gz +0 -0
  92. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/CNS-EUC-H.pickle.gz +0 -0
  93. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/CNS-EUC-V.pickle.gz +0 -0
  94. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/CNS1-H.pickle.gz +0 -0
  95. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/CNS1-V.pickle.gz +0 -0
  96. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/CNS2-H.pickle.gz +0 -0
  97. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/CNS2-V.pickle.gz +0 -0
  98. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/ETHK-B5-H.pickle.gz +0 -0
  99. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/ETHK-B5-V.pickle.gz +0 -0
  100. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/ETen-B5-H.pickle.gz +0 -0
  101. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/ETen-B5-V.pickle.gz +0 -0
  102. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/ETenms-B5-H.pickle.gz +0 -0
  103. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/ETenms-B5-V.pickle.gz +0 -0
  104. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/EUC-H.pickle.gz +0 -0
  105. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/EUC-V.pickle.gz +0 -0
  106. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Ext-H.pickle.gz +0 -0
  107. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Ext-RKSJ-H.pickle.gz +0 -0
  108. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Ext-RKSJ-V.pickle.gz +0 -0
  109. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Ext-V.pickle.gz +0 -0
  110. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GB-EUC-H.pickle.gz +0 -0
  111. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GB-EUC-V.pickle.gz +0 -0
  112. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GB-H.pickle.gz +0 -0
  113. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GB-V.pickle.gz +0 -0
  114. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBK-EUC-H.pickle.gz +0 -0
  115. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBK-EUC-V.pickle.gz +0 -0
  116. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBK2K-H.pickle.gz +0 -0
  117. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBK2K-V.pickle.gz +0 -0
  118. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBKp-EUC-H.pickle.gz +0 -0
  119. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBKp-EUC-V.pickle.gz +0 -0
  120. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBT-EUC-H.pickle.gz +0 -0
  121. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBT-EUC-V.pickle.gz +0 -0
  122. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBT-H.pickle.gz +0 -0
  123. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBT-V.pickle.gz +0 -0
  124. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBTpc-EUC-H.pickle.gz +0 -0
  125. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBTpc-EUC-V.pickle.gz +0 -0
  126. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBpc-EUC-H.pickle.gz +0 -0
  127. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/GBpc-EUC-V.pickle.gz +0 -0
  128. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/H.pickle.gz +0 -0
  129. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKdla-B5-H.pickle.gz +0 -0
  130. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKdla-B5-V.pickle.gz +0 -0
  131. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKdlb-B5-H.pickle.gz +0 -0
  132. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKdlb-B5-V.pickle.gz +0 -0
  133. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKgccs-B5-H.pickle.gz +0 -0
  134. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKgccs-B5-V.pickle.gz +0 -0
  135. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKm314-B5-H.pickle.gz +0 -0
  136. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKm314-B5-V.pickle.gz +0 -0
  137. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKm471-B5-H.pickle.gz +0 -0
  138. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKm471-B5-V.pickle.gz +0 -0
  139. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKscs-B5-H.pickle.gz +0 -0
  140. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/HKscs-B5-V.pickle.gz +0 -0
  141. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Hankaku-H.pickle.gz +0 -0
  142. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Hankaku-V.pickle.gz +0 -0
  143. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Hiragana-H.pickle.gz +0 -0
  144. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Hiragana-V.pickle.gz +0 -0
  145. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSC-EUC-H.pickle.gz +0 -0
  146. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSC-EUC-V.pickle.gz +0 -0
  147. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSC-H.pickle.gz +0 -0
  148. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSC-Johab-H.pickle.gz +0 -0
  149. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSC-Johab-V.pickle.gz +0 -0
  150. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSC-V.pickle.gz +0 -0
  151. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSCms-UHC-H.pickle.gz +0 -0
  152. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSCms-UHC-HW-H.pickle.gz +0 -0
  153. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSCms-UHC-HW-V.pickle.gz +0 -0
  154. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSCms-UHC-V.pickle.gz +0 -0
  155. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSCpc-EUC-H.pickle.gz +0 -0
  156. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/KSCpc-EUC-V.pickle.gz +0 -0
  157. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Katakana-H.pickle.gz +0 -0
  158. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Katakana-V.pickle.gz +0 -0
  159. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Makefile +0 -0
  160. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/NWP-H.pickle.gz +0 -0
  161. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/NWP-V.pickle.gz +0 -0
  162. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/README.txt +0 -0
  163. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/RKSJ-H.pickle.gz +0 -0
  164. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/RKSJ-V.pickle.gz +0 -0
  165. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Roman-H.pickle.gz +0 -0
  166. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/Roman-V.pickle.gz +0 -0
  167. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniAKR-UTF16-H.pickle.gz +0 -0
  168. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniAKR-UTF16-V.pickle.gz +0 -0
  169. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniAKR-UTF32-H.pickle.gz +0 -0
  170. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniAKR-UTF32-V.pickle.gz +0 -0
  171. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniAKR-UTF8-H.pickle.gz +0 -0
  172. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniAKR-UTF8-V.pickle.gz +0 -0
  173. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UCS2-H.pickle.gz +0 -0
  174. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UCS2-V.pickle.gz +0 -0
  175. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UTF16-H.pickle.gz +0 -0
  176. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UTF16-V.pickle.gz +0 -0
  177. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UTF32-H.pickle.gz +0 -0
  178. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UTF32-V.pickle.gz +0 -0
  179. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UTF8-H.pickle.gz +0 -0
  180. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniCNS-UTF8-V.pickle.gz +0 -0
  181. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UCS2-H.pickle.gz +0 -0
  182. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UCS2-V.pickle.gz +0 -0
  183. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UTF16-H.pickle.gz +0 -0
  184. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UTF16-V.pickle.gz +0 -0
  185. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UTF32-H.pickle.gz +0 -0
  186. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UTF32-V.pickle.gz +0 -0
  187. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UTF8-H.pickle.gz +0 -0
  188. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniGB-UTF8-V.pickle.gz +0 -0
  189. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UCS2-H.pickle.gz +0 -0
  190. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UCS2-HW-H.pickle.gz +0 -0
  191. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UCS2-HW-V.pickle.gz +0 -0
  192. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UCS2-V.pickle.gz +0 -0
  193. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UTF16-H.pickle.gz +0 -0
  194. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UTF16-V.pickle.gz +0 -0
  195. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UTF32-H.pickle.gz +0 -0
  196. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UTF32-V.pickle.gz +0 -0
  197. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UTF8-H.pickle.gz +0 -0
  198. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS-UTF8-V.pickle.gz +0 -0
  199. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS2004-UTF16-H.pickle.gz +0 -0
  200. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS2004-UTF16-V.pickle.gz +0 -0
  201. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS2004-UTF32-H.pickle.gz +0 -0
  202. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS2004-UTF32-V.pickle.gz +0 -0
  203. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS2004-UTF8-H.pickle.gz +0 -0
  204. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJIS2004-UTF8-V.pickle.gz +0 -0
  205. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJISX0213-UTF32-H.pickle.gz +0 -0
  206. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJISX0213-UTF32-V.pickle.gz +0 -0
  207. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJISX02132004-UTF32-H.pickle.gz +0 -0
  208. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniJISX02132004-UTF32-V.pickle.gz +0 -0
  209. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UCS2-H.pickle.gz +0 -0
  210. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UCS2-V.pickle.gz +0 -0
  211. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UTF16-H.pickle.gz +0 -0
  212. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UTF16-V.pickle.gz +0 -0
  213. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UTF32-H.pickle.gz +0 -0
  214. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UTF32-V.pickle.gz +0 -0
  215. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UTF8-H.pickle.gz +0 -0
  216. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniKS-UTF8-V.pickle.gz +0 -0
  217. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniManga-UTF16-H.pickle.gz +0 -0
  218. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniManga-UTF16-V.pickle.gz +0 -0
  219. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniManga-UTF32-H.pickle.gz +0 -0
  220. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniManga-UTF32-V.pickle.gz +0 -0
  221. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniManga-UTF8-H.pickle.gz +0 -0
  222. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/UniManga-UTF8-V.pickle.gz +0 -0
  223. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/V.pickle.gz +0 -0
  224. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/WP-Symbol-H.pickle.gz +0 -0
  225. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/WP-Symbol-V.pickle.gz +0 -0
  226. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/to-unicode-Adobe-CNS1.pickle.gz +0 -0
  227. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/to-unicode-Adobe-GB1.pickle.gz +0 -0
  228. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/to-unicode-Adobe-Japan1.pickle.gz +0 -0
  229. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/to-unicode-Adobe-KR.pickle.gz +0 -0
  230. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/to-unicode-Adobe-Korea1.pickle.gz +0 -0
  231. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/cmap/to-unicode-Adobe-Manga1.pickle.gz +0 -0
  232. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/color.py +0 -0
  233. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/data/__init__.py +0 -0
  234. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/data/_asobj.py +0 -0
  235. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/data/content.py +0 -0
  236. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/encodingdb.py +0 -0
  237. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/encodings.py +0 -0
  238. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/exceptions.py +0 -0
  239. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/fontmetrics.py +0 -0
  240. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/fontprogram.py +0 -0
  241. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/glyphlist.py +0 -0
  242. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/lzw.py +0 -0
  243. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/outline.py +0 -0
  244. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/py.typed +0 -0
  245. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/runlength.py +0 -0
  246. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/playa/worker.py +0 -0
  247. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/README +0 -0
  248. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/acroform/AcroForm_TEST.pdf +0 -0
  249. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/acroform/AcroForm_TEST_compiled.pdf +0 -0
  250. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/actualtext.pdf +0 -0
  251. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/ascii_tounicode.pdf +0 -0
  252. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/broken_xobjects.pdf +0 -0
  253. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/character_spacing.pdf +0 -0
  254. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/character_spacing_glyphs.json +0 -0
  255. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/character_spacing_texts.json +0 -0
  256. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/core_font_encodings.pdf +0 -0
  257. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/duplicate_encoding_tounicode.pdf +0 -0
  258. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/aes-128-m.pdf +0 -0
  259. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/aes-128.pdf +0 -0
  260. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/aes-256-m.pdf +0 -0
  261. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/aes-256-r6.pdf +0 -0
  262. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/aes-256.pdf +0 -0
  263. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/base.pdf +0 -0
  264. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/encrypted_doc_no_id.pdf +0 -0
  265. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/rc4-128.pdf +0 -0
  266. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/encryption/rc4-40.pdf +0 -0
  267. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/evil_cmap.pdf +0 -0
  268. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/evil_xobjects.pdf +0 -0
  269. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/extgstate.pdf +0 -0
  270. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/font-size-test.pdf +0 -0
  271. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/graphics_state_in_text_object.pdf +0 -0
  272. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/hello_structure.pdf +6 -6
  273. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/image_structure.pdf +0 -0
  274. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/jo.pdf +0 -0
  275. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/junk_before_header.pdf +0 -0
  276. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/missing_rolemap.pdf +0 -0
  277. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/multi-xrefs.pdf +0 -0
  278. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/pdf_structure.pdf +0 -0
  279. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotated.pdf +0 -0
  280. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotated_type3_fonts.pdf +0 -0
  281. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/0.pdf +0 -0
  282. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/0mb.pdf +0 -0
  283. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/180.pdf +0 -0
  284. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/180mb.pdf +0 -0
  285. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/270.pdf +0 -0
  286. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/270mb.pdf +0 -0
  287. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/90.pdf +0 -0
  288. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/rotation/90mb.pdf +0 -0
  289. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/sampleOneByteIdentityEncode.pdf +0 -0
  290. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/scancode/patchelf.pdf +0 -0
  291. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/simple1.pdf +0 -0
  292. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/simple2.pdf +0 -0
  293. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/simple3.pdf +0 -0
  294. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/simple3_glyphs.json +0 -0
  295. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/simple3_texts.json +0 -0
  296. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/simple4.pdf +0 -0
  297. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/simple5.pdf +0 -0
  298. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/structure_xobjects.pdf +0 -0
  299. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/structure_xobjects_2.pdf +0 -0
  300. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/test_pdf_with_tiff_predictor.pdf +0 -0
  301. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/text_displacement.pdf +0 -0
  302. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/text_side_effects.pdf +0 -0
  303. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/text_space.pdf +0 -0
  304. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/type3_fonts.pdf +0 -0
  305. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/uncoloured-tiling-pattern.pdf +0 -0
  306. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/utf16_tounicode.pdf +0 -0
  307. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/utf8_tounicode.pdf +0 -0
  308. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/vertical_writing.pdf +0 -0
  309. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/vertical_writing_glyphs.json +0 -0
  310. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/vertical_writing_offset.pdf +0 -0
  311. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/vertical_writing_offset_glyphs.json +0 -0
  312. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/vertical_writing_offset_texts.json +0 -0
  313. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/vertical_writing_texts.json +0 -0
  314. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/xobject_graphicstate.pdf +0 -0
  315. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/samples/zen_of_python_corrupted.pdf +0 -0
  316. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/__init__.py +0 -0
  317. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/bad_operators.pdf +0 -0
  318. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/bad_pages.pdf +0 -0
  319. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/bad_resources.pdf +0 -0
  320. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/cmap-encoding.txt +0 -0
  321. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/cmap-onebyte-encoding.txt +0 -0
  322. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/cmap-tounicode.txt +0 -0
  323. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/data.py +0 -0
  324. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/fallback-xref.pdf +0 -0
  325. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/issue18117-encoding.txt +0 -0
  326. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/issue18117-tounicode.txt +0 -0
  327. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/issue9367-tounicode.txt +0 -0
  328. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_cli.py +0 -0
  329. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_cmapdb.py +0 -0
  330. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_crypto.py +0 -0
  331. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_data.py +0 -0
  332. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_encodingdb.py +0 -0
  333. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_fonts.py +0 -0
  334. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_indirect_objects.py +0 -0
  335. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_lazy_api.py +0 -0
  336. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_lexer.py +0 -0
  337. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_open.py +0 -0
  338. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_outline.py +0 -0
  339. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_page.py +0 -0
  340. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_parallel.py +0 -0
  341. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_pdfminer_ccitt.py +0 -0
  342. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tests/test_text.py +0 -0
  343. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tools/conv_afm.py +0 -0
  344. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tools/conv_cmap.py +0 -0
  345. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tools/conv_glyphlist.py +0 -0
  346. {playa_pdf-0.8.1 → playa_pdf-0.10.0}/tools/create_json_schema.py +0 -0
@@ -1,3 +1,27 @@
1
+ ## PLAYA 0.10.0: 2026-02-04
2
+
3
+ - Load xref tables lazily
4
+ - Iterate lazily over page lists
5
+ - Support iterating over marked content sections in logical or page
6
+ order
7
+ - Insert blank pages for all invalid entries in page tree
8
+ - Restore Python 3.8 compatibility
9
+ - BREAKING CHANGE: `Document`, `NameTree` and `NumberTree` are now
10
+ proper `collections.abc` Mappings, so you need `items()` to get
11
+ `(key, value)` pairs
12
+ - BREAKING CHANGE: Undefined object IDs now raise `KeyError` and not
13
+ `IndexError`
14
+ - BREAKING CHANGE: `marked_contents` now contain empty iterables
15
+ instead of `None` for empty marked content sections
16
+ - BREAKING CHANGE: `mcid_texts` no longer exists
17
+
18
+ ## PLAYA 0.9.0: 2026-01-08
19
+
20
+ - Refactor and add convenience methods to text objects
21
+ - Insert blank pages for missing object references in page tree
22
+ - Clean up type annotations (breaking change: PDFObject can no longer
23
+ be `str`, as the parser will never create this)
24
+
1
25
  ## PLAYA 0.8.1: 2025-12-22
2
26
 
3
27
  - Correct subtle issues with mypyc-compiled pdfminer.six code
@@ -1,17 +1,16 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: playa-pdf
3
- Version: 0.8.1
3
+ Version: 0.10.0
4
4
  Summary: Parallel and LazY Analyzer for PDFs
5
5
  Project-URL: Homepage, https://dhdaines.github.io/playa
6
6
  Author-email: David Huggins-Daines <dhd@ecolingui.ca>
7
- License: MIT
7
+ License-Expression: MIT
8
8
  License-File: LICENSE
9
9
  Keywords: pdf parser,text mining
10
10
  Classifier: Development Status :: 4 - Beta
11
11
  Classifier: Environment :: Console
12
12
  Classifier: Intended Audience :: Developers
13
13
  Classifier: Intended Audience :: Science/Research
14
- Classifier: License :: OSI Approved :: MIT License
15
14
  Classifier: Programming Language :: Python
16
15
  Classifier: Programming Language :: Python :: 3 :: Only
17
16
  Classifier: Programming Language :: Python :: 3.8
@@ -131,63 +130,56 @@ place! Let's open up a PDF and see what's in it:
131
130
  pdf = playa.open("my_awesome_document.pdf")
132
131
  raw_byte_stream = pdf.buffer
133
132
  a_bunch_of_tokens = list(pdf.tokens)
134
- a_bunch_of_indirect_objects = list(pdf)
133
+ a_bunch_of_indirect_object_ids = list(pdf.keys())
134
+ a_bunch_of_indirect_objects = list(pdf.values())
135
+ a_bunch_of_pages = list(pdf.pages)
135
136
  ```
136
137
 
137
- The raw PDF tokens and objects are probably not terribly useful to
138
- you, but you might find them interesting. Note that these are
139
- "indirect objects" where the actual object is accompanied by an object
140
- number and generation number:
138
+ Yes, a [`Document`](https://dhdaines.github.io/playa/latest/reference#playa.document.Document) is fundamentally a
139
+ [`Mapping`](https://docs.python.org/3/library/collections.abc.html#collections.abc.Mapping) of object IDs to objects, which
140
+ are represented to the extent possible by native Python objects.
141
+ These may not be terribly useful to you, but you might find them
142
+ interesting. Note that these are "indirect objects" where the actual
143
+ object is accompanied by an object number and "generation number". If
144
+ you wish to find **all** the objects in a PDF file, then you will need
145
+ to iterate over the [`objects`](https://dhdaines.github.io/playa/latest/reference#playa.document.Document.objects) property:
141
146
 
142
147
  ```python
143
- for objid, genno, obj in pdf:
144
- ...
145
- # or also
146
- for obj in pdf:
147
- obj.objid, obj.genno, obj.obj
148
- ```
149
-
150
- Also, these will only be the top-level objects and not those found
151
- inside object streams (the streams are themselves indirect objects).
152
- You can iterate over all indirect objects including object streams
153
- using the `objects` property:
154
-
155
- ```python
156
- for obj in pdf.objects:
157
- obj.objid, obj.genno, obj.obj
148
+ for indobj in pdf.objects:
149
+ objid, genno, obj = indobj
158
150
  ```
159
151
 
160
- In this case it is possible you will encounter multiple objects with
161
- the same `objid` due to the "incremental updates" feature of PDF.
162
- Currently, iterating over the objects in a particular stream is
163
- possible, but complicated.
164
-
165
- You can also access indirect objects by number (this will return the
166
- object with most recent generation number):
152
+ It is possible you will encounter multiple objects with the same
153
+ `objid` due to the "incremental updates" feature of PDF. As expected,
154
+ you can subscript the document to access indirect objects by number
155
+ (this will return the object with the most recent generation number):
167
156
 
168
157
  ```python
169
158
  a_particular_object = pdf[42]
170
159
  ```
171
160
 
172
- Your PDF document probably has some pages. How many? What are their
173
- numbers/labels? They could be things like "xvi" (pronounced
174
- "gzvee"), 'a", or "42", for instance!
161
+ Your PDF document probably has some [pages](https://dhdaines.github.io/playa/latest/reference#playa.document.PageList).
162
+ How many? What are their numbers/labels? They could be things like
163
+ "xvi" (pronounced "gzvee"), "a", or "42", for instance!
175
164
 
176
165
  ```python
177
166
  npages = len(pdf.pages)
178
167
  page_numbers = [page.label for page in pdf.pages]
179
168
  ```
180
169
 
181
- You can also subscript `pdf.pages` in various other ways, using a
182
- slice or an iterable of `int`, which will give you a page list object
183
- that behaves similarly to `pdf.pages`. Pages and page lists can refer
184
- back to their document (using weak reference magic to avoid memory
185
- leaks) with the `doc` property.
170
+ You can also subscript [`pages`](https://dhdaines.github.io/playa/latest/reference#playa.document.Document.pages) in
171
+ various other ways, using a slice or an iterable of `int`, which
172
+ will give you a new page list object that behaves similarly. Pages
173
+ and page lists can refer back to their document (using weak reference
174
+ magic to avoid memory leaks) with their
175
+ [`doc`](https://dhdaines.github.io/playa/latest/reference#playa.document.PageList.doc) property.
186
176
 
187
177
  ## Some (by no means all) helpful metadata
188
178
 
189
- A PDF often contains a "document outline" which is a sequence of trees
190
- representing the coarse-grained logical structure of the document.
179
+ A PDF often contains a ["document outline"](https://dhdaines.github.io/playa/latest/reference#playa.outline.Outline)
180
+ which is a sequence of trees representing the coarse-grained logical
181
+ structure of the document, accessible via the
182
+ [`outline`](https://dhdaines.github.io/playa/latest/reference#playa.document.Document.outline) property:
191
183
 
192
184
  ```python
193
185
  for entry in pdf.outline:
@@ -197,9 +189,10 @@ for entry in pdf.outline:
197
189
  ...
198
190
  ```
199
191
 
200
- If you are lucky it has a "logical structure tree". The elements here
201
- might even be referenced from the `outline` above! (or, they might
202
- not... with PDF you never know).
192
+ If you are lucky it has a ["logical structure
193
+ tree"](https://dhdaines.github.io/playa/latest/reference#playa.structure.Tree). The elements here might even be
194
+ referenced from the [`outline`](https://dhdaines.github.io/playa/latest/reference#playa.document.Document.outline)
195
+ above! (or, they might not... with PDF you never know).
203
196
 
204
197
  ```python
205
198
  for element in pdf.structure:
@@ -213,16 +206,18 @@ Now perhaps we want to look at a specific page. Okay! You can also
213
206
  look at its contents, more on that in a bit:
214
207
 
215
208
  ```python
216
- page = pdf.pages[0] # they are numbered from 0
217
- page = pdf.pages["xviii"] # but you can get them by label (a string)
218
- page = pdf.pages["42"] # or "logical" page number (also a string)
209
+ page = next(iter(pdf.pages)) # Fast and lazy way to get the first page
210
+ page = pdf.pages[0] # they are numbered from 0
211
+ page = pdf.pages["xviii"] # but you can get them by label (a string)
212
+ page = pdf.pages["42"] # or "logical" page number (also a string)
219
213
  print(f"Page {page.label} is {page.width} x {page.height}")
220
214
  ```
221
215
 
222
216
  Since PDF is at heart a page-oriented, presentation format, many types
223
217
  of metadata are mostly accessible via the page objects. For instance
224
- you can access the fonts used in page with, obviously, the `fonts`
225
- property, or the annotations via the `annotations` property.
218
+ you can access the fonts used in a page with, obviously, the
219
+ [`fonts`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.fonts) property, or the annotations via the
220
+ [`annotations`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.annotations) property.
226
221
 
227
222
  For example, annotations (internal or external links) are defined on
228
223
  pages (since their position would not make any sense otherwise).
@@ -235,17 +230,19 @@ for annot in page.annotations:
235
230
  ```
236
231
 
237
232
  The set of possible entries in annotation dictionaries (PDF 1.7 sect
238
- 12.5.2) is vast and confusing and inconsistently implemented, but you
239
- can always access them by their names (as defined in the PDF standard)
240
- via `annot.props`.
233
+ 12.5.2) is vast and confusing and inconsistently implemented. You can
234
+ access the raw annotation dictionary via `props` in the
235
+ [`Annotation`](https://dhdaines.github.io/playa/latest/reference#playa.page.Annotation) object.
241
236
 
242
237
  If the document has logical structure, then the pages will also have a
243
- slightly different form of logical structure. You can use the same
244
- `find` and `find_all` methods to get all of the enclosing structure
245
- elements of a given type (actually a role) for a page. So for
246
- instance if you wanted to get the text contents for all the cells in
247
- all the tables on a page, assuming the creator of said page was kind
248
- enough to check the "PDF/UA" box, you can do:
238
+ slightly different form of logical structure. You can use the
239
+ [`find`](https://dhdaines.github.io/playa/latest/reference#playa.structure.PageStructure.find) and
240
+ [`find_all`](https://dhdaines.github.io/playa/latest/reference#playa.structure.PageStructure.find_all) methods to get
241
+ all of the enclosing structure elements of a given type (actually a
242
+ role) for a page. So for instance if you wanted to get the text
243
+ contents for all the cells in all the tables on a page, assuming the
244
+ creator of said page was kind enough to check the "PDF/UA" box, you
245
+ can do:
249
246
 
250
247
  ```python
251
248
  for table in page.structure.find_all("Table"):
@@ -286,8 +283,9 @@ PLAYA allows you to take advantage of multiple CPUs, which can greatly
286
283
  speed up some operations on large documents. This parallelism
287
284
  currently operates at the page level since this is the most logical
288
285
  way to split up a PDF. To enable it, pass the `max_workers` argument
289
- to `playa.open` with the number of cores you wish to use (you can also
290
- explicitly pass `None` to use the maximum):
286
+ to [`playa.open`](https://dhdaines.github.io/playa/latest/reference/#playa.open)
287
+ with the number of cores you wish to use (you can also explicitly pass
288
+ `None` to use the maximum):
291
289
 
292
290
  ```python
293
291
  with playa.open(path, max_workers=4) as pdf:
@@ -295,7 +293,8 @@ with playa.open(path, max_workers=4) as pdf:
295
293
  ```
296
294
 
297
295
  Now, you can apply a function across the pages of the PDF in parallel
298
- using the `map` method of `pdf.pages`, for example:
296
+ using the [`map`](https://dhdaines.github.io/playa/latest/reference#playa.document.PageList.map) method of
297
+ [`pdf.pages`](https://dhdaines.github.io/playa/latest/reference#playa.document.Document.pages), for example:
299
298
 
300
299
  ```python
301
300
  def get_page_size(page: Page) -> Tuple[int, int]:
@@ -305,20 +304,20 @@ page_sizes = pdf.pages.map(get_page_size)
305
304
  ```
306
305
 
307
306
  You could also just do this for certain pages by subscripting
308
- `pdf.pages` (this can be a slice, an iterable of `int`, or a
309
- generator expression over `int` and/or `str`):
307
+ [`pdf.pages`](https://dhdaines.github.io/playa/latest/reference#playa.document.Document.pages) (this can be a slice, an
308
+ iterable of `int`, or a generator expression over `int` and/or `str`):
310
309
 
311
310
  ```python
312
311
  some_page_sizes = pdf.pages[2:5].map(get_page_size)
313
312
  ```
314
313
 
315
314
  There are some limitations to this, because it uses `multiprocessing`.
316
- The function you pass to `map` must be serializable by `pickle`, which
317
- in practice means that an inner function or lambda generally doesn't
318
- work. You can get around this in a very Java-like way by passing a
319
- callable object that encapsulates the necessary state. If you wish to
320
- avoid traumatising readers of your code, then use `functools.partial`
321
- instead:
315
+ The function you pass to `map` must be serializable by `pickle`,
316
+ which in practice means that an inner function or lambda generally
317
+ doesn't work. You can get around this in a very Java-like way by
318
+ passing a callable object that encapsulates the necessary state. If
319
+ you wish to avoid traumatising readers of your code, then use
320
+ `functools.partial` instead:
322
321
 
323
322
  ```python
324
323
  pdf.pages.map(partial(myfunc, arg1=value1, arg2=value2))
@@ -327,9 +326,9 @@ pdf.pages.map(partial(myfunc, arg1=value1, arg2=value2))
327
326
  Also, any value returned by your function must also be serializable.
328
327
  There is a bit of magic that enables this to work for PDF objects
329
328
  containing indirect object references, so you should be able to, for
330
- instance, get the `dests` or `annots` from every page without any
331
- trouble. But if you have your own complex objects that you return you
332
- may encounter problems (or slowness).
329
+ instance, get the [`annotations`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.annotations) from
330
+ every page without any trouble. But if you have your own complex
331
+ objects that you return you may encounter problems (or slowness).
333
332
 
334
333
  ## An important note about coordinate spaces
335
334
 
@@ -357,7 +356,7 @@ device space, specifically:
357
356
  the bottom-right corner.
358
357
 
359
358
  However, for compatibility with `pdfminer.six`, you can also pass
360
- `space="page"` to `playa.open`. In this case, `(0, 0)` is the
359
+ `space="page"` to [`playa.open`](https://dhdaines.github.io/playa/latest/reference/#playa.open). In this case, `(0, 0)` is the
361
360
  bottom-left corner of the page as defined by the `MediaBox`, after
362
361
  rotation, and coordinates increase from the bottom-left corner of the
363
362
  page towards the top-right, as they do in PDF user space.
@@ -430,20 +429,28 @@ Note that though it's called a "stack", it's actually a tuple. This
430
429
  means that it is immutable, and you can check if it has changed from
431
430
  one object to the next using the `is` operator.
432
431
 
433
- All content objects can also refer back to their containing `Page`
434
- from the `page` property. This uses weak reference magic in order to
435
- avoid causing memory leaks.
432
+ All content objects can also refer back to their containing
433
+ [`Page`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page) from the `page` property. This uses weak
434
+ reference magic in order to avoid causing memory leaks.
436
435
 
437
436
  ### Form XObjects
438
437
 
439
438
  A PDF page may also contain "Form XObjects" which are like tiny
440
439
  embedded PDF documents (they have nothing to do with fillable forms).
441
- Simply iterating over a `Page` **will not expand these for you** which
442
- may be a source of surprise, but you can recurse into them with the
443
- `flatten` method, or with the convenience properties `paths`,
444
- `images`, `texts` and `glyphs`. You can also identify them in
445
- iteration because they have `object_type == "xobject"`. The layout
446
- objects inside are accessible by iteration, as with pages:
440
+ Simply iterating over a
441
+ [`Page`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page)
442
+ **will not expand these for you** which may be a source of surprise,
443
+ but you can recurse into them with the
444
+ [`flatten`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.flatten)
445
+ method, or with the convenience properties
446
+ [`paths`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.paths),
447
+ [`images`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.images),
448
+ [`texts`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.texts)
449
+ and
450
+ [`glyphs`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.glyphs).
451
+ You can also identify them in iteration because they have `object_type
452
+ == "xobject"`. The layout objects inside are accessible by iteration,
453
+ as with pages:
447
454
 
448
455
  ```python
449
456
  for obj in page:
@@ -453,8 +460,9 @@ for obj in page:
453
460
  ```
454
461
 
455
462
  You can also iterate over them in the page context with
456
- `page.xobjects` (this will also find Form XObjects contained inside
457
- other Form XObjects, which is unfortunately a thing):
463
+ [`page.xobjects`](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.xobjects) (this will also find Form
464
+ XObjects contained inside other Form XObjects, which is unfortunately
465
+ a thing):
458
466
 
459
467
  ```python
460
468
  for xobj in page.xobjects:
@@ -463,9 +471,9 @@ for xobj in page.xobjects:
463
471
  ```
464
472
 
465
473
  Exceptionally, these have a few more features than the ordinary
466
- `ContentObject` - you can look at their raw stream contents as well as
467
- the tokens, and you can also see raw, mysterious PDF objects with
468
- `contents`.
474
+ [`ContentObject`](https://dhdaines.github.io/playa/latest/reference#playa.content.ContentObject) - you can look at their
475
+ raw stream contents as well as the tokens, and you can also see raw,
476
+ mysterious PDF objects with `contents`.
469
477
 
470
478
  ### Graphics state
471
479
 
@@ -474,9 +482,10 @@ of what PDF refers to as the *graphics state*, which is accessible
474
482
  through `obj.gstate`. This is a mutable object, and since there are
475
483
  quite a few parameters in the graphics state, PLAYA does not create a
476
484
  copy of it for every object in the layout. If you wish to reuse these
477
- objects, you should call `finalize` on them, which will freeze the
478
- graphics state and any other necessary context, allowing the object to
479
- be stored and reused *as long as the document exists*:
485
+ objects, you should call
486
+ [`finalize`](https://dhdaines.github.io/playa/latest/reference#playa.content.ContentObject.finalize) on them, which will
487
+ freeze the graphics state and any other necessary context, allowing
488
+ the object to be stored and reused *as long as the document exists*:
480
489
 
481
490
  ```python
482
491
  for obj in page:
@@ -537,15 +546,18 @@ individual glyphs (which might or might not correspond to characters),
537
546
  this is not always what you want, and moreover it is computationally
538
547
  quite expensive. So PLAYA, by default, does not do this. If you
539
548
  don't need to know the actual bounding box of a text object, then
540
- don't access `obj.bbox` and it won't be computed. If you don't need
541
- to know the position of each glyph but simply want the Unicode
542
- characters, then just look at `obj.chars`.
549
+ don't access [`obj.bbox`](https://dhdaines.github.io/playa/latest/reference#playa.content.ContentObject.bbox) and it
550
+ won't be computed. If you don't need to know the position of each
551
+ glyph but simply want the Unicode characters, then just look at
552
+ [`obj.chars`](https://dhdaines.github.io/playa/latest/reference#playa.content.TextObject.chars).
543
553
 
544
- It is also important to understand that `obj.chars` may or may not
554
+ It is also important to understand that
555
+ [`obj.chars`](https://dhdaines.github.io/playa/latest/reference#playa.content.TextObject.chars) may or may not
545
556
  correspond to the actual text that a human will read on the page. To
546
557
  actually extract *text* from a PDF necessarily involves Heuristics or
547
- Machine Learning (yes, capitalized, like that) and PLAYA does not do
548
- either of those things.
558
+ Machine Learning. PLAYA has [some simple
559
+ heuristics](https://dhdaines.github.io/playa/latest/reference#playa.page.Page.extract_text) to do this, which will work
560
+ better with tagged and accessible PDFs, but don't expect miracles.
549
561
 
550
562
  This is because PDFs, especially ones produced by OCR, don't organize
551
563
  text objects in any meaningful fashion, so you will want to actually
@@ -566,8 +578,9 @@ to ignore glyphs with `glyph.gstate.render_mode == 3` (which means
566
578
  For text extraction you really don't care about the `bbox`, but you
567
579
  probably *do* care about the origin of each glyph relative to its
568
580
  neighbours. For this reason PLAYA provides you with two convenience
569
- properties, `origin` and `displacement`, which are considerably faster
570
- to compute than the `bbox`.
581
+ properties, [`origin`](https://dhdaines.github.io/playa/latest/reference#playa.content.TextBase.origin) and
582
+ [`displacement`](https://dhdaines.github.io/playa/latest/reference#playa.content.TextBase.displacement), which are
583
+ considerably faster to compute than the `bbox`.
571
584
 
572
585
  PLAYA doesn't guarantee that text objects come at you in anything
573
586
  other than the order they occur in the file (but it does guarantee
@@ -606,12 +619,13 @@ to eradicate the [numerous inconsistencies, contradictions, and
606
619
  ambiguities](https://github.com/pdf-association/pdf-issues) of the
607
620
  previous standard)
608
621
 
609
- In particular, we care **a lot** about marked content operators, because
610
- of the abovementioned `ActualText` property. For this reason a
611
- `TextObject` in PLAYA **does not** and **will never** correspond to a
612
- PDF text object as defined by the `BT` and `ET` operators. For the
613
- moment, every text-showing operator triggers a new `TextObject`. It
614
- is possible (though unlikely) that in the future, only changes in marked
622
+ In particular, we care **a lot** about marked content operators,
623
+ because of the abovementioned `ActualText` property. For this reason
624
+ a [`TextObject`](https://dhdaines.github.io/playa/latest/reference#playa.content.TextObject) in PLAYA **does not** and
625
+ **will never** correspond to a PDF text object as defined by the `BT`
626
+ and `ET` operators. For the moment, every text-showing operator
627
+ triggers a new [`TextObject`](https://dhdaines.github.io/playa/latest/reference#playa.content.TextObject). It is
628
+ possible (though unlikely) that in the future, only changes in marked
615
629
  content or graphics state will do this.
616
630
 
617
631
  ## Conclusion