polyfile-weave 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of polyfile-weave might be problematic. Click here for more details.

Files changed (585) hide show
  1. polyfile/__init__.py +15 -0
  2. polyfile/__main__.py +394 -0
  3. polyfile/arithmetic.py +27 -0
  4. polyfile/ast.py +114 -0
  5. polyfile/debugger.py +1039 -0
  6. polyfile/expressions.py +346 -0
  7. polyfile/fileutils.py +343 -0
  8. polyfile/html.py +135 -0
  9. polyfile/http/__init__.py +1 -0
  10. polyfile/http/defacto.py +37 -0
  11. polyfile/http/deprecated.py +51 -0
  12. polyfile/http/experimental.py +67 -0
  13. polyfile/http/http_11.py +548 -0
  14. polyfile/http/matcher.py +37 -0
  15. polyfile/http/structured_headers.py +48 -0
  16. polyfile/iterators.py +72 -0
  17. polyfile/jpeg.py +24 -0
  18. polyfile/kaitai/__init__.py +0 -0
  19. polyfile/kaitai/compiler.py +156 -0
  20. polyfile/kaitai/parser.py +312 -0
  21. polyfile/kaitai/parsers/__init__.py +0 -0
  22. polyfile/kaitai/parsers/aix_utmp.py +116 -0
  23. polyfile/kaitai/parsers/allegro_dat.py +367 -0
  24. polyfile/kaitai/parsers/andes_firmware.py +64 -0
  25. polyfile/kaitai/parsers/android_bootldr_asus.py +105 -0
  26. polyfile/kaitai/parsers/android_bootldr_huawei.py +181 -0
  27. polyfile/kaitai/parsers/android_bootldr_qcom.py +217 -0
  28. polyfile/kaitai/parsers/android_dto.py +138 -0
  29. polyfile/kaitai/parsers/android_img.py +319 -0
  30. polyfile/kaitai/parsers/android_nanoapp_header.py +83 -0
  31. polyfile/kaitai/parsers/android_opengl_shaders_cache.py +151 -0
  32. polyfile/kaitai/parsers/android_sparse.py +237 -0
  33. polyfile/kaitai/parsers/android_super.py +401 -0
  34. polyfile/kaitai/parsers/apm_partition_table.py +196 -0
  35. polyfile/kaitai/parsers/apple_single_double.py +180 -0
  36. polyfile/kaitai/parsers/asn1_der.py +235 -0
  37. polyfile/kaitai/parsers/au.py +138 -0
  38. polyfile/kaitai/parsers/avantes_roh60.py +112 -0
  39. polyfile/kaitai/parsers/avi.py +296 -0
  40. polyfile/kaitai/parsers/bcd.py +111 -0
  41. polyfile/kaitai/parsers/bitcoin_transaction.py +210 -0
  42. polyfile/kaitai/parsers/blender_blend.py +334 -0
  43. polyfile/kaitai/parsers/bmp.py +780 -0
  44. polyfile/kaitai/parsers/bson.py +411 -0
  45. polyfile/kaitai/parsers/btrfs_stream.py +318 -0
  46. polyfile/kaitai/parsers/bytes_with_io.py +27 -0
  47. polyfile/kaitai/parsers/chrome_pak.py +194 -0
  48. polyfile/kaitai/parsers/code_6502.py +456 -0
  49. polyfile/kaitai/parsers/compressed_resource.py +217 -0
  50. polyfile/kaitai/parsers/cpio_old_le.py +154 -0
  51. polyfile/kaitai/parsers/cramfs.py +344 -0
  52. polyfile/kaitai/parsers/creative_voice_file.py +342 -0
  53. polyfile/kaitai/parsers/dbf.py +274 -0
  54. polyfile/kaitai/parsers/dcmp_0.py +664 -0
  55. polyfile/kaitai/parsers/dcmp_1.py +422 -0
  56. polyfile/kaitai/parsers/dcmp_2.py +312 -0
  57. polyfile/kaitai/parsers/dcmp_variable_length_integer.py +66 -0
  58. polyfile/kaitai/parsers/dex.py +1086 -0
  59. polyfile/kaitai/parsers/dicom.py +4370 -0
  60. polyfile/kaitai/parsers/dime_message.py +201 -0
  61. polyfile/kaitai/parsers/dns_packet.py +569 -0
  62. polyfile/kaitai/parsers/doom_wad.py +654 -0
  63. polyfile/kaitai/parsers/dos_datetime.py +191 -0
  64. polyfile/kaitai/parsers/dos_mz.py +172 -0
  65. polyfile/kaitai/parsers/ds_store.py +513 -0
  66. polyfile/kaitai/parsers/dtb.py +310 -0
  67. polyfile/kaitai/parsers/dune_2_pak.py +126 -0
  68. polyfile/kaitai/parsers/edid.py +472 -0
  69. polyfile/kaitai/parsers/efivar_signature_list.py +331 -0
  70. polyfile/kaitai/parsers/elf.py +2482 -0
  71. polyfile/kaitai/parsers/ethernet_frame.py +114 -0
  72. polyfile/kaitai/parsers/exif.py +723 -0
  73. polyfile/kaitai/parsers/ext2.py +537 -0
  74. polyfile/kaitai/parsers/fallout2_dat.py +187 -0
  75. polyfile/kaitai/parsers/fallout_dat.py +156 -0
  76. polyfile/kaitai/parsers/fasttracker_xm_module.py +558 -0
  77. polyfile/kaitai/parsers/ftl_dat.py +90 -0
  78. polyfile/kaitai/parsers/genmidi_op2.py +161 -0
  79. polyfile/kaitai/parsers/gettext_mo.py +541 -0
  80. polyfile/kaitai/parsers/gif.py +492 -0
  81. polyfile/kaitai/parsers/gimp_brush.py +244 -0
  82. polyfile/kaitai/parsers/glibc_utmp.py +114 -0
  83. polyfile/kaitai/parsers/gltf_binary.py +132 -0
  84. polyfile/kaitai/parsers/google_protobuf.py +151 -0
  85. polyfile/kaitai/parsers/gpt_partition_table.py +175 -0
  86. polyfile/kaitai/parsers/gran_turismo_vol.py +140 -0
  87. polyfile/kaitai/parsers/grub2_font.py +337 -0
  88. polyfile/kaitai/parsers/gzip.py +232 -0
  89. polyfile/kaitai/parsers/hashcat_restore.py +60 -0
  90. polyfile/kaitai/parsers/hccap.py +111 -0
  91. polyfile/kaitai/parsers/hccapx.py +103 -0
  92. polyfile/kaitai/parsers/heaps_pak.py +177 -0
  93. polyfile/kaitai/parsers/heroes_of_might_and_magic_agg.py +116 -0
  94. polyfile/kaitai/parsers/heroes_of_might_and_magic_bmp.py +34 -0
  95. polyfile/kaitai/parsers/icmp_packet.py +136 -0
  96. polyfile/kaitai/parsers/ico.py +129 -0
  97. polyfile/kaitai/parsers/id3v1_1.py +220 -0
  98. polyfile/kaitai/parsers/id3v2_3.py +324 -0
  99. polyfile/kaitai/parsers/id3v2_4.py +423 -0
  100. polyfile/kaitai/parsers/ines.py +282 -0
  101. polyfile/kaitai/parsers/ipv4_packet.py +158 -0
  102. polyfile/kaitai/parsers/ipv6_packet.py +55 -0
  103. polyfile/kaitai/parsers/iso9660.py +544 -0
  104. polyfile/kaitai/parsers/java_class.py +1113 -0
  105. polyfile/kaitai/parsers/jpeg.py +361 -0
  106. polyfile/kaitai/parsers/luks.py +149 -0
  107. polyfile/kaitai/parsers/lzh.py +165 -0
  108. polyfile/kaitai/parsers/mac_os_resource_snd.py +493 -0
  109. polyfile/kaitai/parsers/mach_o.py +3033 -0
  110. polyfile/kaitai/parsers/mach_o_fat.py +92 -0
  111. polyfile/kaitai/parsers/magicavoxel_vox.py +391 -0
  112. polyfile/kaitai/parsers/manifest.json +1 -0
  113. polyfile/kaitai/parsers/mbr_partition_table.py +119 -0
  114. polyfile/kaitai/parsers/mcap.py +1015 -0
  115. polyfile/kaitai/parsers/microsoft_cfb.py +293 -0
  116. polyfile/kaitai/parsers/microsoft_network_monitor_v2.py +309 -0
  117. polyfile/kaitai/parsers/microsoft_pe.py +765 -0
  118. polyfile/kaitai/parsers/mifare_classic.py +706 -0
  119. polyfile/kaitai/parsers/minecraft_nbt.py +449 -0
  120. polyfile/kaitai/parsers/monomakh_sapr_chg.py +69 -0
  121. polyfile/kaitai/parsers/mozilla_mar.py +239 -0
  122. polyfile/kaitai/parsers/mp4.py +333 -0
  123. polyfile/kaitai/parsers/msgpack.py +467 -0
  124. polyfile/kaitai/parsers/nitf.py +1189 -0
  125. polyfile/kaitai/parsers/nt_mdt_pal.py +155 -0
  126. polyfile/kaitai/parsers/ogg.py +118 -0
  127. polyfile/kaitai/parsers/openpgp_message.py +993 -0
  128. polyfile/kaitai/parsers/packet_ppi.py +515 -0
  129. polyfile/kaitai/parsers/pcap.py +344 -0
  130. polyfile/kaitai/parsers/pcf_font.py +506 -0
  131. polyfile/kaitai/parsers/pcx.py +195 -0
  132. polyfile/kaitai/parsers/pcx_dcx.py +79 -0
  133. polyfile/kaitai/parsers/phar_without_stub.py +399 -0
  134. polyfile/kaitai/parsers/php_serialized_value.py +505 -0
  135. polyfile/kaitai/parsers/png.py +721 -0
  136. polyfile/kaitai/parsers/protocol_body.py +260 -0
  137. polyfile/kaitai/parsers/psx_tim.py +104 -0
  138. polyfile/kaitai/parsers/python_pickle.py +718 -0
  139. polyfile/kaitai/parsers/python_pyc_27.py +510 -0
  140. polyfile/kaitai/parsers/quake_mdl.py +441 -0
  141. polyfile/kaitai/parsers/quake_pak.py +112 -0
  142. polyfile/kaitai/parsers/quicktime_mov.py +634 -0
  143. polyfile/kaitai/parsers/rar.py +265 -0
  144. polyfile/kaitai/parsers/regf.py +569 -0
  145. polyfile/kaitai/parsers/renderware_binary_stream.py +877 -0
  146. polyfile/kaitai/parsers/resource_fork.py +611 -0
  147. polyfile/kaitai/parsers/respack.py +57 -0
  148. polyfile/kaitai/parsers/riff.py +409 -0
  149. polyfile/kaitai/parsers/rpm.py +964 -0
  150. polyfile/kaitai/parsers/rtcp_payload.py +579 -0
  151. polyfile/kaitai/parsers/rtp_packet.py +150 -0
  152. polyfile/kaitai/parsers/rtpdump.py +115 -0
  153. polyfile/kaitai/parsers/ruby_marshal.py +423 -0
  154. polyfile/kaitai/parsers/s3m.py +493 -0
  155. polyfile/kaitai/parsers/saints_row_2_vpp_pc.py +254 -0
  156. polyfile/kaitai/parsers/shapefile_index.py +174 -0
  157. polyfile/kaitai/parsers/shapefile_main.py +893 -0
  158. polyfile/kaitai/parsers/some_ip.py +209 -0
  159. polyfile/kaitai/parsers/some_ip_container.py +37 -0
  160. polyfile/kaitai/parsers/some_ip_sd.py +86 -0
  161. polyfile/kaitai/parsers/some_ip_sd_entries.py +160 -0
  162. polyfile/kaitai/parsers/some_ip_sd_options.py +374 -0
  163. polyfile/kaitai/parsers/specpr.py +404 -0
  164. polyfile/kaitai/parsers/sqlite3.py +472 -0
  165. polyfile/kaitai/parsers/ssh_public_key.py +252 -0
  166. polyfile/kaitai/parsers/standard_midi_file.py +390 -0
  167. polyfile/kaitai/parsers/stl.py +111 -0
  168. polyfile/kaitai/parsers/sudoers_ts.py +201 -0
  169. polyfile/kaitai/parsers/swf.py +406 -0
  170. polyfile/kaitai/parsers/systemd_journal.py +361 -0
  171. polyfile/kaitai/parsers/tcp_segment.py +57 -0
  172. polyfile/kaitai/parsers/tga.py +213 -0
  173. polyfile/kaitai/parsers/tls_client_hello.py +293 -0
  174. polyfile/kaitai/parsers/tr_dos_image.py +322 -0
  175. polyfile/kaitai/parsers/tsm.py +198 -0
  176. polyfile/kaitai/parsers/ttf.py +1847 -0
  177. polyfile/kaitai/parsers/udp_datagram.py +42 -0
  178. polyfile/kaitai/parsers/uefi_te.py +236 -0
  179. polyfile/kaitai/parsers/uimage.py +198 -0
  180. polyfile/kaitai/parsers/utf8_string.py +137 -0
  181. polyfile/kaitai/parsers/vfat.py +410 -0
  182. polyfile/kaitai/parsers/vlq_base128_be.py +104 -0
  183. polyfile/kaitai/parsers/vlq_base128_le.py +129 -0
  184. polyfile/kaitai/parsers/vmware_vmdk.py +167 -0
  185. polyfile/kaitai/parsers/vp8_ivf.py +112 -0
  186. polyfile/kaitai/parsers/warcraft_2_pud.py +423 -0
  187. polyfile/kaitai/parsers/wav.py +1014 -0
  188. polyfile/kaitai/parsers/websocket.py +167 -0
  189. polyfile/kaitai/parsers/windows_evt_log.py +304 -0
  190. polyfile/kaitai/parsers/windows_lnk_file.py +467 -0
  191. polyfile/kaitai/parsers/windows_minidump.py +575 -0
  192. polyfile/kaitai/parsers/windows_resource_file.py +243 -0
  193. polyfile/kaitai/parsers/windows_shell_items.py +190 -0
  194. polyfile/kaitai/parsers/windows_systemtime.py +52 -0
  195. polyfile/kaitai/parsers/wmf.py +502 -0
  196. polyfile/kaitai/parsers/xar.py +181 -0
  197. polyfile/kaitai/parsers/xwd.py +189 -0
  198. polyfile/kaitai/parsers/zip.py +685 -0
  199. polyfile/kaitai/parsers/zisofs.py +158 -0
  200. polyfile/kaitai/parsers/zx_spectrum_tap.py +184 -0
  201. polyfile/kaitaimatcher.py +113 -0
  202. polyfile/languagematcher.py +217 -0
  203. polyfile/logger.py +135 -0
  204. polyfile/magic.py +2983 -0
  205. polyfile/magic_defs/COPYING +29 -0
  206. polyfile/magic_defs/__init__.py +0 -0
  207. polyfile/magic_defs/acorn +102 -0
  208. polyfile/magic_defs/adi +13 -0
  209. polyfile/magic_defs/adventure +122 -0
  210. polyfile/magic_defs/aes +29 -0
  211. polyfile/magic_defs/algol68 +35 -0
  212. polyfile/magic_defs/allegro +9 -0
  213. polyfile/magic_defs/alliant +18 -0
  214. polyfile/magic_defs/alpha +32 -0
  215. polyfile/magic_defs/amanda +12 -0
  216. polyfile/magic_defs/amigaos +218 -0
  217. polyfile/magic_defs/android +259 -0
  218. polyfile/magic_defs/animation +1197 -0
  219. polyfile/magic_defs/aout +46 -0
  220. polyfile/magic_defs/apache +28 -0
  221. polyfile/magic_defs/apl +7 -0
  222. polyfile/magic_defs/apple +773 -0
  223. polyfile/magic_defs/application +7 -0
  224. polyfile/magic_defs/applix +13 -0
  225. polyfile/magic_defs/apt +52 -0
  226. polyfile/magic_defs/archive +2586 -0
  227. polyfile/magic_defs/aria +38 -0
  228. polyfile/magic_defs/arm +50 -0
  229. polyfile/magic_defs/asf +132 -0
  230. polyfile/magic_defs/assembler +18 -0
  231. polyfile/magic_defs/asterix +18 -0
  232. polyfile/magic_defs/att3b +41 -0
  233. polyfile/magic_defs/audio +1291 -0
  234. polyfile/magic_defs/avm +33 -0
  235. polyfile/magic_defs/basis +18 -0
  236. polyfile/magic_defs/beetle +7 -0
  237. polyfile/magic_defs/ber +65 -0
  238. polyfile/magic_defs/bflt +14 -0
  239. polyfile/magic_defs/bhl +10 -0
  240. polyfile/magic_defs/bioinformatics +178 -0
  241. polyfile/magic_defs/biosig +154 -0
  242. polyfile/magic_defs/blackberry +8 -0
  243. polyfile/magic_defs/blcr +25 -0
  244. polyfile/magic_defs/blender +50 -0
  245. polyfile/magic_defs/blit +24 -0
  246. polyfile/magic_defs/bm +10 -0
  247. polyfile/magic_defs/bout +11 -0
  248. polyfile/magic_defs/bsdi +33 -0
  249. polyfile/magic_defs/bsi +10 -0
  250. polyfile/magic_defs/btsnoop +13 -0
  251. polyfile/magic_defs/burp +7 -0
  252. polyfile/magic_defs/bytecode +41 -0
  253. polyfile/magic_defs/c-lang +110 -0
  254. polyfile/magic_defs/c64 +531 -0
  255. polyfile/magic_defs/cad +437 -0
  256. polyfile/magic_defs/cafebabe +107 -0
  257. polyfile/magic_defs/cbor +21 -0
  258. polyfile/magic_defs/ccf +14 -0
  259. polyfile/magic_defs/cddb +12 -0
  260. polyfile/magic_defs/chord +15 -0
  261. polyfile/magic_defs/cisco +12 -0
  262. polyfile/magic_defs/citrus +12 -0
  263. polyfile/magic_defs/clarion +27 -0
  264. polyfile/magic_defs/claris +48 -0
  265. polyfile/magic_defs/clipper +65 -0
  266. polyfile/magic_defs/clojure +30 -0
  267. polyfile/magic_defs/coff +98 -0
  268. polyfile/magic_defs/commands +201 -0
  269. polyfile/magic_defs/communications +22 -0
  270. polyfile/magic_defs/compress +461 -0
  271. polyfile/magic_defs/console +1213 -0
  272. polyfile/magic_defs/convex +69 -0
  273. polyfile/magic_defs/coverage +91 -0
  274. polyfile/magic_defs/cracklib +14 -0
  275. polyfile/magic_defs/crypto +31 -0
  276. polyfile/magic_defs/csv +8 -0
  277. polyfile/magic_defs/ctags +6 -0
  278. polyfile/magic_defs/ctf +23 -0
  279. polyfile/magic_defs/cubemap +8 -0
  280. polyfile/magic_defs/cups +56 -0
  281. polyfile/magic_defs/dact +11 -0
  282. polyfile/magic_defs/database +886 -0
  283. polyfile/magic_defs/dataone +47 -0
  284. polyfile/magic_defs/dbpf +15 -0
  285. polyfile/magic_defs/der +146 -0
  286. polyfile/magic_defs/diamond +12 -0
  287. polyfile/magic_defs/dif +33 -0
  288. polyfile/magic_defs/diff +41 -0
  289. polyfile/magic_defs/digital +59 -0
  290. polyfile/magic_defs/dolby +69 -0
  291. polyfile/magic_defs/dsf +25 -0
  292. polyfile/magic_defs/dump +96 -0
  293. polyfile/magic_defs/dwarfs +45 -0
  294. polyfile/magic_defs/dyadic +61 -0
  295. polyfile/magic_defs/ebml +8 -0
  296. polyfile/magic_defs/edid +11 -0
  297. polyfile/magic_defs/editors +43 -0
  298. polyfile/magic_defs/efi +15 -0
  299. polyfile/magic_defs/elf +379 -0
  300. polyfile/magic_defs/encore +22 -0
  301. polyfile/magic_defs/epoc +62 -0
  302. polyfile/magic_defs/erlang +21 -0
  303. polyfile/magic_defs/espressif +57 -0
  304. polyfile/magic_defs/esri +28 -0
  305. polyfile/magic_defs/etf +33 -0
  306. polyfile/magic_defs/fcs +9 -0
  307. polyfile/magic_defs/filesystems +2694 -0
  308. polyfile/magic_defs/finger +16 -0
  309. polyfile/magic_defs/firmware +133 -0
  310. polyfile/magic_defs/flash +62 -0
  311. polyfile/magic_defs/flif +36 -0
  312. polyfile/magic_defs/fonts +449 -0
  313. polyfile/magic_defs/forth +82 -0
  314. polyfile/magic_defs/fortran +9 -0
  315. polyfile/magic_defs/frame +62 -0
  316. polyfile/magic_defs/freebsd +164 -0
  317. polyfile/magic_defs/fsav +128 -0
  318. polyfile/magic_defs/fusecompress +12 -0
  319. polyfile/magic_defs/games +696 -0
  320. polyfile/magic_defs/gcc +17 -0
  321. polyfile/magic_defs/gconv +10 -0
  322. polyfile/magic_defs/gentoo +85 -0
  323. polyfile/magic_defs/geo +166 -0
  324. polyfile/magic_defs/geos +20 -0
  325. polyfile/magic_defs/gimp +77 -0
  326. polyfile/magic_defs/git +13 -0
  327. polyfile/magic_defs/glibc +21 -0
  328. polyfile/magic_defs/gnome +59 -0
  329. polyfile/magic_defs/gnu +173 -0
  330. polyfile/magic_defs/gnumeric +8 -0
  331. polyfile/magic_defs/gpt +240 -0
  332. polyfile/magic_defs/gpu +28 -0
  333. polyfile/magic_defs/grace +21 -0
  334. polyfile/magic_defs/graphviz +12 -0
  335. polyfile/magic_defs/gringotts +48 -0
  336. polyfile/magic_defs/guile +13 -0
  337. polyfile/magic_defs/hardware +12 -0
  338. polyfile/magic_defs/hitachi-sh +30 -0
  339. polyfile/magic_defs/hp +433 -0
  340. polyfile/magic_defs/human68k +26 -0
  341. polyfile/magic_defs/ibm370 +52 -0
  342. polyfile/magic_defs/ibm6000 +35 -0
  343. polyfile/magic_defs/icc +214 -0
  344. polyfile/magic_defs/iff +80 -0
  345. polyfile/magic_defs/images +4210 -0
  346. polyfile/magic_defs/inform +9 -0
  347. polyfile/magic_defs/intel +310 -0
  348. polyfile/magic_defs/interleaf +9 -0
  349. polyfile/magic_defs/island +10 -0
  350. polyfile/magic_defs/ispell +63 -0
  351. polyfile/magic_defs/isz +15 -0
  352. polyfile/magic_defs/java +52 -0
  353. polyfile/magic_defs/javascript +171 -0
  354. polyfile/magic_defs/jpeg +252 -0
  355. polyfile/magic_defs/json +8 -0
  356. polyfile/magic_defs/karma +9 -0
  357. polyfile/magic_defs/kde +11 -0
  358. polyfile/magic_defs/keepass +20 -0
  359. polyfile/magic_defs/kerberos +45 -0
  360. polyfile/magic_defs/kicad +85 -0
  361. polyfile/magic_defs/kml +34 -0
  362. polyfile/magic_defs/lammps +64 -0
  363. polyfile/magic_defs/lecter +6 -0
  364. polyfile/magic_defs/lex +12 -0
  365. polyfile/magic_defs/lif +50 -0
  366. polyfile/magic_defs/linux +557 -0
  367. polyfile/magic_defs/lisp +78 -0
  368. polyfile/magic_defs/llvm +22 -0
  369. polyfile/magic_defs/locoscript +12 -0
  370. polyfile/magic_defs/lua +31 -0
  371. polyfile/magic_defs/luks +126 -0
  372. polyfile/magic_defs/m4 +11 -0
  373. polyfile/magic_defs/mach +303 -0
  374. polyfile/magic_defs/macintosh +505 -0
  375. polyfile/magic_defs/macos +7 -0
  376. polyfile/magic_defs/magic +10 -0
  377. polyfile/magic_defs/magic.mgc +0 -0
  378. polyfile/magic_defs/mail.news +132 -0
  379. polyfile/magic_defs/make +21 -0
  380. polyfile/magic_defs/map +413 -0
  381. polyfile/magic_defs/maple +109 -0
  382. polyfile/magic_defs/marc21 +30 -0
  383. polyfile/magic_defs/mathcad +8 -0
  384. polyfile/magic_defs/mathematica +188 -0
  385. polyfile/magic_defs/matroska +17 -0
  386. polyfile/magic_defs/mcrypt +52 -0
  387. polyfile/magic_defs/measure +44 -0
  388. polyfile/magic_defs/mercurial +13 -0
  389. polyfile/magic_defs/metastore +8 -0
  390. polyfile/magic_defs/meteorological +53 -0
  391. polyfile/magic_defs/microfocus +21 -0
  392. polyfile/magic_defs/mime +9 -0
  393. polyfile/magic_defs/mips +120 -0
  394. polyfile/magic_defs/mirage +8 -0
  395. polyfile/magic_defs/misctools +140 -0
  396. polyfile/magic_defs/mkid +11 -0
  397. polyfile/magic_defs/mlssa +8 -0
  398. polyfile/magic_defs/mmdf +6 -0
  399. polyfile/magic_defs/modem +92 -0
  400. polyfile/magic_defs/modulefile +9 -0
  401. polyfile/magic_defs/motorola +71 -0
  402. polyfile/magic_defs/mozilla +37 -0
  403. polyfile/magic_defs/msdos +2304 -0
  404. polyfile/magic_defs/msooxml +68 -0
  405. polyfile/magic_defs/msvc +222 -0
  406. polyfile/magic_defs/msx +309 -0
  407. polyfile/magic_defs/mup +24 -0
  408. polyfile/magic_defs/music +17 -0
  409. polyfile/magic_defs/nasa +7 -0
  410. polyfile/magic_defs/natinst +24 -0
  411. polyfile/magic_defs/ncr +49 -0
  412. polyfile/magic_defs/neko +12 -0
  413. polyfile/magic_defs/netbsd +251 -0
  414. polyfile/magic_defs/netscape +26 -0
  415. polyfile/magic_defs/netware +11 -0
  416. polyfile/magic_defs/news +13 -0
  417. polyfile/magic_defs/nifty +202 -0
  418. polyfile/magic_defs/nim-lang +29 -0
  419. polyfile/magic_defs/nitpicker +14 -0
  420. polyfile/magic_defs/numpy +9 -0
  421. polyfile/magic_defs/oasis +12 -0
  422. polyfile/magic_defs/ocaml +14 -0
  423. polyfile/magic_defs/octave +6 -0
  424. polyfile/magic_defs/ole2compounddocs +760 -0
  425. polyfile/magic_defs/olf +98 -0
  426. polyfile/magic_defs/openfst +17 -0
  427. polyfile/magic_defs/opentimestamps +16 -0
  428. polyfile/magic_defs/oric +16 -0
  429. polyfile/magic_defs/os2 +186 -0
  430. polyfile/magic_defs/os400 +39 -0
  431. polyfile/magic_defs/os9 +80 -0
  432. polyfile/magic_defs/osf1 +10 -0
  433. polyfile/magic_defs/palm +156 -0
  434. polyfile/magic_defs/parix +13 -0
  435. polyfile/magic_defs/parrot +22 -0
  436. polyfile/magic_defs/pascal +39 -0
  437. polyfile/magic_defs/pbf +11 -0
  438. polyfile/magic_defs/pbm +8 -0
  439. polyfile/magic_defs/pc88 +24 -0
  440. polyfile/magic_defs/pc98 +77 -0
  441. polyfile/magic_defs/pci_ids +116 -0
  442. polyfile/magic_defs/pcjr +8 -0
  443. polyfile/magic_defs/pdf +51 -0
  444. polyfile/magic_defs/pdp +42 -0
  445. polyfile/magic_defs/perl +100 -0
  446. polyfile/magic_defs/pgf +52 -0
  447. polyfile/magic_defs/pgp +581 -0
  448. polyfile/magic_defs/pgp-binary-keys +388 -0
  449. polyfile/magic_defs/pkgadd +7 -0
  450. polyfile/magic_defs/plan9 +25 -0
  451. polyfile/magic_defs/playdate +57 -0
  452. polyfile/magic_defs/plus5 +18 -0
  453. polyfile/magic_defs/pmem +46 -0
  454. polyfile/magic_defs/polyfile_zip +5 -0
  455. polyfile/magic_defs/polyml +23 -0
  456. polyfile/magic_defs/printer +269 -0
  457. polyfile/magic_defs/project +10 -0
  458. polyfile/magic_defs/psdbms +14 -0
  459. polyfile/magic_defs/psl +14 -0
  460. polyfile/magic_defs/pulsar +13 -0
  461. polyfile/magic_defs/puzzle +17 -0
  462. polyfile/magic_defs/pwsafe +14 -0
  463. polyfile/magic_defs/pyramid +12 -0
  464. polyfile/magic_defs/python +305 -0
  465. polyfile/magic_defs/qt +30 -0
  466. polyfile/magic_defs/revision +66 -0
  467. polyfile/magic_defs/riff +840 -0
  468. polyfile/magic_defs/rinex +44 -0
  469. polyfile/magic_defs/ringdove +45 -0
  470. polyfile/magic_defs/rpi +52 -0
  471. polyfile/magic_defs/rpm +45 -0
  472. polyfile/magic_defs/rpmsg +7 -0
  473. polyfile/magic_defs/rst +11 -0
  474. polyfile/magic_defs/rtf +94 -0
  475. polyfile/magic_defs/ruby +55 -0
  476. polyfile/magic_defs/rust +21 -0
  477. polyfile/magic_defs/sc +7 -0
  478. polyfile/magic_defs/sccs +24 -0
  479. polyfile/magic_defs/scientific +144 -0
  480. polyfile/magic_defs/securitycerts +6 -0
  481. polyfile/magic_defs/selinux +24 -0
  482. polyfile/magic_defs/sendmail +37 -0
  483. polyfile/magic_defs/sequent +42 -0
  484. polyfile/magic_defs/sereal +35 -0
  485. polyfile/magic_defs/sgi +144 -0
  486. polyfile/magic_defs/sgml +161 -0
  487. polyfile/magic_defs/sharc +23 -0
  488. polyfile/magic_defs/sinclair +40 -0
  489. polyfile/magic_defs/sisu +18 -0
  490. polyfile/magic_defs/sketch +6 -0
  491. polyfile/magic_defs/smalltalk +25 -0
  492. polyfile/magic_defs/smile +34 -0
  493. polyfile/magic_defs/sniffer +482 -0
  494. polyfile/magic_defs/softquad +40 -0
  495. polyfile/magic_defs/sosi +40 -0
  496. polyfile/magic_defs/spec +21 -0
  497. polyfile/magic_defs/spectrum +184 -0
  498. polyfile/magic_defs/sql +288 -0
  499. polyfile/magic_defs/ssh +39 -0
  500. polyfile/magic_defs/ssl +20 -0
  501. polyfile/magic_defs/statistics +45 -0
  502. polyfile/magic_defs/subtitle +38 -0
  503. polyfile/magic_defs/sun +141 -0
  504. polyfile/magic_defs/svf +5 -0
  505. polyfile/magic_defs/sylk +36 -0
  506. polyfile/magic_defs/symbos +42 -0
  507. polyfile/magic_defs/sysex +429 -0
  508. polyfile/magic_defs/tcl +29 -0
  509. polyfile/magic_defs/teapot +6 -0
  510. polyfile/magic_defs/terminfo +63 -0
  511. polyfile/magic_defs/tex +141 -0
  512. polyfile/magic_defs/tgif +7 -0
  513. polyfile/magic_defs/ti-8x +239 -0
  514. polyfile/magic_defs/timezone +42 -0
  515. polyfile/magic_defs/tplink +95 -0
  516. polyfile/magic_defs/troff +38 -0
  517. polyfile/magic_defs/tuxedo +8 -0
  518. polyfile/magic_defs/typeset +8 -0
  519. polyfile/magic_defs/uf2 +72 -0
  520. polyfile/magic_defs/unicode +15 -0
  521. polyfile/magic_defs/unisig +12 -0
  522. polyfile/magic_defs/unknown +34 -0
  523. polyfile/magic_defs/usd +21 -0
  524. polyfile/magic_defs/uterus +16 -0
  525. polyfile/magic_defs/uuencode +28 -0
  526. polyfile/magic_defs/vacuum-cleaner +54 -0
  527. polyfile/magic_defs/varied.out +46 -0
  528. polyfile/magic_defs/varied.script +21 -0
  529. polyfile/magic_defs/vax +32 -0
  530. polyfile/magic_defs/vicar +17 -0
  531. polyfile/magic_defs/virtual +307 -0
  532. polyfile/magic_defs/virtutech +12 -0
  533. polyfile/magic_defs/visx +32 -0
  534. polyfile/magic_defs/vms +30 -0
  535. polyfile/magic_defs/vmware +6 -0
  536. polyfile/magic_defs/vorbis +155 -0
  537. polyfile/magic_defs/vxl +14 -0
  538. polyfile/magic_defs/warc +16 -0
  539. polyfile/magic_defs/weak +16 -0
  540. polyfile/magic_defs/web +18 -0
  541. polyfile/magic_defs/webassembly +17 -0
  542. polyfile/magic_defs/windows +1811 -0
  543. polyfile/magic_defs/wireless +7 -0
  544. polyfile/magic_defs/wordprocessors +630 -0
  545. polyfile/magic_defs/wsdl +23 -0
  546. polyfile/magic_defs/x68000 +25 -0
  547. polyfile/magic_defs/xdelta +13 -0
  548. polyfile/magic_defs/xenix +106 -0
  549. polyfile/magic_defs/xilinx +58 -0
  550. polyfile/magic_defs/xo65 +37 -0
  551. polyfile/magic_defs/xwindows +43 -0
  552. polyfile/magic_defs/yara +17 -0
  553. polyfile/magic_defs/zfs +96 -0
  554. polyfile/magic_defs/zilog +12 -0
  555. polyfile/magic_defs/zip +126 -0
  556. polyfile/magic_defs/zyxel +17 -0
  557. polyfile/nes.py +144 -0
  558. polyfile/nitf.py +15 -0
  559. polyfile/pdf.py +1264 -0
  560. polyfile/pickles.py +45 -0
  561. polyfile/polyfile.py +409 -0
  562. polyfile/profiling.py +115 -0
  563. polyfile/repl.py +624 -0
  564. polyfile/search.py +310 -0
  565. polyfile/serialization.py +323 -0
  566. polyfile/structmatcher.py +46 -0
  567. polyfile/structs.py +281 -0
  568. polyfile/templates/download.js +162 -0
  569. polyfile/templates/hexdump.css +268 -0
  570. polyfile/templates/hexdump.js +756 -0
  571. polyfile/templates/jquery-3.4.1.min.js +2 -0
  572. polyfile/templates/template.html +119 -0
  573. polyfile/wildcards.py +62 -0
  574. polyfile/zipmatcher.py +183 -0
  575. polyfile_weave-0.5.5.dist-info/METADATA +173 -0
  576. polyfile_weave-0.5.5.dist-info/RECORD +585 -0
  577. polyfile_weave-0.5.5.dist-info/WHEEL +5 -0
  578. polyfile_weave-0.5.5.dist-info/entry_points.txt +2 -0
  579. polyfile_weave-0.5.5.dist-info/licenses/LICENSE +202 -0
  580. polyfile_weave-0.5.5.dist-info/top_level.txt +2 -0
  581. polymerge/__init__.py +1 -0
  582. polymerge/__main__.py +296 -0
  583. polymerge/cfg.py +127 -0
  584. polymerge/polymerge.py +227 -0
  585. polymerge/polytracker.py +190 -0
polyfile/pdf.py ADDED
@@ -0,0 +1,1264 @@
1
+ import sys
2
+ from typing import Callable, Dict, Iterator, List, Optional, Type, TypeVar, Union
3
+ import zlib
4
+
5
+ from pdfminer.ascii85 import ascii85decode, asciihexdecode
6
+ from pdfminer.ccitt import ccittfaxdecode
7
+ from pdfminer.lzw import lzwdecode
8
+ from pdfminer.pdfparser import PDFSyntaxError
9
+ from pdfminer.pdftypes import PDFNotImplementedError
10
+ from pdfminer.runlength import rldecode
11
+ from pdfminer.pdfparser import PDFParser as PDFMinerParser, PDFStream, PDFObjRef
12
+ from pdfminer.psparser import ExtraT, PSBaseParserToken, PSKeyword, PSObject, PSLiteral, PSStackEntry, PSSyntaxError
13
+ from pdfminer.pdfdocument import (
14
+ PDFDocument, PDFXRef, KWD, PDFNoValidXRef, PSEOF, dict_value, LITERAL_XREF, LITERAL_OBJSTM, LITERAL_CATALOG,
15
+ DecipherCallable, PDFObjectNotFound
16
+ )
17
+ from pdfminer.pdftypes import (
18
+ LIT, LITERALS_FLATE_DECODE, LITERALS_ASCIIHEX_DECODE, LITERALS_CCITTFAX_DECODE, LITERALS_RUNLENGTH_DECODE,
19
+ LITERAL_CRYPT, LITERALS_LZW_DECODE, LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE, LITERALS_ASCII85_DECODE,
20
+ int_value, apply_png_predictor
21
+ )
22
+
23
+ from .fileutils import FileStream
24
+ from .fileutils import Tempfile
25
+ from .logger import getStatusLogger
26
+ from .magic import AbsoluteOffset, FailedTest, MagicMatcher, MagicTest, MatchedTest, TestResult, TestType
27
+ from .polyfile import Match, Matcher, Submatch, register_parser
28
+
29
+ log = getStatusLogger("PDF")
30
+
31
+
32
def load_trailer(self, parser: "PDFParser") -> None:
    """Read the ``trailer`` keyword and its dictionary into ``self.trailer``.

    This function is assigned to ``PDFXRef.load_trailer`` further down in the
    module, so ``self`` is the cross-reference table being populated.

    :param parser: parser positioned at the ``trailer`` keyword.
    :raises PDFNoValidXRef: if the input ends and no parsed object is left on
        the parser's stack to use as the trailer dictionary.
    :raises AssertionError: if the next token is not the ``trailer`` keyword.
    """
    try:
        _, keyword = parser.nexttoken()
        assert keyword == KWD(b'trailer'), f"{keyword!s} != {KWD(b'trailer')!s}"
        saved_auto_flush = parser.auto_flush
        try:
            # Possibly a pdfminer bug, possibly a misuse on our side: the whole
            # token stack has to be flushed to the results list before the
            # trailer dict can be parsed, so auto-flush is forced on here.
            parser.auto_flush = True
            _, trailer_dict = parser.nextobject()
        finally:
            # Always restore the caller's setting, even when parsing fails.
            parser.auto_flush = saved_auto_flush
    except PSEOF:
        # Hit end of input: fall back to whatever object is already stacked.
        leftover = parser.pop(1)
        if not leftover:
            raise PDFNoValidXRef('Unexpected EOF - file corrupted')
        _, trailer_dict = leftover[0]
    self.trailer.update(dict_value(trailer_dict))
    log.debug('trailer=%r', self.trailer)
52
+
53
+
54
def load_xref(self: PDFXRef, parser: "PDFParser"):
    """Populate ``self.offsets`` from a textual xref table, then load the trailer.

    This function is assigned to ``PDFXRef.load`` further down in the module.
    Lines are read from ``parser`` until one starting with ``trailer`` is
    found. Each subsection begins with a ``<start> <count>`` header followed
    by ``<count>`` entries of three space-separated fields.

    :param parser: parser positioned at the first line of the xref table.
    :raises PDFNoValidXRef: on premature end of input or a malformed
        subsection header or entry.
    """
    while True:
        try:
            (line_pos, text) = parser.nextline()
            text = text.strip()
        except PSEOF:
            raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
        if not text:
            # Blank lines between subsections are ignored.
            continue
        if text.startswith(b"trailer"):
            # Rewind so that load_trailer() sees the keyword itself.
            parser.seek(line_pos)
            break
        header = text.split(b" ")
        if len(header) != 2:
            raise PDFNoValidXRef(f"Trailer not found: {parser!r}: line={text!r}")
        try:
            (start, nobjs) = [int(field) for field in header]
        except ValueError:
            raise PDFNoValidXRef(f"Invalid line: {parser!r}: line={text!r}")
        for objid in range(start, start + nobjs):
            try:
                (_, entry) = parser.nextline()
                entry = entry.strip()
            except PSEOF:
                raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
            fields = entry.split(b" ")
            if len(fields) != 3:
                raise PDFNoValidXRef(f"Invalid XRef format: {parser!r}, line={entry!r}")
            (pos_b, genno_b, use_b) = fields
            if use_b != b"n":
                # Only entries flagged b"n" are recorded.
                continue
            # __int__() is called directly rather than via int(); presumably so
            # that a token's own __int__ result (which keeps its pdf_offset /
            # pdf_bytes tags) is stored as-is -- confirm against the tokenizer.
            self.offsets[objid] = (None, pos_b.__int__(), genno_b.__int__())
    log.debug("xref objects: %r", self.offsets)
    self.load_trailer(parser)
93
+
94
+
95
+ PDFXRef.load_trailer = load_trailer
96
+ PDFXRef.load = load_xref
97
+
98
+
99
class PSToken:
    """Mixin that tags a parsed value with ``pdf_offset`` and ``pdf_bytes``.

    It is meant to be combined with a built-in value type (``PSInt`` in this
    module mixes it with ``int``). The conversion hooks return token types
    rather than plain built-ins so the two tags survive a conversion.
    """

    # presumably the byte offset of the token within the PDF -- confirm against
    # the code that constructs tokens
    pdf_offset: int
    # presumably the number of bytes the token occupies in the PDF -- confirm
    pdf_bytes: int

    def __new__(cls, *args, **kwargs):
        """Create the underlying value from ``args`` and attach the tags.

        :raises KeyError: if ``pdf_offset`` or ``pdf_bytes`` is not passed as
            a keyword argument.
        """
        ret = super().__new__(cls, *args)
        ret.pdf_offset = kwargs["pdf_offset"]
        ret.pdf_bytes = kwargs["pdf_bytes"]
        return ret

    def __int__(self):
        """Return this token as a ``PSInt`` carrying the same tags."""
        if isinstance(self, PSInt):
            return self
        # Textual tokens are parsed as base-10 digits.
        return PSInt(int(self, base=10), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)

    def __float__(self):
        """Return this token as a ``PSFloat`` carrying the same tags.

        :raises NotImplementedError: if the token is not a float, int, bytes
            or ``PSStr`` value.
        """
        if isinstance(self, float):
            return self
        elif isinstance(self, int):
            # int(self, base=10) is rejected with TypeError for non-string
            # input, so convert through int's own slot instead. This also
            # avoids re-entering the overridden __int__ hook.
            return PSFloat(int.__float__(self), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)
        elif isinstance(self, bytes):
            return PSFloat(self.decode(), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)
        elif isinstance(self, PSStr):
            return PSFloat(str(self), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)
        else:
            raise NotImplementedError()

    def __bytes__(self):
        """Return this token as a ``PSBytes`` carrying the same tags."""
        if isinstance(self, PSBytes):
            return self
        else:
            return PSBytes(self, pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)

    def __hex__(self):
        # NOTE(review): __hex__ is a Python 2 protocol method. Python 3's hex()
        # goes through __index__ and never calls this, and the built-in types
        # do not define __hex__, so an explicit call would fail in super().
        return PSStr(super().__hex__(), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)

    def __str__(self):
        # Deliberately unsupported on the base mixin; concrete token types
        # (e.g. PSInt) supply their own __str__.
        raise NotImplementedError()

    def __repr__(self):
        """Show the wrapped value's own repr together with both tags."""
        return f"{self.__class__.__name__}({super().__repr__()}, pdf_offset={self.pdf_offset!r}, "\
               f"pdf_bytes={self.pdf_bytes!r})"
142
+
143
+
144
class PSInt(PSToken, int):
    """An ``int`` that also carries the ``PSToken`` tags."""

    def __str__(self):
        # PSToken.__str__ raises, so render through a plain integer conversion.
        as_int = int(self)
        return str(as_int)

    def __index__(self):
        # The token itself is handed back when an index is requested.
        return self
150
+
151
+
152
+ C = TypeVar("C")
153
+
154
+
155
class PSSequence(PSToken):
    """Location-tracking behaviour shared by the sequence-like tokens.

    The methods here re-implement splitting, concatenation, stripping and
    indexing so that the results keep a ``pdf_offset``/``pdf_bytes`` pair.
    """

    def split(self: Type[C], sep: Optional[C] = None, maxsplit: int = -1) -> List[C]:
        """Split the sequence one element at a time, keeping locations.

        With ``sep`` of ``None`` the sequence is stripped first and runs of
        elements whose ``strip()`` is empty act as separators; otherwise
        ``sep`` is matched against the tail of the piece being accumulated.
        """
        remainder = self
        current: Optional[C] = None
        result: List[C] = []
        if sep is None:
            remainder = remainder.strip()
        while remainder and (maxsplit < 0 or len(result) <= maxsplit):
            c = remainder[0:1]
            remainder = remainder[1:]
            if sep is None:
                if not c.strip():
                    if current is not None:
                        result.append(current)
                        current = None
                else:
                    if current is None:
                        current = c
                    else:
                        current += c
            else:
                if current is None:
                    current = c
                else:
                    current += c
                if current[-len(sep):] == sep:
                    result.append(current[:-len(sep)])
                    current = None
        if current is not None:
            if not result or maxsplit < 0 or len(result) <= maxsplit:
                result.append(current)
            else:
                result[-1] += current
        return result

    def __add__(self: Type[C], other) -> C:
        """Concatenate, extending the byte span when ``other`` is contiguous."""
        if isinstance(other, self.__class__) and other.pdf_offset == self.pdf_offset + self.pdf_bytes:
            # `other` starts exactly where this token ends, so the result
            # covers both spans. pdf_bytes must be passed explicitly because
            # PSToken.__new__ requires it.
            return self.__class__(
                super().__add__(other),
                pdf_offset=self.pdf_offset,
                pdf_bytes=self.pdf_bytes + other.pdf_bytes
            )
        return self.__class__(super().__add__(other), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)

    def __radd__(self: Type[C], other) -> C:
        """Prepend ``other``, treating it as sitting immediately before this token."""
        return self.__class__(other, pdf_offset=self.pdf_offset - len(other), pdf_bytes=len(other)) + self

    def lstrip(self: Type[C], chars: bytes = b" \t\n\r") -> C:
        """Drop leading elements found in ``chars``."""
        ret = self
        while ret and ret[0] in chars:
            ret = ret[1:]
        return ret

    def rstrip(self: Type[C], chars: bytes = b" \t\n\r") -> C:
        """Drop trailing elements found in ``chars``."""
        ret = self
        while ret and ret[-1] in chars:
            ret = ret[:-1]
        return ret

    def strip(self: Type[C], chars: bytes = b" \t\n\r") -> C:
        """Drop leading and trailing elements found in ``chars``."""
        return self.lstrip(chars).rstrip(chars)

    def __getitem__(self, item):
        """Index or slice the sequence, adjusting the location of the result.

        Slice steps are not taken into account when computing the location.
        """
        if isinstance(item, int):
            value = super().__getitem__(item)
            # Normalise negative indices so the offset moves forwards.
            index = item if item >= 0 else len(self) + item
            return make_ps_object(value, pdf_offset=self.pdf_offset+index, pdf_bytes=self.pdf_bytes-index)
        elif isinstance(item, slice):
            # slice.indices resolves None and negative bounds.
            start, stop, _ = item.indices(len(self))
            if item.stop is None and (item.step is None or item.step > 0):
                # Open-ended slice: everything from `start` to the token's end.
                span = self.pdf_bytes - start
            else:
                span = stop - start
            span = max(span, 0)
            try:
                return self.__class__(
                    super().__getitem__(item),
                    pdf_offset=self.pdf_offset+start,
                    pdf_bytes=span
                )
            except ValueError:
                # PSBytes subclasses that need extra constructor arguments
                # raise ValueError; fall back to a plain PSBytes for them.
                if isinstance(self, PSBytes):
                    return PSBytes(
                        super().__getitem__(item),
                        pdf_offset=self.pdf_offset+start,
                        pdf_bytes=span
                    )
                else:
                    raise
        else:
            return super().__getitem__(item)
243
+
244
+
245
class PSStr(PSSequence, str):
    """A text token annotated with its location in the PDF."""

    def encode(self, encoding: str = "utf-8", errors: str = "strict") -> bytes:
        """Encode to PSBytes that keep this token's location.

        The defaults match ``str.encode``. The previous ``...`` placeholders
        were passed to ``str.encode`` and raised ``TypeError`` whenever
        ``encode()`` was called without arguments.
        """
        return PSBytes(super().encode(encoding, errors), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)

    def __str__(self):
        """Return the plain text, bypassing PSToken.__str__."""
        return str.__str__(self)
251
+
252
+
253
class PSBytes(PSSequence, bytes):
    """A bytes token annotated with its location in the PDF."""

    def __new__(cls, *args, **kwargs):
        """Build the token; ``pdf_bytes`` defaults to the length of the data."""
        kwargs = dict(kwargs)
        if "pdf_bytes" not in kwargs:
            kwargs["pdf_bytes"] = len(args[0])
        return super().__new__(cls, *args, **kwargs)

    def __getitem__(self, item):
        """Slice to a PSBytes or index to a PSInt, adjusting the offset."""
        if isinstance(item, slice):
            # slice.indices resolves None and negative starts so the offset
            # always moves forwards from this token's own offset.
            start = item.indices(len(self))[0]
            return PSBytes(super().__getitem__(item), pdf_offset=self.pdf_offset + start)
        else:
            ret = super().__getitem__(item)
            if isinstance(ret, PSInt):
                return ret
            else:
                index = item if item >= 0 else len(self) + item
                # One element of a bytes object covers one byte. pdf_bytes
                # is mandatory for PSToken.__new__.
                return PSInt(ret, pdf_offset=self.pdf_offset + index, pdf_bytes=1)

    def decode(self, encoding: str = "utf-8", errors: str = "strict") -> PSStr:
        """Decode to a PSStr that keeps this token's location."""
        return PSStr(super().decode(encoding, errors), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)

    def __str__(self):
        """Return the bytes literal text, bypassing PSToken.__str__."""
        return bytes.__str__(self)
280
+
281
+
282
class PDFDeciphered(PSBytes):
    """Deciphered bytes that also keep the bytes they were produced from."""
    original_bytes: bytes

    def __new__(cls, *args, **kwargs):
        """Create the token; the ``original_bytes`` keyword is mandatory.

        ``pdf_bytes`` defaults to the length of the first positional argument.

        Raises:
            ValueError: if ``original_bytes`` is not supplied.
        """
        options = dict(kwargs)
        if "pdf_bytes" not in options:
            options["pdf_bytes"] = len(args[0])
        if "original_bytes" not in options:
            raise ValueError(f"{cls.__name__}.__init__ requires the `original_bytes` argument")
        source = options.pop("original_bytes")
        instance = super().__new__(cls, *args, **options)
        instance.original_bytes = source
        return instance
297
+
298
+
299
class PSFloat(PSToken, float):
    """A floating-point token annotated with its location in the PDF."""

    def __str__(self):
        """Return the plain text of the number."""
        # Since Python 3.8 float inherits __str__ from object, which calls
        # repr(self) and therefore PSToken.__repr__. float.__repr__ gives
        # just the number.
        return float.__repr__(self)
302
+
303
+
304
class PSBool:
    """A boolean value paired with its location in the PDF.

    Equality and hashing are based on the wrapped value only; the other
    operand of a comparison is reduced with ``bool()`` first.
    """

    def __init__(self, value: bool, pdf_offset: int, pdf_bytes: int):
        self.value: bool = value
        self.pdf_offset: int = pdf_offset
        self.pdf_bytes: int = pdf_bytes

    def __repr__(self):
        cls_name = self.__class__.__name__
        return f"{cls_name}(value={self.value!r}, pdf_offset={self.pdf_offset!r}, "\
               f"pdf_bytes={self.pdf_bytes!r})"

    def __str__(self):
        return PSStr(self.value, pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)

    def __bool__(self):
        return self.value

    def __int__(self):
        as_int = int(self.value)
        return PSInt(as_int, pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)

    def __hash__(self):
        return hash(self.value)

    def __eq__(self, other):
        truth = bool(other)
        return self.value == truth

    def __ne__(self, other):
        truth = bool(other)
        return self.value != truth
331
+
332
+
333
class PDFLiteral(PSLiteral):
    """A PSLiteral whose name carries its location in the PDF.

    The location is stored on the name (a PSStr or PSBytes). The literal's
    own ``pdf_offset``/``pdf_bytes`` are derived from it, widened by one to
    account for the leading "/".
    """

    def __init__(self, name: PSLiteral.NameType, pdf_offset: int, pdf_bytes: int):
        """Wrap ``name`` in a located type unless it already is one.

        ``pdf_offset`` is the offset of the literal; the name itself is
        recorded as starting one byte later.
        """
        if isinstance(name, str) and not isinstance(name, PSStr):
            super().__init__(PSStr(name, pdf_offset=pdf_offset + 1, pdf_bytes=pdf_bytes))
        elif isinstance(name, bytes) and not isinstance(name, PSBytes):
            super().__init__(PSBytes(name, pdf_offset=pdf_offset + 1, pdf_bytes=pdf_bytes))
        else:
            super().__init__(name)

    @property
    def pdf_bytes(self) -> int:
        return self.name.pdf_bytes + 1  # add one to account for the leading "/"

    @property
    def pdf_offset(self) -> int:
        return self.name.pdf_offset - 1

    def __eq__(self, other):
        """Literals are equal when their names are equal."""
        return isinstance(other, PSLiteral) and self.name == other.name

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None and makes instances
        # unhashable. Hash by name so equal PDFLiterals hash alike.
        # NOTE(review): a base-class literal that compares equal may still
        # hash differently if it keeps the default identity hash.
        return hash(self.name)
352
+
353
+
354
class PDFKeyword(PSKeyword):
    """A PSKeyword annotated with its location in the PDF."""

    def __init__(self, name: bytes, pdf_offset: int, pdf_bytes: int):
        """Store the keyword name and location.

        The ``pdf_bytes`` argument is ignored in favour of ``len(name)``.
        """
        pdf_bytes = len(name)  # sometimes we actually lose the length of the token, so rely on the keyword name
        if not isinstance(name, PSBytes):
            super().__init__(PSBytes(name, pdf_offset=pdf_offset, pdf_bytes=pdf_bytes))
        else:
            super().__init__(name)
        self.pdf_offset: int = pdf_offset
        self.pdf_bytes: int = pdf_bytes

    def __eq__(self, other):
        """Keywords are equal when their names are equal."""
        return isinstance(other, PSKeyword) and self.name == other.name

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None and makes instances
        # unhashable. Hash by name so equal PDFKeywords hash alike.
        return hash(self.name)

    def __repr__(self):
        return f"{self.__class__.__name__}({self.name!r}, pdf_offset={self.pdf_offset}, pdf_bytes={self.pdf_bytes})"

    def __str__(self):
        return f"/{self.name!s}"
372
+
373
+
374
# Union of the token types listed here; used in annotations further down.
PDFBaseParserToken = Union[PSFloat, PSBool, PDFLiteral, PSKeyword, PSBytes, PSInt]


"""
pdfminer.pdfdocument unfortunately tests for equality with these literals using `is` rather than `==`, so we must
return their singletons from a dict rather than our instrumented PDFLiteral objects:
"""
# Keyed by literal name; PDFDict.get() consults this mapping.
PROTECTED_LITERALS: Dict[str, PSLiteral] = {
    LITERAL_OBJSTM.name: LITERAL_OBJSTM,
    LITERAL_XREF.name: LITERAL_XREF,
    LITERAL_CATALOG.name: LITERAL_CATALOG
}


# Extra base class for PDFDict: a typing.Dict alias where that is usable,
# plain `object` otherwise.
if sys.version_info < (3, 7):
    # Before Python 3.7, we'll get an MRO error if we extend from both dict and Dict
    PDFDict_Type = object
else:
    PDFDict_Type = Dict[PSStr, Union[PDFBaseParserToken, PSStr, "PDFDict", "PDFList"]]
393
+
394
+
395
class PDFDict(dict, PDFDict_Type):
    """A dict that records where the dictionary sits in the PDF.

    ``pdf_offset`` and ``pdf_bytes`` are required keyword-only constructor
    arguments. They are stored as attributes and never become dict entries.
    """
    pdf_offset: int
    pdf_bytes: int

    def __new__(cls, *args, pdf_offset: int, pdf_bytes: int, **kwargs):
        instance = super().__new__(cls, *args, **kwargs)
        instance.pdf_offset = pdf_offset
        instance.pdf_bytes = pdf_bytes
        return instance

    def __init__(self, *args, **kwargs):
        # Keep the location keywords out of the mapping contents.
        entries = {
            key: value for key, value in kwargs.items()
            if key not in ("pdf_offset", "pdf_bytes")
        }
        super().__init__(*args, **entries)

    def get(self, key, default = None):
        """Like dict.get, but swap protected literals for their singletons."""
        found = super().get(key, default)
        if not isinstance(found, PDFLiteral):
            return found
        if found.name not in PROTECTED_LITERALS:
            return found
        # we must return the protected literals as their singleton version:
        return PROTECTED_LITERALS[found.name]

    def __str__(self):
        return dict.__str__(self)
422
+
423
+
424
+
425
class PDFList(PSSequence, list):
    """A list of parsed objects that records its own span in the PDF."""

    @staticmethod
    def load(iterable) -> "PDFList":
        """Build a PDFList whose span encloses all located items.

        Items lacking ``pdf_offset``/``pdf_bytes`` are kept in the list but
        do not contribute to the bounds.

        Raises:
            ValueError: if no item carries location information.
        """
        items = list(iterable)
        located = [
            entry for entry in items
            if hasattr(entry, "pdf_offset") and hasattr(entry, "pdf_bytes")
        ]
        if not located:
            raise ValueError(f"Cannot determine PDF bounds for list {items!r}")
        first_byte = min(entry.pdf_offset for entry in located)
        end_byte = max(entry.pdf_offset + entry.pdf_bytes for entry in located)
        return PDFList(items, pdf_offset=first_byte, pdf_bytes=end_byte - first_byte)

    def __str__(self):
        return list.__str__(self)
444
+
445
+
446
+
447
def make_ps_object(value, pdf_offset: int, pdf_bytes: int) -> Union[PDFBaseParserToken, PSStr, PDFDict]:
    """Return ``value`` in a form that carries the given PDF location.

    Literals are rebuilt as PDFLiteral, dicts become PDFDict, other PSObject
    instances are annotated in place, and plain builtin values are wrapped
    in the matching PS* class.

    Raises:
        NotImplementedError: if ``value`` has an unsupported type.
    """
    if isinstance(value, PSLiteral):
        return PDFLiteral(value.name, pdf_offset=pdf_offset, pdf_bytes=pdf_bytes)
    # Unfortunately, we can't convert PSKeywords to PDFKeywords here because pdfminer requires them to be singletons
    if isinstance(value, PDFDict):
        value.pdf_offset = pdf_offset
        value.pdf_bytes = pdf_bytes
        return value
    if isinstance(value, dict):
        return PDFDict(value, pdf_offset=pdf_offset, pdf_bytes=pdf_bytes)
    if isinstance(value, PSObject):
        value.pdf_offset = pdf_offset
        if isinstance(value, PSKeyword):
            # sometimes the byte count gets off, so set it to the name size
            pdf_bytes = len(value.name)
        value.pdf_bytes = pdf_bytes
        return value
    # Checked in order. Because bool is a subclass of int, booleans are
    # picked up by the int entry and the bool entry is never reached.
    wrappers = (
        (int, PSInt),
        (float, PSFloat),
        (bool, PSBool),
        (bytes, PSBytes),
        (str, PSStr),
    )
    for plain_type, wrapper in wrappers:
        if isinstance(value, plain_type):
            return wrapper(value, pdf_offset=pdf_offset, pdf_bytes=pdf_bytes)
    raise NotImplementedError(f"Add suppport for PSSequences containing elements of type {type(value)}")
479
+
480
+
481
class DecodingError(bytes):
    """An empty bytes value that stands in for data that failed to decode.

    The failure description is available as ``message``. It may be passed
    as the ``message`` keyword or as the first positional argument; the
    keyword wins when both are given. The bytes content is always empty.
    """
    message: Optional[str]

    def __new__(cls, *args, **kwargs):
        if "message" in kwargs:
            message = kwargs["message"]
        elif args:
            # Callers such as DecodingError(str(e)) pass the text
            # positionally; previously that text was silently discarded.
            message = args[0]
        else:
            message = None
        ret = super().__new__(cls, b'')
        ret.message = message
        return ret
494
+
495
+
496
class PDFStreamFilter(PSBytes):
    """Output of one stream filter, together with the filter's input.

    ``error`` holds the DecodingError when the first positional argument is
    one, and ``None`` otherwise.
    """
    name: str
    original_bytes: bytes
    error: Optional[DecodingError]

    def __new__(cls, *args, **kwargs):
        """Create the token; ``original_bytes`` and ``name`` are mandatory keywords.

        ``pdf_bytes`` defaults to the length of the first positional argument.

        Raises:
            ValueError: if ``original_bytes`` or ``name`` is missing.
        """
        options = dict(kwargs)
        if "pdf_bytes" not in options:
            options["pdf_bytes"] = len(args[0])
        if "original_bytes" not in options:
            raise ValueError(f"{cls.__name__}.__init__ requires the `original_bytes` argument")
        source = options.pop("original_bytes")
        if "name" not in options:
            raise ValueError(f"{cls.__name__}.__init__ requires the `name` argument")
        filter_name = options.pop("name")
        failure = args[0] if isinstance(args[0], DecodingError) else None
        instance = super().__new__(cls, *args, **options)
        instance.original_bytes = source
        instance.name = filter_name
        instance.error = failure
        return instance
524
+
525
+
526
class PNGPredictor(PSBytes):
    """Output of the PNG predictor, with its parameters and its input bytes."""
    params: PDFDict
    original_bytes: bytes

    def __new__(cls, *args, **kwargs):
        """Create the token; ``original_bytes`` and ``params`` are mandatory keywords.

        ``pdf_bytes`` defaults to the length of the first positional argument.

        Raises:
            ValueError: if ``original_bytes`` or ``params`` is missing.
        """
        options = dict(kwargs)
        if "pdf_bytes" not in options:
            options["pdf_bytes"] = len(args[0])
        if "original_bytes" not in options:
            raise ValueError(f"{cls.__name__}.__init__ requires the `original_bytes` argument")
        source = options.pop("original_bytes")
        if "params" not in options:
            raise ValueError(f"{cls.__name__}.__init__ requires the `params` argument")
        predictor_params = options.pop("params")
        instance = super().__new__(cls, *args, **options)
        instance.original_bytes = source
        instance.params = predictor_params
        return instance
548
+
549
+
550
class PDFObjectStream(PDFStream):
    """A PDFStream copy that tracks where its data sits in the PDF.

    The raw data is wrapped in PSBytes. Every decoding stage produces a
    PDFStreamFilter or PNGPredictor that keeps a reference to its input.
    """

    def __init__(self, parent: PDFStream, pdf_offset: int, pdf_bytes: int):
        """Copy attrs, raw data, decipher, data, objid and genno from ``parent``."""
        super().__init__(
            attrs=parent.attrs,
            rawdata=PSBytes(parent.rawdata, pdf_offset=pdf_offset, pdf_bytes=pdf_bytes),
            decipher=parent.decipher
        )
        self.parent: PDFStream = parent
        self.pdf_offset: int = pdf_offset
        self.pdf_bytes: int = pdf_bytes
        self.data = parent.data
        self.objid = parent.objid
        self.genno = parent.genno

    @property
    def data(self) -> Optional[PSBytes]:
        return self._data

    @data.setter
    def data(self, new_value: Optional[bytes]):
        # Plain bytes are tagged with this stream's location; PSBytes and
        # None are stored unchanged.
        if new_value is not None and not isinstance(new_value, PSBytes):
            self._data = PSBytes(new_value, pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)
        else:
            self._data = new_value

    @property
    def data_value(self) -> PSBytes:
        """Return the decoded data if present, otherwise the raw data.

        Raises:
            ValueError: if neither is available.
        """
        if self.data is not None:
            return self.data
        elif self.rawdata is not None:
            return self.rawdata
        else:
            raise ValueError(f"PDFObjectStream {self!r} does not have any data")

    def decode(self):
        """Decipher the raw data, apply each filter, and store the result in ``data``.

        A zlib failure does not abort decoding: the stage's output becomes
        an empty DecodingError carrying the zlib message.

        Raises:
            PDFNotImplementedError: for /Crypt, unknown filters, or
                unsupported predictors.
        """
        assert self.data is None \
            and self.rawdata is not None, str((self.data, self.rawdata))
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            assert self.objid is not None
            assert self.genno is not None
            data = self.decipher(self.objid, self.genno, data, self.attrs)
        filters = self.get_filters()
        if not filters:
            self.data = data
            self.rawdata = None
            return
        for (f, params) in filters:
            decoded: Optional[bytes] = None
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                try:
                    decoded = zlib.decompress(data)
                except zlib.error as e:
                    # Pass the text by keyword so it ends up in `.message`.
                    decoded = DecodingError(message=str(e))
            elif f in LITERALS_LZW_DECODE:
                decoded = lzwdecode(data)
            elif f in LITERALS_ASCII85_DECODE:
                decoded = ascii85decode(data)
            elif f in LITERALS_ASCIIHEX_DECODE:
                decoded = asciihexdecode(data)
            elif f in LITERALS_RUNLENGTH_DECODE:
                decoded = rldecode(data)
            elif f in LITERALS_CCITTFAX_DECODE:
                decoded = ccittfaxdecode(data, params)
            elif f in LITERALS_DCT_DECODE or f == LIT("JPXDecode"):
                # This is probably a JPG stream
                # it does not need to be decoded twice.
                # Just return the stream to the user.
                pass
            elif f in LITERALS_JBIG2_DECODE:
                pass
            elif f == LITERAL_CRYPT:
                # not yet..
                raise PDFNotImplementedError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % f)
            if decoded is not None:
                if isinstance(f, PDFLiteral):
                    name = f.name
                else:
                    name = f
                # Keep the stage's input reachable via original_bytes.
                data = PDFStreamFilter(
                    decoded,
                    pdf_offset=data.pdf_offset,
                    pdf_bytes=data.pdf_bytes,
                    original_bytes=data,
                    name=name
                )
            # apply predictors
            if params and 'Predictor' in params:
                pred = int_value(params['Predictor'])
                if pred == 1:
                    # no predictor
                    pass
                elif 10 <= pred:
                    # PNG predictor
                    colors = int_value(params.get('Colors', 1))
                    columns = int_value(params.get('Columns', 1))
                    raw_bits_per_component = params.get('BitsPerComponent', 8)
                    bitspercomponent = int_value(raw_bits_per_component)
                    predicted = apply_png_predictor(pred, colors, columns,
                                                    bitspercomponent, data)
                    data = PNGPredictor(
                        predicted,
                        pdf_offset=data.pdf_offset,
                        pdf_bytes=data.pdf_bytes,
                        original_bytes=data,
                        params=params
                    )
                else:
                    error_msg = 'Unsupported predictor: %r' % pred
                    raise PDFNotImplementedError(error_msg)
        self.data = data
        self.rawdata = None
        return
667
+
668
+
669
class PDFParser(PDFMinerParser):
    """Parser subclass that attaches PDF offsets and lengths to what it parses."""
    auto_flush: bool = False

    @staticmethod
    def string_escape(data: Union[bytes, int]) -> str:
        """Render a byte value, or an iterable of them, as printable text."""
        if not isinstance(data, int):
            return "".join(PDFParser.string_escape(d) for d in data)
        named_escapes = {
            ord('\n'): "\\n",
            ord('\t'): "\\t",
            ord('\r'): "\\r",
            0: "\\0",
            ord('\\'): "\\\\",
        }
        if data in named_escapes:
            return named_escapes[data]
        if 32 <= data <= 126:
            return chr(data)
        return f"\\x{data:02X}"

    def token_context(self, token: Union[PDFBaseParserToken, PSStr], padding_bytes: int = 10) -> str:
        """Return the escaped bytes around ``token`` with a caret line under it.

        The stream position of ``self.fp`` is restored before returning.
        """
        saved_pos = self.fp.tell()
        try:
            lead = min(token.pdf_offset, padding_bytes)
            self.fp.seek(token.pdf_offset - lead)
            before = PDFParser.string_escape(self.fp.read(lead)) if lead > 0 else ""
            body = PDFParser.string_escape(self.fp.read(token.pdf_bytes))
            after = PDFParser.string_escape(self.fp.read(padding_bytes))
            marker = f"{' ' * len(before)}{'^' * len(body)}{' ' * len(after)}"
            return f"{before}{body}{after}\n{marker}"
        finally:
            self.fp.seek(saved_pos)

    def push(self, *objs: PSStackEntry[ExtraT]):
        """Wrap or annotate two-element entries with locations, then push them."""
        transformed = []
        for entry in objs:
            if len(entry) != 2:
                transformed.append(entry)
                continue
            pos, value = entry[0], entry[1]
            if isinstance(value, dict):
                length = self._curtokenpos + 1 - pos
                assert length > 0
                transformed.append((pos, PDFDict(value, pdf_offset=pos, pdf_bytes=length + 2)))
            elif isinstance(value, list):
                length = self._curtokenpos + 1 - pos
                assert length > 0
                transformed.append((pos, PDFList(value, pdf_offset=pos, pdf_bytes=length)))
            elif isinstance(value, PDFStream):
                wrapped = PDFObjectStream(value, pdf_offset=pos, pdf_bytes=len(value.rawdata))
                transformed.append((pos, wrapped))
            elif isinstance(value, PSObject) and not isinstance(value, PDFLiteral):
                length = self._curtokenpos + 1 - pos
                if isinstance(value, PDFObjRef):
                    # Widen the span backwards to the object id if it
                    # starts before `pos`.
                    reported_pos = pos
                    pos = min(pos, value.objid.pdf_offset)
                    length += reported_pos - pos
                value.pdf_offset = pos
                value.pdf_bytes = length
                transformed.append((pos, value))
            else:
                transformed.append(entry)
        return super().push(*transformed)

    def _add_token(self, obj: PSBaseParserToken):
        """Convert ``obj`` with make_ps_object before handing it to the base class."""
        pos = obj.pdf_offset if hasattr(obj, "pdf_offset") else self._curtokenpos
        length = obj.pdf_bytes if hasattr(obj, "pdf_bytes") else len(self._curtoken)
        located = make_ps_object(obj, pdf_offset=pos, pdf_bytes=length)
        # log.info(f"\n{self.token_context(obj)}")
        return super()._add_token(located)

    def flush(self):
        """Emit everything on the stack when auto_flush is set; else defer to the base class."""
        if not self.auto_flush:
            super().flush()
            return
        self.add_results(*self.popall())

    def do_keyword(self, pos: int, token: PSKeyword):
        """Build a PDFObjRef for the R keyword; defer other keywords to the base class."""
        if token is not self.KEYWORD_R:
            super().do_keyword(pos, token)
            return
        # reference to indirect object
        try:
            ((_, objid), (_, genno)) = self.pop(2)
            reference = PDFObjRef(self.doc, objid, genno)
            self.push((pos, reference))
        except PSSyntaxError:
            pass
771
+
772
+ # def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
773
+ # pos, token = super().nexttoken()
774
+ # if isinstance(token, PSObject):
775
+ # setattr(token, "pdf_offset", pos)
776
+ # elif isinstance(token, int):
777
+ # token = PSInt(token, pdf_offset=pos)
778
+ # elif isinstance(token, bytes):
779
+ # token = PSBytes(token, pdf_offset=pos)
780
+ # elif isinstance(token, float):
781
+ # token = PSFloat(token, pdf_offset=pos)
782
+ # elif isinstance(token, bool):
783
+ # token - PSBool(token, pdf_offset=pos)
784
+ # else:
785
+ # raise NotImplementedError(f"Add support for tokens of type {type(token)}")
786
+ # return pos, token
787
+
788
+ # def do_keyword(self, pos: int, token: PSKeyword) -> None:
789
+
790
+
791
class RawPDFStream:
    """Proxy around a file stream whose ``read`` returns located PSBytes.

    Every attribute other than ``read`` is forwarded to the wrapped stream.
    """

    def __init__(self, file_stream):
        self._file_stream = file_stream

    def __getattr__(self, item):
        return getattr(self._file_stream, item)

    def read(self, *args, **kwargs):
        """Read from the wrapped stream, tagging bytes with the starting offset."""
        start = self._file_stream.tell()
        chunk = self._file_stream.read(*args, **kwargs)
        if not isinstance(chunk, bytes):
            return chunk
        return PSBytes(chunk, pdf_offset=start)
804
+
805
+
806
def parse_object(obj, matcher: Matcher, parent: Optional[Match] = None, pdf_header_offset: int = 0):
    """Yield Submatch objects describing ``obj`` and, recursively, its contents.

    Dispatches on the type of ``obj``: stream filters, lists, dictionaries,
    deciphered data, other PSBytes, and finally anything that has both
    ``pdf_offset`` and ``pdf_bytes``. Objects matching none of these yield
    nothing.

    Args:
        obj: the parsed object to describe.
        matcher: used to match decoded byte content via ``matcher.match``.
        parent: match under which the new submatches are created; its
            ``offset`` is read in every branch.
        pdf_header_offset: subtracted from parent offsets when computing
            relative offsets.

    Raises:
        ValueError: if a dictionary value has no location and is not a list.
    """
    if isinstance(obj, PDFStreamFilter):
        filter_obj = Submatch(
            f"{obj.name!s}",
            bytes(obj.original_bytes),
            relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
            length=obj.pdf_bytes,
            parent=parent
        )
        yield filter_obj
        if obj.error is None:
            stream = Submatch(
                "DecodedStream",
                bytes(obj),
                relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
                length=obj.pdf_bytes,
                parent=filter_obj,
                decoded=bytes(obj)
            )
        else:
            stream = Submatch(
                "DecodingError",
                obj.error.message,
                relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
                length=obj.pdf_bytes,
                parent=filter_obj
            )
        yield stream
        # Continue with the filter's input, which may itself be the output
        # of an earlier stage.
        yield from parse_object(obj.original_bytes, matcher=matcher, parent=stream,
                                pdf_header_offset=pdf_header_offset)
    elif isinstance(obj, PDFList):
        list_obj = Submatch(
            "PDFList",
            '',
            relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
            length=obj.pdf_bytes,
            parent=parent
        )
        yield list_obj
        for item in obj:
            yield from parse_object(item, matcher=matcher, parent=list_obj, pdf_header_offset=pdf_header_offset)
    elif isinstance(obj, PDFDict):
        dict_obj = Submatch(
            "PDFDictionary",
            '',
            relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
            length=obj.pdf_bytes - 1,
            parent=parent
        )
        yield dict_obj
        for key, value in obj.items():
            if not hasattr(value, "pdf_offset") or not hasattr(value, "pdf_bytes"):
                # A plain list has no location; derive one from its items.
                if isinstance(value, list):
                    value = PDFList.load(value)
                else:
                    raise ValueError(f"Unexpected PDF dictionary value {value!r}")
            pair = Submatch(
                "KeyValuePair",
                '',
                relative_offset=key.pdf_offset - (dict_obj.offset - pdf_header_offset) - 1,
                length=value.pdf_offset + value.pdf_bytes - key.pdf_offset,
                parent=dict_obj
            )
            yield pair
            yield Submatch(
                "Key",
                key,
                relative_offset=0,
                length=key.pdf_bytes + 1,
                parent=pair
            )
            value_match = Submatch(
                "Value",
                value,
                relative_offset=value.pdf_offset - key.pdf_offset,
                length=value.pdf_bytes,
                parent=pair
            )
            yield value_match
            yield from parse_object(value, matcher=matcher, parent=value_match, pdf_header_offset=pdf_header_offset)
    elif isinstance(obj, PDFDeciphered):
        deciphered = Submatch(
            "PDFDeciphered",
            obj.original_bytes,
            decoded=obj,
            relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
            length=obj.pdf_bytes,
            parent=parent
        )
        yield deciphered
        # Run the matcher over the deciphered bytes via a temporary file.
        with Tempfile(obj) as f:
            yield from matcher.match(f, parent=deciphered)
    elif isinstance(obj, PSBytes):
        # NOTE(review): unlike the branches above, `match` is used as a
        # parent here but is never yielded itself — confirm this is intended.
        if isinstance(obj, PNGPredictor):
            match = Submatch(
                "PNGPredictor",
                bytes(obj.original_bytes),
                decoded=obj,
                relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
                length=obj.pdf_bytes,
                parent=parent
            )
            yield from parse_object(obj.params, matcher=matcher, parent=match, pdf_header_offset=pdf_header_offset)
            yield from parse_object(obj.original_bytes, matcher=matcher, parent=match,
                                    pdf_header_offset=pdf_header_offset)
        else:
            match = Submatch(
                obj.__class__.__name__,
                bytes(obj),
                relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
                length=obj.pdf_bytes,
                parent=parent
            )
            if hasattr(obj, "original_bytes"):
                yield from parse_object(obj.original_bytes, matcher=matcher, parent=match,
                                        pdf_header_offset=pdf_header_offset)
        # recursively match against the deflated contents
        with Tempfile(obj) as f:
            yield from matcher.match(f, parent=match)
    elif hasattr(obj, "pdf_offset") and hasattr(obj, "pdf_bytes"):
        # Any other located object becomes a single leaf submatch.
        yield Submatch(
            obj.__class__.__name__,
            obj,
            relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
            length=obj.pdf_bytes,
            parent=parent
        )
933
+
934
+
935
class InstrumentedPDFDocument(PDFDocument):
    """PDFDocument that tolerates a missing /Root and tracks deciphered data."""

    def __init__(self, *args, **kwargs):
        """Initialise the document, retrying once if no /Root object is found.

        For the retry, ``PDFXRef.get_trailer`` is temporarily replaced by a
        stub returning ``{"Root": {}}`` and restored afterwards.
        """
        self._xrefs = []
        self._decipher: Optional[DecipherCallable] = None
        try:
            super().__init__(*args, **kwargs)
        except PDFSyntaxError as syntax_error:
            if "No /Root object" not in str(syntax_error):
                raise
            # this is a malformed PDF without a trailer root object
            original_get_trailer = PDFXRef.get_trailer

            def empty_root_trailer(_):
                return {"Root": {}}

            try:
                PDFXRef.get_trailer = empty_root_trailer
                # try it again with our patched trailer loading:
                super().__init__(*args, **kwargs)
            finally:
                PDFXRef.get_trailer = original_get_trailer

    @property
    def decipher(self) -> DecipherCallable:
        """``None`` when no decipher is set, otherwise the wrapping ``do_decipher``."""
        return None if self._decipher is None else self.do_decipher

    @decipher.setter
    def decipher(self, new_value: DecipherCallable):
        self._decipher = new_value

    def do_decipher(self, *args, **kwargs) -> PSBytes:
        """Call the stored decipher and tag plain-bytes results with a location.

        The location is taken from the first positional argument that is a
        PSBytes. If there is none, the result is returned unchanged.
        """
        result = self._decipher(*args, **kwargs)
        if not isinstance(result, bytes) or isinstance(result, PSBytes):
            return result
        source = next((arg for arg in args if isinstance(arg, PSBytes)), None)
        if source is None:
            return result
        return PDFDeciphered(
            result,
            pdf_offset=source.pdf_offset,
            pdf_bytes=source.pdf_bytes,
            original_bytes=source
        )
991
+
992
+
993
+ # The default libmagic test for detecting PDFs is too restrictive:
994
class RelaxedPDFMatcher(MagicTest):
    """Magic test that matches any data containing ``%PDF-`` anywhere."""

    def __init__(self):
        super().__init__(
            offset=AbsoluteOffset(0),
            mime="application/pdf",
            extensions=("pdf",),
            message="Malformed PDF"
        )

    def subtest_type(self) -> TestType:
        return TestType.BINARY

    def test(self, data: bytes, absolute_offset: int, parent_match: Optional[TestResult]) -> TestResult:
        """Match the whole of ``data`` if it contains the PDF header marker."""
        if b"%PDF-" not in data:
            return FailedTest(self, offset=0, message="data did not contain \"%PDF-\"")
        return MatchedTest(self, value=data, offset=0, length=len(data))


# Register the relaxed test with the default matcher instance.
MagicMatcher.DEFAULT_INSTANCE.add(RelaxedPDFMatcher())
1013
+
1014
+
1015
def reverse_skip_whitespace(file_stream) -> bool:
    """Move backwards over whitespace that precedes the current position.

    On return the stream is positioned just after the nearest preceding
    non-whitespace byte, or at the earliest position the stream allows.

    Args:
        file_stream: stream supporting ``read(n)`` and
            ``seek(offset, from_what=1)``; seeking out of range is expected
            to raise IndexError.

    Returns:
        True if at least one whitespace byte was skipped.
    """
    # All six PDF white-space characters. Without "\r", files with CRLF line
    # endings stop one byte short of the preceding token.
    whitespace = (b' ', b'\t', b'\n', b'\r', b'\x0c', b'\x00')
    found_whitespace = False
    while True:
        try:
            file_stream.seek(-1, from_what=1)
        except IndexError:
            break
        b = file_stream.read(1)
        if b not in whitespace:
            break
        found_whitespace = True
        # The read moved forwards again; step back over the whitespace byte.
        file_stream.seek(-1, from_what=1)
    return found_whitespace
1028
+
1029
+
1030
def skip_whitespace(file_stream) -> bool:
    """Advance the stream past any whitespace at the current position.

    On return the stream is positioned at the first non-whitespace byte, or
    at end of stream if only whitespace remained.

    Args:
        file_stream: stream supporting ``read(n)`` and
            ``seek(offset, from_what=1)``.

    Returns:
        True if at least one whitespace byte was skipped.
    """
    # All six PDF white-space characters. Without "\r", files with CRLF line
    # endings stop before the following token is reached.
    whitespace = (b' ', b'\t', b'\n', b'\r', b'\x0c', b'\x00')
    found_whitespace = False
    while True:
        b = file_stream.read(1)
        if b not in whitespace:
            # Un-read the byte just consumed. At end of stream nothing was
            # read, so there is nothing to give back.
            if b:
                try:
                    file_stream.seek(-1, from_what=1)
                except IndexError:
                    pass
            break
        found_whitespace = True
    return found_whitespace
1042
+
1043
+
1044
def reverse_expect(file_stream, expected: Union[bytes, Callable[[int, bytes], bool]]) -> bytes:
    """Match content immediately before the current stream position.

    ``expected`` is either the exact bytes that must precede the position,
    or a predicate called as ``expected(count_so_far, byte)`` for each
    preceding byte, walking backwards until it returns a false value.

    Returns:
        The matched bytes, with the stream left positioned at their start.
        ``b""`` if the literal does not match or a seek raises IndexError;
        the position is then unchanged.
    """
    origin = file_stream.tell()
    matched = 0
    with file_stream.save_pos():
        if not isinstance(expected, bytes):
            # Predicate mode: walk backwards one byte at a time.
            while True:
                try:
                    file_stream.seek(origin - matched - 1)
                except IndexError:
                    return b""
                candidate = file_stream.read(1)
                if not expected(matched, candidate):
                    break
                matched += 1
        else:
            width = len(expected)
            try:
                file_stream.seek(-width, from_what=1)
            except IndexError:
                return b""
            if file_stream.read(width) != expected:
                return b""
            matched = width
    begin = origin - matched
    file_stream.seek(begin)
    try:
        return file_stream.read(matched)
    finally:
        # Leave the stream at the start of the matched bytes.
        file_stream.seek(begin)
1071
+
1072
+
1073
def pdf_obj_parser(file_stream, obj, objid: int, parent: Match, pdf_header_offset: int = 0) -> Iterator[Submatch]:
    """Yield a "PDFObject" submatch for ``obj``, followed by its contents.

    The span initially comes from the object's own location. It is widened
    backwards when an "<digits> <digits> obj" header directly precedes it in
    ``file_stream``, and forwards when "endobj" follows it.

    Args:
        file_stream: stream used to look for the surrounding header and
            trailer; its position is saved and restored via ``save_pos()``.
        obj: the parsed object; PDFObjectStream instances are handled
            separately from everything else.
        objid: object id used for naming when ``obj`` is not a stream.
        parent: match under which the object's submatch is created.
        pdf_header_offset: subtracted from ``parent.offset`` when converting
            to stream positions.
    """
    data: Optional[bytes] = None
    if isinstance(obj, PDFObjectStream):
        log.status(f"Parsing PDF obj {obj.objid!s} {obj.genno!s}")
        try:
            data = obj.get_data()
        except PDFNotImplementedError as e:
            # Best effort: report the failure and carry on without the data.
            log.error(f"Unsupported PDF stream filter in object {obj.objid!s} {obj.genno!s}: {e!s}")
        relative_offset = obj.attrs.pdf_offset
        obj_length = obj.data_value.pdf_offset - obj.attrs.pdf_offset + obj.data_value.pdf_bytes - 1
    else:
        log.status(f"Parsing PDF obj {objid!s}")
        relative_offset = obj.pdf_offset
        obj_length = obj.pdf_bytes - 1
    with file_stream.save_pos():
        file_stream.seek(parent.offset + relative_offset - pdf_header_offset)
        reverse_skip_whitespace(file_stream)
        # Walk backwards over "obj", a run of digits, and another run of
        # digits, each separated by whitespace.
        if reverse_expect(file_stream, b"obj") and reverse_skip_whitespace(file_stream):
            version = reverse_expect(file_stream, lambda _, b: ord('0') <= b[0] <= ord('9'))
            if version and reverse_skip_whitespace(file_stream):
                obj_id = reverse_expect(file_stream, lambda _, b: ord('0') <= b[0] <= ord('9'))
                if obj_id:
                    # Pull the start back to the first digit of the header.
                    obj_offset = parent.offset + relative_offset - pdf_header_offset - file_stream.tell()
                    relative_offset -= obj_offset
                    obj_length += obj_offset
        file_stream.seek(parent.offset + relative_offset - pdf_header_offset + obj_length)
        skip_whitespace(file_stream)
        if file_stream.read(6) == b"endobj":
            skip_whitespace(file_stream)
            # Extend the span through "endobj" and any whitespace after it.
            obj_length = file_stream.tell() - (parent.offset + relative_offset - pdf_header_offset)
    if isinstance(obj, PDFObjectStream):
        match = Submatch(
            name="PDFObject",
            display_name=f"PDFObject{obj.objid!s}.{obj.genno!s}",
            match_obj=(obj.objid, obj.genno),
            relative_offset=relative_offset,
            length=obj_length,
            parent=parent
        )
        yield match
        yield from parse_object(obj.attrs, matcher=parent.matcher, parent=match, pdf_header_offset=pdf_header_offset)
        if data is not None:
            yield from parse_object(data, matcher=parent.matcher, parent=match, pdf_header_offset=pdf_header_offset)
    else:
        match = Submatch(
            name="PDFObject",
            display_name=f"PDFObject{objid}",
            match_obj=objid,
            relative_offset=relative_offset,
            length=obj_length,
            parent=parent
        )
        yield match
        yield from parse_object(obj, parent.matcher, match, pdf_header_offset=pdf_header_offset)
    log.clear_status()
1128
+
1129
+
1130
@register_parser("application/pdf")
def pdf_parser(file_stream, parent: Match):
    """Yield submatches describing the structure of a PDF.

    If the ``%PDF`` header is found at a positive offset, the leading bytes
    are reported as ``IgnoredPDFPreamble`` and the remainder is re-parsed
    through a ``FileStream`` that starts at the header.  Otherwise, for every
    xref of the document this yields:

    * one ``PDFObject`` subtree per not-yet-seen object (via ``pdf_obj_parser``),
    * a ``Trailer`` subtree with a ``KeyValuePair``/``Key``/``Value`` per entry,
    * for ``PDFXRef`` instances, an ``XRefTable`` with one ``XRefRow`` per row
      and ``ObjectID``/``Position``/``Generation`` children for the row cells
      that are not ``None``.

    Empty trailers and empty xref tables are skipped.
    """
    # pdfminer expects %PDF to be at byte offset zero in the file
    pdf_header_offset = file_stream.first_index_of(b"%PDF")
    if pdf_header_offset > 0:
        # the PDF header does not start at byte offset zero!
        yield Submatch(
            "IgnoredPDFPreamble",
            b"",
            relative_offset=0,
            length=pdf_header_offset,
            parent=parent
        )
        pdf_content = Submatch(
            "OffsetPDFContent",
            b"",
            relative_offset=pdf_header_offset,
            parent=parent
        )
        yield pdf_content
        with FileStream(file_stream, start=pdf_header_offset) as f:
            yield from pdf_parser(f, pdf_content)
        return
    pdf_header_offset = file_stream.start
    parser = PDFParser(RawPDFStream(file_stream))
    doc = InstrumentedPDFDocument(parser)
    # Keys already emitted: (objid, genno) for object streams, objid otherwise.
    yielded = set()
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            try:
                obj = doc.getobj(objid)
            except PDFObjectNotFound:
                continue
            if isinstance(obj, PDFObjectStream):
                if (obj.objid, obj.genno) in yielded:
                    continue
                yielded.add((obj.objid, obj.genno))
            else:
                # Objects lacking position attributes cannot be mapped to a byte range.
                if objid in yielded or not hasattr(obj, "pdf_offset") or not hasattr(obj, "pdf_bytes"):
                    continue
                yielded.add(objid)
            yield from pdf_obj_parser(file_stream, obj, objid, parent, pdf_header_offset=pdf_header_offset)

        trailer = xref.get_trailer()
        # A truthiness test (rather than ``is not None``) also skips an empty
        # trailer, for which min()/max() below would raise ValueError.
        if trailer:
            trailer_start = min(k.pdf_offset for k in trailer.keys())
            trailer_end = max(v.pdf_offset + v.pdf_bytes for v in trailer.values())
            t = Submatch(
                "Trailer",
                b"",
                relative_offset=trailer_start,
                length=trailer_end - trailer_start,
                parent=parent
            )
            yield t
            for k, v in trailer.items():
                kvp = Submatch(
                    "KeyValuePair",
                    b"",
                    relative_offset=k.pdf_offset - trailer_start,
                    length=v.pdf_offset + v.pdf_bytes - k.pdf_offset,
                    parent=t
                )
                yield kvp
                yield Submatch(
                    "Key",
                    k,
                    # The key is the first thing in its key/value pair.
                    relative_offset=0,
                    length=k.pdf_bytes,
                    parent=kvp
                )
                value_match = Submatch(
                    "Value",
                    b"",
                    relative_offset=v.pdf_offset - k.pdf_offset,
                    length=v.pdf_bytes,
                    parent=kvp
                )
                yield value_match
                yield from parse_object(v, matcher=parent.matcher, parent=value_match,
                                        pdf_header_offset=pdf_header_offset)

        if not isinstance(xref, PDFXRef):
            continue

        # Compute each row's byte span once; rows whose cells are all None
        # have no span and are dropped.
        rows = []
        for row in xref.offsets.values():
            cells = [c for c in row if c is not None]
            if cells:
                rows.append((
                    row,
                    min(c.pdf_offset for c in cells),
                    max(c.pdf_offset + c.pdf_bytes for c in cells),
                ))
        if not rows:
            # Nothing to report, and min()/max() over no rows would raise.
            continue

        xref_start = min(start for _, start, _ in rows)
        xref_end = max(end for _, _, end in rows)
        x = Submatch(
            "XRefTable",
            b"",
            relative_offset=xref_start,
            length=xref_end - xref_start,
            parent=parent
        )
        yield x
        for row, row_start, row_end in rows:
            row_match = Submatch(
                "XRefRow",
                b"",
                relative_offset=row_start - xref_start,
                length=row_end - row_start,
                parent=x
            )
            yield row_match
            obj_id, pos, gen_no = row
            for cell_name, cell in (("ObjectID", obj_id), ("Position", pos), ("Generation", gen_no)):
                if cell is None:
                    continue
                cell_match = Submatch(
                    cell_name,
                    b"",
                    relative_offset=cell.pdf_offset - row_start,
                    length=cell.pdf_bytes,
                    parent=row_match
                )
                yield cell_match
                # Parse the cell token itself, not the Submatch that wraps it.
                yield from parse_object(cell, matcher=parent.matcher, parent=cell_match,
                                        pdf_header_offset=pdf_header_offset)