polyfile-weave 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of polyfile-weave might be problematic. Click here for more details.
- polyfile/__init__.py +15 -0
- polyfile/__main__.py +394 -0
- polyfile/arithmetic.py +27 -0
- polyfile/ast.py +114 -0
- polyfile/debugger.py +1039 -0
- polyfile/expressions.py +346 -0
- polyfile/fileutils.py +343 -0
- polyfile/html.py +135 -0
- polyfile/http/__init__.py +1 -0
- polyfile/http/defacto.py +37 -0
- polyfile/http/deprecated.py +51 -0
- polyfile/http/experimental.py +67 -0
- polyfile/http/http_11.py +548 -0
- polyfile/http/matcher.py +37 -0
- polyfile/http/structured_headers.py +48 -0
- polyfile/iterators.py +72 -0
- polyfile/jpeg.py +24 -0
- polyfile/kaitai/__init__.py +0 -0
- polyfile/kaitai/compiler.py +156 -0
- polyfile/kaitai/parser.py +312 -0
- polyfile/kaitai/parsers/__init__.py +0 -0
- polyfile/kaitai/parsers/aix_utmp.py +116 -0
- polyfile/kaitai/parsers/allegro_dat.py +367 -0
- polyfile/kaitai/parsers/andes_firmware.py +64 -0
- polyfile/kaitai/parsers/android_bootldr_asus.py +105 -0
- polyfile/kaitai/parsers/android_bootldr_huawei.py +181 -0
- polyfile/kaitai/parsers/android_bootldr_qcom.py +217 -0
- polyfile/kaitai/parsers/android_dto.py +138 -0
- polyfile/kaitai/parsers/android_img.py +319 -0
- polyfile/kaitai/parsers/android_nanoapp_header.py +83 -0
- polyfile/kaitai/parsers/android_opengl_shaders_cache.py +151 -0
- polyfile/kaitai/parsers/android_sparse.py +237 -0
- polyfile/kaitai/parsers/android_super.py +401 -0
- polyfile/kaitai/parsers/apm_partition_table.py +196 -0
- polyfile/kaitai/parsers/apple_single_double.py +180 -0
- polyfile/kaitai/parsers/asn1_der.py +235 -0
- polyfile/kaitai/parsers/au.py +138 -0
- polyfile/kaitai/parsers/avantes_roh60.py +112 -0
- polyfile/kaitai/parsers/avi.py +296 -0
- polyfile/kaitai/parsers/bcd.py +111 -0
- polyfile/kaitai/parsers/bitcoin_transaction.py +210 -0
- polyfile/kaitai/parsers/blender_blend.py +334 -0
- polyfile/kaitai/parsers/bmp.py +780 -0
- polyfile/kaitai/parsers/bson.py +411 -0
- polyfile/kaitai/parsers/btrfs_stream.py +318 -0
- polyfile/kaitai/parsers/bytes_with_io.py +27 -0
- polyfile/kaitai/parsers/chrome_pak.py +194 -0
- polyfile/kaitai/parsers/code_6502.py +456 -0
- polyfile/kaitai/parsers/compressed_resource.py +217 -0
- polyfile/kaitai/parsers/cpio_old_le.py +154 -0
- polyfile/kaitai/parsers/cramfs.py +344 -0
- polyfile/kaitai/parsers/creative_voice_file.py +342 -0
- polyfile/kaitai/parsers/dbf.py +274 -0
- polyfile/kaitai/parsers/dcmp_0.py +664 -0
- polyfile/kaitai/parsers/dcmp_1.py +422 -0
- polyfile/kaitai/parsers/dcmp_2.py +312 -0
- polyfile/kaitai/parsers/dcmp_variable_length_integer.py +66 -0
- polyfile/kaitai/parsers/dex.py +1086 -0
- polyfile/kaitai/parsers/dicom.py +4370 -0
- polyfile/kaitai/parsers/dime_message.py +201 -0
- polyfile/kaitai/parsers/dns_packet.py +569 -0
- polyfile/kaitai/parsers/doom_wad.py +654 -0
- polyfile/kaitai/parsers/dos_datetime.py +191 -0
- polyfile/kaitai/parsers/dos_mz.py +172 -0
- polyfile/kaitai/parsers/ds_store.py +513 -0
- polyfile/kaitai/parsers/dtb.py +310 -0
- polyfile/kaitai/parsers/dune_2_pak.py +126 -0
- polyfile/kaitai/parsers/edid.py +472 -0
- polyfile/kaitai/parsers/efivar_signature_list.py +331 -0
- polyfile/kaitai/parsers/elf.py +2482 -0
- polyfile/kaitai/parsers/ethernet_frame.py +114 -0
- polyfile/kaitai/parsers/exif.py +723 -0
- polyfile/kaitai/parsers/ext2.py +537 -0
- polyfile/kaitai/parsers/fallout2_dat.py +187 -0
- polyfile/kaitai/parsers/fallout_dat.py +156 -0
- polyfile/kaitai/parsers/fasttracker_xm_module.py +558 -0
- polyfile/kaitai/parsers/ftl_dat.py +90 -0
- polyfile/kaitai/parsers/genmidi_op2.py +161 -0
- polyfile/kaitai/parsers/gettext_mo.py +541 -0
- polyfile/kaitai/parsers/gif.py +492 -0
- polyfile/kaitai/parsers/gimp_brush.py +244 -0
- polyfile/kaitai/parsers/glibc_utmp.py +114 -0
- polyfile/kaitai/parsers/gltf_binary.py +132 -0
- polyfile/kaitai/parsers/google_protobuf.py +151 -0
- polyfile/kaitai/parsers/gpt_partition_table.py +175 -0
- polyfile/kaitai/parsers/gran_turismo_vol.py +140 -0
- polyfile/kaitai/parsers/grub2_font.py +337 -0
- polyfile/kaitai/parsers/gzip.py +232 -0
- polyfile/kaitai/parsers/hashcat_restore.py +60 -0
- polyfile/kaitai/parsers/hccap.py +111 -0
- polyfile/kaitai/parsers/hccapx.py +103 -0
- polyfile/kaitai/parsers/heaps_pak.py +177 -0
- polyfile/kaitai/parsers/heroes_of_might_and_magic_agg.py +116 -0
- polyfile/kaitai/parsers/heroes_of_might_and_magic_bmp.py +34 -0
- polyfile/kaitai/parsers/icmp_packet.py +136 -0
- polyfile/kaitai/parsers/ico.py +129 -0
- polyfile/kaitai/parsers/id3v1_1.py +220 -0
- polyfile/kaitai/parsers/id3v2_3.py +324 -0
- polyfile/kaitai/parsers/id3v2_4.py +423 -0
- polyfile/kaitai/parsers/ines.py +282 -0
- polyfile/kaitai/parsers/ipv4_packet.py +158 -0
- polyfile/kaitai/parsers/ipv6_packet.py +55 -0
- polyfile/kaitai/parsers/iso9660.py +544 -0
- polyfile/kaitai/parsers/java_class.py +1113 -0
- polyfile/kaitai/parsers/jpeg.py +361 -0
- polyfile/kaitai/parsers/luks.py +149 -0
- polyfile/kaitai/parsers/lzh.py +165 -0
- polyfile/kaitai/parsers/mac_os_resource_snd.py +493 -0
- polyfile/kaitai/parsers/mach_o.py +3033 -0
- polyfile/kaitai/parsers/mach_o_fat.py +92 -0
- polyfile/kaitai/parsers/magicavoxel_vox.py +391 -0
- polyfile/kaitai/parsers/manifest.json +1 -0
- polyfile/kaitai/parsers/mbr_partition_table.py +119 -0
- polyfile/kaitai/parsers/mcap.py +1015 -0
- polyfile/kaitai/parsers/microsoft_cfb.py +293 -0
- polyfile/kaitai/parsers/microsoft_network_monitor_v2.py +309 -0
- polyfile/kaitai/parsers/microsoft_pe.py +765 -0
- polyfile/kaitai/parsers/mifare_classic.py +706 -0
- polyfile/kaitai/parsers/minecraft_nbt.py +449 -0
- polyfile/kaitai/parsers/monomakh_sapr_chg.py +69 -0
- polyfile/kaitai/parsers/mozilla_mar.py +239 -0
- polyfile/kaitai/parsers/mp4.py +333 -0
- polyfile/kaitai/parsers/msgpack.py +467 -0
- polyfile/kaitai/parsers/nitf.py +1189 -0
- polyfile/kaitai/parsers/nt_mdt_pal.py +155 -0
- polyfile/kaitai/parsers/ogg.py +118 -0
- polyfile/kaitai/parsers/openpgp_message.py +993 -0
- polyfile/kaitai/parsers/packet_ppi.py +515 -0
- polyfile/kaitai/parsers/pcap.py +344 -0
- polyfile/kaitai/parsers/pcf_font.py +506 -0
- polyfile/kaitai/parsers/pcx.py +195 -0
- polyfile/kaitai/parsers/pcx_dcx.py +79 -0
- polyfile/kaitai/parsers/phar_without_stub.py +399 -0
- polyfile/kaitai/parsers/php_serialized_value.py +505 -0
- polyfile/kaitai/parsers/png.py +721 -0
- polyfile/kaitai/parsers/protocol_body.py +260 -0
- polyfile/kaitai/parsers/psx_tim.py +104 -0
- polyfile/kaitai/parsers/python_pickle.py +718 -0
- polyfile/kaitai/parsers/python_pyc_27.py +510 -0
- polyfile/kaitai/parsers/quake_mdl.py +441 -0
- polyfile/kaitai/parsers/quake_pak.py +112 -0
- polyfile/kaitai/parsers/quicktime_mov.py +634 -0
- polyfile/kaitai/parsers/rar.py +265 -0
- polyfile/kaitai/parsers/regf.py +569 -0
- polyfile/kaitai/parsers/renderware_binary_stream.py +877 -0
- polyfile/kaitai/parsers/resource_fork.py +611 -0
- polyfile/kaitai/parsers/respack.py +57 -0
- polyfile/kaitai/parsers/riff.py +409 -0
- polyfile/kaitai/parsers/rpm.py +964 -0
- polyfile/kaitai/parsers/rtcp_payload.py +579 -0
- polyfile/kaitai/parsers/rtp_packet.py +150 -0
- polyfile/kaitai/parsers/rtpdump.py +115 -0
- polyfile/kaitai/parsers/ruby_marshal.py +423 -0
- polyfile/kaitai/parsers/s3m.py +493 -0
- polyfile/kaitai/parsers/saints_row_2_vpp_pc.py +254 -0
- polyfile/kaitai/parsers/shapefile_index.py +174 -0
- polyfile/kaitai/parsers/shapefile_main.py +893 -0
- polyfile/kaitai/parsers/some_ip.py +209 -0
- polyfile/kaitai/parsers/some_ip_container.py +37 -0
- polyfile/kaitai/parsers/some_ip_sd.py +86 -0
- polyfile/kaitai/parsers/some_ip_sd_entries.py +160 -0
- polyfile/kaitai/parsers/some_ip_sd_options.py +374 -0
- polyfile/kaitai/parsers/specpr.py +404 -0
- polyfile/kaitai/parsers/sqlite3.py +472 -0
- polyfile/kaitai/parsers/ssh_public_key.py +252 -0
- polyfile/kaitai/parsers/standard_midi_file.py +390 -0
- polyfile/kaitai/parsers/stl.py +111 -0
- polyfile/kaitai/parsers/sudoers_ts.py +201 -0
- polyfile/kaitai/parsers/swf.py +406 -0
- polyfile/kaitai/parsers/systemd_journal.py +361 -0
- polyfile/kaitai/parsers/tcp_segment.py +57 -0
- polyfile/kaitai/parsers/tga.py +213 -0
- polyfile/kaitai/parsers/tls_client_hello.py +293 -0
- polyfile/kaitai/parsers/tr_dos_image.py +322 -0
- polyfile/kaitai/parsers/tsm.py +198 -0
- polyfile/kaitai/parsers/ttf.py +1847 -0
- polyfile/kaitai/parsers/udp_datagram.py +42 -0
- polyfile/kaitai/parsers/uefi_te.py +236 -0
- polyfile/kaitai/parsers/uimage.py +198 -0
- polyfile/kaitai/parsers/utf8_string.py +137 -0
- polyfile/kaitai/parsers/vfat.py +410 -0
- polyfile/kaitai/parsers/vlq_base128_be.py +104 -0
- polyfile/kaitai/parsers/vlq_base128_le.py +129 -0
- polyfile/kaitai/parsers/vmware_vmdk.py +167 -0
- polyfile/kaitai/parsers/vp8_ivf.py +112 -0
- polyfile/kaitai/parsers/warcraft_2_pud.py +423 -0
- polyfile/kaitai/parsers/wav.py +1014 -0
- polyfile/kaitai/parsers/websocket.py +167 -0
- polyfile/kaitai/parsers/windows_evt_log.py +304 -0
- polyfile/kaitai/parsers/windows_lnk_file.py +467 -0
- polyfile/kaitai/parsers/windows_minidump.py +575 -0
- polyfile/kaitai/parsers/windows_resource_file.py +243 -0
- polyfile/kaitai/parsers/windows_shell_items.py +190 -0
- polyfile/kaitai/parsers/windows_systemtime.py +52 -0
- polyfile/kaitai/parsers/wmf.py +502 -0
- polyfile/kaitai/parsers/xar.py +181 -0
- polyfile/kaitai/parsers/xwd.py +189 -0
- polyfile/kaitai/parsers/zip.py +685 -0
- polyfile/kaitai/parsers/zisofs.py +158 -0
- polyfile/kaitai/parsers/zx_spectrum_tap.py +184 -0
- polyfile/kaitaimatcher.py +113 -0
- polyfile/languagematcher.py +217 -0
- polyfile/logger.py +135 -0
- polyfile/magic.py +2983 -0
- polyfile/magic_defs/COPYING +29 -0
- polyfile/magic_defs/__init__.py +0 -0
- polyfile/magic_defs/acorn +102 -0
- polyfile/magic_defs/adi +13 -0
- polyfile/magic_defs/adventure +122 -0
- polyfile/magic_defs/aes +29 -0
- polyfile/magic_defs/algol68 +35 -0
- polyfile/magic_defs/allegro +9 -0
- polyfile/magic_defs/alliant +18 -0
- polyfile/magic_defs/alpha +32 -0
- polyfile/magic_defs/amanda +12 -0
- polyfile/magic_defs/amigaos +218 -0
- polyfile/magic_defs/android +259 -0
- polyfile/magic_defs/animation +1197 -0
- polyfile/magic_defs/aout +46 -0
- polyfile/magic_defs/apache +28 -0
- polyfile/magic_defs/apl +7 -0
- polyfile/magic_defs/apple +773 -0
- polyfile/magic_defs/application +7 -0
- polyfile/magic_defs/applix +13 -0
- polyfile/magic_defs/apt +52 -0
- polyfile/magic_defs/archive +2586 -0
- polyfile/magic_defs/aria +38 -0
- polyfile/magic_defs/arm +50 -0
- polyfile/magic_defs/asf +132 -0
- polyfile/magic_defs/assembler +18 -0
- polyfile/magic_defs/asterix +18 -0
- polyfile/magic_defs/att3b +41 -0
- polyfile/magic_defs/audio +1291 -0
- polyfile/magic_defs/avm +33 -0
- polyfile/magic_defs/basis +18 -0
- polyfile/magic_defs/beetle +7 -0
- polyfile/magic_defs/ber +65 -0
- polyfile/magic_defs/bflt +14 -0
- polyfile/magic_defs/bhl +10 -0
- polyfile/magic_defs/bioinformatics +178 -0
- polyfile/magic_defs/biosig +154 -0
- polyfile/magic_defs/blackberry +8 -0
- polyfile/magic_defs/blcr +25 -0
- polyfile/magic_defs/blender +50 -0
- polyfile/magic_defs/blit +24 -0
- polyfile/magic_defs/bm +10 -0
- polyfile/magic_defs/bout +11 -0
- polyfile/magic_defs/bsdi +33 -0
- polyfile/magic_defs/bsi +10 -0
- polyfile/magic_defs/btsnoop +13 -0
- polyfile/magic_defs/burp +7 -0
- polyfile/magic_defs/bytecode +41 -0
- polyfile/magic_defs/c-lang +110 -0
- polyfile/magic_defs/c64 +531 -0
- polyfile/magic_defs/cad +437 -0
- polyfile/magic_defs/cafebabe +107 -0
- polyfile/magic_defs/cbor +21 -0
- polyfile/magic_defs/ccf +14 -0
- polyfile/magic_defs/cddb +12 -0
- polyfile/magic_defs/chord +15 -0
- polyfile/magic_defs/cisco +12 -0
- polyfile/magic_defs/citrus +12 -0
- polyfile/magic_defs/clarion +27 -0
- polyfile/magic_defs/claris +48 -0
- polyfile/magic_defs/clipper +65 -0
- polyfile/magic_defs/clojure +30 -0
- polyfile/magic_defs/coff +98 -0
- polyfile/magic_defs/commands +201 -0
- polyfile/magic_defs/communications +22 -0
- polyfile/magic_defs/compress +461 -0
- polyfile/magic_defs/console +1213 -0
- polyfile/magic_defs/convex +69 -0
- polyfile/magic_defs/coverage +91 -0
- polyfile/magic_defs/cracklib +14 -0
- polyfile/magic_defs/crypto +31 -0
- polyfile/magic_defs/csv +8 -0
- polyfile/magic_defs/ctags +6 -0
- polyfile/magic_defs/ctf +23 -0
- polyfile/magic_defs/cubemap +8 -0
- polyfile/magic_defs/cups +56 -0
- polyfile/magic_defs/dact +11 -0
- polyfile/magic_defs/database +886 -0
- polyfile/magic_defs/dataone +47 -0
- polyfile/magic_defs/dbpf +15 -0
- polyfile/magic_defs/der +146 -0
- polyfile/magic_defs/diamond +12 -0
- polyfile/magic_defs/dif +33 -0
- polyfile/magic_defs/diff +41 -0
- polyfile/magic_defs/digital +59 -0
- polyfile/magic_defs/dolby +69 -0
- polyfile/magic_defs/dsf +25 -0
- polyfile/magic_defs/dump +96 -0
- polyfile/magic_defs/dwarfs +45 -0
- polyfile/magic_defs/dyadic +61 -0
- polyfile/magic_defs/ebml +8 -0
- polyfile/magic_defs/edid +11 -0
- polyfile/magic_defs/editors +43 -0
- polyfile/magic_defs/efi +15 -0
- polyfile/magic_defs/elf +379 -0
- polyfile/magic_defs/encore +22 -0
- polyfile/magic_defs/epoc +62 -0
- polyfile/magic_defs/erlang +21 -0
- polyfile/magic_defs/espressif +57 -0
- polyfile/magic_defs/esri +28 -0
- polyfile/magic_defs/etf +33 -0
- polyfile/magic_defs/fcs +9 -0
- polyfile/magic_defs/filesystems +2694 -0
- polyfile/magic_defs/finger +16 -0
- polyfile/magic_defs/firmware +133 -0
- polyfile/magic_defs/flash +62 -0
- polyfile/magic_defs/flif +36 -0
- polyfile/magic_defs/fonts +449 -0
- polyfile/magic_defs/forth +82 -0
- polyfile/magic_defs/fortran +9 -0
- polyfile/magic_defs/frame +62 -0
- polyfile/magic_defs/freebsd +164 -0
- polyfile/magic_defs/fsav +128 -0
- polyfile/magic_defs/fusecompress +12 -0
- polyfile/magic_defs/games +696 -0
- polyfile/magic_defs/gcc +17 -0
- polyfile/magic_defs/gconv +10 -0
- polyfile/magic_defs/gentoo +85 -0
- polyfile/magic_defs/geo +166 -0
- polyfile/magic_defs/geos +20 -0
- polyfile/magic_defs/gimp +77 -0
- polyfile/magic_defs/git +13 -0
- polyfile/magic_defs/glibc +21 -0
- polyfile/magic_defs/gnome +59 -0
- polyfile/magic_defs/gnu +173 -0
- polyfile/magic_defs/gnumeric +8 -0
- polyfile/magic_defs/gpt +240 -0
- polyfile/magic_defs/gpu +28 -0
- polyfile/magic_defs/grace +21 -0
- polyfile/magic_defs/graphviz +12 -0
- polyfile/magic_defs/gringotts +48 -0
- polyfile/magic_defs/guile +13 -0
- polyfile/magic_defs/hardware +12 -0
- polyfile/magic_defs/hitachi-sh +30 -0
- polyfile/magic_defs/hp +433 -0
- polyfile/magic_defs/human68k +26 -0
- polyfile/magic_defs/ibm370 +52 -0
- polyfile/magic_defs/ibm6000 +35 -0
- polyfile/magic_defs/icc +214 -0
- polyfile/magic_defs/iff +80 -0
- polyfile/magic_defs/images +4210 -0
- polyfile/magic_defs/inform +9 -0
- polyfile/magic_defs/intel +310 -0
- polyfile/magic_defs/interleaf +9 -0
- polyfile/magic_defs/island +10 -0
- polyfile/magic_defs/ispell +63 -0
- polyfile/magic_defs/isz +15 -0
- polyfile/magic_defs/java +52 -0
- polyfile/magic_defs/javascript +171 -0
- polyfile/magic_defs/jpeg +252 -0
- polyfile/magic_defs/json +8 -0
- polyfile/magic_defs/karma +9 -0
- polyfile/magic_defs/kde +11 -0
- polyfile/magic_defs/keepass +20 -0
- polyfile/magic_defs/kerberos +45 -0
- polyfile/magic_defs/kicad +85 -0
- polyfile/magic_defs/kml +34 -0
- polyfile/magic_defs/lammps +64 -0
- polyfile/magic_defs/lecter +6 -0
- polyfile/magic_defs/lex +12 -0
- polyfile/magic_defs/lif +50 -0
- polyfile/magic_defs/linux +557 -0
- polyfile/magic_defs/lisp +78 -0
- polyfile/magic_defs/llvm +22 -0
- polyfile/magic_defs/locoscript +12 -0
- polyfile/magic_defs/lua +31 -0
- polyfile/magic_defs/luks +126 -0
- polyfile/magic_defs/m4 +11 -0
- polyfile/magic_defs/mach +303 -0
- polyfile/magic_defs/macintosh +505 -0
- polyfile/magic_defs/macos +7 -0
- polyfile/magic_defs/magic +10 -0
- polyfile/magic_defs/magic.mgc +0 -0
- polyfile/magic_defs/mail.news +132 -0
- polyfile/magic_defs/make +21 -0
- polyfile/magic_defs/map +413 -0
- polyfile/magic_defs/maple +109 -0
- polyfile/magic_defs/marc21 +30 -0
- polyfile/magic_defs/mathcad +8 -0
- polyfile/magic_defs/mathematica +188 -0
- polyfile/magic_defs/matroska +17 -0
- polyfile/magic_defs/mcrypt +52 -0
- polyfile/magic_defs/measure +44 -0
- polyfile/magic_defs/mercurial +13 -0
- polyfile/magic_defs/metastore +8 -0
- polyfile/magic_defs/meteorological +53 -0
- polyfile/magic_defs/microfocus +21 -0
- polyfile/magic_defs/mime +9 -0
- polyfile/magic_defs/mips +120 -0
- polyfile/magic_defs/mirage +8 -0
- polyfile/magic_defs/misctools +140 -0
- polyfile/magic_defs/mkid +11 -0
- polyfile/magic_defs/mlssa +8 -0
- polyfile/magic_defs/mmdf +6 -0
- polyfile/magic_defs/modem +92 -0
- polyfile/magic_defs/modulefile +9 -0
- polyfile/magic_defs/motorola +71 -0
- polyfile/magic_defs/mozilla +37 -0
- polyfile/magic_defs/msdos +2304 -0
- polyfile/magic_defs/msooxml +68 -0
- polyfile/magic_defs/msvc +222 -0
- polyfile/magic_defs/msx +309 -0
- polyfile/magic_defs/mup +24 -0
- polyfile/magic_defs/music +17 -0
- polyfile/magic_defs/nasa +7 -0
- polyfile/magic_defs/natinst +24 -0
- polyfile/magic_defs/ncr +49 -0
- polyfile/magic_defs/neko +12 -0
- polyfile/magic_defs/netbsd +251 -0
- polyfile/magic_defs/netscape +26 -0
- polyfile/magic_defs/netware +11 -0
- polyfile/magic_defs/news +13 -0
- polyfile/magic_defs/nifty +202 -0
- polyfile/magic_defs/nim-lang +29 -0
- polyfile/magic_defs/nitpicker +14 -0
- polyfile/magic_defs/numpy +9 -0
- polyfile/magic_defs/oasis +12 -0
- polyfile/magic_defs/ocaml +14 -0
- polyfile/magic_defs/octave +6 -0
- polyfile/magic_defs/ole2compounddocs +760 -0
- polyfile/magic_defs/olf +98 -0
- polyfile/magic_defs/openfst +17 -0
- polyfile/magic_defs/opentimestamps +16 -0
- polyfile/magic_defs/oric +16 -0
- polyfile/magic_defs/os2 +186 -0
- polyfile/magic_defs/os400 +39 -0
- polyfile/magic_defs/os9 +80 -0
- polyfile/magic_defs/osf1 +10 -0
- polyfile/magic_defs/palm +156 -0
- polyfile/magic_defs/parix +13 -0
- polyfile/magic_defs/parrot +22 -0
- polyfile/magic_defs/pascal +39 -0
- polyfile/magic_defs/pbf +11 -0
- polyfile/magic_defs/pbm +8 -0
- polyfile/magic_defs/pc88 +24 -0
- polyfile/magic_defs/pc98 +77 -0
- polyfile/magic_defs/pci_ids +116 -0
- polyfile/magic_defs/pcjr +8 -0
- polyfile/magic_defs/pdf +51 -0
- polyfile/magic_defs/pdp +42 -0
- polyfile/magic_defs/perl +100 -0
- polyfile/magic_defs/pgf +52 -0
- polyfile/magic_defs/pgp +581 -0
- polyfile/magic_defs/pgp-binary-keys +388 -0
- polyfile/magic_defs/pkgadd +7 -0
- polyfile/magic_defs/plan9 +25 -0
- polyfile/magic_defs/playdate +57 -0
- polyfile/magic_defs/plus5 +18 -0
- polyfile/magic_defs/pmem +46 -0
- polyfile/magic_defs/polyfile_zip +5 -0
- polyfile/magic_defs/polyml +23 -0
- polyfile/magic_defs/printer +269 -0
- polyfile/magic_defs/project +10 -0
- polyfile/magic_defs/psdbms +14 -0
- polyfile/magic_defs/psl +14 -0
- polyfile/magic_defs/pulsar +13 -0
- polyfile/magic_defs/puzzle +17 -0
- polyfile/magic_defs/pwsafe +14 -0
- polyfile/magic_defs/pyramid +12 -0
- polyfile/magic_defs/python +305 -0
- polyfile/magic_defs/qt +30 -0
- polyfile/magic_defs/revision +66 -0
- polyfile/magic_defs/riff +840 -0
- polyfile/magic_defs/rinex +44 -0
- polyfile/magic_defs/ringdove +45 -0
- polyfile/magic_defs/rpi +52 -0
- polyfile/magic_defs/rpm +45 -0
- polyfile/magic_defs/rpmsg +7 -0
- polyfile/magic_defs/rst +11 -0
- polyfile/magic_defs/rtf +94 -0
- polyfile/magic_defs/ruby +55 -0
- polyfile/magic_defs/rust +21 -0
- polyfile/magic_defs/sc +7 -0
- polyfile/magic_defs/sccs +24 -0
- polyfile/magic_defs/scientific +144 -0
- polyfile/magic_defs/securitycerts +6 -0
- polyfile/magic_defs/selinux +24 -0
- polyfile/magic_defs/sendmail +37 -0
- polyfile/magic_defs/sequent +42 -0
- polyfile/magic_defs/sereal +35 -0
- polyfile/magic_defs/sgi +144 -0
- polyfile/magic_defs/sgml +161 -0
- polyfile/magic_defs/sharc +23 -0
- polyfile/magic_defs/sinclair +40 -0
- polyfile/magic_defs/sisu +18 -0
- polyfile/magic_defs/sketch +6 -0
- polyfile/magic_defs/smalltalk +25 -0
- polyfile/magic_defs/smile +34 -0
- polyfile/magic_defs/sniffer +482 -0
- polyfile/magic_defs/softquad +40 -0
- polyfile/magic_defs/sosi +40 -0
- polyfile/magic_defs/spec +21 -0
- polyfile/magic_defs/spectrum +184 -0
- polyfile/magic_defs/sql +288 -0
- polyfile/magic_defs/ssh +39 -0
- polyfile/magic_defs/ssl +20 -0
- polyfile/magic_defs/statistics +45 -0
- polyfile/magic_defs/subtitle +38 -0
- polyfile/magic_defs/sun +141 -0
- polyfile/magic_defs/svf +5 -0
- polyfile/magic_defs/sylk +36 -0
- polyfile/magic_defs/symbos +42 -0
- polyfile/magic_defs/sysex +429 -0
- polyfile/magic_defs/tcl +29 -0
- polyfile/magic_defs/teapot +6 -0
- polyfile/magic_defs/terminfo +63 -0
- polyfile/magic_defs/tex +141 -0
- polyfile/magic_defs/tgif +7 -0
- polyfile/magic_defs/ti-8x +239 -0
- polyfile/magic_defs/timezone +42 -0
- polyfile/magic_defs/tplink +95 -0
- polyfile/magic_defs/troff +38 -0
- polyfile/magic_defs/tuxedo +8 -0
- polyfile/magic_defs/typeset +8 -0
- polyfile/magic_defs/uf2 +72 -0
- polyfile/magic_defs/unicode +15 -0
- polyfile/magic_defs/unisig +12 -0
- polyfile/magic_defs/unknown +34 -0
- polyfile/magic_defs/usd +21 -0
- polyfile/magic_defs/uterus +16 -0
- polyfile/magic_defs/uuencode +28 -0
- polyfile/magic_defs/vacuum-cleaner +54 -0
- polyfile/magic_defs/varied.out +46 -0
- polyfile/magic_defs/varied.script +21 -0
- polyfile/magic_defs/vax +32 -0
- polyfile/magic_defs/vicar +17 -0
- polyfile/magic_defs/virtual +307 -0
- polyfile/magic_defs/virtutech +12 -0
- polyfile/magic_defs/visx +32 -0
- polyfile/magic_defs/vms +30 -0
- polyfile/magic_defs/vmware +6 -0
- polyfile/magic_defs/vorbis +155 -0
- polyfile/magic_defs/vxl +14 -0
- polyfile/magic_defs/warc +16 -0
- polyfile/magic_defs/weak +16 -0
- polyfile/magic_defs/web +18 -0
- polyfile/magic_defs/webassembly +17 -0
- polyfile/magic_defs/windows +1811 -0
- polyfile/magic_defs/wireless +7 -0
- polyfile/magic_defs/wordprocessors +630 -0
- polyfile/magic_defs/wsdl +23 -0
- polyfile/magic_defs/x68000 +25 -0
- polyfile/magic_defs/xdelta +13 -0
- polyfile/magic_defs/xenix +106 -0
- polyfile/magic_defs/xilinx +58 -0
- polyfile/magic_defs/xo65 +37 -0
- polyfile/magic_defs/xwindows +43 -0
- polyfile/magic_defs/yara +17 -0
- polyfile/magic_defs/zfs +96 -0
- polyfile/magic_defs/zilog +12 -0
- polyfile/magic_defs/zip +126 -0
- polyfile/magic_defs/zyxel +17 -0
- polyfile/nes.py +144 -0
- polyfile/nitf.py +15 -0
- polyfile/pdf.py +1264 -0
- polyfile/pickles.py +45 -0
- polyfile/polyfile.py +409 -0
- polyfile/profiling.py +115 -0
- polyfile/repl.py +624 -0
- polyfile/search.py +310 -0
- polyfile/serialization.py +323 -0
- polyfile/structmatcher.py +46 -0
- polyfile/structs.py +281 -0
- polyfile/templates/download.js +162 -0
- polyfile/templates/hexdump.css +268 -0
- polyfile/templates/hexdump.js +756 -0
- polyfile/templates/jquery-3.4.1.min.js +2 -0
- polyfile/templates/template.html +119 -0
- polyfile/wildcards.py +62 -0
- polyfile/zipmatcher.py +183 -0
- polyfile_weave-0.5.5.dist-info/METADATA +173 -0
- polyfile_weave-0.5.5.dist-info/RECORD +585 -0
- polyfile_weave-0.5.5.dist-info/WHEEL +5 -0
- polyfile_weave-0.5.5.dist-info/entry_points.txt +2 -0
- polyfile_weave-0.5.5.dist-info/licenses/LICENSE +202 -0
- polyfile_weave-0.5.5.dist-info/top_level.txt +2 -0
- polymerge/__init__.py +1 -0
- polymerge/__main__.py +296 -0
- polymerge/cfg.py +127 -0
- polymerge/polymerge.py +227 -0
- polymerge/polytracker.py +190 -0
polyfile/pdf.py
ADDED
|
@@ -0,0 +1,1264 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from typing import Callable, Dict, Iterator, List, Optional, Type, TypeVar, Union
|
|
3
|
+
import zlib
|
|
4
|
+
|
|
5
|
+
from pdfminer.ascii85 import ascii85decode, asciihexdecode
|
|
6
|
+
from pdfminer.ccitt import ccittfaxdecode
|
|
7
|
+
from pdfminer.lzw import lzwdecode
|
|
8
|
+
from pdfminer.pdfparser import PDFSyntaxError
|
|
9
|
+
from pdfminer.pdftypes import PDFNotImplementedError
|
|
10
|
+
from pdfminer.runlength import rldecode
|
|
11
|
+
from pdfminer.pdfparser import PDFParser as PDFMinerParser, PDFStream, PDFObjRef
|
|
12
|
+
from pdfminer.psparser import ExtraT, PSBaseParserToken, PSKeyword, PSObject, PSLiteral, PSStackEntry, PSSyntaxError
|
|
13
|
+
from pdfminer.pdfdocument import (
|
|
14
|
+
PDFDocument, PDFXRef, KWD, PDFNoValidXRef, PSEOF, dict_value, LITERAL_XREF, LITERAL_OBJSTM, LITERAL_CATALOG,
|
|
15
|
+
DecipherCallable, PDFObjectNotFound
|
|
16
|
+
)
|
|
17
|
+
from pdfminer.pdftypes import (
|
|
18
|
+
LIT, LITERALS_FLATE_DECODE, LITERALS_ASCIIHEX_DECODE, LITERALS_CCITTFAX_DECODE, LITERALS_RUNLENGTH_DECODE,
|
|
19
|
+
LITERAL_CRYPT, LITERALS_LZW_DECODE, LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE, LITERALS_ASCII85_DECODE,
|
|
20
|
+
int_value, apply_png_predictor
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
from .fileutils import FileStream
|
|
24
|
+
from .fileutils import Tempfile
|
|
25
|
+
from .logger import getStatusLogger
|
|
26
|
+
from .magic import AbsoluteOffset, FailedTest, MagicMatcher, MagicTest, MatchedTest, TestResult, TestType
|
|
27
|
+
from .polyfile import Match, Matcher, Submatch, register_parser
|
|
28
|
+
|
|
29
|
+
# Module-level logger for this file, obtained from the package's logger module under the name "PDF".
log = getStatusLogger("PDF")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def load_trailer(self, parser: "PDFParser") -> None:
    """Read the ``trailer`` keyword and the object after it, merging the result into ``self.trailer``.

    Installed on pdfminer's ``PDFXRef`` further down in this module, so ``self`` is the
    xref section being populated.

    :param parser: parser positioned at the ``trailer`` keyword.
    :raises AssertionError: if the next token is not the ``trailer`` keyword.
    :raises PDFNoValidXRef: if the input ends and the parser has nothing left to pop.
    """
    try:
        _, kwd = parser.nexttoken()
        assert kwd == KWD(b'trailer'), f"{kwd!s} != {KWD(b'trailer')!s}"
        saved_auto_flush = parser.auto_flush
        # This might be a bug in pdfminer, or it's just that we are using it wrong, but we need to
        # flush our entire token stack to the results list in order to parse the trailer dict:
        parser.auto_flush = True
        try:
            _, dic = parser.nextobject()
        finally:
            # Always restore the caller's setting, even when nextobject() raises.
            parser.auto_flush = saved_auto_flush
    except PSEOF:
        # End of input: fall back to whatever single entry the parser can still pop.
        leftover = parser.pop(1)
        if not leftover:
            raise PDFNoValidXRef('Unexpected EOF - file corrupted')
        _, dic = leftover[0]
    self.trailer.update(dict_value(dic))
    log.debug('trailer=%r', self.trailer)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def load_xref(self: PDFXRef, parser: "PDFParser"):
    """Read the subsections of a cross-reference table into ``self.offsets``, then load the trailer.

    Each subsection starts with a ``<first object id> <count>`` line followed by ``count``
    three-field entry lines. Only entries whose third field is ``n`` are recorded, as
    ``(None, position, generation)``. Reading stops at the first line that starts with
    ``trailer``; the parser is rewound to that line and ``self.load_trailer`` is called.

    :param parser: parser positioned just after the ``xref`` keyword (presumably; verify against caller).
    :raises PDFNoValidXRef: on premature end of input or a malformed line.
    """
    while True:
        try:
            line_pos, header = parser.nextline()
            header = header.strip()
            if not header:
                continue
        except PSEOF:
            raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
        if header.startswith(b"trailer"):
            # Rewind so load_trailer() sees the keyword itself.
            parser.seek(line_pos)
            break
        header_fields = header.split(b" ")
        if len(header_fields) != 2:
            raise PDFNoValidXRef("Trailer not found: {!r}: line={!r}".format(parser, header))
        try:
            first_id, count = map(int, header_fields)
        except ValueError:
            raise PDFNoValidXRef("Invalid line: {!r}: line={!r}".format(parser, header))
        for objid in range(first_id, first_id + count):
            try:
                _, entry = parser.nextline()
                entry = entry.strip()
            except PSEOF:
                raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
            entry_fields = entry.split(b" ")
            if len(entry_fields) != 3:
                raise PDFNoValidXRef("Invalid XRef format: {!r}, line={!r}".format(parser, entry))
            pos_b, genno_b, use_b = entry_fields
            if use_b == b"n":
                # __int__() is called directly rather than through int(); PSToken.__int__ in this
                # module returns a PSInt, so this presumably preserves the file location when the
                # fields are PSToken instances - verify against the parser's nextline().
                self.offsets[objid] = (None, pos_b.__int__(), genno_b.__int__())
    log.debug("xref objects: %r", self.offsets)
    self.load_trailer(parser)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# Monkey-patch: make pdfminer's PDFXRef use this module's load_trailer() for trailer parsing.
PDFXRef.load_trailer = load_trailer
|
|
96
|
+
# Monkey-patch: make pdfminer's PDFXRef use this module's load_xref() as its load() method.
PDFXRef.load = load_xref
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class PSToken:
|
|
100
|
+
pdf_offset: int
|
|
101
|
+
pdf_bytes: int
|
|
102
|
+
|
|
103
|
+
def __new__(cls, *args, **kwargs):
|
|
104
|
+
ret = super().__new__(cls, *args)
|
|
105
|
+
ret.pdf_offset = kwargs["pdf_offset"]
|
|
106
|
+
ret.pdf_bytes = kwargs["pdf_bytes"]
|
|
107
|
+
return ret
|
|
108
|
+
|
|
109
|
+
def __int__(self):
|
|
110
|
+
if isinstance(self, PSInt):
|
|
111
|
+
return self
|
|
112
|
+
return PSInt(int(self, base=10), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)
|
|
113
|
+
|
|
114
|
+
def __float__(self):
|
|
115
|
+
if isinstance(self, float):
|
|
116
|
+
return self
|
|
117
|
+
elif isinstance(self, int):
|
|
118
|
+
return PSFloat(int(self, base=10), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)
|
|
119
|
+
elif isinstance(self, bytes):
|
|
120
|
+
return PSFloat(self.decode(), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)
|
|
121
|
+
elif isinstance(self, PSStr):
|
|
122
|
+
return PSFloat(str(self), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)
|
|
123
|
+
else:
|
|
124
|
+
raise NotImplementedError()
|
|
125
|
+
|
|
126
|
+
def __bytes__(self):
|
|
127
|
+
if isinstance(self, PSBytes):
|
|
128
|
+
return self
|
|
129
|
+
else:
|
|
130
|
+
return PSBytes(self, pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)
|
|
131
|
+
|
|
132
|
+
def __hex__(self):
|
|
133
|
+
return PSStr(super().__hex__(), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)
|
|
134
|
+
|
|
135
|
+
def __str__(self):
|
|
136
|
+
raise NotImplementedError()
|
|
137
|
+
# return PSStr(super().__str__(), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)
|
|
138
|
+
|
|
139
|
+
def __repr__(self):
|
|
140
|
+
return f"{self.__class__.__name__}({super().__repr__()}, pdf_offset={self.pdf_offset!r}, "\
|
|
141
|
+
f"pdf_bytes={self.pdf_bytes!r})"
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class PSInt(PSToken, int):
|
|
145
|
+
def __index__(self):
|
|
146
|
+
return self
|
|
147
|
+
|
|
148
|
+
def __str__(self):
|
|
149
|
+
return str(int(self))
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
C = TypeVar("C")
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
class PSSequence(PSToken):
    """Mixin for sequence-like tokens that carry ``pdf_offset`` / ``pdf_bytes``.

    It is combined with ``str``, ``bytes`` and ``list`` further down in this
    module (``PSStr``, ``PSBytes``, ``PDFList``).  The helpers here re-implement
    a few sequence operations so that the pieces they return are again
    instances carrying position attributes.
    """

    def split(self: Type[C], sep: Optional[C] = None, maxsplit: int = -1) -> List[C]:
        """Split the sequence one element at a time, keeping pieces as sliced instances.

        With ``sep is None`` the sequence is first stripped and then split on
        runs of elements for which ``strip()`` yields something empty.
        Otherwise a piece ends whenever the accumulated piece ends with ``sep``.

        NOTE(review): when the loop stops because ``len(result)`` exceeds
        ``maxsplit``, whatever is still in ``remainder`` does not appear to be
        added to the result -- confirm this is intended.
        """
        remainder = self
        current: Optional[C] = None
        result: List[C] = []
        if sep is None:
            remainder = remainder.strip()
        while remainder and (maxsplit < 0 or len(result) <= maxsplit):
            # Peel off one element as a length-1 slice (goes through __getitem__).
            c = remainder[0:1]
            remainder = remainder[1:]
            if sep is None:
                if not c.strip():
                    # Separator element: close the piece being built, if any.
                    if current is not None:
                        result.append(current)
                        current = None
                else:
                    if current is None:
                        current = c
                    else:
                        current += c
            else:
                if current is None:
                    current = c
                else:
                    current += c
                # The piece is complete once it ends with the separator.
                if current[-len(sep):] == sep:
                    result.append(current[:-len(sep)])
                    current = None
        if current is not None:
            # Flush the trailing piece, or glue it onto the last one when the
            # split limit has already been exceeded.
            if not result or maxsplit < 0 or len(result) <= maxsplit:
                result.append(current)
            else:
                result[-1] += current
        return result

    def __add__(self: Type[C], other) -> C:
        """Concatenate and return a new instance anchored at this token's offset."""
        if isinstance(other, self.__class__) and other.pdf_offset == self.pdf_offset + self.pdf_bytes:
            # `other` starts right where this token ends: no explicit pdf_bytes
            # is passed, so the constructor's own default applies.
            return self.__class__(super().__add__(other), pdf_offset=self.pdf_offset)
        # Otherwise keep this token's own byte count for the result.
        return self.__class__(super().__add__(other), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)

    def __radd__(self: Type[C], other) -> C:
        """Handle ``other + self`` by wrapping ``other`` as if it ended where this token starts."""
        return self.__class__(other, pdf_offset=self.pdf_offset - len(other)) + self

    def lstrip(self: Type[C], chars: bytes = b" \t\n\r") -> C:
        """Drop leading elements contained in ``chars`` by repeated slicing.

        NOTE(review): the default ``chars`` is ``bytes``; for a ``str``-based
        subclass ``ret[0] in chars`` presumably raises ``TypeError`` -- confirm.
        """
        ret = self
        while ret and ret[0] in chars:
            ret = ret[1:]
        return ret

    def rstrip(self: Type[C], chars: bytes = b" \t\n\r") -> C:
        """Drop trailing elements contained in ``chars`` by repeated slicing."""
        ret = self
        while ret and ret[-1] in chars:
            ret = ret[:-1]
        return ret

    def strip(self: Type[C], chars: bytes = b" \t\n\r") -> C:
        """Apply ``lstrip`` followed by ``rstrip`` with the same ``chars``."""
        return self.lstrip(chars).rstrip(chars)

    def __getitem__(self, item):
        """Index or slice, wrapping the result so it carries position attributes.

        NOTE(review): negative indices / slice bounds are added to
        ``pdf_offset`` as-is, and the slice ``pdf_bytes`` is computed as
        ``self.pdf_bytes - (stop - start)`` rather than ``stop - start`` --
        both look unintended; confirm before relying on these values.
        """
        if isinstance(item, int):
            value = super().__getitem__(item)
            return make_ps_object(value, pdf_offset=self.pdf_offset+item, pdf_bytes=self.pdf_bytes-item)
        elif isinstance(item, slice):
            if item.start is None:
                start = 0
            else:
                start = item.start
            if item.stop is None:
                stop = self.pdf_bytes
            else:
                stop = item.stop
            try:
                return self.__class__(
                    super().__getitem__(item),
                    pdf_offset=self.pdf_offset+start,
                    pdf_bytes=self.pdf_bytes-(stop - start)
                )
            except ValueError:
                # Some subclasses in this module (e.g. PDFStreamFilter,
                # PNGPredictor, PDFDeciphered) raise ValueError from __new__
                # when their extra keyword arguments are missing; fall back to
                # a plain PSBytes for those.
                if isinstance(self, PSBytes):
                    return PSBytes(
                        super().__getitem__(item),
                        pdf_offset=self.pdf_offset+start,
                        pdf_bytes=self.pdf_bytes-(stop - start)
                    )
                else:
                    raise
        else:
            return super().__getitem__(item)
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
class PSStr(PSSequence, str):
    """A ``str`` token that carries PDF position attributes via ``PSSequence``."""

    def encode(self, encoding: str = "utf-8", errors: str = "strict") -> "PSBytes":
        """Encode to a ``PSBytes`` that keeps this string's position attributes.

        The defaults match ``str.encode``.  They used to be ``...`` (Ellipsis),
        so calling ``encode()`` without arguments handed Ellipsis objects to
        ``str.encode`` and raised ``TypeError``.

        :param encoding: codec name passed to ``str.encode``
        :param errors: error handling scheme passed to ``str.encode``
        :return: the encoded bytes with this token's ``pdf_offset`` and ``pdf_bytes``
        """
        return PSBytes(super().encode(encoding, errors), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)

    def __str__(self):
        # Use str's own conversion explicitly.
        return str.__str__(self)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
class PSBytes(PSSequence, bytes):
    """A ``bytes`` token that carries PDF position attributes via ``PSSequence``."""

    def __new__(cls, *args, **kwargs):
        """Create the token; ``pdf_bytes`` defaults to the length of the first argument."""
        kwargs = dict(kwargs)
        if "pdf_bytes" not in kwargs:
            kwargs["pdf_bytes"] = len(args[0])
        return super().__new__(cls, *args, **kwargs)

    def __getitem__(self, item):
        """Index or slice while tracking the PDF offset of the result.

        Slices return a ``PSBytes`` anchored at the lowest selected index and
        single indices return a ``PSInt``.  Negative and omitted bounds are
        resolved against ``len(self)`` first; previously a negative index or
        slice start was added to ``pdf_offset`` as-is, yielding an offset
        *before* the start of this sequence.
        """
        if isinstance(item, slice):
            # Resolve None/negative/out-of-range bounds exactly like bytes does.
            selected = range(*item.indices(len(self)))
            if selected:
                # Works for negative steps too: anchor at the lowest index used.
                start = min(selected[0], selected[-1])
            else:
                # Empty slice: anchor at its (clamped) start.
                start = max(selected.start, 0)
            return PSBytes(super().__getitem__(item), pdf_offset=self.pdf_offset + start)
        else:
            if isinstance(item, int) and -len(self) <= item < 0:
                # Normalize an in-range negative index so the offset is correct;
                # out-of-range values are left alone and raise IndexError below.
                item += len(self)
            ret = super().__getitem__(item)
            if isinstance(ret, PSInt):
                return ret
            else:
                return PSInt(ret, pdf_offset=self.pdf_offset + item)

    def decode(self, encoding: str = "utf-8", errors: str = "strict") -> PSStr:
        """Decode to a ``PSStr`` that keeps this token's position attributes."""
        return PSStr(super().decode(encoding, errors), pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)

    def __str__(self):
        # Use bytes' own conversion explicitly.
        return bytes.__str__(self)
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
class PDFDeciphered(PSBytes):
    """Deciphered bytes that also remember the bytes they were produced from."""

    # the bytes as they were before deciphering
    original_bytes: bytes

    def __new__(cls, *args, **kwargs):
        """Build the instance; the ``original_bytes`` keyword argument is mandatory.

        ``pdf_bytes`` defaults to the length of the first positional argument.
        """
        options = {**kwargs}
        if "pdf_bytes" not in options:
            options["pdf_bytes"] = len(args[0])
        if "original_bytes" not in options:
            raise ValueError(f"{cls.__name__}.__init__ requires the `original_bytes` argument")
        # Remove our own keyword before delegating to PSBytes.
        source = options.pop("original_bytes")
        instance = super().__new__(cls, *args, **options)
        instance.original_bytes = source
        return instance
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
class PSFloat(PSToken, float):
    """A ``float`` combined with ``PSToken``."""

    def __str__(self):
        # Delegate explicitly to float's string conversion.
        return float.__str__(self)
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
class PSBool:
    """A boolean value together with the PDF offset and byte count it came from.

    Unlike the other token classes in this module it wraps the value instead
    of subclassing a builtin.
    """

    def __init__(self, value: bool, pdf_offset: int, pdf_bytes: int):
        self.value: bool = value
        self.pdf_offset: int = pdf_offset
        self.pdf_bytes: int = pdf_bytes

    def _position(self):
        """Return this token's position attributes as keyword arguments."""
        return {"pdf_offset": self.pdf_offset, "pdf_bytes": self.pdf_bytes}

    def __repr__(self):
        return (
            f"{self.__class__.__name__}(value={self.value!r}, pdf_offset={self.pdf_offset!r}, "
            f"pdf_bytes={self.pdf_bytes!r})"
        )

    def __str__(self):
        # The string form is a PSStr that keeps the position attributes.
        return PSStr(self.value, **self._position())

    def __bool__(self):
        return self.value

    def __int__(self):
        # The integer form is a PSInt that keeps the position attributes.
        return PSInt(int(self.value), **self._position())

    def __hash__(self):
        return hash(self.value)

    def __eq__(self, other):
        # Comparison is against the truthiness of `other`, whatever its type.
        truth = bool(other)
        return self.value == truth

    def __ne__(self, other):
        truth = bool(other)
        return self.value != truth
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
class PDFLiteral(PSLiteral):
    """A ``PSLiteral`` whose name carries PDF position attributes."""

    def __init__(self, name: PSLiteral.NameType, pdf_offset: int, pdf_bytes: int):
        """Wrap a plain ``str``/``bytes`` name so that it records its position.

        The name is placed one byte after ``pdf_offset``.  A name that is
        already a ``PSStr``/``PSBytes`` (or any other type) is stored unchanged,
        in which case ``pdf_offset`` and ``pdf_bytes`` are not used.
        """
        wrapped = name
        if isinstance(name, str):
            if not isinstance(name, PSStr):
                wrapped = PSStr(name, pdf_offset=pdf_offset + 1, pdf_bytes=pdf_bytes)
        elif isinstance(name, bytes) and not isinstance(name, PSBytes):
            wrapped = PSBytes(name, pdf_offset=pdf_offset + 1, pdf_bytes=pdf_bytes)
        super().__init__(wrapped)

    @property
    def pdf_offset(self) -> int:
        # the literal starts one byte before its name
        return self.name.pdf_offset - 1

    @property
    def pdf_bytes(self) -> int:
        # add one to account for the leading "/"
        return self.name.pdf_bytes + 1

    def __eq__(self, other):
        """Literals compare equal to any ``PSLiteral`` with an equal name."""
        if not isinstance(other, PSLiteral):
            return False
        return self.name == other.name
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
class PDFKeyword(PSKeyword):
    """A ``PSKeyword`` that records its PDF offset and byte count."""

    def __init__(self, name: bytes, pdf_offset: int, pdf_bytes: int):
        """Store the keyword; the byte count is always taken from ``len(name)``."""
        # sometimes we actually lose the length of the token, so rely on the keyword name
        token_length = len(name)
        if isinstance(name, PSBytes):
            wrapped = name
        else:
            wrapped = PSBytes(name, pdf_offset=pdf_offset, pdf_bytes=token_length)
        super().__init__(wrapped)
        self.pdf_offset: int = pdf_offset
        self.pdf_bytes: int = token_length

    def __eq__(self, other):
        """Keywords compare equal to any ``PSKeyword`` with an equal name."""
        if not isinstance(other, PSKeyword):
            return False
        return self.name == other.name

    def __str__(self):
        return f"/{self.name!s}"

    def __repr__(self):
        cls_name = self.__class__.__name__
        return f"{cls_name}({self.name!r}, pdf_offset={self.pdf_offset}, pdf_bytes={self.pdf_bytes})"
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
# Alias for the token types referenced in annotations in this module
# (e.g. make_ps_object and PDFParser.token_context).
PDFBaseParserToken = Union[PSFloat, PSBool, PDFLiteral, PSKeyword, PSBytes, PSInt]
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
# pdfminer.pdfdocument unfortunately tests for equality with these literals
# using `is` rather than `==`, so we must return their singletons from a dict
# rather than our instrumented PDFLiteral objects.
PROTECTED_LITERALS: Dict[str, PSLiteral] = {
    literal.name: literal
    for literal in (LITERAL_OBJSTM, LITERAL_XREF, LITERAL_CATALOG)
}
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
# Second base class for PDFDict.  Before Python 3.7, we'll get an MRO error if
# we extend from both dict and Dict, so plain `object` is used there instead.
PDFDict_Type = (
    object
    if sys.version_info < (3, 7)
    else Dict[PSStr, Union[PDFBaseParserToken, PSStr, "PDFDict", "PDFList"]]
)
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
class PDFDict(dict, PDFDict_Type):
    """A ``dict`` that records the PDF offset and byte count it was parsed from."""

    pdf_offset: int
    pdf_bytes: int

    def __new__(cls, *args, pdf_offset: int, pdf_bytes: int, **kwargs):
        """Allocate the dict and attach the (keyword-only, required) position attributes."""
        instance = super().__new__(cls, *args, **kwargs)
        instance.pdf_offset = pdf_offset
        instance.pdf_bytes = pdf_bytes
        return instance

    def __init__(self, *args, **kwargs):
        """Populate the dict, ignoring the position keywords consumed by ``__new__``."""
        initial = {
            key: value
            for key, value in kwargs.items()
            if key not in ("pdf_offset", "pdf_bytes")
        }
        super().__init__(*args, **initial)

    def get(self, key, default = None):
        """Like ``dict.get``, but map protected literals to their singleton objects."""
        found = super().get(key, default)
        if not isinstance(found, PDFLiteral):
            return found
        # we must return the protected literals as their singleton version:
        return PROTECTED_LITERALS.get(found.name, found)

    def __str__(self):
        # Use dict's own conversion explicitly.
        return dict.__str__(self)
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
class PDFList(PSSequence, list):
    """A ``list`` that carries PDF position attributes via ``PSSequence``."""

    @staticmethod
    def load(iterable) -> "PDFList":
        """Build a ``PDFList`` whose bounds span the positioned items of ``iterable``.

        Items lacking ``pdf_offset``/``pdf_bytes`` are kept in the list but do
        not contribute to the bounds.

        :raises ValueError: if no item provides both position attributes
        """
        items = list(iterable)
        spans = [
            (entry.pdf_offset, entry.pdf_offset + entry.pdf_bytes)
            for entry in items
            if hasattr(entry, "pdf_offset") and hasattr(entry, "pdf_bytes")
        ]
        if not spans:
            raise ValueError(f"Cannot determine PDF bounds for list {items!r}")
        start_offset = min(begin for begin, _ in spans)
        end_offset = max(finish for _, finish in spans)
        return PDFList(items, pdf_offset=start_offset, pdf_bytes=end_offset - start_offset)

    def __str__(self):
        # Use list's own conversion explicitly.
        return list.__str__(self)
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def make_ps_object(value, pdf_offset: int, pdf_bytes: int) -> Union[PDFBaseParserToken, PSStr, PDFDict]:
    """Return ``value`` as an object that carries ``pdf_offset`` / ``pdf_bytes``.

    Literals become ``PDFLiteral``, plain dicts become ``PDFDict``, and builtin
    scalars are wrapped in the matching ``PS*`` class.  An existing ``PDFDict``
    and any other ``PSObject`` are updated in place and returned as-is.

    :raises NotImplementedError: for value types not handled here
    """
    if isinstance(value, PSLiteral):
        return PDFLiteral(value.name, pdf_offset=pdf_offset, pdf_bytes=pdf_bytes)

    # Unfortunately, we can't convert PSKeywords to PDFKeywords here because
    # pdfminer requires them to be singletons; they are annotated in place by
    # the PSObject branch below instead.

    if isinstance(value, PDFDict):
        value.pdf_offset = pdf_offset
        value.pdf_bytes = pdf_bytes
        return value

    if isinstance(value, dict):
        return PDFDict(value, pdf_offset=pdf_offset, pdf_bytes=pdf_bytes)

    if isinstance(value, PSObject):
        value.pdf_offset = pdf_offset
        if isinstance(value, PSKeyword):
            # sometimes the byte count gets off, so set it to the name size
            pdf_bytes = len(value.name)
        value.pdf_bytes = pdf_bytes
        return value

    # Order matters: the first matching builtin type wins.
    # NOTE(review): bool is a subclass of int, so booleans are caught by the
    # int entry and the bool entry is never reached -- confirm whether callers
    # rely on booleans arriving as PSInt before reordering.
    wrappers = (
        (int, PSInt),
        (float, PSFloat),
        (bool, PSBool),
        (bytes, PSBytes),
        (str, PSStr),
    )
    for builtin_type, wrapper in wrappers:
        if isinstance(value, builtin_type):
            return wrapper(value, pdf_offset=pdf_offset, pdf_bytes=pdf_bytes)
    raise NotImplementedError(f"Add suppport for PSSequences containing elements of type {type(value)}")
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
class DecodingError(bytes):
    """Empty ``bytes`` placeholder standing in for stream data that failed to decode.

    The instance is always ``b''``; the reason for the failure is available as
    ``message``.
    """

    # human-readable description of the failure, if one was supplied
    message: Optional[str]

    def __new__(cls, *args, **kwargs):
        """Create the placeholder.

        The message may be given as the ``message`` keyword or as a first
        positional ``str``.  The positional form used to be silently dropped,
        leaving ``message`` as ``None`` for ``DecodingError(str(e))`` calls.
        Any other positional arguments are ignored, as before.
        """
        if "message" in kwargs:
            message = kwargs["message"]
        elif args and isinstance(args[0], str):
            message = args[0]
        else:
            message = None
        ret = super().__new__(cls, b'')
        ret.message = message
        return ret
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
class PDFStreamFilter(PSBytes):
    """Output of one stream filter, remembering the filter name and its input."""

    # name of the filter that produced these bytes
    name: str
    # the bytes that were fed into the filter
    original_bytes: bytes
    # set when the first positional argument is a DecodingError, else None
    error: Optional[DecodingError]

    def __new__(cls, *args, **kwargs):
        """Build the instance; ``original_bytes`` and ``name`` keywords are mandatory.

        ``pdf_bytes`` defaults to the length of the first positional argument.
        """
        options = dict(kwargs)
        if "pdf_bytes" not in options:
            options["pdf_bytes"] = len(args[0])
        if "original_bytes" not in options:
            raise ValueError(f"{cls.__name__}.__init__ requires the `original_bytes` argument")
        source = options.pop("original_bytes")
        if "name" not in options:
            raise ValueError(f"{cls.__name__}.__init__ requires the `name` argument")
        filter_name = options.pop("name")
        failure = args[0] if isinstance(args[0], DecodingError) else None
        instance = super().__new__(cls, *args, **options)
        instance.original_bytes = source
        instance.name = filter_name
        instance.error = failure
        return instance
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
class PNGPredictor(PSBytes):
    """Bytes produced by a PNG predictor, remembering its parameters and input."""

    # the filter parameters the predictor was applied with
    params: PDFDict
    # the bytes before the predictor was applied
    original_bytes: bytes

    def __new__(cls, *args, **kwargs):
        """Build the instance; ``original_bytes`` and ``params`` keywords are mandatory.

        ``pdf_bytes`` defaults to the length of the first positional argument.
        """
        options = dict(kwargs)
        if "pdf_bytes" not in options:
            options["pdf_bytes"] = len(args[0])
        if "original_bytes" not in options:
            raise ValueError(f"{cls.__name__}.__init__ requires the `original_bytes` argument")
        source = options.pop("original_bytes")
        if "params" not in options:
            raise ValueError(f"{cls.__name__}.__init__ requires the `params` argument")
        predictor_params = options.pop("params")
        instance = super().__new__(cls, *args, **options)
        instance.original_bytes = source
        instance.params = predictor_params
        return instance
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
class PDFObjectStream(PDFStream):
    """A ``PDFStream`` whose raw and decoded data are position-tracking ``PSBytes``."""

    def __init__(self, parent: PDFStream, pdf_offset: int, pdf_bytes: int):
        """Copy ``parent`` into an instrumented stream located at ``pdf_offset``.

        :param parent: the stream to copy attrs, data, decipher and ids from
        :param pdf_offset: offset recorded for the raw data
        :param pdf_bytes: byte count recorded for the raw data
        """
        super().__init__(
            attrs=parent.attrs,
            rawdata=PSBytes(parent.rawdata, pdf_offset=pdf_offset, pdf_bytes=pdf_bytes),
            decipher=parent.decipher
        )
        self.parent: PDFStream = parent
        self.pdf_offset: int = pdf_offset
        self.pdf_bytes: int = pdf_bytes
        # Goes through the `data` setter below, which needs pdf_offset/pdf_bytes.
        self.data = parent.data
        self.objid = parent.objid
        self.genno = parent.genno

    @property
    def data(self) -> Optional[PSBytes]:
        """The decoded data, or ``None`` if none has been stored."""
        return self._data

    @data.setter
    def data(self, new_value: Optional[bytes]):
        # Plain bytes are wrapped so that they carry this stream's position.
        if new_value is not None and not isinstance(new_value, PSBytes):
            self._data = PSBytes(new_value, pdf_offset=self.pdf_offset, pdf_bytes=self.pdf_bytes)
        else:
            self._data = new_value

    @property
    def data_value(self) -> PSBytes:
        """Return ``data`` if set, otherwise ``rawdata``.

        :raises ValueError: if the stream has neither
        """
        if self.data is not None:
            return self.data
        elif self.rawdata is not None:
            return self.rawdata
        else:
            raise ValueError(f"PDFObjectStream {self!r} does not have any data")

    def decode(self):
        """Decipher and run ``rawdata`` through its filters, storing the result in ``data``.

        Every filter that produces output wraps it in a ``PDFStreamFilter``
        (and predictor output in a ``PNGPredictor``) that keeps a reference to
        its input.  A zlib failure is recorded as a ``DecodingError`` instead
        of being raised.

        :raises PDFNotImplementedError: for /Crypt, unknown filters, or
            unsupported predictors
        """
        assert self.data is None \
            and self.rawdata is not None, str((self.data, self.rawdata))
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            assert self.objid is not None
            assert self.genno is not None
            data = self.decipher(self.objid, self.genno, data, self.attrs)
        filters = self.get_filters()
        if not filters:
            self.data = data
            self.rawdata = None
            return
        for (f, params) in filters:
            decoded: Optional[bytes] = None
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                try:
                    decoded = zlib.decompress(data)
                except zlib.error as e:
                    # Pass the text via the `message` keyword: a positional
                    # argument is not picked up as the message by
                    # DecodingError.__new__, which left the message as None.
                    decoded = DecodingError(message=str(e))
            elif f in LITERALS_LZW_DECODE:
                decoded = lzwdecode(data)
            elif f in LITERALS_ASCII85_DECODE:
                decoded = ascii85decode(data)
            elif f in LITERALS_ASCIIHEX_DECODE:
                decoded = asciihexdecode(data)
            elif f in LITERALS_RUNLENGTH_DECODE:
                decoded = rldecode(data)
            elif f in LITERALS_CCITTFAX_DECODE:
                decoded = ccittfaxdecode(data, params)
            elif f in LITERALS_DCT_DECODE or f == LIT("JPXDecode"):
                # This is probably a JPG stream
                # it does not need to be decoded twice.
                # Just return the stream to the user.
                pass
            elif f in LITERALS_JBIG2_DECODE:
                pass
            elif f == LITERAL_CRYPT:
                # not yet..
                raise PDFNotImplementedError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % f)
            if decoded is not None:
                if isinstance(f, PDFLiteral):
                    name = f.name
                else:
                    name = f
                # Keep the filter's input reachable via `original_bytes`.
                data = PDFStreamFilter(
                    decoded,
                    pdf_offset=data.pdf_offset,
                    pdf_bytes=data.pdf_bytes,
                    original_bytes=data,
                    name=name
                )
            # apply predictors
            if params and 'Predictor' in params:
                pred = int_value(params['Predictor'])
                if pred == 1:
                    # no predictor
                    pass
                elif 10 <= pred:
                    # PNG predictor
                    colors = int_value(params.get('Colors', 1))
                    columns = int_value(params.get('Columns', 1))
                    raw_bits_per_component = params.get('BitsPerComponent', 8)
                    bitspercomponent = int_value(raw_bits_per_component)
                    predicted = apply_png_predictor(pred, colors, columns,
                                                    bitspercomponent, data)
                    data = PNGPredictor(
                        predicted,
                        pdf_offset=data.pdf_offset,
                        pdf_bytes=data.pdf_bytes,
                        original_bytes=data,
                        params=params
                    )
                else:
                    error_msg = 'Unsupported predictor: %r' % pred
                    raise PDFNotImplementedError(error_msg)
        self.data = data
        self.rawdata = None
        return
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
class PDFParser(PDFMinerParser):
    """pdfminer parser subclass that annotates tokens and objects with PDF positions."""

    # when True, flush() hands everything on the stack to add_results()
    auto_flush: bool = False

    @staticmethod
    def string_escape(data: Union[bytes, int]) -> str:
        """Render a byte value (or an iterable of them) as printable, escaped text."""
        if not isinstance(data, int):
            return "".join(map(PDFParser.string_escape, data))
        # These take precedence over the printable range (backslash is printable).
        special_cases = (
            (ord('\n'), "\\n"),
            (ord('\t'), "\\t"),
            (ord('\r'), "\\r"),
            (0, "\\0"),
            (ord('\\'), "\\\\"),
        )
        for code, escaped in special_cases:
            if data == code:
                return escaped
        if 32 <= data <= 126:
            return chr(data)
        return f"\\x{data:02X}"

    def token_context(self, token: Union[PDFBaseParserToken, PSStr], padding_bytes: int = 10) -> str:
        """Return two lines: the token with surrounding bytes, and carets under the token.

        The file position is restored before returning.
        """
        saved_position = self.fp.tell()
        try:
            lead = min(token.pdf_offset, padding_bytes)
            self.fp.seek(token.pdf_offset - lead)
            before = PDFParser.string_escape(self.fp.read(lead)) if lead > 0 else ""
            body = PDFParser.string_escape(self.fp.read(token.pdf_bytes))
            after = PDFParser.string_escape(self.fp.read(padding_bytes))
            underline = " " * len(before) + "^" * len(body) + " " * len(after)
            return f"{before}{body}{after}\n{underline}"
        finally:
            self.fp.seek(saved_position)

    def push(self, *objs: PSStackEntry[ExtraT]):
        """Push stack entries, first wrapping or annotating ``(pos, value)`` pairs."""
        transformed = []
        for entry in objs:
            if len(entry) != 2:
                transformed.append(entry)
                continue
            pos = entry[0]
            value = entry[1]
            if isinstance(value, dict):
                span = self._curtokenpos + 1 - pos
                assert span > 0
                transformed.append((pos, PDFDict(value, pdf_offset=pos, pdf_bytes=span + 2)))
            elif isinstance(value, list):
                span = self._curtokenpos + 1 - pos
                assert span > 0
                transformed.append((pos, PDFList(value, pdf_offset=pos, pdf_bytes=span)))
            elif isinstance(value, PDFStream):
                wrapped = PDFObjectStream(value, pdf_offset=pos, pdf_bytes=len(value.rawdata))
                transformed.append((pos, wrapped))
            elif isinstance(value, PSObject) and not isinstance(value, PDFLiteral):
                span = self._curtokenpos + 1 - pos
                if isinstance(value, PDFObjRef):
                    # Extend the span backwards to the object id's offset.
                    shifted = min(pos, value.objid.pdf_offset)
                    span += pos - shifted
                    pos = shifted
                value.pdf_offset = pos
                value.pdf_bytes = span
                transformed.append((pos, value))
            else:
                transformed.append(entry)
        return super().push(*transformed)

    def _add_token(self, obj: PSBaseParserToken):
        """Annotate ``obj`` with position info before handing it to the base parser."""
        # Prefer positions the token already carries; otherwise fall back to
        # the parser's current token position / length.
        pos = obj.pdf_offset if hasattr(obj, "pdf_offset") else self._curtokenpos
        length = obj.pdf_bytes if hasattr(obj, "pdf_bytes") else len(self._curtoken)
        obj = make_ps_object(obj, pdf_offset=pos, pdf_bytes=length)
        # log.info(f"\n{self.token_context(obj)}")
        return super()._add_token(obj)

    def flush(self):
        """Flush via the base class unless ``auto_flush`` is set."""
        if not self.auto_flush:
            super().flush()
        else:
            self.add_results(*self.popall())

    def do_keyword(self, pos: int, token: PSKeyword):
        """Turn the ``R`` keyword into a ``PDFObjRef``; defer everything else to the base class."""
        if token is not self.KEYWORD_R:
            super().do_keyword(pos, token)
            return
        # reference to indirect object
        try:
            ((_, objid), (_, genno)) = self.pop(2)
            obj = PDFObjRef(self.doc, objid, genno)
            self.push((pos, obj))
        except PSSyntaxError:
            pass
|
|
771
|
+
|
|
772
|
+
# def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
|
|
773
|
+
# pos, token = super().nexttoken()
|
|
774
|
+
# if isinstance(token, PSObject):
|
|
775
|
+
# setattr(token, "pdf_offset", pos)
|
|
776
|
+
# elif isinstance(token, int):
|
|
777
|
+
# token = PSInt(token, pdf_offset=pos)
|
|
778
|
+
# elif isinstance(token, bytes):
|
|
779
|
+
# token = PSBytes(token, pdf_offset=pos)
|
|
780
|
+
# elif isinstance(token, float):
|
|
781
|
+
# token = PSFloat(token, pdf_offset=pos)
|
|
782
|
+
# elif isinstance(token, bool):
|
|
783
|
+
# token - PSBool(token, pdf_offset=pos)
|
|
784
|
+
# else:
|
|
785
|
+
# raise NotImplementedError(f"Add support for tokens of type {type(token)}")
|
|
786
|
+
# return pos, token
|
|
787
|
+
|
|
788
|
+
# def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
|
789
|
+
|
|
790
|
+
|
|
791
|
+
class RawPDFStream:
    """Wrapper around a file stream whose ``read`` results remember where they came from."""

    def __init__(self, file_stream):
        self._file_stream = file_stream

    def read(self, *args, **kwargs):
        """Read from the wrapped stream; ``bytes`` results become ``PSBytes``.

        The ``PSBytes`` records the stream position from just before the read.
        Non-``bytes`` results are returned unchanged.
        """
        position = self._file_stream.tell()
        chunk = self._file_stream.read(*args, **kwargs)
        if not isinstance(chunk, bytes):
            return chunk
        return PSBytes(chunk, pdf_offset=position)

    def __getattr__(self, item):
        # Everything not defined here is delegated to the wrapped stream.
        return getattr(self._file_stream, item)
|
|
804
|
+
|
|
805
|
+
|
|
806
|
+
def parse_object(obj, matcher: Matcher, parent: Optional[Match] = None, pdf_header_offset: int = 0):
    """Yield ``Submatch`` objects describing ``obj`` and, recursively, its contents.

    Dispatches on the instrumented type of ``obj`` (stream filter output, list,
    dict, deciphered bytes, other bytes, or anything else that has
    ``pdf_offset``/``pdf_bytes``).  Objects matching none of the branches yield
    nothing.

    :param obj: an instrumented object produced by the classes in this module
    :param matcher: used to recursively match decoded / deciphered byte content
    :param parent: match that the yielded submatches are attached to
    :param pdf_header_offset: subtracted from ``parent.offset`` when computing
        relative offsets

    NOTE(review): ``parent`` defaults to ``None`` but every branch reads
    ``parent.offset`` -- presumably callers always pass a parent; confirm.
    """
    if isinstance(obj, PDFStreamFilter):
        # One submatch for the filter itself (showing its input bytes) ...
        filter_obj = Submatch(
            f"{obj.name!s}",
            bytes(obj.original_bytes),
            relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
            length=obj.pdf_bytes,
            parent=filter_obj_parent if False else parent
        ) if False else Submatch(
            f"{obj.name!s}",
            bytes(obj.original_bytes),
            relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
            length=obj.pdf_bytes,
            parent=parent
        )
        yield filter_obj
        # ... and one child for either the decoded output or the decoding error.
        if obj.error is None:
            stream = Submatch(
                "DecodedStream",
                bytes(obj),
                relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
                length=obj.pdf_bytes,
                parent=filter_obj,
                decoded=bytes(obj)
            )
        else:
            stream = Submatch(
                "DecodingError",
                obj.error.message,
                relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
                length=obj.pdf_bytes,
                parent=filter_obj
            )
        yield stream
        # Descend into the filter's input (which may itself be filter output).
        yield from parse_object(obj.original_bytes, matcher=matcher, parent=stream,
                                pdf_header_offset=pdf_header_offset)
    elif isinstance(obj, PDFList):
        list_obj = Submatch(
            "PDFList",
            '',
            relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
            length=obj.pdf_bytes,
            parent=parent
        )
        yield list_obj
        for item in obj:
            yield from parse_object(item, matcher=matcher, parent=list_obj, pdf_header_offset=pdf_header_offset)
    elif isinstance(obj, PDFDict):
        dict_obj = Submatch(
            "PDFDictionary",
            '',
            relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
            length=obj.pdf_bytes - 1,
            parent=parent
        )
        yield dict_obj
        for key, value in obj.items():
            if not hasattr(value, "pdf_offset") or not hasattr(value, "pdf_bytes"):
                # Plain lists can still be given bounds from their items.
                if isinstance(value, list):
                    value = PDFList.load(value)
                else:
                    raise ValueError(f"Unexpected PDF dictionary value {value!r}")
            # The pair spans from the key (minus one byte) to the end of the value.
            pair = Submatch(
                "KeyValuePair",
                '',
                relative_offset=key.pdf_offset - (dict_obj.offset - pdf_header_offset) - 1,
                length=value.pdf_offset + value.pdf_bytes - key.pdf_offset,
                parent=dict_obj
            )
            yield pair
            yield Submatch(
                "Key",
                key,
                relative_offset=0,
                length=key.pdf_bytes + 1,
                parent=pair
            )
            value_match = Submatch(
                "Value",
                value,
                relative_offset=value.pdf_offset - key.pdf_offset,
                length=value.pdf_bytes,
                parent=pair
            )
            yield value_match
            yield from parse_object(value, matcher=matcher, parent=value_match, pdf_header_offset=pdf_header_offset)
    elif isinstance(obj, PDFDeciphered):
        deciphered = Submatch(
            "PDFDeciphered",
            obj.original_bytes,
            decoded=obj,
            relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
            length=obj.pdf_bytes,
            parent=parent
        )
        yield deciphered
        # Match the deciphered content itself.
        with Tempfile(obj) as f:
            yield from matcher.match(f, parent=deciphered)
    elif isinstance(obj, PSBytes):
        # NOTE(review): unlike the branches above, `match` is created in both
        # arms below but never yielded itself -- confirm whether that is intended.
        if isinstance(obj, PNGPredictor):
            match = Submatch(
                "PNGPredictor",
                bytes(obj.original_bytes),
                decoded=obj,
                relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
                length=obj.pdf_bytes,
                parent=parent
            )
            yield from parse_object(obj.params, matcher=matcher, parent=match, pdf_header_offset=pdf_header_offset)
            yield from parse_object(obj.original_bytes, matcher=matcher, parent=match,
                                    pdf_header_offset=pdf_header_offset)
        else:
            match = Submatch(
                obj.__class__.__name__,
                bytes(obj),
                relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
                length=obj.pdf_bytes,
                parent=parent
            )
            if hasattr(obj, "original_bytes"):
                yield from parse_object(obj.original_bytes, matcher=matcher, parent=match,
                                        pdf_header_offset=pdf_header_offset)
        # recursively match against the deflated contents
        # NOTE(review): placed after the if/else (so it runs for both arms)
        # because both arms bind `match` -- confirm against the original layout.
        with Tempfile(obj) as f:
            yield from matcher.match(f, parent=match)
    elif hasattr(obj, "pdf_offset") and hasattr(obj, "pdf_bytes"):
        # Any other positioned object becomes a leaf submatch named after its class.
        yield Submatch(
            obj.__class__.__name__,
            obj,
            relative_offset=obj.pdf_offset - (parent.offset - pdf_header_offset),
            length=obj.pdf_bytes,
            parent=parent
        )
|
|
933
|
+
|
|
934
|
+
|
|
935
|
+
class InstrumentedPDFDocument(PDFDocument):
    """``PDFDocument`` that tolerates a missing /Root and tracks deciphered bytes."""

    def __init__(self, *args, **kwargs):
        """Initialise the document, retrying once with a stub trailer if /Root is missing."""
        self._xrefs = []
        self._decipher: Optional[DecipherCallable] = None
        try:
            super().__init__(*args, **kwargs)
        except PDFSyntaxError as pse:
            if "No /Root object" not in str(pse):
                raise pse
            # this is a malformed PDF without a trailer root object
            original_get_trailer = PDFXRef.get_trailer

            def stub_trailer(_):
                return {"Root": {}}

            try:
                PDFXRef.get_trailer = stub_trailer
                # try it again with our patched trailer loading:
                super().__init__(*args, **kwargs)
            finally:
                # always undo the class-level patch
                PDFXRef.get_trailer = original_get_trailer

    @property
    def decipher(self) -> DecipherCallable:
        """``None`` while no decipher callable is set, otherwise ``do_decipher``."""
        return None if self._decipher is None else self.do_decipher

    @decipher.setter
    def decipher(self, new_value: DecipherCallable):
        self._decipher = new_value

    def do_decipher(self, *args, **kwargs) -> PSBytes:
        """Call the stored decipher callable and wrap plain ``bytes`` results.

        A plain ``bytes`` result becomes a ``PDFDeciphered`` positioned like the
        first ``PSBytes`` among the positional arguments.  If there is none,
        the result is returned unchanged.
        """
        result = self._decipher(*args, **kwargs)
        if not isinstance(result, bytes) or isinstance(result, PSBytes):
            return result
        source = next((arg for arg in args if isinstance(arg, PSBytes)), None)
        if source is None:
            return result
        return PDFDeciphered(
            result,
            pdf_offset=source.pdf_offset,
            pdf_bytes=source.pdf_bytes,
            original_bytes=source
        )
|
|
991
|
+
|
|
992
|
+
|
|
993
|
+
# The default libmagic test for detecting PDFs is too restrictive:
|
|
994
|
+
class RelaxedPDFMatcher(MagicTest):
    """Magic test that matches any data containing ``%PDF-`` anywhere."""

    def __init__(self):
        super().__init__(
            offset=AbsoluteOffset(0),
            mime="application/pdf",
            extensions=("pdf",),
            message="Malformed PDF",
        )

    def subtest_type(self) -> TestType:
        return TestType.BINARY

    def test(self, data: bytes, absolute_offset: int, parent_match: Optional[TestResult]) -> TestResult:
        """Succeed, covering all of ``data``, if the PDF magic occurs anywhere in it."""
        if b"%PDF-" not in data:
            return FailedTest(self, offset=0, message="data did not contain \"%PDF-\"")
        return MatchedTest(self, value=data, offset=0, length=len(data))
|
|
1010
|
+
|
|
1011
|
+
|
|
1012
|
+
# Register the relaxed PDF test with the default matcher when this module is imported.
MagicMatcher.DEFAULT_INSTANCE.add(RelaxedPDFMatcher())
|
|
1013
|
+
|
|
1014
|
+
|
|
1015
|
+
def reverse_skip_whitespace(file_stream) -> bool:
    """Move ``file_stream`` backwards over the whitespace that precedes its position.

    On return the stream is positioned immediately after the closest preceding
    non-whitespace byte, or at the start of the stream if only whitespace
    precedes the original position.

    :param file_stream: stream supporting ``seek(offset, from_what=1)``
        (relative seek, raising ``IndexError`` when it would move before the
        start) and ``read(1)``.
    :return: ``True`` if at least one whitespace byte was skipped.
    """
    # All six PDF white-space characters: NUL, HT, LF, FF, CR and SP. CR matters
    # in practice because CRLF-terminated files are common.
    whitespace = (b"\x00", b"\t", b"\n", b"\x0c", b"\r", b" ")
    found_whitespace = False
    while True:
        try:
            file_stream.seek(-1, from_what=1)
        except IndexError:
            # Already at the start of the stream; nothing left to inspect.
            break
        if file_stream.read(1) not in whitespace:
            # The read advanced past the byte again, so the stream now sits
            # right after the non-whitespace byte.
            break
        found_whitespace = True
        # Step back over the whitespace byte that was just read.
        file_stream.seek(-1, from_what=1)
    return found_whitespace
|
|
1028
|
+
|
|
1029
|
+
|
|
1030
|
+
def skip_whitespace(file_stream) -> bool:
    """Advance ``file_stream`` past any whitespace at its current position.

    On return the stream is positioned at the first non-whitespace byte, or at
    the end of the stream if only whitespace remained.

    :param file_stream: stream supporting ``read(1)`` and
        ``seek(offset, from_what=1)`` (relative seek).
    :return: ``True`` if at least one whitespace byte was skipped.
    """
    # All six PDF white-space characters: NUL, HT, LF, FF, CR and SP.
    whitespace = (b"\x00", b"\t", b"\n", b"\x0c", b"\r", b" ")
    found_whitespace = False
    while True:
        b = file_stream.read(1)
        if b in whitespace:
            found_whitespace = True
            continue
        if b:
            # Un-read the non-whitespace byte so the caller sees it next.
            try:
                file_stream.seek(-1, from_what=1)
            except IndexError:
                pass
        # An empty read means end of stream: nothing was consumed, so there is
        # nothing to un-read. Seeking back here would rewind over a byte that
        # was legitimately skipped.
        break
    return found_whitespace
|
|
1042
|
+
|
|
1043
|
+
|
|
1044
|
+
def reverse_expect(file_stream, expected: Union[bytes, Callable[[int, bytes], bool]]) -> bytes:
    """Consume the bytes immediately before the stream position if they match.

    :param file_stream: stream supporting ``tell()``, ``save_pos()`` (a context
        manager), ``read(n)`` and ``seek()``; seeks before the start of the
        stream are expected to raise ``IndexError``.
    :param expected: either a literal byte string that must directly precede
        the current position, or a predicate ``expected(index, byte)`` that is
        called for each preceding byte, walking backwards, with ``index``
        counting the bytes accepted so far. The run ends at the first byte the
        predicate rejects or at the start of the stream.
    :return: the matched bytes in file order, or ``b""`` if nothing matched.
        On a match the stream is left at the start of the matched bytes;
        otherwise its position is unchanged.
    """
    skipped_bytes = 0
    start_pos = file_stream.tell()
    with file_stream.save_pos():
        if isinstance(expected, bytes):
            try:
                file_stream.seek(-len(expected), from_what=1)
            except IndexError:
                # Fewer bytes precede the position than the literal needs.
                return b""
            if file_stream.read(len(expected)) != expected:
                return b""
            skipped_bytes = len(expected)
        else:
            while True:
                try:
                    file_stream.seek(start_pos - skipped_bytes - 1)
                except IndexError:
                    # Reached the start of the stream. Bytes accepted so far
                    # are still a valid match, so end the run instead of
                    # discarding them.
                    break
                b = file_stream.read(1)
                if not expected(skipped_bytes, b):
                    break
                skipped_bytes += 1
    file_stream.seek(start_pos - skipped_bytes)
    try:
        return file_stream.read(skipped_bytes)
    finally:
        # Leave the stream at the start of the match even if the read fails.
        file_stream.seek(start_pos - skipped_bytes)
|
|
1071
|
+
|
|
1072
|
+
|
|
1073
|
+
def pdf_obj_parser(file_stream, obj, objid: int, parent: Match, pdf_header_offset: int = 0) -> Iterator[Submatch]:
    """Yield a ``PDFObject`` submatch for ``obj`` followed by matches for its contents.

    The span starts from the offsets recorded on ``obj``. It is widened
    backwards when the bytes before it look like ``<digits> <digits> obj``,
    and its length is recomputed when ``endobj`` follows it, so that it runs
    through ``endobj`` and any whitespace after it.

    :param file_stream: stream used to inspect the bytes around the object;
        its position is restored before anything is yielded.
    :param obj: the object to describe; ``PDFObjectStream`` instances are
        handled separately from all other objects.
    :param objid: object ID used to name the match of a non-stream object.
    :param parent: match the yielded ``PDFObject`` submatch is attached to.
    :param pdf_header_offset: subtracted from ``parent.offset`` plus the
        object's offset to obtain positions in ``file_stream``.
    """
    is_stream = isinstance(obj, PDFObjectStream)
    data: Optional[bytes] = None
    if is_stream:
        log.status(f"Parsing PDF obj {obj.objid!s} {obj.genno!s}")
        try:
            data = obj.get_data()
        except PDFNotImplementedError as e:
            # Leave data as None; the stream's attributes are still reported.
            log.error(f"Unsupported PDF stream filter in object {obj.objid!s} {obj.genno!s}: {e!s}")
        attrs_offset = obj.attrs.pdf_offset
        relative_offset = attrs_offset
        obj_length = obj.data_value.pdf_offset - attrs_offset + obj.data_value.pdf_bytes - 1
    else:
        log.status(f"Parsing PDF obj {objid!s}")
        relative_offset = obj.pdf_offset
        obj_length = obj.pdf_bytes - 1

    def is_ascii_digit(_, b):
        return ord('0') <= b[0] <= ord('9')

    # Stream position that corresponds to a relative offset of zero.
    base = parent.offset - pdf_header_offset

    with file_stream.save_pos():
        file_stream.seek(base + relative_offset)
        reverse_skip_whitespace(file_stream)
        # Walk backwards over "obj", whitespace, a run of digits, whitespace
        # and another run of digits. Every step must succeed for the header
        # to count.
        header_found = (
            reverse_expect(file_stream, b"obj")
            and reverse_skip_whitespace(file_stream)
            and reverse_expect(file_stream, is_ascii_digit)
            and reverse_skip_whitespace(file_stream)
            and reverse_expect(file_stream, is_ascii_digit)
        )
        if header_found:
            # The stream now sits at the first digit of the header.
            header_length = base + relative_offset - file_stream.tell()
            relative_offset -= header_length
            obj_length += header_length
        file_stream.seek(base + relative_offset + obj_length)
        skip_whitespace(file_stream)
        if file_stream.read(6) == b"endobj":
            skip_whitespace(file_stream)
            obj_length = file_stream.tell() - (base + relative_offset)

    if is_stream:
        display_name = f"PDFObject{obj.objid!s}.{obj.genno!s}"
        match_obj = (obj.objid, obj.genno)
    else:
        display_name = f"PDFObject{objid}"
        match_obj = objid
    match = Submatch(
        name="PDFObject",
        display_name=display_name,
        match_obj=match_obj,
        relative_offset=relative_offset,
        length=obj_length,
        parent=parent
    )
    yield match
    if is_stream:
        yield from parse_object(obj.attrs, matcher=parent.matcher, parent=match, pdf_header_offset=pdf_header_offset)
        if data is not None:
            yield from parse_object(data, matcher=parent.matcher, parent=match, pdf_header_offset=pdf_header_offset)
    else:
        yield from parse_object(obj, parent.matcher, match, pdf_header_offset=pdf_header_offset)
    log.clear_status()
|
|
1128
|
+
|
|
1129
|
+
|
|
1130
|
+
@register_parser("application/pdf")
def pdf_parser(file_stream, parent: Match):
    """Parse a PDF and yield submatches for its objects, trailers and xref tables.

    If ``%PDF`` is found at a positive offset, the leading bytes are reported
    as ``IgnoredPDFPreamble`` and the remainder is parsed recursively under an
    ``OffsetPDFContent`` submatch. Otherwise every object reachable through
    the document's cross-references is reported once, followed, per
    cross-reference section, by its trailer and (for ``PDFXRef`` sections) its
    table rows.

    :param file_stream: stream containing the PDF data.
    :param parent: match that the yielded submatches are attached to.
    """
    # pdfminer expects %PDF to be at byte offset zero in the file
    pdf_header_offset = file_stream.first_index_of(b"%PDF")
    if pdf_header_offset > 0:
        # the PDF header does not start at byte offset zero!
        yield Submatch(
            "IgnoredPDFPreamble",
            b"",
            relative_offset=0,
            length=pdf_header_offset,
            parent=parent
        )
        pdf_content = Submatch(
            "OffsetPDFContent",
            b"",
            relative_offset=pdf_header_offset,
            parent=parent
        )
        yield pdf_content
        with FileStream(file_stream, start=pdf_header_offset) as f:
            yield from pdf_parser(f, pdf_content)
        return
    pdf_header_offset = file_stream.start
    parser = PDFParser(RawPDFStream(file_stream))
    doc = InstrumentedPDFDocument(parser)
    # Keys of objects already reported: (objid, genno) for object streams,
    # plain objid for everything else.
    yielded = set()
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            try:
                obj = doc.getobj(objid)
            except PDFObjectNotFound:
                continue
            if isinstance(obj, PDFObjectStream):
                if (obj.objid, obj.genno) in yielded:
                    continue
                yielded.add((obj.objid, obj.genno))
            else:
                # Objects without recorded offsets cannot be located in the file.
                if objid in yielded or not hasattr(obj, "pdf_offset") or not hasattr(obj, "pdf_bytes"):
                    continue
                yielded.add(objid)
            yield from pdf_obj_parser(file_stream, obj, objid, parent, pdf_header_offset=pdf_header_offset)

        trailer = xref.get_trailer()
        # A trailer with no entries has nothing to anchor a match on; min()
        # would raise ValueError on it, so it is skipped like a missing trailer.
        trailer_start = None
        if trailer is not None:
            trailer_start = min((k.pdf_offset for k in trailer.keys()), default=None)
        if trailer_start is not None:
            trailer_end = max(v.pdf_offset + v.pdf_bytes for v in trailer.values())
            t = Submatch(
                "Trailer",
                b"",
                relative_offset=trailer_start,
                length=trailer_end - trailer_start,
                parent=parent
            )
            yield t
            for k, v in trailer.items():
                kvp = Submatch(
                    "KeyValuePair",
                    b"",
                    relative_offset=k.pdf_offset - trailer_start,
                    length=v.pdf_offset + v.pdf_bytes - k.pdf_offset,
                    parent=t
                )
                yield kvp
                yield Submatch(
                    "Key",
                    k,
                    # The pair starts at its key.
                    relative_offset=0,
                    length=k.pdf_bytes,
                    parent=kvp
                )
                value_match = Submatch(
                    "Value",
                    b"",
                    relative_offset=v.pdf_offset - k.pdf_offset,
                    length=v.pdf_bytes,
                    parent=kvp
                )
                yield value_match
                yield from parse_object(v, matcher=parent.matcher, parent=value_match,
                                        pdf_header_offset=pdf_header_offset)

        if not isinstance(xref, PDFXRef):
            continue

        cells = [c for row in xref.offsets.values() for c in row if c is not None]
        if not cells:
            # Empty table: there is no span to report, and min()/max() would
            # raise ValueError.
            continue
        xref_start = min(c.pdf_offset for c in cells)
        xref_end = max(c.pdf_offset + c.pdf_bytes for c in cells)
        x = Submatch(
            "XRefTable",
            b"",
            relative_offset=xref_start,
            length=xref_end - xref_start,
            parent=parent
        )
        yield x
        for row in xref.offsets.values():
            row_start = min(c.pdf_offset for c in row if c is not None)
            row_end = max(c.pdf_offset + c.pdf_bytes for c in row if c is not None)
            row_match = Submatch(
                "XRefRow",
                b"",
                relative_offset=row_start - xref_start,
                length=row_end - row_start,
                parent=x
            )
            yield row_match
            obj_id, pos, gen_no = row
            if obj_id is not None:
                ret = Submatch(
                    "ObjectID",
                    b"",
                    relative_offset=obj_id.pdf_offset - row_start,
                    length=obj_id.pdf_bytes,
                    parent=row_match
                )
                yield ret
                yield from parse_object(obj_id, matcher=parent.matcher, parent=ret, pdf_header_offset=pdf_header_offset)
            ret = Submatch(
                "Position",
                b"",
                relative_offset=pos.pdf_offset - row_start,
                length=pos.pdf_bytes,
                parent=row_match
            )
            yield ret
            # Parse the position value itself (not the Submatch that wraps it),
            # mirroring the ObjectID branch above.
            yield from parse_object(pos, matcher=parent.matcher, parent=ret, pdf_header_offset=pdf_header_offset)
            ret = Submatch(
                "Generation",
                b"",
                relative_offset=gen_no.pdf_offset - row_start,
                length=gen_no.pdf_bytes,
                parent=row_match
            )
            yield ret
            # Likewise, parse the generation number rather than its Submatch.
            yield from parse_object(gen_no, matcher=parent.matcher, parent=ret, pdf_header_offset=pdf_header_offset)
|