polyfile-weave 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of polyfile-weave might be problematic. Click here for more details.

Files changed (585) hide show
  1. polyfile/__init__.py +15 -0
  2. polyfile/__main__.py +394 -0
  3. polyfile/arithmetic.py +27 -0
  4. polyfile/ast.py +114 -0
  5. polyfile/debugger.py +1039 -0
  6. polyfile/expressions.py +346 -0
  7. polyfile/fileutils.py +343 -0
  8. polyfile/html.py +135 -0
  9. polyfile/http/__init__.py +1 -0
  10. polyfile/http/defacto.py +37 -0
  11. polyfile/http/deprecated.py +51 -0
  12. polyfile/http/experimental.py +67 -0
  13. polyfile/http/http_11.py +548 -0
  14. polyfile/http/matcher.py +37 -0
  15. polyfile/http/structured_headers.py +48 -0
  16. polyfile/iterators.py +72 -0
  17. polyfile/jpeg.py +24 -0
  18. polyfile/kaitai/__init__.py +0 -0
  19. polyfile/kaitai/compiler.py +156 -0
  20. polyfile/kaitai/parser.py +312 -0
  21. polyfile/kaitai/parsers/__init__.py +0 -0
  22. polyfile/kaitai/parsers/aix_utmp.py +116 -0
  23. polyfile/kaitai/parsers/allegro_dat.py +367 -0
  24. polyfile/kaitai/parsers/andes_firmware.py +64 -0
  25. polyfile/kaitai/parsers/android_bootldr_asus.py +105 -0
  26. polyfile/kaitai/parsers/android_bootldr_huawei.py +181 -0
  27. polyfile/kaitai/parsers/android_bootldr_qcom.py +217 -0
  28. polyfile/kaitai/parsers/android_dto.py +138 -0
  29. polyfile/kaitai/parsers/android_img.py +319 -0
  30. polyfile/kaitai/parsers/android_nanoapp_header.py +83 -0
  31. polyfile/kaitai/parsers/android_opengl_shaders_cache.py +151 -0
  32. polyfile/kaitai/parsers/android_sparse.py +237 -0
  33. polyfile/kaitai/parsers/android_super.py +401 -0
  34. polyfile/kaitai/parsers/apm_partition_table.py +196 -0
  35. polyfile/kaitai/parsers/apple_single_double.py +180 -0
  36. polyfile/kaitai/parsers/asn1_der.py +235 -0
  37. polyfile/kaitai/parsers/au.py +138 -0
  38. polyfile/kaitai/parsers/avantes_roh60.py +112 -0
  39. polyfile/kaitai/parsers/avi.py +296 -0
  40. polyfile/kaitai/parsers/bcd.py +111 -0
  41. polyfile/kaitai/parsers/bitcoin_transaction.py +210 -0
  42. polyfile/kaitai/parsers/blender_blend.py +334 -0
  43. polyfile/kaitai/parsers/bmp.py +780 -0
  44. polyfile/kaitai/parsers/bson.py +411 -0
  45. polyfile/kaitai/parsers/btrfs_stream.py +318 -0
  46. polyfile/kaitai/parsers/bytes_with_io.py +27 -0
  47. polyfile/kaitai/parsers/chrome_pak.py +194 -0
  48. polyfile/kaitai/parsers/code_6502.py +456 -0
  49. polyfile/kaitai/parsers/compressed_resource.py +217 -0
  50. polyfile/kaitai/parsers/cpio_old_le.py +154 -0
  51. polyfile/kaitai/parsers/cramfs.py +344 -0
  52. polyfile/kaitai/parsers/creative_voice_file.py +342 -0
  53. polyfile/kaitai/parsers/dbf.py +274 -0
  54. polyfile/kaitai/parsers/dcmp_0.py +664 -0
  55. polyfile/kaitai/parsers/dcmp_1.py +422 -0
  56. polyfile/kaitai/parsers/dcmp_2.py +312 -0
  57. polyfile/kaitai/parsers/dcmp_variable_length_integer.py +66 -0
  58. polyfile/kaitai/parsers/dex.py +1086 -0
  59. polyfile/kaitai/parsers/dicom.py +4370 -0
  60. polyfile/kaitai/parsers/dime_message.py +201 -0
  61. polyfile/kaitai/parsers/dns_packet.py +569 -0
  62. polyfile/kaitai/parsers/doom_wad.py +654 -0
  63. polyfile/kaitai/parsers/dos_datetime.py +191 -0
  64. polyfile/kaitai/parsers/dos_mz.py +172 -0
  65. polyfile/kaitai/parsers/ds_store.py +513 -0
  66. polyfile/kaitai/parsers/dtb.py +310 -0
  67. polyfile/kaitai/parsers/dune_2_pak.py +126 -0
  68. polyfile/kaitai/parsers/edid.py +472 -0
  69. polyfile/kaitai/parsers/efivar_signature_list.py +331 -0
  70. polyfile/kaitai/parsers/elf.py +2482 -0
  71. polyfile/kaitai/parsers/ethernet_frame.py +114 -0
  72. polyfile/kaitai/parsers/exif.py +723 -0
  73. polyfile/kaitai/parsers/ext2.py +537 -0
  74. polyfile/kaitai/parsers/fallout2_dat.py +187 -0
  75. polyfile/kaitai/parsers/fallout_dat.py +156 -0
  76. polyfile/kaitai/parsers/fasttracker_xm_module.py +558 -0
  77. polyfile/kaitai/parsers/ftl_dat.py +90 -0
  78. polyfile/kaitai/parsers/genmidi_op2.py +161 -0
  79. polyfile/kaitai/parsers/gettext_mo.py +541 -0
  80. polyfile/kaitai/parsers/gif.py +492 -0
  81. polyfile/kaitai/parsers/gimp_brush.py +244 -0
  82. polyfile/kaitai/parsers/glibc_utmp.py +114 -0
  83. polyfile/kaitai/parsers/gltf_binary.py +132 -0
  84. polyfile/kaitai/parsers/google_protobuf.py +151 -0
  85. polyfile/kaitai/parsers/gpt_partition_table.py +175 -0
  86. polyfile/kaitai/parsers/gran_turismo_vol.py +140 -0
  87. polyfile/kaitai/parsers/grub2_font.py +337 -0
  88. polyfile/kaitai/parsers/gzip.py +232 -0
  89. polyfile/kaitai/parsers/hashcat_restore.py +60 -0
  90. polyfile/kaitai/parsers/hccap.py +111 -0
  91. polyfile/kaitai/parsers/hccapx.py +103 -0
  92. polyfile/kaitai/parsers/heaps_pak.py +177 -0
  93. polyfile/kaitai/parsers/heroes_of_might_and_magic_agg.py +116 -0
  94. polyfile/kaitai/parsers/heroes_of_might_and_magic_bmp.py +34 -0
  95. polyfile/kaitai/parsers/icmp_packet.py +136 -0
  96. polyfile/kaitai/parsers/ico.py +129 -0
  97. polyfile/kaitai/parsers/id3v1_1.py +220 -0
  98. polyfile/kaitai/parsers/id3v2_3.py +324 -0
  99. polyfile/kaitai/parsers/id3v2_4.py +423 -0
  100. polyfile/kaitai/parsers/ines.py +282 -0
  101. polyfile/kaitai/parsers/ipv4_packet.py +158 -0
  102. polyfile/kaitai/parsers/ipv6_packet.py +55 -0
  103. polyfile/kaitai/parsers/iso9660.py +544 -0
  104. polyfile/kaitai/parsers/java_class.py +1113 -0
  105. polyfile/kaitai/parsers/jpeg.py +361 -0
  106. polyfile/kaitai/parsers/luks.py +149 -0
  107. polyfile/kaitai/parsers/lzh.py +165 -0
  108. polyfile/kaitai/parsers/mac_os_resource_snd.py +493 -0
  109. polyfile/kaitai/parsers/mach_o.py +3033 -0
  110. polyfile/kaitai/parsers/mach_o_fat.py +92 -0
  111. polyfile/kaitai/parsers/magicavoxel_vox.py +391 -0
  112. polyfile/kaitai/parsers/manifest.json +1 -0
  113. polyfile/kaitai/parsers/mbr_partition_table.py +119 -0
  114. polyfile/kaitai/parsers/mcap.py +1015 -0
  115. polyfile/kaitai/parsers/microsoft_cfb.py +293 -0
  116. polyfile/kaitai/parsers/microsoft_network_monitor_v2.py +309 -0
  117. polyfile/kaitai/parsers/microsoft_pe.py +765 -0
  118. polyfile/kaitai/parsers/mifare_classic.py +706 -0
  119. polyfile/kaitai/parsers/minecraft_nbt.py +449 -0
  120. polyfile/kaitai/parsers/monomakh_sapr_chg.py +69 -0
  121. polyfile/kaitai/parsers/mozilla_mar.py +239 -0
  122. polyfile/kaitai/parsers/mp4.py +333 -0
  123. polyfile/kaitai/parsers/msgpack.py +467 -0
  124. polyfile/kaitai/parsers/nitf.py +1189 -0
  125. polyfile/kaitai/parsers/nt_mdt_pal.py +155 -0
  126. polyfile/kaitai/parsers/ogg.py +118 -0
  127. polyfile/kaitai/parsers/openpgp_message.py +993 -0
  128. polyfile/kaitai/parsers/packet_ppi.py +515 -0
  129. polyfile/kaitai/parsers/pcap.py +344 -0
  130. polyfile/kaitai/parsers/pcf_font.py +506 -0
  131. polyfile/kaitai/parsers/pcx.py +195 -0
  132. polyfile/kaitai/parsers/pcx_dcx.py +79 -0
  133. polyfile/kaitai/parsers/phar_without_stub.py +399 -0
  134. polyfile/kaitai/parsers/php_serialized_value.py +505 -0
  135. polyfile/kaitai/parsers/png.py +721 -0
  136. polyfile/kaitai/parsers/protocol_body.py +260 -0
  137. polyfile/kaitai/parsers/psx_tim.py +104 -0
  138. polyfile/kaitai/parsers/python_pickle.py +718 -0
  139. polyfile/kaitai/parsers/python_pyc_27.py +510 -0
  140. polyfile/kaitai/parsers/quake_mdl.py +441 -0
  141. polyfile/kaitai/parsers/quake_pak.py +112 -0
  142. polyfile/kaitai/parsers/quicktime_mov.py +634 -0
  143. polyfile/kaitai/parsers/rar.py +265 -0
  144. polyfile/kaitai/parsers/regf.py +569 -0
  145. polyfile/kaitai/parsers/renderware_binary_stream.py +877 -0
  146. polyfile/kaitai/parsers/resource_fork.py +611 -0
  147. polyfile/kaitai/parsers/respack.py +57 -0
  148. polyfile/kaitai/parsers/riff.py +409 -0
  149. polyfile/kaitai/parsers/rpm.py +964 -0
  150. polyfile/kaitai/parsers/rtcp_payload.py +579 -0
  151. polyfile/kaitai/parsers/rtp_packet.py +150 -0
  152. polyfile/kaitai/parsers/rtpdump.py +115 -0
  153. polyfile/kaitai/parsers/ruby_marshal.py +423 -0
  154. polyfile/kaitai/parsers/s3m.py +493 -0
  155. polyfile/kaitai/parsers/saints_row_2_vpp_pc.py +254 -0
  156. polyfile/kaitai/parsers/shapefile_index.py +174 -0
  157. polyfile/kaitai/parsers/shapefile_main.py +893 -0
  158. polyfile/kaitai/parsers/some_ip.py +209 -0
  159. polyfile/kaitai/parsers/some_ip_container.py +37 -0
  160. polyfile/kaitai/parsers/some_ip_sd.py +86 -0
  161. polyfile/kaitai/parsers/some_ip_sd_entries.py +160 -0
  162. polyfile/kaitai/parsers/some_ip_sd_options.py +374 -0
  163. polyfile/kaitai/parsers/specpr.py +404 -0
  164. polyfile/kaitai/parsers/sqlite3.py +472 -0
  165. polyfile/kaitai/parsers/ssh_public_key.py +252 -0
  166. polyfile/kaitai/parsers/standard_midi_file.py +390 -0
  167. polyfile/kaitai/parsers/stl.py +111 -0
  168. polyfile/kaitai/parsers/sudoers_ts.py +201 -0
  169. polyfile/kaitai/parsers/swf.py +406 -0
  170. polyfile/kaitai/parsers/systemd_journal.py +361 -0
  171. polyfile/kaitai/parsers/tcp_segment.py +57 -0
  172. polyfile/kaitai/parsers/tga.py +213 -0
  173. polyfile/kaitai/parsers/tls_client_hello.py +293 -0
  174. polyfile/kaitai/parsers/tr_dos_image.py +322 -0
  175. polyfile/kaitai/parsers/tsm.py +198 -0
  176. polyfile/kaitai/parsers/ttf.py +1847 -0
  177. polyfile/kaitai/parsers/udp_datagram.py +42 -0
  178. polyfile/kaitai/parsers/uefi_te.py +236 -0
  179. polyfile/kaitai/parsers/uimage.py +198 -0
  180. polyfile/kaitai/parsers/utf8_string.py +137 -0
  181. polyfile/kaitai/parsers/vfat.py +410 -0
  182. polyfile/kaitai/parsers/vlq_base128_be.py +104 -0
  183. polyfile/kaitai/parsers/vlq_base128_le.py +129 -0
  184. polyfile/kaitai/parsers/vmware_vmdk.py +167 -0
  185. polyfile/kaitai/parsers/vp8_ivf.py +112 -0
  186. polyfile/kaitai/parsers/warcraft_2_pud.py +423 -0
  187. polyfile/kaitai/parsers/wav.py +1014 -0
  188. polyfile/kaitai/parsers/websocket.py +167 -0
  189. polyfile/kaitai/parsers/windows_evt_log.py +304 -0
  190. polyfile/kaitai/parsers/windows_lnk_file.py +467 -0
  191. polyfile/kaitai/parsers/windows_minidump.py +575 -0
  192. polyfile/kaitai/parsers/windows_resource_file.py +243 -0
  193. polyfile/kaitai/parsers/windows_shell_items.py +190 -0
  194. polyfile/kaitai/parsers/windows_systemtime.py +52 -0
  195. polyfile/kaitai/parsers/wmf.py +502 -0
  196. polyfile/kaitai/parsers/xar.py +181 -0
  197. polyfile/kaitai/parsers/xwd.py +189 -0
  198. polyfile/kaitai/parsers/zip.py +685 -0
  199. polyfile/kaitai/parsers/zisofs.py +158 -0
  200. polyfile/kaitai/parsers/zx_spectrum_tap.py +184 -0
  201. polyfile/kaitaimatcher.py +113 -0
  202. polyfile/languagematcher.py +217 -0
  203. polyfile/logger.py +135 -0
  204. polyfile/magic.py +2983 -0
  205. polyfile/magic_defs/COPYING +29 -0
  206. polyfile/magic_defs/__init__.py +0 -0
  207. polyfile/magic_defs/acorn +102 -0
  208. polyfile/magic_defs/adi +13 -0
  209. polyfile/magic_defs/adventure +122 -0
  210. polyfile/magic_defs/aes +29 -0
  211. polyfile/magic_defs/algol68 +35 -0
  212. polyfile/magic_defs/allegro +9 -0
  213. polyfile/magic_defs/alliant +18 -0
  214. polyfile/magic_defs/alpha +32 -0
  215. polyfile/magic_defs/amanda +12 -0
  216. polyfile/magic_defs/amigaos +218 -0
  217. polyfile/magic_defs/android +259 -0
  218. polyfile/magic_defs/animation +1197 -0
  219. polyfile/magic_defs/aout +46 -0
  220. polyfile/magic_defs/apache +28 -0
  221. polyfile/magic_defs/apl +7 -0
  222. polyfile/magic_defs/apple +773 -0
  223. polyfile/magic_defs/application +7 -0
  224. polyfile/magic_defs/applix +13 -0
  225. polyfile/magic_defs/apt +52 -0
  226. polyfile/magic_defs/archive +2586 -0
  227. polyfile/magic_defs/aria +38 -0
  228. polyfile/magic_defs/arm +50 -0
  229. polyfile/magic_defs/asf +132 -0
  230. polyfile/magic_defs/assembler +18 -0
  231. polyfile/magic_defs/asterix +18 -0
  232. polyfile/magic_defs/att3b +41 -0
  233. polyfile/magic_defs/audio +1291 -0
  234. polyfile/magic_defs/avm +33 -0
  235. polyfile/magic_defs/basis +18 -0
  236. polyfile/magic_defs/beetle +7 -0
  237. polyfile/magic_defs/ber +65 -0
  238. polyfile/magic_defs/bflt +14 -0
  239. polyfile/magic_defs/bhl +10 -0
  240. polyfile/magic_defs/bioinformatics +178 -0
  241. polyfile/magic_defs/biosig +154 -0
  242. polyfile/magic_defs/blackberry +8 -0
  243. polyfile/magic_defs/blcr +25 -0
  244. polyfile/magic_defs/blender +50 -0
  245. polyfile/magic_defs/blit +24 -0
  246. polyfile/magic_defs/bm +10 -0
  247. polyfile/magic_defs/bout +11 -0
  248. polyfile/magic_defs/bsdi +33 -0
  249. polyfile/magic_defs/bsi +10 -0
  250. polyfile/magic_defs/btsnoop +13 -0
  251. polyfile/magic_defs/burp +7 -0
  252. polyfile/magic_defs/bytecode +41 -0
  253. polyfile/magic_defs/c-lang +110 -0
  254. polyfile/magic_defs/c64 +531 -0
  255. polyfile/magic_defs/cad +437 -0
  256. polyfile/magic_defs/cafebabe +107 -0
  257. polyfile/magic_defs/cbor +21 -0
  258. polyfile/magic_defs/ccf +14 -0
  259. polyfile/magic_defs/cddb +12 -0
  260. polyfile/magic_defs/chord +15 -0
  261. polyfile/magic_defs/cisco +12 -0
  262. polyfile/magic_defs/citrus +12 -0
  263. polyfile/magic_defs/clarion +27 -0
  264. polyfile/magic_defs/claris +48 -0
  265. polyfile/magic_defs/clipper +65 -0
  266. polyfile/magic_defs/clojure +30 -0
  267. polyfile/magic_defs/coff +98 -0
  268. polyfile/magic_defs/commands +201 -0
  269. polyfile/magic_defs/communications +22 -0
  270. polyfile/magic_defs/compress +461 -0
  271. polyfile/magic_defs/console +1213 -0
  272. polyfile/magic_defs/convex +69 -0
  273. polyfile/magic_defs/coverage +91 -0
  274. polyfile/magic_defs/cracklib +14 -0
  275. polyfile/magic_defs/crypto +31 -0
  276. polyfile/magic_defs/csv +8 -0
  277. polyfile/magic_defs/ctags +6 -0
  278. polyfile/magic_defs/ctf +23 -0
  279. polyfile/magic_defs/cubemap +8 -0
  280. polyfile/magic_defs/cups +56 -0
  281. polyfile/magic_defs/dact +11 -0
  282. polyfile/magic_defs/database +886 -0
  283. polyfile/magic_defs/dataone +47 -0
  284. polyfile/magic_defs/dbpf +15 -0
  285. polyfile/magic_defs/der +146 -0
  286. polyfile/magic_defs/diamond +12 -0
  287. polyfile/magic_defs/dif +33 -0
  288. polyfile/magic_defs/diff +41 -0
  289. polyfile/magic_defs/digital +59 -0
  290. polyfile/magic_defs/dolby +69 -0
  291. polyfile/magic_defs/dsf +25 -0
  292. polyfile/magic_defs/dump +96 -0
  293. polyfile/magic_defs/dwarfs +45 -0
  294. polyfile/magic_defs/dyadic +61 -0
  295. polyfile/magic_defs/ebml +8 -0
  296. polyfile/magic_defs/edid +11 -0
  297. polyfile/magic_defs/editors +43 -0
  298. polyfile/magic_defs/efi +15 -0
  299. polyfile/magic_defs/elf +379 -0
  300. polyfile/magic_defs/encore +22 -0
  301. polyfile/magic_defs/epoc +62 -0
  302. polyfile/magic_defs/erlang +21 -0
  303. polyfile/magic_defs/espressif +57 -0
  304. polyfile/magic_defs/esri +28 -0
  305. polyfile/magic_defs/etf +33 -0
  306. polyfile/magic_defs/fcs +9 -0
  307. polyfile/magic_defs/filesystems +2694 -0
  308. polyfile/magic_defs/finger +16 -0
  309. polyfile/magic_defs/firmware +133 -0
  310. polyfile/magic_defs/flash +62 -0
  311. polyfile/magic_defs/flif +36 -0
  312. polyfile/magic_defs/fonts +449 -0
  313. polyfile/magic_defs/forth +82 -0
  314. polyfile/magic_defs/fortran +9 -0
  315. polyfile/magic_defs/frame +62 -0
  316. polyfile/magic_defs/freebsd +164 -0
  317. polyfile/magic_defs/fsav +128 -0
  318. polyfile/magic_defs/fusecompress +12 -0
  319. polyfile/magic_defs/games +696 -0
  320. polyfile/magic_defs/gcc +17 -0
  321. polyfile/magic_defs/gconv +10 -0
  322. polyfile/magic_defs/gentoo +85 -0
  323. polyfile/magic_defs/geo +166 -0
  324. polyfile/magic_defs/geos +20 -0
  325. polyfile/magic_defs/gimp +77 -0
  326. polyfile/magic_defs/git +13 -0
  327. polyfile/magic_defs/glibc +21 -0
  328. polyfile/magic_defs/gnome +59 -0
  329. polyfile/magic_defs/gnu +173 -0
  330. polyfile/magic_defs/gnumeric +8 -0
  331. polyfile/magic_defs/gpt +240 -0
  332. polyfile/magic_defs/gpu +28 -0
  333. polyfile/magic_defs/grace +21 -0
  334. polyfile/magic_defs/graphviz +12 -0
  335. polyfile/magic_defs/gringotts +48 -0
  336. polyfile/magic_defs/guile +13 -0
  337. polyfile/magic_defs/hardware +12 -0
  338. polyfile/magic_defs/hitachi-sh +30 -0
  339. polyfile/magic_defs/hp +433 -0
  340. polyfile/magic_defs/human68k +26 -0
  341. polyfile/magic_defs/ibm370 +52 -0
  342. polyfile/magic_defs/ibm6000 +35 -0
  343. polyfile/magic_defs/icc +214 -0
  344. polyfile/magic_defs/iff +80 -0
  345. polyfile/magic_defs/images +4210 -0
  346. polyfile/magic_defs/inform +9 -0
  347. polyfile/magic_defs/intel +310 -0
  348. polyfile/magic_defs/interleaf +9 -0
  349. polyfile/magic_defs/island +10 -0
  350. polyfile/magic_defs/ispell +63 -0
  351. polyfile/magic_defs/isz +15 -0
  352. polyfile/magic_defs/java +52 -0
  353. polyfile/magic_defs/javascript +171 -0
  354. polyfile/magic_defs/jpeg +252 -0
  355. polyfile/magic_defs/json +8 -0
  356. polyfile/magic_defs/karma +9 -0
  357. polyfile/magic_defs/kde +11 -0
  358. polyfile/magic_defs/keepass +20 -0
  359. polyfile/magic_defs/kerberos +45 -0
  360. polyfile/magic_defs/kicad +85 -0
  361. polyfile/magic_defs/kml +34 -0
  362. polyfile/magic_defs/lammps +64 -0
  363. polyfile/magic_defs/lecter +6 -0
  364. polyfile/magic_defs/lex +12 -0
  365. polyfile/magic_defs/lif +50 -0
  366. polyfile/magic_defs/linux +557 -0
  367. polyfile/magic_defs/lisp +78 -0
  368. polyfile/magic_defs/llvm +22 -0
  369. polyfile/magic_defs/locoscript +12 -0
  370. polyfile/magic_defs/lua +31 -0
  371. polyfile/magic_defs/luks +126 -0
  372. polyfile/magic_defs/m4 +11 -0
  373. polyfile/magic_defs/mach +303 -0
  374. polyfile/magic_defs/macintosh +505 -0
  375. polyfile/magic_defs/macos +7 -0
  376. polyfile/magic_defs/magic +10 -0
  377. polyfile/magic_defs/magic.mgc +0 -0
  378. polyfile/magic_defs/mail.news +132 -0
  379. polyfile/magic_defs/make +21 -0
  380. polyfile/magic_defs/map +413 -0
  381. polyfile/magic_defs/maple +109 -0
  382. polyfile/magic_defs/marc21 +30 -0
  383. polyfile/magic_defs/mathcad +8 -0
  384. polyfile/magic_defs/mathematica +188 -0
  385. polyfile/magic_defs/matroska +17 -0
  386. polyfile/magic_defs/mcrypt +52 -0
  387. polyfile/magic_defs/measure +44 -0
  388. polyfile/magic_defs/mercurial +13 -0
  389. polyfile/magic_defs/metastore +8 -0
  390. polyfile/magic_defs/meteorological +53 -0
  391. polyfile/magic_defs/microfocus +21 -0
  392. polyfile/magic_defs/mime +9 -0
  393. polyfile/magic_defs/mips +120 -0
  394. polyfile/magic_defs/mirage +8 -0
  395. polyfile/magic_defs/misctools +140 -0
  396. polyfile/magic_defs/mkid +11 -0
  397. polyfile/magic_defs/mlssa +8 -0
  398. polyfile/magic_defs/mmdf +6 -0
  399. polyfile/magic_defs/modem +92 -0
  400. polyfile/magic_defs/modulefile +9 -0
  401. polyfile/magic_defs/motorola +71 -0
  402. polyfile/magic_defs/mozilla +37 -0
  403. polyfile/magic_defs/msdos +2304 -0
  404. polyfile/magic_defs/msooxml +68 -0
  405. polyfile/magic_defs/msvc +222 -0
  406. polyfile/magic_defs/msx +309 -0
  407. polyfile/magic_defs/mup +24 -0
  408. polyfile/magic_defs/music +17 -0
  409. polyfile/magic_defs/nasa +7 -0
  410. polyfile/magic_defs/natinst +24 -0
  411. polyfile/magic_defs/ncr +49 -0
  412. polyfile/magic_defs/neko +12 -0
  413. polyfile/magic_defs/netbsd +251 -0
  414. polyfile/magic_defs/netscape +26 -0
  415. polyfile/magic_defs/netware +11 -0
  416. polyfile/magic_defs/news +13 -0
  417. polyfile/magic_defs/nifty +202 -0
  418. polyfile/magic_defs/nim-lang +29 -0
  419. polyfile/magic_defs/nitpicker +14 -0
  420. polyfile/magic_defs/numpy +9 -0
  421. polyfile/magic_defs/oasis +12 -0
  422. polyfile/magic_defs/ocaml +14 -0
  423. polyfile/magic_defs/octave +6 -0
  424. polyfile/magic_defs/ole2compounddocs +760 -0
  425. polyfile/magic_defs/olf +98 -0
  426. polyfile/magic_defs/openfst +17 -0
  427. polyfile/magic_defs/opentimestamps +16 -0
  428. polyfile/magic_defs/oric +16 -0
  429. polyfile/magic_defs/os2 +186 -0
  430. polyfile/magic_defs/os400 +39 -0
  431. polyfile/magic_defs/os9 +80 -0
  432. polyfile/magic_defs/osf1 +10 -0
  433. polyfile/magic_defs/palm +156 -0
  434. polyfile/magic_defs/parix +13 -0
  435. polyfile/magic_defs/parrot +22 -0
  436. polyfile/magic_defs/pascal +39 -0
  437. polyfile/magic_defs/pbf +11 -0
  438. polyfile/magic_defs/pbm +8 -0
  439. polyfile/magic_defs/pc88 +24 -0
  440. polyfile/magic_defs/pc98 +77 -0
  441. polyfile/magic_defs/pci_ids +116 -0
  442. polyfile/magic_defs/pcjr +8 -0
  443. polyfile/magic_defs/pdf +51 -0
  444. polyfile/magic_defs/pdp +42 -0
  445. polyfile/magic_defs/perl +100 -0
  446. polyfile/magic_defs/pgf +52 -0
  447. polyfile/magic_defs/pgp +581 -0
  448. polyfile/magic_defs/pgp-binary-keys +388 -0
  449. polyfile/magic_defs/pkgadd +7 -0
  450. polyfile/magic_defs/plan9 +25 -0
  451. polyfile/magic_defs/playdate +57 -0
  452. polyfile/magic_defs/plus5 +18 -0
  453. polyfile/magic_defs/pmem +46 -0
  454. polyfile/magic_defs/polyfile_zip +5 -0
  455. polyfile/magic_defs/polyml +23 -0
  456. polyfile/magic_defs/printer +269 -0
  457. polyfile/magic_defs/project +10 -0
  458. polyfile/magic_defs/psdbms +14 -0
  459. polyfile/magic_defs/psl +14 -0
  460. polyfile/magic_defs/pulsar +13 -0
  461. polyfile/magic_defs/puzzle +17 -0
  462. polyfile/magic_defs/pwsafe +14 -0
  463. polyfile/magic_defs/pyramid +12 -0
  464. polyfile/magic_defs/python +305 -0
  465. polyfile/magic_defs/qt +30 -0
  466. polyfile/magic_defs/revision +66 -0
  467. polyfile/magic_defs/riff +840 -0
  468. polyfile/magic_defs/rinex +44 -0
  469. polyfile/magic_defs/ringdove +45 -0
  470. polyfile/magic_defs/rpi +52 -0
  471. polyfile/magic_defs/rpm +45 -0
  472. polyfile/magic_defs/rpmsg +7 -0
  473. polyfile/magic_defs/rst +11 -0
  474. polyfile/magic_defs/rtf +94 -0
  475. polyfile/magic_defs/ruby +55 -0
  476. polyfile/magic_defs/rust +21 -0
  477. polyfile/magic_defs/sc +7 -0
  478. polyfile/magic_defs/sccs +24 -0
  479. polyfile/magic_defs/scientific +144 -0
  480. polyfile/magic_defs/securitycerts +6 -0
  481. polyfile/magic_defs/selinux +24 -0
  482. polyfile/magic_defs/sendmail +37 -0
  483. polyfile/magic_defs/sequent +42 -0
  484. polyfile/magic_defs/sereal +35 -0
  485. polyfile/magic_defs/sgi +144 -0
  486. polyfile/magic_defs/sgml +161 -0
  487. polyfile/magic_defs/sharc +23 -0
  488. polyfile/magic_defs/sinclair +40 -0
  489. polyfile/magic_defs/sisu +18 -0
  490. polyfile/magic_defs/sketch +6 -0
  491. polyfile/magic_defs/smalltalk +25 -0
  492. polyfile/magic_defs/smile +34 -0
  493. polyfile/magic_defs/sniffer +482 -0
  494. polyfile/magic_defs/softquad +40 -0
  495. polyfile/magic_defs/sosi +40 -0
  496. polyfile/magic_defs/spec +21 -0
  497. polyfile/magic_defs/spectrum +184 -0
  498. polyfile/magic_defs/sql +288 -0
  499. polyfile/magic_defs/ssh +39 -0
  500. polyfile/magic_defs/ssl +20 -0
  501. polyfile/magic_defs/statistics +45 -0
  502. polyfile/magic_defs/subtitle +38 -0
  503. polyfile/magic_defs/sun +141 -0
  504. polyfile/magic_defs/svf +5 -0
  505. polyfile/magic_defs/sylk +36 -0
  506. polyfile/magic_defs/symbos +42 -0
  507. polyfile/magic_defs/sysex +429 -0
  508. polyfile/magic_defs/tcl +29 -0
  509. polyfile/magic_defs/teapot +6 -0
  510. polyfile/magic_defs/terminfo +63 -0
  511. polyfile/magic_defs/tex +141 -0
  512. polyfile/magic_defs/tgif +7 -0
  513. polyfile/magic_defs/ti-8x +239 -0
  514. polyfile/magic_defs/timezone +42 -0
  515. polyfile/magic_defs/tplink +95 -0
  516. polyfile/magic_defs/troff +38 -0
  517. polyfile/magic_defs/tuxedo +8 -0
  518. polyfile/magic_defs/typeset +8 -0
  519. polyfile/magic_defs/uf2 +72 -0
  520. polyfile/magic_defs/unicode +15 -0
  521. polyfile/magic_defs/unisig +12 -0
  522. polyfile/magic_defs/unknown +34 -0
  523. polyfile/magic_defs/usd +21 -0
  524. polyfile/magic_defs/uterus +16 -0
  525. polyfile/magic_defs/uuencode +28 -0
  526. polyfile/magic_defs/vacuum-cleaner +54 -0
  527. polyfile/magic_defs/varied.out +46 -0
  528. polyfile/magic_defs/varied.script +21 -0
  529. polyfile/magic_defs/vax +32 -0
  530. polyfile/magic_defs/vicar +17 -0
  531. polyfile/magic_defs/virtual +307 -0
  532. polyfile/magic_defs/virtutech +12 -0
  533. polyfile/magic_defs/visx +32 -0
  534. polyfile/magic_defs/vms +30 -0
  535. polyfile/magic_defs/vmware +6 -0
  536. polyfile/magic_defs/vorbis +155 -0
  537. polyfile/magic_defs/vxl +14 -0
  538. polyfile/magic_defs/warc +16 -0
  539. polyfile/magic_defs/weak +16 -0
  540. polyfile/magic_defs/web +18 -0
  541. polyfile/magic_defs/webassembly +17 -0
  542. polyfile/magic_defs/windows +1811 -0
  543. polyfile/magic_defs/wireless +7 -0
  544. polyfile/magic_defs/wordprocessors +630 -0
  545. polyfile/magic_defs/wsdl +23 -0
  546. polyfile/magic_defs/x68000 +25 -0
  547. polyfile/magic_defs/xdelta +13 -0
  548. polyfile/magic_defs/xenix +106 -0
  549. polyfile/magic_defs/xilinx +58 -0
  550. polyfile/magic_defs/xo65 +37 -0
  551. polyfile/magic_defs/xwindows +43 -0
  552. polyfile/magic_defs/yara +17 -0
  553. polyfile/magic_defs/zfs +96 -0
  554. polyfile/magic_defs/zilog +12 -0
  555. polyfile/magic_defs/zip +126 -0
  556. polyfile/magic_defs/zyxel +17 -0
  557. polyfile/nes.py +144 -0
  558. polyfile/nitf.py +15 -0
  559. polyfile/pdf.py +1264 -0
  560. polyfile/pickles.py +45 -0
  561. polyfile/polyfile.py +409 -0
  562. polyfile/profiling.py +115 -0
  563. polyfile/repl.py +624 -0
  564. polyfile/search.py +310 -0
  565. polyfile/serialization.py +323 -0
  566. polyfile/structmatcher.py +46 -0
  567. polyfile/structs.py +281 -0
  568. polyfile/templates/download.js +162 -0
  569. polyfile/templates/hexdump.css +268 -0
  570. polyfile/templates/hexdump.js +756 -0
  571. polyfile/templates/jquery-3.4.1.min.js +2 -0
  572. polyfile/templates/template.html +119 -0
  573. polyfile/wildcards.py +62 -0
  574. polyfile/zipmatcher.py +183 -0
  575. polyfile_weave-0.5.5.dist-info/METADATA +173 -0
  576. polyfile_weave-0.5.5.dist-info/RECORD +585 -0
  577. polyfile_weave-0.5.5.dist-info/WHEEL +5 -0
  578. polyfile_weave-0.5.5.dist-info/entry_points.txt +2 -0
  579. polyfile_weave-0.5.5.dist-info/licenses/LICENSE +202 -0
  580. polyfile_weave-0.5.5.dist-info/top_level.txt +2 -0
  581. polymerge/__init__.py +1 -0
  582. polymerge/__main__.py +296 -0
  583. polymerge/cfg.py +127 -0
  584. polymerge/polymerge.py +227 -0
  585. polymerge/polytracker.py +190 -0
polyfile/magic.py ADDED
@@ -0,0 +1,2983 @@
1
+ """
2
+ A pure Python implementation of libmagic.
3
+
4
+ This is to avoid having libmagic be a dependency, as well as to add the ability for searching for matches at arbitrary
5
+ byte offsets.
6
+
7
+ This implementation is also optimized to only test for the file's MIME types; it skips all of the tests for printing
8
+ details about the file.
9
+
10
+ """
11
+ from abc import ABC, abstractmethod
12
+ from collections import defaultdict
13
+ import csv
14
+ from datetime import datetime
15
+ from enum import Enum, IntFlag
16
+ from importlib import resources
17
+ from io import StringIO
18
+ import json
19
+ import logging
20
+ from pathlib import Path
21
+ import re
22
+ import struct
23
+ import sys
24
+ from time import gmtime, localtime, strftime
25
+ from typing import (
26
+ Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator, List, Optional, Set, Tuple, Type, TypeVar, Union
27
+ )
28
+ from uuid import UUID
29
+
30
+ from chardet.universaldetector import UniversalDetector
31
+
32
+ from .arithmetic import CStyleInt, make_c_style_int
33
+ from .fileutils import Streamable
34
+ from .iterators import LazyIterableSet
35
+ from .logger import getStatusLogger, TRACE
36
+ from .repl import ANSIColor, ANSIWriter
37
+
38
+ from . import magic_defs
39
+
40
+
41
+ if sys.version_info < (3, 9):
42
+ from typing import Pattern
43
+ else:
44
+ from re import Pattern
45
+
46
+
47
# Module-wide status logger, obtained from the package's logger helper under the name "libmagic".
log = getStatusLogger("libmagic")
48
+
49
+
50
import atexit
from contextlib import ExitStack

# importlib.resources may have to extract a packaged resource to a temporary file (e.g. when the package is
# imported from a zip archive), and it removes that file again as soon as its context manager exits.
# The paths handed out below are used long after these helpers return, so every context is entered on one
# shared stack that is only unwound when the interpreter shuts down.
_RESOURCE_CONTEXTS = ExitStack()
atexit.register(_RESOURCE_CONTEXTS.close)


if sys.version_info < (3, 11):
    def get_resource_path(name: str) -> Path:
        """Return a filesystem path for the resource `name` inside the `magic_defs` package.

        The path stays valid until interpreter exit.
        """
        return _RESOURCE_CONTEXTS.enter_context(resources.path(magic_defs, name))

    def get_resource_contents(package):
        """Return the names of the entries contained in `package` (legacy importlib.resources API)."""
        return resources.contents(package)
else:
    def get_resource_path(name: str) -> Path:
        """Return a filesystem path for the resource `name` inside the `magic_defs` package.

        The path stays valid until interpreter exit.
        """
        resource = resources.files(magic_defs).joinpath(name)
        return _RESOURCE_CONTEXTS.enter_context(resources.as_file(resource))

    def get_resource_contents(package):
        """Lazily yield the names of the regular files directly inside `package`."""
        return (resource.name for resource in resources.files(package).iterdir() if resource.is_file())
64
+
65
+
66
# Paths of the definition files bundled in the `magic_defs` package. The license file, the compiled
# `magic.mgc` database, bytecode caches, and dot-files are not definition sources, so they are left out.
MAGIC_DEFS: List[Path] = list(map(
    get_resource_path,
    (
        entry
        for entry in get_resource_contents(magic_defs)
        if not entry.startswith(".") and entry not in ("COPYING", "magic.mgc", "__pycache__")
    ),
))
71
+
72
+
73
WHITESPACE: bytes = b" \r\t\n\v\f"
# Single-letter escapes and the byte value each one stands for.
ESCAPES = {
    "n": ord("\n"),
    "r": ord("\r"),
    "b": ord("\b"),
    "v": ord("\v"),
    "t": ord("\t"),
    "f": ord("\f")
}

# ASCII-only digit sets. `str.isnumeric()` must not be used for this: it also accepts characters such as
# "\xb2" (superscript two), which `int()` then rejects.
_OCTAL_DIGITS: bytes = b"01234567"
_HEX_DIGITS: bytes = b"0123456789abcdefABCDEF"


def unescape(to_unescape: Union[str, bytes]) -> bytes:
    """Processes unicode escape sequences. Also handles libmagic's support for single digit `\\x#` hex escapes.

    Supported escapes:

    * ``\\x`` followed by one or two hex digits,
    * a backslash followed by one to three octal digits (values above 0o377 are truncated to one byte),
    * the single-letter escapes listed in ``ESCAPES``,
    * a backslash followed by any other byte, which yields that byte unchanged (so ``\\\\`` is a backslash).

    :param to_unescape: the text to process; a `str` is encoded as UTF-8 first
    :return: the unescaped bytes
    :raises ValueError: if ``\\x`` is not followed by a hex digit, or if the input ends in a lone backslash
    """
    data = to_unescape.encode("utf-8") if isinstance(to_unescape, str) else bytes(to_unescape)
    result = bytearray()
    size = len(data)
    pos = 0
    while pos < size:
        byte = data[pos]
        pos += 1
        if byte != ord("\\"):
            result.append(byte)
            continue
        if pos >= size:
            raise ValueError(f"Unterminated escape in {data!r}")
        selector = data[pos]
        if selector == ord("x"):
            # hex escape: consume at most two hex digits after the "x"
            start = pos + 1
            end = start
            while end < size and end - start < 2 and data[end] in _HEX_DIGITS:
                end += 1
            if end == start:
                raise ValueError(f"Invalid \\x hex escape in {data!r}")
            result.append(int(data[start:end], 16))
            pos = end
        elif selector in _OCTAL_DIGITS:
            # octal escape like "\1", "\12", or "\123": consume at most three octal digits
            end = pos
            while end < size and end - pos < 3 and data[end] in _OCTAL_DIGITS:
                end += 1
            # three octal digits can encode up to 511; keep only the low byte instead of failing
            result.append(int(data[pos:end], 8) & 0xFF)
            pos = end
        else:
            # a known single-letter escape, or else the escaped byte taken literally (this includes "8" and "9")
            result.append(ESCAPES.get(chr(selector), selector))
            pos += 1
    return bytes(result)
144
+
145
+
146
class TestResult(ABC):
    """Base class for the outcome of running one magic test at one offset.

    Subclasses define `__bool__` (whether the outcome counts as a match) and `explain`.
    Results are hashed and compared by their `(test, offset)` pair.
    """

    def __init__(self, test: "MagicTest", offset: int, parent: Optional["TestResult"] = None):
        self.test: MagicTest = test
        self.offset: int = offset
        self.parent: Optional["TestResult"] = parent
        if parent is not None:
            # truthiness is decided by the subclass's __bool__
            if self:
                assert self.test.named_test is self.test or parent.test.level == self.test.level - 1
                if not isinstance(self.test, UseTest):
                    # a truthy result of anything but a UseTest marks its parent as having a matching child
                    parent.child_matched = True
        self._child_matched: bool = False

    @abstractmethod
    def explain(self, writer: ANSIWriter, file: Streamable):
        """Write an explanation of this result to `writer`; must be provided by subclasses."""
        raise NotImplementedError()

    @property
    def child_matched(self) -> bool:
        """Whether a child result has flagged this result as having a matching child."""
        return self._child_matched

    @child_matched.setter
    def child_matched(self, did_match: bool):
        if did_match and isinstance(self.test, NamedTest):
            # The result of a named test hangs below the result of the UseTest that invoked it, so the
            # flag is forwarded to that result and, when there is one, to the result above it.
            use_result = self.parent
            assert isinstance(use_result.test, UseTest)
            use_result.child_matched = True
            above_use = use_result.parent
            if above_use is not None:
                above_use.child_matched = True
        self._child_matched = did_match

    def __hash__(self):
        return hash((self.test, self.offset))

    def __eq__(self, other):
        if not isinstance(other, TestResult):
            return False
        return other.test == self.test and other.offset == self.offset

    @abstractmethod
    def __bool__(self):
        """Whether this result counts as a match; must be provided by subclasses."""
        raise NotImplementedError()

    def __repr__(self):
        return f"{self.__class__.__name__}(test={self.test!r}, offset={self.offset}, parent={self.parent!r})"

    def __str__(self):
        if self.test.message is None:
            return f"Match[{self.offset}]"
        # TODO: Fix pasting our value in (i.e. apply `message % (value,)` when the message contains "%")
        return str(self.test.message)
197
+
198
+
199
class MatchedTest(TestResult):
    """A successful test result: carries the matched value and its byte length."""

    def __init__(
            self, test: "MagicTest",
            value: Any,
            offset: int,
            length: int,
            parent: Optional["TestResult"] = None
    ):
        super().__init__(test=test, offset=offset, parent=parent)
        self.value: Any = value
        self.length: int = length

    def explain(self, writer: ANSIWriter, file: Streamable):
        """Explain the parent chain first, then this test and the bytes it matched."""
        if self.parent is not None:
            self.parent.explain(writer, file=file)
        indent = self.test.write(writer)
        if isinstance(self.test, (NamedTest, UseTest)):
            # Only the test itself is written for named/use tests; no match details.
            return
        plural = "" if self.length == 1 else "s"
        # Budget for surrounding context, based on an 80-column line.
        context_bytes = max(0, (80 - len(indent) - self.length) // 2)
        writer.write(f"{indent}Matched ", bold=True, color=ANSIColor.GREEN)
        writer.write(str(self.length), bold=True)
        writer.write(f" byte{plural} at offset ", bold=True, color=ANSIColor.GREEN)
        writer.write(f"{self.offset}\n", bold=True)
        writer.write_context(file, offset=self.offset, context_bytes=context_bytes,
                             num_bytes=self.length, indent=indent)

    def __hash__(self):
        key = (self.test, self.offset, self.length)
        return hash(key)

    def __eq__(self, other):
        if not isinstance(other, MatchedTest):
            return False
        return other.test == self.test and other.offset == self.offset and other.length == self.length

    def __bool__(self):
        return True

    def __repr__(self):
        cls_name = self.__class__.__name__
        return f"{cls_name}(test={self.test!r}, offset={self.offset}, length={self.length}, " \
               f"parent={self.parent!r})"

    def __str__(self):
        message = self.test.message
        if message is None:
            return f"Match[{self.offset}:{self.offset + self.length}]"
        # TODO: Fix pasting our value in (i.e. %-formatting self.value into the message)
        return str(message)
247
+
248
+
249
class FailedTest(TestResult):
    """An unsuccessful test result, carrying the reason the test did not match."""

    def __init__(self, test: "MagicTest", offset: int, message: str, parent: Optional["TestResult"] = None):
        super().__init__(test=test, offset=offset, parent=parent)
        # Human-readable reason for the failure; used by explain().
        self.message: str = message

    def __bool__(self):
        return False

    def explain(self, writer: ANSIWriter, file: Streamable):
        """Write a single dimmed line stating where and why the test failed."""
        explanation = f"{self.test} did not match at offset {self.offset} because {self.message}\n"
        writer.write(explanation, dim=True)
259
+
260
+
261
class Endianness(Enum):
    """Byte orders; each member's value is a short string code for that order."""

    NATIVE = "="
    LITTLE = "<"
    BIG = ">"
    PDP = "me"
266
+
267
+
268
def parse_numeric(text: Union[str, bytes]) -> int:
    """Parse an integer literal as written in a magic definition.

    Accepts ``str`` or UTF-8 ``bytes``. Surrounding whitespace, one leading
    ``-`` and/or ``+``, and a trailing ``L`` are tolerated. ``0x``/``0X``
    literals are hexadecimal (an optional trailing ``h`` is dropped), other
    multi-character literals starting with ``0`` are octal, everything else is
    decimal.

    Raises:
        ValueError: if the remaining text is not a valid literal for its base.
    """
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    text = text.strip()

    sign = 1
    if text.startswith("-"):
        sign = -1
        text = text[1:]
    if text.startswith("+"):
        text = text[1:]
    if text.endswith("L"):
        text = text[:-1]

    if text.startswith(("0x", "0X")):
        if text.lower().endswith("h"):
            # Some hex constants now end with "h" 🤷
            # (see https://github.com/file/file/blob/7a4e60a8f56ed45f76f28d2812a88d82efdc4bb8/magic/Magdir/sniffer#L369)
            text = text[:-1]
        base = 16
    elif len(text) > 1 and text.startswith("0"):
        base = 8
    else:
        base = 10
    return int(text, base) * sign
291
+
292
+
293
class Offset(ABC):
    """Abstract offset specifier that can be resolved to an integer position."""

    @abstractmethod
    def to_absolute(self, data: bytes, last_match: Optional[TestResult], allow_invalid: bool = False) -> int:
        """Resolve this offset for `data`, given the previous result `last_match`."""
        raise NotImplementedError()

    @staticmethod
    def parse(offset: str) -> "Offset":
        """Build the Offset subclass selected by the first character of `offset`.

        ``&`` -> RelativeOffset (the remainder is parsed recursively),
        ``(`` -> IndirectOffset, ``-`` -> NegativeOffset, anything else ->
        AbsoluteOffset.
        """
        marker = offset[:1]
        if marker == "&":
            return RelativeOffset(Offset.parse(offset[1:]))
        if marker == "(":
            return IndirectOffset.parse(offset)
        if marker == "-":
            return NegativeOffset(parse_numeric(offset[1:]))
        return AbsoluteOffset(parse_numeric(offset))
308
+
309
+
310
class InvalidOffsetError(IndexError):
    """Raised when an offset cannot be resolved to a usable position."""

    def __init__(self, message: Optional[str] = None, offset: Optional[Offset] = None):
        """Store `offset`; when no message is given, build a default one from it."""
        if message is None:
            message = "Invalid Offset" if offset is None else f"Invalid Offset: {offset!r}"
        super().__init__(message)
        self.offset: Optional[Offset] = offset
319
+
320
+
321
class AbsoluteOffset(Offset):
    """A fixed offset counted from the start of the data."""

    def __init__(self, offset: int):
        self.offset: int = offset

    def to_absolute(self, data: bytes, last_match: Optional[TestResult], allow_invalid: bool = False) -> int:
        """Return the stored offset.

        Raises:
            InvalidOffsetError: if the offset is at or past ``len(data)`` and
                `allow_invalid` is false.
        """
        if allow_invalid or self.offset < len(data):
            return self.offset
        raise InvalidOffsetError(offset=self)

    def __repr__(self):
        cls_name = self.__class__.__name__
        return f"{cls_name}(offset={self.offset})"

    def __str__(self):
        return str(self.offset)
335
+
336
+
337
class NamedAbsoluteOffset(AbsoluteOffset):
    """An absolute offset inside a named test, resolved relative to the result of the `use` test that invoked it."""

    def __init__(self, test: "NamedTest", offset: int):
        super().__init__(offset)
        self.test: NamedTest = test

    def to_absolute(self, data: bytes, last_match: Optional[TestResult], allow_invalid: bool = False) -> int:
        """Return the offset of the invoking UseTest result plus the stored offset.

        Raises:
            ValueError: if no result for the named test (or no parent of it)
                can be found by walking up from `last_match`.
            InvalidOffsetError: if the resolved position is at or past
                ``len(data)`` and `allow_invalid` is false.
        """
        # Walk up the result chain until we reach the result generated from our NamedTest.
        named_result = last_match
        while named_result is not None and named_result.test is not self.test:
            named_result = named_result.parent

        # Its parent should be the result associated with the UseTest.
        use_result = None if named_result is None else named_result.parent
        if use_result is None:
            raise ValueError(f"Could not resolve the match associated with {self!r}")

        assert isinstance(use_result.test, UseTest)

        absolute = use_result.offset + self.offset
        if not allow_invalid and absolute >= len(data):
            raise InvalidOffsetError(offset=self)
        return absolute

    def __repr__(self):
        cls_name = self.__class__.__name__
        return f"{cls_name}(test={self.test!r}, offset={self.offset})"
362
+
363
+
364
class NegativeOffset(Offset):
    """An offset counted backwards from the end of the data."""

    def __init__(self, magnitude: int):
        # Distance from the end of the data, stored without the sign.
        self.magnitude: int = magnitude

    def to_absolute(self, data: bytes, last_match: Optional[TestResult], allow_invalid: bool = False) -> int:
        """Return ``len(data) - magnitude``.

        Raises:
            InvalidOffsetError: if the magnitude exceeds ``len(data)`` and
                `allow_invalid` is false.
        """
        if not allow_invalid and self.magnitude > len(data):
            raise InvalidOffsetError(offset=self)
        return len(data) - self.magnitude

    def __repr__(self):
        return f"{self.__class__.__name__}(magnitude={self.magnitude})"

    def __str__(self):
        # Include the sign: without it a negative offset renders exactly like an
        # absolute one, and a relative negative offset renders as "&N" instead of "&-N".
        return f"-{self.magnitude}"
378
+
379
+
380
class RelativeOffset(Offset):
    """An offset relative to the end of the previous match (written with a ``&`` prefix)."""

    def __init__(self, relative_to: Offset):
        # The wrapped offset supplies the (possibly negative) distance from the
        # end of the previous match.
        self.relative_to: Offset = relative_to

    def to_absolute(self, data: bytes, last_match: Optional[TestResult], allow_invalid: bool = False) -> int:
        """Return ``last_match.offset + last_match.length + difference``.

        Raises:
            InvalidOffsetError: if `last_match` is not a MatchedTest, or if the
                resolved position is negative or greater than ``len(data)``
                and `allow_invalid` is false.
        """
        if isinstance(self.relative_to, NegativeOffset):
            # A negative wrapped offset is a backwards distance, not a position from the end of the data.
            difference = -self.relative_to.magnitude
        else:
            difference = self.relative_to.to_absolute(data, last_match)
        if not isinstance(last_match, MatchedTest):
            raise InvalidOffsetError(f"The last test was expected to be a match, but instead got {last_match!s}",
                                     offset=self)
        offset = last_match.offset + last_match.length + difference
        # The previous check was the chained comparison `len(data) < offset < 0`, which can
        # never be true, so out-of-range offsets were silently accepted (and a negative one
        # would wrap around when used as a slice index).
        if not allow_invalid and (offset < 0 or offset > len(data)):
            raise InvalidOffsetError(offset=self)
        return offset

    def __repr__(self):
        return f"{self.__class__.__name__}(relative_to={self.relative_to!r})"

    def __str__(self):
        return f"&{self.relative_to}"
402
+
403
+
404
class IndirectOffset(Offset):
    """An offset whose value is read from the data itself, e.g. ``(0x3c.l+4)``.

    The wrapped `offset` locates an integer inside the data; that integer is
    decoded according to `num_bytes`, `endianness` and `signed`, passed through
    `post_process`, and the result is the absolute offset.
    """

    # Sentinel `num_bytes` value: the integer is an ASCII octal string of variable length.
    OctalIndirectOffset = -1

    def __init__(self, offset: Offset, num_bytes: int, endianness: Endianness, signed: bool,
                 post_process: Callable[[int], int] = lambda n: n):
        """
        Raises:
            ValueError: if `endianness` is neither LITTLE nor BIG, or if
                `num_bytes` is not 1, 2, 4, 8 or ``OctalIndirectOffset``.
        """
        self.offset: Offset = offset
        self.num_bytes: int = num_bytes
        self.endianness: Endianness = endianness
        self.signed: bool = signed
        self.post_process: Callable[[int], int] = post_process
        # Compare against the enum class; the previous `endianness.BIG` looked the member up
        # through the argument instance, which fails for non-enum arguments and is not
        # supported by every Python version.
        if self.endianness != Endianness.LITTLE and self.endianness != Endianness.BIG:
            raise ValueError(f"Invalid endianness: {endianness!r}")
        elif num_bytes not in (1, 2, 4, 8, IndirectOffset.OctalIndirectOffset):
            raise ValueError(f"Invalid number of bytes: {num_bytes}")

    def to_absolute(self, data: bytes, last_match: Optional[TestResult], allow_invalid: bool = False) -> int:
        """Read the integer at the wrapped offset and return its post-processed value.

        When there are not enough bytes to read: returns ``len(data)`` if
        `allow_invalid` is true, otherwise raises InvalidOffsetError. For the
        octal type with no octal digits at the offset: uses 0 if
        `allow_invalid` is true, otherwise returns ``len(data)``.
        """
        if self.num_bytes == IndirectOffset.OctalIndirectOffset:
            # Special case: This is for the new octal type used here:
            # https://github.com/file/file/blob/7a4e60a8f56ed45f76f28d2812a88d82efdc4bb8/magic/Magdir/gentoo#L81
            offset = self.offset.to_absolute(data, last_match)
            octal_string_end = offset
            while octal_string_end < len(data) and ord('0') <= data[octal_string_end] <= ord('7'):
                octal_string_end += 1
            value: Optional[int] = None
            if octal_string_end > offset:
                try:
                    # Parse only the digits starting at `offset`; slicing from the start of
                    # the data would prepend unrelated bytes to the octal string.
                    value = int(data[offset:octal_string_end], 8)
                except ValueError:
                    pass
            if value is None:
                if allow_invalid:
                    value = 0
                else:
                    # Deliberately not an error: report the end of the data instead.
                    return len(data)
            return self.post_process(value)
        elif self.num_bytes == 1:
            fmt = "B"
        elif self.num_bytes == 2:
            fmt = "H"
        elif self.num_bytes == 8:
            fmt = "Q"
        else:
            fmt = "I"
        if self.signed:
            # struct uses the lowercase code for the signed variant of each width.
            fmt = fmt.lower()
        if self.endianness == Endianness.LITTLE:
            fmt = f"<{fmt}"
        else:
            fmt = f">{fmt}"
        offset = self.offset.to_absolute(data, last_match)
        to_unpack = data[offset:offset + self.num_bytes]
        if len(to_unpack) < self.num_bytes:
            if allow_invalid:
                return len(data)
            else:
                raise InvalidOffsetError(offset=self)
        return self.post_process(struct.unpack(fmt, to_unpack)[0])

    NUMBER_PATTERN: str = r"(0[xX][\dA-Fa-f]+|\d+)L?"
    INDIRECT_OFFSET_PATTERN: Pattern[str] = re.compile(
        r"^\("
        rf"(?P<offset>&?-?{NUMBER_PATTERN})"
        r"((?P<signedness>[.,])(?P<type>[bBcCeEfFgGhHiILlmsSqQo]))?"
        rf"(?P<post_process>[*&/]?[+-]?({NUMBER_PATTERN}|\(-?{NUMBER_PATTERN}\)))?"
        r"\)$"
    )

    @classmethod
    def parse(cls, offset: str) -> "IndirectOffset":
        """Parse a parenthesised indirect offset specification.

        Raises:
            ValueError: if the text does not match the indirect offset grammar
                or uses an unsupported type letter.
            NotImplementedError: for the middle-endian type ``m``.
        """
        m = cls.INDIRECT_OFFSET_PATTERN.match(offset)
        if not m:
            raise ValueError(f"Invalid indirect offset: {offset!r}")
        t = m.group("type")
        if t is None:
            t = "I"
        if t == "m":
            raise NotImplementedError("TODO: Add support for middle endianness")
        elif t.islower():
            # Lowercase type letters select little endian, uppercase big endian.
            endianness = Endianness.LITTLE
        else:
            endianness = Endianness.BIG
        t = t.lower()
        if t in ("b", "c"):
            num_bytes = 1
        elif t in ("e", "f", "g", "q"):
            num_bytes = 8
        elif t in ("h", "s"):
            num_bytes = 2
        elif t in ("i", "l"):
            # TODO: Confirm that "l" should really be here
            num_bytes = 4
        elif t in ("o",):
            num_bytes = IndirectOffset.OctalIndirectOffset
        else:
            raise ValueError(f"Unsupported indirect specifier type: {m.group('type')!r}")
        pp = m.group("post_process")
        if pp is None:
            post_process = lambda n: n
        else:
            multiply = pp.startswith("*")
            bitwise_and = pp.startswith("&")
            divide = pp.startswith("/")
            if multiply or bitwise_and or divide:
                pp = pp[1:]
            if pp.startswith("+"):
                pp = pp[1:]
            if pp.startswith("(") and pp.endswith(")"):
                # some definition files like `msdos` have indirect offsets of the form: >>>(&0x0f.l+(-4))
                # Handle those nested parenthesis around the `(-4)` here. This is an undocumented part of the DSL,
                # so, TODO: confirm we are handling it properly and it's not something more complex like a nested
                # indirect offset
                pp = pp[1:-1]
            # A leading "-" is kept and handled by parse_numeric, so subtraction is the
            # addition of a negative operand.
            operand = parse_numeric(pp)
            if multiply:
                post_process = lambda n: n * operand
            elif bitwise_and:
                post_process = lambda n: n & operand
            elif divide:
                post_process = lambda n: n // operand
            else:
                post_process = lambda n: n + operand
        return IndirectOffset(
            offset=Offset.parse(m.group("offset")),
            num_bytes=num_bytes,
            endianness=endianness,
            signed=m.group("signedness") == ",",
            post_process=post_process
        )

    def __repr__(self):
        return f"{self.__class__.__name__}(offset={self.offset!r}, num_bytes={self.num_bytes}, "\
               f"endianness={self.endianness!r}, signed={self.signed}, post_process={self.post_process!r})"

    def __str__(self):
        if self.num_bytes == IndirectOffset.OctalIndirectOffset:
            num_bytes = "o"
        else:
            num_bytes = str(self.num_bytes)
        return f"({self.offset!s}{['.', ','][self.signed]}{num_bytes}{self.endianness.value})"
544
+
545
+
546
class SourceInfo:
    """Where a definition came from: file path, line number and optionally the original line text."""

    def __init__(self, path: Path, line: int, original_line: Optional[str] = None):
        self.path: Path = path
        self.line: int = line
        self.original_line: Optional[str] = original_line

    def __repr__(self):
        cls_name = self.__class__.__name__
        return f"{cls_name}(path={self.path!r}, line={self.line}, original_line={self.original_line!r})"

    def __str__(self):
        # Rendered as "<path>:<line>".
        return f"{self.path!s}:{self.line}"
557
+
558
+
559
class MatchContext:
    """The bytes being matched, plus the optional path they came from and the MIME-only flag."""

    def __init__(self, data: bytes, path: Optional[Path] = None, only_match_mime: bool = False):
        self.data: bytes = data
        self.path: Optional[Path] = path
        self.only_match_mime: bool = only_match_mime

    def __getitem__(self, s: slice) -> "MatchContext":
        """Return a new context over ``data[s]`` with the same path and flag.

        Raises:
            ValueError: if `s` is not a slice.
        """
        if isinstance(s, slice):
            return MatchContext(data=self.data[s], path=self.path, only_match_mime=self.only_match_mime)
        raise ValueError("Match contexts can only be sliced")

    @property
    def is_executable(self) -> bool:
        """Whether the file at `path` has any execute permission bit set.

        Returns False (after logging a warning) when there is no path or the
        file does not exist.
        """
        if self.path is None:
            log.warning("Unable to determine if the input data is executable; assuming it is not.")
            return False
        try:
            mode = self.path.stat().st_mode
        except FileNotFoundError:
            log.warning(f"Unable to determine if the data from {self.path} is executable; assuming it is not.")
            return False
        return bool(mode & 0o111)

    @staticmethod
    def load(stream_or_path: Union[str, Path, BinaryIO], only_match_mime: bool = False) -> "MatchContext":
        """Build a context from a path (opened in binary mode) or from an already open binary stream."""
        if isinstance(stream_or_path, (str, Path)):
            with open(stream_or_path, "rb") as f:
                return MatchContext.load(f, only_match_mime)
        # Streams without a usable `name` attribute yield a context with no path.
        stream_name = getattr(stream_or_path, "name", None)
        path: Optional[Path] = None if stream_name is None else Path(stream_name)
        return MatchContext(stream_or_path.read(), path, only_match_mime)
591
+
592
+
593
class Message(ABC):
    """Abstract message whose final text may depend on the match context."""

    @abstractmethod
    def resolve(self, context: MatchContext) -> str:
        """Return the message text for the given context."""
        raise NotImplementedError()

    @abstractmethod
    def possibilities(self) -> Iterator[str]:
        """Yield every text this message could resolve to."""
        raise NotImplementedError()

    @staticmethod
    def parse(message: str) -> "Message":
        """Parse `message` as a ternary executable message, falling back to a constant message."""
        try:
            parsed: Message = TernaryExecutableMessage.parse(message)
        except ValueError:
            parsed = ConstantMessage(message)
        return parsed
608
+
609
+
610
class ConstantMessage(Message):
    """A message with a single fixed text, independent of the match context."""

    def __init__(self, message: str):
        self.message: str = message

    def possibilities(self) -> Iterator[str]:
        # Exactly one possibility: the stored text.
        yield from (self.message,)

    def resolve(self, context: MatchContext) -> str:
        return self.message

    def __eq__(self, other):
        if not isinstance(other, ConstantMessage):
            return False
        return other.message == self.message

    def __str__(self):
        return self.message
625
+
626
+
627
class TernaryMessage(Message, ABC):
    """Abstract message that resolves to one of two alternative texts."""

    def __init__(self, true_value: str, false_value: str):
        self.true_value: str = true_value
        self.false_value: str = false_value

    def possibilities(self) -> Iterator[str]:
        # The true alternative is always yielded first.
        yield from (self.true_value, self.false_value)

    def __eq__(self, other):
        # Equal only to a ternary message of the same concrete class with the same alternatives.
        if not isinstance(other, TernaryMessage):
            return False
        return other.false_value == self.false_value and \
            other.true_value == self.true_value and other.__class__ == self.__class__
639
+
640
+
641
class TernaryExecutableMessage(TernaryMessage):
    """Ternary message of the form ``${x?A:B}``, selected by whether the context is executable."""

    def resolve(self, context: MatchContext) -> str:
        return self.true_value if context.is_executable else self.false_value

    TERNARY_EXECUTABLE_PATTERN: Pattern[str] = re.compile(
        r"^(?P<before>.*?)\${x\?(?P<true>[^:]+):(?P<false>[^}]+)}(?P<after>.*)$"
    )

    @staticmethod
    def parse(message: str) -> "TernaryExecutableMessage":
        """Split `message` around its ``${x?A:B}`` marker into two complete alternatives.

        The text before and after the marker is kept in both alternatives.

        Raises:
            ValueError: if `message` contains no such marker.
        """
        found = TernaryExecutableMessage.TERNARY_EXECUTABLE_PATTERN.match(message)
        if found is None:
            raise ValueError(f"Invalid ternary message: {message!r}")
        prefix, suffix = found.group("before", "after")
        return TernaryExecutableMessage(
            true_value=f"{prefix}{found.group('true')}{suffix}",
            false_value=f"{prefix}{found.group('false')}{suffix}",
        )

    def __str__(self):
        return f"${{x?{self.true_value}:{self.false_value}}}"
665
+
666
+
667
# Registry of MagicTest subclasses; filled in by MagicTest.__init_subclass__.
TEST_TYPES: Set[Type["MagicTest"]] = set()
668
+
669
+
670
class Comment:
    """A comment attached to a test, with optional information about where it was read from."""

    def __init__(self, message: str, source_info: Optional[SourceInfo] = None):
        self.message: str = message
        self.source_info: Optional[SourceInfo] = source_info

    def __str__(self):
        # The string form is just the comment text.
        return self.message
677
+
678
+
679
class TestType(IntFlag):
    """Classification of a test as binary and/or text; UNKNOWN means not determined."""

    UNKNOWN = 0
    BINARY = 1
    TEXT = 2
683
+
684
+
685
class MagicTest(ABC):
    """Abstract base class of a single test in a tree of magic tests.

    A test has an offset, an optional MIME type, a set of file extensions, a
    message, and child tests that are only tried after this test matched.
    Subclasses are added to ``TEST_TYPES`` unless they set
    ``AUTO_REGISTER_TEST`` to False.
    """

    AUTO_REGISTER_TEST: bool = True

    def __init__(
            self,
            offset: Offset,
            mime: Optional[Union[str, TernaryExecutableMessage]] = None,
            extensions: Iterable[str] = (),
            message: Union[str, Message] = "",
            parent: Optional["MagicTest"] = None,
            comments: Iterable[Comment] = ()
    ):
        """Create the test and, when `parent` is given, attach it as the parent's last child."""
        self.offset: Offset = offset
        self._mime: Optional[Message] = None
        self.extensions: Set[str] = set(extensions)
        if isinstance(message, Message):
            self._message: Message = message
        else:
            self._message = Message.parse(message)
        self._parent: Optional[MagicTest] = parent
        self.children: List[MagicTest] = []
        if parent is not None:
            self.level: int = self.parent.level + 1
            parent.children.append(self)
            self.named_test: Optional[NamedTest] = parent.named_test
            if self.named_test is not None and isinstance(offset, AbsoluteOffset):
                # Inside a named test, absolute offsets are rebound so that they resolve
                # through NamedAbsoluteOffset instead.
                self.offset = NamedAbsoluteOffset(self.named_test, offset.offset)
            if mime is not None:
                parent.can_match_mime = True
        else:
            self.level = 0
            self.named_test: Optional[NamedTest] = None
        self.can_match_mime: bool = mime is not None
        """
        Whether or not this test or any of its descendants can match a MIME type.
        This is currently set after parsing all of the definition files.
        Any custom implementation should set it manually after this object is created.

        """
        self.can_be_indirect: bool = False
        """
        Whether or not this test or any of its descendants can be an indirect test.
        This is currently set after parsing all of the definition files.
        Any custom implementation should set it manually after this object is created.

        """
        self.mime = mime
        self.source_info: Optional[SourceInfo] = None
        self.comments: Tuple[Comment, ...] = tuple(comments)
        self._type: TestType = TestType.UNKNOWN

    def __init_subclass__(cls, **kwargs):
        if cls.AUTO_REGISTER_TEST:
            TEST_TYPES.add(cls)
        return super().__init_subclass__(**kwargs)

    @property
    def message(self) -> Message:
        return self._message

    @message.setter
    def message(self, new_value: Message):
        self._message = new_value

    @property
    def test_type(self) -> TestType:
        """The binary/text classification of this test, computed on first access.

        A non-UNKNOWN result is stored and returned directly afterwards. While
        the computation is in progress, re-entrant access returns UNKNOWN.
        """
        if self._type == TestType.UNKNOWN:
            # Re-entrancy guard: the computation below reads the children's test_type.
            if hasattr(self, "__calculating_test_type") and getattr(self, "__calculating_test_type"):
                return TestType.UNKNOWN
            setattr(self, "__calculating_test_type", True)
            if self.can_be_indirect:
                # indirect tests can execute any other (binary) test, so classify ourselves as binary
                self._type = TestType.BINARY
            else:
                if any(bool(child.test_type & TestType.BINARY) for child in self.children):
                    self._type = TestType.BINARY
                else:
                    self._type = self.subtest_type()
                    if (self._type == TestType.UNKNOWN and self.children) or bool(self._type & TestType.TEXT):
                        # A pattern is considered to be a text test when all its patterns are text patterns;
                        # otherwise its type is left undetermined.
                        if all(bool(child.test_type & TestType.TEXT) for child in self.children):
                            self._type = TestType.TEXT
                        else:
                            self._type = TestType.UNKNOWN
            delattr(self, "__calculating_test_type")
        return self._type

    @test_type.setter
    def test_type(self, value: TestType):
        """Set the type once.

        Raises:
            ValueError: if a different non-UNKNOWN type is already stored.
        """
        if self._type != TestType.UNKNOWN:
            if value != self._type:
                # Report the existing type; the message previously repeated the new value.
                raise ValueError(
                    f"Cannot assign type {value} to test {self} because it already has value {self._type}"
                )
        else:
            self._type = value

    @abstractmethod
    def subtest_type(self) -> TestType:
        """Return the classification of this test on its own."""
        raise NotImplementedError()

    @property
    def parent(self) -> Optional["MagicTest"]:
        return self._parent

    def ancestors(self) -> Iterator["MagicTest"]:
        """Yields all ancestors of this test. NamedTest will also include all UseTest ancestors that call it."""
        stack: List[MagicTest] = [self]
        history: Set[MagicTest] = set(stack)
        while stack:
            test = stack.pop()
            if test is not self:
                yield test
            if isinstance(test, NamedTest):
                new_tests = test.used_by - history
                stack.extend(new_tests)
                history |= new_tests
            if test.parent is not None and test.parent not in history:
                stack.append(test.parent)
                history.add(test.parent)

    def descendants(self) -> Iterator["MagicTest"]:
        """
        Yields all descendants of this test.
        UseTests will also include all referenced NamedTests and their descendants.

        """
        stack: List[MagicTest] = [self]
        history: Set[MagicTest] = set(stack)
        while stack:
            test = stack.pop()
            if test is not self:
                yield test
            new_tests = [child for child in test.children if child not in history]
            # Reversed so that children are popped in their original order.
            stack.extend(reversed(new_tests))
            history |= set(new_tests)
            if isinstance(test, UseTest):
                referenced = test.referenced_test
                # Consult the history like every other push does; otherwise a named test
                # referenced by several `use` tests is yielded once per reference.
                if referenced not in history:
                    stack.append(referenced)
                    history.add(referenced)

    def referenced_tests(self) -> Set["NamedTest"]:
        """Return the union of the children's ``referenced_tests()``."""
        result: Set[NamedTest] = set()
        for child in self.children:
            result |= child.referenced_tests()
        return result

    @property
    def mime(self) -> Optional[Message]:
        return self._mime

    @mime.setter
    def mime(self, new_mime: Optional[Union[str, Message]]):
        """Set the MIME type once; strings are parsed into a Message.

        Raises:
            ValueError: if a MIME type is already set and `new_mime` differs from it.
        """
        if isinstance(new_mime, str):
            new_mime = Message.parse(new_mime)
        if self._mime is not None:
            if self._mime == new_mime:
                return
            raise ValueError("The mime type of a test may not be changed once it is set")
        elif new_mime is None:
            # the mime is already None, and we are setting it to None, so just ignore
            return
        self._mime = new_mime
        self.can_match_mime = True

    def _mimetypes(self) -> Iterator[str]:
        """Yields all possible MIME types that this test or any of its descendants could match against"""
        if not self.can_match_mime:
            return
        yielded: Set[str] = set()
        if self.mime is not None:
            yielded |= set(self.mime.possibilities())
            yield from yielded
        for d in self.descendants():
            if d.mime is not None:
                possibilities = set(d.mime.possibilities())
                new_mimes = possibilities - yielded
                yield from new_mimes
                yielded |= new_mimes

    def mimetypes(self) -> LazyIterableSet[str]:
        """Returns the set of all possible MIME types that this test or any of its descendants could match against"""
        return LazyIterableSet(self._mimetypes())

    def _all_extensions(self) -> Iterator[str]:
        """Yields all possible extensions that this test or any of its descendants could match against"""
        yield from self.extensions
        yielded = set(self.extensions)
        for d in self.descendants():
            new_extensions = d.extensions - yielded
            yield from new_extensions
            yielded |= new_extensions

    def all_extensions(self) -> LazyIterableSet[str]:
        """Returns the set of all possible extensions that this test or any of its descendants could match against"""
        return LazyIterableSet(self._all_extensions())

    @abstractmethod
    def test(self, data: bytes, absolute_offset: int, parent_match: Optional[TestResult]) -> TestResult:
        """Run this test against `data` at `absolute_offset` and return its result."""
        raise NotImplementedError()

    def write(self, writer: ANSIWriter, is_current_test: bool = False, pre_mime_text: str = "") -> str:
        """Write this test (comments, definition line, MIME type, extensions) to `writer`.

        Returns the indentation string that follow-up lines should use.
        """
        for comment in self.comments:
            if comment.source_info is not None and comment.source_info.original_line is not None:
                writer.write(f" {comment.source_info.path.name}", dim=True, color=ANSIColor.CYAN)
                writer.write(":", dim=True)
                writer.write(f"{comment.source_info.line}\t", dim=True, color=ANSIColor.CYAN)
                writer.write(comment.source_info.original_line.strip(), dim=True)
                writer.write("\n")
            else:
                writer.write(f" # {comment!s}\n", dim=True)
        if is_current_test:
            writer.write("→ ", bold=True)
        else:
            writer.write(" ")
        if self.source_info is not None and self.source_info.original_line is not None:
            source_prefix = f"{self.source_info.path.name}:{self.source_info.line}"
            # Follow-up lines are padded to the width of the "<file>:<line>" prefix.
            indent = f"{' ' * len(source_prefix)}\t"
            writer.write(self.source_info.path.name, dim=True, color=ANSIColor.CYAN)
            writer.write(":", dim=True)
            writer.write(self.source_info.line, dim=True, color=ANSIColor.CYAN)
            writer.write("\t")
            writer.write(self.source_info.original_line.strip(), color=ANSIColor.BLUE, bold=True)
        else:
            indent = ""
            writer.write(f"{'>' * self.level}{self.offset!s}\t")
            writer.write(self.message, color=ANSIColor.BLUE, bold=True)
        if self.level == 0:
            if self.test_type & TestType.BINARY:
                writer.write(f" \uF5BB BINARY TEST", color=ANSIColor.BLUE)
            elif self.test_type & TestType.TEXT:
                writer.write(f" \uF5B9 ASCII TEST", color=ANSIColor.BLUE)
        writer.write(pre_mime_text)
        if self.mime is not None:
            writer.write(f"\n {indent}!:mime ", dim=True)
            writer.write(self.mime, color=ANSIColor.BLUE)
        for e in self.extensions:
            writer.write(f"\n {indent}!:ext ", dim=True)
            writer.write(str(e), color=ANSIColor.BLUE)
        writer.write("\n")
        return f" {indent}"

    def calculate_absolute_offset(self, data: bytes, parent_match: Optional[TestResult] = None) -> int:
        """Resolve this test's offset against `data` and the parent's result."""
        return self.offset.to_absolute(data, parent_match)

    def _match(self, context: MatchContext, parent_match: Optional[TestResult] = None) -> Iterator[MatchedTest]:
        """Run this test and, if it matched, its children; yield the successful results.

        Yields nothing when the offset cannot be resolved. In MIME-only mode,
        tests that cannot match a MIME type are skipped and a successful result
        is only yielded for tests that have a MIME type.
        """
        if context.only_match_mime and not self.can_match_mime:
            return
        try:
            absolute_offset = self.calculate_absolute_offset(context.data, parent_match)
        except InvalidOffsetError:
            return
        m = self.test(context.data, absolute_offset, parent_match)
        if logging.root.level <= TRACE and (bool(m) or self.level > 0):
            log.trace(
                f"{self.source_info!s}\t{bool(m)}\t{absolute_offset}\t"
                f"{context.data[absolute_offset:absolute_offset + 20]!r}"
            )
        if bool(m):
            if not context.only_match_mime or self.mime is not None:
                yield m
            for child in self.children:
                if not context.only_match_mime or child.can_match_mime:
                    yield from child._match(context=context, parent_match=m)

    def match(self, to_match: Union[bytes, BinaryIO, str, Path, MatchContext]) -> Iterator[TestResult]:
        """Yields all matches for the given data"""
        if isinstance(to_match, bytes):
            to_match = MatchContext(data=to_match)
        elif not isinstance(to_match, MatchContext):
            to_match = MatchContext.load(to_match)
        return self._match(to_match)

    def __str__(self):
        if self.source_info is not None and self.source_info.original_line is not None:
            s = f"{self.source_info.path.name}:{self.source_info.line} {self.source_info.original_line.strip()}"
        else:
            s = f"{'>' * self.level}{self.offset!s}\t{self.message}"
        if self.mime is not None:
            s = f"{s}\n!:mime\t{self.mime}"
        for e in self.extensions:
            s = f"{s}\n!:ext\t{e}"
        return s
966
+
967
+
968
class DynamicMagicTest(MagicTest, ABC):
    """A test that can be bound with a dynamically generated message"""

    def __init__(
            self,
            offset: Offset,
            mime: Optional[Union[str, TernaryExecutableMessage]] = None,
            extensions: Iterable[str] = (),
            default_message: Union[str, Message] = "",
            parent: Optional["MagicTest"] = None,
            comments: Iterable[Comment] = ()
    ):
        super().__init__(
            offset=offset,
            mime=mime,
            extensions=extensions,
            message=default_message,
            parent=parent,
            comments=comments,
        )
        # Set only on the copies produced by bind(); None means "use the default message".
        self._bound_message: Optional[Message] = None

    @property
    def default_message(self) -> Message:
        """The message given at construction time."""
        return super().message

    @property
    def message(self) -> Message:
        """The bound message if there is one, otherwise the default message."""
        return self.default_message if self._bound_message is None else self._bound_message

    def bind(self, message: Union[str, Message]) -> MagicTest:
        """Return a new test object that carries `message` as its bound message.

        Raises:
            ValueError: if this test already has a bound message.
        """
        if self._bound_message is not None:
            raise ValueError(f"{self!r} already has a bound message: {self.message!s}")
        if not isinstance(message, Message):
            message = Message.parse(message)
        # A fresh subclass is created whose class attributes are a copy of this instance's
        # attributes, and it is instantiated without arguments.
        # NOTE(review): the no-argument call goes through the inherited __init__, which here
        # requires `offset` — confirm concrete subclasses make this call valid.
        bound_class = type(f"Bound{self.__class__.__name__}", (self.__class__,), dict(self.__dict__))
        result: DynamicMagicTest = bound_class()
        result._bound_message = message
        return result
1003
+
1004
+
1005
# Cache of data types keyed by name / format string; read and filled by DataType.parse.
TYPES_BY_NAME: Dict[str, "DataType"] = {}


# Type variable parameterizing DataType (the type of a data type's expected value).
T = TypeVar("T")
1009
+
1010
+
1011
class DataTypeMatch:
    """Result of matching a data type: the raw matched bytes and the value decoded from them.

    An instance is truthy exactly when ``raw_match`` is not None. ``INVALID``
    is a shared instance representing "no match".
    """

    INVALID: "DataTypeMatch"

    def __init__(self, raw_match: Optional[bytes] = None, value: Optional[Any] = None, initial_offset: int = 0):
        self.raw_match: Optional[bytes] = raw_match
        # Without an explicit value, the raw bytes double as the value.
        self.value: Optional[Any] = raw_match if value is None else value
        self.initial_offset: int = initial_offset

    def __bool__(self):
        return self.raw_match is not None

    def __repr__(self):
        # initial_offset is only shown when it differs from its default of 0.
        io = "" if self.initial_offset == 0 else f", initial_offset={self.initial_offset}"
        return f"{self.__class__.__name__}(raw_match={self.raw_match!r}, value={self.value!r}{io})"

    def __str__(self):
        if self.value is not None:
            return str(self.value)
        if self.raw_match is None:
            return "DataTypeNoMatch"
        return repr(self.raw_match)


DataTypeMatch.INVALID = DataTypeMatch()
1042
+
1043
+
1044
class DataType(ABC, Generic[T]):
    """Abstract base for a named data type that parses an expected-value specification and matches it
    against bytes."""

    def __init__(self, name: str):
        self.name: str = name

    def allows_invalid_offsets(self, expected: T) -> bool:
        """Return False by default; subclasses may override this for particular expected values."""
        return False

    @abstractmethod
    def is_text(self, value: T) -> bool:
        raise NotImplementedError()

    @abstractmethod
    def parse_expected(self, specification: str) -> T:
        raise NotImplementedError()

    @abstractmethod
    def match(self, data: bytes, expected: T) -> DataTypeMatch:
        raise NotImplementedError()

    @staticmethod
    def parse(fmt: str) -> "DataType":
        """Return the data type described by the format string `fmt`, reusing cached instances.

        The result is registered in `TYPES_BY_NAME` under both `fmt` and the type's own name.
        """
        try:
            return TYPES_BY_NAME[fmt]
        except KeyError:
            pass
        if fmt.startswith(("string", "ustring")):
            dt = StringType.parse(fmt)
        elif fmt == "lestring16":
            dt = UTF16Type(endianness=Endianness.LITTLE)
        elif fmt == "bestring16":
            dt = UTF16Type(endianness=Endianness.BIG)
        elif fmt.startswith("pstring"):
            dt = PascalStringType.parse(fmt)
        elif fmt.startswith("search"):
            dt = SearchType.parse(fmt)
        elif fmt.startswith("regex"):
            dt = RegexType.parse(fmt)
        elif fmt == "guid":
            dt = GUIDType()
        else:
            dt = NumericDataType.parse(fmt)
        # Sometimes a data type will change its name based on modifiers.
        # For example, string and pstring will always include their modifiers after their name,
        # so prefer an already-registered instance that has the same canonical name.
        canonical = TYPES_BY_NAME.setdefault(dt.name, dt)
        TYPES_BY_NAME[fmt] = canonical
        return canonical

    def __str__(self):
        return self.name

    def __repr__(self):
        return f"{self.__class__.__name__}({self.name})"
1097
+
1098
+
1099
class UUIDWildcard:
    """Marker object standing in for an expected GUID that should match any GUID."""
1101
+
1102
+
1103
class GUIDType(DataType[Union[UUID, UUIDWildcard]]):
    """The `guid` data type: compares the first 16 bytes of the data, read as a little-endian UUID."""

    def __init__(self):
        super().__init__("guid")

    def is_text(self, value: Union[UUID, UUIDWildcard]) -> bool:
        return False

    def parse_expected(self, specification: str) -> Union[UUID, UUIDWildcard]:
        """Parse a GUID specification; a bare `x` yields a wildcard.

        :raises ValueError: (from `UUID`) if the specification is not a well-formed UUID
        """
        spec = specification.strip()
        if spec == "x":
            return UUIDWildcard()
        # there is a bug in the `asf` definition where a guid is missing its last two characters:
        if spec.upper() == "B61BE100-5B4E-11CF-A8FD-00805F5C44":
            spec = "B61BE100-5B4E-11CF-A8FD-00805F5C442B"
        return UUID(str(spec))

    def match(self, data: bytes, expected: Union[UUID, UUIDWildcard]) -> DataTypeMatch:
        """Match the leading 16 bytes of `data` against `expected` (or against anything, for a wildcard)."""
        if len(data) < 16:
            return DataTypeMatch.INVALID
        head = data[:16]
        try:
            uuid = UUID(bytes_le=head)
        except ValueError:
            return DataTypeMatch.INVALID
        if isinstance(expected, UUIDWildcard) or uuid == expected:
            return DataTypeMatch(head, uuid)
        return DataTypeMatch.INVALID
1129
+
1130
+
1131
class UTF16Type(DataType[bytes]):
    """The `lestring16` / `bestring16` data types: a UTF-16 encoded string prefix test."""

    def __init__(self, endianness: Endianness):
        """
        :raises ValueError: if `endianness` is neither little nor big
        """
        if endianness == Endianness.LITTLE:
            type_name = "lestring16"
        elif endianness == Endianness.BIG:
            type_name = "bestring16"
        else:
            raise ValueError(f"UTF16 strings only support big and little endianness, not {endianness!r}")
        super().__init__(type_name)
        self.endianness: Endianness = endianness

    def is_text(self, value: bytes) -> bool:
        return True

    def parse_expected(self, specification: str) -> bytes:
        """Unescape the specification, decode it as UTF-8, and re-encode it as UTF-16 in this type's byte order."""
        text = unescape(specification).decode("utf-8")
        codec = "utf-16-le" if self.endianness == Endianness.LITTLE else "utf-16-be"
        return text.encode(codec)

    def match(self, data: bytes, expected: bytes) -> DataTypeMatch:
        """Match when `data` begins with the already-encoded `expected` bytes."""
        if not data.startswith(expected):
            return DataTypeMatch.INVALID
        codec = "utf-16-le" if self.endianness == Endianness.LITTLE else "utf-16-be"
        return DataTypeMatch(expected, expected.decode(codec))
1159
+
1160
+
1161
class StringTest(ABC):
    """Abstract base for a test that is applied to string data."""

    def __init__(self, trim: bool = False, compact_whitespace: bool = False, num_bytes: Optional[int] = None):
        self.trim: bool = trim
        self.compact_whitespace: bool = compact_whitespace
        self.num_bytes: Optional[int] = num_bytes

    def post_process(self, data: bytes, initial_offset: int = 0) -> DataTypeMatch:
        """Wrap matched bytes in a `DataTypeMatch`.

        The value is the (optionally stripped) data decoded as UTF-8, or the bytes themselves when they
        are not valid UTF-8. `compact_whitespace` is not applied to the value here.
        """
        value = data.strip() if self.trim else data
        try:
            value = value.decode("utf-8")
        except UnicodeDecodeError:
            pass
        return DataTypeMatch(data, value, initial_offset=initial_offset)

    @abstractmethod
    def matches(self, data: bytes) -> DataTypeMatch:
        raise NotImplementedError()

    @abstractmethod
    def is_always_text(self) -> bool:
        raise NotImplementedError()

    @abstractmethod
    def search(self, data: bytes) -> DataTypeMatch:
        raise NotImplementedError()

    @staticmethod
    def parse(specification: str,
              trim: bool = False,
              compact_whitespace: bool = False,
              case_insensitive_lower: bool = False,
              case_insensitive_upper: bool = False,
              optional_blanks: bool = False,
              full_word_match: bool = False,
              num_bytes: Optional[int] = None) -> "StringTest":
        """Build the string test described by `specification`.

        A bare `x` is a wildcard, a leading `!` negates the test, a leading `<` or `>` selects a
        comparison test, and anything else (optionally prefixed with `=`) is an exact string match.

        :raises ValueError: if `num_bytes` is given together with an exact string match
        """
        if specification.strip() == "x":
            return StringWildcard(trim=trim, compact_whitespace=compact_whitespace, num_bytes=num_bytes)
        negate = specification.startswith("!")
        body = specification[1:] if negate else specification
        if body.startswith((">", "<")):
            test: StringTest = StringLengthTest(
                to_match=body[1:],
                test_smaller=body.startswith("<"),
                trim=trim,
                compact_whitespace=compact_whitespace,
                num_bytes=num_bytes,
            )
        else:
            if num_bytes is not None:
                raise ValueError(f"Invalid string match specification: {specification!r}: a string length limiter "
                                 f"cannot be combined with an explicit string match")
            test = StringMatch(
                to_match=body[1:] if body.startswith("=") else body,
                trim=trim,
                compact_whitespace=compact_whitespace,
                case_insensitive_lower=case_insensitive_lower,
                case_insensitive_upper=case_insensitive_upper,
                optional_blanks=optional_blanks,
                full_word_match=full_word_match
            )
        return NegatedStringTest(test) if negate else test
1235
+
1236
+
1237
class StringWildcard(StringTest):
    """A string test that accepts any data, yielding the bytes up to the first null byte."""

    def matches(self, data: bytes) -> DataTypeMatch:
        """Return the data up to (not including) the first null byte.

        When `num_bytes` is set, the null byte is only looked for within the first `num_bytes` bytes,
        and at most `num_bytes` bytes are returned if none is found there.
        """
        limit = self.num_bytes
        terminator = data.find(b"\0") if limit is None else data.find(b"\0", 0, limit)
        if terminator < 0:
            # data[:None] is the whole buffer, which covers the unlimited case
            return self.post_process(data[:limit])
        return self.post_process(data[:terminator])

    def is_always_text(self) -> bool:
        return False

    def search(self, data: bytes) -> DataTypeMatch:
        return self.matches(data)

    def __str__(self):
        return "null-terminated string"
1258
+
1259
+
1260
class NegatedStringTest(StringWildcard):
    """Inverts another string test: succeeds, with wildcard semantics, only when the wrapped test fails."""

    def __init__(self, parent_test: StringTest):
        # Only the trim and whitespace options are inherited from the wrapped test
        super().__init__(trim=parent_test.trim, compact_whitespace=parent_test.compact_whitespace)
        self.parent: StringTest = parent_test

    def is_always_text(self) -> bool:
        return self.parent.is_always_text()

    def matches(self, data: bytes) -> DataTypeMatch:
        if self.parent.matches(data) == DataTypeMatch.INVALID:
            return super().matches(data)
        return DataTypeMatch.INVALID

    def search(self, data: bytes) -> DataTypeMatch:
        if self.parent.search(data) == DataTypeMatch.INVALID:
            return super().search(data)
        return DataTypeMatch.INVALID

    def __str__(self):
        return f"something other than {self.parent!s}"
1284
+
1285
+
1286
class StringLengthTest(StringWildcard):
    """A `<` / `>` string test: the null-terminated string in the data must compare below / above a
    reference string."""

    def __init__(self, to_match: str, test_smaller: bool, trim: bool = False, compact_whitespace: bool = False,
                 num_bytes: Optional[int] = None):
        """
        :param to_match: the (still escaped) reference string; anything from its first null byte on is dropped
        :param test_smaller: True for a `<` test, False for a `>` test
        """
        super().__init__(trim=trim, compact_whitespace=compact_whitespace, num_bytes=num_bytes)
        self.raw_pattern: str = to_match
        self.to_match: bytes = unescape(to_match)
        null_termination_index = self.to_match.find(0)
        if null_termination_index >= 0:
            self.to_match = self.to_match[:null_termination_index]
        self.desired_length: int = len(self.to_match)
        self.test_smaller: bool = test_smaller

    def matches(self, data: bytes) -> DataTypeMatch:
        """Compare the first `desired_length` bytes of the wildcard match against the reference string.

        An empty reference string accepts every wildcard match.
        """
        match = super().matches(data)
        if self.desired_length == 0:
            return match
        prefix = match.raw_match[:self.desired_length]
        if self.test_smaller and prefix < self.to_match:
            return match
        if not self.test_smaller and prefix > self.to_match:
            return match
        return DataTypeMatch.INVALID

    def is_always_text(self) -> bool:
        return False

    def search(self, data: bytes) -> DataTypeMatch:
        """Compare the whole wildcard match against the reference string."""
        match = super().search(data)
        if not match:
            # StringWildcard.search() dispatches to self.matches(), which may already have rejected the data;
            # its raw_match is then None and must not be compared against bytes.
            return DataTypeMatch.INVALID
        if self.test_smaller and match.raw_match < self.to_match:
            return match
        if not self.test_smaller and match.raw_match > self.to_match:
            return match
        return DataTypeMatch.INVALID

    def __repr__(self):
        return f"{self.__class__.__name__}(to_match={self.raw_pattern!r}, test_smaller={self.test_smaller!r}, " \
               f"trim={self.trim!r}, compact_whitespace={self.compact_whitespace!r}, num_bytes={self.num_bytes!r})"

    def __str__(self):
        return f"{['>', '<'][self.test_smaller]}{repr(self.to_match)}"
1327
+
1328
+
1329
class StringMatch(StringTest):
    """An exact string test, implemented by compiling the expected string into a bytes regular expression."""

    def __init__(self,
                 to_match: str,
                 trim: bool = False,
                 compact_whitespace: bool = False,
                 case_insensitive_lower: bool = False,
                 case_insensitive_upper: bool = False,
                 optional_blanks: bool = False,
                 full_word_match: bool = False
                 ):
        """
        :param to_match: the (still escaped) string to look for
        :raises ValueError: if both `optional_blanks` and `compact_whitespace` are requested
        """
        super().__init__(trim=trim, compact_whitespace=compact_whitespace)
        self.raw_pattern: str = to_match
        self.string: bytes = unescape(to_match)
        self.case_insensitive_lower: bool = case_insensitive_lower
        self.case_insensitive_upper: bool = case_insensitive_upper
        self.optional_blanks: bool = optional_blanks
        self.full_word_match: bool = full_word_match
        if optional_blanks and compact_whitespace:
            raise ValueError("Optional blanks `w` and compacting whitespace `W` cannot be selected at the same time")
        self._is_always_text: Optional[bool] = None
        self._pattern: Optional[re.Pattern] = None
        # Compile eagerly so that an invalid pattern is reported at construction time
        _ = self.pattern

    def pattern_string(self) -> bytes:
        """Return the regular expression source (as bytes) equivalent to this test."""
        pattern = re.escape(self.string)
        lower_only = self.case_insensitive_lower and not self.case_insensitive_upper
        upper_only = self.case_insensitive_upper and not self.case_insensitive_lower
        if lower_only or upper_only:
            if lower_only:
                # treat lower case letters as either lower or upper case
                first, last, delta = ord('a'), ord('z'), ord('A') - ord('a')
            else:
                # treat upper case letters as either lower or upper case
                first, last, delta = ord('A'), ord('Z'), ord('a') - ord('A')
            for ordinal in range(first, last + 1):
                either_case = f"[{chr(ordinal)}{chr(ordinal + delta)}]".encode("utf-8")
                pattern = pattern.replace(bytes([ordinal]), either_case)
        if self.compact_whitespace:
            # Group the pattern into runs of identical tokens, where a token is one byte or an escaped byte
            runs: List[Tuple[bytes, int]] = []
            pending_escape = False
            for byte in pattern:
                token = bytes([byte])
                if pending_escape:
                    token = b"\\" + token
                    pending_escape = False
                elif token == b"\\":
                    pending_escape = True
                    continue
                if runs and runs[-1][0] == token:
                    runs[-1] = (token, runs[-1][1] + 1)
                else:
                    runs.append((token, 1))
            if pending_escape:
                raise ValueError(f"Error parsing search pattern {self.string!r}")
            compacted = bytearray()
            for token, count in runs:
                compacted.extend(token)
                if token in (b'\\ ', b'\\s', b'\\t', b'\\r', b'\\v', b'\\f'):
                    # this is whitespace: require at least as many repetitions as appear in the pattern
                    if count == 1:
                        compacted.extend(b"+")
                    else:
                        compacted.extend(f"{{{count},}}".encode("utf-8"))
                elif count > 1:
                    compacted.extend(f"{{{count}}}".encode("utf-8"))
            pattern = bytes(compacted)
        elif self.optional_blanks:
            pattern = pattern.replace(rb"\ ", rb"\ ?")
        if self.full_word_match:
            pattern = rb"\b" + pattern + rb"\b"
        return pattern

    def pattern_flags(self) -> int:
        """Return the `re` flags for the pattern: IGNORECASE only when both case options are set."""
        if self.case_insensitive_upper and self.case_insensitive_lower:
            return 0 | re.IGNORECASE
        return 0

    @property
    def pattern(self) -> re.Pattern:
        """The compiled pattern, built on first access and cached afterwards."""
        compiled = self._pattern
        if compiled is None:
            compiled = re.compile(self.pattern_string(), flags=self.pattern_flags())
            self._pattern = compiled
        return compiled

    def is_always_text(self) -> bool:
        """Return whether the pattern is plain ASCII with no hex or null escapes; the answer is cached."""
        if self._is_always_text is None:
            if "\\x" in self.raw_pattern or "\\0" in self.raw_pattern:
                # the string has hex escapes, so do not treat it as text
                self._is_always_text = False
            else:
                try:
                    self.pattern.pattern.decode("ascii")
                except UnicodeDecodeError:
                    self._is_always_text = False
                else:
                    self._is_always_text = True
        return self._is_always_text

    def matches(self, data: bytes) -> DataTypeMatch:
        """Match the pattern at the very start of `data`."""
        found = self.pattern.match(data)
        if not found:
            return DataTypeMatch.INVALID
        return self.post_process(bytes(found.group(0)))

    def search(self, data: bytes) -> DataTypeMatch:
        """Find the first occurrence of the pattern anywhere in `data`, recording where it starts."""
        found = self.pattern.search(data)
        if not found:
            return DataTypeMatch.INVALID
        return self.post_process(bytes(found.group(0)), initial_offset=found.start())

    def __str__(self):
        return repr(self.string)
1437
+
1438
+
1439
class StringType(DataType[StringTest]):
    """The `string` data type together with its modifier flags.

    The type's name encodes every modifier, so that differently configured string types never share a
    name (and therefore never share an entry in a name-keyed registry).
    """

    def __init__(
            self,
            case_insensitive_lower: bool = False,
            case_insensitive_upper: bool = False,
            compact_whitespace: bool = False,
            optional_blanks: bool = False,
            full_word_match: bool = False,
            trim: bool = False,
            force_text: bool = False,
            num_bytes: Optional[int] = None
    ):
        # full_word_match must take part in this check: otherwise a type whose only modifier is `f`
        # would be named plain "string" and be confused with the unmodified string type.
        if not any((num_bytes is not None, case_insensitive_lower, case_insensitive_upper, compact_whitespace,
                    optional_blanks, full_word_match, trim, force_text)):
            name = "string"
        else:
            if num_bytes is not None:
                name = f"{num_bytes}/"
            else:
                name = ""
            name = f"string/{name}{['', 'W'][compact_whitespace]}{['', 'w'][optional_blanks]}"\
                   f"{['', 'C'][case_insensitive_upper]}{['', 'c'][case_insensitive_lower]}"\
                   f"{['', 'T'][trim]}{['', 'f'][full_word_match]}{['', 't'][force_text]}"
        super().__init__(name)
        self.case_insensitive_lower: bool = case_insensitive_lower
        self.case_insensitive_upper: bool = case_insensitive_upper
        self.compact_whitespace: bool = compact_whitespace
        self.optional_blanks: bool = optional_blanks
        self.full_word_match: bool = full_word_match
        self.trim: bool = trim
        self.force_text: bool = force_text
        self.num_bytes: Optional[int] = num_bytes

    def is_text(self, value: StringTest) -> bool:
        return self.force_text

    def allows_invalid_offsets(self, expected: StringTest) -> bool:
        return isinstance(expected, NegatedStringTest)

    def parse_expected(self, specification: str) -> StringTest:
        """Build the `StringTest` for `specification` using this type's modifiers."""
        # NOTE(review): `optional_blanks` is stored on this type but is not forwarded here, so the `w`
        # modifier currently has no effect on the resulting test. Confirm whether that is intentional.
        return StringTest.parse(
            specification,
            trim=self.trim,
            case_insensitive_lower=self.case_insensitive_lower,
            case_insensitive_upper=self.case_insensitive_upper,
            compact_whitespace=self.compact_whitespace,
            full_word_match=self.full_word_match,
            num_bytes=self.num_bytes
        )

    def match(self, data: bytes, expected: StringTest) -> DataTypeMatch:
        return expected.matches(data)

    STRING_TYPE_FORMAT: Pattern[str] = re.compile(r"^u?string(/(?P<numbytes>\d+))?(?P<opts>/[BbCctTWwf]*)?$")

    @classmethod
    def parse(cls, format_str: str) -> "StringType":
        """Parse a declaration such as `string`, `string/16` or `string/cW` into a `StringType`.

        :raises ValueError: if `format_str` is not a valid string type declaration
        """
        m = cls.STRING_TYPE_FORMAT.match(format_str)
        if not m:
            raise ValueError(f"Invalid string type declaration: {format_str!r}")
        if m.group("numbytes") is None:
            num_bytes: Optional[int] = None
        else:
            num_bytes = int(m.group("numbytes"))
        if m.group("opts") is None:
            options: Iterable[str] = ()
        else:
            options = m.group("opts")
            unsupported_options = {opt for opt in options if opt not in "/WwcCtbTf"}
            if unsupported_options:
                log.warning(f"{format_str!r} has invalid option(s) that will be ignored: {', '.join(unsupported_options)}")
        return StringType(
            case_insensitive_lower="c" in options,
            case_insensitive_upper="C" in options,
            compact_whitespace="W" in options,
            optional_blanks="w" in options,
            full_word_match="f" in options,
            trim="T" in options,
            force_text="t" in options,
            num_bytes=num_bytes
        )
1520
+
1521
+
1522
class SearchType(StringType):
    """The `search` data type: like `string`, but the expected string may occur anywhere in the data."""

    def __init__(
            self,
            repetitions: Optional[int] = None,
            case_insensitive_lower: bool = False,
            case_insensitive_upper: bool = False,
            compact_whitespace: bool = False,
            optional_blanks: bool = False,
            match_to_start: bool = False,
            full_word_match: bool = False,
            trim: bool = False
    ):
        """
        :raises ValueError: if `repetitions` is given but is not positive
        """
        if repetitions is not None and repetitions <= 0:
            raise ValueError("repetitions must be either None or a positive integer")
        super().__init__(
            case_insensitive_lower=case_insensitive_lower,
            case_insensitive_upper=case_insensitive_upper,
            compact_whitespace=compact_whitespace,
            optional_blanks=optional_blanks,
            full_word_match=full_word_match,
            trim=trim
        )
        self.repetitions: Optional[int] = repetitions
        rep_str = "" if repetitions is None else f"/{repetitions}"
        # Rebuild the name assigned by StringType as "search[/<repetitions>]<string modifiers>"
        assert self.name.startswith("string")
        bare_name = f"search{rep_str}"
        self.name = f"{bare_name}{self.name[6:]}"
        self.match_to_start: bool = match_to_start
        if match_to_start:
            # The `s` flag starts a new modifier group when there are no other modifiers yet
            self.name = f"{bare_name}/s" if self.name == bare_name else f"{self.name}s"

    def is_text(self, value: StringTest) -> bool:
        return value.is_always_text()

    def match(self, data: bytes, expected: StringTest) -> DataTypeMatch:
        return expected.search(data)

    SEARCH_TYPE_FORMAT: Pattern[str] = re.compile(
        r"^search"
        r"((/(?P<repetitions1>(0[xX][\dA-Fa-f]+|\d+)))(/(?P<flags1>[BbCctTWwsf]*)?)?|"
        r"/((?P<flags2>[BbCctTWwsf]*)/?)?(?P<repetitions2>(0[xX][\dA-Fa-f]+|\d+)))$"
    )
    # NOTE: some specification files like `ber` use `search/b64`, which is undocumented. We treat that equivalent to
    #       the compliant `search/b/64`.
    # TODO: Figure out if this is correct.

    @classmethod
    def parse(cls, format_str: str) -> "SearchType":
        """Parse a declaration such as `search`, `search/256` or `search/256/c` into a `SearchType`.

        :raises ValueError: if `format_str` is not a valid search type declaration
        """
        if format_str == "search":
            # it's undocumented, but you can apparently use the search test without an explicit repetition number
            return SearchType()
        m = cls.SEARCH_TYPE_FORMAT.match(format_str)
        if not m:
            raise ValueError(f"Invalid search type declaration: {format_str!r}")
        # The repetition count may come either before or after the flags
        for count_group, flag_group in (("repetitions1", "flags1"), ("repetitions2", "flags2")):
            if m.group(count_group) is not None:
                repetitions = parse_numeric(m.group(count_group))
                flags = m.group(flag_group)
                break
        else:
            raise ValueError(f"Invalid search type declaration: {format_str!r}")
        options: Iterable[str] = () if flags is None else flags
        return SearchType(
            repetitions=repetitions,
            case_insensitive_lower="c" in options,
            case_insensitive_upper="C" in options,
            compact_whitespace="B" in options or "W" in options,
            optional_blanks="b" in options or "w" in options,
            full_word_match="f" in options,
            trim="T" in options,
            match_to_start="s" in options
        )
1603
+
1604
+
1605
class PascalStringType(DataType[StringTest]):
    """The `pstring` data type: a string preceded by an unsigned length field of 1, 2 or 4 bytes."""

    def __init__(
            self,
            byte_length: int = 1,
            endianness: Endianness = Endianness.BIG,
            count_includes_length: bool = False
    ):
        """
        :param byte_length: width of the length field in bytes (1, 2 or 4)
        :param endianness: byte order of the length field
        :param count_includes_length: whether the stored length also counts the length field itself (`J`)
        :raises ValueError: for an unsupported endianness or field width
        """
        if endianness != Endianness.BIG and endianness != Endianness.LITTLE:
            raise ValueError("Endianness must be either BIG or LITTLE")
        elif byte_length == 1:
            modifier = "B"
        elif byte_length == 2:
            if endianness == Endianness.BIG:
                modifier = "H"
            else:
                modifier = "h"
        elif byte_length == 4:
            if endianness == Endianness.BIG:
                modifier = "L"
            else:
                modifier = "l"
        else:
            raise ValueError("byte_length must be either 1, 2, or 4")
        if count_includes_length:
            modifier = f"{modifier}J"
        super().__init__(f"pstring/{modifier}")
        self.byte_length: int = byte_length
        self.endianness: Endianness = endianness
        self.count_includes_length: bool = count_includes_length

    def is_text(self, value: StringTest) -> bool:
        # TODO: See if Pascal strings should sometimes be forced to be text
        return False

    def parse_expected(self, specification: str) -> StringTest:
        return StringTest.parse(specification)

    def match(self, data: bytes, expected: StringTest) -> DataTypeMatch:
        """Read the length field, then apply `expected` to exactly that many following bytes.

        On success the match's `raw_match` is widened to cover the length field as well as the string.
        The data is rejected when it is shorter than the length field, shorter than the declared
        string, or when the declared length cannot even cover the length field it claims to include.
        """
        prefix = self.byte_length
        if len(data) < prefix:
            return DataTypeMatch.INVALID
        byteorder = "big" if self.endianness == Endianness.BIG else "little"
        length = int.from_bytes(data[:prefix], byteorder)
        if self.count_includes_length:
            length -= prefix
            if length < 0:
                # a self-inclusive length smaller than the field itself is malformed
                return DataTypeMatch.INVALID
        end = prefix + length
        if len(data) < end:
            return DataTypeMatch.INVALID
        m = expected.matches(data[prefix:end])
        if m:
            m.raw_match = data[:end]
        return m

    PSTRING_TYPE_FORMAT: Pattern[str] = re.compile(r"^pstring(/J?[BHhLl]?J?)?$")

    @classmethod
    def parse(cls, format_str: str) -> "PascalStringType":
        """Parse a declaration such as `pstring`, `pstring/H` or `pstring/lJ` into a `PascalStringType`.

        :raises ValueError: if `format_str` is not a valid pstring type declaration
        """
        m = cls.PSTRING_TYPE_FORMAT.match(format_str)
        if not m:
            raise ValueError(f"Invalid pstring type declaration: {format_str!r}")
        if m.group(1) is None:
            options: Iterable[str] = ()
        else:
            options = m.group(1)
        if "H" in options:
            byte_length = 2
            endianness = Endianness.BIG
        elif "h" in options:
            byte_length = 2
            endianness = Endianness.LITTLE
        elif "L" in options:
            byte_length = 4
            endianness = Endianness.BIG
        elif "l" in options:
            byte_length = 4
            endianness = Endianness.LITTLE
        else:
            # `B` or no width modifier at all: a single length byte
            byte_length = 1
            endianness = Endianness.BIG
        return PascalStringType(
            byte_length=byte_length,
            endianness=endianness,
            count_includes_length="J" in options
        )
1696
+
1697
+
1698
def posix_to_python_re(match: bytes) -> bytes:
    """Replace POSIX character class names such as `[:digit:]` with Python `re` equivalents.

    Each replacement is the *content* of a character set, so the pattern is expected to use the names
    inside brackets, e.g. `[[:digit:]]` becomes `[0-9]`.

    :param match: a bytes regular expression that may contain POSIX class names
    :return: the pattern with every known class name substituted
    """
    for match_from, replace_with in (
        ("upper", "A-Z"),
        ("lower", "a-z"),
        ("alpha", "A-Za-z"),
        ("digit", "0-9"),
        ("xdigit", "0-9A-Fa-f"),
        ("alnum", "A-Za-z0-9"),
        # the hyphen must be escaped: a bare `+-=` inside a set is the range from `+` to `=`,
        # which would also admit the digits and several unrelated characters
        ("punct", ",./<>?`;':\"\\[\\]{}\\|~!@#$%\\^&*()_+\\-=\\\\"),
        ("blank", " \t"),
        ("space", " \t\n\r\f\v"),
        ("cntrl", "\0-\x1f\x7f"),
        ("graph", "^\0-\x1f\x7f "),
        ("print", "^\0-\x1f\x7f"),
        ("word", "\\w")
    ):
        match = match.replace(f"[:{match_from}:]".encode("utf-8"), replace_with.encode("utf-8"))
    return match
1716
+
1717
+
1718
class RegexType(DataType[Pattern[bytes]]):
    """The `regex` data type: searches the data for a regular expression, within a byte or line limit."""

    def __init__(
            self,
            length: Optional[int] = None,
            case_insensitive: bool = False,
            match_to_start: bool = False,
            limit_lines: bool = False,
            trim: bool = False
    ):
        """
        :param length: search limit, counted in lines when `limit_lines` is set and in bytes otherwise
        :param limit_lines: interpret `length` as a number of lines
        """
        if length is None:
            if limit_lines:
                length = 8 * 1024 // 80  # libmagic assumes 80 bytes per line
            else:
                length = 8 * 1024  # libmagic limits to 8KiB by default
        self.limit_lines: bool = limit_lines
        self.length: int = length
        self.case_insensitive: bool = case_insensitive
        self.match_to_start: bool = match_to_start
        self.trim: bool = trim
        super().__init__(f"regex/{self.length}{['', 'c'][case_insensitive]}{['', 's'][match_to_start]}"
                         f"{['', 'l'][self.limit_lines]}{['', 'T'][self.trim]}")

    DOLLAR_PATTERN = re.compile(rb"(^|[^\\])\$", re.MULTILINE)

    def is_text(self, value: Pattern[bytes]) -> bool:
        """Return whether the pattern source is pure ASCII."""
        try:
            _ = value.pattern.decode("ascii")
            return True
        except UnicodeDecodeError:
            return False

    def parse_expected(self, specification: str) -> Pattern[bytes]:
        """Compile `specification` into a multi-line bytes pattern.

        :raises ValueError: if the resulting regular expression is invalid
        """
        # handle POSIX-style character classes:
        unescaped_spec = posix_to_python_re(unescape(specification))
        flags = re.MULTILINE
        if self.case_insensitive:
            flags |= re.IGNORECASE
        try:
            return re.compile(unescaped_spec, flags)
        except re.error as e:
            raise ValueError(str(e)) from e

    def _build_match(self, matched: bytes) -> DataTypeMatch:
        """Wrap matched bytes in a DataTypeMatch whose value is decoded as UTF-8 when possible."""
        try:
            value = matched.decode("utf-8")
        except UnicodeDecodeError:
            value = matched
        if self.trim:
            value = value.strip()
        return DataTypeMatch(matched, value)

    def match(self, data: bytes, expected: Pattern[bytes]) -> DataTypeMatch:
        """Apply `expected` to `data`, honoring the configured byte or line limit.

        The raw match always runs from the start of `data` to the end of the regex match.
        Returns `DataTypeMatch.INVALID` whenever nothing matches within the limit.
        """
        if self.limit_lines:
            offset = 0
            byte_limit = 80 * self.length  # libmagic uses an implicit byte limit assuming 80 characters per line
            for _ in range(self.length):
                line_offset = data.find(b"\n", offset, byte_limit)
                if line_offset < 0:
                    return DataTypeMatch.INVALID
                m = expected.match(data[offset:line_offset])
                if m:
                    return self._build_match(data[:offset + m.end()])
                offset = line_offset + 1
            # every permitted line was examined without success
            return DataTypeMatch.INVALID
        m = expected.search(data[:self.length])
        if m:
            return self._build_match(data[:m.end()])
        return DataTypeMatch.INVALID

    REGEX_TYPE_FORMAT: Pattern[str] = re.compile(r"^regex(/(?P<length>\d+)?(?P<flags>[cslT]*)(b\d*)?)?$")
    # NOTE: some specification files like `cad` use `regex/b`, which is undocumented, and it's unclear from the libmagic
    #       source code whether it is simply ignored or if it has a purpose. We ignore it here.

    @classmethod
    def parse(cls, format_str: str) -> "RegexType":
        """Parse a declaration such as `regex`, `regex/100` or `regex/20l` into a `RegexType`.

        :raises ValueError: if `format_str` is not a valid regex type declaration
        """
        m = cls.REGEX_TYPE_FORMAT.match(format_str)
        if not m:
            raise ValueError(f"Invalid regex type declaration: {format_str!r}")
        if m.group("flags") is None:
            options: Iterable[str] = ()
        else:
            options = m.group("flags")
        if m.group("length") is None:
            length: Optional[int] = None
        else:
            length = int(m.group("length"))
        return RegexType(
            length=length,
            case_insensitive="c" in options,
            match_to_start="s" in options,
            limit_lines="l" in options,
            trim="T" in options
        )
1822
+
1823
+
1824
# Lookup table from a numeric type's name to its enum member
BASE_NUMERIC_TYPES_BY_NAME: Dict[str, "BaseNumericDataType"] = {}


# strftime() layouts used when rendering date and time values as text
DATETIME_FORMAT: str = "%a %b %e %H:%M:%S %Y"
DATE_FORMAT: str = "%a %b %e %Y"
TIME_FORMAT: str = "%H:%M:%S"
1830
+
1831
+
1832
def local_date(ms_since_epoch: int) -> str:
    """Format a timestamp given in milliseconds since the epoch, in local time, using DATETIME_FORMAT."""
    seconds = ms_since_epoch / 1000.0
    return strftime(DATETIME_FORMAT, localtime(seconds))
1834
+
1835
+
1836
def utc_date(ms_since_epoch: int) -> str:
    """Format a timestamp given in milliseconds since the epoch, in UTC, using DATETIME_FORMAT."""
    seconds = ms_since_epoch / 1000.0
    return strftime(DATETIME_FORMAT, gmtime(seconds))
1838
+
1839
+
1840
def msdos_date(value: int) -> str:
    """Render a packed 16-bit MS-DOS date using DATE_FORMAT.

    Bit layout: bits 0-4 hold the day of the month (1-31), bits 5-8 the month (1-12) and bits 9-15
    the number of years since 1980. Day and month are already one-based, so they are used as stored;
    a zero day or month (as found in zeroed-out fields) is clamped to 1 rather than rejected.

    :raises ValueError: (from `datetime`) if the fields do not form a valid calendar date
    """
    day = value & 0b11111
    month = (value >> 5) & 0b1111
    year = 1980 + ((value >> 9) & 0b1111111)
    return strftime(DATE_FORMAT, datetime(year, max(month, 1), max(day, 1)).timetuple())
1847
+
1848
+
1849
def msdos_time(value: int) -> str:
    """Render a packed 16-bit MS-DOS time using TIME_FORMAT.

    Bit layout: bits 0-4 hold the seconds divided by two, bits 5-10 the minutes and bits 11-15 the hour.

    :raises ValueError: (from `datetime`) if a field is out of range for a time of day
    """
    seconds = (value & 0b11111) * 2
    minutes = (value >> 5) & 0b111111
    hour = (value >> 11) & 0b11111
    moment = datetime(1, 1, 1, hour, minutes, seconds)
    return strftime(TIME_FORMAT, moment.timetuple())
1856
+
1857
+
1858
class BaseNumericDataType(Enum):
    """The base numeric types.

    Each member's value is a tuple of (type name, `struct` format character, size in bytes[, converter]).
    The optional converter maps the unpacked number to the value that is reported; it defaults to the
    identity. Every member registers itself in `BASE_NUMERIC_TYPES_BY_NAME` under its type name.
    """

    BYTE = ("byte", "b", 1)
    BYTE1 = ("1", "b", 1)
    SHORT = ("short", "h", 2)
    SHORT2 = ("2", "h", 2)
    LONG = ("long", "l", 4)
    LONG4 = ("4", "l", 4)
    QUAD = ("quad", "q", 8)
    QUAD8 = ("8", "q", 8)
    FLOAT = ("float", "f", 4)
    DOUBLE = ("double", "d", 8)
    # the date helpers take milliseconds, hence the scaling of the stored number by 1000
    DATE = ("date", "L", 4, lambda n: utc_date(n * 1000))
    QDATE = ("qdate", "Q", 8, lambda n: utc_date(n * 1000))
    LDATE = ("ldate", "L", 4, lambda n: local_date(n * 1000))
    QLDATE = ("qldate", "Q", 8, lambda n: local_date(n * 1000))
    QWDATE = ("qwdate", "Q", 8)
    MSDOSDATE = ("msdosdate", "h", 2, msdos_date)
    MSDOSTIME = ("msdostime", "h", 2, msdos_time)

    def __init__(
            self, name: str,
            struct_fmt: str,
            num_bytes: int,
            to_value: Callable[[int], Any] = lambda n: n
    ):
        self.struct_fmt: str = struct_fmt
        self.num_bytes: int = num_bytes
        self.to_value: Callable[[int], Any] = to_value
        # `name` here is the type name from the member's tuple, used as the registry key
        BASE_NUMERIC_TYPES_BY_NAME[name] = self
1887
+
1888
+
1889
# Lookup table from an operator's symbol (e.g. "=" or "&") to its enum member
NUMERIC_OPERATORS_BY_SYMBOL: Dict[str, "NumericOperator"] = {}
1890
+
1891
+
1892
class NumericOperator(Enum):
    """The comparison operators available to numeric tests.

    Each member's value is a (symbol, predicate) tuple. The predicate receives the value read from the
    file as `a` and the expected value as `b`. Every member registers itself in
    `NUMERIC_OPERATORS_BY_SYMBOL` under its symbol.
    """

    EQUALS = ("=", lambda a, b: a == b)
    LESS_THAN = ("<", lambda a, b: a < b)
    GREATER_THAN = (">", lambda a, b: a > b)
    ALL_BITS_SET = ("&", lambda a, b: (a & b) == b)  # value from the file (a) must have set all bits set in b
    ALL_BITS_CLEAR = ("^", lambda a, b: not (a & b))  # value from the file (a) must have clear all bits set in b
    NOT = ("!", lambda a, b: not (a == b))

    def __init__(self, symbol: str, test: Union[
        Callable[[int, int], bool],
        Callable[[float, float], bool],
        Callable[[CStyleInt, CStyleInt], bool]
    ]):
        self.symbol: str = symbol
        self.test: Union[
            Callable[[int, int], bool], Callable[[float, float], bool], Callable[[CStyleInt, CStyleInt], bool]
        ] = test
        NUMERIC_OPERATORS_BY_SYMBOL[symbol] = self

    @staticmethod
    def get(symbol: str) -> "NumericOperator":
        """Return the operator registered for `symbol`.

        :raises KeyError: if no operator uses that symbol
        """
        return NUMERIC_OPERATORS_BY_SYMBOL[symbol]

    def __str__(self):
        return self.symbol
1917
+
1918
+
1919
class NumericValue(Generic[T]):
    """An expected numeric value paired with the operator used to compare against it."""

    def __init__(self, value: T, operator: NumericOperator = NumericOperator.EQUALS):
        self.operator: NumericOperator = operator
        self.value: T = value

    def test(
        self,
        to_match: T,
        unsigned: bool,
        num_bytes: int,
        preprocess: Callable[[T], T] = lambda x: x
    ) -> bool:
        """Apply ``preprocess`` to ``to_match`` and compare the result with the expected value.

        ``unsigned`` and ``num_bytes`` are accepted for interface compatibility with subclasses
        and are not used here.
        """
        candidate = preprocess(to_match)
        return self.operator.test(candidate, self.value)

    @staticmethod
    def parse(value: str, num_bytes: int) -> "NumericValue":
        """Parse ``value`` as an integer if possible, otherwise as a float.

        :raises ValueError: if neither parser accepts the (stripped) text
        """
        text = value.strip()
        for parser in (IntegerValue, FloatValue):
            try:
                return parser.parse(text, num_bytes)
            except ValueError:
                # fall through to the next, more permissive parser
                continue
        raise ValueError(f"Could not parse numeric type {text!r}")

    def __str__(self):
        """Render as the operator immediately followed by the value."""
        return f"{self.operator}{self.value!s}"
1942
+
1943
+
1944
class NumericWildcard(NumericValue):
    """A numeric expectation that accepts every value."""

    def __init__(self):
        # there is no expected value to compare against
        super().__init__(None)

    def test(
        self,
        to_match,
        unsigned,
        num_bytes,
        preprocess: Callable[[int], int] = lambda x: x
    ) -> bool:
        """Always succeed; none of the arguments are inspected."""
        return True
1950
+
1951
+
1952
class IntegerValue(NumericValue[int]):
    """An expected integer value, compared using fixed-width C-style integer semantics."""

    def test(
        self,
        to_match: int,
        unsigned: bool,
        num_bytes: int,
        preprocess: Callable[[CStyleInt], CStyleInt] = lambda x: x
    ) -> bool:
        """Compare ``to_match`` against the expected value.

        Both numbers are first converted with ``make_c_style_int`` using the given width and
        signedness; ``preprocess`` is applied to the converted file value only.
        """
        to_test = make_c_style_int(value=self.value, num_bytes=num_bytes, signed=not unsigned)
        to_match = make_c_style_int(value=to_match, num_bytes=num_bytes, signed=not unsigned)
        return self.operator.test(preprocess(to_match), to_test)

    @staticmethod
    def parse(value: Union[str, bytes], num_bytes: int) -> "IntegerValue":
        """Parse an expected integer with an optional leading operator and optional ``~``.

        :param value: specification text; ``bytes`` are decoded as UTF-8
        :param num_bytes: width used to compute the bitwise complement for ``~``
        :raises ValueError: if the specification is empty (or empty after its operator) or
            the number itself cannot be parsed
        """
        if isinstance(value, bytes):
            value = value.decode("utf-8")
        if not value:
            # report an empty specification as ValueError (the exception
            # NumericValue.parse handles) instead of letting value[0] raise IndexError
            raise ValueError("Cannot parse an integer from an empty specification")
        try:
            operator = NumericOperator.get(value[0])
            value = value[1:]
        except KeyError:
            # no recognised operator prefix, so the comparison is equality
            operator = NumericOperator.EQUALS
        if not value:
            raise ValueError(f"Operator {operator.symbol!r} is not followed by a value")
        if value[0] == "~":
            int_value = parse_numeric(value[1:])
            # bitwise complement within the width of the data type
            int_value = (1 << (num_bytes * 8)) - 1 - int_value
        else:
            int_value = parse_numeric(value)
        return IntegerValue(value=int_value, operator=operator)
1979
+
1980
+
1981
class FloatValue(NumericValue[float]):
    """An expected floating point value."""

    @staticmethod
    def parse(value: str, num_bytes: int) -> "FloatValue":
        """Parse an expected float with an optional leading operator.

        :param value: specification text
        :param num_bytes: unused; present for signature parity with ``IntegerValue.parse``
        :raises ValueError: if the specification is empty, uses a bitwise operator, or the
            number cannot be parsed by ``float()``
        """
        if not value:
            # value[0] would raise IndexError; callers expect ValueError for bad input
            raise ValueError("Cannot parse a float from an empty specification")
        try:
            operator = NumericOperator.get(value[0])
            value = value[1:]
        except KeyError:
            # no recognised operator prefix, so the comparison is equality
            operator = NumericOperator.EQUALS
        if operator in (NumericOperator.ALL_BITS_SET, NumericOperator.ALL_BITS_CLEAR):
            raise ValueError(f"A floating point value cannot have the {operator.symbol} operator")
        return FloatValue(value=float(value), operator=operator)
1992
+
1993
+
1994
class NumericDataType(DataType[NumericValue]):
    """A numeric data type: a base type plus signedness, byte order, and an optional
    arithmetic preprocessing step applied to the value read from the file."""

    def __init__(
        self,
        name: str,
        base_type: BaseNumericDataType,
        unsigned: bool = False,
        endianness: Endianness = Endianness.NATIVE,
        preprocess: Callable[[int], int] = lambda x: x
    ):
        super().__init__(name)
        self.base_type: BaseNumericDataType = base_type
        self.unsigned: bool = unsigned
        self.endianness: Endianness = endianness
        self.preprocess: Callable[[int], int] = preprocess
        pdp_requested = self.endianness == Endianness.PDP
        if pdp_requested and self.base_type.num_bytes != 4:
            raise ValueError(f"PDP endianness can only be used with four byte base types, not {self.base_type}")

    def is_text(self, value: NumericValue) -> bool:
        """Numeric types never count as text."""
        return False

    def parse_expected(self, specification: str) -> NumericValue:
        """Turn the expected-value text into a wildcard (for ``x``) or a parsed value."""
        if specification.strip() != "x":
            return NumericValue.parse(specification, self.base_type.num_bytes)
        return NumericWildcard()

    def match(self, data: bytes, expected: NumericValue) -> DataTypeMatch:
        """Unpack a number from the start of ``data`` and test it against ``expected``.

        Returns ``DataTypeMatch.INVALID`` when there are too few bytes, unpacking fails, or
        the test does not pass.
        """
        width = self.base_type.num_bytes
        if len(data) < width:
            return DataTypeMatch.INVALID
        if self.endianness == Endianness.PDP:
            assert width == 4
            if self.unsigned:
                # two little-endian 16-bit words, most significant word first
                high_word, = struct.unpack("<H", data[:2])
                low_word, = struct.unpack("<H", data[2:4])
                value = (high_word << 16) | low_word
            else:
                # swap the bytes inside each word, then read as signed big-endian
                swapped = bytes([data[1], data[0], data[3], data[2]])
                value, = struct.unpack(">i", swapped)
        else:
            type_code = self.base_type.struct_fmt
            floating = (BaseNumericDataType.DOUBLE, BaseNumericDataType.FLOAT)
            if self.unsigned and self.base_type not in floating:
                # upper-case struct codes are the unsigned variants
                type_code = type_code.upper()
            full_format = f"{self.endianness.value}{type_code}"
            try:
                value, = struct.unpack(full_format, data[:width])
            except struct.error:
                return DataTypeMatch.INVALID
        if not expected.test(value, self.unsigned, width, self.preprocess):
            return DataTypeMatch.INVALID
        processed = self.preprocess(value)
        return DataTypeMatch(data[:width], self.base_type.to_value(processed))

    @staticmethod
    def parse(fmt: str) -> "NumericDataType":
        """Build a data type from its textual name.

        The name is read as: optional ``u`` (unsigned), optional byte-order prefix
        (``le``/``be``/``me``), base type name, optional arithmetic operator and operand.

        :raises ValueError: for unsigned float/double or an unknown base type name
        """
        name = fmt
        unsigned = fmt.startswith("u")
        if unsigned:
            fmt = fmt[1:]
            if fmt.startswith("double") or fmt.startswith("float"):
                raise ValueError(f"{name[1:]} cannot be unsigned")

        endianness = Endianness.NATIVE
        byte_orders = (("le", Endianness.LITTLE), ("be", Endianness.BIG), ("me", Endianness.PDP))
        for prefix, byte_order in byte_orders:
            if fmt.startswith(prefix):
                endianness = byte_order
                fmt = fmt[2:]
                break

        # symbols are tried in this fixed order; the first one found after position 0 wins
        arithmetic = (
            ("&", lambda a, b: a & b),
            ("%", lambda a, b: a % b),
            ("+", lambda a, b: a + b),
            ("-", lambda a, b: a - b),
            ("^", lambda a, b: a ^ b),
            # note: both quotients are evaluated before one is selected
            ("/", lambda a, b: [a // b, a / b][isinstance(a, float)]),
            ("*", lambda a, b: a * b),
            ("|", lambda a, b: a | b)
        )
        preprocess = None
        for symbol, operation in arithmetic:
            position = fmt.find(symbol)
            if position > 0:
                operand = parse_numeric(fmt[position + 1:])
                preprocess = lambda n: operation(n, operand)
                fmt = fmt[:position]
                break
        if preprocess is None:
            preprocess = lambda n: n

        if fmt not in BASE_NUMERIC_TYPES_BY_NAME:
            raise ValueError(f"Invalid numeric data type: {name!r}")
        return NumericDataType(
            name=name,
            base_type=BASE_NUMERIC_TYPES_BY_NAME[fmt],
            unsigned=unsigned,
            endianness=endianness,
            preprocess=preprocess
        )
2094
+
2095
+
2096
class ConstantMatchTest(MagicTest, Generic[T]):
    """A test that matches the data at an offset against a constant using a data type."""

    def __init__(
        self,
        offset: Offset,
        data_type: DataType[T],
        constant: T,
        mime: Optional[str] = None,
        extensions: Iterable[str] = (),
        message: str = "",
        parent: Optional["MagicTest"] = None
    ):
        super().__init__(offset=offset, mime=mime, extensions=extensions, message=message, parent=parent)
        self.constant: T = constant
        self.data_type: DataType[T] = data_type

    def subtest_type(self) -> TestType:
        """Report TEXT when the data type considers the constant textual, otherwise BINARY."""
        return TestType.TEXT if self.data_type.is_text(self.constant) else TestType.BINARY

    def calculate_absolute_offset(self, data: bytes, parent_match: Optional[TestResult] = None) -> int:
        """Resolve the offset, letting the data type decide whether invalid offsets are allowed."""
        tolerate_invalid = self.data_type.allows_invalid_offsets(self.constant)
        return self.offset.to_absolute(data, parent_match, tolerate_invalid)

    def test(self, data: bytes, absolute_offset: int, parent_match: Optional[TestResult]) -> TestResult:
        """Match the constant against the data starting at ``absolute_offset``."""
        outcome = self.data_type.match(data[absolute_offset:], self.constant)
        if not outcome:
            return FailedTest(
                self,
                offset=absolute_offset,
                parent=parent_match,
                message=f"expected {self.constant!s}"
            )
        return MatchedTest(
            self,
            offset=absolute_offset + outcome.initial_offset,
            length=len(outcome.raw_match),
            value=outcome.value,
            parent=parent_match
        )
2132
+
2133
+
2134
class OffsetMatchTest(MagicTest):
    """A test that compares the resolved absolute offset itself against an expected integer."""

    def __init__(
        self,
        offset: Offset,
        value: IntegerValue,
        mime: Optional[str] = None,
        extensions: Iterable[str] = (),
        message: str = "",
        parent: Optional["MagicTest"] = None
    ):
        super().__init__(offset=offset, mime=mime, extensions=extensions, message=message, parent=parent)
        self.value: IntegerValue = value

    def subtest_type(self) -> TestType:
        return TestType.UNKNOWN

    def test(self, data: bytes, absolute_offset: int, parent_match: Optional[TestResult]) -> TestResult:
        """Succeed when the absolute offset satisfies the expected value.

        The offset is compared as an unsigned eight-byte integer. On success the reported
        match starts at offset 0, has length ``absolute_offset``, and carries the offset as
        its value.
        """
        if self.value.test(absolute_offset, unsigned=True, num_bytes=8):
            return MatchedTest(self, offset=0, length=absolute_offset, value=absolute_offset, parent=parent_match)
        else:
            return FailedTest(
                test=self,
                offset=absolute_offset,
                parent=parent_match,
                # use str(): the value classes define __str__ (operator + number) but no
                # __repr__, so !r would only print an object address
                message=f"expected {self.value!s}"
            )
2160
+
2161
+
2162
class IndirectResult(MatchedTest):
    """The zero-length, value-less match produced by an indirect test."""

    def __init__(self, test: "IndirectTest", offset: int, parent: Optional[TestResult] = None):
        super().__init__(test, value=None, offset=offset, length=0, parent=parent)

    def explain(self, writer: ANSIWriter, file: Streamable):
        """Write a dimmed one-line note saying where the indirect test matched."""
        note = f"Indirect test {self.test} matched at offset {self.offset}\n"
        writer.write(note, dim=True)
2168
+
2169
+
2170
class IndirectTest(MagicTest):
    """A test whose match is an :class:`IndirectResult` at the resolved offset."""

    def __init__(
        self,
        matcher: "MagicMatcher",
        offset: Offset,
        relative: bool = False,
        mime: Optional[str] = None,
        extensions: Iterable[str] = (),
        message: str = "",
        parent: Optional[MagicTest] = None
    ):
        super().__init__(offset=offset, mime=mime, extensions=extensions, message=message, parent=parent)
        self.matcher: MagicMatcher = matcher
        self.relative: bool = relative
        self.can_match_mime = True
        self.can_be_indirect = True
        self._type = TestType.BINARY
        # propagate the same three flags to every ancestor of this test
        ancestor = parent
        while ancestor is not None:
            ancestor.can_be_indirect = True
            ancestor.can_match_mime = True
            ancestor._type = TestType.BINARY
            ancestor = ancestor.parent

    def subtest_type(self) -> TestType:
        return TestType.BINARY

    def test(self, data: bytes, absolute_offset: int, parent_match: Optional[TestResult]) -> TestResult:
        """Return an indirect result, shifting the offset by the parent's when relative."""
        if not self.relative:
            return IndirectResult(self, absolute_offset, parent_match)
        if parent_match is None:
            return FailedTest(
                test=self,
                offset=absolute_offset,
                parent=parent_match,
                message="the test is relative but it does not have a parent test (this is likely a bug in the magic"
                        " definition file)"
            )
        return IndirectResult(self, absolute_offset + parent_match.offset, parent_match)
2209
+
2210
+
2211
class NamedTest(MagicTest):
    """A named, parentless test that is only ever run on behalf of a ``use`` test."""

    def __init__(
        self,
        name: str,
        offset: Offset,
        mime: Optional[str] = None,
        extensions: Iterable[str] = (),
        message: str = ""
    ):
        assert isinstance(offset, AbsoluteOffset) and offset.offset == 0
        if not message:
            # by default, named tests should not add a space if they don't contain an explicit message
            message = "\b"

        class NamedTestOffset(Offset):
            """Offset that resolves to wherever the previous match is located."""

            def to_absolute(self, data: bytes, last_match: Optional[TestResult], allow_invalid: bool = False) -> int:
                assert last_match is not None
                return last_match.offset

        # the supplied offset is replaced by one that follows the calling match
        super().__init__(offset=NamedTestOffset(), mime=mime, extensions=extensions, message=message, parent=None)
        self.name: str = name
        self.named_test = self
        self.used_by: Set[UseTest] = set()

    def subtest_type(self) -> TestType:
        return TestType.UNKNOWN

    def test(self, data: bytes, absolute_offset: int, parent_match: Optional[TestResult]) -> MatchedTest:
        """Produce a zero-length match positioned just past the end of the parent match.

        :raises ValueError: if there is no parent match
        """
        if parent_match is None:
            raise ValueError("A named test must always be called from a `use` test.")
        end_of_parent = parent_match.offset + parent_match.length
        return MatchedTest(self, offset=end_of_parent, length=0, value=self.name, parent=parent_match)

    def __str__(self):
        return self.name
2247
+
2248
+
2249
class UseTest(MagicTest):
    """A test that delegates to a :class:`NamedTest` and then runs its own children."""

    def __init__(
        self,
        referenced_test: NamedTest,
        offset: Offset,
        mime: Optional[str] = None,
        extensions: Iterable[str] = (),
        message: str = "",
        parent: Optional["MagicTest"] = None,
        flip_endianness: bool = False,
        late_binding: bool = False
    ):
        super().__init__(offset=offset, mime=mime, extensions=extensions, message=message, parent=parent)
        self.referenced_test: NamedTest = referenced_test
        self.flip_endianness: bool = flip_endianness
        self.late_binding: bool = late_binding
        # back-reference so the named test knows who uses it
        referenced_test.used_by.add(self)

    def subtest_type(self) -> TestType:
        return self.referenced_test.test_type

    def referenced_tests(self) -> Set[NamedTest]:
        """Return the named tests reachable from here, including the one used directly.

        The referenced test's own references are skipped when this test belongs to a named
        test with the same name as the referenced one.
        """
        collected = super().referenced_tests() | {self.referenced_test}
        owner = self.named_test
        if owner is None or owner.name != self.referenced_test.name:
            collected |= self.referenced_test.referenced_tests()
        return collected

    def _match(self, context: MatchContext, parent_match: Optional[TestResult] = None) -> Iterator[TestResult]:
        """Yield this test's match, the named test's results, then the children's results.

        Nothing is yielded when the offset is invalid or the named test produces no results.
        """
        if self.flip_endianness:
            raise NotImplementedError("TODO: Add support for use tests with flipped endianness")
        try:
            absolute_offset = self.offset.to_absolute(context.data, last_match=parent_match)
        except InvalidOffsetError:
            return
        preview = context.data[absolute_offset:absolute_offset + 20]
        log.trace(
            f"{self.source_info!s}\tTrue\t{absolute_offset}\t{preview!r}"
        )
        use_match = MatchedTest(self, None, absolute_offset, 0, parent=parent_match)
        announced = False
        for named_result in self.referenced_test._match(context, use_match):
            if not announced:
                # our own match is only reported once the named test yields something
                announced = True
                yield use_match
            yield named_result
        if not announced:
            # the named test did not match anything, so don't try any of our children
            return
        if context.only_match_mime and not self.can_match_mime:
            # none of our children can produce a mime type
            return
        for child in self.children:
            if child.can_match_mime or not context.only_match_mime:
                yield from child._match(context=context, parent_match=use_match)

    def test(self, data: bytes, absolute_offset: int, parent_match: Optional[TestResult]) -> TestResult:
        raise NotImplementedError("This function should never be called")
2305
+
2306
+
2307
class JSONTest(MagicTest):
    """A test that matches when everything from the offset onwards parses as JSON."""

    def test(self, data: bytes, absolute_offset: int, parent_match: Optional[TestResult]) -> Optional[TestResult]:
        """Parse the tail of ``data`` as JSON; the decoded document becomes the match value."""
        remaining = len(data) - absolute_offset
        try:
            document = json.loads(data[absolute_offset:])
            return MatchedTest(self, offset=absolute_offset, length=remaining, value=document, parent=parent_match)
        except (json.JSONDecodeError, UnicodeDecodeError) as error:
            # the parser's own explanation is used as the failure message
            return FailedTest(test=self, offset=absolute_offset, parent=parent_match, message=str(error))

    def subtest_type(self) -> TestType:
        return TestType.TEXT
2323
+
2324
+
2325
class CSVTest(MagicTest):
    """A test that matches UTF-8 text which parses as consistent CSV in a registered dialect."""

    def test(self, data: bytes, absolute_offset: int, parent_match: Optional[TestResult]) -> TestResult:
        """Try every registered CSV dialect and report the first that parses consistently.

        A dialect is accepted when the first row has at least two columns and every later
        row has the same number of columns; the dialect name becomes the match value.
        """
        try:
            text = data[absolute_offset:].decode("utf-8")
        except UnicodeDecodeError as error:
            return FailedTest(test=self, offset=absolute_offset, parent=parent_match, message=str(error))
        for dialect in csv.list_dialects():
            rows = csv.reader(StringIO(text, newline=""), dialect=dialect)
            expected_width = None
            consistent = False
            try:
                for row in rows:
                    if expected_width is None:
                        expected_width = len(row)
                        if expected_width < 2:
                            # a CSV should have at least two columns
                            break
                        consistent = True
                    elif len(row) != expected_width:
                        # every row of the CSV should have the same number of columns
                        consistent = False
                        break
            except csv.Error:
                # this dialect cannot parse the text; try the next one
                continue
            if consistent:
                # every row was valid, and we had at least one row
                return MatchedTest(
                    self,
                    offset=absolute_offset,
                    length=len(data) - absolute_offset,
                    value=dialect,
                    parent=parent_match
                )
        return FailedTest(
            test=self,
            offset=absolute_offset,
            parent=parent_match,
            message=f"the input did not match a known CSV dialect ({', '.join(csv.list_dialects())})"
        )

    def subtest_type(self) -> TestType:
        return TestType.TEXT
2362
+
2363
+
2364
class DefaultTest(MagicTest):
    """A test that matches only while the parent has no matched child recorded."""

    def subtest_type(self) -> TestType:
        return TestType.UNKNOWN

    def test(self, data: bytes, absolute_offset: int, parent_match: Optional[TestResult]) -> TestResult:
        """Fail if the parent match is flagged as having a matched child; otherwise match."""
        if parent_match is not None and parent_match.child_matched:
            return FailedTest(
                self,
                offset=absolute_offset,
                parent=parent_match,
                message="the parent test already has a child that matched"
            )
        return MatchedTest(self, offset=absolute_offset, length=0, value=True, parent=parent_match)
2374
+
2375
+
2376
class ClearTest(MagicTest):
    """A test that always matches and resets the parent's ``child_matched`` flag."""

    def subtest_type(self) -> TestType:
        return TestType.UNKNOWN

    def test(self, data: bytes, absolute_offset: int, parent_match: Optional[TestResult]) -> MatchedTest:
        """Clear the parent's flag (when there is a parent) and return a zero-length match."""
        if parent_match is not None:
            parent_match.child_matched = False
            return MatchedTest(self, offset=absolute_offset, length=0, parent=parent_match, value=None)
        return MatchedTest(self, offset=absolute_offset, length=0, value=None)
2386
+
2387
+
2388
class DERTest(MagicTest):
    """Placeholder for the DER test type, which is not implemented."""

    def subtest_type(self) -> TestType:
        return TestType.BINARY

    def test(self, data: bytes, absolute_offset: int, parent_match: Optional[TestResult]) -> TestResult:
        """Always raise, because DER matching is not supported yet."""
        reason = "TODO: Implement support for the DER test (e.g., using the Kaitai asn1_der.py parser)"
        raise NotImplementedError(reason)
2396
+
2397
+
2398
class PlainTextTest(MagicTest):
    """A test that matches when a character-encoding detector is confident the data is text.

    An instance may only be used for one successful match, because matching replaces
    ``self.message`` with the detected encoding.
    """
    AUTO_REGISTER_TEST = False

    def __init__(
        self,
        offset: Offset = AbsoluteOffset(0),
        mime: Union[str, TernaryExecutableMessage] = "text/plain",
        extensions: Iterable[str] = ("txt",),
        parent: Optional["MagicTest"] = None,
        comments: Iterable[Comment] = (),
        minimum_encoding_confidence: float = 0.5
    ):
        super().__init__(offset, mime, extensions, "", parent, comments)
        self.minimum_encoding_confidence: float = minimum_encoding_confidence

    def subtest_type(self) -> TestType:
        return TestType.TEXT

    def test(self, data: bytes, absolute_offset: int, parent_match: Optional[TestResult]) -> TestResult:
        """Detect the text encoding of ``data`` starting at ``absolute_offset``.

        On success the match value is the decoded text, or the raw bytes when decoding with
        the detected encoding is not possible.

        :raises ValueError: if this instance's message has already been set
        """
        if not isinstance(self.message, ConstantMessage) or self.message.message:
            raise ValueError("A new PlainTextTest must be constructed for each call to .test")
        detector = UniversalDetector()
        offset = absolute_offset
        # feeding stops once the read position passes this absolute offset (5 MB)
        limit = min(len(data), 5000000)
        while not detector.done and offset < limit:
            # feed 1kB at a time until the detector reports it is done
            detector.feed(data[offset:offset+1024])
            offset += 1024
        detector.close()
        encoding = detector.result["encoding"]
        # a missing encoding cannot be used for decoding, whatever the confidence threshold
        if encoding is not None and detector.result["confidence"] >= self.minimum_encoding_confidence:
            try:
                value = data[absolute_offset:].decode(encoding)
            except (UnicodeDecodeError, LookupError):
                # LookupError: the detector can name encodings Python has no codec for;
                # fall back to the raw bytes just like for undecodable data
                value = data[absolute_offset:]
            self.message = ConstantMessage(f"{encoding} text")
            return MatchedTest(self, offset=absolute_offset, length=len(data) - absolute_offset, parent=parent_match,
                               value=value)
        else:
            return FailedTest(self, offset=absolute_offset, parent=parent_match, message="the data do not appear to "
                                                                                         "be encoded in a text format")
2439
+
2440
+
2441
class OctetStreamTest(MagicTest):
    """Catch-all test that matches any data as a generic octet stream."""
    AUTO_REGISTER_TEST = False

    def __init__(
        self,
        offset: Offset = AbsoluteOffset(0),
        mime: Union[str, TernaryExecutableMessage] = "application/octet-stream",
        extensions: Iterable[str] = (),
        message: Union[str, Message] = "data",
        parent: Optional["MagicTest"] = None,
        comments: Iterable[Comment] = ()
    ):
        super().__init__(offset, mime, extensions, message, parent, comments)

    def subtest_type(self) -> TestType:
        return TestType.BINARY

    def test(self, data: bytes, absolute_offset: int, parent_match: Optional[TestResult]) -> TestResult:
        """Always match; the match spans from the offset to the end and carries ``data``."""
        # Everything is an octet stream!
        span = len(data) - absolute_offset
        return MatchedTest(self, offset=absolute_offset, length=span, parent=parent_match, value=data)
2462
+
2463
+
2464
# A test line: zero or more ">" characters (level), an offset token that does not start
# with "!", a data type token, and then the remainder of the line.
TEST_PATTERN: Pattern[str] = re.compile(
    r"^(?P<level>[>]*)(?P<offset>[^\s!][^\s]*)\s+(?P<data_type>[^\s]+)\s+(?P<remainder>.+)$"
)
# A "!:mime" line: captures the mime text and an optional trailing "#" comment.
MIME_PATTERN: Pattern[str] = re.compile(r"^!:mime\s+([^#]+?)\s*(#.*)?$")
# A "!:ext" line: captures a whitespace-free token and an optional trailing "#" comment.
EXTENSION_PATTERN: Pattern[str] = re.compile(r"^!:ext\s+([^\s]+)\s*(#.*)?$")
2469
+
2470
+
2471
+ def _split_with_escapes(text: str) -> Tuple[str, str]:
2472
+ first_length = 0
2473
+ escaped = False
2474
+ delimiter_length = 1
2475
+ for c in text:
2476
+ if escaped:
2477
+ escaped = False
2478
+ elif c == "\\":
2479
+ escaped = True
2480
+ elif c == "\n":
2481
+ if first_length > 0 and text[first_length - 1] == "\r":
2482
+ # strip the \r from trailing \r\n
2483
+ first_length -= 1
2484
+ delimiter_length = 2
2485
+ break
2486
+ elif c == " " or c == "\t":
2487
+ break
2488
+ first_length += 1
2489
+ return text[:first_length], text[first_length + delimiter_length:]
2490
+
2491
+
2492
class Match:
    """A lazily evaluated sequence of test results produced by one top-level test.

    Results are pulled from the underlying iterator on demand and cached in ``_results``;
    ``_result_iter`` is set to ``None`` once the iterator is exhausted.
    """

    def __init__(
        self, matcher: "MagicMatcher", context: MatchContext, results: Iterable[TestResult]
    ):
        self.matcher: MagicMatcher = matcher
        self.context: MatchContext = context
        self._result_iter: Optional[Iterator[TestResult]] = iter(results)
        self._results: List[TestResult] = []

    @property
    def data(self) -> bytes:
        return self.context.data

    @property
    def only_match_mime(self) -> bool:
        return self.context.only_match_mime

    @property
    def mimetypes(self) -> LazyIterableSet[str]:
        """The resolved MIME types of every result whose test defines one."""
        return LazyIterableSet((
            result.test.mime.resolve(self.context) for result in self if result.test.mime is not None)
        )

    @property
    def extensions(self) -> LazyIterableSet[str]:
        """The extensions of the tests behind every result."""
        def _extensions():
            for result in self:
                yield from result.test.extensions
        return LazyIterableSet(_extensions())

    def explain(self, file: Streamable, ansi_color: Optional[bool] = None) -> str:
        """Return the concatenated explanations of all results.

        ANSI colour defaults to whether standard output is a terminal.
        """
        if ansi_color is None:
            ansi_color = sys.stdout.isatty()
        writer = ANSIWriter(use_ansi=ansi_color)
        for result in self:
            result.explain(writer, file=file)
        return str(writer)

    def __bool__(self):
        return any(m for m in self.mimetypes) or any(e for e in self.extensions) or bool(self.message())

    def __len__(self):
        if self._result_iter is not None:
            # we have not yet finished collecting the results
            for _ in self:
                pass
            assert self._result_iter is None
        return len(self._results)

    def __getitem__(self, index: int) -> TestResult:
        """Return the result at ``index``, pulling more results only as far as needed.

        A negative index needs the total length, so it consumes all remaining results.

        :raises IndexError: if fewer results exist than ``index`` requires
        """
        # Keep pulling while the requested position is not cached yet. (The comparison used
        # to be ``index <= len``, which drained everything for index 0 and pulled nothing
        # for a larger, not-yet-cached index.)
        while self._result_iter is not None and (index < 0 or index >= len(self._results)):
            try:
                result = next(self._result_iter)
            except StopIteration:
                self._result_iter = None
                break
            self._results.append(result)
            if isinstance(result, IndirectResult):
                # an indirect result re-runs the whole matcher on the data at its offset
                for match in self.matcher.match(self.context[result.offset:]):
                    self._results.extend(match)
        return self._results[index]

    def __iter__(self) -> Iterator[TestResult]:
        if self._result_iter is None:
            yield from self._results
            return
        i = 0
        while True:
            try:
                yield self[i]
            except IndexError:
                break
            i += 1

    def message(self) -> str:
        """Build the description by joining the resolved messages of all results.

        Messages are separated by a single space unless a message starts with a backspace
        character. printf-style placeholders are filled in with the result's value; a
        message that cannot be formatted is logged and included unformatted.
        """
        msg = ""
        for result in self:
            m = result.test.message.resolve(self.context).lstrip()
            if not m:
                continue
            elif m.startswith("\b"):
                # a leading backspace suppresses the separating space
                result_str = m[1:]
            else:
                result_str = m
                if msg and not msg[-1] in " \t\r\n\v\f":
                    msg = f"{msg} "
            # only integers can be reinterpreted as unsigned; other values (None, str, ...)
            # must not be compared with 0
            if "%u" in result_str and isinstance(result.value, int) and result.value < 0:
                # sometimes we parsed a negative value and want to print it as an unsigned int:
                result_str = result_str % (result.value + 2**(8 * result.length),)
            elif "%" in result_str.replace("%%", ""):
                result_str = result_str.replace("%ll", "%")
                result_str = result_str.replace("%#ll", "0x%")
                try:
                    result_str = result_str % (result.value,)
                except (TypeError, ValueError) as e:
                    # %-formatting raises TypeError when the value's type does not fit the
                    # placeholder and ValueError for a malformed format string
                    log.error(f"Error formatting message {result_str!r} with value {result.value!r}: {e!s}")
            result_str = result_str.replace("%%", "%")
            msg = f"{msg}{result_str}"
        return msg

    __str__ = message
2593
+
2594
+
2595
class DefaultMagicMatcher:
    """Descriptor that lazily builds and caches one shared default matcher.

    The cache lives on this class, so every owner of the descriptor sees the same instance.
    """
    _DEFAULT_INSTANCE: Optional["MagicMatcher"] = None

    def __get__(self, instance, owner) -> "MagicMatcher":
        """Return the cached matcher, parsing the bundled definitions on first access."""
        cached = DefaultMagicMatcher._DEFAULT_INSTANCE
        if cached is None:
            # FIXME: skip the DER definition for now because we don't yet support it
            supported_defs = [d for d in MAGIC_DEFS if d.name != "der"]
            cached = MagicMatcher.parse(*supported_defs)
            DefaultMagicMatcher._DEFAULT_INSTANCE = cached
        return cached

    def __set__(self, instance, value: Optional["MagicMatcher"]):
        """Replace the cached matcher."""
        DefaultMagicMatcher._DEFAULT_INSTANCE = value

    def __delete__(self, instance):
        """Drop the cached matcher so that it is rebuilt on the next access."""
        DefaultMagicMatcher._DEFAULT_INSTANCE = None
2610
+
2611
+
2612
+ class MagicMatcher:
2613
+ DEFAULT_INSTANCE: "MagicMatcher" = DefaultMagicMatcher() # type: ignore
2614
+
2615
def __init__(self, tests: Iterable[MagicTest] = ()):
    """Create a matcher and register each of the given tests through :meth:`add`."""
    # registered tests: unnamed ones in order, named ones by name
    self._tests: List[MagicTest] = []
    self.named_tests: Dict[str, NamedTest] = {}
    # derived indexes, rebuilt on demand whenever the matcher is marked dirty
    self._tests_by_mime: Dict[str, Set[MagicTest]] = defaultdict(set)
    self._tests_by_ext: Dict[str, Set[MagicTest]] = defaultdict(set)
    self._tests_that_can_be_indirect: Set[MagicTest] = set()
    self._non_text_tests: Set[MagicTest] = set()
    self._text_tests: Set[MagicTest] = set()
    self._dirty: bool = True
    for initial_test in tests:
        self.add(initial_test)
2626
+
2627
@property
def tests_by_mime(self) -> Dict[str, Set[MagicTest]]:
    """Index from MIME type to tests; the indexes are refreshed first if the matcher is dirty."""
    self._reassign_test_types()
    return self._tests_by_mime
2631
+
2632
@property
def tests_by_ext(self) -> Dict[str, Set[MagicTest]]:
    """Index from extension to tests; the indexes are refreshed first if the matcher is dirty."""
    self._reassign_test_types()
    return self._tests_by_ext
2636
+
2637
@property
def tests_that_can_be_indirect(self) -> Set[MagicTest]:
    """Tests flagged ``can_be_indirect``; the indexes are refreshed first if the matcher is dirty."""
    self._reassign_test_types()
    return self._tests_that_can_be_indirect
2641
+
2642
@property
def non_text_tests(self) -> Set[MagicTest]:
    """Tests whose type is not TEXT; the indexes are refreshed first if the matcher is dirty."""
    self._reassign_test_types()
    return self._non_text_tests
2646
+
2647
@property
def text_tests(self) -> Set[MagicTest]:
    """Tests whose type is TEXT; the indexes are refreshed first if the matcher is dirty."""
    self._reassign_test_types()
    return self._text_tests
2651
+
2652
def add(self, test: Union[MagicTest, Path], test_type: TestType = TestType.UNKNOWN) -> List[MagicTest]:
    """Register a test, or every level-zero test parsed from a definition file.

    :param test: a test object, or anything else, which is handed to ``_parse_file``
    :param test_type: when not UNKNOWN, overrides the type of each added test
    :return: the tests that were added at the top level
    :raises ValueError: if a named test with the same name is already registered
    """
    if not isinstance(test, MagicTest):
        roots, _, mime_tests, indirect_tests = self._parse_file(test, self)
        # every ancestor of a mime-capable test can itself lead to a mime type
        for mime_test in mime_tests:
            assert mime_test.can_match_mime
            for ancestor in mime_test.ancestors():
                ancestor.can_match_mime = True
        # likewise for tests that can be reached indirectly
        for indirect in indirect_tests:
            assert indirect.can_be_indirect
            assert indirect.can_match_mime
            for ancestor in indirect.ancestors():
                ancestor.can_be_indirect = True
        for root in roots:
            self.add(root, test_type=test_type)
        return list(roots)

    if test_type != TestType.UNKNOWN:
        test.test_type = test_type

    # the derived indexes must be rebuilt before they are next read
    self._dirty = True

    if not isinstance(test, NamedTest):
        self._tests.append(test)
    elif test.name in self.named_tests:
        raise ValueError(f"A test named {test.name} already exists in this matcher!")
    else:
        self.named_tests[test.name] = test

    return [test]
2681
+
2682
def _reassign_test_types(self):
    """Rebuild the derived indexes from ``self._tests`` if the matcher is marked dirty."""
    if not self._dirty:
        return
    self._dirty = False
    # start from empty containers; local aliases keep the loop below short
    self._text_tests = text = set()
    self._non_text_tests = non_text = set()
    self._tests_that_can_be_indirect = indirect = set()
    self._tests_by_ext = by_ext = defaultdict(set)
    self._tests_by_mime = by_mime = defaultdict(set)
    for test in self._tests:
        bucket = text if test.test_type == TestType.TEXT else non_text
        bucket.add(test)
        if test.can_be_indirect:
            indirect.add(test)
        for mime in test.mimetypes():
            by_mime[mime].add(test)
        for ext in test.all_extensions():
            by_ext[ext].add(test)
2702
+
2703
def only_match(
    self,
    mimetypes: Optional[Iterable[str]] = None,
    extensions: Optional[Iterable[str]] = None
) -> "MagicMatcher":
    """
    Returns the simplest possible matcher that is capable of matching against all the given mimetypes or extensions.

    If either argument is None, the resulting matcher will match against all such values. Therefore, if both
    arguments are None, the resulting matcher will be equivalent to this matcher.

    The result always includes the indirect-capable tests that declare no MIME type, plus every named test
    referenced by a selected test. Unknown MIME types or extensions select nothing and leave this matcher's
    indexes untouched.

    """
    if mimetypes is None and extensions is None:
        return self
    tests: Set[MagicTest] = {
        indirect_test for indirect_test in self.tests_that_can_be_indirect
        if not any(True for _ in indirect_test.mimetypes())
    }
    # The indexes are defaultdicts: subscripting them with an unknown key would insert an
    # empty entry and make it show up in the `mimetypes` / `extensions` properties, so look
    # keys up with .get() instead.
    if mimetypes is not None:
        by_mime = self.tests_by_mime
        for mime in mimetypes:
            tests |= by_mime.get(mime, set())
    if extensions is not None:
        by_ext = self.tests_by_ext
        for ext in extensions:
            tests |= by_ext.get(ext, set())
    # add in all necessary named tests:
    required_named_tests = set()
    for test in tests:
        required_named_tests |= test.referenced_tests()
    return MagicMatcher(tests | required_named_tests)
2732
+
2733
def __iter__(self) -> Iterator[MagicTest]:
    """Iterate over the tests in ``self._tests``; named tests are stored separately and not included."""
    return iter(self._tests)
2735
+
2736
@property
def mimetypes(self) -> Iterable[str]:
    """Returns the MIME types this matcher is capable of matching (the keys of the MIME index)"""
    return self.tests_by_mime.keys()
2740
+
2741
@property
def extensions(self) -> Iterable[str]:
    """Returns the extensions this matcher is capable of matching (the keys of the extension index)"""
    return self.tests_by_ext.keys()
2745
+
2746
def match(self, to_match: Union[bytes, BinaryIO, str, Path, MatchContext]) -> Iterator[Match]:
    """Run the tests against the input and yield each accepted match.

    Non-text tests run first. Text tests run only if the input is recognised as plain
    text. If nothing was yielded, the plain-text match (for text input) or an octet-stream
    match is yielded as a fallback.
    """
    if isinstance(to_match, bytes):
        context = MatchContext(to_match)
    elif isinstance(to_match, MatchContext):
        context = to_match
    else:
        context = MatchContext.load(to_match)

    def accepted(candidate: Match):
        # a match counts if it is truthy and, in mime-only mode, resolves some mime type
        return candidate and (not context.only_match_mime or any(t is not None for t in candidate.mimetypes))

    produced_any = False
    for binary_test in log.range(self.non_text_tests, desc="binary matching", unit=" tests", delay=1.0):
        candidate = Match(matcher=self, context=context, results=binary_test.match(context))
        if accepted(candidate):
            yield candidate
            produced_any = True
    # is this a plain text file?
    text_matcher = Match(matcher=self, context=context, results=PlainTextTest().match(context))
    is_text = accepted(text_matcher)
    if is_text:
        # this is a text file, so try all of the textual tests:
        for text_test in log.range(self.text_tests, desc="text matching", unit=" tests", delay=1.0):
            candidate = Match(matcher=self, context=context, results=text_test.match(context))
            if accepted(candidate):
                yield candidate
                produced_any = True
    if produced_any:
        return
    if is_text:
        yield text_matcher
    else:
        yield Match(matcher=self, context=context, results=OctetStreamTest().match(context))
2772
+
2773
@staticmethod
def parse_test(
        line: str,
        def_file: Path,
        line_number: int,
        parent: Optional[MagicTest] = None,
        matcher: Optional["MagicMatcher"] = None
) -> Optional[MagicTest]:
    """Parse a single line of a magic definition file into a test.

    Args:
        line: The decoded text of the line.
        def_file: The definition file the line came from (used for error messages and source info).
        line_number: The number of the line within def_file (used for error messages and source info).
        parent: The most recently parsed test. Its ancestor chain is walked upward until a test with
            a level lower than this line's level is found; that test becomes the new test's parent.
        matcher: The matcher whose ``named_tests`` registry is consulted and updated by "name" and
            "use" tests, and which is passed to "indirect" tests. If omitted, an empty throwaway
            registry is used, so every "use" test is created as a late binding and "name" tests are
            not registered anywhere; the caller is then responsible for resolving them.

    Returns:
        The parsed test, or None if the line does not match TEST_PATTERN.

    Raises:
        ValueError: If the level is inconsistent with the parent chain, the offset or the expected
            value cannot be parsed, a named test is not at level 0, or a named test is a duplicate.
        NotImplementedError: For a "default" or "clear" test at level 0.
    """
    m = TEST_PATTERN.match(line)
    if not m:
        return None
    level = len(m.group("level"))
    # Climb the ancestor chain until we find the test this line is nested under.
    while parent is not None and parent.level >= level:
        parent = parent.parent
    if parent is None and level != 0:
        raise ValueError(f"{def_file!s} line {line_number}: Invalid level for test {line!r}")
    test_str, message = _split_with_escapes(m.group("remainder"))
    message = unescape(message).decode("utf-8")
    try:
        offset = Offset.parse(m.group("offset"))
    except ValueError as e:
        raise ValueError(f"{def_file!s} line {line_number}: {e!s}") from e
    data_type = m.group("data_type")
    # The signature allows matcher=None, so never dereference it unconditionally.
    named_tests = matcher.named_tests if matcher is not None else {}
    if data_type == "name":
        if parent is not None:
            raise ValueError(f"{def_file!s} line {line_number}: A named test must be at level 0")
        elif test_str in named_tests:
            raise ValueError(f"{def_file!s} line {line_number}: Duplicate test named {test_str!r}")
        test = NamedTest(name=test_str, offset=offset, message=message)
        named_tests[test_str] = test
    elif data_type == "default":
        if parent is None:
            raise NotImplementedError("TODO: Add support for default tests at level 0")
        test = DefaultTest(offset=offset, message=message, parent=parent)
    elif data_type == "clear":
        if parent is None:
            raise NotImplementedError("TODO: Add support for clear tests at level 0")
        test = ClearTest(offset=offset, message=message, parent=parent)
    elif data_type == "offset":
        expected_value = IntegerValue.parse(test_str, num_bytes=8)
        test = OffsetMatchTest(offset=offset, value=expected_value, message=message,
                               parent=parent)
    elif data_type == "json":
        test = JSONTest(offset=offset, message=message, parent=parent)
    elif data_type == "csv":
        test = CSVTest(offset=offset, message=message, parent=parent)
    elif data_type == "indirect" or data_type == "indirect/r":
        test = IndirectTest(matcher=matcher, offset=offset,
                            relative=data_type.endswith("r"),
                            message=message, parent=parent)
    elif data_type == "use":
        # A leading "^" (optionally escaped as "\^") requests flipped endianness.
        if test_str.startswith("^"):
            flip_endianness = True
            test_str = test_str[1:]
        elif test_str.startswith("\\^"):
            flip_endianness = True
            test_str = test_str[2:]
        else:
            flip_endianness = False
        if test_str not in named_tests:
            late_binding = True

            class LateBindingNamedTest(NamedTest):
                def __init__(self):
                    super().__init__(test_str, offset=AbsoluteOffset(0))

            # The referenced test has not been defined yet (it is used before its definition),
            # so reference a placeholder that only carries the name. The placeholder is expected
            # to be swapped for the real NamedTest once all definitions have been parsed.
            named_test: NamedTest = LateBindingNamedTest()
        else:
            late_binding = False
            named_test = named_tests[test_str]
        test = UseTest(  # type: ignore
            named_test,
            offset=offset,
            message=message,
            parent=parent,
            flip_endianness=flip_endianness,
            late_binding=late_binding
        )
    elif data_type == "der":
        # TODO: Update this as necessary once we fully implement the DERTest
        test = DERTest(offset=offset, message=message, parent=parent)
    else:
        try:
            parsed_type = DataType.parse(data_type)
            if test_str in ("<", ">", "=", "!", "&", "^", "~"):
                # Some files erroneously add whitespace between the operator and the subsequent
                # value, in which case the value ended up at the start of the message:
                actual_operand, message = _split_with_escapes(message)
                test_str = f"{test_str}{actual_operand}"
            constant = parsed_type.parse_expected(test_str)
        except ValueError as e:
            raise ValueError(f"{def_file!s} line {line_number}: {e!s}") from e
        test = ConstantMatchTest(
            offset=offset,
            data_type=parsed_type,
            constant=constant,
            message=message,
            parent=parent
        )
    test.source_info = SourceInfo(def_file, line_number, line)
    return test
2880
+
2881
@staticmethod
def _parse_file(
        def_file: Union[str, Path], matcher: "MagicMatcher"
) -> Tuple[Iterable[MagicTest], Iterable[UseTest], Set[MagicTest], Set[IndirectTest]]:
    """Parse every test in one magic definition file.

    Named tests are registered in ``matcher.named_tests``; no test is added to the matcher itself.

    Args:
        def_file: Path of the definition file to read.
        matcher: The matcher passed through to parse_test and whose named_tests registry is updated.

    Returns:
        A tuple of (level-zero tests that are not named tests, "use" tests whose target was not yet
        defined when they were parsed, tests that were assigned a MIME type, indirect tests).

    Raises:
        ValueError: If a MIME or extension line appears before any test, a test receives two MIME
            types, or a line is neither a test, a MIME line, nor an extension line.
    """
    current_test: Optional[MagicTest] = None
    late_bindings: List[UseTest] = []
    level_zero_tests: List[MagicTest] = []
    tests_with_mime: Set[MagicTest] = set()
    indirect_tests: Set[IndirectTest] = set()
    comments: List[Comment] = []
    with open(def_file, "rb") as f:
        # Iterate the file directly instead of materializing every line with readlines().
        for line_number, raw_line in enumerate(f, start=1):
            raw_line = raw_line.lstrip()
            if not raw_line:
                # An empty line detaches any pending comments from the next test.
                comments = []
                continue
            if raw_line.startswith(b"#"):
                # this is a comment
                try:
                    comments.append(Comment(
                        message=raw_line[1:].strip().decode("utf-8"),
                        source_info=SourceInfo(def_file, line_number, raw_line.decode("utf-8"))
                    ))
                except UnicodeDecodeError:
                    # Best effort: a comment that is not valid UTF-8 is simply dropped.
                    pass
                continue
            if raw_line.startswith((b"!:apple", b"!:strength")):
                # ignore these directives for now
                continue
            try:
                line = raw_line.decode("utf-8")
            except UnicodeDecodeError:
                # Best effort: lines that are not valid UTF-8 are skipped.
                continue
            test = MagicMatcher.parse_test(line, def_file, line_number, current_test, matcher)
            if test is not None:
                if isinstance(test, NamedTest):
                    matcher.named_tests[test.name] = test
                else:
                    if isinstance(test, IndirectTest):
                        indirect_tests.add(test)
                    elif isinstance(test, UseTest) and test.late_binding:
                        late_bindings.append(test)
                    if test.level == 0:
                        level_zero_tests.append(test)
                test.source_info = SourceInfo(def_file, line_number, line)
                # Comments directly preceding a test are attached to it.
                test.comments = tuple(comments)
                comments = []
                current_test = test
                continue
            m = MIME_PATTERN.match(line)
            if m:
                if current_test is None:
                    raise ValueError(f"{def_file!s} line {line_number}: Unexpected mime type {line!r}")
                elif current_test.mime is not None:
                    raise ValueError(f"{def_file!s} line {line_number}: Duplicate mime types for test "
                                     f"{current_test!r}: {current_test.mime!r} and {m.group(1)}")
                current_test.mime = m.group(1)
                tests_with_mime.add(current_test)
                continue
            m = EXTENSION_PATTERN.match(line)
            if m:
                if current_test is None:
                    raise ValueError(f"{def_file!s} line {line_number}: Unexpected ext: {line!r}")
                # Extensions may be separated by "/" or ","; empty fragments are discarded.
                current_test.extensions |= {ext for ext in re.split(r"[/,]", m.group(1)) if ext}
                continue
            raise ValueError(f"{def_file!s} line {line_number}: Unexpected line\n{raw_line!r}")
    return level_zero_tests, late_bindings, tests_with_mime, indirect_tests
2950
+
2951
@staticmethod
def parse(*def_files: Union[str, Path]) -> "MagicMatcher":
    """Build a matcher from one or more magic definition files.

    All files are parsed into a single new matcher. "use" tests that referenced a named test
    before it was defined are resolved once every file has been parsed, so a named test may be
    defined in a later file than the one that uses it.

    Args:
        def_files: Paths of the definition files, parsed in the order given.

    Returns:
        A new MagicMatcher to which every level-zero test from the files has been added.

    Raises:
        ValueError: If a "use" test references a named test that no file defines, or if a file
            is malformed (raised while parsing it).
    """
    # Kept as an ordered list of (file, tests) pairs rather than a dict keyed by file, so that
    # passing the same file more than once cannot overwrite (and thereby leave unresolved) the
    # late-bound tests recorded for an earlier pass.
    late_bindings: List[Tuple[Union[str, Path], List[UseTest]]] = []
    zero_level_tests: List[MagicTest] = []
    tests_with_mime: Set[MagicTest] = set()
    indirect_tests: Set[IndirectTest] = set()
    matcher = MagicMatcher([])
    for file in def_files:
        zl, lb, wm, it = MagicMatcher._parse_file(file, matcher=matcher)
        late_bindings.append((file, list(lb)))
        zero_level_tests.extend(zl)
        tests_with_mime |= wm
        indirect_tests |= it
    # resolve any "use" tests with late binding:
    for def_file, use_tests in late_bindings:
        for use_test in use_tests:
            referenced_name = use_test.referenced_test.name
            if referenced_name not in matcher.named_tests:
                raise ValueError(f"{def_file!s}: Named test {referenced_name!r} is not defined")
            named_test = matcher.named_tests[referenced_name]
            use_test.referenced_test = named_test
            named_test.used_by.add(use_test)
    # Mark every ancestor of a test that has a MIME type as able to match a MIME type.
    for test in tests_with_mime:
        assert test.can_match_mime
        for ancestor in test.ancestors():
            ancestor.can_match_mime = True
    # Mark every ancestor of an indirect test as able to be indirect.
    for test in indirect_tests:
        assert test.can_be_indirect
        assert test.can_match_mime
        for ancestor in test.ancestors():
            ancestor.can_be_indirect = True
    for test in zero_level_tests:
        matcher.add(test)
    return matcher