metasm 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192) hide show
  1. data/BUGS +11 -0
  2. data/CREDITS +17 -0
  3. data/README +270 -0
  4. data/TODO +114 -0
  5. data/doc/code_organisation.txt +146 -0
  6. data/doc/const_missing.txt +16 -0
  7. data/doc/core_classes.txt +75 -0
  8. data/doc/feature_list.txt +53 -0
  9. data/doc/index.txt +59 -0
  10. data/doc/install_notes.txt +170 -0
  11. data/doc/style.css +3 -0
  12. data/doc/use_cases.txt +18 -0
  13. data/lib/metasm.rb +80 -0
  14. data/lib/metasm/arm.rb +12 -0
  15. data/lib/metasm/arm/debug.rb +39 -0
  16. data/lib/metasm/arm/decode.rb +167 -0
  17. data/lib/metasm/arm/encode.rb +77 -0
  18. data/lib/metasm/arm/main.rb +75 -0
  19. data/lib/metasm/arm/opcodes.rb +177 -0
  20. data/lib/metasm/arm/parse.rb +130 -0
  21. data/lib/metasm/arm/render.rb +55 -0
  22. data/lib/metasm/compile_c.rb +1457 -0
  23. data/lib/metasm/dalvik.rb +8 -0
  24. data/lib/metasm/dalvik/decode.rb +196 -0
  25. data/lib/metasm/dalvik/main.rb +60 -0
  26. data/lib/metasm/dalvik/opcodes.rb +366 -0
  27. data/lib/metasm/decode.rb +213 -0
  28. data/lib/metasm/decompile.rb +2659 -0
  29. data/lib/metasm/disassemble.rb +2068 -0
  30. data/lib/metasm/disassemble_api.rb +1280 -0
  31. data/lib/metasm/dynldr.rb +1329 -0
  32. data/lib/metasm/encode.rb +333 -0
  33. data/lib/metasm/exe_format/a_out.rb +194 -0
  34. data/lib/metasm/exe_format/autoexe.rb +82 -0
  35. data/lib/metasm/exe_format/bflt.rb +189 -0
  36. data/lib/metasm/exe_format/coff.rb +455 -0
  37. data/lib/metasm/exe_format/coff_decode.rb +901 -0
  38. data/lib/metasm/exe_format/coff_encode.rb +1078 -0
  39. data/lib/metasm/exe_format/dex.rb +457 -0
  40. data/lib/metasm/exe_format/dol.rb +145 -0
  41. data/lib/metasm/exe_format/elf.rb +923 -0
  42. data/lib/metasm/exe_format/elf_decode.rb +979 -0
  43. data/lib/metasm/exe_format/elf_encode.rb +1375 -0
  44. data/lib/metasm/exe_format/macho.rb +827 -0
  45. data/lib/metasm/exe_format/main.rb +228 -0
  46. data/lib/metasm/exe_format/mz.rb +164 -0
  47. data/lib/metasm/exe_format/nds.rb +172 -0
  48. data/lib/metasm/exe_format/pe.rb +437 -0
  49. data/lib/metasm/exe_format/serialstruct.rb +246 -0
  50. data/lib/metasm/exe_format/shellcode.rb +114 -0
  51. data/lib/metasm/exe_format/xcoff.rb +167 -0
  52. data/lib/metasm/gui.rb +23 -0
  53. data/lib/metasm/gui/cstruct.rb +373 -0
  54. data/lib/metasm/gui/dasm_coverage.rb +199 -0
  55. data/lib/metasm/gui/dasm_decomp.rb +369 -0
  56. data/lib/metasm/gui/dasm_funcgraph.rb +103 -0
  57. data/lib/metasm/gui/dasm_graph.rb +1354 -0
  58. data/lib/metasm/gui/dasm_hex.rb +543 -0
  59. data/lib/metasm/gui/dasm_listing.rb +599 -0
  60. data/lib/metasm/gui/dasm_main.rb +906 -0
  61. data/lib/metasm/gui/dasm_opcodes.rb +291 -0
  62. data/lib/metasm/gui/debug.rb +1228 -0
  63. data/lib/metasm/gui/gtk.rb +884 -0
  64. data/lib/metasm/gui/qt.rb +495 -0
  65. data/lib/metasm/gui/win32.rb +3004 -0
  66. data/lib/metasm/gui/x11.rb +621 -0
  67. data/lib/metasm/ia32.rb +14 -0
  68. data/lib/metasm/ia32/compile_c.rb +1523 -0
  69. data/lib/metasm/ia32/debug.rb +193 -0
  70. data/lib/metasm/ia32/decode.rb +1167 -0
  71. data/lib/metasm/ia32/decompile.rb +564 -0
  72. data/lib/metasm/ia32/encode.rb +314 -0
  73. data/lib/metasm/ia32/main.rb +233 -0
  74. data/lib/metasm/ia32/opcodes.rb +872 -0
  75. data/lib/metasm/ia32/parse.rb +327 -0
  76. data/lib/metasm/ia32/render.rb +91 -0
  77. data/lib/metasm/main.rb +1193 -0
  78. data/lib/metasm/mips.rb +11 -0
  79. data/lib/metasm/mips/compile_c.rb +7 -0
  80. data/lib/metasm/mips/decode.rb +253 -0
  81. data/lib/metasm/mips/encode.rb +51 -0
  82. data/lib/metasm/mips/main.rb +72 -0
  83. data/lib/metasm/mips/opcodes.rb +443 -0
  84. data/lib/metasm/mips/parse.rb +51 -0
  85. data/lib/metasm/mips/render.rb +43 -0
  86. data/lib/metasm/os/gnu_exports.rb +270 -0
  87. data/lib/metasm/os/linux.rb +1112 -0
  88. data/lib/metasm/os/main.rb +1686 -0
  89. data/lib/metasm/os/remote.rb +527 -0
  90. data/lib/metasm/os/windows.rb +2027 -0
  91. data/lib/metasm/os/windows_exports.rb +745 -0
  92. data/lib/metasm/parse.rb +876 -0
  93. data/lib/metasm/parse_c.rb +3938 -0
  94. data/lib/metasm/pic16c/decode.rb +42 -0
  95. data/lib/metasm/pic16c/main.rb +17 -0
  96. data/lib/metasm/pic16c/opcodes.rb +68 -0
  97. data/lib/metasm/ppc.rb +11 -0
  98. data/lib/metasm/ppc/decode.rb +264 -0
  99. data/lib/metasm/ppc/decompile.rb +251 -0
  100. data/lib/metasm/ppc/encode.rb +51 -0
  101. data/lib/metasm/ppc/main.rb +129 -0
  102. data/lib/metasm/ppc/opcodes.rb +410 -0
  103. data/lib/metasm/ppc/parse.rb +52 -0
  104. data/lib/metasm/preprocessor.rb +1277 -0
  105. data/lib/metasm/render.rb +130 -0
  106. data/lib/metasm/sh4.rb +8 -0
  107. data/lib/metasm/sh4/decode.rb +336 -0
  108. data/lib/metasm/sh4/main.rb +292 -0
  109. data/lib/metasm/sh4/opcodes.rb +381 -0
  110. data/lib/metasm/x86_64.rb +12 -0
  111. data/lib/metasm/x86_64/compile_c.rb +1025 -0
  112. data/lib/metasm/x86_64/debug.rb +59 -0
  113. data/lib/metasm/x86_64/decode.rb +268 -0
  114. data/lib/metasm/x86_64/encode.rb +264 -0
  115. data/lib/metasm/x86_64/main.rb +135 -0
  116. data/lib/metasm/x86_64/opcodes.rb +118 -0
  117. data/lib/metasm/x86_64/parse.rb +68 -0
  118. data/misc/bottleneck.rb +61 -0
  119. data/misc/cheader-findpppath.rb +58 -0
  120. data/misc/hexdiff.rb +74 -0
  121. data/misc/hexdump.rb +55 -0
  122. data/misc/metasm-all.rb +13 -0
  123. data/misc/objdiff.rb +47 -0
  124. data/misc/objscan.rb +40 -0
  125. data/misc/pdfparse.rb +661 -0
  126. data/misc/ppc_pdf2oplist.rb +192 -0
  127. data/misc/tcp_proxy_hex.rb +84 -0
  128. data/misc/txt2html.rb +440 -0
  129. data/samples/a.out.rb +31 -0
  130. data/samples/asmsyntax.rb +77 -0
  131. data/samples/bindiff.rb +555 -0
  132. data/samples/compilation-steps.rb +49 -0
  133. data/samples/cparser_makestackoffset.rb +55 -0
  134. data/samples/dasm-backtrack.rb +38 -0
  135. data/samples/dasmnavig.rb +318 -0
  136. data/samples/dbg-apihook.rb +228 -0
  137. data/samples/dbghelp.rb +143 -0
  138. data/samples/disassemble-gui.rb +102 -0
  139. data/samples/disassemble.rb +133 -0
  140. data/samples/dump_upx.rb +95 -0
  141. data/samples/dynamic_ruby.rb +1929 -0
  142. data/samples/elf_list_needed.rb +46 -0
  143. data/samples/elf_listexports.rb +33 -0
  144. data/samples/elfencode.rb +25 -0
  145. data/samples/exeencode.rb +128 -0
  146. data/samples/factorize-headers-elfimports.rb +77 -0
  147. data/samples/factorize-headers-peimports.rb +109 -0
  148. data/samples/factorize-headers.rb +43 -0
  149. data/samples/gdbclient.rb +583 -0
  150. data/samples/generate_libsigs.rb +102 -0
  151. data/samples/hotfix_gtk_dbg.rb +59 -0
  152. data/samples/install_win_env.rb +78 -0
  153. data/samples/lindebug.rb +924 -0
  154. data/samples/linux_injectsyscall.rb +95 -0
  155. data/samples/machoencode.rb +31 -0
  156. data/samples/metasm-shell.rb +91 -0
  157. data/samples/pe-hook.rb +69 -0
  158. data/samples/pe-ia32-cpuid.rb +203 -0
  159. data/samples/pe-mips.rb +35 -0
  160. data/samples/pe-shutdown.rb +78 -0
  161. data/samples/pe-testrelocs.rb +51 -0
  162. data/samples/pe-testrsrc.rb +24 -0
  163. data/samples/pe_listexports.rb +31 -0
  164. data/samples/peencode.rb +19 -0
  165. data/samples/peldr.rb +494 -0
  166. data/samples/preprocess-flatten.rb +19 -0
  167. data/samples/r0trace.rb +308 -0
  168. data/samples/rubstop.rb +399 -0
  169. data/samples/scan_pt_gnu_stack.rb +54 -0
  170. data/samples/scanpeexports.rb +62 -0
  171. data/samples/shellcode-c.rb +40 -0
  172. data/samples/shellcode-dynlink.rb +146 -0
  173. data/samples/source.asm +34 -0
  174. data/samples/struct_offset.rb +47 -0
  175. data/samples/testpe.rb +32 -0
  176. data/samples/testraw.rb +45 -0
  177. data/samples/win32genloader.rb +132 -0
  178. data/samples/win32hooker-advanced.rb +169 -0
  179. data/samples/win32hooker.rb +96 -0
  180. data/samples/win32livedasm.rb +33 -0
  181. data/samples/win32remotescan.rb +133 -0
  182. data/samples/wintrace.rb +92 -0
  183. data/tests/all.rb +8 -0
  184. data/tests/dasm.rb +39 -0
  185. data/tests/dynldr.rb +35 -0
  186. data/tests/encodeddata.rb +132 -0
  187. data/tests/ia32.rb +82 -0
  188. data/tests/mips.rb +116 -0
  189. data/tests/parse_c.rb +239 -0
  190. data/tests/preprocessor.rb +269 -0
  191. data/tests/x86_64.rb +62 -0
  192. metadata +255 -0
data/BUGS ADDED
@@ -0,0 +1,11 @@
1
+ List of known bugs/missing features, in no particular order:
2
+
3
+ PPC cpu cannot parse/encode code
4
+ Disassembler is sloooow
5
+ The GTK UI is quite sluggish too
6
+ Disassembler backtracker does weird things
7
+ Mach-O encoder does not work (binaries won't load on OSX)
8
+ ELF encoder may need tweaks to handle OpenBSD
9
+ Ia32 compile_c misses many features (divisions, bitfields), and needs a register allocator
10
+ Asm parser does not handle well ; comments (eg "foo ; */* blargimdead") (c-style comments are parsed before asm-style, so multiline /* after ; is bad)
11
+ The BUGS file is incomplete
data/CREDITS ADDED
@@ -0,0 +1,17 @@
1
+ N: Yoann GUILLOT
2
+ E: yoann at ofjj.net
3
+ D: Lead developer
4
+
5
+ N: Julien TINNES
6
+ E: julien at cr0.org
7
+ D: Senior Product Manager
8
+ D: Ideas, bug hunting, Yoann-slapping
9
+ D: Metasploit integration
10
+
11
+ N: Arnaud CORNET
12
+ E: arnaud.cornet at gmail.com
13
+ D: Initial ELF support
14
+
15
+ N: Raphael RIGO
16
+ E: raphael at cr0.org
17
+ D: Initial MIPS support and misc stuff
data/README ADDED
@@ -0,0 +1,270 @@
1
+ Metasm, the Ruby assembly manipulation suite
2
+ ============================================
3
+
4
+ * sample scripts in samples/ -- read comments at the beginning of the files
5
+ * all files are licensed under the terms of the LGPL
6
+
7
+ Author: Yoann Guillot <john at ofjj.net>
8
+
9
+
10
+ Basic overview:
11
+
12
+ Metasm allows you to interact with executable formats (ExeFormat):
13
+ PE, ELF, Mach-O, Shellcode, etc.
14
+ There are three approaches to an ExeFormat:
15
+ - compiling one up, from scratch
16
+ - decompiling an existing format
17
+ - manipulating the file structure
18
+
19
+
20
+ Ready-to-use scripts can be found in the samples/ subdirectory, check the
21
+ comments in the scripts headers. You can also try the --help argument if
22
+ you're feeling lucky.
23
+
24
+
25
+ Here is a short overview of the Metasm internals.
26
+
27
+
28
+ Assembly:
29
+
30
+ When compiling, you start from a source text (ruby String, consisting
31
+ mostly of a sequence of instructions/data/padding directives), which is parsed.
32
+
33
+ The string is handed to a Preprocessor instance (which handles #if, #ifdef,
34
+ #include, #define, /* */ etc, should be 100% compatible with gcc -E), which is
35
+ encapsulated in an AsmPreprocessor for assembler sources (to handle asm macro
36
+ definitions, 'equ' and asm ';' comments).
37
+ The interface to do that is ExeFormat#parse(text[, filename, lineno]) or
38
+ ExeFormat.assemble (which calls .new, #parse and #assemble).
39
+
40
+ The (Asm)Preprocessor returns tokens to the ExeFormat, which parses them as Data,
41
+ Padding, Labels or parser directives. Parser directives always start with a dot.
42
+ They can be generic (.pad, .offset...) or ExeFormat-specific (.section,
43
+ .import, .entrypoint...). They are handled by #parse_parser_instruction().
44
+ If the ExeFormat does not recognize a word, it is handed to its CPU instance,
45
+ which is responsible for parsing Instructions (or raise an exception).
46
+ All those tokens are stored in one or more arrays in the @source attribute of
47
+ the ExeFormat (Shellcode's @source is an Array, for PE/ELF it is a hash
48
+ [section name] => [Array of parsed data])
49
+ Every immediate value can be an arbitrary Expression (see later).
50
+
51
+ You can then assemble the source to binary sections using ExeFormat#assemble.
52
+
53
+ Once the section binaries are available, the whole binary executable can be
54
+ written to disk using ExeFormat#encode_file(filename[, format]).
55
+
56
+ PE and ELF include an autoimport feature that allows automatic creation of
57
+ import-related data for known OS-specific functions (e.g. unresolved calls to
58
+ 'strcpy' will generate data so that the binary is linked against the libc
59
+ library at runtime).
60
+
61
+ The samples/{exe,pe,elf}encode.rb can take an asm source file as argument
62
+ and compile it to a working executable.
63
+
64
+ The CPU classes are responsible for parsing and encoding individual
65
+ instructions. The current Ia32 parser uses the Intel syntax (e.g. mov eax, 42).
66
+ The generic parser recognizes labels as a string at the beginning of a line
67
+ followed by a colon (e.g. 'some_label:'). GCC-style local labels may be used
68
+ (e.g. '1:', referred to using '1b' (backward) or '1f' (forward) ; may be
69
+ redefined as many times as needed.)
70
+ Data are specified using 'db'-style notation (e.g. 'dd 42h', 'db "blabla", 0')
71
+ See samples/asmsyntax.rb
72
+
73
+
74
+ EncodedData:
75
+
76
+ In Metasm all binary data is stored as an EncodedData.
77
+ EncodedData has 3 main attributes:
78
+ - #data which holds the raw binary data (generally a ruby String, but see
79
+ VirtualString)
80
+ - #export which is a hash associating an export name (label name) to an offset
81
+ within #data
82
+ - #reloc which is a hash whose keys are offsets within #data, and whose values
83
+ are Relocation objects.
84
+ A Relocation object has an endianness (:little/:big), a type (:u32 for unsigned
85
+ 32bits) and a target (the intended value stored here).
86
+ The target is an arbitrary arithmetic/logic Expression.
87
+
88
+ EncodedData also has a #virtsize (for e.g. .bss sections), and a #ptr (internal
89
+ offset used when decoding things)
90
+
91
+ You can fixup an EncodedData, with a Hash variable name => value (value should
92
+ be an Expression or a numeric value). When you do that, each relocation's target
93
+ is bound using the binding, and if the result is calculable (no external variable
94
+ name used in the Expression), the result is encoded using the relocation's
95
+ size/sign/endianness information. If it overflows (try to store 128 in an 8bit
96
+ signed relocation), an EncodeError exception is raised. Use the :a32 type to
97
+ allow silent overflow truncating.
98
+ If the relocation's target is not numeric, the target is unchanged if you use
99
+ EncodedData#fixup, or it is replaced with the bound target with #fixup! .
100
+
101
+
102
+ Disassembly:
103
+
104
+ This code is found in the metasm/decode.rb source file, which defines the
105
+ Disassembler class.
106
+
107
+ The disassembler needs a decoded ExeFormat (to be able to say what data is at
108
+ which virtual address) and an entrypoint (a virtual address or export name).
109
+ It can then start to disassemble instructions. When it encounters an
110
+ Opcode marked as :setip, it asks the CPU for the jump destination (an
111
+ Expression that may involve register values, for e.g. jmp eax), and backtraces
112
+ instructions until it finds the numeric value.
113
+
114
+ On decoding, the Disassembler maintains a #decoded hash associating addresses
115
+ (expressions/integer #normalize()d) to DecodedInstructions.
116
+
117
+ The disassembly generates an InstructionBlock graph. Each block holds a list of
118
+ DecodedInstruction, and pointers to the next/previous block (by address).
119
+
120
+ The disassembler also traces data accesses by instructions, and stores Xrefs
121
+ for them.
122
+ The backtrace parameters can be tweaked, and the maximum depth to consider
123
+ can be specifically changed for :r/:w backtraces (instruction memory xrefs)
124
+ using #backtrace_maxblocks_data.
125
+ When an Expression is backtracked, each walked block is marked so that loops
126
+ are detected, and so that if a new code path is found to an existing block,
127
+ backtraces can be resumed using this new path.
128
+
129
+ The disassembler makes very few assumptions, and in particular does not
130
+ suppose that functions will return ; they will only if the backtrace of the
131
+ 'ret' instructions is conclusive. This is quite powerful, but also implies
132
+ that any error in the backtracking process can lead to a full stop ; and also
133
+ means that the disassembler is quite slow.
134
+
135
+ The special method #disassemble_fast can be used to work around this when the
136
+ code is known to be well-formed (ie it assumes that all calls return)
137
+
138
+ When a subfunction is found, a special DecodedFunction is created, which holds
139
+ a summary of the function's effects (like a DecodedInstruction on steroids).
140
+ This allows the backtracker to 'step over' subfunctions, which greatly improves
141
+ speed. The DecodedFunctions may be callback-based, to allow a very dynamic
142
+ behaviour.
143
+ External function calls create dedicated DecodedFunctions, which hold some
144
+ API information (e.g. stack fixup information, basic parameter accesses...)
145
+ This information may be derived from a C header parsed beforehand.
146
+ If no C function prototype is available, a special 'default' entry is used,
147
+ which assumes that the function has a standard ABI.
148
+
149
+ Ia32 implements a specific :default entry, which handles automatic stack fixup
150
+ resolution, by assuming that the last 'call' instruction returns. This may lead
151
+ to unexpected results ; for maximum accuracy a C header holding information for
152
+ all external functions is recommended (see samples/factorize-headers-peimports
153
+ for a script to generate such a header from a full Visual Studio installation
154
+ and the target binary).
155
+
156
+ Ia32 also implements a specific GetProcAddress/dlsym callback, that will
157
+ yield the correct return value if the parameters can be backtraced.
158
+
159
+ The scripts implementing a full disassembler are samples/disassemble{-gui}.rb
160
+ See the comments for the GUI key bindings.
161
+
162
+
163
+ ExeFormat manipulation:
164
+
165
+ You can encode/decode an ExeFormat (ie decode sections, imports, headers etc)
166
+
167
+ Constructor: ExeFormat.decode_file(str), ExeFormat.decode_file_header(str)
168
+ Methods: ExeFormat#encode_file(filename), ExeFormat#encode_string
169
+
170
+ PE and ELF files have a LoadedPE/LoadedELF counterpart, that is able to work
171
+ with memory-mapped versions of those formats (e.g. to debug running
172
+ processes)
173
+
174
+
175
+ VirtualString:
176
+
177
+ A VirtualString is a String-like object: you can read and may rewrite slices of
178
+ it. It can be used as EncodedData#data, and thus allows virtualization
179
+ of most Metasm algorithms.
180
+ You cannot change a VirtualString length.
181
+ Taking a slice of a VirtualString will return either a String (for small sizes)
182
+ or another VirtualString (a 'window' into the other). You can force getting a
183
+ small VirtualString using the #dup(offset, length) method.
184
+ Any unimplemented method called on it is forwarded to a frozen String which is
185
+ a full copy of the VirtualString (should be avoided if possible, the underlying
186
+ string may be very big & slow to access).
187
+
188
+ There are currently 3 VirtualStrings implemented:
189
+ - VirtualFile, which loads a file by page-sized chunks on demand,
190
+ - WindowsRemoteString, which maps another process' virtual memory (uses the
191
+ windows debug api through WinDbgAPI)
192
+ - LinuxRemoteString, which maps another process' virtual memory (need ptrace
193
+ rights, memory reading is done using /proc/pid/mem)
194
+
195
+ The Win/Lin versions are quite powerful, and allow things like live process
196
+ disassembly/patching easily (using LoadedPE/LoadedELF as ExeFormat)
197
+
198
+
199
+ Debugging:
200
+
201
+ Metasm includes a few interfaces to allow live debugging.
202
+ The WinOS and LinOS classes offer access to the underlying OS processes (e.g.
203
+ OS.current.find_process('foobar') will retrieve a running process with foobar
204
+ in its filename ; then process.mem can be used to access its memory.)
205
+
206
+ The Windows and Linux debugging APIs (x86 only) have a basic ruby interface
207
+ (PTrace32, extended in samples/rubstop.rb ; and WinDBG, a simple mapping of the
208
+ windows debugging API) ; those will be more worked on/integrated in the future.
209
+
210
+ A linux console debugging interface is available in samples/lindebug.rb ; it
211
+ uses a SoftICE-like look and feel.
212
+ This interface can talk to a gdb-server through samples/gdbclient.rb ; use
213
+ [udp:]<host:port> as target.
214
+
215
+ The disassembler scripts allow live process interaction by using as target
216
+ 'live:<pid or part of filename>'.
217
+
218
+ A generic debugging interface is available, it is defined in metasm/os/main.rb
219
+ It may be accessed using the Metasm::OS.current.create_debugger('foo')
220
+
221
+ It can be viewed in action using the GUI and 'open live' target.
222
+
223
+
224
+ C Parser:
225
+
226
+ Metasm includes a hand-written C Parser.
227
+ It handles all the constructs I am aware of, except hex floats:
228
+ - static const L"bla"
229
+ - variable arguments
230
+ - incomplete types
231
+ - __attributes__(()), __declspec()
232
+ - #pragma once
233
+ - #pragma pack()
234
+ - C99 declarators - type bla = { [ 2 ... 14 ].toto = 28 };
235
+ - Nested functions
236
+ - __int8 etc native types
237
+ - Label addresses (&&label)
238
+ Also note that all those things are parsed, but most of them will fail to
239
+ compile on the Ia32 backend (the only one implemented so far.)
240
+
241
+ When you parse a C String using C::Parser.parse(text), you receive a Parser
242
+ object. It holds a #toplevel field, which is a C::Block, which holds #structs,
243
+ #symbols and #statements. The top-level functions are found in the #symbol hash
244
+ whose keys are the symbol names, associated to a C::Variable object holding
245
+ the functions. The function parameter/attributes are accessible through
246
+ func.type, and the code is in func.initializer, which is itself a C::Block.
247
+ Under it you'll find a tree-like structure of C::Statements (If, While, Asm,
248
+ CExpressions...)
249
+
250
+ A C::Parser may be #precompiled to transform it into a simplified version that
251
+ is easier to compile: typedefs are removed, control sequences are transformed
252
+ in if () goto ; etc.
253
+
254
+ To compile a C program, use PE/ELF.compile_c, that will create a C::Parser with
255
+ exe-specific macros defined (eg __PE__ or __ELF__).
256
+
257
+ The preferred way to create a C::Parser is to initialize it with a CPU and the
258
+ desired ExeFormat, so that it is
259
+ correctly initialized (eg type sizes: is long 4 or 8 bytes? etc) ; and
260
+ may define preprocessor macros needed to correctly parse standard headers.
261
+ Vendor-specific headers may need to use either #pragma prepare_visualstudio
262
+ (to parse the Microsoft Visual Studio headers) or prepare_gcc (for gcc), the
263
+ latter may be auto-detected (or may not).
264
+ Vendor headers tested are VS2003 (incl. DDK) and gcc4 ; ymmv.
265
+
266
+ Currently the CPU#compilation of a C code will generate an asm source (text),
267
+ which may then be parsed & assembled to binary code.
268
+
269
+ See ExeFormat#compile_c, and samples/exeencode.rb
270
+
data/TODO ADDED
@@ -0,0 +1,114 @@
1
+ List of TODO items, by section, in random order
2
+
3
+ Ia32
4
+ emu fpu
5
+ add all sse2 instrs
6
+ realmode
7
+
8
+ X86_64
9
+ decompiler
10
+
11
+ CPU
12
+ Sparc
13
+ Cell
14
+
15
+ Parser
16
+ Allow single-file multiplexer (C code + Asm + asm16bit + ...)
17
+ Fix the asm prepro comment issue: '; a /* b\n c ; */' should see 'c'
18
+
19
+ Assembler
20
+ Handle cpu pseudo-instrs (mips 'li' -> lui high + ori low)
21
+ SplitReloc? (for pseudo-instrs)
22
+ Ia32 GAS syntax
23
+ Make the autoimport depend on the target platform and not on the exeformat
24
+ Encode FPU constants
25
+
26
+ Disasm
27
+ DecodedData
28
+ Exe decoding generate decodeddata ?
29
+ Function-local namespace (esp+12 -> esp+var_42)
30
+ Fix thunk detection (thunk: mov ecx, 42 jmp [iat_thiscall] is not a thunk)
31
+ Test with ET_REL style exe
32
+ Store stuff out of mem (to handle big binaries)
33
+ Better :default usage
34
+ good on call eax, but not on <600k instrs> ret
35
+ use binary personality ? (uses call vs uses pushret..)
36
+ Improve backtrace -> patch di.instr.args exprs
37
+ path-specific backtracking ( foo: call a ; a: jmp retloc ; bar: call b ; b: jmp retloc ; retloc: ret ; call foo ; ret : last ret trackback should only reach a:)
38
+ Decode pseudo/macro-instrs (mips 'li')
39
+ Deoptimizer (instr reordering for readability)
40
+ Optimizer (deobfuscating)
41
+ Per-instr context (allows to mix cell/ppc, x86 32/16bits, arm/armthumb..)
42
+
43
+ Compiler
44
+ Optimizer
45
+ Register allocator
46
+ Instr reordering
47
+ Asm intrinsics
48
+ Asm inline
49
+ inline functions
50
+ Separate partial compilation + linking (src1.c -> obj1.o, src2.c -> obj2.o, obj1.o+obj2.o -> bin)
51
+ Make generic compiler from cpu.instr_binding ?
52
+ create a cpu.what_instr_has_binding(:a => (:a + :b)) => 'add a, b' ?
53
+ Shellcode compiler (exit() => mov eax, 1 int 80h inline)
54
+
55
+ Decompiler
56
+ Fix decompiling on loaded savefile
57
+ Rewrite cpu-specific to really dumb
58
+ Just translate di.binding to C
59
+ maybe w/ trivial var dependency check for unused regs, but beware :incomplete instrs deps
60
+ Check interdependency ('xadd')
61
+ Move frame pointer checks / stack var detection to C code
62
+ Update asm listing from info in C (stack vars, stack var names..)
63
+ Handle renaming/retyping register vars / aliases
64
+ Handle switch() / computed goto
65
+ Fix inline asm reg dependencies
66
+ Handle direct syscalls (mov eax, 1 int 80h => exit())
67
+ Autodecode structs
68
+ FPU
69
+ Handle/hide compiler-generated stuff (getip, stack cookie setup/check..)
70
+ Handle call 1f ; 1: pop eax
71
+ More user control (force/forbid register arg, return type, etc)
72
+
73
+ Debugger
74
+ OSX
75
+ Detour-style functionality to patch binary code (also static to patch exe files?)
76
+ Move constants in a data/ folder (ptrace reg numbers, syscalls, etc)
77
+ Generic remote process manip
78
+ create blank state
79
+ linux virtualallocex
80
+ pax-compatible code patch through mmap
81
+ Remote debugging (small standalone C client)
82
+ Support dbghelp.dll (ms symbol server info)
83
+ Support debuggee function call (gdb 'call')
84
+ Manipulate memory through C struct casts
85
+
86
+ ExeFormat
87
+ Handle minor editing without decode/reencode (eg patch ELF entrypoint)
88
+
89
+ ELF
90
+ test encoding openbsd binaries
91
+ handle symbol versions
92
+ LoadedELF.dump
93
+ Check relocation encoding (eg samples/dynamic_ruby with cpu.generate_PIC=false)
94
+
95
+ MachO
96
+
97
+ PE
98
+ resource editor ?
99
+ rc compiler ?
100
+ add simple accessor for resource stuff (manifest, icon, ...)
101
+
102
+ GUI
103
+ debugger
104
+ specialize widgets
105
+ show breakpoints
106
+ show jump direction from current flag values
107
+ have a console frontend
108
+ better graph positioning fallback
109
+ zoom font when zooming graph
110
+ copy/paste, selection
111
+ map (part of) the binary & debug it (map a PE on a linux host & run it)
112
+
113
+ Ruby
114
+ compile ruby AST to native optimized code
@@ -0,0 +1,146 @@
1
+ Metasm source code organisation
2
+ ===============================
3
+
4
+ The metasm source code takes advantage of the ruby language facilities,
5
+ which allows splitting the definition of a single class in multiple files.
6
+
7
+ Each file in the source tree holds code related to a particular feature of
8
+ the framework.
9
+
10
+ Directories
11
+ -----------
12
+
13
+ The top-level directories are :
14
+
15
+ * `doc/`: this documentation
16
+ * `metasm/`: the framework core
17
+ * `samples/`: a set of sample scripts showing various functionalities of the framework
18
+ * `tests/`: a few unit tests (too few..)
19
+ * `misc/`: misc ruby scripts, not directly related to metasm
20
+
21
+ The core
22
+ --------
23
+
24
+ The `metasm/` directory holds most of the code of the framework, along with the
25
+ main `metasm.rb` file in the top directory.
26
+
27
+ The top-level `metasm.rb` has code to load parts of the framework source on demand
28
+ in the ruby interpreter, which is implemented with ruby's <const_missing.txt>
29
+
30
+
31
+ Executable formats
32
+ ##################
33
+
34
+ The `exe_format/` subdirectory contains the implementations of the various
35
+ binary file formats supported in the framework.
36
+
37
+ Three files have a special meaning here:
38
+
39
+ * `main.rb`: it defines the <core/ExeFormat.txt> class
40
+ * `serialstruct.rb`: here you'll find the definitions of <core/SerialStruct.txt>
41
+ * `autoexe.rb`: the implementation of <core/AutoExe.txt>, which allows the recognition of arbitrary files from their binary signature.
42
+
43
+ The `main.rb` file is included in all other formats, as all file classes
44
+ are subclasses of `ExeFormat`.
45
+
46
+ The `serialstruct.rb` implements a helper class to ease the description of
47
+ binary structures, and generate parsing/encoding functions for those.
48
+
49
+ All other files implement a specific file format handler. The bigger files
50
+ (`ELF` and `PE/COFF`) are split between the parsing/encoding functions and
51
+ decoding/disassembly.
52
+
53
+
54
+ CPUs
55
+ ####
56
+
57
+ All supported architectures have a dedicated subdirectory, and a helper file
58
+ that will simply include all the arch-specific files.
59
+
60
+ All those files will contribute to add functions to the same class implementing
61
+ the CPU interface. Not all CPUs implement all those features. They are:
62
+
63
+ * `main.rb`: inner classes definitions (for registers etc), generic functions
64
+ * `opcodes.rb`: initializes the opcode list for the architecture
65
+ * `encode.rb`: methods to encode instructions
66
+ * `decode.rb`: methods to decode/emulate instructions
67
+ * `parse.rb`: methods to parse asm instructions from a source file
68
+ * `render.rb`: methods to output an instruction to a string
69
+ * `compile_c.rb`: the C compiler implementation
70
+ * `decompile.rb`: the arch-specific part of the generic decompiler
71
+ * `debug.rb`: arch-specific information used when debugging target of this architecture
72
+
73
+ In some cases the files are small enough to be all merged into the `main.rb` file.
74
+
75
+
76
+ Operating systems
77
+ #################
78
+
79
+ The `os/` subdirectory holds the code used to abstract an operating system.
80
+
81
+ The files here define an API allowing to enumerate running processes, and interact
82
+ with them in various ways. The <core/Debugger.txt> class and subclasses are
83
+ defined there.
84
+
85
+ Those files also hold the list of known functions and in which system libraries
86
+ they can be found (see <core/WindowsExports.txt> or <core/GNUExports.txt>), which
87
+ are used when linking executable files.
88
+
89
+
90
+ Graphical user-interface
91
+ ########################
92
+
93
+ The `gui/` subdirectory contains the code needed by the metasm graphical user-interfaces.
94
+
95
+ Currently those include the disassembler and the debugger (see the *samples* section).
96
+
97
+ Those GUI elements are implemented using a custom GUI abstraction, and reside in the
98
+ various `dasm_*.rb` and `debug.rb`.
99
+
100
+ The actual implementations of the GUI are found in:
101
+
102
+ * `win32.rb`: the native Win32 API backend
103
+ * `gtk.rb`: a Gtk2 backend, intended for unix platforms
104
+ * `qt.rb`: a Qt backend experiment
105
+
106
+ Please note that the Qt backend does not work *at all*.
107
+
108
+ The `gui.rb` file in the main directory is used to choose among the available GUI backend
109
+ the most appropriate for the current session.
110
+
111
+
112
+ Others
113
+ ######
114
+
115
+ The other files directly in the `metasm/` directory are either support files
116
+ (eg `encode.rb`, `parse.rb`) that hold generic functions to be used by
117
+ specific cpu/exeformat instances, or implement arch-agnostic features.
118
+ Those include:
119
+
120
+ * `preprocessor.rb`: the C/asm preprocessor/lexer
121
+ * `parse_c.rb`: this is the implementation of the C parser
122
+ * `compile_c.rb`: this is a C precompiler, it generates a very simplified C from a standard source
123
+ * `decompile.rb`: the generic decompiler code, it uses arch-specific functions defined in the arch folder
124
+ * `dynldr.rb`: this module is used when interacting directly with the host operating system through <core/DynLdr.txt>
125
+
126
+
127
+ The samples
128
+ -----------
129
+
130
+ The `samples/` directory contains a lot of small files that intend to be
131
+ examples of how to use the framework. It also holds experiments and
132
+ work-in-progress for features that may later be integrated into the main
133
+ framework.
134
+
135
+ The comment at the beginning of the file should be clear about the purpose
136
+ of the script, and the scripts are expected to be copy/pasted and tweaked
137
+ for the specific task needed by the user (that's you).
138
+
139
+ Some of those files however are full-featured applications:
140
+
141
+ * `exeencode.rb`: a shellcode compiler, with its `peencode.rb`, `elfencode.rb`, `machoencode.rb` counterparts
142
+ * `disassemble.rb`: a disassembler
143
+ * `disassemble-gui.rb`: the graphical disassembler / debugger
144
+
145
+ The `samples/dasm-plugins/` subdirectory holds various plugins for the disassembler.
146
+