metasm 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (192) hide show
  1. data/BUGS +11 -0
  2. data/CREDITS +17 -0
  3. data/README +270 -0
  4. data/TODO +114 -0
  5. data/doc/code_organisation.txt +146 -0
  6. data/doc/const_missing.txt +16 -0
  7. data/doc/core_classes.txt +75 -0
  8. data/doc/feature_list.txt +53 -0
  9. data/doc/index.txt +59 -0
  10. data/doc/install_notes.txt +170 -0
  11. data/doc/style.css +3 -0
  12. data/doc/use_cases.txt +18 -0
  13. data/lib/metasm.rb +80 -0
  14. data/lib/metasm/arm.rb +12 -0
  15. data/lib/metasm/arm/debug.rb +39 -0
  16. data/lib/metasm/arm/decode.rb +167 -0
  17. data/lib/metasm/arm/encode.rb +77 -0
  18. data/lib/metasm/arm/main.rb +75 -0
  19. data/lib/metasm/arm/opcodes.rb +177 -0
  20. data/lib/metasm/arm/parse.rb +130 -0
  21. data/lib/metasm/arm/render.rb +55 -0
  22. data/lib/metasm/compile_c.rb +1457 -0
  23. data/lib/metasm/dalvik.rb +8 -0
  24. data/lib/metasm/dalvik/decode.rb +196 -0
  25. data/lib/metasm/dalvik/main.rb +60 -0
  26. data/lib/metasm/dalvik/opcodes.rb +366 -0
  27. data/lib/metasm/decode.rb +213 -0
  28. data/lib/metasm/decompile.rb +2659 -0
  29. data/lib/metasm/disassemble.rb +2068 -0
  30. data/lib/metasm/disassemble_api.rb +1280 -0
  31. data/lib/metasm/dynldr.rb +1329 -0
  32. data/lib/metasm/encode.rb +333 -0
  33. data/lib/metasm/exe_format/a_out.rb +194 -0
  34. data/lib/metasm/exe_format/autoexe.rb +82 -0
  35. data/lib/metasm/exe_format/bflt.rb +189 -0
  36. data/lib/metasm/exe_format/coff.rb +455 -0
  37. data/lib/metasm/exe_format/coff_decode.rb +901 -0
  38. data/lib/metasm/exe_format/coff_encode.rb +1078 -0
  39. data/lib/metasm/exe_format/dex.rb +457 -0
  40. data/lib/metasm/exe_format/dol.rb +145 -0
  41. data/lib/metasm/exe_format/elf.rb +923 -0
  42. data/lib/metasm/exe_format/elf_decode.rb +979 -0
  43. data/lib/metasm/exe_format/elf_encode.rb +1375 -0
  44. data/lib/metasm/exe_format/macho.rb +827 -0
  45. data/lib/metasm/exe_format/main.rb +228 -0
  46. data/lib/metasm/exe_format/mz.rb +164 -0
  47. data/lib/metasm/exe_format/nds.rb +172 -0
  48. data/lib/metasm/exe_format/pe.rb +437 -0
  49. data/lib/metasm/exe_format/serialstruct.rb +246 -0
  50. data/lib/metasm/exe_format/shellcode.rb +114 -0
  51. data/lib/metasm/exe_format/xcoff.rb +167 -0
  52. data/lib/metasm/gui.rb +23 -0
  53. data/lib/metasm/gui/cstruct.rb +373 -0
  54. data/lib/metasm/gui/dasm_coverage.rb +199 -0
  55. data/lib/metasm/gui/dasm_decomp.rb +369 -0
  56. data/lib/metasm/gui/dasm_funcgraph.rb +103 -0
  57. data/lib/metasm/gui/dasm_graph.rb +1354 -0
  58. data/lib/metasm/gui/dasm_hex.rb +543 -0
  59. data/lib/metasm/gui/dasm_listing.rb +599 -0
  60. data/lib/metasm/gui/dasm_main.rb +906 -0
  61. data/lib/metasm/gui/dasm_opcodes.rb +291 -0
  62. data/lib/metasm/gui/debug.rb +1228 -0
  63. data/lib/metasm/gui/gtk.rb +884 -0
  64. data/lib/metasm/gui/qt.rb +495 -0
  65. data/lib/metasm/gui/win32.rb +3004 -0
  66. data/lib/metasm/gui/x11.rb +621 -0
  67. data/lib/metasm/ia32.rb +14 -0
  68. data/lib/metasm/ia32/compile_c.rb +1523 -0
  69. data/lib/metasm/ia32/debug.rb +193 -0
  70. data/lib/metasm/ia32/decode.rb +1167 -0
  71. data/lib/metasm/ia32/decompile.rb +564 -0
  72. data/lib/metasm/ia32/encode.rb +314 -0
  73. data/lib/metasm/ia32/main.rb +233 -0
  74. data/lib/metasm/ia32/opcodes.rb +872 -0
  75. data/lib/metasm/ia32/parse.rb +327 -0
  76. data/lib/metasm/ia32/render.rb +91 -0
  77. data/lib/metasm/main.rb +1193 -0
  78. data/lib/metasm/mips.rb +11 -0
  79. data/lib/metasm/mips/compile_c.rb +7 -0
  80. data/lib/metasm/mips/decode.rb +253 -0
  81. data/lib/metasm/mips/encode.rb +51 -0
  82. data/lib/metasm/mips/main.rb +72 -0
  83. data/lib/metasm/mips/opcodes.rb +443 -0
  84. data/lib/metasm/mips/parse.rb +51 -0
  85. data/lib/metasm/mips/render.rb +43 -0
  86. data/lib/metasm/os/gnu_exports.rb +270 -0
  87. data/lib/metasm/os/linux.rb +1112 -0
  88. data/lib/metasm/os/main.rb +1686 -0
  89. data/lib/metasm/os/remote.rb +527 -0
  90. data/lib/metasm/os/windows.rb +2027 -0
  91. data/lib/metasm/os/windows_exports.rb +745 -0
  92. data/lib/metasm/parse.rb +876 -0
  93. data/lib/metasm/parse_c.rb +3938 -0
  94. data/lib/metasm/pic16c/decode.rb +42 -0
  95. data/lib/metasm/pic16c/main.rb +17 -0
  96. data/lib/metasm/pic16c/opcodes.rb +68 -0
  97. data/lib/metasm/ppc.rb +11 -0
  98. data/lib/metasm/ppc/decode.rb +264 -0
  99. data/lib/metasm/ppc/decompile.rb +251 -0
  100. data/lib/metasm/ppc/encode.rb +51 -0
  101. data/lib/metasm/ppc/main.rb +129 -0
  102. data/lib/metasm/ppc/opcodes.rb +410 -0
  103. data/lib/metasm/ppc/parse.rb +52 -0
  104. data/lib/metasm/preprocessor.rb +1277 -0
  105. data/lib/metasm/render.rb +130 -0
  106. data/lib/metasm/sh4.rb +8 -0
  107. data/lib/metasm/sh4/decode.rb +336 -0
  108. data/lib/metasm/sh4/main.rb +292 -0
  109. data/lib/metasm/sh4/opcodes.rb +381 -0
  110. data/lib/metasm/x86_64.rb +12 -0
  111. data/lib/metasm/x86_64/compile_c.rb +1025 -0
  112. data/lib/metasm/x86_64/debug.rb +59 -0
  113. data/lib/metasm/x86_64/decode.rb +268 -0
  114. data/lib/metasm/x86_64/encode.rb +264 -0
  115. data/lib/metasm/x86_64/main.rb +135 -0
  116. data/lib/metasm/x86_64/opcodes.rb +118 -0
  117. data/lib/metasm/x86_64/parse.rb +68 -0
  118. data/misc/bottleneck.rb +61 -0
  119. data/misc/cheader-findpppath.rb +58 -0
  120. data/misc/hexdiff.rb +74 -0
  121. data/misc/hexdump.rb +55 -0
  122. data/misc/metasm-all.rb +13 -0
  123. data/misc/objdiff.rb +47 -0
  124. data/misc/objscan.rb +40 -0
  125. data/misc/pdfparse.rb +661 -0
  126. data/misc/ppc_pdf2oplist.rb +192 -0
  127. data/misc/tcp_proxy_hex.rb +84 -0
  128. data/misc/txt2html.rb +440 -0
  129. data/samples/a.out.rb +31 -0
  130. data/samples/asmsyntax.rb +77 -0
  131. data/samples/bindiff.rb +555 -0
  132. data/samples/compilation-steps.rb +49 -0
  133. data/samples/cparser_makestackoffset.rb +55 -0
  134. data/samples/dasm-backtrack.rb +38 -0
  135. data/samples/dasmnavig.rb +318 -0
  136. data/samples/dbg-apihook.rb +228 -0
  137. data/samples/dbghelp.rb +143 -0
  138. data/samples/disassemble-gui.rb +102 -0
  139. data/samples/disassemble.rb +133 -0
  140. data/samples/dump_upx.rb +95 -0
  141. data/samples/dynamic_ruby.rb +1929 -0
  142. data/samples/elf_list_needed.rb +46 -0
  143. data/samples/elf_listexports.rb +33 -0
  144. data/samples/elfencode.rb +25 -0
  145. data/samples/exeencode.rb +128 -0
  146. data/samples/factorize-headers-elfimports.rb +77 -0
  147. data/samples/factorize-headers-peimports.rb +109 -0
  148. data/samples/factorize-headers.rb +43 -0
  149. data/samples/gdbclient.rb +583 -0
  150. data/samples/generate_libsigs.rb +102 -0
  151. data/samples/hotfix_gtk_dbg.rb +59 -0
  152. data/samples/install_win_env.rb +78 -0
  153. data/samples/lindebug.rb +924 -0
  154. data/samples/linux_injectsyscall.rb +95 -0
  155. data/samples/machoencode.rb +31 -0
  156. data/samples/metasm-shell.rb +91 -0
  157. data/samples/pe-hook.rb +69 -0
  158. data/samples/pe-ia32-cpuid.rb +203 -0
  159. data/samples/pe-mips.rb +35 -0
  160. data/samples/pe-shutdown.rb +78 -0
  161. data/samples/pe-testrelocs.rb +51 -0
  162. data/samples/pe-testrsrc.rb +24 -0
  163. data/samples/pe_listexports.rb +31 -0
  164. data/samples/peencode.rb +19 -0
  165. data/samples/peldr.rb +494 -0
  166. data/samples/preprocess-flatten.rb +19 -0
  167. data/samples/r0trace.rb +308 -0
  168. data/samples/rubstop.rb +399 -0
  169. data/samples/scan_pt_gnu_stack.rb +54 -0
  170. data/samples/scanpeexports.rb +62 -0
  171. data/samples/shellcode-c.rb +40 -0
  172. data/samples/shellcode-dynlink.rb +146 -0
  173. data/samples/source.asm +34 -0
  174. data/samples/struct_offset.rb +47 -0
  175. data/samples/testpe.rb +32 -0
  176. data/samples/testraw.rb +45 -0
  177. data/samples/win32genloader.rb +132 -0
  178. data/samples/win32hooker-advanced.rb +169 -0
  179. data/samples/win32hooker.rb +96 -0
  180. data/samples/win32livedasm.rb +33 -0
  181. data/samples/win32remotescan.rb +133 -0
  182. data/samples/wintrace.rb +92 -0
  183. data/tests/all.rb +8 -0
  184. data/tests/dasm.rb +39 -0
  185. data/tests/dynldr.rb +35 -0
  186. data/tests/encodeddata.rb +132 -0
  187. data/tests/ia32.rb +82 -0
  188. data/tests/mips.rb +116 -0
  189. data/tests/parse_c.rb +239 -0
  190. data/tests/preprocessor.rb +269 -0
  191. data/tests/x86_64.rb +62 -0
  192. metadata +255 -0
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env ruby
2
+ # This file is part of Metasm, the Ruby assembly manipulation suite
3
+ # Copyright (C) 2006-2009 Yoann GUILLOT
4
+ #
5
+ # Licence is LGPL, see LICENCE in the top-level directory
6
+
7
+
8
+
9
class IO
	# Hex-dumps whatever can still be read from this stream.
	# The data is consumed in 512-byte chunks, each handed to String#hexdump
	# with the same +ctx+ hash so that state is carried from one chunk to the next.
	# While chunks are being dumped ctx[:noend] is set; it is removed again
	# before a final call on an empty string.
	def hexdump(ctx={})
		ctx[:noend] = true
		loop do
			chunk = read(512)
			break if chunk.nil? or chunk.empty?
			chunk.hexdump(ctx)
		end
		ctx.delete :noend
		''.hexdump(ctx)
	end
end
19
+
20
class String
	# Prints a hex dump of this string to standard output.
	#
	# +ctx+ is a state hash that is updated in place, so the same hash can be
	# passed to successive calls to dump a stream chunk by chunk:
	#  :fmt      list of columns to show: 'c' bytes, 'w' 16-bit words,
	#            'd' 32-bit words, 'a' printable ASCII (default c, d, a)
	#  :linelen  number of bytes per output line (default 16)
	#  :pos      offset printed in front of the next line (default 0)
	#  :noend    when set, the closing offset line is not printed
	#  :lastline / :lastdup  bookkeeping used to collapse repeated lines into '*'
	#
	# The string is always processed as raw bytes, whatever its encoding.
	# Exits the process quietly if the output pipe is closed (Errno::EPIPE).
	def hexdump(ctx={})
		fmt = ctx[:fmt] ||= ['c', 'd', 'a']
		ctx[:pos] ||= 0
		linelen = ctx[:linelen] ||= 16

		# column widths follow the configured line length instead of assuming 16
		bytecol  = 3*linelen - 1
		wordcol  = 5*((linelen+1)/2) - 1
		dwordcol = 9*((linelen+3)/4) - 1

		# work on bytes: a multibyte encoding would make the regexp below count
		# characters, and the printed offsets would drift
		raw = respond_to?(:force_encoding) ? dup.force_encoding('BINARY') : self

		raw.scan(/.{1,#{linelen}}/m) { |s|
			if s != ctx[:lastline]
				ctx[:lastdup] = false
				print '%04x ' % ctx[:pos]
				print s.unpack('C*').map { |b| '%02x' % b }.join(' ').ljust(bytecol) + ' ' if fmt.include? 'c'
				print s.unpack('v*').map { |b| '%04x' % b }.join(' ').ljust(wordcol) + ' ' if fmt.include? 'w'
				print s.unpack('L*').map { |b| '%08x' % b }.join(' ').ljust(dwordcol) + ' ' if fmt.include? 'd'
				# anything outside 0x20..0x7e is shown as a dot; done on byte values
				# so the result does not depend on string or source encodings
				print s.unpack('C*').map { |b| (0x20..0x7e).include?(b) ? b.chr : '.' }.join if fmt.include? 'a'
				puts
			elsif not ctx[:lastdup]
				# first repetition of the previous line: print a single marker
				ctx[:lastdup] = true
				puts '*'
			end
			ctx[:lastline] = s
			ctx[:pos] += s.length
		}
		puts '%04x' % ctx[:pos] if not ctx[:noend]
	rescue Errno::EPIPE
		exit
	end
end
46
+
47
# command-line entry point: hexdump [-C] [-W] [-D] [-A] [-a] [file]
#  -C bytes, -W 16-bit words, -D 32-bit words, -A ascii, -a bytes + dwords + ascii
# with no column flag at all, the -a layout is used
# with no file argument, standard input is dumped
if $0 == __FILE__
	fmt = []
	fmt << 'c' if ARGV.delete '-C'
	fmt << 'w' if ARGV.delete '-W'
	fmt << 'd' if ARGV.delete '-D'
	fmt << 'a' if ARGV.delete '-A'
	# always consume '-a' so it is never mistaken for the filename
	want_all = ARGV.delete '-a'
	# an empty array is truthy, so String#hexdump would not apply its own
	# default and nothing but offsets would be printed
	fmt = ['c', 'd', 'a'] if want_all or fmt.empty?

	if ARGV.first
		# block form closes the file once the dump is done
		File.open(ARGV.first, 'rb') { |fd| fd.hexdump(:fmt => fmt) }
	else
		$stdin.binmode
		$stdin.hexdump(:fmt => fmt)
	end
end
@@ -0,0 +1,13 @@
1
+ # This file is part of Metasm, the Ruby assembly manipulation suite
2
+ # Copyright (C) 2006-2009 Yoann GUILLOT
3
+ #
4
+ # Licence is LGPL, see LICENCE in the top-level directory
5
+
6
+ # this file loads all metasm files, to avoid using ruby autoload mechanism
7
+
8
+ require File.join(File.dirname(__FILE__), '..', 'metasm')
9
+
10
module Metasm
	# require every file named in the values of Const_autorequire right away
	# (Const_autorequire and Metasmdir are not defined in this file; presumably
	# they come from the 'metasm' file required above)
	Const_autorequire.values.flatten.each do |path|
		require File.join('metasm', path)
	end
	# drop the last load path entry when it is Metasmdir
	$:.pop if $:.last == Metasmdir
end
@@ -0,0 +1,47 @@
1
+ # This file is part of Metasm, the Ruby assembly manipulation suite
2
+ # Copyright (C) 2006-2009 Yoann GUILLOT
3
+ #
4
+ # Licence is LGPL, see LICENCE in the top-level directory
5
+
6
+
7
+ # computes the difference between two ruby objects
8
+ # walks accessors, arrays and hashes
9
+
10
# Returns a description of the differences between o1 and o2.
#
# * objects of different classes: the pair [o1, o2]
# * Arrays / Hashes: a Hash mapping "[index-or-key]" to the diff of each
#   element that differs
# * other objects: a Hash mapping ".accessor" to the diff of each attribute
#   that differs. Attributes are the public lowercase methods that also have
#   a setter and whose instance variable is set on o1 or o2
# * objects without such attributes: {} when o1 == o2, else [o1, o2]
#
# An empty Hash therefore means "no difference". Nested single-entry hashes
# are merged into their parent key, e.g. {"[:a][1]" => [2, 3]}.
def Object.diff(o1, o2)
	return [o1, o2] if o1.class != o2.class

	h = {}
	case o1
	when Array, Hash
		if o1.kind_of? Array
			keys = (0...[o1.length, o2.length].max).to_a
		else
			keys = o1.keys | o2.keys
		end
		keys.each { |k|
			d = diff(o1[k], o2[k])
			h["[#{k.inspect}]"] = d if not d.empty?
		}
	else
		# cache the accessor names per class in an instance variable of the
		# receiver: a class variable cannot be used from a method defined at
		# toplevel on recent rubies
		cache = (@diff_accessor_cache ||= {})
		names = cache[o1.class] ||= begin
			# method names are Symbols on ruby >= 1.9, work on Strings
			im = o1.class.public_instance_methods.map { |m| m.to_s }.grep(/^[a-z]/)
			(im & im.map { |m| m + '=' }).map { |m| m.chop }
		end
		# which instance variables are set depends on the objects, not on the
		# class, so this part must not be cached
		a = names.find_all { |m| o1.instance_variable_get('@'+m) or o2.instance_variable_get('@'+m) }
		if a.empty?
			return o1 == o2 ? h : [o1, o2]
		end
		a.each { |k|
			d = diff(o1.send(k), o2.send(k))
			h['.' + k] = d if not d.empty?
		}
	end

	# simplify tree
	h.keys.each { |k|
		if h[k].kind_of? Hash and h[k].length == 1
			v = h.delete k
			h[k + v.keys.first] = v.values.first
		end
	}

	h
end
@@ -0,0 +1,40 @@
1
+ # This file is part of Metasm, the Ruby assembly manipulation suite
2
+ # Copyright (C) 2006-2009 Yoann GUILLOT
3
+ #
4
+ # Licence is LGPL, see LICENCE in the top-level directory
5
+
6
+ # searches an object in the attributes of another
7
+ # anyobj.scan_for([obj]) => "anyobj.someattr[42]['blabla']"
8
+
9
class Object
	# Yields every direct child of self together with the path fragment that
	# leads to it:
	#  Array: each element, with "[index]"
	#  Hash: each value with "[key.inspect]", then each key with "(key)"
	#  anything else: each instance variable value, with ".name"
	def scan_iter
		case self
		when ::Array
			length.times { |i| yield self[i], "[#{i}]" }
		when ::Hash
			each { |k, v| yield v, "[#{k.inspect}]" ; yield k, "(key)" }
		else
			instance_variables.each { |i| yield instance_variable_get(i), ".#{i.to_s[1..-1]}" }
		end
	end

	# dumps to stdout the path to find some targets ( array of objects to match with == )
	# +path+ is the path of self, +done+ maps the object_id of the objects
	# already visited to the path where they were first seen (loop protection)
	# Integers and Symbols found while walking are skipped
	def scan_for(targets, path='', done={})
		# register the root under its path, like every other visited object
		done[object_id] = path if done.empty?
		if t = targets.find { |t_| self == t_ }
			puts "found #{t} at #{path}"
		end
		scan_iter { |v, p|
			case v
			# Integer rather than Fixnum: the latter no longer exists on recent rubies
			when ::Integer, ::Symbol; next
			end
			p = path+p
			if done[v.object_id]
				puts "loop #{p} -> #{done[v.object_id]}" if $VERBOSE
			else
				done[v.object_id] = p
				v.scan_for(targets, p, done)
			end
		}
	end
end
@@ -0,0 +1,661 @@
1
+ # This file is part of Metasm, the Ruby assembly manipulation suite
2
+ # Copyright (C) 2006-2009 Yoann GUILLOT
3
+ #
4
+ # Licence is LGPL, see LICENCE in the top-level directory
5
+ #
6
+ # parses a PDF file
7
+ # used by ppc_pdf2oplist
8
+ #
9
+
10
+
11
+ require 'zlib'
12
+
13
+ # a Virtual string backed by a file, which is read on-demand
14
# a Virtual string backed by a file, which is read on-demand
# only the small subset of the String interface defined below is available
class VString
	# sizes of the regions scanned by index and rindex
	SMALL_WINDOW = 1024
	LARGE_WINDOW = 1024*1024

	# creates a VString from a file
	# raises if no filename is given
	def self.read(fname)
		raise 'need a PDF filename' if not fname
		new File.open(fname, 'rb'), File.size(fname)
	end

	# fd is a seekable object responding to pos= and read, len the size of its content
	def initialize(fd, len)
		@fd = fd
		@len = len
	end

	def length; @len end

	# reads a part of the file
	#  vstr[start, len]  len bytes from offset start
	#  vstr[start]       one byte
	#  vstr[range]       the bytes between the range bounds
	# negative offsets count from the end, as with a String
	# returns nil for an out-of-bounds start; a read of one or more bytes
	# starting exactly at the end returns whatever fd.read returns there
	def [](start, len=nil)
		return if not start
		if start.kind_of? Range
			last = start.end
			last += @len if last < 0
			last -= 1 if start.exclude_end?
			start = start.begin
			start += @len if start < 0
			# turn the last index into a byte count
			len = last - start + 1
			len = 0 if len < 0
		else
			len ||= 1
			start += @len if start < 0
		end
		return nil if start < 0 or len < 0 or start > @len
		@fd.pos = start
		@fd.read len
	end

	# search on a small region (1k or 1M)
	# returns the offset of the first occurence of sub at or after off, or nil
	def index(sub, off=0)
		off += @len if off < 0
		[SMALL_WINDOW, LARGE_WINDOW].each { |span|
			chunk = self[off, span]
			# nothing to read there (out of bounds or end of file)
			return if not chunk
			if pos = chunk.index(sub)
				return pos + off
			end
			# the whole tail was already searched, a larger window cannot help
			break if chunk.length < span
		}
		nil
	end

	# search on a small region (1k or 1M)
	# returns the offset of the last occurence of sub in the region ending at
	# off, or nil; a negative off is relative to the end (-1 is the end itself)
	def rindex(sub, off=@len)
		off += 1 + @len if off < 0
		[SMALL_WINDOW, LARGE_WINDOW].map { |span| [off, span].min }.uniq.each { |span|
			chunk = self[off-span, span]
			next if not chunk
			if pos = chunk.rindex(sub)
				return pos + off - span
			end
		}
		nil
	end
end
61
+
62
+ # a PDF parser
63
# a PDF parser
# works on any object giving String-like access through [start, len], index,
# rindex and length (a String or a VString); @off is the current parse offset
class PDF
	attr_accessor :str, :off, :trailer, :hdr, :xrefs, :xoff

	# reads a filename as a PDF using VString
	def self.read(filename)
		new(VString.read(filename))
	end

	def initialize(str=nil)
		read str if str
	end

	# reads a string as a PDF, interprets basic information (header, trailer, xref table)
	# raises if the header line or the trailer cannot be found
	def read(str)
		@str = str
		@off = 0
		readhdr
		raise 'bad pdf: no trailer' unless @off = @str.rindex("trailer", @str.length)
		readtrailer
		self
	end

	# reads the first line of the document into @hdr, leaves @off on the end of line
	def readhdr
		start = @off
		eol = @str.index("\n", start)
		raise 'bad pdf: no header line' unless eol
		@hdr = @str[start, eol - start]
		@off = eol
		@hdr
	end

	# reads the pdf trailer
	# XXX the xref table referenced here may be the first of the file, so we suppose the last is just before the 'trailer' command..
	def readtrailer
		toff = @off
		readcmd
		@trailer = readhash
		raise 'bad pdf: no trailer dictionary' unless @trailer
		readcmd
		@xroff = readint
		raise 'bad pdf: no startxref offset' unless @xroff.kind_of? Integer
		@xoff = {}	# [gen] => { id => off }
		@xrefs = {}	# [gen] => { id => obj }
		@off = @xroff
		readcmd
		readxrtable
		off2 = @off
		if @off < toff and readcmd == 'trailer' and off = @str.rindex('xref', toff)
			@off = off
			readcmd
			readxrtable
			@off = off2
			readcmd
			@trailer.update readhash
		end
	end

	# reads xref subsections (first object number, count, then one 20-byte
	# entry per object) up to the 'trailer' keyword
	# records the offset of the in-use entries in @xoff[gen][id]
	def readxrtable
		@xoff ||= {}
		while @str[@off, 7] != 'trailer'
			objnr = readint
			objcnt = readint
			raise 'bad pdf: malformed xref table' unless objnr.kind_of? Integer and objcnt.kind_of? Integer
			@str[@off, 20*objcnt].to_s.scan(/(\d+) (\d+) (.)/) { |o, g, u|
				(@xoff[g.to_i] ||= {})[objnr] = o.to_i if u == 'n'
				objnr += 1
			}
			@off += 20*objcnt
			skipspc
		end
	end

	# reads a number at the current offset, skips the spaces that follow
	# returns an Integer, a Float if the number includes a '.', nil if there is no number
	def readint
		buf = ''
		loop do
			case c = @str[@off, 1]
			when '+', '-'; break if not buf.empty?
			when '.'; break if buf.include? '.'
			when '0'..'9'
			else break
			end
			buf << c
			@off += 1
		end
		return if buf.empty?
		skipspc
		buf.include?('.') ? buf.to_f : buf.to_i
	end

	# reads a literal string "(...)" or an hexadecimal string "<...>"
	# returns the decoded content, or nil if the current offset is on neither
	def readstr
		buf = ''
		case @str[@off, 1]
		when '('
			nest = 0
			loop do
				@off += 1
				case c = @str[@off, 1]
				when '('; nest += 1 ; buf << c
				when ')'; nest -= 1 ; break if nest < 0 ; buf << c
				when '\\'
					@off += 1
					case c = @str[@off, 1]
					when 'n'; buf << "\n"
					when 'r'; buf << "\r"
					when 't'; buf << "\t"
					when 'b'; buf << "\b"
					when 'f'; buf << "\f"
					when '0'..'7'
						# up to 3 octal digits
						if ('0'..'7').include?(cc = @str[@off+1, 1])
							@off += 1 ; c << cc
							if ('0'..'7').include?(cc = @str[@off+1, 1])
								@off += 1 ; c << cc
							end
						end
						# append one raw byte: appending the Integer itself would add
						# a (possibly multibyte) character on ruby >= 1.9
						buf << (c.to_i(8) & 0xff).chr
					when nil; break
					else buf << c
					end
				when nil; break
				else buf << c
				end
			end
		when '<'
			loop do
				@off += 1
				case c = @str[@off, 1]
				when '0'..'9', 'a'..'f', 'A'..'F'; buf << c
				when ' ', "\n", "\r", "\t"
				else break
				end
			end
			# a missing last digit is assumed to be 0
			buf << '0' if buf.length % 2 == 1
			buf = [buf].pack('H*')
		else return
		end
		@off += 1
		skipspc
		buf
	end

	# reads a name "/Foo", returns it without the slash and with #xx sequences
	# decoded, or nil if the current offset is not on a '/'
	def readname
		return if @str[@off, 1] != '/'
		buf = ''
		loop do
			@off += 1
			case c = @str[@off, 1]
			when '#'; buf << (@str[@off+1, 2].to_s.to_i(16) & 0xff).chr ; @off += 2
			when nil, /[\s\(\)\{\}<>\[\]\/]/; break
			else buf << c
			end
		end
		skipspc
		buf
	end

	# reads an array "[...]" of any objects, or returns nil
	def readarray
		return if @str[@off, 1] != '['
		buf = []
		@off += 1
		skipspc
		buf << readany until @str[@off, 1] == ']' or @off >= @str.length
		@off += 1
		skipspc
		buf
	end

	# reads a dictionary "<< /name value ... >>" as a Hash, or returns nil
	# entries whose value is null are dropped
	def readhash
		return if @str[@off, 2] != '<<'
		buf = {}
		@off += 2
		skipspc
		buf[readname] = readany until @str[@off, 2] == '>>' or @off >= @str.length
		buf.delete_if { |k, v| v == :null }
		@off += 2
		skipspc
		buf
	end

	# reads a bare word (up to the next space or delimiter), skips the spaces that follow
	def readcmd
		buf = ''
		loop do
			case c = @str[@off, 1]
			when nil, /[\s\(\)\{\}<>\[\]\/%]/; break
			else buf << c
			end
			@off += 1
		end
		skipspc
		buf
	end

	# stores the content of a stream in its dictionary, under the :data key
	# the data is inflated when the only filter is FlateDecode; with any
	# other filter a warning is printed and the data is stored as is
	def newstream(hash, data)
		f = [hash['Filter']].flatten.compact
		if f.length == 1 and f.first == 'FlateDecode'
			data = Zlib::Inflate.inflate(data)
		elsif f.length == 0
		else puts "stream filter #{f.inspect} unsupported"
		end
		hash[:data] = data
		hash
	end

	# an indirect reference ("id gen R")
	# any method it does not define is forwarded to the referenced object
	class Ref
		attr_accessor :gen, :id
		def initialize(pdf, gen, id)
			@pdf, @gen, @id = pdf, gen, id
		end

		def inspect
			"#<Ref @pdf=#{@pdf.object_id.to_s(16)} @gen=#@gen @id=#@id>"
		end

		def deref(depth=1)
			@pdf.deref(self, depth)
		end

		def method_missing(*a, &b)
			deref.send(*a, &b)
		end
	end

	# reads & returns any pdf object according to its 1st char (almost)
	# updates @xrefs if the object is indirect
	# raises on an unknown keyword or a missing endobj/endstream
	def readany
		case @str[@off, 1]
		when nil; return
		when '/'; readname
		when '+', '-'; readint
		when '0'..'9'
			i = readint
			if ('0'..'9').include?(@str[@off, 1])
				# two numbers in a row: maybe "id gen obj" or "id gen R"
				poff = @off
				g = readint
				case readcmd
				when 'obj'
					(@xrefs ||= {})[g] ||= {}
					i = @xrefs[g][i] ||= readany
					raise 'no endobj' if readcmd != 'endobj'
				when 'R'
					i = Ref.new(self, g, i)
				else @off = poff
				end
			end
			i
		when '['; readarray
		when '('; readstr
		when '<'
			if @str[@off+1, 1] == '<'
				h = readhash
				if @str[@off, 6] == 'stream' and i = @str.index("\n", @off)	# readcmd may eat spaces that are part of the stream
					l = h['Length'].to_i
					h = newstream(h, @str[i+1, l])
					@off = i+1+l
					skipspc
					raise 'no endstream' if readcmd != 'endstream'
				end
				h
			else readstr
			end
		else
			case c = readcmd
			when 'true', 'false', 'null'; c.to_sym
			when 'xref'
				readxrtable
				(@trailer ||= {}).update readhash if readcmd == 'trailer'
				readint if readcmd == 'startxref'
				:xref
			else raise "unknown cmd #{c.inspect}"
			end
		end
	end

	# skips spaces, newlines, tabs and %comments
	def skipspc
		while @off < @str.length
			case @str[@off, 1]
			when '%'; @off += 1 until @str[@off, 1] == "\n" or @off >= @str.length
			when ' ', "\n", "\r", "\t"
			else break
			end
			@off += 1
		end
	end

	# dereference references from the specified root, with the specified depth
	# a Ref is resolved by parsing the object at the offset recorded in @xoff
	# (results are cached in @xrefs); Hash and Array members are resolved
	# recursively, on a copy, while depth allows
	# raises 'unknown ref off' if the reference is not in the xref table
	def deref(obj, depth=1)
		if obj.kind_of? Ref
			cache = ((@xrefs ||= {})[obj.gen] ||= {})
			if not nobj = cache[obj.id]
				# lookup before touching @off, so that a failure leaves the parser state alone
				target = ((@xoff || {})[obj.gen] || {})[obj.id]
				raise 'unknown ref off' unless target
				pvoff = @off
				@off = target
				puts "deref #{obj.gen} #{obj.id} => #{@off.to_s(16)}" if $DEBUG
				nobj = cache[obj.id] = readany || :poil
				@off = pvoff
			end
			obj = nobj
		end
		depth -= 1
		case obj
		when Hash; obj = obj.dup ; obj.each { |k, v| obj[k] = deref(v, depth) }
		when Array; obj = obj.dup ; obj.each_with_index { |v, i| obj[i] = deref(v, depth) }
		end if depth > 0
		obj
	end

	# returns the :data field for a Hash or the concatenation of the :data fields of the children for an Array
	# returns an empty string when there is no content at all
	def page_data(ct)
		return '' if ct.nil?
		if deref(ct).kind_of? Array
			ct.map { |c| c[:data] }.join
		else
			ct[:data]
		end
	end

	# iterates over the PDF pages, yields each PSPage
	# the block is passed down explicitly: Proc.new without a block is not
	# supported anymore by recent rubies
	def each_page(h=@trailer['Root']['Pages'], &block)
		if h['Kids']
			h['Kids'].each { |k| each_page(k, &block) }
		else
			yield PSPage.new(page_data(h['Contents']))
		end
	end

	# returns the nr-th page of the pdf as a PSPage (the first page is 1)
	# returns nil if there is no such page
	def page(nr, ar=@trailer['Root']['Pages']['Kids'])
		ar.each { |kid|
			if kid['Count']
				# intermediate node holding Count pages
				return page(nr, kid['Kids']) if nr <= kid['Count']
				nr -= kid['Count']
			else
				nr -= 1
				return PSPage.new(page_data(kid['Contents'])) if nr <= 0
			end
		}
		nil
	end
end
384
+
385
+ # a PostScript page (lines with position information)
386
# a PostScript page (lines with position information)
# @lines is an array of paragraphs (one per BT text object), each an array of Line
class PSPage
	class Line
		CHARWIDTH=400
		attr_accessor :str, :x, :y, :fontx, :fonty
		# parses a postscript line, returns a line with individual characters at the right place (more or less)
		# str is the operand of Tj or TJ: "(text)" or "[(text)123(text)...]"
		# the numbers between the parenthesized strings of an array are
		# converted to a count of spaces to add (negative number) or remove
		# (positive number), CHARWIDTH units per space
		def initialize(str, x, y, fontx, fonty, charspc, wordspc)
			@raw, @charspc, @wordspc = str, charspc, wordspc
			@x, @y, @fontx, @fonty = x, y, fontx, fonty
			str = str[1...-1] if str[0, 1] == '['
			@str = ''
			bs = char = false
			spc = ''
			str.each_byte { |b|
				# each_byte yields Integers: compare 1-char Strings, not character literals
				c = b.chr
				if not bs
					# special chars (unescaped)
					case c
					when '('	# new word: honor word spacing
						spc = (-spc.to_f/CHARWIDTH).round
						if spc > 0 and not @str.empty?
							@str << (' '*spc)
						elsif spc < 0
							@str.chop! while @str[-1, 1] == ' ' and (spc += 1) <= 0
						end
						char = true
						next
					when '\\'	# bs character
						bs = true
						next
					when ')'	# end of word
						char = false
						spc = ''
						next
					end
				end

				# octal escape sequence: leave as is (actual char depends on font)
				@str << '\\' if bs and ('0'..'7').include?(c)

				bs = false
				if char
					# update current rendered string, honoring charspc
					@str << c
					@str << (' ' * (charspc*1000/CHARWIDTH).round) if charspc > 0.1
					@str << (' ' * (wordspc*1000/CHARWIDTH).round) if c == ' ' and wordspc > 0.1
				else
					# between strings: store word spacing integer
					spc << c
				end
			}
			puts "(#{x}, #{y} #{fontx}, #{fonty}) #@str" if $VERBOSE
		end
		def to_s ; @str end
	end

	attr_accessor :lines
	def initialize(str=nil)
		parse(str) if str
	end

	# remove lines not within ymin and ymax
	def clip_lines(ymin, ymax)
		ymin, ymax = ymax, ymin if ymin > ymax
		@lines.each { |la| la.delete_if { |l| l.y < ymin or l.y > ymax } }
		@lines.delete_if { |la| la.empty? }
		self
	end

	# parse a postscript string to an array of paragraph (itself an array of lines)
	# handles text strings and basic cursor position updates
	# operands are pushed on a stack until an operator consumes them; the
	# operators handled are BT ET Tj TJ Td TD T* Tc Tw Tm, others are ignored
	def parse(str)
		@lines = []
		curx = cury = 0
		fontx = fonty = 12
		charspc = wordspc = 0
		stack = []
		linelead = -12
		ps2tok(str) { |t|
			case t
			when Float, String; print "#{t} "
			else puts t
			end if $VERBOSE
			case t
			when Float, String; stack << t	# be postfix !
			when :BT; @lines << []	# begin text
			when :ET	# end text
			when :Tj, :TJ	# print line
				txt = stack.pop
				next if not txt.kind_of? String
				# text shown outside of a BT/ET pair: open a paragraph for it
				@lines << [] if @lines.empty?
				@lines.last << Line.new(txt, curx, cury, fontx, fonty, charspc, wordspc)
			when :Td, :TD	# move cursor
				next if stack.length < 2
				linelead = stack.last*fonty if t == :TD
				cury += stack.pop*fonty
				curx += stack.pop*fontx
			when :'T*'	# new line
				cury += linelead
			when :Tc	# character spacing
				# with a large spacing, a string can hold a single character per word:
				#3.17731 Tc 9 0 0 9 343.41 653.84998 Tm
				#[(3T)3202(O)729(R)3179(A)-3689(S)3178(I)]TJ
				# => 3 TO RA SI
				charspc = stack.pop
			when :Tw
				wordspc = stack.pop
			when :Tm	# set transform matrix (scale, rotate, translate)
				params = Array.new(6) { stack.pop }.reverse
				next if params[0] == 0.0	# rotated text
				fontx, _, _, fonty, curx, cury = params
			end
		}
	end

	# yields PS tokens: floats, commands, and strings
	# numbers are yielded as Float, (strings) and [arrays] as String with
	# their delimiters, words as Symbol; anything else is skipped
	def ps2tok(str)
		loop do
			case str
			when ''; break
			when /\A[+-]?(?:\d+(?:\.\d*)?|\.\d+)/; tok = $&.to_f
			when /\A\((?:\\.|[^\\)])*\)/m; tok = $&
			when /\A\[(?:[^\](]*\((?:\\.|[^\\)])*\))*[^\]]*\]/m; tok = $&
			when /\A[a-zA-Z0-9_*]+/; tok = $&.to_sym rescue nil
			when /\A\S+/, /\A\s+/
			end
			str = str[$&.length..-1]
			yield tok if tok
		end
	end

	# renders the lines, according to the layout (almost ;) )
	def to_s
		# a text object without any text has no position, ignore it
		paras = (@lines || []).reject { |la| la.empty? }
		mx = paras.flatten.map { |l| l.x }.min
		py = nil
		strs = ['']
		paras.sort_by { |la| -la.map { |l| l.y }.max.to_i }.each { |la|
			y = la.map { |l| l.y }.max
			strs.concat ['']*((py-y)/12) if py and py > y
			la.sort_by { |l| [-l.y, l.x] }.each { |l|
				# 9 == base font size
				strs << '' if y > l.y+l.fonty*0.9 or strs.last.length*1000/Line::CHARWIDTH/9 > l.x-mx
				strs[-1] = strs.last.ljust((l.x-mx)*1000/Line::CHARWIDTH/9-1) << ' ' << l.str
				y = l.y
			}
			py = y if not py or py > y
		}
		strs.join("\n")
	end
end
532
+
533
# command-line entry point: pdfparse <file.pdf> [page number ...]
# prints the requested pages, or every page when no number is given
# with $VERBOSE set, the document Info dictionary is printed first
if __FILE__ == $0
	require 'pp'
	begin
		pdf = PDF.read ARGV.shift

		if $VERBOSE
			puts 'Info: '
			pp pdf.deref(pdf.trailer['Info'])
			puts
		end

		if ARGV.empty?
			puts 'Pages: '
			pagecnt = 0
			pdf.each_page do |page|
				pagecnt += 1
				puts " ------- p.#{pagecnt} ---------", page
			end
		else
			ARGV.each do |pagenr|
				puts pdf.page(pagenr.to_i)
			end
		end
	rescue
		# report the parse offset reached (if the PDF object exists), the error and the start of its backtrace
		puts "at #{pdf.off.to_s(16) if pdf}", $!, $!.backtrace[0, 24]
	end
end
560
+
561
+ __END__
562
+ PostScript text formatting, shamelessly ripped from the web (http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/)
563
+
564
+ Object 3, which contains the contents of page one of our document, is worth commenting on since it shows how text streams are used in PDF. The object looks like:
565
+
566
+ 3 0 obj
567
+ <<
568
+ /Length 168
569
+ >>
570
+ stream
571
+ BT
572
+ /F4 1 Tf
573
+ 12 0 0 12 50.64 731.52 Tm
574
+ 0 0 0 rg
575
+ BX /GS2 gs EX
576
+ 0 Tc
577
+ 0 Tw
578
+ @charspc = charspc
579
+ [(This is 12-point )10(T)41(imes. )
580
+ 18(This sentence will appear near
581
+ the top of page one.)]TJ
582
+ ET
583
+ endstream
584
+ endobj
585
+
586
+ The stream object (which is 168 bytes long) is bracketed by BT and ET operators, for Begin Text and End Text. The Tf command selects our font and its size in user-space units, which is given as 1. "But aren't we using 12-point type?" you may be wondering. Yes, we are. That's specified in the next line, ending in Tm (which is the set-text-matrix operator). For space reasons, we won't say much about coordinate system transformations and matrices here, but if you're familiar with the use of matrices in PostScript, the same rules apply in PDF. A transform matrix is given by an array of six numbers, the first and fourth of which determine scaling in x and y, respectively. We see in our text matrix, the scaling factor is 12. That means we will use 12-point type. The last two numbers in the matrix (50.64 and 731.52) specify a translation, in user-space units. The effect of the translation is to put our text approximately 10.1 inches high on the page, with a left margin of 0.7 inch.
587
+
588
+ The line ending with rg sets our ink color to an RGB value of 0 0 0, or black. The BX operator says that we are beginning a section that allows undefined operators. In this section, we apply the gs operator (which sets parameters in the extended graphics state), using /GS2 as our EGS specifications. The EX operator ends the section allowing undefined operators. In essence, we're saying "Any reading application that understands what's in this special section can execute the instructions contained there, but if you don't understand the instructions, just go on." The reason this section has to be handled this way is that extended graphics state instructions often contain device-dependent instructions. The lack of generality means we should bracket those instructions with BX/EX.
589
+
590
+ The Tc and Tw operators are for setting character spacing and word spacing, respectively.
591
+
592
+ Finally, we come to the text that will be displayed on our page. Oddly enough, it's specified in an array of text snippets interspersed with integers, such as:
593
+
594
+ (This is 12-point )10(T)41(imes. )
595
+
596
+ The number 10 represents a kerning value, in thousandths of an em. (An em is a typographical unit of measurement equal to the size of the font.) This number is subtracted from the 'x' coordinate of the letter(s) that follow, displacing the text to the left. The capital 'T' is displaced 10 units to the left, while "imes. " is displaced 41 units. The TJ at the end of the array is the operator for "show text, allowing individual character spacing."
597
+
598
+ Finally, ET closes off the text block, and endstream closes off the stream.
599
+
600
+ b closepath, fill, and stroke path.
601
+ B fill and stroke path.
602
+ b* closepath, eofill, and stroke path.
603
+ B* eofill and stroke path.
604
+ BI begin image.
605
+ BMC begin marked content.
606
+ BT begin text object.
607
+ BX begin section allowing undefined operators.
608
+ c curveto.
609
+ cm concat. Concatenates the matrix to the current transform.
610
+ cs setcolorspace for fill.
611
+ CS setcolorspace for stroke.
612
+ d setdash.
613
+ Do execute the named XObject.
614
+ DP mark a place in the content stream, with a dictionary.
615
+ EI end image.
616
+ EMC end marked content.
617
+ ET end text object.
618
+ EX end section that allows undefined operators.
619
+ f fill path.
620
+ f* eofill Even/odd fill path.
621
+ g setgray (fill).
622
+ G setgray (stroke).
623
+ gs set parameters in the extended graphics state.
624
+ h closepath.
625
+ i setflat.
626
+ ID begin image data.
627
+ j setlinejoin.
628
+ J setlinecap.
629
+ k setcmykcolor (fill).
630
+ K setcmykcolor (stroke).
631
+ l lineto.
632
+ m moveto.
633
+ M setmiterlimit.
634
+ n end path without fill or stroke.
635
+ q save graphics state.
636
+ Q restore graphics state.
637
+ re rectangle.
638
+ rg setrgbcolor (fill).
639
+ RG setrgbcolor (stroke).
640
+ s closepath and stroke path.
641
+ S stroke path.
642
+ sc setcolor (fill).
643
+ SC setcolor (stroke).
644
+ sh shfill (shaded fill).
645
+ Tc set character spacing.
646
+ Td move text current point.
647
+ TD move text current point and set leading.
648
+ Tf set font name and size.
649
+ Tj show text.
650
+ TJ show text, allowing individual character positioning.
651
+ TL set leading.
652
+ Tm set text matrix.
653
+ Tr set text rendering mode.
654
+ Ts set super/subscripting text rise.
655
+ Tw set word spacing.
656
+ Tz set horizontal scaling.
657
+ T* move to start of next line.
658
+ v curveto.
659
+ w setlinewidth.
660
+ W clip.
661
+ y curveto.