regular_expression 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,154 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RegularExpression
4
+ # The CFG is a directed graph of extended basic blocks of bytecode
5
+ # instructions. This module has objects to represent the EBB, a graph object
6
+ # which contains a set of EBB, and a builder that creates a CFG from a
7
+ # compiled bytecode object.
8
+ module CFG
9
# Builds a control-flow graph from a compiled bytecode object.
#
# compiled - an object responding to #insns (the instructions, indexable by
#            address) and #labels (a hash of label name => address).
#
# Returns a Graph. Its blocks are ordered by first appearance of their start
# address, so the block starting at address 0 comes first.
#
# Raises RuntimeError when an instruction is not one of the classes handled
# below (this includes running past the last instruction, where the
# instruction is nil).
def self.build(compiled)
  # Each label in the compiled bytecode starts a block, as does the first
  # instruction.
  all_blocks = { start: 0 }.merge(compiled.labels)
  all_block_addresses = all_blocks.values

  # We may need to invent labels for addresses that have none, so keep our
  # own copy of the label map together with its reverse.
  all_labels = compiled.labels.dup
  all_labels_reverse = all_labels.invert

  # The blocks found so far, indexed by their start address.
  blocks = {}

  all_blocks.each do |name, start_n|
    # The instructions in this block, and the labels it exits to.
    block_insns = []
    block_exits = Set.new

    insn_n = start_n

    loop do
      # Reaching the start of another block ends this one, because an EBB
      # has a single entry point. Fall through with an explicit jump.
      if insn_n != start_n && all_block_addresses.include?(insn_n)
        target = all_labels_reverse[insn_n]
        unless target
          target = :"extra#{insn_n}"
          all_labels[target] = insn_n
          all_labels_reverse[insn_n] = target
        end
        block_insns.push(Bytecode::Insns::Jump.new(target))
        block_exits.add(target)
        break
      end

      insn = compiled.insns[insn_n]
      block_insns.push(insn)

      # Record where this instruction can transfer control to, and decide
      # whether the block continues with the following instruction.
      case insn
      when Bytecode::Insns::PushIndex, Bytecode::Insns::PopIndex
        insn_n += 1
      when Bytecode::Insns::GuardBegin, Bytecode::Insns::GuardEnd
        block_exits.add(insn.guarded)
        insn_n += 1
      when Bytecode::Insns::JumpAny, Bytecode::Insns::JumpValuesInvert,
           Bytecode::Insns::JumpRange, Bytecode::Insns::JumpRangeInvert,
           Bytecode::Insns::JumpValue
        block_exits.add(insn.target)
        insn_n += 1
      when Bytecode::Insns::Jump
        block_exits.add(insn.target)
        break
      when Bytecode::Insns::Match, Bytecode::Insns::Fail
        break
      else
        # Say what went wrong instead of raising an anonymous error.
        raise "unexpected instruction #{insn.inspect} at address #{insn_n} " \
              "while building block #{name.inspect}"
      end
    end

    blocks[start_n] = ExtendedBasicBlock.new(name, block_insns, block_exits.to_a)
  end

  # Map every exit label to the block that starts at that label's address.
  exit_map = blocks.each_value.with_object({}) do |block, map|
    block.exits.each { |label| map[label] ||= blocks[all_labels[label]] }
  end

  Graph.new(blocks.values, exit_map)
end
87
+
88
# Renders a CFG through Graphviz and returns its DOT source.
#
# cfg    - an object responding to #to_dot(graph), which adds its nodes and
#          edges to the given Graphviz graph.
# path   - where the rendered image is written (default: "build/cfg.svg").
# format - the output format handed to Graphviz (default: "svg").
#
# Returns whatever the Graphviz graph's #to_dot returns.
def self.to_dot(cfg, path: "build/cfg.svg", format: "svg")
  graph = Graphviz::Graph.new
  cfg.to_dot(graph)

  Graphviz.output(graph, path: path, format: format)
  graph.to_dot
end
95
+
96
# An Extended Basic Block (EBB): a straight-line run of instructions that is
# entered only at its first instruction and may leave through any number of
# exits (possibly none).
class ExtendedBasicBlock
  attr_reader :name, :insns, :exits

  # name  - the label identifying this block.
  # insns - the instructions in the block, in order.
  # exits - the labels this block can transfer control to.
  def initialize(name, insns, exits)
    @name, @insns, @exits = name, insns, exits
  end

  # Writes a textual listing of the block to +io+: the block name, then each
  # instruction, then each exit label with the name of the block it maps to.
  #
  # exit_map - a hash of exit label => block (anything responding to #name).
  # io       - the stream written to (default: $stdout).
  def dump(exit_map, io: $stdout)
    io.puts("#{name}:")

    insns.each do |insn|
      io.puts(" #{insn}")
    end

    exits.each do |label|
      successor = exit_map[label]
      io.puts(" #{label} -> #{successor.name}")
    end
  end
end
113
+
114
# A Graph holds the EBBs of a CFG together with the map that resolves exit
# labels to the blocks they lead to.
class Graph
  attr_reader :blocks, :exit_map

  # blocks   - the EBBs in the graph.
  # exit_map - a hash of exit label => block.
  def initialize(blocks, exit_map)
    @blocks, @exit_map = blocks, exit_map
  end

  # The entry block, which is the first block in the list.
  def start
    blocks.first
  end

  # Returns the textual listing of every block as a single string.
  def dump
    listing = StringIO.new
    blocks.each do |block|
      block.dump(exit_map, io: listing)
    end
    listing.string
  end

  # Adds one box-shaped node per block to +graph+, labelled with the block
  # name and its instructions, then connects every block to each distinct
  # successor reachable through its exits.
  def to_dot(graph)
    nodes = {}

    blocks.each do |block|
      lines = ["#{block.name}:"]
      lines.concat(block.insns.map { |insn| " #{insn}" })

      nodes[block] = graph.add_node(block.object_id, label: lines.join($/), labeljust: "l", shape: "box")
    end

    blocks.each do |block|
      origin = nodes[block]
      targets = block.exits.map { |label| nodes[exit_map[label]] }
      targets.uniq.each { |target| origin.connect(target) }
    end
  end
end
153
+ end
154
+ end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RegularExpression
4
+ module Compiler
5
+ module Ruby
6
# Holds Ruby source text produced by the code generator.
class Compiled
  attr_reader :source

  # source - a string of Ruby code.
  def initialize(source)
    @source = source
  end

  # Evaluates the stored source and returns the resulting value. The source
  # is executed as-is, so it must only ever come from the code generator.
  def to_proc
    eval(source) # rubocop:disable Security/Eval
  end
end
17
+
18
# Generate Ruby code for a CFG. This looks just like the interpreter, but
# abstracted in time one level!
#
# cfg - a graph responding to #start, #blocks and #exit_map, whose blocks
#       respond to #name and #insns.
#
# Returns a Compiled wrapping the source of a lambda that takes a string and
# returns true when the pattern matches at some start index, false otherwise.
#
# Raises RuntimeError when a block contains an instruction that is not
# handled below.
# rubocop:disable Layout/LineLength
def self.compile(cfg)
  ruby_src = []

  # Emits an unconditional transfer to the block that +label+ resolves to.
  goto = lambda do |label|
    ruby_src.push " block = #{cfg.exit_map[label].name.inspect}"
    ruby_src.push " next"
  end

  # Emits "if a character is available (and +condition+ holds), consume it
  # and transfer to the block behind +label+".
  consume_and_goto = lambda do |label, condition = nil|
    ruby_src.push " if #{['string_n < string.size', condition].compact.join(' && ')}"
    ruby_src.push " string_n += 1"
    goto.call(label)
    ruby_src.push " end"
  end

  ruby_src.push "-> (string) {"
  ruby_src.push " start_n = 0"
  ruby_src.push " stack = []"
  ruby_src.push " while start_n <= string.size"
  ruby_src.push " string_n = start_n"
  ruby_src.push " block = #{cfg.start.name.inspect}"
  ruby_src.push " loop do"
  ruby_src.push " case block"

  cfg.blocks.each do |block|
    ruby_src.push " when #{block.name.inspect}"

    block.insns.each do |insn|
      case insn
      when Bytecode::Insns::PushIndex
        ruby_src.push " stack << string_n"
      when Bytecode::Insns::PopIndex
        ruby_src.push " string_n = stack.pop"
      when Bytecode::Insns::GuardBegin
        ruby_src.push " return false if start_n != 0"
        # A satisfied guard must continue at the guarded block, as the x86
        # backend does; without this the code fell through to the next
        # instruction in the block.
        # NOTE(review): the x86 backend tests the current string index here
        # rather than the start index - confirm which one is intended.
        goto.call(insn.guarded)
      when Bytecode::Insns::GuardEnd
        ruby_src.push " if string_n == string.size"
        goto.call(insn.guarded)
        ruby_src.push " end"
      when Bytecode::Insns::JumpAny
        consume_and_goto.call(insn.target)
      when Bytecode::Insns::JumpValue
        consume_and_goto.call(insn.target, "string[string_n] == #{insn.char.inspect}")
      when Bytecode::Insns::JumpValuesInvert
        consume_and_goto.call(insn.target, "!#{insn.chars.inspect}.include?(string[string_n])")
      when Bytecode::Insns::JumpRange
        consume_and_goto.call(insn.target, "string[string_n] >= #{insn.left.inspect} && string[string_n] <= #{insn.right.inspect}")
      when Bytecode::Insns::JumpRangeInvert
        consume_and_goto.call(insn.target, "(string[string_n] < #{insn.left.inspect} || string[string_n] > #{insn.right.inspect})")
      when Bytecode::Insns::Jump
        goto.call(insn.target)
      when Bytecode::Insns::Match
        ruby_src.push " return true"
      when Bytecode::Insns::Fail
        # Give up on this start index and retry from the next one.
        ruby_src.push " start_n += 1"
        ruby_src.push " break"
      else
        raise "unexpected instruction #{insn.inspect} in block #{block.name.inspect}"
      end
    end
  end

  ruby_src.push " end"
  ruby_src.push " end"
  ruby_src.push " end"
  ruby_src.push " false"
  ruby_src.push "}"
  ruby_src.push ""

  Compiled.new(ruby_src.join("\n"))
end
101
+ # rubocop:enable Layout/LineLength
102
+ end
103
+ end
104
+ end
@@ -0,0 +1,281 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RegularExpression
4
+ module Compiler
5
+ module X86
6
# Holds the buffer of machine code produced by the code generator.
class Compiled
  attr_reader :buffer

  # buffer - the buffer the machine code was written into.
  def initialize(buffer)
    @buffer = buffer
  end

  # Returns a listing of the generated code, one line per instruction in the
  # form "0x<address>:\t<mnemonic>\t<operands>", as produced by Crabstone.
  def disasm
    listing = StringIO.new
    disassembler = Crabstone::Disassembler.new(Crabstone::ARCH_X86, Crabstone::MODE_64)

    machine_code = buffer.memory.to_s(buffer.pos)
    base_address = buffer.memory.to_i

    disassembler.disasm(machine_code, base_address).each do |insn|
      listing.printf(
        "0x%<address>x:\t%<instruction>s\t%<details>s\n",
        address: insn.address,
        instruction: insn.mnemonic,
        details: insn.op_str
      )
    end

    listing.string
  end

  # Wraps the generated code in a lambda taking a string. The native function
  # is called with the string and its length; a result of length + 1 is the
  # "no match" sentinel and becomes nil, any other result is returned as-is.
  #
  # NOTE(review): String#length counts characters, not bytes - confirm this
  # is what the native code expects for multibyte strings.
  def to_proc
    argument_types = [Fiddle::TYPE_VOIDP, Fiddle::TYPE_SIZE_T]
    native = buffer.to_function(argument_types, Fiddle::TYPE_SIZE_T)

    lambda do |string|
      result = native.call(string, string.length)
      result == string.length + 1 ? nil : result
    end
  end
end
38
+
39
# Generate native code for a CFG. This mirrors the Ruby generator one level
# further down: instead of emitting Ruby source for each instruction, it
# assembles x86-64 machine code for it through Fisk.
#
# cfg - a graph responding to #blocks and #exit_map, whose blocks respond to
#       #name and #insns.
#
# Returns a Compiled wrapping the buffer the code was assembled into. The
# generated function leaves the index at which the match began in rax, or the
# string length + 1 when it gives up.
def self.compile(cfg)
  fisk = Fisk.new
  buffer = Fisk::Helpers.jitbuffer(1024)

  fisk.asm(buffer) do
    # Local names for the registers, so the code below reads in terms of what
    # each register is used for.
    return_value = rax      # the function's return value
    string_index = rcx      # index in the string currently being examined
    match_index = rdx       # index at which the current attempt started
    stack_pointer = rsp     # the stack pointer
    frame_pointer = rbp     # the base of the stack frame
    string_length = rsi     # second argument: the length of the string
    string_pointer = rdi    # first argument: pointer to the string
    character_buffer = r8   # the value most recently read from the string

    # The assembler label of the block that an exit label resolves to.
    block_label = ->(exit_name) { label(cfg.exit_map[exit_name].name) }

    # Skip to +no_match+ when there is nothing left in the string to read.
    require_character = lambda do |no_match|
      cmp string_index, string_length
      je label(no_match)
    end

    # Read from string_pointer + string_index into the character buffer.
    # NOTE(review): this is a 64-bit load whose result is then compared
    # against 8-bit immediates - confirm that is intended for one character.
    load_character = lambda do
      mov character_buffer, string_pointer
      add character_buffer, string_index
      mov character_buffer, m64(character_buffer)
    end

    # Consume the current character and continue at the target block.
    consume_and_jump = lambda do |exit_name|
      inc string_index
      jmp block_label.call(exit_name)
    end

    # Restore the caller's stack and frame pointers, then return.
    leave_function = lambda do
      mov stack_pointer, frame_pointer
      pop frame_pointer
      ret
    end

    # Set up the frame pointer so the stack can be cleared on the way out.
    push frame_pointer
    mov frame_pointer, stack_pointer

    # Start matching at index 0.
    xor match_index, match_index

    # Head of the outer loop: give up once the start index has moved past the
    # end of the string, otherwise begin reading at the start index.
    make_label :start_loop_head
    cmp match_index, string_length
    jg label(:exit)
    mov string_index, match_index

    cfg.blocks.each do |block|
      # Every block gets a label so that other blocks can jump to it.
      make_label block.name

      block.insns.each do |insn|
        case insn
        when Bytecode::Insns::PushIndex
          push string_index
        when Bytecode::Insns::PopIndex
          pop string_index
        when Bytecode::Insns::GuardBegin
          # Only passes at index 0; otherwise the whole match is abandoned.
          cmp string_index, imm8(0)
          jne label(:exit)
          jmp block_label.call(insn.guarded)
        when Bytecode::Insns::GuardEnd
          cmp string_index, string_length
          je block_label.call(insn.guarded)
        when Bytecode::Insns::JumpAny
          skip = :"no_match_#{insn.object_id}"

          require_character.call(skip)
          consume_and_jump.call(insn.target)

          make_label skip
        when Bytecode::Insns::JumpValue
          skip = :"no_match_#{insn.object_id}"

          require_character.call(skip)
          load_character.call

          # Carry on with the next instruction unless the value is equal.
          cmp character_buffer, imm8(insn.char.ord)
          jne label(skip)

          consume_and_jump.call(insn.target)

          make_label skip
        when Bytecode::Insns::JumpValuesInvert
          skip = :"no_match_#{insn.object_id}"

          require_character.call(skip)
          load_character.call

          # Carry on with the next instruction if any listed value is equal.
          insn.chars.each do |value|
            cmp character_buffer, imm8(value.ord)
            je label(skip)
          end

          consume_and_jump.call(insn.target)

          make_label skip
        when Bytecode::Insns::JumpRange
          skip = :"no_match_#{insn.object_id}"

          require_character.call(skip)
          load_character.call

          # Below the left bound: outside the range.
          cmp character_buffer, imm8(insn.left.ord)
          jl label(skip)

          # Above the right bound: outside the range.
          cmp character_buffer, imm8(insn.right.ord)
          jg label(skip)

          consume_and_jump.call(insn.target)

          make_label skip
        when Bytecode::Insns::JumpRangeInvert
          skip = :"no_match_#{insn.object_id}"
          accept = :"match_#{insn.object_id}"

          require_character.call(skip)
          load_character.call

          # Below the left bound: outside the range, so this one succeeds.
          cmp character_buffer, imm8(insn.left.ord)
          jl label(accept)

          # At or below the right bound: inside the range, so move on.
          cmp character_buffer, imm8(insn.right.ord)
          jle label(skip)

          make_label accept
          consume_and_jump.call(insn.target)

          make_label skip
        when Bytecode::Insns::Jump
          jmp block_label.call(insn.target)
        when Bytecode::Insns::Match
          # Success: return the index at which this match began.
          mov return_value, match_index
          leave_function.call
        when Bytecode::Insns::Fail
          # Retry from the next start index.
          inc match_index
          jmp label(:start_loop_head)
        else
          raise
        end
      end
    end

    # Every start index has failed: return the string length + 1 so that the
    # caller can tell the match failed.
    make_label :exit
    mov return_value, string_length
    inc return_value

    leave_function.call
  end

  Compiled.new(buffer)
end
279
+ end
280
+ end
281
+ end