regular_expression 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,154 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RegularExpression
4
+ # The CFG is a directed graph of extended basic blocks of bytecode
5
+ # instructions. This module has objects to represent the EBB, a graph object
6
+ # which contains a set of EBBs, and a builder that creates a CFG from a
7
+ # compiled bytecode object.
8
+ module CFG
9
+ def self.build(compiled)
10
+ # Each label in the compiled bytecode starts a block, as does the first
11
+ # instruction
12
+ all_blocks = { start: 0 }.merge(compiled.labels)
13
+ all_block_addresses = all_blocks.values
14
+
15
+ # We're going to create a potentially larger map of labels, and we'll be
16
+ # maintaining a reverse map as well.
17
+ all_labels = compiled.labels.dup
18
+ all_labels_reverse = all_labels.invert
19
+
20
+ # These are the blocks we're finding - indexed by their start address.
21
+ blocks = {}
22
+
23
+ # Go through each block.
24
+ all_blocks.each do |name, start_n|
25
+ # We're going to collect up the instructions in the block, and the
26
+ # labels it exits to.
27
+ block_insns = []
28
+ block_exits = Set.new
29
+
30
+ insn_n = start_n
31
+
32
+ loop do
33
+ # Does another instruction jump here? If so it's the end of the EBB,
34
+ # as EBBs have only one entry point.
35
+ if insn_n != start_n && all_block_addresses.include?(insn_n)
36
+ # As the EBB ends here - we should jump to the next EBB.
37
+ target = all_labels_reverse[insn_n]
38
+ unless target
39
+ target = :"extra#{insn_n}"
40
+ all_labels[target] = insn_n
41
+ all_labels_reverse[insn_n] = target
42
+ end
43
+ block_insns.push(Bytecode::Insns::Jump.new(target))
44
+ block_exits.add(target)
45
+ break
46
+ end
47
+
48
+ # Examine each instruction.
49
+ insn = compiled.insns[insn_n]
50
+ block_insns.push(insn)
51
+
52
+ # Remember which blocks exit to this target.
53
+ case insn
54
+ when Bytecode::Insns::PushIndex, Bytecode::Insns::PopIndex
55
+ insn_n += 1
56
+ when Bytecode::Insns::GuardBegin, Bytecode::Insns::GuardEnd
57
+ block_exits.add(insn.guarded)
58
+ insn_n += 1
59
+ when Bytecode::Insns::JumpAny, Bytecode::Insns::JumpValuesInvert,
60
+ Bytecode::Insns::JumpRange, Bytecode::Insns::JumpRangeInvert,
61
+ Bytecode::Insns::JumpValue
62
+ block_exits.add(insn.target)
63
+ insn_n += 1
64
+ when Bytecode::Insns::Jump
65
+ block_exits.add(insn.target)
66
+ break
67
+ when Bytecode::Insns::Match, Bytecode::Insns::Fail
68
+ break
69
+ else
70
+ raise
71
+ end
72
+ end
73
+
74
+ blocks[start_n] = ExtendedBasicBlock.new(name, block_insns, block_exits.to_a)
75
+ end
76
+
77
+ # Create a map of jump target labels to the blocks that contain them.
78
+ exit_map = {}
79
+ blocks.each_value do |block|
80
+ block.exits.each do |exit|
81
+ exit_map[exit] ||= blocks[all_labels[exit]]
82
+ end
83
+ end
84
+
85
+ Graph.new(blocks.values, exit_map)
86
+ end
87
+
88
+ def self.to_dot(cfg)
89
+ graph = Graphviz::Graph.new
90
+ cfg.to_dot(graph)
91
+ # Side effect: asks Graphviz to render the graph as SVG at build/cfg.svg.
92
+ Graphviz.output(graph, path: "build/cfg.svg", format: "svg")
93
+ graph.to_dot # return value: whatever Graphviz::Graph#to_dot produces (presumably DOT text)
94
+ end
95
+
96
+ # An Extended Basic Block is a linear sequence of instructions with one
97
+ # entry point and zero or more exit points (stored as exit labels).
98
+ class ExtendedBasicBlock
99
+ attr_reader :name, :insns, :exits
100
+ # name: the block's label; insns: its instructions; exits: labels it may transfer to.
101
+ def initialize(name, insns, exits)
102
+ @name = name
103
+ @insns = insns
104
+ @exits = exits
105
+ end
106
+ # Prints the name, each instruction and each "exit -> target block name" line to io.
107
+ def dump(exit_map, io: $stdout)
108
+ io.puts("#{name}:")
109
+ insns.each { |insn| io.puts(" #{insn}") }
110
+ exits.each { |exit| io.puts(" #{exit} -> #{exit_map[exit].name}") }
111
+ end
112
+ end
113
+
114
+ # A graph is a set of EBBs plus a map from exit label to the EBB it leads to.
115
+ class Graph
116
+ attr_reader :blocks, :exit_map
117
+ # blocks: list of blocks; exit_map: exit label => block that label leads to.
118
+ def initialize(blocks, exit_map)
119
+ @blocks = blocks
120
+ @exit_map = exit_map
121
+ end
122
+ # The entry block: simply the first element of blocks.
123
+ def start
124
+ blocks.first
125
+ end
126
+ # Returns every block's dump output concatenated into one String.
127
+ def dump
128
+ output = StringIO.new
129
+ blocks.each { |block| block.dump(exit_map, io: output) }
130
+ output.string
131
+ end
132
+ # Adds a box node per block to graph, then an edge to each distinct successor.
133
+ def to_dot(graph)
134
+ nodes = {}
135
+ # First pass: one node per block, labelled with its name and instructions.
136
+ blocks.each do |block|
137
+ label = []
138
+
139
+ label.push("#{block.name}:")
140
+ block.insns.each { |insn| label.push(" #{insn}") }
141
+
142
+ nodes[block] = graph.add_node(block.object_id, label: label.join($/), labeljust: "l", shape: "box")
143
+ end
144
+ # Second pass: connect each block to the nodes of the blocks its exits map to.
145
+ blocks.each do |block|
146
+ successors = block.exits.map { |exit| nodes[exit_map[exit]] }.uniq
147
+ successors.each do |successor|
148
+ nodes[block].connect(successor)
149
+ end
150
+ end
151
+ end
152
+ end
153
+ end
154
+ end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RegularExpression
4
+ module Compiler
5
+ module Ruby
6
+ class Compiled # wraps a String of generated Ruby source
7
+ attr_reader :source
8
+ # source: the Ruby source text handed to the constructor.
9
+ def initialize(source)
10
+ @source = source
11
+ end
12
+ # Evaluates source with eval and returns the result; only safe for trusted, generated source.
13
+ def to_proc
14
+ eval(source) # rubocop:disable Security/Eval
15
+ end
16
+ end
17
+
18
+ # Generate Ruby code for a CFG. This looks just like the intepreter, but
19
+ # abstracted in time one level!
20
+ # rubocop:disable Layout/LineLength
21
+ def self.compile(cfg)
22
+ ruby_src = []
23
+ ruby_src.push "-> (string) {"
24
+ ruby_src.push " start_n = 0"
25
+ ruby_src.push " stack = []"
26
+ ruby_src.push " while start_n <= string.size"
27
+ ruby_src.push " string_n = start_n"
28
+ ruby_src.push " block = #{cfg.start.name.inspect}"
29
+ ruby_src.push " loop do"
30
+ ruby_src.push " case block"
31
+
32
+ cfg.blocks.each do |block|
33
+ ruby_src.push " when #{block.name.inspect}"
34
+
35
+ block.insns.each do |insn|
36
+ case insn
37
+ when Bytecode::Insns::PushIndex
38
+ ruby_src.push " stack << string_n"
39
+ when Bytecode::Insns::PopIndex
40
+ ruby_src.push " string_n = stack.pop"
41
+ when Bytecode::Insns::GuardBegin
42
+ ruby_src.push " return false if start_n != 0"
43
+ when Bytecode::Insns::GuardEnd
44
+ ruby_src.push " if string_n == string.size"
45
+ ruby_src.push " block = #{cfg.exit_map[insn.guarded].name.inspect}"
46
+ ruby_src.push " next"
47
+ ruby_src.push " end"
48
+ when Bytecode::Insns::JumpAny
49
+ ruby_src.push " if string_n < string.size"
50
+ ruby_src.push " string_n += 1"
51
+ ruby_src.push " block = #{cfg.exit_map[insn.target].name.inspect}"
52
+ ruby_src.push " next"
53
+ ruby_src.push " end"
54
+ when Bytecode::Insns::JumpValue
55
+ ruby_src.push " if string_n < string.size && string[string_n] == #{insn.char.inspect}"
56
+ ruby_src.push " string_n += 1"
57
+ ruby_src.push " block = #{cfg.exit_map[insn.target].name.inspect}"
58
+ ruby_src.push " next"
59
+ ruby_src.push " end"
60
+ when Bytecode::Insns::JumpValuesInvert
61
+ ruby_src.push " if string_n < string.size && !#{insn.chars.inspect}.include?(string[string_n])"
62
+ ruby_src.push " string_n += 1"
63
+ ruby_src.push " block = #{cfg.exit_map[insn.target].name.inspect}"
64
+ ruby_src.push " next"
65
+ ruby_src.push " end"
66
+ when Bytecode::Insns::JumpRange
67
+ ruby_src.push " if string_n < string.size && string[string_n] >= #{insn.left.inspect} && string[string_n] <= #{insn.right.inspect}"
68
+ ruby_src.push " string_n += 1"
69
+ ruby_src.push " block = #{cfg.exit_map[insn.target].name.inspect}"
70
+ ruby_src.push " next"
71
+ ruby_src.push " end"
72
+ when Bytecode::Insns::JumpRangeInvert
73
+ ruby_src.push " if string_n < string.size && (string[string_n] < #{insn.left.inspect} || string[string_n] > #{insn.right.inspect})"
74
+ ruby_src.push " string_n += 1"
75
+ ruby_src.push " block = #{cfg.exit_map[insn.target].name.inspect}"
76
+ ruby_src.push " next"
77
+ ruby_src.push " end"
78
+ when Bytecode::Insns::Jump
79
+ ruby_src.push " block = #{cfg.exit_map[insn.target].name.inspect}"
80
+ ruby_src.push " next"
81
+ when Bytecode::Insns::Match
82
+ ruby_src.push " return true"
83
+ when Bytecode::Insns::Fail
84
+ ruby_src.push " start_n += 1"
85
+ ruby_src.push " break"
86
+ else
87
+ raise
88
+ end
89
+ end
90
+ end
91
+
92
+ ruby_src.push " end"
93
+ ruby_src.push " end"
94
+ ruby_src.push " end"
95
+ ruby_src.push " false"
96
+ ruby_src.push "}"
97
+ ruby_src.push ""
98
+
99
+ Compiled.new(ruby_src.join($/))
100
+ end
101
+ # rubocop:enable Layout/LineLength
102
+ end
103
+ end
104
+ end
@@ -0,0 +1,281 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RegularExpression
4
+ module Compiler
5
+ module X86
6
+ class Compiled # wraps the buffer that the X86 compiler assembled into
7
+ attr_reader :buffer
8
+ # buffer: the object handed to the constructor (a Fisk JIT buffer when built by X86.compile).
9
+ def initialize(buffer)
10
+ @buffer = buffer
11
+ end
12
+ # Returns a String with one "address: mnemonic operands" line per disassembled instruction.
13
+ def disasm
14
+ output = StringIO.new
15
+
16
+ crabstone = Crabstone::Disassembler.new(Crabstone::ARCH_X86, Crabstone::MODE_64)
17
+ crabstone.disasm(buffer.memory.to_s(buffer.pos), buffer.memory.to_i).each do |insn|
18
+ output.printf(
19
+ "0x%<address>x:\t%<instruction>s\t%<details>s\n",
20
+ address: insn.address,
21
+ instruction: insn.mnemonic,
22
+ details: insn.op_str
23
+ )
24
+ end
25
+
26
+ output.string
27
+ end
28
+ # Returns a lambda that yields the native result, or nil when it equals the length + 1 sentinel.
29
+ def to_proc
30
+ function = buffer.to_function([Fiddle::TYPE_VOIDP, Fiddle::TYPE_SIZE_T], Fiddle::TYPE_SIZE_T)
31
+ # NOTE(review): String#length counts characters, not bytes - confirm behavior for multibyte input.
32
+ lambda do |string|
33
+ value = function.call(string, string.length)
34
+ value if value != string.length + 1
35
+ end
36
+ end
37
+ end
38
+
39
+ # Generate native code for a CFG. This looks just like the Ruby generator
40
+ # but abstracted one level, or just like the interpreter but abstracted
41
+ # two levels! Returns a Compiled wrapping the assembled buffer.
42
+ def self.compile(cfg)
43
+ fisk = Fisk.new
44
+ buffer = Fisk::Helpers.jitbuffer(1024) # NOTE(review): fixed size (presumably bytes) - confirm it fits large patterns
45
+
46
+ fisk.asm(buffer) do
47
+ # Here we're setting up a couple of local variables that point to
48
+ # registers so that it's easier to see what's actually going on
49
+
50
+ # rax is a scratch register that is used for the return value of the
51
+ # function
52
+ return_value = rax
53
+
54
+ # rcx is a scratch register that is used to track the index of the
55
+ # string where we're currently looking
56
+ string_index = rcx
57
+
58
+ # rdx is a scratch register that is used to track the index of the
59
+ # string where we've started the match
60
+ match_index = rdx
61
+
62
+ # rsp is a reserved register that stores a pointer to the stack
63
+ stack_pointer = rsp
64
+
65
+ # rbp is a reserved register that stores a pointer to the base of the
66
+ # stack. It is also known as the frame pointer
67
+ frame_pointer = rbp
68
+
69
+ # rsi is a scratch register that stores the second argument to the
70
+ # function, and in our case stores the length of the string
71
+ string_length = rsi
72
+
73
+ # rdi is a scratch register that stores the first argument to the
74
+ # function, and in our case stores a pointer to the base of the string
75
+ string_pointer = rdi
76
+
77
+ # r8 is a scratch register that we're using to store the last read
78
+ # character value from the string
79
+ character_buffer = r8
80
+
81
+ # First we're going to do some initialization of the frame pointer and
82
+ # stack pointer so we can clear the stack when we're done with this
83
+ # function
84
+ push frame_pointer
85
+ mov frame_pointer, stack_pointer
86
+
87
+ # Now we're going to initialize the counter to 0 so that we attempt to
88
+ # match at each index of the input string
89
+ xor match_index, match_index
90
+
91
+ # This is the start of our loop, where at the beginning of the loop
92
+ # we check if we have already finished looking at each index (in which
93
+ # case we'll jump to a failure condition)
94
+ make_label :start_loop_head
95
+ cmp match_index, string_length
96
+ jg label(:exit)
97
+
98
+ # Set the string_index value to the match_index value so that we begin
99
+ # each loop at the current match index
100
+ mov string_index, match_index
101
+
102
+ cfg.blocks.each do |block|
103
+ # Label the start of each block so that we can jump between them
104
+ make_label block.name
105
+
106
+ block.insns.each do |insn|
107
+ case insn
108
+ when Bytecode::Insns::PushIndex
109
+ push string_index
110
+ when Bytecode::Insns::PopIndex
111
+ pop string_index
112
+ when Bytecode::Insns::GuardBegin # passes only while string_index is 0; otherwise leaves through :exit
113
+ cmp string_index, imm8(0)
114
+ jne label(:exit)
115
+ jmp label(cfg.exit_map[insn.guarded].name)
116
+ when Bytecode::Insns::GuardEnd
117
+ cmp string_index, string_length
118
+ je label(cfg.exit_map[insn.guarded].name)
119
+ when Bytecode::Insns::JumpAny
120
+ no_match_label = :"no_match_#{insn.object_id}"
121
+
122
+ # Ensure we have a character we can read
123
+ cmp string_index, string_length
124
+ je label(no_match_label)
125
+
126
+ # Move the string index forward and jump to the target
127
+ # instruction
128
+ inc string_index
129
+ jmp label(cfg.exit_map[insn.target].name)
130
+
131
+ make_label no_match_label
132
+ when Bytecode::Insns::JumpValue
133
+ no_match_label = :"no_match_#{insn.object_id}"
134
+
135
+ # Ensure we have a character we can read
136
+ cmp string_index, string_length
137
+ je label(no_match_label)
138
+
139
+ # Read the character into the character buffer. NOTE(review): m64 looks like an 8-byte load while the compares use imm8 - confirm
140
+ mov character_buffer, string_pointer
141
+ add character_buffer, string_index
142
+ mov character_buffer, m64(character_buffer)
143
+
144
+ # Compare the character buffer to the instruction's character,
145
+ # continue on to the next instruction if it's not equal
146
+ cmp character_buffer, imm8(insn.char.ord)
147
+ jne label(no_match_label)
148
+
149
+ # Move the string index forward and jump to the target
150
+ # instruction
151
+ inc string_index
152
+ jmp label(cfg.exit_map[insn.target].name)
153
+
154
+ make_label no_match_label
155
+ when Bytecode::Insns::JumpValuesInvert
156
+ no_match_label = :"no_match_#{insn.object_id}"
157
+
158
+ # Ensure we have a character we can read
159
+ cmp string_index, string_length
160
+ je label(no_match_label)
161
+
162
+ # Read the character into the character buffer
163
+ mov character_buffer, string_pointer
164
+ add character_buffer, string_index
165
+ mov character_buffer, m64(character_buffer)
166
+
167
+ # Compare the character buffer to each of the instruction's
168
+ # characters, continue on to the next instruction if any of them
169
+ # are equal
170
+ insn.chars.each do |value|
171
+ cmp character_buffer, imm8(value.ord)
172
+ je label(no_match_label)
173
+ end
174
+
175
+ # Move the string index forward and jump to the target
176
+ # instruction
177
+ inc string_index
178
+ jmp label(cfg.exit_map[insn.target].name)
179
+
180
+ make_label no_match_label
181
+ when Bytecode::Insns::JumpRange
182
+ no_match_label = :"no_match_#{insn.object_id}"
183
+
184
+ # Ensure we have a character we can read
185
+ cmp string_index, string_length
186
+ je label(no_match_label)
187
+
188
+ # Read the character into the character buffer
189
+ mov character_buffer, string_pointer
190
+ add character_buffer, string_index
191
+ mov character_buffer, m64(character_buffer)
192
+
193
+ # Compare the character buffer to the left hand side of the
194
+ # instruction's range, continue on to the next instruction if
195
+ # it's outside the range
196
+ cmp character_buffer, imm8(insn.left.ord)
197
+ jl label(no_match_label)
198
+
199
+ # Compare the character buffer to the right hand side of the
200
+ # instruction's range, continue on to the next instruction if
201
+ # it's outside the range
202
+ cmp character_buffer, imm8(insn.right.ord)
203
+ jg label(no_match_label)
204
+
205
+ # Move the string index forward and jump to the target
206
+ # instruction
207
+ inc string_index
208
+ jmp label(cfg.exit_map[insn.target].name)
209
+
210
+ make_label no_match_label
211
+ when Bytecode::Insns::JumpRangeInvert
212
+ no_match_label = :"no_match_#{insn.object_id}"
213
+ match_label = :"match_#{insn.object_id}"
214
+
215
+ # Ensure we have a character we can read
216
+ cmp string_index, string_length
217
+ je label(no_match_label)
218
+
219
+ # Read the character into the character buffer
220
+ mov character_buffer, string_pointer
221
+ add character_buffer, string_index
222
+ mov character_buffer, m64(character_buffer)
223
+
224
+ # Compare the character buffer to the left hand side of the
225
+ # instruction's range, jump down to the success case if it's
226
+ # outside the range
227
+ cmp character_buffer, imm8(insn.left.ord)
228
+ jl label(match_label)
229
+
230
+ # Compare the character buffer to the right hand side of the
231
+ # instruction's range, continue on to the next instruction if
232
+ # it's inside the range
233
+ cmp character_buffer, imm8(insn.right.ord)
234
+ jle label(no_match_label)
235
+
236
+ # Move the string index forward and jump to the target
237
+ # instruction
238
+ make_label match_label
239
+ inc string_index
240
+ jmp label(cfg.exit_map[insn.target].name)
241
+
242
+ make_label no_match_label
243
+ when Bytecode::Insns::Jump
244
+ jmp label(cfg.exit_map[insn.target].name)
245
+ when Bytecode::Insns::Match
246
+ # If we reach this instruction, then we've successfully matched
247
+ # against the input string, so we're going to return the integer
248
+ # that represents the index at which this match began
249
+ mov return_value, match_index
250
+ mov stack_pointer, frame_pointer
251
+ pop frame_pointer
252
+ ret
253
+ when Bytecode::Insns::Fail
254
+ inc match_index
255
+ jmp label(:start_loop_head)
256
+ else
257
+ raise
258
+ end
259
+ end
260
+ end
261
+
262
+ # We get here when every start index has been tried, or when a GuardBegin
263
+ # check fails. Either way we return the length of the string + 1 so that
264
+ # the caller knows that this match failed
265
+ make_label :exit
266
+ mov return_value, string_length
267
+ inc return_value
268
+
269
+ # Here we make sure to clean up after ourselves by returning the frame
270
+ # pointer to its former position
271
+ mov stack_pointer, frame_pointer
272
+ pop frame_pointer
273
+
274
+ ret
275
+ end
276
+
277
+ Compiled.new(buffer)
278
+ end
279
+ end
280
+ end
281
+ end