regular_expression 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/bin/parse ADDED
@@ -0,0 +1,50 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ $:.unshift(File.expand_path("../lib", __dir__))
5
+ require "regular_expression"
6
+ require "crabstone"
7
+ require "graphviz"
8
+
9
+ # Pass the source through the various parsing phases
10
+ source = ARGV.shift
11
+ ast = RegularExpression::Parser.new.parse(source)
12
+ nfa = ast.to_nfa
13
+ bytecode = RegularExpression::Bytecode.compile(nfa)
14
+
15
+ # Compile the graph into various outputs
16
+ cfg = RegularExpression::CFG.build(bytecode)
17
+ ruby = RegularExpression::Compiler::Ruby.compile(cfg)
18
+ x86 = RegularExpression::Compiler::X86.compile(cfg)
19
+
20
+ # Make sure we get some nice dot output
21
+ RegularExpression::AST.to_dot(ast)
22
+ RegularExpression::NFA.to_dot(nfa)
23
+ RegularExpression::CFG.to_dot(cfg)
24
+
25
+ # Dump out the bytecode and cfg to strings
26
+ puts "#{bytecode.dump}\n"
27
+ puts "#{cfg.dump}\n"
28
+
29
+ check =
30
+ if ARGV.any?
31
+ lambda do |compiled|
32
+ checker = compiled.to_proc
33
+ ARGV.each { |string| puts "#{string}: #{checker.call(string)}" }
34
+ puts
35
+ end
36
+ else
37
+ ->(_compiled) {}
38
+ end
39
+
40
+ # Test the interpreter against any passed strings
41
+ interpreter = RegularExpression::Interpreter.new(bytecode)
42
+ check.call(interpreter)
43
+
44
+ # Dump out the compiled ruby source and match against any passed values
45
+ puts "#{ruby.source}\n"
46
+ check.call(ruby)
47
+
48
+ # Dump out the disassembled x86 source and match against any passed values
49
+ puts "#{x86.disasm}\n"
50
+ check.call(x86)
data/build/.gitignore ADDED
@@ -0,0 +1 @@
1
+ *.svg
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fisk"
4
+ require "fisk/helpers"
5
+ require "set"
6
+ require "stringio"
7
+
8
+ require_relative "./regular_expression/ast"
9
+ require_relative "./regular_expression/bytecode"
10
+ require_relative "./regular_expression/cfg"
11
+ require_relative "./regular_expression/interpreter"
12
+ require_relative "./regular_expression/lexer"
13
+ require_relative "./regular_expression/nfa"
14
+ require_relative "./regular_expression/parser"
15
+ require_relative "./regular_expression/pattern"
16
+ require_relative "./regular_expression/version"
17
+
18
+ require_relative "./regular_expression/compiler/ruby"
19
+ require_relative "./regular_expression/compiler/x86"
@@ -0,0 +1,364 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RegularExpression
4
+ module AST
5
+ def self.to_dot(root)
6
+ graph = Graphviz::Graph.new
7
+ root.to_dot(graph)
8
+
9
+ Graphviz.output(graph, path: "build/ast.svg", format: "svg")
10
+ graph.to_dot
11
+ end
12
+
13
+ class Root
14
+ attr_reader :expressions # Array[Expression]
15
+ attr_reader :at_start # bool
16
+
17
+ def initialize(expressions, at_start: false)
18
+ @expressions = expressions
19
+ @at_start = at_start
20
+ end
21
+
22
+ def to_dot(graph)
23
+ label = "Root"
24
+ label = "#{label} (at start)" if at_start
25
+
26
+ node = graph.add_node(object_id, label: label)
27
+ expressions.each { |expression| expression.to_dot(node) }
28
+ end
29
+
30
+ def to_nfa
31
+ start = NFA::StartState.new
32
+ current = start
33
+
34
+ if at_start
35
+ current = NFA::State.new
36
+ start.add_transition(NFA::Transition::BeginAnchor.new(current))
37
+ end
38
+
39
+ finish = NFA::FinishState.new
40
+ expressions.each do |expression|
41
+ expression.to_nfa(current, finish)
42
+ end
43
+
44
+ start
45
+ end
46
+ end
47
+
48
# A sequence of items that are matched one after another.
class Expression
  attr_reader :items # Group | Match | Anchor

  def initialize(items)
    @items = items
  end

  # Adds a node labelled "Expression" under parent and renders every item
  # beneath that node.
  def to_dot(parent)
    node = parent.add_node(object_id, label: "Expression")

    items.each { |item| item.to_dot(node) }
  end

  # Wires the items in order between start and finish. A fresh NFA::State is
  # created for each boundary between two consecutive items, and item i is
  # built between state i and state i + 1.
  def to_nfa(start, finish)
    # An empty expression consumes nothing, so start reaches finish directly.
    # Without this guard Array.new would be called with a negative size and
    # raise ArgumentError.
    if items.empty?
      start.add_transition(NFA::Transition::Epsilon.new(finish))
      return
    end

    inner = Array.new(items.length - 1) { NFA::State.new }
    states = [start, *inner, finish]

    items.each_with_index do |item, index|
      item.to_nfa(states[index], states[index + 1])
    end
  end
end
70
+
71
+ class Group
72
+ attr_reader :expressions # Array[Expression]
73
+ attr_reader :quantifier # Quantifier
74
+
75
+ def initialize(expressions, quantifier: Quantifier::Once.new)
76
+ @expressions = expressions
77
+ @quantifier = quantifier
78
+ end
79
+
80
+ def to_dot(parent)
81
+ node = parent.add_node(object_id, label: "Group")
82
+
83
+ expressions.each { |expression| expression.to_dot(node) }
84
+ quantifier.to_dot(node)
85
+ end
86
+
87
+ def to_nfa(start, finish)
88
+ quantifier.quantify(start, finish) do |qstart, qfinish|
89
+ expressions.each { |expression| expression.to_nfa(qstart, qfinish) }
90
+ end
91
+ end
92
+ end
93
+
94
+ class Match
95
+ attr_reader :item # CharacterGroup | CharacterClass | Character | Period
96
+ attr_reader :quantifier # Quantifier
97
+
98
+ def initialize(item, quantifier: Quantifier::Once.new)
99
+ @item = item
100
+ @quantifier = quantifier
101
+ end
102
+
103
+ def to_dot(parent)
104
+ node = parent.add_node(object_id, label: "Match")
105
+
106
+ item.to_dot(node)
107
+ quantifier.to_dot(node)
108
+ end
109
+
110
+ def to_nfa(start, finish)
111
+ quantifier.quantify(start, finish) do |qstart, qfinish|
112
+ item.to_nfa(qstart, qfinish)
113
+ end
114
+ end
115
+ end
116
+
117
+ class CharacterGroup
118
+ attr_reader :items # Array[CharacterRange | Character]
119
+ attr_reader :invert # bool
120
+
121
+ def initialize(items, invert: false)
122
+ @items = items
123
+ @invert = invert
124
+ end
125
+
126
+ def to_dot(parent)
127
+ label = "CharacterGroup"
128
+ label = "#{label} (invert)" if invert
129
+
130
+ node = parent.add_node(object_id, label: label)
131
+ items.each { |item| item.to_dot(node) }
132
+ end
133
+
134
+ def to_nfa(start, finish)
135
+ if invert
136
+ transition = NFA::Transition::Invert.new(finish, items.flat_map(&:to_nfa_values).sort)
137
+ start.add_transition(transition)
138
+ else
139
+ items.each do |item|
140
+ item.to_nfa(start, finish)
141
+ end
142
+ end
143
+ end
144
+ end
145
+
146
# One of the escape-sequence character classes: \w, \W, \d or \D.
class CharacterClass
  attr_reader :value # "\w" | "\W" | "\d" | "\D"

  def initialize(value)
    @value = value
  end

  # Adds a box-shaped node under parent labelled with the escape sequence.
  def to_dot(parent)
    parent.add_node(object_id, label: value, shape: "box")
  end

  # Adds the transitions from start to finish that accept one character of
  # this class.
  #
  # Raises RuntimeError when value is not one of the four supported classes.
  def to_nfa(start, finish)
    case value
    when "\\w"
      # Word characters: letters, digits and underscore.
      start.add_transition(NFA::Transition::Range.new(finish, "a", "z"))
      start.add_transition(NFA::Transition::Range.new(finish, "A", "Z"))
      start.add_transition(NFA::Transition::Range.new(finish, "0", "9"))
      start.add_transition(NFA::Transition::Value.new(finish, "_"))
    when "\\W"
      # Anything that is not one of the word characters listed above.
      start.add_transition(NFA::Transition::Invert.new(finish, [*("a".."z"), *("A".."Z"), *("0".."9"), "_"]))
    when "\\d"
      start.add_transition(NFA::Transition::Range.new(finish, "0", "9"))
    when "\\D"
      start.add_transition(NFA::Transition::Range.new(finish, "0", "9", invert: true))
    else
      # Same exception class as the previous bare `raise`, but the message now
      # names the offending value.
      raise "Unsupported character class: #{value.inspect}"
    end
  end
end
175
+
176
+ class Character
177
+ attr_reader :value # String
178
+
179
+ def initialize(value)
180
+ @value = value
181
+ end
182
+
183
+ def to_dot(parent)
184
+ parent.add_node(object_id, label: value, shape: "box")
185
+ end
186
+
187
+ def to_nfa_values
188
+ [value]
189
+ end
190
+
191
+ def to_nfa(start, finish)
192
+ start.add_transition(NFA::Transition::Value.new(finish, value))
193
+ end
194
+ end
195
+
196
+ class Period
197
+ def to_dot(parent)
198
+ parent.add_node(object_id, label: ".", shape: "box")
199
+ end
200
+
201
+ def to_nfa(start, finish)
202
+ transition = NFA::Transition::Any.new(finish)
203
+ start.add_transition(transition)
204
+ end
205
+ end
206
+
207
+ class CharacterRange
208
+ attr_reader :left, :right # String
209
+
210
+ def initialize(left, right)
211
+ @left = left
212
+ @right = right
213
+ end
214
+
215
+ def to_dot(parent)
216
+ parent.add_node(object_id, label: "#{left}-#{right}", shape: "box")
217
+ end
218
+
219
+ def to_nfa_values
220
+ (left..right).to_a
221
+ end
222
+
223
+ def to_nfa(start, finish)
224
+ transition = NFA::Transition::Range.new(finish, left, right)
225
+ start.add_transition(transition)
226
+ end
227
+ end
228
+
229
# A zero-width position assertion: start of input (\A) or end of input
# (\z or $).
class Anchor
  attr_reader :value # "\A" | "\z" | "$"

  def initialize(value)
    @value = value
  end

  # Adds a box-shaped node under parent labelled with the anchor text.
  def to_dot(parent)
    parent.add_node(object_id, label: value, shape: "box")
  end

  # Adds a single anchor transition from start to finish.
  #
  # Raises RuntimeError when value is not a supported anchor. Previously an
  # unknown value fell through the case and nil was added as a transition.
  def to_nfa(start, finish)
    transition =
      case value
      when "\\A"
        NFA::Transition::BeginAnchor.new(finish)
      when "\\z", "$"
        NFA::Transition::EndAnchor.new(finish)
      else
        raise "Unsupported anchor: #{value.inspect}"
      end

    start.add_transition(transition)
  end
end
252
+
253
# Quantifiers decide how many copies of a sub-pattern are wired between two
# NFA states. Every quantifier responds to:
#
#   to_dot(parent)          - draw the quantifier under the parent node
#   quantify(start, finish) - yield (from, to) state pairs; the caller's
#                             block builds one copy of the sub-pattern
#                             between each yielded pair
module Quantifier
  # No quantifier written: the sub-pattern appears exactly once.
  class Once
    # Intentionally draws nothing.
    def to_dot(parent); end

    def quantify(start, finish)
      yield start, finish
    end
  end

  # The * quantifier.
  class ZeroOrMore
    def to_dot(parent)
      parent.add_node(object_id, label: "*", shape: "box")
    end

    # The sub-pattern loops from start back onto start, and an epsilon lets
    # start move on to finish.
    def quantify(start, finish)
      yield start, start
      start.add_transition(NFA::Transition::Epsilon.new(finish))
    end
  end

  # The + quantifier.
  class OneOrMore
    def to_dot(parent)
      parent.add_node(object_id, label: "+", shape: "box")
    end

    # One copy from start to finish, plus an epsilon from finish back to
    # start so the copy can repeat.
    # NOTE(review): the back edge is added to the caller's finish state; if
    # that state is shared with other alternatives they can loop too - confirm
    # this is intended.
    def quantify(start, finish)
      yield start, finish
      finish.add_transition(NFA::Transition::Epsilon.new(start))
    end
  end

  # The ? quantifier.
  class Optional
    def to_dot(parent)
      parent.add_node(object_id, label: "?", shape: "box")
    end

    # One copy from start to finish, plus an epsilon that skips it.
    def quantify(start, finish)
      yield start, finish
      start.add_transition(NFA::Transition::Epsilon.new(finish))
    end
  end

  # The {n} quantifier.
  class Exact
    attr_reader :value # Integer

    def initialize(value)
      @value = value
    end

    def to_dot(parent)
      parent.add_node(object_id, label: "{#{value}}", shape: "box")
    end

    # Chains value copies of the sub-pattern through value - 1 fresh states.
    def quantify(start, finish)
      # {0} matches the empty string. No copies are yielded, so start must be
      # linked to finish explicitly or finish would be unreachable.
      if value.zero?
        start.add_transition(NFA::Transition::Epsilon.new(finish))
        return
      end

      states = [start, *Array.new(value - 1) { NFA::State.new }, finish]

      value.times do |index|
        yield states[index], states[index + 1]
      end
    end
  end

  # The {n,} quantifier.
  class AtLeast
    attr_reader :value # Integer

    def initialize(value)
      @value = value
    end

    def to_dot(parent)
      parent.add_node(object_id, label: "{#{value},}", shape: "box")
    end

    # Chains value copies of the sub-pattern, then lets the last copy repeat
    # through an epsilon from the final state back to the one before it.
    def quantify(start, finish)
      # {0,} means "any number of times", so it is built exactly like
      # ZeroOrMore. The general construction below would only add a
      # finish -> start edge and leave finish unreachable.
      if value.zero?
        yield start, start
        start.add_transition(NFA::Transition::Epsilon.new(finish))
        return
      end

      states = [start, *Array.new(value - 1) { NFA::State.new }, finish]

      value.times do |index|
        yield states[index], states[index + 1]
      end

      states[-1].add_transition(NFA::Transition::Epsilon.new(states[-2]))
    end
  end

  # The {n,m} quantifier.
  class Range
    attr_reader :lower, :upper # Integer

    def initialize(lower, upper)
      @lower = lower
      @upper = upper
    end

    def to_dot(parent)
      parent.add_node(object_id, label: "{#{lower},#{upper}}", shape: "box")
    end

    # Chains upper copies of the sub-pattern, then adds an epsilon to the
    # final state from every state reached after lower..(upper - 1) copies,
    # which makes the remaining copies optional.
    def quantify(start, finish)
      # With an upper bound of zero no copies are yielded and the skip loop
      # below runs zero times, so link start to finish directly.
      if upper.zero?
        start.add_transition(NFA::Transition::Epsilon.new(finish))
        return
      end

      states = [start, *Array.new(upper - 1) { NFA::State.new }, finish]

      upper.times do |index|
        yield states[index], states[index + 1]
      end

      (upper - lower).times do |index|
        transition = NFA::Transition::Epsilon.new(states[-1])
        states[lower + index].add_transition(transition)
      end
    end
  end
end
363
+ end
364
+ end
@@ -0,0 +1,189 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RegularExpression
4
+ # The bytecode module defines instructions, and has a compiled object for
5
+ # storing a stream of them, and a builder object for creating the compiled
6
+ # object.
7
+ module Bytecode
8
# Never recurse a graph in a compiler! We don't know how deep it is and
# don't want to limit how large a program we can accept due to arbitrary
# stack space. Always use a worklist.
#
# Compiles every state reachable from the given NFA start state into a flat
# instruction stream with symbolic labels.
#
# nfa - the start state. Every non-finish state must respond to #transitions
#       and every transition to #state.
#
# Returns the result of Builder#build.
# Raises RuntimeError when a transition is of an unrecognised class.
def self.compile(nfa)
  builder = Builder.new
  label = ->(state, index = 0) { :"state_#{state.object_id}_#{index}" }

  visited = Set.new

  # Each worklist entry pairs a state with its fallback: the instructions to
  # emit after the state's transitions when none of them is unconditional.
  worklist = [[nfa, [Insns::Jump.new(:fail)]]]

  # For each state in the NFA.
  until worklist.empty?
    state, fallback = worklist.pop
    next if visited.include?(state)

    # Label the start of the state.
    builder.mark_label(label[state])
    visited.add(state)

    if state.is_a?(NFA::FinishState)
      builder.push(Insns::Match.new)
      next
    end

    last_index = state.transitions.length - 1

    # Other states have transitions out of them. Go through each
    # transition.
    state.transitions.each_with_index do |transition, index|
      builder.mark_label(label[state, index])

      # Every transition but the last has a later sibling to retry, so the
      # string index is saved before attempting it.
      has_alternative = index < last_index
      builder.push(Insns::PushIndex.new) if has_alternative

      # Invert is tested before Range, as in the original ordering.
      case transition
      when NFA::Transition::BeginAnchor
        builder.push(Insns::GuardBegin.new(label[transition.state]))
      when NFA::Transition::EndAnchor
        builder.push(Insns::GuardEnd.new(label[transition.state]))
      when NFA::Transition::Any
        builder.push(Insns::JumpAny.new(label[transition.state]))
      when NFA::Transition::Value
        builder.push(Insns::JumpValue.new(transition.value, label[transition.state]))
      when NFA::Transition::Invert
        builder.push(Insns::JumpValuesInvert.new(transition.values, label[transition.state]))
      when NFA::Transition::Range
        if transition.invert
          builder.push(Insns::JumpRangeInvert.new(transition.left, transition.right, label[transition.state]))
        else
          builder.push(Insns::JumpRange.new(transition.left, transition.right, label[transition.state]))
        end
      when NFA::Transition::Epsilon
        builder.push(Insns::Jump.new(label[transition.state]))
      else
        raise "Unsupported NFA transition: #{transition.class}"
      end

      # A state reached through a transition with a later sibling falls back
      # by restoring the saved index and trying that sibling; otherwise it
      # inherits this state's fallback.
      next_fallback =
        if has_alternative
          [Insns::PopIndex.new, Insns::Jump.new(label[state, index + 1])]
        else
          fallback
        end

      worklist.push([transition.state, next_fallback])
    end

    # If we don't have one of the transitions that always executes, then we
    # need to add the fallback to the output for this state.
    if state.transitions.none? { |t| t.is_a?(NFA::Transition::BeginAnchor) || t.is_a?(NFA::Transition::Epsilon) }
      builder.push(*fallback)
    end
  end

  # We always have a failure case - it's just the failure instruction.
  builder.mark_label(:fail)
  builder.push(Insns::Fail.new)
  builder.build
end
86
+
87
+ module Insns
88
+ # Push the current string index onto the stack. This is necessary to
89
+ # support backtracking so that we can pop it off later when we want to go
90
+ # backward.
91
+ PushIndex = Class.new
92
+
93
+ # Pop the string index off the stack. This is necessary so that we can
94
+ # support backtracking.
95
+ PopIndex = Class.new
96
+
97
+ # If we're at the beginning of the string, then jump to the guarded
98
+ # instruction. Otherwise fail the entire match.
99
+ GuardBegin = Struct.new(:guarded)
100
+
101
+ # If we're at the end of the string, then jump to the guarded instruction.
102
+ # Otherwise fail the match at the current index.
103
+ GuardEnd = Struct.new(:guarded)
104
+
105
+ # If it's possible to read a character off the input, then do so and jump
106
+ # to the target instruction.
107
+ JumpAny = Struct.new(:target)
108
+
109
+ # If it's possible to read a character off the input and that character
110
+ # matches the char value, then do so and jump to the target instruction.
111
+ JumpValue = Struct.new(:char, :target)
112
+
113
+ # If it's possible to read a character off the input and that character is
114
+ # not contained within the list of values, then do so and jump to the
115
+ # target instruction.
116
+ JumpValuesInvert = Struct.new(:chars, :target)
117
+
118
+ # If it's possible to read a character off the input and that character is
119
+ # within the range of possible values, then do so and jump to the target
120
+ # instruction.
121
+ JumpRange = Struct.new(:left, :right, :target)
122
+
123
+ # If it's possible to read a character off the input and that character is
124
+ # not within the range of possible values, then do so and jump to the
125
+ # target instruction.
126
+ JumpRangeInvert = Struct.new(:left, :right, :target)
127
+
128
+ # Jump directly to the target instruction.
129
+ Jump = Struct.new(:target)
130
+
131
+ # Successfully match the string and stop executing instructions.
132
+ Match = Class.new
133
+
134
+ # Fail to match the string at the current index. Increment the starting
135
+ # index and try again if possible.
136
+ Fail = Class.new
137
+ end
138
+
139
+ class Builder
140
+ attr_reader :insns # Array[Insns]
141
+ attr_reader :labels # Hash[Symbol, Integer]
142
+
143
+ def initialize
144
+ @insns = []
145
+ @labels = {}
146
+ end
147
+
148
+ def mark_label(label)
149
+ labels[label] = insns.size
150
+ end
151
+
152
+ def push(*new_insns)
153
+ insns.push(*new_insns)
154
+ end
155
+
156
+ def build
157
+ Compiled.new(insns, labels)
158
+ end
159
+ end
160
+
161
# An instruction list together with the labels that name positions in it.
class Compiled
  attr_reader :insns, :labels

  # insns  - Array of instructions
  # labels - Hash mapping a label name to the index of the instruction it
  #          marks
  def initialize(insns, labels)
    @insns = insns
    @labels = labels
  end

  # Renders the program as text: each instruction on its own line, preceded
  # by a "name:" line for every label that points at it.
  #
  # Returns a String.
  def dump
    output = StringIO.new

    # Labels store name -> address, but if we want to print the label names
    # at their address, we need address -> names as well. Grouping (rather
    # than inverting) keeps every name when several labels share an address.
    names_by_address = labels.keys.group_by { |name| labels[name] }

    insns.each_with_index do |insn, address|
      names_by_address.fetch(address, []).each { |name| output.puts("#{name}:") }
      output.puts(" #{insn}")
    end

    output.string
  end
end
188
+ end
189
+ end