regular_expression 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/parse ADDED
@@ -0,0 +1,50 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

# Debugging driver. Takes a pattern as the first argument, pushes it through
# every phase (AST, NFA, bytecode, CFG, Ruby and x86 backends), prints the
# intermediate forms, and checks any remaining arguments against each matcher.

$:.unshift(File.expand_path("../lib", __dir__))
require "regular_expression"
require "crabstone"
require "graphviz"

# Pass the source through the various parsing phases
source = ARGV.shift

# Without a pattern there is nothing to parse, so stop with a usage message
# instead of handing nil to the parser.
abort("usage: #{$PROGRAM_NAME} PATTERN [STRING ...]") if source.nil?

ast = RegularExpression::Parser.new.parse(source)
nfa = ast.to_nfa
bytecode = RegularExpression::Bytecode.compile(nfa)

# Compile the graph into various outputs
cfg = RegularExpression::CFG.build(bytecode)
ruby = RegularExpression::Compiler::Ruby.compile(cfg)
x86 = RegularExpression::Compiler::X86.compile(cfg)

# Make sure we get some nice dot output
RegularExpression::AST.to_dot(ast)
RegularExpression::NFA.to_dot(nfa)
RegularExpression::CFG.to_dot(cfg)

# Dump out the bytecode and cfg to strings
puts "#{bytecode.dump}\n"
puts "#{cfg.dump}\n"

# Every remaining argument is a string to test. With none given, the check is
# a no-op so the calls below can stay unconditional.
check =
  if ARGV.any?
    lambda do |compiled|
      checker = compiled.to_proc
      ARGV.each { |string| puts "#{string}: #{checker.call(string)}" }
      puts
    end
  else
    ->(_compiled) {}
  end

# Test the interpreter against any passed strings
interpreter = RegularExpression::Interpreter.new(bytecode)
check.call(interpreter)

# Dump out the compiled ruby source and match against any passed values
puts "#{ruby.source}\n"
check.call(ruby)

# Dump out the disassembled x86 source and match against any passed values
puts "#{x86.disasm}\n"
check.call(x86)
data/build/.gitignore ADDED
@@ -0,0 +1 @@
1
+ *.svg
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fisk"
4
+ require "fisk/helpers"
5
+ require "set"
6
+ require "stringio"
7
+
8
+ require_relative "./regular_expression/ast"
9
+ require_relative "./regular_expression/bytecode"
10
+ require_relative "./regular_expression/cfg"
11
+ require_relative "./regular_expression/interpreter"
12
+ require_relative "./regular_expression/lexer"
13
+ require_relative "./regular_expression/nfa"
14
+ require_relative "./regular_expression/parser"
15
+ require_relative "./regular_expression/pattern"
16
+ require_relative "./regular_expression/version"
17
+
18
+ require_relative "./regular_expression/compiler/ruby"
19
+ require_relative "./regular_expression/compiler/x86"
@@ -0,0 +1,364 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RegularExpression
4
+ module AST
# Renders the tree rooted at +root+ into a fresh Graphviz graph, writes that
# graph out as an SVG at build/ast.svg, and returns the result of the
# graph's own +to_dot+.
def self.to_dot(root)
  Graphviz::Graph.new.tap do |graph|
    root.to_dot(graph)
    Graphviz.output(graph, path: "build/ast.svg", format: "svg")
  end.to_dot
end
12
+
# Top-level node of the syntax tree. Holds the pattern's expressions and a
# flag saying whether the pattern is anchored at the start.
class Root
  attr_reader :expressions # Array[Expression]
  attr_reader :at_start # bool

  def initialize(expressions, at_start: false)
    @expressions = expressions
    @at_start = at_start
  end

  # Adds a node for this root to the graph and lets every expression attach
  # itself underneath that node.
  def to_dot(graph)
    text = at_start ? "Root (at start)" : "Root"
    root_node = graph.add_node(object_id, label: text)
    expressions.each { |child| child.to_dot(root_node) }
  end

  # Builds the NFA for the whole pattern and returns its start state. Every
  # expression is wired between the same entry state and one shared finish
  # state. When the pattern is anchored, the entry state is a fresh state
  # reached from the start state through a begin-anchor transition.
  def to_nfa
    start = NFA::StartState.new

    entry =
      if at_start
        NFA::State.new.tap do |anchored|
          start.add_transition(NFA::Transition::BeginAnchor.new(anchored))
        end
      else
        start
      end

    finish = NFA::FinishState.new
    expressions.each { |child| child.to_nfa(entry, finish) }

    start
  end
end
47
+
# A sequence of items that must match one after another.
class Expression
  attr_reader :items # Array[Group | Match | Anchor]

  def initialize(items)
    @items = items
  end

  # Adds an "Expression" node under +parent+ and attaches every item to it.
  def to_dot(parent)
    node = parent.add_node(object_id, label: "Expression")

    items.each { |item| item.to_dot(node) }
  end

  # Chains the items between +start+ and +finish+, creating one fresh
  # intermediate state between each pair of neighbouring items.
  #
  # An expression with no items consumes nothing, so it is represented by a
  # single epsilon transition from +start+ to +finish+. Without this case the
  # intermediate-state array would be requested with a negative size and
  # raise ArgumentError.
  def to_nfa(start, finish)
    if items.empty?
      start.add_transition(NFA::Transition::Epsilon.new(finish))
      return items
    end

    inner = Array.new(items.length - 1) { NFA::State.new }
    states = [start, *inner, finish]

    items.each_with_index do |item, index|
      item.to_nfa(states[index], states[index + 1])
    end
  end
end
70
+
# A parenthesised set of expressions together with the quantifier applied to
# the group as a whole.
class Group
  attr_reader :expressions # Array[Expression]
  attr_reader :quantifier # Quantifier

  def initialize(expressions, quantifier: Quantifier::Once.new)
    @expressions = expressions
    @quantifier = quantifier
  end

  # Adds a "Group" node under +parent+, then hangs each expression and
  # finally the quantifier off that node.
  def to_dot(parent)
    group_node = parent.add_node(object_id, label: "Group")

    expressions.each { |child| child.to_dot(group_node) }
    quantifier.to_dot(group_node)
  end

  # Lets the quantifier pick the state pairs, and wires every expression
  # between each pair it yields.
  def to_nfa(start, finish)
    quantifier.quantify(start, finish) do |from, to|
      expressions.each { |child| child.to_nfa(from, to) }
    end
  end
end
93
+
# A single matchable item together with the quantifier applied to it.
class Match
  attr_reader :item # CharacterGroup | CharacterClass | Character | Period
  attr_reader :quantifier # Quantifier

  def initialize(item, quantifier: Quantifier::Once.new)
    @item = item
    @quantifier = quantifier
  end

  # Adds a "Match" node under +parent+ and hangs the item, then the
  # quantifier, off that node.
  def to_dot(parent)
    match_node = parent.add_node(object_id, label: "Match")

    item.to_dot(match_node)
    quantifier.to_dot(match_node)
  end

  # Lets the quantifier pick the state pairs, and wires the item between
  # each pair it yields.
  def to_nfa(start, finish)
    quantifier.quantify(start, finish) { |from, to| item.to_nfa(from, to) }
  end
end
116
+
# A bracketed set of characters and character ranges, optionally inverted.
class CharacterGroup
  attr_reader :items # Array[CharacterRange | Character]
  attr_reader :invert # bool

  def initialize(items, invert: false)
    @items = items
    @invert = invert
  end

  # Adds a node for the group under +parent+ (marked when inverted) and
  # attaches every item to it.
  def to_dot(parent)
    text = invert ? "CharacterGroup (invert)" : "CharacterGroup"
    group_node = parent.add_node(object_id, label: text)
    items.each { |member| member.to_dot(group_node) }
  end

  # A regular group contributes one transition per item. An inverted group
  # collapses to a single inverted transition carrying the sorted list of
  # every value its items would have matched.
  def to_nfa(start, finish)
    return items.each { |member| member.to_nfa(start, finish) } unless invert

    excluded = items.flat_map(&:to_nfa_values).sort
    start.add_transition(NFA::Transition::Invert.new(finish, excluded))
  end
end
145
+
# A shorthand character class such as \w or \d.
class CharacterClass
  attr_reader :value # "\w" | "\W" | "\d" | "\D"

  def initialize(value)
    @value = value
  end

  # Adds a box-shaped node labelled with the class under +parent+.
  def to_dot(parent)
    parent.add_node(object_id, label: value, shape: "box")
  end

  # Adds the transitions from +start+ to +finish+ that implement the class.
  #
  # Raises RuntimeError naming the offending value when the class is not one
  # of the four supported ones.
  def to_nfa(start, finish)
    case value
    when "\\w"
      start.add_transition(NFA::Transition::Range.new(finish, "a", "z"))
      start.add_transition(NFA::Transition::Range.new(finish, "A", "Z"))
      start.add_transition(NFA::Transition::Range.new(finish, "0", "9"))
      start.add_transition(NFA::Transition::Value.new(finish, "_"))
    when "\\W"
      start.add_transition(NFA::Transition::Invert.new(finish, [*("a".."z"), *("A".."Z"), *("0".."9"), "_"]))
    when "\\d"
      start.add_transition(NFA::Transition::Range.new(finish, "0", "9"))
    when "\\D"
      start.add_transition(NFA::Transition::Range.new(finish, "0", "9", invert: true))
    else
      # Still a RuntimeError, as before, but now with a usable message.
      raise "Unknown character class: #{value.inspect}"
    end
  end
end
175
+
# A single literal character.
class Character
  attr_reader :value # String

  def initialize(value)
    @value = value
  end

  # Adds a box-shaped node labelled with the character under +parent+.
  def to_dot(parent)
    parent.add_node(object_id, shape: "box", label: value)
  end

  # The values this item stands for, as a one-element list.
  def to_nfa_values
    Array[value]
  end

  # Adds one value transition from +start+ to +finish+ for the character.
  def to_nfa(start, finish)
    transition = NFA::Transition::Value.new(finish, value)
    start.add_transition(transition)
  end
end
195
+
# The "." item.
class Period
  # Adds a box-shaped node labelled "." under +parent+.
  def to_dot(parent)
    parent.add_node(object_id, shape: "box", label: ".")
  end

  # Adds a single "any" transition from +start+ to +finish+.
  def to_nfa(start, finish)
    start.add_transition(NFA::Transition::Any.new(finish))
  end
end
206
+
# An inclusive range of characters inside a character group, e.g. a-z.
class CharacterRange
  attr_reader :left, :right # String

  def initialize(left, right)
    @left = left
    @right = right
  end

  # Adds a box-shaped node labelled "left-right" under +parent+.
  def to_dot(parent)
    text = "#{left}-#{right}"
    parent.add_node(object_id, label: text, shape: "box")
  end

  # Every value from +left+ through +right+, expanded into a list.
  def to_nfa_values
    [*left..right]
  end

  # Adds a single range transition from +start+ to +finish+.
  def to_nfa(start, finish)
    start.add_transition(NFA::Transition::Range.new(finish, left, right))
  end
end
228
+
# A zero-width anchor.
class Anchor
  attr_reader :value # "\A" | "\z" | "$"

  def initialize(value)
    @value = value
  end

  # Adds a box-shaped node labelled with the anchor under +parent+.
  def to_dot(parent)
    parent.add_node(object_id, label: value, shape: "box")
  end

  # Adds the begin- or end-anchor transition from +start+ to +finish+.
  #
  # Raises RuntimeError for an unrecognised anchor. Previously such a value
  # fell through the case and nil was added to the state as a transition.
  def to_nfa(start, finish)
    transition =
      case value
      when "\\A"
        NFA::Transition::BeginAnchor.new(finish)
      when "\\z", "$"
        NFA::Transition::EndAnchor.new(finish)
      else
        raise "Unknown anchor: #{value.inspect}"
      end

    start.add_transition(transition)
  end
end
252
+
# Quantifiers decide how often the sub-automaton built by the caller's block
# is repeated between +start+ and +finish+. Each one responds to
# +to_dot(parent)+ and to +quantify(start, finish)+, which yields the pairs of
# states the caller has to connect.
module Quantifier
  # Exactly one occurrence (the default when nothing is written).
  class Once
    # Nothing is drawn for the implicit quantifier.
    def to_dot(parent); end

    def quantify(start, finish)
      yield start, finish
    end
  end

  # The "*" quantifier.
  class ZeroOrMore
    def to_dot(parent)
      parent.add_node(object_id, label: "*", shape: "box")
    end

    # The body loops from +start+ back onto +start+, and an epsilon lets the
    # match leave for +finish+.
    # NOTE(review): the loop sits on +start+ itself, so any other transitions
    # leaving +start+ take part in it - confirm this is acceptable when
    # +start+ is shared between alternatives.
    def quantify(start, finish)
      yield start, start
      start.add_transition(NFA::Transition::Epsilon.new(finish))
    end
  end

  # The "+" quantifier.
  class OneOrMore
    def to_dot(parent)
      parent.add_node(object_id, label: "+", shape: "box")
    end

    # One mandatory pass from +start+ to +finish+, plus an epsilon from
    # +finish+ back to +start+ to allow repetition.
    def quantify(start, finish)
      yield start, finish
      finish.add_transition(NFA::Transition::Epsilon.new(start))
    end
  end

  # The "?" quantifier.
  class Optional
    def to_dot(parent)
      parent.add_node(object_id, label: "?", shape: "box")
    end

    # One pass from +start+ to +finish+, plus an epsilon that skips it.
    def quantify(start, finish)
      yield start, finish
      start.add_transition(NFA::Transition::Epsilon.new(finish))
    end
  end

  # The "{n}" quantifier.
  class Exact
    attr_reader :value # Integer

    def initialize(value)
      @value = value
    end

    def to_dot(parent)
      parent.add_node(object_id, label: "{#{value}}", shape: "box")
    end

    # Chains +value+ copies of the body through fresh intermediate states.
    # Zero copies means "match nothing", which is a lone epsilon from +start+
    # to +finish+; without it the two states would not be connected at all.
    def quantify(start, finish)
      if value.zero?
        start.add_transition(NFA::Transition::Epsilon.new(finish))
        return
      end

      states = [start, *(value - 1).times.map { NFA::State.new }, finish]

      value.times do |index|
        yield states[index], states[index + 1]
      end
    end
  end

  # The "{n,}" quantifier.
  class AtLeast
    attr_reader :value # Integer

    def initialize(value)
      @value = value
    end

    def to_dot(parent)
      parent.add_node(object_id, label: "{#{value},}", shape: "box")
    end

    # Chains +value+ mandatory copies of the body, then adds an epsilon from
    # the last state back to the one before it so the final copy can repeat.
    # With a minimum of zero there is no mandatory copy to loop over, so the
    # construction used by ZeroOrMore is applied instead. Previously only a
    # backwards epsilon from +finish+ to +start+ was produced in that case.
    def quantify(start, finish)
      if value.zero?
        yield start, start
        start.add_transition(NFA::Transition::Epsilon.new(finish))
        return
      end

      states = [start, *(value - 1).times.map { NFA::State.new }, finish]

      value.times do |index|
        yield states[index], states[index + 1]
      end

      states[-1].add_transition(NFA::Transition::Epsilon.new(states[-2]))
    end
  end

  # The "{n,m}" quantifier.
  class Range
    attr_reader :lower, :upper # Integer

    def initialize(lower, upper)
      @lower = lower
      @upper = upper
    end

    def to_dot(parent)
      parent.add_node(object_id, label: "{#{lower},#{upper}}", shape: "box")
    end

    # Chains +upper+ copies of the body, then lets every state from the
    # +lower+-th onwards jump straight to the end through an epsilon. An
    # upper bound of zero means "match nothing", which is a lone epsilon from
    # +start+ to +finish+.
    def quantify(start, finish)
      if upper.zero?
        start.add_transition(NFA::Transition::Epsilon.new(finish))
        return
      end

      states = [start, *(upper - 1).times.map { NFA::State.new }, finish]

      upper.times do |index|
        yield states[index], states[index + 1]
      end

      (upper - lower).times do |index|
        transition = NFA::Transition::Epsilon.new(states[-1])
        states[lower + index].add_transition(transition)
      end
    end
  end
end
363
+ end
364
+ end
@@ -0,0 +1,189 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RegularExpression
4
+ # The bytecode module defines instructions, and has a compiled object for
5
+ # storing a stream of them, and a builder object for creating the compiled
6
+ # object.
7
+ module Bytecode
# Lowers an NFA, given by its start state, into a flat instruction stream and
# returns the result of Builder#build.
#
# Never recurse a graph in a compiler! We don't know how deep it is and
# don't want to limit how large a program we can accept due to arbitrary
# stack space. Always use a worklist.
#
# Every state is emitted once, under a label derived from its object id, and
# every transition of a state gets its own indexed label. Each worklist entry
# pairs a state with its "fallback": the instructions emitted after the
# state's transitions when none of them is a begin anchor or an epsilon.
#
# Raises RuntimeError naming the transition class when a transition of an
# unknown kind is encountered.
def self.compile(nfa)
  builder = Builder.new
  label = ->(state, index = 0) { :"state_#{state.object_id}_#{index}" }

  visited = Set.new
  worklist = [[nfa, [Insns::Jump.new(:fail)]]]

  # For each state in the NFA.
  until worklist.empty?
    state, fallback = worklist.pop
    next if visited.include?(state)

    # Label the start of the state.
    builder.mark_label(label[state])
    visited.add(state)

    if state.is_a?(NFA::FinishState)
      builder.push(Insns::Match.new)
      next
    end

    last_index = state.transitions.length - 1

    # Other states have transitions out of them. Go through each
    # transition.
    state.transitions.each_with_index do |transition, index|
      builder.mark_label(label[state, index])

      # True for every transition except the last one, i.e. whenever another
      # alternative follows this one.
      has_alternative = index != last_index

      # Save the string index so a later alternative can restore it.
      builder.push(Insns::PushIndex.new) if has_alternative

      case transition
      when NFA::Transition::BeginAnchor
        builder.push(Insns::GuardBegin.new(label[transition.state]))
      when NFA::Transition::EndAnchor
        builder.push(Insns::GuardEnd.new(label[transition.state]))
      when NFA::Transition::Any
        builder.push(Insns::JumpAny.new(label[transition.state]))
      when NFA::Transition::Value
        builder.push(Insns::JumpValue.new(transition.value, label[transition.state]))
      when NFA::Transition::Invert
        builder.push(Insns::JumpValuesInvert.new(transition.values, label[transition.state]))
      when NFA::Transition::Range
        if transition.invert
          builder.push(Insns::JumpRangeInvert.new(transition.left, transition.right, label[transition.state]))
        else
          builder.push(Insns::JumpRange.new(transition.left, transition.right, label[transition.state]))
        end
      when NFA::Transition::Epsilon
        builder.push(Insns::Jump.new(label[transition.state]))
      else
        raise "Unknown NFA transition: #{transition.class}"
      end

      # The target state falls back to the next alternative of this state
      # when there is one, and otherwise inherits this state's fallback.
      next_fallback =
        if has_alternative
          [Insns::PopIndex.new, Insns::Jump.new(label[state, index + 1])]
        else
          fallback
        end

      worklist.push([transition.state, next_fallback])
    end

    # If we don't have one of the transitions that always executes, then we
    # need to add the fallback to the output for this state.
    if state.transitions.none? { |t| t.is_a?(NFA::Transition::BeginAnchor) || t.is_a?(NFA::Transition::Epsilon) }
      builder.push(*fallback)
    end
  end

  # We always have a failure case - it's just the failure instruction.
  builder.mark_label(:fail)
  builder.push(Insns::Fail.new)
  builder.build
end
86
+
# The instruction set. Instructions without operands are plain classes;
# instructions with operands are structs whose members are the operands.
module Insns
  # Push the current string index onto the stack. This is necessary to
  # support backtracking so that we can pop it off later when we want to go
  # backward.
  class PushIndex; end

  # Pop the string index off the stack. This is necessary so that we can
  # support backtracking.
  class PopIndex; end

  # If we're at the beginning of the string, then jump to the guarded
  # instruction. Otherwise fail the entire match.
  GuardBegin = Struct.new(:guarded)

  # If we're at the end of the string, then jump to the guarded instruction.
  # Otherwise fail the match at the current index.
  GuardEnd = Struct.new(:guarded)

  # If it's possible to read a character off the input, then do so and jump
  # to the target instruction.
  JumpAny = Struct.new(:target)

  # If it's possible to read a character off the input and that character
  # matches the char value, then do so and jump to the target instruction.
  JumpValue = Struct.new(:char, :target)

  # If it's possible to read a character off the input and that character is
  # not contained within the list of values, then do so and jump to the
  # target instruction.
  JumpValuesInvert = Struct.new(:chars, :target)

  # If it's possible to read a character off the input and that character is
  # within the range of possible values, then do so and jump to the target
  # instruction.
  JumpRange = Struct.new(:left, :right, :target)

  # If it's possible to read a character off the input and that character is
  # not within the range of possible values, then do so and jump to the
  # target instruction.
  JumpRangeInvert = Struct.new(:left, :right, :target)

  # Jump directly to the target instruction.
  Jump = Struct.new(:target)

  # Successfully match the string and stop executing instructions.
  class Match; end

  # Fail to match the string at the current index. Increment the starting
  # index and try again if possible.
  class Fail; end
end
138
+
# Collects instructions and label positions while compiling, then hands both
# to a Compiled object.
class Builder
  attr_reader :insns # Array[Insns]
  attr_reader :labels # Hash[Symbol, Integer]

  def initialize
    @labels = {}
    @insns = []
  end

  # Points +label+ at the position the next pushed instruction will occupy.
  def mark_label(label)
    labels.store(label, insns.length)
  end

  # Appends the given instructions, in order, to the stream.
  def push(*new_insns)
    insns.concat(new_insns)
  end

  # Wraps what has been collected so far in a Compiled object.
  def build
    Compiled.new(insns, labels)
  end
end
160
+
# An instruction stream together with the labels that point into it.
class Compiled
  attr_reader :insns, :labels

  def initialize(insns, labels)
    @insns = insns
    @labels = labels
  end

  # Returns a listing of the program as a string: one line per instruction,
  # each preceded by a "name:" line for every label that points at it.
  def dump
    output = StringIO.new

    # Labels store name -> address, but if we want to print the label names
    # at their address, we need address -> names as well. Several labels may
    # share one address, so they are grouped rather than inverted, which
    # would keep only one name per address.
    names_by_address = labels.keys.group_by { |name| labels[name] }

    insns.each_with_index do |insn, address|
      names_by_address.fetch(address, []).each { |name| output.puts("#{name}:") }
      output.puts(" #{insn}")
    end

    output.string
  end
end
188
+ end
189
+ end