regular_expression 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
module RegularExpression
  # An interpreter for our compiled bytecode. Maybe we could make this possible
  # to enter at a given state and deoptimise to it from the compiled code?
  #
  # The interpreter walks +bytecode.insns+ with an instruction pointer and
  # resolves jump targets through +bytecode.labels+. It never backtracks: each
  # instruction either moves to its target or falls through to the next one.
  class Interpreter
    attr_reader :bytecode

    # bytecode - an object responding to +insns+ (indexable by instruction
    #            number) and +labels+ (maps a label to an instruction number).
    def initialize(bytecode)
      @bytecode = bytecode
    end

    # This is just here for API parity with the compiled outputs.
    #
    # The receiver is captured in a local so the lambda keeps calling this
    # interpreter even if it is later evaluated with a different +self+.
    def to_proc
      interpreter = self
      ->(string) { interpreter.match?(string) }
    end

    # Reports whether the bytecode matches anywhere in +string+.
    #
    # Every start offset from 0 up to and including +string.size+ is tried in
    # order; the first attempt that reaches a Match instruction wins.
    #
    # Returns true or false. Raises RuntimeError when the instruction pointer
    # lands on something that is not a known instruction.
    def match?(string)
      insns = bytecode.insns
      labels = bytecode.labels

      (0..string.size).any? do |start_n|
        # A fresh stack per attempt, so indices pushed by an attempt that
        # failed cannot be popped by (or pile up across) later attempts.
        stack = []
        string_n = start_n
        insn_n = 0

        loop do
          insn = insns[insn_n]

          case insn
          when Bytecode::Insns::PushIndex
            stack << string_n
            insn_n += 1
          when Bytecode::Insns::PopIndex
            string_n = stack.pop
            insn_n += 1
          when Bytecode::Insns::GuardBegin
            # This abandons the whole search, not just this attempt: start
            # offsets only grow, so the guard cannot pass later either.
            return false if start_n != 0

            insn_n = labels[insn.guarded]
          when Bytecode::Insns::GuardEnd
            # Only the current attempt is given up here.
            break if string_n != string.size

            insn_n = labels[insn.guarded]
          when Bytecode::Insns::JumpAny,
               Bytecode::Insns::JumpValue,
               Bytecode::Insns::JumpValuesInvert,
               Bytecode::Insns::JumpRange,
               Bytecode::Insns::JumpRangeInvert
            # Consuming jumps: take the jump and eat one character when the
            # current character is accepted, otherwise fall through.
            if string_n < string.size && accepts?(insn, string[string_n])
              string_n += 1
              insn_n = labels[insn.target]
            else
              insn_n += 1
            end
          when Bytecode::Insns::Jump
            insn_n = labels[insn.target]
          when Bytecode::Insns::Match
            return true
          when Bytecode::Insns::Fail
            break
          else
            raise "Unknown instruction at #{insn_n}: #{insn.inspect}"
          end
        end
      end
    end

    private

    # Decides whether a consuming jump instruction accepts +char+. The branch
    # order mirrors the dispatch order used by #match?.
    def accepts?(insn, char)
      case insn
      when Bytecode::Insns::JumpAny
        true
      when Bytecode::Insns::JumpValue
        char == insn.char
      when Bytecode::Insns::JumpValuesInvert
        !insn.chars.include?(char)
      when Bytecode::Insns::JumpRange
        char >= insn.left && char <= insn.right
      when Bytecode::Insns::JumpRangeInvert
        char < insn.left || char > insn.right
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
module RegularExpression
  # Turns the source text of a regular expression into the flat list of
  # [type, value] token pairs that the parser pulls from.
  class Lexer
    # Token type for every single-character operator.
    SINGLE = {
      "^" => :CARET,
      "$" => :ENDING,
      "(" => :LPAREN,
      ")" => :RPAREN,
      "[" => :LBRACKET,
      "]" => :RBRACKET,
      "{" => :LBRACE,
      "}" => :RBRACE,
      "|" => :PIPE,
      "*" => :STAR,
      "+" => :PLUS,
      "?" => :QMARK,
      "." => :PERIOD,
      "-" => :DASH,
      "," => :COMMA
    }.freeze

    # source - the pattern text to tokenise. A private copy is kept so later
    #          changes to the caller's string do not affect this lexer.
    def initialize(source)
      @source = source.dup
    end

    # Tokenises the whole source.
    #
    # Returns an Array of [type, value] pairs terminated by [false, "end"].
    # Scanning uses a local cursor, so the stored source is left intact and
    # calling this method again yields the same tokens.
    #
    # Raises SyntaxError, carrying the unconsumed remainder of the source,
    # when no token pattern matches at the current position.
    def tokens
      result = []
      remaining = @source

      until remaining.empty?
        # Branch order matters: "$" is claimed by the anchor pattern before
        # the single-character operator pattern gets a chance to see it.
        case remaining
        when /\A\\[wWdD]/
          result << [:CHAR_CLASS, Regexp.last_match(0)]
        when /\A(?:\\[Az]|\$)/
          result << [:ANCHOR, Regexp.last_match(0)]
        when /\A[\^$()\[\]{}|*+?.\-,]/
          operator = Regexp.last_match(0)
          result << [SINGLE[operator], operator]
        when /\A\d+/
          result << [:INTEGER, Regexp.last_match(0).to_i]
        # Tab, LF, CR, the Basic Multilingual Plane minus surrogates and
        # U+FFFE/U+FFFF, plus the supplementary planes U+10000..U+10FFFF.
        when /\A(?:\u0009|\u000A|\u000D|[\u0020-\uD7FF]|[\uE000-\uFFFD]|[\u{10000}-\u{10FFFF}])/
          result << [:CHAR, Regexp.last_match(0)]
        else
          raise SyntaxError, remaining
        end

        # Continue with whatever follows the token that was just matched.
        remaining = Regexp.last_match.post_match
      end

      result << [false, "end"]
      result
    end
  end
end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
module RegularExpression
  # States and labelled transitions of a non-deterministic finite automaton,
  # together with helpers for drawing the automaton as a graph.
  module NFA
    # Renders the automaton reachable from +nfa+ and returns its DOT source.
    #
    # As a side effect an SVG rendering is written to +path+, which defaults
    # to the location that used to be hard-coded here.
    def self.to_dot(nfa, path: "build/nfa.svg")
      graph = Graphviz::Graph.new(rankdir: "LR")
      nfa.to_dot(graph, {})

      Graphviz.output(graph, path: path, format: "svg")
      graph.to_dot
    end

    # A node of the automaton holding its outgoing transitions.
    class State
      attr_reader :transitions # Array[Transition]

      def initialize
        @transitions = []
      end

      # Appends an outgoing transition; returns the transitions array.
      def add_transition(transition)
        transitions << transition
      end

      # Adds this state and everything reachable from it to +graph+.
      #
      # +visited+ maps states that already have a node to that node. It is
      # filled in before the transitions are followed, so cycles end here.
      #
      # Returns the graph node standing for this state.
      def to_dot(graph, visited)
        return visited[self] if visited.include?(self)

        source = graph.add_node(object_id, label: "")
        visited[self] = source

        transitions.each do |transition|
          target = transition.state.to_dot(graph, visited)
          source.connect(target, label: transition.label)
        end

        source
      end
    end

    # The entry state; drawn as a box labelled "Start".
    class StartState < State
      def to_dot(graph, visited)
        super(graph, visited).tap do |node|
          node.attributes.merge!(label: "Start", shape: "box")
        end
      end
    end

    # The accepting state; drawn as a box labelled "Finish".
    class FinishState < State
      def to_dot(graph, visited)
        super(graph, visited).tap do |node|
          node.attributes.merge!(label: "Finish", shape: "box")
        end
      end
    end

    # Edge types. Every transition exposes the +state+ it leads to and a
    # +label+ used when the automaton is drawn.
    module Transition
      # Labelled as the \A anchor.
      BeginAnchor = Struct.new(:state) do
        def label
          "\\A"
        end
      end

      # Labelled as the \z anchor.
      EndAnchor = Struct.new(:state) do
        def label
          "\\z"
        end
      end

      # Labelled as the "." wildcard.
      Any = Struct.new(:state) do
        def label
          "."
        end
      end

      # Carries one literal value; the label is its inspected form.
      Value = Struct.new(:state, :value) do
        def label
          value.inspect
        end
      end

      # Carries a list of values and is labelled as a negated set.
      class Invert
        attr_reader :state # State
        attr_reader :values # Array[String]

        def initialize(state, values)
          @state = state
          @values = values
        end

        def label
          "[^#{values.join}]"
        end
      end

      # Carries the two bounds of a range and an +invert+ flag.
      class Range
        attr_reader :state # State
        attr_reader :left, :right # String
        attr_reader :invert # bool

        def initialize(state, left, right, invert: false)
          @state = state
          @left = left
          @right = right
          @invert = invert
        end

        # An inverted range is drawn in the same "[^...]" form as Invert so
        # it can be told apart from a plain range in the rendered graph.
        def label
          invert ? "[^#{left}-#{right}]" : "#{left}-#{right}"
        end
      end

      # A transition labelled "ε".
      Epsilon = Struct.new(:state) do
        def label
          "ε"
        end
      end
    end
  end
end
@@ -0,0 +1,399 @@
1
+ #
2
+ # DO NOT MODIFY!!!!
3
+ # This file is automatically generated by Racc 1.5.2
4
+ # from Racc grammar file "".
5
+ #
6
+
7
+ require 'racc/parser.rb'
8
+ module RegularExpression
9
+ class Parser < Racc::Parser
10
+
11
# Hand-written section carried over from parser.y and evaluated into Parser.
# It defines parse(str), which lexes str with RegularExpression::Lexer, stores
# the tokens in @tokens and then calls do_parse, and next_token, which hands
# back the stored tokens one at a time by shifting them off the front.
+ module_eval(<<'...end parser.y/module_eval...', 'parser.y', 88)
12
+
13
+ def parse(str)
14
+ @tokens = RegularExpression::Lexer.new(str).tokens
15
+ do_parse
16
+ end
17
+
18
+ def next_token
19
+ @tokens.shift
20
+ end
21
+ ...end parser.y/module_eval...
22
+ ##### State transition tables begin ###
23
+
24
+ racc_action_table = [
25
+ 3, 16, 9, 10, 26, 12, 18, 13, 14, 15,
26
+ 31, 29, 30, 9, 10, 44, 12, 45, 13, 14,
27
+ 15, 9, 10, 33, 12, 34, 13, 14, 15, 9,
28
+ 10, 36, 12, 38, 13, 14, 15, 9, 10, 42,
29
+ 12, 22, 13, 14, 15, 23, 24, 25, 22, 29,
30
+ 30, 43, 23, 24, 25, 29, 30, 40, 41, 46 ]
31
+
32
+ racc_action_check = [
33
+ 0, 1, 0, 0, 12, 0, 5, 0, 0, 0,
34
+ 16, 12, 12, 3, 3, 40, 3, 40, 3, 3,
35
+ 3, 6, 6, 20, 6, 22, 6, 6, 6, 10,
36
+ 10, 27, 10, 30, 10, 10, 10, 18, 18, 35,
37
+ 18, 11, 18, 18, 18, 11, 11, 11, 33, 26,
38
+ 26, 38, 33, 33, 33, 28, 28, 34, 34, 44 ]
39
+
40
+ racc_action_pointer = [
41
+ -2, 1, nil, 9, nil, 3, 17, nil, nil, nil,
42
+ 25, 28, 2, nil, nil, nil, 10, nil, 33, nil,
43
+ 17, nil, 11, nil, nil, nil, 40, 23, 46, nil,
44
+ 21, nil, nil, 35, 42, 31, nil, nil, 41, nil,
45
+ 1, nil, nil, nil, 43, nil, nil ]
46
+
47
+ racc_action_default = [
48
+ -2, -32, -1, -32, -4, -6, -8, -9, -10, -11,
49
+ -32, -15, -32, -18, -19, -20, -32, -3, -32, -7,
50
+ -32, -14, -32, -29, -30, -31, -32, -32, -22, -23,
51
+ -25, 47, -5, -13, -32, -32, -17, -21, -32, -12,
52
+ -32, -28, -16, -24, -32, -27, -26 ]
53
+
54
+ racc_goto_table = [
55
+ 21, 4, 27, 1, 17, 2, 19, nil, nil, nil,
56
+ nil, 20, nil, nil, nil, nil, 35, nil, 37, 32,
57
+ nil, nil, 39 ]
58
+
59
+ racc_goto_check = [
60
+ 8, 3, 10, 1, 3, 2, 4, nil, nil, nil,
61
+ nil, 3, nil, nil, nil, nil, 10, nil, 10, 3,
62
+ nil, nil, 8 ]
63
+
64
+ racc_goto_pointer = [
65
+ nil, 3, 5, 1, 0, nil, nil, nil, -11, nil,
66
+ -10, nil ]
67
+
68
+ racc_goto_default = [
69
+ nil, nil, nil, nil, 5, 6, 7, 8, nil, 11,
70
+ nil, 28 ]
71
+
72
+ racc_reduce_table = [
73
+ 0, 0, :racc_error,
74
+ 1, 21, :_reduce_1,
75
+ 0, 21, :_reduce_2,
76
+ 2, 22, :_reduce_3,
77
+ 1, 22, :_reduce_4,
78
+ 3, 23, :_reduce_5,
79
+ 1, 23, :_reduce_6,
80
+ 2, 24, :_reduce_7,
81
+ 1, 24, :_reduce_8,
82
+ 1, 25, :_reduce_none,
83
+ 1, 25, :_reduce_none,
84
+ 1, 25, :_reduce_11,
85
+ 4, 26, :_reduce_12,
86
+ 3, 26, :_reduce_13,
87
+ 2, 27, :_reduce_14,
88
+ 1, 27, :_reduce_15,
89
+ 4, 29, :_reduce_16,
90
+ 3, 29, :_reduce_17,
91
+ 1, 29, :_reduce_18,
92
+ 1, 29, :_reduce_19,
93
+ 1, 29, :_reduce_20,
94
+ 2, 30, :_reduce_21,
95
+ 1, 30, :_reduce_22,
96
+ 1, 31, :_reduce_none,
97
+ 3, 31, :_reduce_24,
98
+ 1, 31, :_reduce_25,
99
+ 5, 28, :_reduce_26,
100
+ 4, 28, :_reduce_27,
101
+ 3, 28, :_reduce_28,
102
+ 1, 28, :_reduce_29,
103
+ 1, 28, :_reduce_30,
104
+ 1, 28, :_reduce_31 ]
105
+
106
+ racc_reduce_n = 32
107
+
108
+ racc_shift_n = 47
109
+
110
+ racc_token_table = {
111
+ false => 0,
112
+ :error => 1,
113
+ :CARET => 2,
114
+ :PIPE => 3,
115
+ :ANCHOR => 4,
116
+ :LPAREN => 5,
117
+ :RPAREN => 6,
118
+ :LBRACKET => 7,
119
+ :RBRACKET => 8,
120
+ :CHAR_CLASS => 9,
121
+ :CHAR => 10,
122
+ :PERIOD => 11,
123
+ :DASH => 12,
124
+ :LBRACE => 13,
125
+ :INTEGER => 14,
126
+ :COMMA => 15,
127
+ :RBRACE => 16,
128
+ :STAR => 17,
129
+ :PLUS => 18,
130
+ :QMARK => 19 }
131
+
132
+ racc_nt_base = 20
133
+
134
+ racc_use_result_var = true
135
+
136
+ Racc_arg = [
137
+ racc_action_table,
138
+ racc_action_check,
139
+ racc_action_default,
140
+ racc_action_pointer,
141
+ racc_goto_table,
142
+ racc_goto_check,
143
+ racc_goto_default,
144
+ racc_goto_pointer,
145
+ racc_nt_base,
146
+ racc_reduce_table,
147
+ racc_token_table,
148
+ racc_shift_n,
149
+ racc_reduce_n,
150
+ racc_use_result_var ]
151
+
152
+ Racc_token_to_s_table = [
153
+ "$end",
154
+ "error",
155
+ "CARET",
156
+ "PIPE",
157
+ "ANCHOR",
158
+ "LPAREN",
159
+ "RPAREN",
160
+ "LBRACKET",
161
+ "RBRACKET",
162
+ "CHAR_CLASS",
163
+ "CHAR",
164
+ "PERIOD",
165
+ "DASH",
166
+ "LBRACE",
167
+ "INTEGER",
168
+ "COMMA",
169
+ "RBRACE",
170
+ "STAR",
171
+ "PLUS",
172
+ "QMARK",
173
+ "$start",
174
+ "target",
175
+ "root",
176
+ "expression",
177
+ "subexpression",
178
+ "item",
179
+ "group",
180
+ "match",
181
+ "quantifier",
182
+ "match_item",
183
+ "character_group_items",
184
+ "character_group_item" ]
185
+
186
+ Racc_debug_parser = false
187
+
188
+ ##### State transition tables end #####
189
+
190
+ # reduce 0 omitted
191
+
192
+ module_eval(<<'.,.,', 'parser.y', 6)
193
+ def _reduce_1(val, _values, result)
194
+ result = val[0]
195
+ result
196
+ end
197
+ .,.,
198
+
199
+ module_eval(<<'.,.,', 'parser.y', 8)
200
+ def _reduce_2(val, _values, result)
201
+ result = nil
202
+ result
203
+ end
204
+ .,.,
205
+
206
+ module_eval(<<'.,.,', 'parser.y', 12)
207
+ def _reduce_3(val, _values, result)
208
+ result = RegularExpression::AST::Root.new(val[1], at_start: true)
209
+ result
210
+ end
211
+ .,.,
212
+
213
+ module_eval(<<'.,.,', 'parser.y', 14)
214
+ def _reduce_4(val, _values, result)
215
+ result = RegularExpression::AST::Root.new(val[0])
216
+ result
217
+ end
218
+ .,.,
219
+
220
+ module_eval(<<'.,.,', 'parser.y', 18)
221
+ def _reduce_5(val, _values, result)
222
+ result = [RegularExpression::AST::Expression.new(val[0])] + val[2]
223
+ result
224
+ end
225
+ .,.,
226
+
227
+ module_eval(<<'.,.,', 'parser.y', 20)
228
+ def _reduce_6(val, _values, result)
229
+ result = [RegularExpression::AST::Expression.new(val[0])]
230
+ result
231
+ end
232
+ .,.,
233
+
234
+ module_eval(<<'.,.,', 'parser.y', 24)
235
+ def _reduce_7(val, _values, result)
236
+ result = [val[0]] + val[1]
237
+ result
238
+ end
239
+ .,.,
240
+
241
+ module_eval(<<'.,.,', 'parser.y', 26)
242
+ def _reduce_8(val, _values, result)
243
+ result = [val[0]]
244
+ result
245
+ end
246
+ .,.,
247
+
248
+ # reduce 9 omitted
249
+
250
+ # reduce 10 omitted
251
+
252
+ module_eval(<<'.,.,', 'parser.y', 32)
253
+ def _reduce_11(val, _values, result)
254
+ result = RegularExpression::AST::Anchor.new(val[0])
255
+ result
256
+ end
257
+ .,.,
258
+
259
+ module_eval(<<'.,.,', 'parser.y', 36)
260
+ def _reduce_12(val, _values, result)
261
+ result = RegularExpression::AST::Group.new(val[1], quantifier: val[3])
262
+ result
263
+ end
264
+ .,.,
265
+
266
+ module_eval(<<'.,.,', 'parser.y', 38)
267
+ def _reduce_13(val, _values, result)
268
+ result = RegularExpression::AST::Group.new(val[1])
269
+ result
270
+ end
271
+ .,.,
272
+
273
+ module_eval(<<'.,.,', 'parser.y', 42)
274
+ def _reduce_14(val, _values, result)
275
+ result = RegularExpression::AST::Match.new(val[0], quantifier: val[1])
276
+ result
277
+ end
278
+ .,.,
279
+
280
+ module_eval(<<'.,.,', 'parser.y', 44)
281
+ def _reduce_15(val, _values, result)
282
+ result = RegularExpression::AST::Match.new(val[0])
283
+ result
284
+ end
285
+ .,.,
286
+
287
+ module_eval(<<'.,.,', 'parser.y', 48)
288
+ def _reduce_16(val, _values, result)
289
+ result = RegularExpression::AST::CharacterGroup.new(val[2], invert: true)
290
+ result
291
+ end
292
+ .,.,
293
+
294
+ module_eval(<<'.,.,', 'parser.y', 50)
295
+ def _reduce_17(val, _values, result)
296
+ result = RegularExpression::AST::CharacterGroup.new(val[1])
297
+ result
298
+ end
299
+ .,.,
300
+
301
+ module_eval(<<'.,.,', 'parser.y', 52)
302
+ def _reduce_18(val, _values, result)
303
+ result = RegularExpression::AST::CharacterClass.new(val[0])
304
+ result
305
+ end
306
+ .,.,
307
+
308
+ module_eval(<<'.,.,', 'parser.y', 54)
309
+ def _reduce_19(val, _values, result)
310
+ result = RegularExpression::AST::Character.new(val[0])
311
+ result
312
+ end
313
+ .,.,
314
+
315
+ module_eval(<<'.,.,', 'parser.y', 56)
316
+ def _reduce_20(val, _values, result)
317
+ result = RegularExpression::AST::Period.new
318
+ result
319
+ end
320
+ .,.,
321
+
322
+ module_eval(<<'.,.,', 'parser.y', 60)
323
+ def _reduce_21(val, _values, result)
324
+ result = [val[0]] + val[1]
325
+ result
326
+ end
327
+ .,.,
328
+
329
+ module_eval(<<'.,.,', 'parser.y', 62)
330
+ def _reduce_22(val, _values, result)
331
+ result = [val[0]]
332
+ result
333
+ end
334
+ .,.,
335
+
336
+ # reduce 23 omitted
337
+
338
+ module_eval(<<'.,.,', 'parser.y', 67)
339
+ def _reduce_24(val, _values, result)
340
+ result = RegularExpression::AST::CharacterRange.new(val[0], val[2])
341
+ result
342
+ end
343
+ .,.,
344
+
345
+ module_eval(<<'.,.,', 'parser.y', 69)
346
+ def _reduce_25(val, _values, result)
347
+ result = RegularExpression::AST::Character.new(val[0])
348
+ result
349
+ end
350
+ .,.,
351
+
352
+ module_eval(<<'.,.,', 'parser.y', 73)
353
+ def _reduce_26(val, _values, result)
354
+ result = RegularExpression::AST::Quantifier::Range.new(val[1], val[3])
355
+ result
356
+ end
357
+ .,.,
358
+
359
+ module_eval(<<'.,.,', 'parser.y', 75)
360
+ def _reduce_27(val, _values, result)
361
+ result = RegularExpression::AST::Quantifier::AtLeast.new(val[1])
362
+ result
363
+ end
364
+ .,.,
365
+
366
+ module_eval(<<'.,.,', 'parser.y', 77)
367
+ def _reduce_28(val, _values, result)
368
+ result = RegularExpression::AST::Quantifier::Exact.new(val[1])
369
+ result
370
+ end
371
+ .,.,
372
+
373
+ module_eval(<<'.,.,', 'parser.y', 79)
374
+ def _reduce_29(val, _values, result)
375
+ result = RegularExpression::AST::Quantifier::ZeroOrMore.new
376
+ result
377
+ end
378
+ .,.,
379
+
380
+ module_eval(<<'.,.,', 'parser.y', 81)
381
+ def _reduce_30(val, _values, result)
382
+ result = RegularExpression::AST::Quantifier::OneOrMore.new
383
+ result
384
+ end
385
+ .,.,
386
+
387
+ module_eval(<<'.,.,', 'parser.y', 83)
388
+ def _reduce_31(val, _values, result)
389
+ result = RegularExpression::AST::Quantifier::Optional.new
390
+ result
391
+ end
392
+ .,.,
393
+
394
# Shared reduction for the rules listed with :_reduce_none in the reduce
# table: returns the first matched value (val[0]) unchanged. The _values and
# result arguments are not used.
+ def _reduce_none(val, _values, result)
395
+ val[0]
396
+ end
397
+
398
+ end # class Parser
399
+ end # module RegularExpression