regular_expression 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
module RegularExpression
  # An interpreter for our compiled bytecode. Maybe we could make this possible
  # to enter at a given state and deoptimise to it from the compiled code?
  #
  # The interpreter only relies on two things from the bytecode object it is
  # given: +insns+, an indexable list of instructions, and +labels+, a lookup
  # from a label to the index of the instruction that label refers to.
  class Interpreter
    attr_reader :bytecode

    # bytecode - the compiled program to execute; must respond to +insns+ and
    #            +labels+.
    def initialize(bytecode)
      @bytecode = bytecode
    end

    # This is just here for API parity with the compiled outputs.
    #
    # Returns a lambda taking a single string and returning what #match?
    # returns for it.
    def to_proc
      ->(string) { match?(string) }
    end

    # Runs the bytecode against +string+, trying every start offset from 0 up
    # to and including string.size (so that programs which consume nothing
    # can still match at the very end).
    #
    # Returns true as soon as a Match instruction is reached, false otherwise.
    # Raises RuntimeError if an instruction that this interpreter does not
    # know about is encountered (including running off the end of +insns+).
    def match?(string)
      insns = bytecode.insns
      labels = bytecode.labels

      (0..string.size).any? do |start_n|
        # Each attempt gets its own stack so that indices pushed by an attempt
        # that failed cannot leak into the next one.
        stack = []
        string_n = start_n
        insn_n = 0

        loop do
          insn = insns[insn_n]

          case insn
          when Bytecode::Insns::PushIndex
            stack << string_n
            insn_n += 1
          when Bytecode::Insns::PopIndex
            string_n = stack.pop
            insn_n += 1
          when Bytecode::Insns::GuardBegin
            # Offset 0 is always tried first, so reaching this guard from a
            # later offset abandons the whole search rather than just this
            # attempt.
            # NOTE(review): this also skips any instructions that follow the
            # guard; confirm against the bytecode compiler that this is the
            # intended behaviour.
            return false if start_n != 0

            insn_n = labels[insn.guarded]
          when Bytecode::Insns::GuardEnd
            # Only this attempt is abandoned; later start offsets still run.
            break if string_n != string.size

            insn_n = labels[insn.guarded]
          when Bytecode::Insns::JumpAny,
               Bytecode::Insns::JumpValue,
               Bytecode::Insns::JumpValuesInvert,
               Bytecode::Insns::JumpRange,
               Bytecode::Insns::JumpRangeInvert
            # All of these consume exactly one character and jump when their
            # predicate holds, otherwise they fall through to the next
            # instruction without consuming anything.
            if string_n < string.size && consumes?(insn, string[string_n])
              string_n += 1
              insn_n = labels[insn.target]
            else
              insn_n += 1
            end
          when Bytecode::Insns::Jump
            insn_n = labels[insn.target]
          when Bytecode::Insns::Match
            return true
          when Bytecode::Insns::Fail
            break
          else
            # An explicit message is required here: a bare `raise` would
            # re-raise whatever exception the caller happens to be rescuing.
            raise "Unknown bytecode instruction at index #{insn_n}: #{insn.inspect}"
          end
        end
      end
    end

    private

    # Whether the consuming jump +insn+ accepts the single character +char+.
    # The branches are checked in the same order as the original dispatch.
    def consumes?(insn, char)
      case insn
      when Bytecode::Insns::JumpAny
        true
      when Bytecode::Insns::JumpValue
        char == insn.char
      when Bytecode::Insns::JumpValuesInvert
        !insn.chars.include?(char)
      when Bytecode::Insns::JumpRange
        char >= insn.left && char <= insn.right
      when Bytecode::Insns::JumpRangeInvert
        char < insn.left || char > insn.right
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
module RegularExpression
  # Splits the source of a regular expression into a flat list of
  # [type, value] token pairs, terminated by a [false, "end"] pair.
  class Lexer
    # Token types for the single-character tokens recognised by #tokens.
    #
    # Note that "$" is matched by the anchor pattern in #tokens before the
    # single-character pattern is tried, so its entry here is never looked up
    # by this class.
    SINGLE = {
      "^" => :CARET,
      "$" => :ENDING,
      "(" => :LPAREN,
      ")" => :RPAREN,
      "[" => :LBRACKET,
      "]" => :RBRACKET,
      "{" => :LBRACE,
      "}" => :RBRACE,
      "|" => :PIPE,
      "*" => :STAR,
      "+" => :PLUS,
      "?" => :QMARK,
      "." => :PERIOD,
      "-" => :DASH,
      "," => :COMMA
    }.freeze

    # source - the regular expression source string. A copy is stored so that
    #          later mutation of the caller's string does not affect lexing.
    def initialize(source)
      @source = source.dup
    end

    # Tokenizes the source from left to right. At each position the patterns
    # are tried in order: character class escapes (\w \W \d \D), anchors
    # (\A \z $), single-character tokens, runs of digits (converted to an
    # Integer), and finally any other permitted single character.
    #
    # The source itself is left untouched, so this method can be called any
    # number of times and always returns the same tokens.
    #
    # Returns an Array of [type, value] pairs ending with [false, "end"].
    # Raises SyntaxError, with the unconsumed remainder of the source as its
    # message, when no pattern matches at the current position.
    def tokens
      result = []
      remaining = @source

      until remaining.empty?
        case remaining
        when /\A\\[wWdD]/
          result << [:CHAR_CLASS, Regexp.last_match(0)]
        when /\A(?:\\[Az]|\$)/
          result << [:ANCHOR, Regexp.last_match(0)]
        when /\A[\^$()\[\]{}|*+?.\-,]/
          result << [SINGLE[Regexp.last_match(0)], Regexp.last_match(0)]
        when /\A\d+/
          result << [:INTEGER, Regexp.last_match(0).to_i]
        when /\A(?:\u0009|\u000A|\u000D|[\u0020-\uD7FF]|[\uE000-\uFFFD])/
          result << [:CHAR, Regexp.last_match(0)]
        else
          raise SyntaxError, remaining
        end

        # Continue with whatever follows the token that was just matched.
        remaining = Regexp.last_match.post_match
      end

      result << [false, "end"]
    end
  end
end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
module RegularExpression
  # The state machine representation of a pattern: State objects connected by
  # the transition objects defined in NFA::Transition.
  module NFA
    # Renders the state machine reachable from +nfa+ as a graph.
    #
    # As a side effect the graph is also written out as an SVG to
    # "build/nfa.svg" through Graphviz.output.
    #
    # nfa - the state to start rendering from (anything responding to
    #       to_dot(graph, visited)).
    #
    # Returns whatever the graph's own to_dot returns.
    def self.to_dot(nfa)
      graph = Graphviz::Graph.new(rankdir: "LR")
      nfa.to_dot(graph, {})

      Graphviz.output(graph, path: "build/nfa.svg", format: "svg")
      graph.to_dot
    end

    # A node in the state machine, holding its outgoing transitions.
    class State
      attr_reader :transitions # Array[Transition]

      def initialize
        @transitions = []
      end

      # Appends an outgoing transition. Returns the transitions array.
      def add_transition(transition)
        transitions << transition
      end

      # Adds this state, and every state reachable from it, to +graph+.
      #
      # graph   - the graph being built; must respond to add_node.
      # visited - a Hash from State to the node already created for it. It is
      #           filled in before recursing so that cycles terminate.
      #
      # Returns the graph node that represents this state.
      def to_dot(graph, visited)
        return visited[self] if visited.include?(self)

        source = graph.add_node(object_id, label: "")
        visited[self] = source

        transitions.each do |transition|
          target = transition.state.to_dot(graph, visited)
          source.connect(target, label: transition.label)
        end

        source
      end
    end

    # The state the machine begins in; drawn as a box labelled "Start".
    class StartState < State
      def to_dot(graph, visited)
        super(graph, visited).tap do |node|
          node.attributes.merge!(label: "Start", shape: "box")
        end
      end
    end

    # The accepting state; drawn as a box labelled "Finish".
    class FinishState < State
      def to_dot(graph, visited)
        super(graph, visited).tap do |node|
          node.attributes.merge!(label: "Finish", shape: "box")
        end
      end
    end

    # The kinds of edges between states. Every transition exposes +state+
    # (the state it leads to) and +label+ (the text drawn on the edge).
    module Transition
      # Labelled as the beginning-of-string anchor.
      BeginAnchor = Struct.new(:state) do
        def label
          "\\A"
        end
      end

      # Labelled as the end-of-string anchor.
      EndAnchor = Struct.new(:state) do
        def label
          "\\z"
        end
      end

      # Labelled as the "any character" wildcard.
      Any = Struct.new(:state) do
        def label
          "."
        end
      end

      # Carries a single value, shown in its inspected form.
      Value = Struct.new(:state, :value) do
        def label
          value.inspect
        end
      end

      # Carries a list of excluded values, shown as a negated group.
      class Invert
        attr_reader :state # State
        attr_reader :values # Array[String]

        def initialize(state, values)
          @state = state
          @values = values
        end

        def label
          "[^#{values.join}]"
        end
      end

      # Carries the two bounds of a range and whether the range is inverted.
      class Range
        attr_reader :state # State
        attr_reader :left, :right # String
        attr_reader :invert # bool

        def initialize(state, left, right, invert: false)
          @state = state
          @left = left
          @right = right
          @invert = invert
        end

        # Inverted ranges are shown in the same "[^...]" form that Invert
        # uses, so that they can be told apart from plain ranges in the graph.
        def label
          invert ? "[^#{left}-#{right}]" : "#{left}-#{right}"
        end
      end

      # Labelled with the epsilon symbol.
      Epsilon = Struct.new(:state) do
        def label
          "ε"
        end
      end
    end
  end
end
@@ -0,0 +1,399 @@
1
+ #
2
+ # DO NOT MODIFY!!!!
3
+ # This file is automatically generated by Racc 1.5.2
4
+ # from Racc grammar file "".
5
+ #
6
+
7
+ require 'racc/parser.rb'
8
+ module RegularExpression
9
+ class Parser < Racc::Parser
10
+
11
+ module_eval(<<'...end parser.y/module_eval...', 'parser.y', 88)
12
+
13
+ def parse(str)
14
+ @tokens = RegularExpression::Lexer.new(str).tokens
15
+ do_parse
16
+ end
17
+
18
+ def next_token
19
+ @tokens.shift
20
+ end
21
+ ...end parser.y/module_eval...
22
+ ##### State transition tables begin ###
23
+
24
+ racc_action_table = [
25
+ 3, 16, 9, 10, 26, 12, 18, 13, 14, 15,
26
+ 31, 29, 30, 9, 10, 44, 12, 45, 13, 14,
27
+ 15, 9, 10, 33, 12, 34, 13, 14, 15, 9,
28
+ 10, 36, 12, 38, 13, 14, 15, 9, 10, 42,
29
+ 12, 22, 13, 14, 15, 23, 24, 25, 22, 29,
30
+ 30, 43, 23, 24, 25, 29, 30, 40, 41, 46 ]
31
+
32
+ racc_action_check = [
33
+ 0, 1, 0, 0, 12, 0, 5, 0, 0, 0,
34
+ 16, 12, 12, 3, 3, 40, 3, 40, 3, 3,
35
+ 3, 6, 6, 20, 6, 22, 6, 6, 6, 10,
36
+ 10, 27, 10, 30, 10, 10, 10, 18, 18, 35,
37
+ 18, 11, 18, 18, 18, 11, 11, 11, 33, 26,
38
+ 26, 38, 33, 33, 33, 28, 28, 34, 34, 44 ]
39
+
40
+ racc_action_pointer = [
41
+ -2, 1, nil, 9, nil, 3, 17, nil, nil, nil,
42
+ 25, 28, 2, nil, nil, nil, 10, nil, 33, nil,
43
+ 17, nil, 11, nil, nil, nil, 40, 23, 46, nil,
44
+ 21, nil, nil, 35, 42, 31, nil, nil, 41, nil,
45
+ 1, nil, nil, nil, 43, nil, nil ]
46
+
47
+ racc_action_default = [
48
+ -2, -32, -1, -32, -4, -6, -8, -9, -10, -11,
49
+ -32, -15, -32, -18, -19, -20, -32, -3, -32, -7,
50
+ -32, -14, -32, -29, -30, -31, -32, -32, -22, -23,
51
+ -25, 47, -5, -13, -32, -32, -17, -21, -32, -12,
52
+ -32, -28, -16, -24, -32, -27, -26 ]
53
+
54
+ racc_goto_table = [
55
+ 21, 4, 27, 1, 17, 2, 19, nil, nil, nil,
56
+ nil, 20, nil, nil, nil, nil, 35, nil, 37, 32,
57
+ nil, nil, 39 ]
58
+
59
+ racc_goto_check = [
60
+ 8, 3, 10, 1, 3, 2, 4, nil, nil, nil,
61
+ nil, 3, nil, nil, nil, nil, 10, nil, 10, 3,
62
+ nil, nil, 8 ]
63
+
64
+ racc_goto_pointer = [
65
+ nil, 3, 5, 1, 0, nil, nil, nil, -11, nil,
66
+ -10, nil ]
67
+
68
+ racc_goto_default = [
69
+ nil, nil, nil, nil, 5, 6, 7, 8, nil, 11,
70
+ nil, 28 ]
71
+
72
+ racc_reduce_table = [
73
+ 0, 0, :racc_error,
74
+ 1, 21, :_reduce_1,
75
+ 0, 21, :_reduce_2,
76
+ 2, 22, :_reduce_3,
77
+ 1, 22, :_reduce_4,
78
+ 3, 23, :_reduce_5,
79
+ 1, 23, :_reduce_6,
80
+ 2, 24, :_reduce_7,
81
+ 1, 24, :_reduce_8,
82
+ 1, 25, :_reduce_none,
83
+ 1, 25, :_reduce_none,
84
+ 1, 25, :_reduce_11,
85
+ 4, 26, :_reduce_12,
86
+ 3, 26, :_reduce_13,
87
+ 2, 27, :_reduce_14,
88
+ 1, 27, :_reduce_15,
89
+ 4, 29, :_reduce_16,
90
+ 3, 29, :_reduce_17,
91
+ 1, 29, :_reduce_18,
92
+ 1, 29, :_reduce_19,
93
+ 1, 29, :_reduce_20,
94
+ 2, 30, :_reduce_21,
95
+ 1, 30, :_reduce_22,
96
+ 1, 31, :_reduce_none,
97
+ 3, 31, :_reduce_24,
98
+ 1, 31, :_reduce_25,
99
+ 5, 28, :_reduce_26,
100
+ 4, 28, :_reduce_27,
101
+ 3, 28, :_reduce_28,
102
+ 1, 28, :_reduce_29,
103
+ 1, 28, :_reduce_30,
104
+ 1, 28, :_reduce_31 ]
105
+
106
+ racc_reduce_n = 32
107
+
108
+ racc_shift_n = 47
109
+
110
+ racc_token_table = {
111
+ false => 0,
112
+ :error => 1,
113
+ :CARET => 2,
114
+ :PIPE => 3,
115
+ :ANCHOR => 4,
116
+ :LPAREN => 5,
117
+ :RPAREN => 6,
118
+ :LBRACKET => 7,
119
+ :RBRACKET => 8,
120
+ :CHAR_CLASS => 9,
121
+ :CHAR => 10,
122
+ :PERIOD => 11,
123
+ :DASH => 12,
124
+ :LBRACE => 13,
125
+ :INTEGER => 14,
126
+ :COMMA => 15,
127
+ :RBRACE => 16,
128
+ :STAR => 17,
129
+ :PLUS => 18,
130
+ :QMARK => 19 }
131
+
132
+ racc_nt_base = 20
133
+
134
+ racc_use_result_var = true
135
+
136
+ Racc_arg = [
137
+ racc_action_table,
138
+ racc_action_check,
139
+ racc_action_default,
140
+ racc_action_pointer,
141
+ racc_goto_table,
142
+ racc_goto_check,
143
+ racc_goto_default,
144
+ racc_goto_pointer,
145
+ racc_nt_base,
146
+ racc_reduce_table,
147
+ racc_token_table,
148
+ racc_shift_n,
149
+ racc_reduce_n,
150
+ racc_use_result_var ]
151
+
152
+ Racc_token_to_s_table = [
153
+ "$end",
154
+ "error",
155
+ "CARET",
156
+ "PIPE",
157
+ "ANCHOR",
158
+ "LPAREN",
159
+ "RPAREN",
160
+ "LBRACKET",
161
+ "RBRACKET",
162
+ "CHAR_CLASS",
163
+ "CHAR",
164
+ "PERIOD",
165
+ "DASH",
166
+ "LBRACE",
167
+ "INTEGER",
168
+ "COMMA",
169
+ "RBRACE",
170
+ "STAR",
171
+ "PLUS",
172
+ "QMARK",
173
+ "$start",
174
+ "target",
175
+ "root",
176
+ "expression",
177
+ "subexpression",
178
+ "item",
179
+ "group",
180
+ "match",
181
+ "quantifier",
182
+ "match_item",
183
+ "character_group_items",
184
+ "character_group_item" ]
185
+
186
+ Racc_debug_parser = false
187
+
188
+ ##### State transition tables end #####
189
+
190
+ # reduce 0 omitted
191
+
192
+ module_eval(<<'.,.,', 'parser.y', 6)
193
+ def _reduce_1(val, _values, result)
194
+ result = val[0]
195
+ result
196
+ end
197
+ .,.,
198
+
199
+ module_eval(<<'.,.,', 'parser.y', 8)
200
+ def _reduce_2(val, _values, result)
201
+ result = nil
202
+ result
203
+ end
204
+ .,.,
205
+
206
+ module_eval(<<'.,.,', 'parser.y', 12)
207
+ def _reduce_3(val, _values, result)
208
+ result = RegularExpression::AST::Root.new(val[1], at_start: true)
209
+ result
210
+ end
211
+ .,.,
212
+
213
+ module_eval(<<'.,.,', 'parser.y', 14)
214
+ def _reduce_4(val, _values, result)
215
+ result = RegularExpression::AST::Root.new(val[0])
216
+ result
217
+ end
218
+ .,.,
219
+
220
+ module_eval(<<'.,.,', 'parser.y', 18)
221
+ def _reduce_5(val, _values, result)
222
+ result = [RegularExpression::AST::Expression.new(val[0])] + val[2]
223
+ result
224
+ end
225
+ .,.,
226
+
227
+ module_eval(<<'.,.,', 'parser.y', 20)
228
+ def _reduce_6(val, _values, result)
229
+ result = [RegularExpression::AST::Expression.new(val[0])]
230
+ result
231
+ end
232
+ .,.,
233
+
234
+ module_eval(<<'.,.,', 'parser.y', 24)
235
+ def _reduce_7(val, _values, result)
236
+ result = [val[0]] + val[1]
237
+ result
238
+ end
239
+ .,.,
240
+
241
+ module_eval(<<'.,.,', 'parser.y', 26)
242
+ def _reduce_8(val, _values, result)
243
+ result = [val[0]]
244
+ result
245
+ end
246
+ .,.,
247
+
248
+ # reduce 9 omitted
249
+
250
+ # reduce 10 omitted
251
+
252
+ module_eval(<<'.,.,', 'parser.y', 32)
253
+ def _reduce_11(val, _values, result)
254
+ result = RegularExpression::AST::Anchor.new(val[0])
255
+ result
256
+ end
257
+ .,.,
258
+
259
+ module_eval(<<'.,.,', 'parser.y', 36)
260
+ def _reduce_12(val, _values, result)
261
+ result = RegularExpression::AST::Group.new(val[1], quantifier: val[3])
262
+ result
263
+ end
264
+ .,.,
265
+
266
+ module_eval(<<'.,.,', 'parser.y', 38)
267
+ def _reduce_13(val, _values, result)
268
+ result = RegularExpression::AST::Group.new(val[1])
269
+ result
270
+ end
271
+ .,.,
272
+
273
+ module_eval(<<'.,.,', 'parser.y', 42)
274
+ def _reduce_14(val, _values, result)
275
+ result = RegularExpression::AST::Match.new(val[0], quantifier: val[1])
276
+ result
277
+ end
278
+ .,.,
279
+
280
+ module_eval(<<'.,.,', 'parser.y', 44)
281
+ def _reduce_15(val, _values, result)
282
+ result = RegularExpression::AST::Match.new(val[0])
283
+ result
284
+ end
285
+ .,.,
286
+
287
+ module_eval(<<'.,.,', 'parser.y', 48)
288
+ def _reduce_16(val, _values, result)
289
+ result = RegularExpression::AST::CharacterGroup.new(val[2], invert: true)
290
+ result
291
+ end
292
+ .,.,
293
+
294
+ module_eval(<<'.,.,', 'parser.y', 50)
295
+ def _reduce_17(val, _values, result)
296
+ result = RegularExpression::AST::CharacterGroup.new(val[1])
297
+ result
298
+ end
299
+ .,.,
300
+
301
+ module_eval(<<'.,.,', 'parser.y', 52)
302
+ def _reduce_18(val, _values, result)
303
+ result = RegularExpression::AST::CharacterClass.new(val[0])
304
+ result
305
+ end
306
+ .,.,
307
+
308
+ module_eval(<<'.,.,', 'parser.y', 54)
309
+ def _reduce_19(val, _values, result)
310
+ result = RegularExpression::AST::Character.new(val[0])
311
+ result
312
+ end
313
+ .,.,
314
+
315
+ module_eval(<<'.,.,', 'parser.y', 56)
316
+ def _reduce_20(val, _values, result)
317
+ result = RegularExpression::AST::Period.new
318
+ result
319
+ end
320
+ .,.,
321
+
322
+ module_eval(<<'.,.,', 'parser.y', 60)
323
+ def _reduce_21(val, _values, result)
324
+ result = [val[0]] + val[1]
325
+ result
326
+ end
327
+ .,.,
328
+
329
+ module_eval(<<'.,.,', 'parser.y', 62)
330
+ def _reduce_22(val, _values, result)
331
+ result = [val[0]]
332
+ result
333
+ end
334
+ .,.,
335
+
336
+ # reduce 23 omitted
337
+
338
+ module_eval(<<'.,.,', 'parser.y', 67)
339
+ def _reduce_24(val, _values, result)
340
+ result = RegularExpression::AST::CharacterRange.new(val[0], val[2])
341
+ result
342
+ end
343
+ .,.,
344
+
345
+ module_eval(<<'.,.,', 'parser.y', 69)
346
+ def _reduce_25(val, _values, result)
347
+ result = RegularExpression::AST::Character.new(val[0])
348
+ result
349
+ end
350
+ .,.,
351
+
352
+ module_eval(<<'.,.,', 'parser.y', 73)
353
+ def _reduce_26(val, _values, result)
354
+ result = RegularExpression::AST::Quantifier::Range.new(val[1], val[3])
355
+ result
356
+ end
357
+ .,.,
358
+
359
+ module_eval(<<'.,.,', 'parser.y', 75)
360
+ def _reduce_27(val, _values, result)
361
+ result = RegularExpression::AST::Quantifier::AtLeast.new(val[1])
362
+ result
363
+ end
364
+ .,.,
365
+
366
+ module_eval(<<'.,.,', 'parser.y', 77)
367
+ def _reduce_28(val, _values, result)
368
+ result = RegularExpression::AST::Quantifier::Exact.new(val[1])
369
+ result
370
+ end
371
+ .,.,
372
+
373
+ module_eval(<<'.,.,', 'parser.y', 79)
374
+ def _reduce_29(val, _values, result)
375
+ result = RegularExpression::AST::Quantifier::ZeroOrMore.new
376
+ result
377
+ end
378
+ .,.,
379
+
380
+ module_eval(<<'.,.,', 'parser.y', 81)
381
+ def _reduce_30(val, _values, result)
382
+ result = RegularExpression::AST::Quantifier::OneOrMore.new
383
+ result
384
+ end
385
+ .,.,
386
+
387
+ module_eval(<<'.,.,', 'parser.y', 83)
388
+ def _reduce_31(val, _values, result)
389
+ result = RegularExpression::AST::Quantifier::Optional.new
390
+ result
391
+ end
392
+ .,.,
393
+
394
+ def _reduce_none(val, _values, result)
395
+ val[0]
396
+ end
397
+
398
+ end # class Parser
399
+ end # module RegularExpression