srl_ruby 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +4 -0
  3. data/.rubocop.yml +3 -0
  4. data/.yardopts +6 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +66 -0
  8. data/Rakefile +16 -0
  9. data/bin/srl_ruby +58 -0
  10. data/lib/regex/abstract_method.rb +35 -0
  11. data/lib/regex/alternation.rb +27 -0
  12. data/lib/regex/anchor.rb +45 -0
  13. data/lib/regex/atomic_expression.rb +16 -0
  14. data/lib/regex/capturing_group.rb +51 -0
  15. data/lib/regex/char_class.rb +38 -0
  16. data/lib/regex/char_range.rb +51 -0
  17. data/lib/regex/char_shorthand.rb +50 -0
  18. data/lib/regex/character.rb +204 -0
  19. data/lib/regex/compound_expression.rb +57 -0
  20. data/lib/regex/concatenation.rb +29 -0
  21. data/lib/regex/expression.rb +60 -0
  22. data/lib/regex/lookaround.rb +50 -0
  23. data/lib/regex/match_option.rb +34 -0
  24. data/lib/regex/monadic_expression.rb +28 -0
  25. data/lib/regex/multiplicity.rb +91 -0
  26. data/lib/regex/non_capturing_group.rb +27 -0
  27. data/lib/regex/polyadic_expression.rb +60 -0
  28. data/lib/regex/quantifiable.rb +22 -0
  29. data/lib/regex/repetition.rb +29 -0
  30. data/lib/regex/wildcard.rb +23 -0
  31. data/lib/srl_ruby/ast_builder.rb +384 -0
  32. data/lib/srl_ruby/grammar.rb +106 -0
  33. data/lib/srl_ruby/regex_repr.rb +13 -0
  34. data/lib/srl_ruby/tokenizer.rb +147 -0
  35. data/lib/srl_ruby/version.rb +3 -0
  36. data/lib/srl_ruby.rb +4 -0
  37. data/spec/integration_spec.rb +451 -0
  38. data/spec/regex/character_spec.rb +166 -0
  39. data/spec/regex/multiplicity_spec.rb +79 -0
  40. data/spec/spec_helper.rb +16 -0
  41. data/spec/srl_ruby/srl_ruby_spec.rb +7 -0
  42. data/spec/srl_ruby/tokenizer_spec.rb +147 -0
  43. data/srl_ruby.gemspec +58 -0
  44. metadata +150 -0
@@ -0,0 +1,23 @@
# File: wildcard.rb

require_relative 'atomic_expression' # Access the superclass

module Regex # This module is used as a namespace
  # A wildcard matches any character (except for the newline).
  class Wildcard < AtomicExpression
    # Constructor. A wildcard carries no state of its own,
    # so it simply delegates to the superclass.
    def initialize
      super
    end

    protected

    # Conversion method re-definition.
    # Purpose: Return the String representation of the expression.
    def text_repr
      '.'
    end
  end # class
end # module

# End of file
@@ -0,0 +1,384 @@
require 'stringio'
require_relative 'regex_repr'

module SrlRuby
  # The purpose of an ASTBuilder is to build piece by piece an AST
  # (Abstract Syntax Tree) from a sequence of input tokens and
  # visit events produced by walking over a GFGParsing object.
  # Uses the Builder GoF pattern.
  # The Builder pattern creates a complex object
  # (say, a parse tree) from simpler objects (terminal and non-terminal
  # nodes) and using a step by step approach.
  class ASTBuilder < Rley::ParseRep::ASTBaseBuilder
    # No specialized node class per terminal symbol: every terminal is
    # turned into a plain TerminalNode (see new_leaf_node below).
    Terminal2NodeClass = {}.freeze

    # Regex match options (e.g. case insensitive, multi line) collected
    # from the 'flags' part of the expression, when present.
    attr_reader :options

    protected

    # Overriding method. Returns the terminal-to-node-class mapping.
    def terminal2node()
      Terminal2NodeClass
    end

    # Overriding method.
    # Factory method for creating a node object for the given
    # input token.
    # @param aTokenPosition [Integer] Position of token in the input stream
    # @param aToken [Token] The input token
    def new_leaf_node(_production, _terminal, aTokenPosition, aToken)
      node = Rley::PTree::TerminalNode.new(aToken, aTokenPosition)

      return node
    end

    # Factory method for a greedy multiplicity (quantifier bounds).
    # @param lowerBound [Integer] minimum number of repetitions
    # @param upperBound [Integer, Symbol] maximum count or :more
    def multiplicity(lowerBound, upperBound)
      # FIX: was SRL::Regex::Multiplicity. No 'SRL' module is defined in
      # this gem, so that constant lookup always raised a NameError; the
      # Regex namespace is top-level (see regex_repr.rb), as every other
      # method in this class already assumes.
      return Regex::Multiplicity.new(lowerBound, upperBound, :greedy)
    end

    # Turn a string into the AST of its character-by-character match.
    # When to_escape is true, regex metacharacters are prefixed with
    # a backslash character node.
    def string_literal(aString, to_escape = true)
      if aString.size > 1
        chars = []
        aString.each_char do |ch|
          if to_escape && Regex::Character::MetaChars.include?(ch)
            chars << Regex::Character.new("\\")
          end
          chars << Regex::Character.new(ch)
        end
        result = Regex::Concatenation.new(*chars)
      elsif to_escape && Regex::Character::MetaChars.include?(aString)
        backslash = Regex::Character.new("\\")
        a_string = Regex::Character.new(aString)
        result = Regex::Concatenation.new(backslash, a_string)
      else
        result = Regex::Character.new(aString)
      end

      return result
    end

    # Factory method for a character range (e.g. a-z).
    def char_range(lowerBound, upperBound)
      # TODO fix module nesting
      lower = Regex::Character.new(lowerBound)
      upper = Regex::Character.new(upperBound)
      return Regex::CharRange.new(lower, upper)
    end

    # Factory method for a (possibly negated) character class.
    def char_class(toNegate, *theChildren)
      Regex::CharClass.new(toNegate, *theChildren)
    end

    # Factory method for a shorthand class (\d, \w, \s, ...).
    def char_shorthand(shortName)
      Regex::CharShorthand.new(shortName)
    end

    # Factory method for the '.' wildcard.
    def wildcard()
      Regex::Wildcard.new
    end

    # Factory method wrapping an expression with a quantifier.
    def repetition(expressionToRepeat, aMultiplicity)
      return Regex::Repetition.new(expressionToRepeat, aMultiplicity)
    end

    # Factory method for the start-of-line anchor.
    def begin_anchor
      return Regex::Anchor.new('^')
    end

    # rule('expression' => %w[pattern separator flags]).as 'flagged_expr'
    def reduce_flagged_expr(_production, aRange, theTokens, theChildren)
      @options = theChildren[2] if theChildren[2]
      return_first_child(aRange, theTokens, theChildren)
    end

    # rule('pattern' => %w[pattern separator quantifiable]).as 'pattern_sequence'
    def reduce_pattern_sequence(_production, _range, _tokens, theChildren)
      return Regex::Concatenation.new(theChildren[0], theChildren[2])
    end

    # rule('flags' => %w[flags separator single_flag]).as 'flag_sequence'
    def reduce_flag_sequence(_production, _range, _tokens, theChildren)
      theChildren[0] << theChildren[2]
    end

    # rule('single_flag' => %w[CASE INSENSITIVE]).as 'case_insensitive'
    def reduce_case_insensitive(_production, _range, _tokens, _children)
      return [Regex::MatchOption.new(:IGNORECASE, true)]
    end

    # rule('single_flag' => %w[MULTI LINE]).as 'multi_line'
    def reduce_multi_line(_production, _range, _tokens, _children)
      return [Regex::MatchOption.new(:MULTILINE, true)]
    end

    # rule('single_flag' => %w[ALL LAZY]).as 'all_lazy'
    def reduce_all_lazy(_production, _range, _tokens, _children)
      return [Regex::MatchOption.new(:ALL_LAZY, true)]
    end

    # rule 'quantifiable' => %w[begin_anchor anchorable end_anchor]
    def reduce_pinned_quantifiable(_production, _range, _tokens, theChildren)
      theChildren[1].begin_anchor = theChildren[0]
      theChildren[1].end_anchor = theChildren[2]
      return theChildren[1]
    end

    # rule 'quantifiable' => %w[begin_anchor anchorable]
    def reduce_begin_anchor_quantifiable(_production, _range, _tokens, theChildren)
      theChildren[1].begin_anchor = theChildren[0]
      return theChildren[1]
    end

    # rule 'quantifiable' => %w[anchorable end_anchor]
    def reduce_end_anchor_quantifiable(_production, _range, _tokens, theChildren)
      theChildren[0].end_anchor = theChildren[1]
      return theChildren[0]
    end

    # rule 'begin_anchor' => %w[STARTS WITH]
    def reduce_starts_with(_production, _range, _tokens, _children)
      begin_anchor
    end

    # rule 'begin_anchor' => %w[BEGIN WITH]
    def reduce_begin_with(_production, _range, _tokens, _children)
      begin_anchor
    end

    # rule('end_anchor' => %w[MUST END]).as 'end_anchor'
    def reduce_end_anchor(_production, _range, _tokens, _children)
      return Regex::Anchor.new('$')
    end

    # rule('anchorable' => %w[assertable assertion]).as 'asserted_anchorable'
    def reduce_asserted_anchorable(_production, _range, _tokens, theChildren)
      assertion = theChildren.last
      assertion.children.unshift(theChildren[0])
      return assertion
    end

    # rule('assertion' => %w[IF FOLLOWED BY assertable]).as 'if_followed'
    def reduce_if_followed(_production, _range, _tokens, theChildren)
      return Regex::Lookaround.new(theChildren.last, :ahead, :positive)
    end

    # rule('assertion' => %w[IF NOT FOLLOWED BY assertable]).as 'if_not_followed'
    def reduce_if_not_followed(_production, _range, _tokens, theChildren)
      return Regex::Lookaround.new(theChildren.last, :ahead, :negative)
    end

    # rule('assertion' => %w[IF ALREADY HAD assertable]).as 'if_had'
    def reduce_if_had(_production, _range, _tokens, theChildren)
      return Regex::Lookaround.new(theChildren.last, :behind, :positive)
    end

    # rule('assertion' => %w[IF NOT ALREADY HAD assertable]).as 'if_not_had'
    def reduce_if_not_had(_production, _range, _tokens, theChildren)
      return Regex::Lookaround.new(theChildren.last, :behind, :negative)
    end

    # rule('assertable' => %w[term quantifier]).as 'quantified_assertable'
    def reduce_quantified_assertable(_production, _range, _tokens, theChildren)
      quantifier = theChildren[1]
      term = theChildren[0]
      repetition(term, quantifier)
    end

    # rule('letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'lowercase_from_to'
    def reduce_lowercase_from_to(_production, _range, _tokens, theChildren)
      lower = theChildren[2].token.lexeme
      upper = theChildren[4].token.lexeme
      ch_range = char_range(lower, upper)
      char_class(false, ch_range)
    end

    # rule('letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'uppercase_from_to'
    def reduce_uppercase_from_to(_production, _range, _tokens, theChildren)
      lower = theChildren[3].token.lexeme
      upper = theChildren[5].token.lexeme
      ch_range = char_range(lower.upcase, upper.upcase)
      char_class(false, ch_range)
    end

    # rule('letter_range' => 'LETTER').as 'any_lowercase'
    def reduce_any_lowercase(_production, _range, _tokens, _children)
      ch_range = char_range('a', 'z')
      char_class(false, ch_range)
    end

    # rule('letter_range' => %w[UPPERCASE LETTER]).as 'any_uppercase'
    def reduce_any_uppercase(_production, _range, _tokens, _children)
      ch_range = char_range('A', 'Z')
      char_class(false, ch_range)
    end

    # rule('digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]).as 'digits_from_to'
    def reduce_digits_from_to(aProduction, aRange, theTokens, theChildren)
      # Same child layout as the letter range rule: delegate.
      reduce_lowercase_from_to(aProduction, aRange, theTokens, theChildren)
    end

    # rule('digit_range' => 'digit_or_number').as 'simple_digit_range'
    def reduce_simple_digit_range(_production, _range, _tokens, _children)
      char_shorthand('d')
    end

    # rule('character_class' => %w[ANY CHARACTER]).as 'any_character'
    def reduce_any_character(_production, _range, _tokens, _children)
      char_shorthand('w')
    end

    # rule('character_class' => %w[NO CHARACTER]).as 'no_character'
    def reduce_no_character(_production, _range, _tokens, _children)
      char_shorthand('W')
    end

    # rule('character_class' => 'WHITESPACE').as 'whitespace'
    def reduce_whitespace(_production, _range, _tokens, _children)
      char_shorthand('s')
    end

    # rule('character_class' => %w[NO WHITESPACE]).as 'no_whitespace'
    def reduce_no_whitespace(_production, _range, _tokens, _children)
      char_shorthand('S')
    end

    # rule('character_class' => 'ANYTHING').as 'anything'
    def reduce_anything(_production, _range, _tokens, _children)
      wildcard
    end

    # rule('character_class' => %w[ONE OF STRING_LIT]).as 'one_of'
    # (Comment fixed: it previously duplicated the 'any_of' rule.)
    def reduce_one_of(_production, _range, _tokens, theChildren)
      raw_literal = theChildren[-1].token.lexeme.dup
      alternatives = raw_literal.chars.map { |ch| Regex::Character.new(ch) }
      # TODO check other implementations
      return Regex::CharClass.new(false, *alternatives)
    end

    # rule('special_char' => 'TAB').as 'tab'
    def reduce_tab(_production, _range, _tokens, _children)
      Regex::Character.new('\t')
    end

    # rule('special_char' => 'BACKSLASH').as 'backslash'
    def reduce_backslash(_production, _range, _tokens, _children)
      Regex::Character.new('\\')
    end

    # rule('special_char' => %w[NEW LINE]).as 'new_line'
    def reduce_new_line(_production, _range, _tokens, _children)
      # TODO: control portability
      Regex::Character.new('\n')
    end

    # rule('literal' => %w[LITERALLY STRING_LIT]).as 'literally'
    def reduce_literally(_production, _range, _tokens, theChildren)
      # What if literal is empty?...

      raw_literal = theChildren[-1].token.lexeme.dup
      return string_literal(raw_literal)
    end

    # rule('alternation' => %w[ANY OF LPAREN alternatives RPAREN]).as 'any_of'
    def reduce_any_of(_production, _range, _tokens, theChildren)
      return Regex::Alternation.new(*theChildren[3])
    end

    # rule('alternatives' => %w[alternatives separator quantifiable]).as 'alternative_list'
    def reduce_alternative_list(_production, _range, _tokens, theChildren)
      return theChildren[0] << theChildren[-1]
    end

    # rule('alternatives' => 'quantifiable').as 'simple_alternative'
    def reduce_simple_alternative(_production, _range, _tokens, theChildren)
      return [theChildren.last]
    end

    # rule('grouping' => %w[LPAREN pattern RPAREN]).as 'grouping_parenthenses'
    def reduce_grouping_parenthenses(_production, _range, _tokens, theChildren)
      return Regex::NonCapturingGroup.new(theChildren[1])
    end

    # rule('capturing_group' => %w[CAPTURE assertable]).as 'capture'
    def reduce_capture(_production, _range, _tokens, theChildren)
      return Regex::CapturingGroup.new(theChildren[1])
    end

    # rule('capturing_group' => %w[CAPTURE assertable UNTIL assertable]).as
    # 'capture_until'
    def reduce_capture_until(_production, _range, _tokens, theChildren)
      group = Regex::CapturingGroup.new(theChildren[1])
      return Regex::Concatenation.new(group, theChildren[3])
    end

    # rule('capturing_group' => %w[CAPTURE assertable AS var_name]).as
    # 'named_capture'
    def reduce_named_capture(_production, _range, _tokens, theChildren)
      name = theChildren[3].token.lexeme.dup
      return Regex::CapturingGroup.new(theChildren[1], name)
    end

    # rule('capturing_group' => %w[CAPTURE assertable AS var_name
    # UNTIL assertable]).as 'named_capture_until'
    def reduce_named_capture_until(_production, _range, _tokens, theChildren)
      name = theChildren[3].token.lexeme.dup
      group = Regex::CapturingGroup.new(theChildren[1], name)
      return Regex::Concatenation.new(group, theChildren[5])
    end

    # rule('quantifier' => 'ONCE').as 'once'
    def reduce_once(_production, _range, _tokens, _children)
      multiplicity(1, 1)
    end

    # rule('quantifier' => 'TWICE').as 'twice'
    def reduce_twice(_production, _range, _tokens, _children)
      multiplicity(2, 2)
    end

    # rule('quantifier' => %w[EXACTLY count TIMES]).as 'exactly'
    def reduce_exactly(_production, _range, _tokens, theChildren)
      count = theChildren[1].token.lexeme.to_i
      multiplicity(count, count)
    end

    # rule('quantifier' => %w[BETWEEN count AND count times_suffix]).as
    # 'between_and'
    def reduce_between_and(_production, _range, _tokens, theChildren)
      lower = theChildren[1].token.lexeme.to_i
      upper = theChildren[3].token.lexeme.to_i
      multiplicity(lower, upper)
    end

    # rule('quantifier' => 'OPTIONAL').as 'optional'
    def reduce_optional(_production, _range, _tokens, _children)
      multiplicity(0, 1)
    end

    # rule('quantifier' => %w[ONCE OR MORE]).as 'once_or_more'
    def reduce_once_or_more(_production, _range, _tokens, _children)
      multiplicity(1, :more)
    end

    # rule('quantifier' => %w[NEVER OR MORE]).as 'never_or_more'
    def reduce_never_or_more(_production, _range, _tokens, _children)
      multiplicity(0, :more)
    end

    # rule('quantifier' => %w[AT LEAST count TIMES]).as 'at_least'
    def reduce_at_least(_production, _range, _tokens, theChildren)
      count = theChildren[2].token.lexeme.to_i
      multiplicity(count, :more)
    end

    # rule('times_suffix' => 'TIMES').as 'times_keyword'
    def reduce_times_keyword(_production, _range, _tokens, _children)
      return nil
    end

    # rule('times_suffix' => []).as 'times_dropped'
    def reduce_times_dropped(_production, _range, _tokens, _children)
      return nil
    end
  end # class
end # module
# End of file
@@ -0,0 +1,106 @@
# Grammar for SRL (Simple Regex Language)
require 'rley' # Load the gem

module SrlRuby
  ########################################
  # Work in progress.
  # This is a very partial grammar of SRL.
  # It will be expanded with the coming versions of Rley
  builder = Rley::Syntax::GrammarBuilder.new do
    # Terminal symbols (delivered by the tokenizer)
    add_terminals('LPAREN', 'RPAREN', 'COMMA')
    add_terminals('DIGIT_LIT', 'INTEGER', 'LETTER_LIT')
    add_terminals('LITERALLY', 'STRING_LIT')
    add_terminals('BEGIN', 'STARTS', 'WITH')
    add_terminals('MUST', 'END')
    add_terminals('UPPERCASE', 'LETTER', 'FROM', 'TO')
    add_terminals('DIGIT', 'NUMBER', 'ANY', 'NO')
    add_terminals('CHARACTER', 'WHITESPACE', 'ANYTHING')
    add_terminals('TAB', 'BACKSLASH', 'NEW', 'LINE')
    add_terminals('OF', 'ONE')
    add_terminals('EXACTLY', 'TIMES', 'ONCE', 'TWICE')
    add_terminals('BETWEEN', 'AND', 'OPTIONAL', 'OR')
    add_terminals('MORE', 'NEVER', 'AT', 'LEAST')
    add_terminals('IF', 'FOLLOWED', 'BY', 'NOT')
    add_terminals('ALREADY', 'HAD')
    add_terminals('CAPTURE', 'AS', 'UNTIL')
    add_terminals('CASE', 'INSENSITIVE', 'MULTI', 'ALL')
    add_terminals('LAZY')

    # Top-level productions
    rule('srl' => 'expression').as 'start_rule'
    rule('expression' => %w[pattern separator flags]).as 'flagged_expr'
    rule('expression' => 'pattern').as 'simple_expr'
    rule('pattern' => %w[pattern separator quantifiable]).as 'pattern_sequence'
    rule('pattern' => 'quantifiable').as 'basic_pattern'
    rule('separator' => 'COMMA').as 'comma_separator'
    rule('separator' => []).as 'void_separator'

    # Flags
    # NOTE(review): 'flags' only has a recursive production; without a base
    # production (e.g. rule('flags' => 'single_flag')) a flags list can never
    # be derived, so 'flagged_expr' looks unreachable — confirm against specs.
    # A fix also needs a matching reduce_ method in ASTBuilder.
    rule('flags' => %w[flags separator single_flag]).as 'flag_sequence'
    rule('single_flag' => %w[CASE INSENSITIVE]).as 'case_insensitive'
    rule('single_flag' => %w[MULTI LINE]).as 'multi_line'
    rule('single_flag' => %w[ALL LAZY]).as 'all_lazy'

    # Anchors
    rule('quantifiable' => %w[begin_anchor anchorable end_anchor]).as 'pinned_quantifiable'
    rule('quantifiable' => %w[begin_anchor anchorable]).as 'begin_anchor_quantifiable'
    rule('quantifiable' => %w[anchorable end_anchor]).as 'end_anchor_quantifiable'
    rule('quantifiable' => 'anchorable').as 'simple_quantifiable'
    rule('begin_anchor' => %w[STARTS WITH]).as 'starts_with'
    rule('begin_anchor' => %w[BEGIN WITH]).as 'begin_with'
    rule('end_anchor' => %w[MUST END]).as 'end_anchor'

    # Lookaround assertions
    rule('anchorable' => 'assertable').as 'simple_anchorable'
    rule('anchorable' => %w[assertable assertion]).as 'asserted_anchorable'
    rule('assertion' => %w[IF FOLLOWED BY assertable]).as 'if_followed'
    rule('assertion' => %w[IF NOT FOLLOWED BY assertable]).as 'if_not_followed'
    rule('assertion' => %w[IF ALREADY HAD assertable]).as 'if_had'
    rule('assertion' => %w[IF NOT ALREADY HAD assertable]).as 'if_not_had'
    rule('assertable' => 'term').as 'simple_assertable'
    rule('assertable' => %w[term quantifier]).as 'quantified_assertable'

    # Terms and atoms
    rule('term' => 'atom').as 'atom_term'
    rule('term' => 'alternation').as 'alternation_term'
    rule('term' => 'grouping').as 'grouping_term'
    rule('term' => 'capturing_group').as 'capturing_group_atom'
    rule('atom' => 'letter_range').as 'letter_range_atom'
    rule('atom' => 'digit_range').as 'digit_range_atom'
    rule('atom' => 'character_class').as 'character_class_atom'
    rule('atom' => 'special_char').as 'special_char_atom'
    rule('atom' => 'literal').as 'literal_atom'

    # Character ranges and classes
    rule('letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'lowercase_from_to'
    rule('letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'uppercase_from_to'
    rule('letter_range' => 'LETTER').as 'any_lowercase'
    rule('letter_range' => %w[UPPERCASE LETTER]).as 'any_uppercase'
    rule('digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]).as 'digits_from_to'
    rule('digit_range' => 'digit_or_number').as 'simple_digit_range'
    rule('character_class' => %w[ANY CHARACTER]).as 'any_character'
    rule('character_class' => %w[NO CHARACTER]).as 'no_character'
    rule('character_class' => 'WHITESPACE').as 'whitespace'
    rule('character_class' => %w[NO WHITESPACE]).as 'no_whitespace'
    rule('character_class' => 'ANYTHING').as 'anything'
    rule('character_class' => %w[ONE OF STRING_LIT]).as 'one_of'
    rule('special_char' => 'TAB').as 'tab'
    rule('special_char' => 'BACKSLASH').as 'backslash'
    rule('special_char' => %w[NEW LINE]).as 'new_line'
    rule('literal' => %w[LITERALLY STRING_LIT]).as 'literally'

    # Alternations and groups
    rule('alternation' => %w[ANY OF LPAREN alternatives RPAREN]).as 'any_of'
    rule('alternatives' => %w[alternatives separator quantifiable]).as 'alternative_list'
    rule('alternatives' => 'quantifiable').as 'simple_alternative'
    rule('grouping' => %w[LPAREN pattern RPAREN]).as 'grouping_parenthenses'
    rule('capturing_group' => %w[CAPTURE assertable]).as 'capture'
    rule('capturing_group' => %w[CAPTURE assertable UNTIL assertable]).as 'capture_until'
    rule('capturing_group' => %w[CAPTURE assertable AS var_name]).as 'named_capture'
    rule('capturing_group' => %w[CAPTURE assertable AS var_name UNTIL assertable]).as 'named_capture_until'
    rule('var_name' => 'STRING_LIT').as 'var_name'

    # Quantifiers
    rule('quantifier' => 'ONCE').as 'once'
    rule('quantifier' => 'TWICE').as 'twice'
    rule('quantifier' => %w[EXACTLY count TIMES]).as 'exactly'
    rule('quantifier' => %w[BETWEEN count AND count times_suffix]).as 'between_and'
    rule('quantifier' => 'OPTIONAL').as 'optional'
    rule('quantifier' => %w[ONCE OR MORE]).as 'once_or_more'
    rule('quantifier' => %w[NEVER OR MORE]).as 'never_or_more'
    rule('quantifier' => %w[AT LEAST count TIMES]).as 'at_least'
    rule('digit_or_number' => 'DIGIT').as 'digit_keyword'
    rule('digit_or_number' => 'NUMBER').as 'number_keyword'
    rule('count' => 'DIGIT_LIT').as 'single_digit'
    rule('count' => 'INTEGER').as 'integer_count'
    rule('times_suffix' => 'TIMES').as 'times_keyword'
    rule('times_suffix' => []).as 'times_dropped'
  end

  # And now build the grammar and make it accessible via a global constant
  Grammar = builder.grammar
end # module
@@ -0,0 +1,13 @@
1
+ require_relative '../regex/character'
2
+ require_relative '../regex/char_range'
3
+ require_relative '../regex/concatenation'
4
+ require_relative '../regex/multiplicity'
5
+ require_relative '../regex/repetition'
6
+ require_relative '../regex/char_class'
7
+ require_relative '../regex/char_shorthand'
8
+ require_relative '../regex/wildcard'
9
+ require_relative '../regex/alternation'
10
+ require_relative '../regex/non_capturing_group'
11
+ require_relative '../regex/anchor'
12
+ require_relative '../regex/lookaround'
13
+ require_relative '../regex/capturing_group'
@@ -0,0 +1,147 @@
# File: srl_tokenizer.rb
# Tokenizer for SRL (Simple Regex Language)
require 'strscan'
require 'rley' # Load the Rley gem

module SrlRuby
  # The tokenizer should recognize:
  # Keywords: as, capture, letter
  # Integer literals including single digit
  # String literals (quote delimited)
  # Single character literal
  # Delimiters: parentheses '(' and ')'
  # Separators: comma (optional)
  class Tokenizer
    # The StringScanner over the SRL source text.
    attr_reader(:scanner)
    # 1-based number of the line currently being scanned.
    attr_reader(:lineno)
    # Offset (in the input) of the first character of the current line.
    attr_reader(:line_start)

    # Mapping of single-character lexemes to their terminal names.
    # (Was a @@ class variable; a frozen constant avoids shared mutable
    # state across the inheritance tree.)
    Lexeme2name = {
      '(' => 'LPAREN',
      ')' => 'RPAREN',
      ',' => 'COMMA'
    }.freeze

    # Here are all the SRL keywords (in uppercase).
    # Maps each keyword to itself for O(1) lookup of terminal names.
    Keywords = %w[
      ALL
      ALREADY
      AND
      ANY
      ANYTHING
      AS
      AT
      BACKSLASH
      BEGIN
      BETWEEN
      BY
      CAPTURE
      CASE
      CHARACTER
      DIGIT
      END
      EXACTLY
      FOLLOWED
      FROM
      HAD
      IF
      INSENSITIVE
      LAZY
      LEAST
      LETTER
      LINE
      LITERALLY
      MORE
      MULTI
      MUST
      NEVER
      NEW
      NO
      NOT
      NUMBER
      OF
      ONCE
      ONE
      OPTIONAL
      OR
      STARTS
      TAB
      TIMES
      TO
      TWICE
      UNTIL
      UPPERCASE
      WHITESPACE
      WITH
    ].map { |x| [x, x] }.to_h.freeze

    # Raised when the scanner hits a character sequence it cannot tokenize.
    class ScanError < StandardError; end

    # @param source [String] the SRL text to tokenize
    def initialize(source)
      @scanner = StringScanner.new(source)
      @lineno = 1
      @line_start = 0 # was never initialized; reader returned nil
    end

    # Scan the whole input and return the token sequence.
    # @return [Array<Rley::Lexical::Token>]
    def tokens()
      tok_sequence = []
      until @scanner.eos?
        token = _next_token
        tok_sequence << token unless token.nil?
      end

      return tok_sequence
    end

    private

    # Scan and return the next token (nil at end of input).
    # @raise [ScanError] on an unrecognized character sequence
    def _next_token()
      skip_whitespaces
      curr_ch = scanner.peek(1)
      return nil if curr_ch.nil? || curr_ch.empty?

      token = nil

      if '(),'.include? curr_ch
        # Delimiters, separators => single character token
        token = build_token(Lexeme2name[curr_ch], scanner.getch)
      elsif (lexeme = scanner.scan(/[0-9]{2,}/))
        token = build_token('INTEGER', lexeme) # An integer has 2..* digits
      elsif (lexeme = scanner.scan(/[0-9]/))
        token = build_token('DIGIT_LIT', lexeme)
      elsif (lexeme = scanner.scan(/[a-zA-Z]{2,}/))
        token = build_token(Keywords[lexeme.upcase], lexeme)
        # TODO: handle case unknown identifier
      elsif (lexeme = scanner.scan(/[a-zA-Z]((?=\s)|$)/))
        token = build_token('LETTER_LIT', lexeme)
      elsif (lexeme = scanner.scan(/"(?:\\"|[^"])*"/)) # Double quotes literal?
        # FIX: the escaped-quote alternative must be tried before [^"];
        # in the original order [^"] consumed the backslash and the quote
        # then terminated the literal prematurely.
        unquoted = lexeme.gsub(/(^")|("$)/, '')
        token = build_token('STRING_LIT', unquoted)
      elsif (lexeme = scanner.scan(/'(?:\\'|[^'])*'/)) # Single quotes literal?
        unquoted = lexeme.gsub(/(^')|('$)/, '')
        token = build_token('STRING_LIT', unquoted)
      else # Unknown token
        erroneous = curr_ch.nil? ? '' : curr_ch
        sequel = scanner.scan(/.{1,20}/)
        erroneous += sequel unless sequel.nil?
        raise ScanError.new("Unknown token #{erroneous}")
      end

      return token
    end

    # Wrap a lexeme into a Rley token tagged with the terminal name.
    def build_token(aSymbolName, aLexeme)
      begin
        token = Rley::Lexical::Token.new(aLexeme, aSymbolName)
      rescue StandardError => e
        puts "Failing with '#{aSymbolName}' and '#{aLexeme}'"
        # FIX: the original raised the undefined local 'ex' here, turning
        # every construction error into a NameError.
        raise e
      end

      return token
    end

    # Advance past whitespace, updating the line bookkeeping that the
    # original version declared (lineno, line_start) but never maintained.
    def skip_whitespaces()
      ws = scanner.scan(/[ \t\f\n\r]+/)
      return if ws.nil?

      newline_count = ws.count("\n") # a \r\n pair counts once via its \n
      return if newline_count.zero?

      @lineno += newline_count
      # Position just after the last newline consumed
      @line_start = scanner.pos - (ws.length - (ws.rindex("\n") + 1))
    end
  end # class
end # module
@@ -0,0 +1,3 @@
module SrlRuby
  # Version number of the srl_ruby gem (frozen, semantic-versioning style).
  VERSION = '0.0.1'.freeze
end
data/lib/srl_ruby.rb ADDED
@@ -0,0 +1,4 @@
# Entry point of the gem: pulls in the version constant and opens
# the namespace module.
require_relative 'srl_ruby/version'

module SrlRuby # This module is used as a namespace
end