srl_ruby 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +7 -0
  2. data/.rspec +4 -0
  3. data/.rubocop.yml +3 -0
  4. data/.yardopts +6 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +66 -0
  8. data/Rakefile +16 -0
  9. data/bin/srl_ruby +58 -0
  10. data/lib/regex/abstract_method.rb +35 -0
  11. data/lib/regex/alternation.rb +27 -0
  12. data/lib/regex/anchor.rb +45 -0
  13. data/lib/regex/atomic_expression.rb +16 -0
  14. data/lib/regex/capturing_group.rb +51 -0
  15. data/lib/regex/char_class.rb +38 -0
  16. data/lib/regex/char_range.rb +51 -0
  17. data/lib/regex/char_shorthand.rb +50 -0
  18. data/lib/regex/character.rb +204 -0
  19. data/lib/regex/compound_expression.rb +57 -0
  20. data/lib/regex/concatenation.rb +29 -0
  21. data/lib/regex/expression.rb +60 -0
  22. data/lib/regex/lookaround.rb +50 -0
  23. data/lib/regex/match_option.rb +34 -0
  24. data/lib/regex/monadic_expression.rb +28 -0
  25. data/lib/regex/multiplicity.rb +91 -0
  26. data/lib/regex/non_capturing_group.rb +27 -0
  27. data/lib/regex/polyadic_expression.rb +60 -0
  28. data/lib/regex/quantifiable.rb +22 -0
  29. data/lib/regex/repetition.rb +29 -0
  30. data/lib/regex/wildcard.rb +23 -0
  31. data/lib/srl_ruby/ast_builder.rb +384 -0
  32. data/lib/srl_ruby/grammar.rb +106 -0
  33. data/lib/srl_ruby/regex_repr.rb +13 -0
  34. data/lib/srl_ruby/tokenizer.rb +147 -0
  35. data/lib/srl_ruby/version.rb +3 -0
  36. data/lib/srl_ruby.rb +4 -0
  37. data/spec/integration_spec.rb +451 -0
  38. data/spec/regex/character_spec.rb +166 -0
  39. data/spec/regex/multiplicity_spec.rb +79 -0
  40. data/spec/spec_helper.rb +16 -0
  41. data/spec/srl_ruby/srl_ruby_spec.rb +7 -0
  42. data/spec/srl_ruby/tokenizer_spec.rb +147 -0
  43. data/srl_ruby.gemspec +58 -0
  44. metadata +150 -0
@@ -0,0 +1,23 @@
1
+ # File: wildcard.rb
2
+
3
+ require_relative 'atomic_expression' # Access the superclass
4
+
5
+ module Regex # This module is used as a namespace
6
+ # A wildcard matches any character (except for the newline).
7
+ class Wildcard < AtomicExpression
8
+ # Constructor
9
+ def initialize()
10
+ super
11
+ end
12
+
13
+ protected
14
+
15
+ # Conversion method re-definition.
16
+ # Purpose: Return the String representation of the expression.
17
+ def text_repr()
18
+ return '.'
19
+ end
20
+ end # class
21
+ end # module
22
+
23
+ # End of file
@@ -0,0 +1,384 @@
1
+ require 'stringio'
2
+ require_relative 'regex_repr'
3
+
4
+ module SrlRuby
5
+ # The purpose of a ASTBuilder is to build piece by piece an AST
6
+ # (Abstract Syntax Tree) from a sequence of input tokens and
7
+ # visit events produced by walking over a GFGParsing object.
8
+ # Uses the Builder GoF pattern.
9
+ # The Builder pattern creates a complex object
10
+ # (say, a parse tree) from simpler objects (terminal and non-terminal
11
+ # nodes) and using a step by step approach.
12
+ class ASTBuilder < Rley::ParseRep::ASTBaseBuilder
13
+ Terminal2NodeClass = {}.freeze
14
+
15
+ attr_reader :options
16
+
17
+ protected
18
+
19
+ def terminal2node()
20
+ Terminal2NodeClass
21
+ end
22
+
23
+ # Overriding method.
24
+ # Factory method for creating a node object for the given
25
+ # input token.
26
+ # @param aTerminal [Terminal] Terminal symbol associated with the token
27
+ # @param aTokenPosition [Integer] Position of token in the input stream
28
+ # @param aToken [Token] The input token
29
+ def new_leaf_node(_production, _terminal, aTokenPosition, aToken)
30
+ node = Rley::PTree::TerminalNode.new(aToken, aTokenPosition)
31
+
32
+ return node
33
+ end
34
+
35
+ def multiplicity(lowerBound, upperBound)
36
+ return SRL::Regex::Multiplicity.new(lowerBound, upperBound, :greedy)
37
+ end
38
+
39
+ def string_literal(aString, to_escape = true)
40
+ if aString.size > 1
41
+ chars = []
42
+ aString.each_char do |ch|
43
+ if to_escape && Regex::Character::MetaChars.include?(ch)
44
+ chars << Regex::Character.new("\\")
45
+ end
46
+ chars << Regex::Character.new(ch)
47
+ end
48
+ result = Regex::Concatenation.new(*chars)
49
+ elsif to_escape && Regex::Character::MetaChars.include?(aString)
50
+ backslash = Regex::Character.new("\\")
51
+ a_string = Regex::Character.new(aString)
52
+ result = Regex::Concatenation.new(backslash, a_string)
53
+ else
54
+ result = Regex::Character.new(aString)
55
+ end
56
+
57
+ return result
58
+ end
59
+
60
+ def char_range(lowerBound, upperBound)
61
+ # TODO fix module nesting
62
+ lower = Regex::Character.new(lowerBound)
63
+ upper = Regex::Character.new(upperBound)
64
+ return Regex::CharRange.new(lower, upper)
65
+ end
66
+
67
+ def char_class(toNegate, *theChildren)
68
+ Regex::CharClass.new(toNegate, *theChildren)
69
+ end
70
+
71
+ def char_shorthand(shortName)
72
+ Regex::CharShorthand.new(shortName)
73
+ end
74
+
75
+ def wildcard()
76
+ Regex::Wildcard.new
77
+ end
78
+
79
+ def repetition(expressionToRepeat, aMultiplicity)
80
+ return Regex::Repetition.new(expressionToRepeat, aMultiplicity)
81
+ end
82
+
83
+ def begin_anchor
84
+ return Regex::Anchor.new('^')
85
+ end
86
+
87
+ # rule('expression' => %w[pattern separator flags]).as 'flagged_expr'
88
+ def reduce_flagged_expr(_production, aRange, theTokens, theChildren)
89
+ @options = theChildren[2] if theChildren[2]
90
+ return_first_child(aRange, theTokens, theChildren)
91
+ end
92
+
93
+ # rule('pattern' => %w[pattern separator quantifiable]).as 'pattern_sequence'
94
+ def reduce_pattern_sequence(_production, _range, _tokens, theChildren)
95
+ return Regex::Concatenation.new(theChildren[0], theChildren[2])
96
+ end
97
+
98
+ # rule('flags' => %[flags separator single_flag]).as 'flag_sequence'
99
+ def reduce_flag_sequence(_production, _range, _tokens, theChildren)
100
+ theChildren[0] << theChildren[2]
101
+ end
102
+
103
+ # rule('single_flag' => %w[CASE INSENSITIVE]).as 'case_insensitive'
104
+ def reduce_case_insensitive(_production, _range, _tokens, _children)
105
+ return [Regex::MatchOption.new(:IGNORECASE, true)]
106
+ end
107
+
108
+ # rule('single_flag' => %w[MULTI LINE]).as 'multi_line'
109
+ def reduce_multi_line(_production, _range, _tokens, _children)
110
+ return [Regex::MatchOption.new(:MULTILINE, true)]
111
+ end
112
+
113
+ # rule('single_flag' => %w[ALL LAZY]).as 'all_lazy'
114
+ def reduce_all_lazy(_production, _range, _tokens, _children)
115
+ return [Regex::MatchOption.new(:ALL_LAZY, true)]
116
+ end
117
+
118
+ # rule 'quantifiable' => %w[begin_anchor anchorable end_anchor]
119
+ def reduce_pinned_quantifiable(_production, _range, _tokens, theChildren)
120
+ theChildren[1].begin_anchor = theChildren[0]
121
+ theChildren[1].end_anchor = theChildren[2]
122
+ return theChildren[1]
123
+ end
124
+
125
+ # rule 'quantifiable' => %w[begin_anchor anchorable]
126
+ def reduce_begin_anchor_quantifiable(_production, _range, _tokens, theChildren)
127
+ theChildren[1].begin_anchor = theChildren[0]
128
+ return theChildren[1]
129
+ end
130
+
131
+ # rule 'quantifiable' => %w[anchorable end_anchor]
132
+ def reduce_end_anchor_quantifiable(_production, _range, _tokens, theChildren)
133
+ theChildren[0].end_anchor = theChildren[1]
134
+ return theChildren[0]
135
+ end
136
+
137
+ # rule 'begin_anchor' => %w[STARTS WITH]
138
+ def reduce_starts_with(_production, _range, _tokens, _children)
139
+ begin_anchor
140
+ end
141
+
142
+ # rule 'begin_anchor' => %w[BEGIN WITH]
143
+ def reduce_begin_with(_production, _range, _tokens, _children)
144
+ begin_anchor
145
+ end
146
+
147
+ # rule 'end_anchor' => %w[MUST END].as 'end_anchor'
148
+ def reduce_end_anchor(_production, _range, _tokens, _children)
149
+ return Regex::Anchor.new('$')
150
+ end
151
+
152
+ # rule('anchorable' => %w[assertable assertion]).as 'asserted_anchorable'
153
+ def reduce_asserted_anchorable(_production, _range, _tokens, theChildren)
154
+ assertion = theChildren.last
155
+ assertion.children.unshift(theChildren[0])
156
+ return assertion
157
+ end
158
+
159
+ # rule('assertion' => %w[IF FOLLOWED BY assertable]).as 'if_followed'
160
+ def reduce_if_followed(_production, _range, _tokens, theChildren)
161
+ return Regex::Lookaround.new(theChildren.last, :ahead, :positive)
162
+ end
163
+
164
+ # rule('assertion' => %w[IF NOT FOLLOWED BY assertable]).as 'if_not_followed'
165
+ def reduce_if_not_followed(_production, _range, _tokens, theChildren)
166
+ return Regex::Lookaround.new(theChildren.last, :ahead, :negative)
167
+ end
168
+
169
+ # rule('assertion' => %w[IF ALREADY HAD assertable]).as 'if_had'
170
+ def reduce_if_had(_production, _range, _tokens, theChildren)
171
+ return Regex::Lookaround.new(theChildren.last, :behind, :positive)
172
+ end
173
+
174
+ # rule('assertion' => %w[IF NOT ALREADY HAD assertable]).as 'if_not_had'
175
+ def reduce_if_not_had(_production, _range, _tokens, theChildren)
176
+ return Regex::Lookaround.new(theChildren.last, :behind, :negative)
177
+ end
178
+
179
+ # rule('assertable' => %w[term quantifier]).as 'quantified_assertable'
180
+ def reduce_quantified_assertable(_production, _range, _tokens, theChildren)
181
+ quantifier = theChildren[1]
182
+ term = theChildren[0]
183
+ repetition(term, quantifier)
184
+ end
185
+
186
+ # rule('letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'lowercase_from_to'
187
+ def reduce_lowercase_from_to(_production, _range, _tokens, theChildren)
188
+ lower = theChildren[2].token.lexeme
189
+ upper = theChildren[4].token.lexeme
190
+ ch_range = char_range(lower, upper)
191
+ char_class(false, ch_range)
192
+ end
193
+
194
+ # rule('letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'uppercase_from_to'
195
+ def reduce_uppercase_from_to(_production, _range, _tokens, theChildren)
196
+ lower = theChildren[3].token.lexeme
197
+ upper = theChildren[5].token.lexeme
198
+ ch_range = char_range(lower.upcase, upper.upcase)
199
+ char_class(false, ch_range)
200
+ end
201
+
202
+ # rule('letter_range' => 'LETTER').as 'any_lowercase'
203
+ def reduce_any_lowercase(_production, _range, _tokens, _children)
204
+ ch_range = char_range('a', 'z')
205
+ char_class(false, ch_range)
206
+ end
207
+
208
+ # rule('letter_range' => %w[UPPERCASE LETTER]).as 'any_uppercase'
209
+ def reduce_any_uppercase(_production, _range, _tokens, _children)
210
+ ch_range = char_range('A', 'Z')
211
+ char_class(false, ch_range)
212
+ end
213
+
214
+ # rule('digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]).as 'digits_from_to'
215
+ def reduce_digits_from_to(aProduction, aRange, theTokens, theChildren)
216
+ reduce_lowercase_from_to(aProduction, aRange, theTokens, theChildren)
217
+ end
218
+
219
+ # rule('digit_range' => 'digit_or_number').as 'simple_digit_range'
220
+ def reduce_simple_digit_range(_production, _range, _tokens, _children)
221
+ char_shorthand('d')
222
+ end
223
+
224
+ # rule('character_class' => %w[ANY CHARACTER]).as 'any_character'
225
+ def reduce_any_character(_production, _range, _tokens, _children)
226
+ char_shorthand('w')
227
+ end
228
+
229
+ # rule('character_class' => %w[NO CHARACTER]).as 'no_character'
230
+ def reduce_no_character(_production, _range, _tokens, _children)
231
+ char_shorthand('W')
232
+ end
233
+
234
+ # rule('character_class' => 'WHITESPACE').as 'whitespace'
235
+ def reduce_whitespace(_production, _range, _tokens, _children)
236
+ char_shorthand('s')
237
+ end
238
+
239
+ # rule('character_class' => %w[NO WHITESPACE]).as 'no_whitespace'
240
+ def reduce_no_whitespace(_production, _range, _tokens, _children)
241
+ char_shorthand('S')
242
+ end
243
+
244
+ # rule('character_class' => 'ANYTHING').as 'anything'
245
+ def reduce_anything(_production, _range, _tokens, _children)
246
+ wildcard
247
+ end
248
+
249
+ # rule('character_class' => %w[ONE OF STRING_LIT]).as 'one_of'
250
+ def reduce_one_of(_production, _range, _tokens, theChildren)
251
+ raw_literal = theChildren[-1].token.lexeme.dup
252
+ alternatives = raw_literal.chars.map { |ch| Regex::Character.new(ch) }
253
+ # TODO check other implementations
254
+ return Regex::CharClass.new(false, *alternatives)
255
+ end
256
+
257
+ # rule('special_char' => 'TAB').as 'tab'
258
+ def reduce_tab(_production, _range, _tokens, _children)
259
+ Regex::Character.new('\t')
260
+ end
261
+
262
+ # rule('special_char' => 'BACKSLASH').as 'backslash'
263
+ def reduce_backslash(_production, _range, _tokens, _children)
264
+ Regex::Character.new('\\')
265
+ end
266
+
267
+ # rule('special_char' => %w[NEW LINE]).as 'new_line'
268
+ def reduce_new_line(_production, _range, _tokens, _children)
269
+ # TODO: control portability
270
+ Regex::Character.new('\n')
271
+ end
272
+
273
+ # rule('literal' => %w[LITERALLY STRING_LIT]).as 'literally'
274
+ def reduce_literally(_production, _range, _tokens, theChildren)
275
+ # What if literal is empty?...
276
+
277
+ raw_literal = theChildren[-1].token.lexeme.dup
278
+ return string_literal(raw_literal)
279
+ end
280
+
281
+ # rule('alternation' => %w[ANY OF LPAREN alternatives RPAREN]).as 'any_of'
282
+ def reduce_any_of(_production, _range, _tokens, theChildren)
283
+ return Regex::Alternation.new(*theChildren[3])
284
+ end
285
+
286
+ # rule('alternatives' => %w[alternatives separator quantifiable]).as 'alternative_list'
287
+ def reduce_alternative_list(_production, _range, _tokens, theChildren)
288
+ return theChildren[0] << theChildren[-1]
289
+ end
290
+
291
+ # rule('alternatives' => 'quantifiable').as 'simple_alternative'
292
+ def reduce_simple_alternative(_production, _range, _tokens, theChildren)
293
+ return [theChildren.last]
294
+ end
295
+
296
+ # rule('grouping' => %w[LPAREN pattern RPAREN]).as 'grouping_parenthenses'
297
+ def reduce_grouping_parenthenses(_production, _range, _tokens, theChildren)
298
+ return Regex::NonCapturingGroup.new(theChildren[1])
299
+ end
300
+
301
+ # rule('capturing_group' => %w[CAPTURE assertable]).as 'capture'
302
+ def reduce_capture(_production, _range, _tokens, theChildren)
303
+ return Regex::CapturingGroup.new(theChildren[1])
304
+ end
305
+
306
+ # rule('capturing_group' => %w[CAPTURE assertable UNTIL assertable]).as
307
+ # 'capture_until'
308
+ def reduce_capture_until(_production, _range, _tokens, theChildren)
309
+ group = Regex::CapturingGroup.new(theChildren[1])
310
+ return Regex::Concatenation.new(group, theChildren[3])
311
+ end
312
+
313
+ # rule('capturing_group' => %w[CAPTURE assertable AS var_name]).as
314
+ # 'named_capture'
315
+ def reduce_named_capture(_production, _range, _tokens, theChildren)
316
+ name = theChildren[3].token.lexeme.dup
317
+ return Regex::CapturingGroup.new(theChildren[1], name)
318
+ end
319
+
320
+ # rule('capturing_group' => %w[CAPTURE assertable AS var_name
321
+ # UNTIL assertable]).as 'named_capture_until'
322
+ def reduce_named_capture_until(_production, _range, _tokens, theChildren)
323
+ name = theChildren[3].token.lexeme.dup
324
+ group = Regex::CapturingGroup.new(theChildren[1], name)
325
+ return Regex::Concatenation.new(group, theChildren[5])
326
+ end
327
+
328
+ # rule('quantifier' => 'ONCE').as 'once'
329
+ def reduce_once(_production, _range, _tokens, _children)
330
+ multiplicity(1, 1)
331
+ end
332
+
333
+ # rule('quantifier' => 'TWICE').as 'twice'
334
+ def reduce_twice(_production, _range, _tokens, _children)
335
+ multiplicity(2, 2)
336
+ end
337
+
338
+ # rule('quantifier' => %w[EXACTLY count TIMES]).as 'exactly'
339
+ def reduce_exactly(_production, _range, _tokens, theChildren)
340
+ count = theChildren[1].token.lexeme.to_i
341
+ multiplicity(count, count)
342
+ end
343
+
344
+ # rule('quantifier' => %w[BETWEEN count AND count times_suffix]).as
345
+ # 'between_and'
346
+ def reduce_between_and(_production, _range, _tokens, theChildren)
347
+ lower = theChildren[1].token.lexeme.to_i
348
+ upper = theChildren[3].token.lexeme.to_i
349
+ multiplicity(lower, upper)
350
+ end
351
+
352
+ # rule('quantifier' => 'OPTIONAL').as 'optional'
353
+ def reduce_optional(_production, _range, _tokens, _children)
354
+ multiplicity(0, 1)
355
+ end
356
+
357
+ # rule('quantifier' => %w[ONCE OR MORE]).as 'once_or_more'
358
+ def reduce_once_or_more(_production, _range, _tokens, _children)
359
+ multiplicity(1, :more)
360
+ end
361
+
362
+ # rule('quantifier' => %w[NEVER OR MORE]).as 'never_or_more'
363
+ def reduce_never_or_more(_production, _range, _tokens, _children)
364
+ multiplicity(0, :more)
365
+ end
366
+
367
+ # rule('quantifier' => %w[AT LEAST count TIMES]).as 'at_least'
368
+ def reduce_at_least(_production, _range, _tokens, theChildren)
369
+ count = theChildren[2].token.lexeme.to_i
370
+ multiplicity(count, :more)
371
+ end
372
+
373
+ # rule('times_suffix' => 'TIMES').as 'times_keyword'
374
+ def reduce_times_keyword(_production, _range, _tokens, _children)
375
+ return nil
376
+ end
377
+
378
+ # rule('times_suffix' => []).as 'times_dropped'
379
+ def reduce_times_dropped(_production, _range, _tokens, _children)
380
+ return nil
381
+ end
382
+ end # class
383
+ end # module
384
+ # End of file
@@ -0,0 +1,106 @@
1
+ # Grammar for SRL (Simple Regex Language)
2
+ require 'rley' # Load the gem
3
+ module SrlRuby
4
+ ########################################
5
+ # Work in progress.
6
+ # This is a very partial grammar of SRL.
7
+ # It will be expanded with the coming versions of Rley
8
+ builder = Rley::Syntax::GrammarBuilder.new do
9
+ add_terminals('LPAREN', 'RPAREN', 'COMMA')
10
+ add_terminals('DIGIT_LIT', 'INTEGER', 'LETTER_LIT')
11
+ add_terminals('LITERALLY', 'STRING_LIT')
12
+ add_terminals('BEGIN', 'STARTS', 'WITH')
13
+ add_terminals('MUST', 'END')
14
+ add_terminals('UPPERCASE', 'LETTER', 'FROM', 'TO')
15
+ add_terminals('DIGIT', 'NUMBER', 'ANY', 'NO')
16
+ add_terminals('CHARACTER', 'WHITESPACE', 'ANYTHING')
17
+ add_terminals('TAB', 'BACKSLASH', 'NEW', 'LINE')
18
+ add_terminals('OF', 'ONE')
19
+ add_terminals('EXACTLY', 'TIMES', 'ONCE', 'TWICE')
20
+ add_terminals('BETWEEN', 'AND', 'OPTIONAL', 'OR')
21
+ add_terminals('MORE', 'NEVER', 'AT', 'LEAST')
22
+ add_terminals('IF', 'FOLLOWED', 'BY', 'NOT')
23
+ add_terminals('ALREADY', 'HAD')
24
+ add_terminals('CAPTURE', 'AS', 'UNTIL')
25
+ add_terminals('CASE', 'INSENSITIVE', 'MULTI', 'ALL')
26
+ add_terminals('LAZY')
27
+
28
+ rule('srl' => 'expression').as 'start_rule'
29
+ rule('expression' => %w[pattern separator flags]).as 'flagged_expr'
30
+ rule('expression' => 'pattern').as 'simple_expr'
31
+ rule('pattern' => %w[pattern separator quantifiable]).as 'pattern_sequence'
32
+ rule('pattern' => 'quantifiable').as 'basic_pattern'
33
+ rule('separator' => 'COMMA').as 'comma_separator'
34
+ rule('separator' => []).as 'void_separator'
35
+ rule('flags' => %w[flags separator single_flag]).as 'flag_sequence'
36
+ rule('single_flag' => %w[CASE INSENSITIVE]).as 'case_insensitive'
37
+ rule('single_flag' => %w[MULTI LINE]).as 'multi_line'
38
+ rule('single_flag' => %w[ALL LAZY]).as 'all_lazy'
39
+ rule('quantifiable' => %w[begin_anchor anchorable end_anchor]).as 'pinned_quantifiable'
40
+ rule('quantifiable' => %w[begin_anchor anchorable]).as 'begin_anchor_quantifiable'
41
+ rule('quantifiable' => %w[anchorable end_anchor]).as 'end_anchor_quantifiable'
42
+ rule('quantifiable' => 'anchorable').as 'simple_quantifiable'
43
+ rule('begin_anchor' => %w[STARTS WITH]).as 'starts_with'
44
+ rule('begin_anchor' => %w[BEGIN WITH]).as 'begin_with'
45
+ rule('end_anchor' => %w[MUST END]).as 'end_anchor'
46
+ rule('anchorable' => 'assertable').as 'simple_anchorable'
47
+ rule('anchorable' => %w[assertable assertion]).as 'asserted_anchorable'
48
+ rule('assertion' => %w[IF FOLLOWED BY assertable]).as 'if_followed'
49
+ rule('assertion' => %w[IF NOT FOLLOWED BY assertable]).as 'if_not_followed'
50
+ rule('assertion' => %w[IF ALREADY HAD assertable]).as 'if_had'
51
+ rule('assertion' => %w[IF NOT ALREADY HAD assertable]).as 'if_not_had'
52
+ rule('assertable' => 'term').as 'simple_assertable'
53
+ rule('assertable' => %w[term quantifier]).as 'quantified_assertable'
54
+ rule('term' => 'atom').as 'atom_term'
55
+ rule('term' => 'alternation').as 'alternation_term'
56
+ rule('term' => 'grouping').as 'grouping_term'
57
+ rule('term' => 'capturing_group').as 'capturing_group_atom'
58
+ rule('atom' => 'letter_range').as 'letter_range_atom'
59
+ rule('atom' => 'digit_range').as 'digit_range_atom'
60
+ rule('atom' => 'character_class').as 'character_class_atom'
61
+ rule('atom' => 'special_char').as 'special_char_atom'
62
+ rule('atom' => 'literal').as 'literal_atom'
63
+ rule('letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'lowercase_from_to'
64
+ rule('letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'uppercase_from_to'
65
+ rule('letter_range' => 'LETTER').as 'any_lowercase'
66
+ rule('letter_range' => %w[UPPERCASE LETTER]).as 'any_uppercase'
67
+ rule('digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]).as 'digits_from_to'
68
+ rule('digit_range' => 'digit_or_number').as 'simple_digit_range'
69
+ rule('character_class' => %w[ANY CHARACTER]).as 'any_character'
70
+ rule('character_class' => %w[NO CHARACTER]).as 'no_character'
71
+ rule('character_class' => 'WHITESPACE').as 'whitespace'
72
+ rule('character_class' => %w[NO WHITESPACE]).as 'no_whitespace'
73
+ rule('character_class' => 'ANYTHING').as 'anything'
74
+ rule('character_class' => %w[ONE OF STRING_LIT]).as 'one_of'
75
+ rule('special_char' => 'TAB').as 'tab'
76
+ rule('special_char' => 'BACKSLASH').as 'backslash'
77
+ rule('special_char' => %w[NEW LINE]).as 'new_line'
78
+ rule('literal' => %w[LITERALLY STRING_LIT]).as 'literally'
79
+ rule('alternation' => %w[ANY OF LPAREN alternatives RPAREN]).as 'any_of'
80
+ rule('alternatives' => %w[alternatives separator quantifiable]).as 'alternative_list'
81
+ rule('alternatives' => 'quantifiable').as 'simple_alternative'
82
+ rule('grouping' => %w[LPAREN pattern RPAREN]).as 'grouping_parenthenses'
83
+ rule('capturing_group' => %w[CAPTURE assertable]).as 'capture'
84
+ rule('capturing_group' => %w[CAPTURE assertable UNTIL assertable]).as 'capture_until'
85
+ rule('capturing_group' => %w[CAPTURE assertable AS var_name]).as 'named_capture'
86
+ rule('capturing_group' => %w[CAPTURE assertable AS var_name UNTIL assertable]).as 'named_capture_until'
87
+ rule('var_name' => 'STRING_LIT').as 'var_name'
88
+ rule('quantifier' => 'ONCE').as 'once'
89
+ rule('quantifier' => 'TWICE').as 'twice'
90
+ rule('quantifier' => %w[EXACTLY count TIMES]).as 'exactly'
91
+ rule('quantifier' => %w[BETWEEN count AND count times_suffix]).as 'between_and'
92
+ rule('quantifier' => 'OPTIONAL').as 'optional'
93
+ rule('quantifier' => %w[ONCE OR MORE]).as 'once_or_more'
94
+ rule('quantifier' => %w[NEVER OR MORE]).as 'never_or_more'
95
+ rule('quantifier' => %w[AT LEAST count TIMES]).as 'at_least'
96
+ rule('digit_or_number' => 'DIGIT').as 'digit_keyword'
97
+ rule('digit_or_number' => 'NUMBER').as 'number_keyword'
98
+ rule('count' => 'DIGIT_LIT').as 'single_digit'
99
+ rule('count' => 'INTEGER').as 'integer_count'
100
+ rule('times_suffix' => 'TIMES').as 'times_keyword'
101
+ rule('times_suffix' => []).as 'times_dropped'
102
+ end
103
+
104
+ # And now build the grammar and make it accessible via a global constant
105
+ Grammar = builder.grammar
106
+ end # module
@@ -0,0 +1,13 @@
1
+ require_relative '../regex/character'
2
+ require_relative '../regex/char_range'
3
+ require_relative '../regex/concatenation'
4
+ require_relative '../regex/multiplicity'
5
+ require_relative '../regex/repetition'
6
+ require_relative '../regex/char_class'
7
+ require_relative '../regex/char_shorthand'
8
+ require_relative '../regex/wildcard'
9
+ require_relative '../regex/alternation'
10
+ require_relative '../regex/non_capturing_group'
11
+ require_relative '../regex/anchor'
12
+ require_relative '../regex/lookaround'
13
+ require_relative '../regex/capturing_group'
@@ -0,0 +1,147 @@
1
+ # File: srl_tokenizer.rb
2
+ # Tokenizer for SRL (Simple Regex Language)
3
+ require 'strscan'
4
+ require 'rley' # Load the Rley gem
5
+
6
+ module SrlRuby
7
+ # The tokenizer should recognize:
8
+ # Keywords: as, capture, letter
9
+ # Integer literals including single digit
10
+ # String literals (quote delimited)
11
+ # Single character literal
12
+ # Delimiters: parentheses '(' and ')'
13
+ # Separators: comma (optional)
14
+ class Tokenizer
15
+ attr_reader(:scanner)
16
+ attr_reader(:lineno)
17
+ attr_reader(:line_start)
18
+
19
+ @@lexeme2name = {
20
+ '(' => 'LPAREN',
21
+ ')' => 'RPAREN',
22
+ ',' => 'COMMA'
23
+ }.freeze
24
+
25
+ # Here are all the SRL keywords (in uppercase)
26
+ @@keywords = %w[
27
+ ALL
28
+ ALREADY
29
+ AND
30
+ ANY
31
+ ANYTHING
32
+ AS
33
+ AT
34
+ BACKSLASH
35
+ BEGIN
36
+ BETWEEN
37
+ BY
38
+ CAPTURE
39
+ CASE
40
+ CHARACTER
41
+ DIGIT
42
+ END
43
+ EXACTLY
44
+ FOLLOWED
45
+ FROM
46
+ HAD
47
+ IF
48
+ INSENSITIVE
49
+ LAZY
50
+ LEAST
51
+ LETTER
52
+ LINE
53
+ LITERALLY
54
+ MORE
55
+ MULTI
56
+ MUST
57
+ NEVER
58
+ NEW
59
+ NO
60
+ NOT
61
+ NUMBER
62
+ OF
63
+ ONCE
64
+ ONE
65
+ OPTIONAL
66
+ OR
67
+ STARTS
68
+ TAB
69
+ TIMES
70
+ TO
71
+ TWICE
72
+ UNTIL
73
+ UPPERCASE
74
+ WHITESPACE
75
+ WITH
76
+ ].map { |x| [x, x] }.to_h
77
+
78
+ class ScanError < StandardError; end
79
+
80
+ def initialize(source)
81
+ @scanner = StringScanner.new(source)
82
+ @lineno = 1
83
+ end
84
+
85
+ def tokens()
86
+ tok_sequence = []
87
+ until @scanner.eos?
88
+ token = _next_token
89
+ tok_sequence << token unless token.nil?
90
+ end
91
+
92
+ return tok_sequence
93
+ end
94
+
95
+ private
96
+
97
+ def _next_token()
98
+ skip_whitespaces
99
+ curr_ch = scanner.peek(1)
100
+ return nil if curr_ch.nil? || curr_ch.empty?
101
+
102
+ token = nil
103
+
104
+ if '(),'.include? curr_ch
105
+ # Delimiters, separators => single character token
106
+ token = build_token(@@lexeme2name[curr_ch], scanner.getch)
107
+ elsif (lexeme = scanner.scan(/[0-9]{2,}/))
108
+ token = build_token('INTEGER', lexeme) # An integer has 2..* digits
109
+ elsif (lexeme = scanner.scan(/[0-9]/))
110
+ token = build_token('DIGIT_LIT', lexeme)
111
+ elsif (lexeme = scanner.scan(/[a-zA-Z]{2,}/))
112
+ token = build_token(@@keywords[lexeme.upcase], lexeme)
113
+ # TODO: handle case unknown identifier
114
+ elsif (lexeme = scanner.scan(/[a-zA-Z]((?=\s)|$)/))
115
+ token = build_token('LETTER_LIT', lexeme)
116
+ elsif (lexeme = scanner.scan(/"([^"]|\\")*"/)) # Double quotes literal?
117
+ unquoted = lexeme.gsub(/(^")|("$)/, '')
118
+ token = build_token('STRING_LIT', unquoted)
119
+ elsif (lexeme = scanner.scan(/'([^']|\\')*'/)) # Single quotes literal?
120
+ unquoted = lexeme.gsub(/(^')|('$)/, '')
121
+ token = build_token('STRING_LIT', unquoted)
122
+ else # Unknown token
123
+ erroneous = curr_ch.nil? ? '' : curr_ch
124
+ sequel = scanner.scan(/.{1,20}/)
125
+ erroneous += sequel unless sequel.nil?
126
+ raise ScanError.new("Unknown token #{erroneous}")
127
+ end
128
+
129
+ return token
130
+ end
131
+
132
+ def build_token(aSymbolName, aLexeme)
133
+ begin
134
+ token = Rley::Lexical::Token.new(aLexeme, aSymbolName)
135
+ rescue StandardError => ex
136
+ puts "Failing with '#{aSymbolName}' and '#{aLexeme}'"
137
+ raise ex
138
+ end
139
+
140
+ return token
141
+ end
142
+
143
+ def skip_whitespaces()
144
+ scanner.scan(/[ \t\f\n\r]+/)
145
+ end
146
+ end # class
147
+ end # module
@@ -0,0 +1,3 @@
1
+ module SrlRuby
2
+ VERSION = '0.0.1'.freeze
3
+ end
data/lib/srl_ruby.rb ADDED
@@ -0,0 +1,4 @@
1
+ require_relative './srl_ruby/version'
2
+
3
+ module SrlRuby # This module is used as a namespace
4
+ end