srl_ruby 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +4 -0
- data/.rubocop.yml +3 -0
- data/.yardopts +6 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +66 -0
- data/Rakefile +16 -0
- data/bin/srl_ruby +58 -0
- data/lib/regex/abstract_method.rb +35 -0
- data/lib/regex/alternation.rb +27 -0
- data/lib/regex/anchor.rb +45 -0
- data/lib/regex/atomic_expression.rb +16 -0
- data/lib/regex/capturing_group.rb +51 -0
- data/lib/regex/char_class.rb +38 -0
- data/lib/regex/char_range.rb +51 -0
- data/lib/regex/char_shorthand.rb +50 -0
- data/lib/regex/character.rb +204 -0
- data/lib/regex/compound_expression.rb +57 -0
- data/lib/regex/concatenation.rb +29 -0
- data/lib/regex/expression.rb +60 -0
- data/lib/regex/lookaround.rb +50 -0
- data/lib/regex/match_option.rb +34 -0
- data/lib/regex/monadic_expression.rb +28 -0
- data/lib/regex/multiplicity.rb +91 -0
- data/lib/regex/non_capturing_group.rb +27 -0
- data/lib/regex/polyadic_expression.rb +60 -0
- data/lib/regex/quantifiable.rb +22 -0
- data/lib/regex/repetition.rb +29 -0
- data/lib/regex/wildcard.rb +23 -0
- data/lib/srl_ruby/ast_builder.rb +384 -0
- data/lib/srl_ruby/grammar.rb +106 -0
- data/lib/srl_ruby/regex_repr.rb +13 -0
- data/lib/srl_ruby/tokenizer.rb +147 -0
- data/lib/srl_ruby/version.rb +3 -0
- data/lib/srl_ruby.rb +4 -0
- data/spec/integration_spec.rb +451 -0
- data/spec/regex/character_spec.rb +166 -0
- data/spec/regex/multiplicity_spec.rb +79 -0
- data/spec/spec_helper.rb +16 -0
- data/spec/srl_ruby/srl_ruby_spec.rb +7 -0
- data/spec/srl_ruby/tokenizer_spec.rb +147 -0
- data/srl_ruby.gemspec +58 -0
- metadata +150 -0
@@ -0,0 +1,23 @@
# File: wildcard.rb

require_relative 'atomic_expression' # Access the superclass

# Namespace module for all regex-representation node classes.
module Regex
  # A wildcard matches any single character (the newline excepted).
  class Wildcard < AtomicExpression
    # Constructor.
    def initialize
      super
    end

    protected

    # Conversion method re-definition.
    # @return [String] the regex text of a wildcard, i.e. the dot.
    def text_repr
      '.'
    end
  end # class
end # module

# End of file
@@ -0,0 +1,384 @@
require 'stringio'
require_relative 'regex_repr'

module SrlRuby
  # The purpose of a ASTBuilder is to build piece by piece an AST
  # (Abstract Syntax Tree) from a sequence of input tokens and
  # visit events produced by walking over a GFGParsing object.
  # Uses the Builder GoF pattern.
  # The Builder pattern creates a complex object
  # (say, a parse tree) from simpler objects (terminal and non-terminal
  # nodes) and using a step by step approach.
  class ASTBuilder < Rley::ParseRep::ASTBaseBuilder
    # No specialized node class per terminal symbol: every terminal is
    # turned into a plain TerminalNode by new_leaf_node below.
    Terminal2NodeClass = {}.freeze

    # @return [Array, nil] the match options (regex flags) captured while
    #   reducing a 'flagged_expr'; nil when the expression carried no flags.
    attr_reader :options

    protected

    # Overriding method.
    # @return [Hash] the (empty) terminal-to-node-class mapping.
    def terminal2node()
      Terminal2NodeClass
    end

    # Overriding method.
    # Factory method for creating a node object for the given
    # input token.
    # @param aTokenPosition [Integer] Position of token in the input stream
    # @param aToken [Token] The input token
    # @return [Rley::PTree::TerminalNode]
    def new_leaf_node(_production, _terminal, aTokenPosition, aToken)
      node = Rley::PTree::TerminalNode.new(aToken, aTokenPosition)

      return node
    end

    # Factory method for a greedy multiplicity (quantifier bounds).
    # @param lowerBound [Integer] minimum number of repetitions
    # @param upperBound [Integer, Symbol] maximum count, or :more for unbounded
    def multiplicity(lowerBound, upperBound)
      # FIX: was `SRL::Regex::Multiplicity`, but no SRL module is defined in
      # this gem: Multiplicity lives in the top-level Regex module (see
      # regex/wildcard.rb and every other Regex:: reference in this class),
      # so quantifier reductions raised NameError.
      return Regex::Multiplicity.new(lowerBound, upperBound, :greedy)
    end

    # Turn a string literal into its regex AST: a single Character, or a
    # Concatenation of Characters, escaping regex metacharacters on request.
    # @param aString [String] the literal text
    # @param to_escape [Boolean] when true, prefix metacharacters with '\'
    def string_literal(aString, to_escape = true)
      if aString.size > 1
        chars = []
        aString.each_char do |ch|
          if to_escape && Regex::Character::MetaChars.include?(ch)
            chars << Regex::Character.new("\\")
          end
          chars << Regex::Character.new(ch)
        end
        result = Regex::Concatenation.new(*chars)
      elsif to_escape && Regex::Character::MetaChars.include?(aString)
        backslash = Regex::Character.new("\\")
        a_string = Regex::Character.new(aString)
        result = Regex::Concatenation.new(backslash, a_string)
      else
        result = Regex::Character.new(aString)
      end

      return result
    end

    # Factory method for a character range (e.g. a-z).
    # @param lowerBound [String] first character of the range
    # @param upperBound [String] last character of the range
    def char_range(lowerBound, upperBound)
      # TODO fix module nesting
      lower = Regex::Character.new(lowerBound)
      upper = Regex::Character.new(upperBound)
      return Regex::CharRange.new(lower, upper)
    end

    # Factory method for a (possibly negated) character class.
    def char_class(toNegate, *theChildren)
      Regex::CharClass.new(toNegate, *theChildren)
    end

    # Factory method for a character-class shorthand (\d, \w, \s, ...).
    def char_shorthand(shortName)
      Regex::CharShorthand.new(shortName)
    end

    # Factory method for the wildcard (dot) expression.
    def wildcard()
      Regex::Wildcard.new
    end

    # Factory method wrapping an expression with a quantifier.
    def repetition(expressionToRepeat, aMultiplicity)
      return Regex::Repetition.new(expressionToRepeat, aMultiplicity)
    end

    # Factory method for the start-of-line anchor.
    def begin_anchor
      return Regex::Anchor.new('^')
    end

    # rule('expression' => %w[pattern separator flags]).as 'flagged_expr'
    def reduce_flagged_expr(_production, aRange, theTokens, theChildren)
      @options = theChildren[2] if theChildren[2]
      return_first_child(aRange, theTokens, theChildren)
    end

    # rule('pattern' => %w[pattern separator quantifiable]).as 'pattern_sequence'
    def reduce_pattern_sequence(_production, _range, _tokens, theChildren)
      return Regex::Concatenation.new(theChildren[0], theChildren[2])
    end

    # rule('flags' => %w[flags separator single_flag]).as 'flag_sequence'
    def reduce_flag_sequence(_production, _range, _tokens, theChildren)
      theChildren[0] << theChildren[2]
    end

    # rule('single_flag' => %w[CASE INSENSITIVE]).as 'case_insensitive'
    def reduce_case_insensitive(_production, _range, _tokens, _children)
      return [Regex::MatchOption.new(:IGNORECASE, true)]
    end

    # rule('single_flag' => %w[MULTI LINE]).as 'multi_line'
    def reduce_multi_line(_production, _range, _tokens, _children)
      return [Regex::MatchOption.new(:MULTILINE, true)]
    end

    # rule('single_flag' => %w[ALL LAZY]).as 'all_lazy'
    def reduce_all_lazy(_production, _range, _tokens, _children)
      return [Regex::MatchOption.new(:ALL_LAZY, true)]
    end

    # rule 'quantifiable' => %w[begin_anchor anchorable end_anchor]
    def reduce_pinned_quantifiable(_production, _range, _tokens, theChildren)
      theChildren[1].begin_anchor = theChildren[0]
      theChildren[1].end_anchor = theChildren[2]
      return theChildren[1]
    end

    # rule 'quantifiable' => %w[begin_anchor anchorable]
    def reduce_begin_anchor_quantifiable(_production, _range, _tokens, theChildren)
      theChildren[1].begin_anchor = theChildren[0]
      return theChildren[1]
    end

    # rule 'quantifiable' => %w[anchorable end_anchor]
    def reduce_end_anchor_quantifiable(_production, _range, _tokens, theChildren)
      theChildren[0].end_anchor = theChildren[1]
      return theChildren[0]
    end

    # rule 'begin_anchor' => %w[STARTS WITH]
    def reduce_starts_with(_production, _range, _tokens, _children)
      begin_anchor
    end

    # rule 'begin_anchor' => %w[BEGIN WITH]
    def reduce_begin_with(_production, _range, _tokens, _children)
      begin_anchor
    end

    # rule 'end_anchor' => %w[MUST END].as 'end_anchor'
    def reduce_end_anchor(_production, _range, _tokens, _children)
      return Regex::Anchor.new('$')
    end

    # rule('anchorable' => %w[assertable assertion]).as 'asserted_anchorable'
    def reduce_asserted_anchorable(_production, _range, _tokens, theChildren)
      assertion = theChildren.last
      # Prepend the asserted expression to the lookaround's children.
      assertion.children.unshift(theChildren[0])
      return assertion
    end

    # rule('assertion' => %w[IF FOLLOWED BY assertable]).as 'if_followed'
    def reduce_if_followed(_production, _range, _tokens, theChildren)
      return Regex::Lookaround.new(theChildren.last, :ahead, :positive)
    end

    # rule('assertion' => %w[IF NOT FOLLOWED BY assertable]).as 'if_not_followed'
    def reduce_if_not_followed(_production, _range, _tokens, theChildren)
      return Regex::Lookaround.new(theChildren.last, :ahead, :negative)
    end

    # rule('assertion' => %w[IF ALREADY HAD assertable]).as 'if_had'
    def reduce_if_had(_production, _range, _tokens, theChildren)
      return Regex::Lookaround.new(theChildren.last, :behind, :positive)
    end

    # rule('assertion' => %w[IF NOT ALREADY HAD assertable]).as 'if_not_had'
    def reduce_if_not_had(_production, _range, _tokens, theChildren)
      return Regex::Lookaround.new(theChildren.last, :behind, :negative)
    end

    # rule('assertable' => %w[term quantifier]).as 'quantified_assertable'
    def reduce_quantified_assertable(_production, _range, _tokens, theChildren)
      quantifier = theChildren[1]
      term = theChildren[0]
      repetition(term, quantifier)
    end

    # rule('letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'lowercase_from_to'
    def reduce_lowercase_from_to(_production, _range, _tokens, theChildren)
      lower = theChildren[2].token.lexeme
      upper = theChildren[4].token.lexeme
      ch_range = char_range(lower, upper)
      char_class(false, ch_range)
    end

    # rule('letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'uppercase_from_to'
    def reduce_uppercase_from_to(_production, _range, _tokens, theChildren)
      lower = theChildren[3].token.lexeme
      upper = theChildren[5].token.lexeme
      ch_range = char_range(lower.upcase, upper.upcase)
      char_class(false, ch_range)
    end

    # rule('letter_range' => 'LETTER').as 'any_lowercase'
    def reduce_any_lowercase(_production, _range, _tokens, _children)
      ch_range = char_range('a', 'z')
      char_class(false, ch_range)
    end

    # rule('letter_range' => %w[UPPERCASE LETTER]).as 'any_uppercase'
    def reduce_any_uppercase(_production, _range, _tokens, _children)
      ch_range = char_range('A', 'Z')
      char_class(false, ch_range)
    end

    # rule('digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]).as 'digits_from_to'
    def reduce_digits_from_to(aProduction, aRange, theTokens, theChildren)
      # Same child layout as a lowercase letter range: delegate.
      reduce_lowercase_from_to(aProduction, aRange, theTokens, theChildren)
    end

    # rule('digit_range' => 'digit_or_number').as 'simple_digit_range'
    def reduce_simple_digit_range(_production, _range, _tokens, _children)
      char_shorthand('d')
    end

    # rule('character_class' => %w[ANY CHARACTER]).as 'any_character'
    def reduce_any_character(_production, _range, _tokens, _children)
      char_shorthand('w')
    end

    # rule('character_class' => %w[NO CHARACTER]).as 'no_character'
    def reduce_no_character(_production, _range, _tokens, _children)
      char_shorthand('W')
    end

    # rule('character_class' => 'WHITESPACE').as 'whitespace'
    def reduce_whitespace(_production, _range, _tokens, _children)
      char_shorthand('s')
    end

    # rule('character_class' => %w[NO WHITESPACE]).as 'no_whitespace'
    def reduce_no_whitespace(_production, _range, _tokens, _children)
      char_shorthand('S')
    end

    # rule('character_class' => 'ANYTHING').as 'anything'
    def reduce_anything(_production, _range, _tokens, _children)
      wildcard
    end

    # rule('character_class' => %w[ONE OF STRING_LIT]).as 'one_of'
    # (FIX: the previous comment wrongly duplicated the 'any_of' rule.)
    def reduce_one_of(_production, _range, _tokens, theChildren)
      raw_literal = theChildren[-1].token.lexeme.dup
      alternatives = raw_literal.chars.map { |ch| Regex::Character.new(ch) }
      # TODO check other implementations
      return Regex::CharClass.new(false, *alternatives)
    end

    # rule('special_char' => 'TAB').as 'tab'
    def reduce_tab(_production, _range, _tokens, _children)
      Regex::Character.new('\t')
    end

    # rule('special_char' => 'BACKSLASH').as 'backslash'
    def reduce_backslash(_production, _range, _tokens, _children)
      Regex::Character.new('\\')
    end

    # rule('special_char' => %w[NEW LINE]).as 'new_line'
    def reduce_new_line(_production, _range, _tokens, _children)
      # TODO: control portability
      Regex::Character.new('\n')
    end

    # rule('literal' => %w[LITERALLY STRING_LIT]).as 'literally'
    def reduce_literally(_production, _range, _tokens, theChildren)
      # What if literal is empty?...

      raw_literal = theChildren[-1].token.lexeme.dup
      return string_literal(raw_literal)
    end

    # rule('alternation' => %w[ANY OF LPAREN alternatives RPAREN]).as 'any_of'
    def reduce_any_of(_production, _range, _tokens, theChildren)
      return Regex::Alternation.new(*theChildren[3])
    end

    # rule('alternatives' => %w[alternatives separator quantifiable]).as 'alternative_list'
    def reduce_alternative_list(_production, _range, _tokens, theChildren)
      return theChildren[0] << theChildren[-1]
    end

    # rule('alternatives' => 'quantifiable').as 'simple_alternative'
    def reduce_simple_alternative(_production, _range, _tokens, theChildren)
      return [theChildren.last]
    end

    # rule('grouping' => %w[LPAREN pattern RPAREN]).as 'grouping_parenthenses'
    def reduce_grouping_parenthenses(_production, _range, _tokens, theChildren)
      return Regex::NonCapturingGroup.new(theChildren[1])
    end

    # rule('capturing_group' => %w[CAPTURE assertable]).as 'capture'
    def reduce_capture(_production, _range, _tokens, theChildren)
      return Regex::CapturingGroup.new(theChildren[1])
    end

    # rule('capturing_group' => %w[CAPTURE assertable UNTIL assertable]).as
    # 'capture_until'
    def reduce_capture_until(_production, _range, _tokens, theChildren)
      group = Regex::CapturingGroup.new(theChildren[1])
      return Regex::Concatenation.new(group, theChildren[3])
    end

    # rule('capturing_group' => %w[CAPTURE assertable AS var_name]).as
    # 'named_capture'
    def reduce_named_capture(_production, _range, _tokens, theChildren)
      name = theChildren[3].token.lexeme.dup
      return Regex::CapturingGroup.new(theChildren[1], name)
    end

    # rule('capturing_group' => %w[CAPTURE assertable AS var_name
    # UNTIL assertable]).as 'named_capture_until'
    def reduce_named_capture_until(_production, _range, _tokens, theChildren)
      name = theChildren[3].token.lexeme.dup
      group = Regex::CapturingGroup.new(theChildren[1], name)
      return Regex::Concatenation.new(group, theChildren[5])
    end

    # rule('quantifier' => 'ONCE').as 'once'
    def reduce_once(_production, _range, _tokens, _children)
      multiplicity(1, 1)
    end

    # rule('quantifier' => 'TWICE').as 'twice'
    def reduce_twice(_production, _range, _tokens, _children)
      multiplicity(2, 2)
    end

    # rule('quantifier' => %w[EXACTLY count TIMES]).as 'exactly'
    def reduce_exactly(_production, _range, _tokens, theChildren)
      count = theChildren[1].token.lexeme.to_i
      multiplicity(count, count)
    end

    # rule('quantifier' => %w[BETWEEN count AND count times_suffix]).as
    # 'between_and'
    def reduce_between_and(_production, _range, _tokens, theChildren)
      lower = theChildren[1].token.lexeme.to_i
      upper = theChildren[3].token.lexeme.to_i
      multiplicity(lower, upper)
    end

    # rule('quantifier' => 'OPTIONAL').as 'optional'
    def reduce_optional(_production, _range, _tokens, _children)
      multiplicity(0, 1)
    end

    # rule('quantifier' => %w[ONCE OR MORE]).as 'once_or_more'
    def reduce_once_or_more(_production, _range, _tokens, _children)
      multiplicity(1, :more)
    end

    # rule('quantifier' => %w[NEVER OR MORE]).as 'never_or_more'
    def reduce_never_or_more(_production, _range, _tokens, _children)
      multiplicity(0, :more)
    end

    # rule('quantifier' => %w[AT LEAST count TIMES]).as 'at_least'
    def reduce_at_least(_production, _range, _tokens, theChildren)
      count = theChildren[2].token.lexeme.to_i
      multiplicity(count, :more)
    end

    # rule('times_suffix' => 'TIMES').as 'times_keyword'
    def reduce_times_keyword(_production, _range, _tokens, _children)
      return nil
    end

    # rule('times_suffix' => []).as 'times_dropped'
    def reduce_times_dropped(_production, _range, _tokens, _children)
      return nil
    end
  end # class
end # module
# End of file
@@ -0,0 +1,106 @@
# Grammar for SRL (Simple Regex Language)
require 'rley' # Load the gem
module SrlRuby
  ########################################
  # Work in progress.
  # This is a very partial grammar of SRL.
  # It will be expanded with the coming versions of Rley
  builder = Rley::Syntax::GrammarBuilder.new do
    # Terminal symbols, as produced by the Tokenizer:
    # punctuation and literals first, then the SRL keywords.
    add_terminals('LPAREN', 'RPAREN', 'COMMA')
    add_terminals('DIGIT_LIT', 'INTEGER', 'LETTER_LIT')
    add_terminals('LITERALLY', 'STRING_LIT')
    add_terminals('BEGIN', 'STARTS', 'WITH')
    add_terminals('MUST', 'END')
    add_terminals('UPPERCASE', 'LETTER', 'FROM', 'TO')
    add_terminals('DIGIT', 'NUMBER', 'ANY', 'NO')
    add_terminals('CHARACTER', 'WHITESPACE', 'ANYTHING')
    add_terminals('TAB', 'BACKSLASH', 'NEW', 'LINE')
    add_terminals('OF', 'ONE')
    add_terminals('EXACTLY', 'TIMES', 'ONCE', 'TWICE')
    add_terminals('BETWEEN', 'AND', 'OPTIONAL', 'OR')
    add_terminals('MORE', 'NEVER', 'AT', 'LEAST')
    add_terminals('IF', 'FOLLOWED', 'BY', 'NOT')
    add_terminals('ALREADY', 'HAD')
    add_terminals('CAPTURE', 'AS', 'UNTIL')
    add_terminals('CASE', 'INSENSITIVE', 'MULTI', 'ALL')
    add_terminals('LAZY')

    # Production rules. The label given to `.as` selects the matching
    # reduce_* callback in ASTBuilder (e.g. 'once' -> reduce_once).
    rule('srl' => 'expression').as 'start_rule'
    rule('expression' => %w[pattern separator flags]).as 'flagged_expr'
    rule('expression' => 'pattern').as 'simple_expr'
    rule('pattern' => %w[pattern separator quantifiable]).as 'pattern_sequence'
    rule('pattern' => 'quantifiable').as 'basic_pattern'
    rule('separator' => 'COMMA').as 'comma_separator'
    rule('separator' => []).as 'void_separator'
    # NOTE(review): 'flags' has only this left-recursive production and no
    # base case (e.g. rule('flags' => 'single_flag')), so a 'flags' phrase
    # can never be derived — confirm whether flag parsing actually works.
    rule('flags' => %w[flags separator single_flag]).as 'flag_sequence'
    rule('single_flag' => %w[CASE INSENSITIVE]).as 'case_insensitive'
    rule('single_flag' => %w[MULTI LINE]).as 'multi_line'
    rule('single_flag' => %w[ALL LAZY]).as 'all_lazy'
    # Optional begin/end anchors around a quantifiable pattern.
    rule('quantifiable' => %w[begin_anchor anchorable end_anchor]).as 'pinned_quantifiable'
    rule('quantifiable' => %w[begin_anchor anchorable]).as 'begin_anchor_quantifiable'
    rule('quantifiable' => %w[anchorable end_anchor]).as 'end_anchor_quantifiable'
    rule('quantifiable' => 'anchorable').as 'simple_quantifiable'
    rule('begin_anchor' => %w[STARTS WITH]).as 'starts_with'
    rule('begin_anchor' => %w[BEGIN WITH]).as 'begin_with'
    rule('end_anchor' => %w[MUST END]).as 'end_anchor'
    # Lookaround assertions ("if followed by", "if already had", ...).
    rule('anchorable' => 'assertable').as 'simple_anchorable'
    rule('anchorable' => %w[assertable assertion]).as 'asserted_anchorable'
    rule('assertion' => %w[IF FOLLOWED BY assertable]).as 'if_followed'
    rule('assertion' => %w[IF NOT FOLLOWED BY assertable]).as 'if_not_followed'
    rule('assertion' => %w[IF ALREADY HAD assertable]).as 'if_had'
    rule('assertion' => %w[IF NOT ALREADY HAD assertable]).as 'if_not_had'
    rule('assertable' => 'term').as 'simple_assertable'
    rule('assertable' => %w[term quantifier]).as 'quantified_assertable'
    # Terms and atoms.
    rule('term' => 'atom').as 'atom_term'
    rule('term' => 'alternation').as 'alternation_term'
    rule('term' => 'grouping').as 'grouping_term'
    rule('term' => 'capturing_group').as 'capturing_group_atom'
    rule('atom' => 'letter_range').as 'letter_range_atom'
    rule('atom' => 'digit_range').as 'digit_range_atom'
    rule('atom' => 'character_class').as 'character_class_atom'
    rule('atom' => 'special_char').as 'special_char_atom'
    rule('atom' => 'literal').as 'literal_atom'
    # Letter/digit ranges and character classes.
    rule('letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'lowercase_from_to'
    rule('letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'uppercase_from_to'
    rule('letter_range' => 'LETTER').as 'any_lowercase'
    rule('letter_range' => %w[UPPERCASE LETTER]).as 'any_uppercase'
    rule('digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]).as 'digits_from_to'
    rule('digit_range' => 'digit_or_number').as 'simple_digit_range'
    rule('character_class' => %w[ANY CHARACTER]).as 'any_character'
    rule('character_class' => %w[NO CHARACTER]).as 'no_character'
    rule('character_class' => 'WHITESPACE').as 'whitespace'
    rule('character_class' => %w[NO WHITESPACE]).as 'no_whitespace'
    rule('character_class' => 'ANYTHING').as 'anything'
    rule('character_class' => %w[ONE OF STRING_LIT]).as 'one_of'
    rule('special_char' => 'TAB').as 'tab'
    rule('special_char' => 'BACKSLASH').as 'backslash'
    rule('special_char' => %w[NEW LINE]).as 'new_line'
    rule('literal' => %w[LITERALLY STRING_LIT]).as 'literally'
    # Alternations, groupings and capture groups.
    rule('alternation' => %w[ANY OF LPAREN alternatives RPAREN]).as 'any_of'
    rule('alternatives' => %w[alternatives separator quantifiable]).as 'alternative_list'
    rule('alternatives' => 'quantifiable').as 'simple_alternative'
    rule('grouping' => %w[LPAREN pattern RPAREN]).as 'grouping_parenthenses'
    rule('capturing_group' => %w[CAPTURE assertable]).as 'capture'
    rule('capturing_group' => %w[CAPTURE assertable UNTIL assertable]).as 'capture_until'
    rule('capturing_group' => %w[CAPTURE assertable AS var_name]).as 'named_capture'
    rule('capturing_group' => %w[CAPTURE assertable AS var_name UNTIL assertable]).as 'named_capture_until'
    rule('var_name' => 'STRING_LIT').as 'var_name'
    # Quantifiers.
    rule('quantifier' => 'ONCE').as 'once'
    rule('quantifier' => 'TWICE').as 'twice'
    rule('quantifier' => %w[EXACTLY count TIMES]).as 'exactly'
    rule('quantifier' => %w[BETWEEN count AND count times_suffix]).as 'between_and'
    rule('quantifier' => 'OPTIONAL').as 'optional'
    rule('quantifier' => %w[ONCE OR MORE]).as 'once_or_more'
    rule('quantifier' => %w[NEVER OR MORE]).as 'never_or_more'
    rule('quantifier' => %w[AT LEAST count TIMES]).as 'at_least'
    rule('digit_or_number' => 'DIGIT').as 'digit_keyword'
    rule('digit_or_number' => 'NUMBER').as 'number_keyword'
    rule('count' => 'DIGIT_LIT').as 'single_digit'
    rule('count' => 'INTEGER').as 'integer_count'
    rule('times_suffix' => 'TIMES').as 'times_keyword'
    rule('times_suffix' => []).as 'times_dropped'
  end

  # And now build the grammar and make it accessible via a global constant
  Grammar = builder.grammar
end # module
@@ -0,0 +1,13 @@
# File: regex_repr.rb
# Convenience loader: pulls in every regex AST node class used to
# represent a regular expression built from SRL input.
# NOTE(review): presumably the require order matters (e.g. superclasses
# defined before subclasses) — keep the order intact when editing.
require_relative '../regex/character'
require_relative '../regex/char_range'
require_relative '../regex/concatenation'
require_relative '../regex/multiplicity'
require_relative '../regex/repetition'
require_relative '../regex/char_class'
require_relative '../regex/char_shorthand'
require_relative '../regex/wildcard'
require_relative '../regex/alternation'
require_relative '../regex/non_capturing_group'
require_relative '../regex/anchor'
require_relative '../regex/lookaround'
require_relative '../regex/capturing_group'
@@ -0,0 +1,147 @@
# File: srl_tokenizer.rb
# Tokenizer for SRL (Simple Regex Language)
require 'strscan'
require 'rley' # Load the Rley gem

module SrlRuby
  # The tokenizer should recognize:
  # Keywords: as, capture, letter
  # Integer literals including single digit
  # String literals (quote delimited)
  # Single character literal
  # Delimiters: parentheses '(' and ')'
  # Separators: comma (optional)
  class Tokenizer
    # @return [StringScanner] scanner over the SRL source text.
    attr_reader(:scanner)
    # @return [Integer] line number; initialized to 1 and never updated
    #   by this class (multi-line tracking not implemented yet).
    attr_reader(:lineno)
    # NOTE(review): @line_start is never assigned in this class, so this
    # reader always returns nil — confirm intent before relying on it.
    attr_reader(:line_start)

    # Single-character lexemes mapped to their terminal symbol names.
    # (Was a class variable @@lexeme2name; a frozen constant avoids the
    # shared-across-inheritance pitfalls of class variables.)
    Lexeme2name = {
      '(' => 'LPAREN',
      ')' => 'RPAREN',
      ',' => 'COMMA'
    }.freeze

    # Here are all the SRL keywords (in uppercase).
    # Each keyword maps to itself, giving an O(1) membership/lookup table.
    Keywords = %w[
      ALL
      ALREADY
      AND
      ANY
      ANYTHING
      AS
      AT
      BACKSLASH
      BEGIN
      BETWEEN
      BY
      CAPTURE
      CASE
      CHARACTER
      DIGIT
      END
      EXACTLY
      FOLLOWED
      FROM
      HAD
      IF
      INSENSITIVE
      LAZY
      LEAST
      LETTER
      LINE
      LITERALLY
      MORE
      MULTI
      MUST
      NEVER
      NEW
      NO
      NOT
      NUMBER
      OF
      ONCE
      ONE
      OPTIONAL
      OR
      STARTS
      TAB
      TIMES
      TO
      TWICE
      UNTIL
      UPPERCASE
      WHITESPACE
      WITH
    ].map { |x| [x, x] }.to_h.freeze

    # Raised when the scanner encounters text it cannot tokenize.
    class ScanError < StandardError; end

    # @param source [String] the SRL text to tokenize.
    def initialize(source)
      @scanner = StringScanner.new(source)
      @lineno = 1
    end

    # Scan the entire source and return its token sequence.
    # @return [Array<Rley::Lexical::Token>]
    # @raise [ScanError] on unrecognized input.
    def tokens()
      tok_sequence = []
      until @scanner.eos?
        token = _next_token
        tok_sequence << token unless token.nil?
      end

      return tok_sequence
    end

    private

    # Recognize and build the next token; nil at end of input.
    def _next_token()
      skip_whitespaces
      curr_ch = scanner.peek(1)
      return nil if curr_ch.nil? || curr_ch.empty?

      token = nil

      if '(),'.include? curr_ch
        # Delimiters, separators => single character token
        token = build_token(Lexeme2name[curr_ch], scanner.getch)
      elsif (lexeme = scanner.scan(/[0-9]{2,}/))
        token = build_token('INTEGER', lexeme) # An integer has 2..* digits
      elsif (lexeme = scanner.scan(/[0-9]/))
        token = build_token('DIGIT_LIT', lexeme)
      elsif (lexeme = scanner.scan(/[a-zA-Z]{2,}/))
        # Multi-letter word: looked up as a keyword.
        token = build_token(Keywords[lexeme.upcase], lexeme)
        # TODO: handle case unknown identifier
      elsif (lexeme = scanner.scan(/[a-zA-Z]((?=\s)|$)/))
        token = build_token('LETTER_LIT', lexeme)
      elsif (lexeme = scanner.scan(/"([^"]|\\")*"/)) # Double quotes literal?
        unquoted = lexeme.gsub(/(^")|("$)/, '')
        token = build_token('STRING_LIT', unquoted)
      elsif (lexeme = scanner.scan(/'([^']|\\')*'/)) # Single quotes literal?
        unquoted = lexeme.gsub(/(^')|('$)/, '')
        token = build_token('STRING_LIT', unquoted)
      else # Unknown token
        erroneous = curr_ch.nil? ? '' : curr_ch
        sequel = scanner.scan(/.{1,20}/)
        erroneous += sequel unless sequel.nil?
        raise ScanError, "Unknown token #{erroneous}"
      end

      return token
    end

    # Wrap a lexeme into a Rley token tagged with its terminal symbol name.
    def build_token(aSymbolName, aLexeme)
      begin
        token = Rley::Lexical::Token.new(aLexeme, aSymbolName)
      rescue StandardError => e
        # FIX: the rescue previously re-raised an undefined variable `ex`,
        # which raised NameError and masked the original exception.
        puts "Failing with '#{aSymbolName}' and '#{aLexeme}'"
        raise e
      end

      return token
    end

    # Consume a run of whitespace (spaces, tabs, form feeds, newlines).
    def skip_whitespaces()
      scanner.scan(/[ \t\f\n\r]+/)
    end
  end # class
end # module
data/lib/srl_ruby.rb
ADDED