rley 0.5.10 → 0.5.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/LICENSE.txt +1 -1
- data/README.md +2 -1
- data/appveyor.yml +6 -5
- data/examples/NLP/engtagger.rb +176 -0
- data/examples/general/SRL/lib/ast_builder.rb +217 -21
- data/examples/general/SRL/lib/grammar.rb +33 -5
- data/examples/general/SRL/lib/regex/alternation.rb +30 -0
- data/examples/general/SRL/lib/regex/char_class.rb +28 -22
- data/examples/general/SRL/lib/regex/char_shorthand.rb +50 -0
- data/examples/general/SRL/lib/regex/character.rb +5 -3
- data/examples/general/SRL/lib/regex/concatenation.rb +32 -0
- data/examples/general/SRL/lib/regex/non_capturing_group.rb +29 -0
- data/examples/general/SRL/lib/regex/wildcard.rb +26 -0
- data/examples/general/SRL/lib/regex_repr.rb +5 -0
- data/examples/general/SRL/lib/tokenizer.rb +28 -3
- data/examples/general/SRL/spec/integration_spec.rb +151 -8
- data/examples/general/SRL/spec/tokenizer_spec.rb +12 -0
- data/examples/general/left.rb +36 -0
- data/examples/general/right.rb +36 -0
- data/lib/rley/constants.rb +1 -1
- data/lib/rley/gfg/edge.rb +12 -1
- data/lib/rley/gfg/grm_flow_graph.rb +21 -1
- data/lib/rley/gfg/item_vertex.rb +1 -1
- data/lib/rley/gfg/non_terminal_vertex.rb +1 -1
- data/lib/rley/gfg/start_vertex.rb +1 -0
- data/lib/rley/gfg/vertex.rb +27 -0
- data/lib/rley/lexical/token.rb +1 -0
- data/lib/rley/parser/error_reason.rb +2 -1
- data/lib/rley/parser/gfg_chart.rb +14 -0
- data/lib/rley/parser/gfg_earley_parser.rb +0 -1
- data/lib/rley/parser/gfg_parsing.rb +4 -3
- data/lib/rley/parser/parse_entry.rb +33 -3
- data/lib/rley/parser/parse_entry_set.rb +14 -2
- data/lib/rley/parser/parse_tree_builder.rb +1 -1
- data/lib/rley/parser/parse_walker_factory.rb +0 -1
- data/lib/rley/syntax/grm_symbol.rb +2 -0
- data/lib/rley/syntax/production.rb +15 -3
- data/lib/rley/syntax/symbol_seq.rb +16 -1
- data/spec/rley/gfg/end_vertex_spec.rb +9 -1
- data/spec/rley/gfg/grm_flow_graph_spec.rb +9 -0
- data/spec/rley/gfg/item_vertex_spec.rb +9 -0
- data/spec/rley/gfg/start_vertex_spec.rb +9 -1
- data/spec/rley/parser/gfg_parsing_spec.rb +0 -1
- data/spec/rley/parser/parse_entry_set_spec.rb +15 -0
- data/spec/rley/parser/parse_entry_spec.rb +24 -13
- data/spec/rley/parser/parse_tracer_spec.rb +1 -1
- data/spec/rley/syntax/production_spec.rb +10 -0
- data/spec/rley/syntax/symbol_seq_spec.rb +5 -0
- metadata +10 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0ec06612f299302b861fbaeb04b75c0040a026cf
|
4
|
+
data.tar.gz: d68438efcbacceb2ae4319ac268492e93db35265
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ec3be765a424028c986ea4812cf6f1485f04285beb2b9d8fffc774fc0b61108d4d6758a09a648132562752ab25904fb38f8ee57ecff90d0a70bca253150ed130
|
7
|
+
data.tar.gz: 2463def65eecbefed2bbfffc61e63e88dca2d0498078e83bc742811e540718e95e75f3896fa31b5bdc9068f5420906f389615470a86831dbcb5025824645775d
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,18 @@
|
|
1
|
+
### 0.5.11 / 2018-01-25
|
2
|
+
* [NEW] File `left.rb` added in `examples/general` folder for showing use of left-recursive rules.
|
3
|
+
* [NEW] File `right.rb` added in `examples/general` folder for showing use of right-recursive rules (less performant).
|
4
|
+
* [NEW] File `examples/general/SRL/lib/regex/alternation.rb` Added support for alternation in regular expressions (|).
|
5
|
+
* [NEW] File `examples/general/SRL/lib/regex/character.rb` Added support for single character in regular expressions.
|
6
|
+
* [NEW] File `examples/general/SRL/lib/regex/char_class.rb` Added support for character class in regular expressions.
|
7
|
+
* [NEW] File `examples/general/SRL/lib/regex/char_shorthand.rb` Added support for character class shorthand in regular expressions.
|
8
|
+
* [NEW] File `examples/general/SRL/lib/regex/concatenation.rb` Added support for concatenation in regular expressions.
|
9
|
+
* [NEW] File `examples/general/SRL/lib/regex/non_capturing_group.rb` Added support for non-capturing groups in regular expressions.
|
10
|
+
* [NEW] File `examples/general/SRL/lib/regex/wildcard.rb` Added support for wildcards in regular expressions.
|
11
|
+
* [CHANGE] File `examples/general/SRL/lib/grammar.rb` increased coverage of Simple Regex Language parser.
|
12
|
+
* [CHANGE] File `examples/general/SRL/lib/ast_builder.rb` Added transformation rules for constructing regular expressions.
|
13
|
+
* [CHANGE] File `examples/general/SRL/spec/integration_spec.rb` Added tests for SRL expressions.
|
14
|
+
* [FIX] Added a custom `inspect` method to several core classes. This was necessary because the default implementation from Ruby got lost with object graphs.
|
15
|
+
|
1
16
|
### 0.5.10 / 2017-12-02
|
2
17
|
* [CHANGE] Dir `examples/general/SRL/` Added support for digit range to Simple Regex Language parser.
|
3
18
|
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -440,6 +440,7 @@ actively curated by Andrei Beliankou (aka arbox).
|
|
440
440
|
|
441
441
|
## Thanks to:
|
442
442
|
* Professor Keshav Pingali, one of the creators of the Grammar Flow Graph parsing approach for his encouraging e-mail exchange.
|
443
|
+
* [Arjun Menon](https://github.com/arjunmenon) for his NLP example that uses `engtagger` gem.
|
443
444
|
|
444
445
|
## Grammar Flow Graph
|
445
446
|
Since the Grammar Flow Graph parsing approach is quite new, it has not yet taken a place in
|
@@ -452,5 +453,5 @@ standard parser textbooks. Here are a few references (and links) of papers on GF
|
|
452
453
|
|
453
454
|
Copyright
|
454
455
|
---------
|
455
|
-
Copyright (c) 2014-
|
456
|
+
Copyright (c) 2014-2018, Dimitri Geshef.
|
456
457
|
__Rley__ is released under the MIT License see [LICENSE.txt](https://github.com/famished-tiger/Rley/blob/master/LICENSE.txt) for details.
|
data/appveyor.yml
CHANGED
@@ -11,12 +11,13 @@ environment:
|
|
11
11
|
- Ruby_version: 23
|
12
12
|
- Ruby_version: 23-x64
|
13
13
|
- Ruby_version: 24
|
14
|
-
- Ruby_version: 24-x64
|
14
|
+
- Ruby_version: 24-x64
|
15
|
+
|
15
16
|
install:
|
16
|
-
-
|
17
|
-
|
17
|
+
- set PATH=C:\Ruby%Ruby_version%\bin;%PATH%
|
18
|
+
- bundle install --retry=3 --clean --force
|
18
19
|
|
19
|
-
bundle install --retry=3 --clean
|
20
20
|
build: off
|
21
|
+
|
21
22
|
test_script:
|
22
|
-
-
|
23
|
+
- bundle exec rake
|
@@ -0,0 +1,176 @@
|
|
1
|
+
require "rley"
|
2
|
+
require "engtagger"
|
3
|
+
require "pp"
|
4
|
+
|
5
|
+
# REGEX to remove XML tags from Engtagger output
|
6
|
+
GET_TAG = /<(.+?)>(.*?)<.+?>/
|
7
|
+
|
8
|
+
# Text tokenizer
|
9
|
+
# Taken directly from Engtagger, will ensure uniform indexing while parsing
|
10
|
+
def clean_text(text)
|
11
|
+
return false unless valid_text(text)
|
12
|
+
text = text.toutf8
|
13
|
+
cleaned_text = text
|
14
|
+
tokenized = []
|
15
|
+
# Tokenize the text (splitting on punctuation as you go)
|
16
|
+
cleaned_text.split(/\s+/).each do |line|
|
17
|
+
tokenized += split_punct(line)
|
18
|
+
end
|
19
|
+
words = split_sentences(tokenized)
|
20
|
+
return words
|
21
|
+
end
|
22
|
+
|
23
|
+
def valid_text(text)
|
24
|
+
if !text
|
25
|
+
# there's nothing to parse
|
26
|
+
"method call on uninitialized variable" if @conf[:debug]
|
27
|
+
return false
|
28
|
+
elsif /\A\s*\z/ =~ text
|
29
|
+
# text is an empty string, nothing to parse
|
30
|
+
return false
|
31
|
+
else
|
32
|
+
# $text is valid
|
33
|
+
return true
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def split_sentences(array)
|
38
|
+
tokenized = array
|
39
|
+
people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
|
40
|
+
supt det mssrs rev)
|
41
|
+
army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
|
42
|
+
inst = %w(dept univ assn bros ph.d)
|
43
|
+
place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
|
44
|
+
hwy hway la pde pd plz pl rd st tce)
|
45
|
+
comp = %w(mfg inc ltd co corp)
|
46
|
+
state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
|
47
|
+
ind ia kans kan ken ky la me md is mass mich minn miss mo mont
|
48
|
+
neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
|
49
|
+
va wash wis wisc wy wyo usafa alta man ont que sask yuk)
|
50
|
+
month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
|
51
|
+
misc = %w(vs etc no esp)
|
52
|
+
abbr = Hash.new
|
53
|
+
[people, army, inst, place, comp, state, month, misc].flatten.each do |i|
|
54
|
+
abbr[i] = true
|
55
|
+
end
|
56
|
+
words = Array.new
|
57
|
+
tokenized.each_with_index do |t, i|
|
58
|
+
if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
|
59
|
+
w = $1
|
60
|
+
# Don't separate the period off words that
|
61
|
+
# meet any of the following conditions:
|
62
|
+
#
|
63
|
+
# 1. It is defined in one of the lists above
|
64
|
+
# 2. It is only one letter long: Alfred E. Sloan
|
65
|
+
# 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
|
66
|
+
unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
|
67
|
+
words << w
|
68
|
+
words << '.'
|
69
|
+
next
|
70
|
+
end
|
71
|
+
end
|
72
|
+
words << tokenized[i]
|
73
|
+
end
|
74
|
+
# If the final word ends in a period..
|
75
|
+
if words[-1] and words[-1] =~ /\A(.*\w)\.\z/
|
76
|
+
words[-1] = $1
|
77
|
+
words.push '.'
|
78
|
+
end
|
79
|
+
return words
|
80
|
+
end
|
81
|
+
|
82
|
+
# Separate punctuation from words, where appropriate. This leaves trailing
|
83
|
+
# periods in place to be dealt with later. Called by the clean_text method.
|
84
|
+
def split_punct(text)
|
85
|
+
# If there's no punctuation, return immediately
|
86
|
+
return [text] if /\A\w+\z/ =~ text
|
87
|
+
# Sanity checks
|
88
|
+
text = text.gsub(/\W{10,}/o, " ")
|
89
|
+
|
90
|
+
# Put quotes into a standard format
|
91
|
+
text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
|
92
|
+
text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
|
93
|
+
text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
|
94
|
+
text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
|
95
|
+
text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
|
96
|
+
|
97
|
+
# Handle all other punctuation
|
98
|
+
text = text.gsub(/--+/o, " - ") # Convert and separate dashes
|
99
|
+
text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
|
100
|
+
text = text.gsub(/:/o, " :") # Shift semicolons off
|
101
|
+
text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
|
102
|
+
text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
|
103
|
+
text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
|
104
|
+
|
105
|
+
# English-specific contractions
|
106
|
+
text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
|
107
|
+
text = text.gsub(/n't\b/o, " n't") # Separate off n't
|
108
|
+
text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
|
109
|
+
result = text.split(' ')
|
110
|
+
return result
|
111
|
+
end
|
112
|
+
|
113
|
+
|
114
|
+
# Instantiate a builder object that will build the grammar for us
|
115
|
+
builder = Rley::Syntax::GrammarBuilder.new do
|
116
|
+
|
117
|
+
add_terminals('NN', 'NNP')
|
118
|
+
add_terminals('DET', 'IN', 'VBD')
|
119
|
+
|
120
|
+
# Here we define the productions (= grammar rules)
|
121
|
+
rule 'S' => %w[NP VP]
|
122
|
+
rule 'NP' => 'NNP'
|
123
|
+
rule 'NP' => %w[DET NN]
|
124
|
+
rule 'NP' => %w[DET NN PP]
|
125
|
+
rule 'VP' => %w[VBD NP]
|
126
|
+
rule 'VP' => %w[VBD NP PP]
|
127
|
+
rule 'PP' => %w[IN NP]
|
128
|
+
end
|
129
|
+
|
130
|
+
# And now, let's build the grammar...
|
131
|
+
grammar = builder.grammar
|
132
|
+
|
133
|
+
parser = Rley::Parser::GFGEarleyParser.new(grammar)
|
134
|
+
|
135
|
+
# text = "Yo I'm not done with you"
|
136
|
+
text= "John saw Mary with a telescope"
|
137
|
+
pp "Input text --> #{text}"
|
138
|
+
|
139
|
+
tgr = EngTagger.new
|
140
|
+
|
141
|
+
# Generate POS
|
142
|
+
tagged = tgr.add_tags(text)
|
143
|
+
|
144
|
+
# Generate tokenized lexicon of input text
|
145
|
+
# Instead of creating a lexicon dictionary, we would simply generate one each time on the fly for the current text only.
|
146
|
+
lexicon = clean_text(text)
|
147
|
+
|
148
|
+
# Generate POS tokens in [[word, pos], ..] format
|
149
|
+
tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
|
150
|
+
|
151
|
+
def tokenizer(lexicon, grammar, tokens)
|
152
|
+
rley_tokens = []
|
153
|
+
lexicon.each_with_index do |word, i|
|
154
|
+
term_name = tokens[i].last
|
155
|
+
terminal = grammar.name2symbol[term_name]
|
156
|
+
rley_tokens << Rley::Tokens::Token.new(word, terminal)
|
157
|
+
end
|
158
|
+
return rley_tokens
|
159
|
+
end
|
160
|
+
|
161
|
+
# Convert input text into a sequence of rley token objects...
|
162
|
+
rley_tokens = tokenizer(lexicon, grammar, tokens)
|
163
|
+
|
164
|
+
result = parser.parse(rley_tokens)
|
165
|
+
|
166
|
+
pp "Parsing successful? #{result.success?}" # => Parsing successful? true
|
167
|
+
pp result.failure_reason.message unless result.success?
|
168
|
+
|
169
|
+
ptree = result.parse_tree
|
170
|
+
|
171
|
+
visitor = Rley::ParseTreeVisitor.new(ptree)
|
172
|
+
|
173
|
+
renderer = Rley::Formatter::Asciitree.new($stdout)
|
174
|
+
|
175
|
+
# Subscribe the formatter to the visitor's event and launch the visit
|
176
|
+
pp renderer.render(visitor)
|
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'stringio'
|
1
2
|
require_relative 'ast_building'
|
2
3
|
require_relative 'regex_repr'
|
3
4
|
|
@@ -35,27 +36,54 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
35
36
|
# @param theChildren [Array] Children nodes (one per rhs symbol)
|
36
37
|
def new_parent_node(aProduction, aRange, theTokens, theChildren)
|
37
38
|
node = case aProduction.name
|
38
|
-
when 'srl_0' # rule 'srl' => '
|
39
|
+
when 'srl_0' # rule 'srl' => 'pattern'
|
40
|
+
return_first_child(aRange, theTokens, theChildren)
|
41
|
+
|
42
|
+
when 'pattern_0' # rule 'pattern' => %w[pattern COMMA quantifiable]
|
43
|
+
reduce_pattern_0(aProduction, aRange, theTokens, theChildren)
|
44
|
+
|
45
|
+
when 'pattern_1' # rule 'pattern' => %w[pattern quantifiable]
|
46
|
+
reduce_pattern_1(aProduction, aRange, theTokens, theChildren)
|
47
|
+
|
48
|
+
when 'pattern_2' # rule 'pattern' => 'quantifiable'
|
49
|
+
return_first_child(aRange, theTokens, theChildren)
|
50
|
+
|
51
|
+
when 'quantifiable_0' # rule 'quantifiable' => 'term'
|
39
52
|
return_first_child(aRange, theTokens, theChildren)
|
40
53
|
|
54
|
+
when 'quantifiable_1' # rule 'quantifiable' = %w[term quantifier]
|
55
|
+
reduce_quantifiable_1(aProduction, aRange, theTokens, theChildren)
|
56
|
+
|
41
57
|
when 'term_0' # rule 'term' => 'atom'
|
42
58
|
return_first_child(aRange, theTokens, theChildren)
|
43
59
|
|
44
|
-
when 'term_1' # rule 'term' =>
|
45
|
-
|
60
|
+
when 'term_1' # rule 'term' => 'alternation'
|
61
|
+
return_first_child(aRange, theTokens, theChildren)
|
62
|
+
|
63
|
+
when 'term_2' # rule 'term' => 'grouping'
|
64
|
+
return_first_child(aRange, theTokens, theChildren)
|
46
65
|
|
47
66
|
when 'atom_0' # rule 'atom' => 'letter_range'
|
48
67
|
return_first_child(aRange, theTokens, theChildren)
|
49
|
-
|
68
|
+
|
50
69
|
when 'atom_1' # rule 'atom' => 'digit_range'
|
51
70
|
return_first_child(aRange, theTokens, theChildren)
|
52
|
-
|
71
|
+
|
72
|
+
when 'atom_2' # rule 'atom' => 'character_class'
|
73
|
+
return_first_child(aRange, theTokens, theChildren)
|
74
|
+
|
75
|
+
when 'atom_3' # rule 'atom' => 'special_char'
|
76
|
+
return_first_child(aRange, theTokens, theChildren)
|
77
|
+
|
78
|
+
when 'atom_4' # rule 'atom' => 'literal'
|
79
|
+
return_first_child(aRange, theTokens, theChildren)
|
80
|
+
|
53
81
|
# rule 'letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]
|
54
|
-
when 'letter_range_0'
|
82
|
+
when 'letter_range_0'
|
55
83
|
reduce_letter_range_0(aProduction, aRange, theTokens, theChildren)
|
56
84
|
|
57
|
-
#rule 'letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]
|
58
|
-
when 'letter_range_1'
|
85
|
+
#rule 'letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]
|
86
|
+
when 'letter_range_1'
|
59
87
|
reduce_letter_range_1(aProduction, aRange, theTokens, theChildren)
|
60
88
|
|
61
89
|
when 'letter_range_2' # rule 'letter_range' => 'LETTER'
|
@@ -65,12 +93,60 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
65
93
|
reduce_letter_range_3(aProduction, aRange, theTokens, theChildren)
|
66
94
|
|
67
95
|
# rule 'digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]
|
68
|
-
when 'digit_range_0'
|
96
|
+
when 'digit_range_0'
|
69
97
|
reduce_digit_range_0(aProduction, aRange, theTokens, theChildren)
|
70
98
|
|
71
|
-
when 'digit_range_1' #rule 'digit_range' => 'digit_or_number'
|
99
|
+
when 'digit_range_1' # rule 'digit_range' => 'digit_or_number'
|
72
100
|
reduce_digit_range_1(aProduction, aRange, theTokens, theChildren)
|
73
101
|
|
102
|
+
when 'character_class_0' # rule 'character_class' => %w[ANY CHARACTER]
|
103
|
+
reduce_character_class_0(aProduction, aRange, theTokens, theChildren)
|
104
|
+
|
105
|
+
when 'character_class_1' # rule 'character_class' => %w[NO CHARACTER]
|
106
|
+
reduce_character_class_1(aProduction, aRange, theTokens, theChildren)
|
107
|
+
|
108
|
+
when 'character_class_2' # rule 'character_class' => 'WHITESPACE'
|
109
|
+
reduce_character_class_2(aProduction, aRange, theTokens, theChildren)
|
110
|
+
|
111
|
+
when 'character_class_3' # rule 'character_class' => %w[NO WHITESPACE]
|
112
|
+
reduce_character_class_3(aProduction, aRange, theTokens, theChildren)
|
113
|
+
|
114
|
+
when 'character_class_4' # rule 'character_class' => 'ANYTHING'
|
115
|
+
reduce_character_class_4(aProduction, aRange, theTokens, theChildren)
|
116
|
+
|
117
|
+
when 'character_class_5' # rule 'character_class' => %w[ONE OF STRING_LIT]
|
118
|
+
reduce_character_class_5(aProduction, aRange, theTokens, theChildren)
|
119
|
+
|
120
|
+
when 'special_char_0' # rule 'special_char' => 'TAB'
|
121
|
+
reduce_special_char_0(aProduction, aRange, theTokens, theChildren)
|
122
|
+
|
123
|
+
when 'special_char_1' # rule 'special_char' => 'BACKSLASH'
|
124
|
+
reduce_special_char_1(aProduction, aRange, theTokens, theChildren)
|
125
|
+
|
126
|
+
when 'special_char_2' # rule 'special_char' => %w[NEW LINE]
|
127
|
+
reduce_special_char_2(aProduction, aRange, theTokens, theChildren)
|
128
|
+
|
129
|
+
when 'literal_0' # rule 'literal' => %[LITERALLY STRING_LIT]
|
130
|
+
reduce_literal_0(aProduction, aRange, theTokens, theChildren)
|
131
|
+
|
132
|
+
# rule 'alternation' => %w[ANY OF LPAREN alternatives RPAREN]
|
133
|
+
when 'alternation_0'
|
134
|
+
reduce_alternation_0(aProduction, aRange, theTokens, theChildren)
|
135
|
+
|
136
|
+
# rule 'alternatives' => %w[alternatives COMMA quantifiable]
|
137
|
+
when 'alternatives_0'
|
138
|
+
reduce_alternatives_0(aProduction, aRange, theTokens, theChildren)
|
139
|
+
|
140
|
+
# rule 'alternatives' => %w[alternatives quantifiable]
|
141
|
+
when 'alternatives_1'
|
142
|
+
reduce_alternatives_1(aProduction, aRange, theTokens, theChildren)
|
143
|
+
|
144
|
+
when 'alternatives_2' # rule 'alternatives' => 'quantifiable'
|
145
|
+
reduce_alternatives_2(aProduction, aRange, theTokens, theChildren)
|
146
|
+
|
147
|
+
when 'grouping' # rule 'grouping' => %w[LPAREN pattern RPAREN]
|
148
|
+
reduce_grouping_0(aProduction, aRange, theTokens, theChildren)
|
149
|
+
|
74
150
|
when 'quantifier_0' # rule 'quantifier' => 'ONCE'
|
75
151
|
multiplicity(1, 1)
|
76
152
|
|
@@ -81,7 +157,7 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
81
157
|
reduce_quantifier_2(aProduction, aRange, theTokens, theChildren)
|
82
158
|
|
83
159
|
# rule 'quantifier' => %w[BETWEEN count AND count times_suffix]
|
84
|
-
when 'quantifier_3'
|
160
|
+
when 'quantifier_3'
|
85
161
|
reduce_quantifier_3(aProduction, aRange, theTokens, theChildren)
|
86
162
|
|
87
163
|
when 'quantifier_4' # rule 'quantifier' => 'OPTIONAL'
|
@@ -95,10 +171,10 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
95
171
|
|
96
172
|
when 'quantifier_7' # rule 'quantifier' => %w[AT LEAST count TIMES]
|
97
173
|
reduce_quantifier_7(aProduction, aRange, theTokens, theChildren)
|
98
|
-
|
174
|
+
|
99
175
|
# rule 'digit_or_number' => 'DIGIT'
|
100
176
|
# rule 'digit_or_number' => 'NUMER'
|
101
|
-
when 'digit_or_number_0', 'digit_or_number_1'
|
177
|
+
when 'digit_or_number_0', 'digit_or_number_1'
|
102
178
|
return_first_child(aRange, theTokens, theChildren)
|
103
179
|
|
104
180
|
when 'count_0', 'count_1'
|
@@ -117,6 +193,28 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
117
193
|
return SRL::Regex::Multiplicity.new(lowerBound, upperBound, :greedy)
|
118
194
|
end
|
119
195
|
|
196
|
+
def string_literal(aString, to_escape = true)
|
197
|
+
if aString.size > 1
|
198
|
+
chars = []
|
199
|
+
aString.each_char do |ch|
|
200
|
+
if to_escape && Regex::Character::MetaChars.include?(ch)
|
201
|
+
chars << Regex::Character.new("\\")
|
202
|
+
end
|
203
|
+
chars << Regex::Character.new(ch)
|
204
|
+
end
|
205
|
+
result = Regex::Concatenation.new(*chars)
|
206
|
+
else
|
207
|
+
if to_escape && Regex::Character::MetaChars.include?(aString)
|
208
|
+
result = Regex::Concatenation.new(Regex::Character.new("\\"),
|
209
|
+
Regex::Character.new(aString))
|
210
|
+
else
|
211
|
+
result = Regex::Character.new(aString)
|
212
|
+
end
|
213
|
+
end
|
214
|
+
|
215
|
+
return result
|
216
|
+
end
|
217
|
+
|
120
218
|
def char_range(lowerBound, upperBound)
|
121
219
|
# TODO fix module nesting
|
122
220
|
lower = Regex::Character.new(lowerBound)
|
@@ -128,15 +226,33 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
128
226
|
Regex::CharClass.new(toNegate, *theChildren)
|
129
227
|
end
|
130
228
|
|
229
|
+
def char_shorthand(shortName)
|
230
|
+
Regex::CharShorthand.new(shortName)
|
231
|
+
end
|
232
|
+
|
233
|
+
def wildcard()
|
234
|
+
Regex::Wildcard.new
|
235
|
+
end
|
236
|
+
|
131
237
|
def repetition(expressionToRepeat, aMultiplicity)
|
132
238
|
return Regex::Repetition.new(expressionToRepeat, aMultiplicity)
|
133
239
|
end
|
240
|
+
|
241
|
+
# rule 'pattern' => %w[pattern COMMA quantifiable]
|
242
|
+
def reduce_pattern_0(aProduction, aRange, theTokens, theChildren)
|
243
|
+
return Regex::Concatenation.new(theChildren[0], theChildren[2])
|
244
|
+
end
|
245
|
+
|
246
|
+
# rule 'pattern' => %w[pattern quantifiable]
|
247
|
+
def reduce_pattern_1(aProduction, aRange, theTokens, theChildren)
|
248
|
+
return Regex::Concatenation.new(theChildren[0], theChildren[1])
|
249
|
+
end
|
134
250
|
|
135
|
-
# rule '
|
136
|
-
def
|
251
|
+
# rule 'quantifiable' => %w[term quantifier]
|
252
|
+
def reduce_quantifiable_1(aProduction, aRange, theTokens, theChildren)
|
137
253
|
quantifier = theChildren.last
|
138
|
-
|
139
|
-
repetition(
|
254
|
+
term = theChildren.first
|
255
|
+
repetition(term, quantifier)
|
140
256
|
end
|
141
257
|
|
142
258
|
# rule 'letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]
|
@@ -166,7 +282,7 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
166
282
|
ch_range = char_range('A', 'Z')
|
167
283
|
char_class(false, ch_range)
|
168
284
|
end
|
169
|
-
|
285
|
+
|
170
286
|
# rule 'digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]
|
171
287
|
def reduce_digit_range_0(aProduction, aRange, theTokens, theChildren)
|
172
288
|
reduce_letter_range_0(aProduction, aRange, theTokens, theChildren)
|
@@ -174,15 +290,95 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
174
290
|
|
175
291
|
# rule 'digit_range' => 'digit_or_number'
|
176
292
|
def reduce_digit_range_1(aProduction, aRange, theTokens, theChildren)
|
177
|
-
|
178
|
-
|
293
|
+
char_shorthand('d')
|
294
|
+
end
|
295
|
+
|
296
|
+
# rule 'character_class' => %w[ANY CHARACTER]
|
297
|
+
def reduce_character_class_0(aProduction, aRange, theTokens, theChildren)
|
298
|
+
char_shorthand('w')
|
299
|
+
end
|
300
|
+
|
301
|
+
# rule 'character_class' => %w[NO CHARACTER]
|
302
|
+
def reduce_character_class_1(aProduction, aRange, theTokens, theChildren)
|
303
|
+
char_shorthand('W')
|
304
|
+
end
|
305
|
+
|
306
|
+
# rule 'character_class' => 'WHITESPACE'
|
307
|
+
def reduce_character_class_2(aProduction, aRange, theTokens, theChildren)
|
308
|
+
char_shorthand('s')
|
309
|
+
end
|
310
|
+
|
311
|
+
# rule 'character_class' => %w[NO WHITESPACE]
|
312
|
+
def reduce_character_class_3(aProduction, aRange, theTokens, theChildren)
|
313
|
+
char_shorthand('S')
|
314
|
+
end
|
315
|
+
|
316
|
+
# rule 'character_class' => 'ANYTHING'
|
317
|
+
def reduce_character_class_4(aProduction, aRange, theTokens, theChildren)
|
318
|
+
wildcard
|
319
|
+
end
|
320
|
+
|
321
|
+
# rule 'character_class' => %w[ONE OF STRING_LIT]
|
322
|
+
def reduce_character_class_5(aProduction, aRange, theTokens, theChildren)
|
323
|
+
raw_literal = theChildren[-1].token.lexeme.dup
|
324
|
+
alternatives = raw_literal.chars.map { |ch| Regex::Character.new(ch) }
|
325
|
+
return Regex::CharClass.new(false, *alternatives) # TODO check other implementations
|
179
326
|
end
|
180
327
|
|
328
|
+
# rule 'special_char' => 'TAB'
|
329
|
+
def reduce_special_char_0(aProduction, aRange, theTokens, theChildren)
|
330
|
+
Regex::Character.new('\t')
|
331
|
+
end
|
332
|
+
|
333
|
+
# rule 'special_char' => 'BACKSLASH'
|
334
|
+
def reduce_special_char_1(aProduction, aRange, theTokens, theChildren)
|
335
|
+
Regex::Character.new('\\')
|
336
|
+
end
|
337
|
+
|
338
|
+
# rule 'special_char' => %w[NEW LINE]
|
339
|
+
def reduce_special_char_2(aProduction, aRange, theTokens, theChildren)
|
340
|
+
# TODO: control portability
|
341
|
+
Regex::Character.new('\n')
|
342
|
+
end
|
343
|
+
|
344
|
+
# rule 'literal' => %[LITERALLY STRING_LIT]
|
345
|
+
def reduce_literal_0(aProduction, aRange, theTokens, theChildren)
|
346
|
+
# What if literal is empty?...
|
347
|
+
|
348
|
+
raw_literal = theChildren[-1].token.lexeme.dup
|
349
|
+
return string_literal(raw_literal)
|
350
|
+
end
|
351
|
+
|
352
|
+
# rule 'alternation' => %w[ANY OF LPAREN alternatives RPAREN]
|
353
|
+
def reduce_alternation_0(aProduction, aRange, theTokens, theChildren)
|
354
|
+
return Regex::Alternation.new(*theChildren[3])
|
355
|
+
end
|
356
|
+
|
357
|
+
# rule 'alternatives' => %w[alternatives COMMA quantifiable]
|
358
|
+
def reduce_alternatives_0(aProduction, aRange, theTokens, theChildren)
|
359
|
+
return theChildren[0] << theChildren[-1]
|
360
|
+
end
|
361
|
+
|
362
|
+
# rule 'alternatives' => %w[alternatives quantifiable]
|
363
|
+
def reduce_alternatives_1(aProduction, aRange, theTokens, theChildren)
|
364
|
+
return theChildren[0] << theChildren[-1]
|
365
|
+
end
|
366
|
+
|
367
|
+
# rule 'alternatives' => 'quantifiable'
|
368
|
+
def reduce_alternatives_2(aProduction, aRange, theTokens, theChildren)
|
369
|
+
return [theChildren.last]
|
370
|
+
end
|
371
|
+
|
372
|
+
# rule 'grouping' => %w[LPAREN pattern RPAREN]
|
373
|
+
def reduce_grouping_0(aProduction, aRange, theTokens, theChildren)
|
374
|
+
return Regex::NonCapturingGroup.new(theChildren[1])
|
375
|
+
end
|
376
|
+
|
181
377
|
# rule 'quantifier' => %w[EXACTLY count TIMES]
|
182
378
|
def reduce_quantifier_2(aProduction, aRange, theTokens, theChildren)
|
183
379
|
count = theChildren[1].token.lexeme.to_i
|
184
380
|
multiplicity(count, count)
|
185
|
-
end
|
381
|
+
end
|
186
382
|
|
187
383
|
# rule 'quantifier' => %w[BETWEEN count AND count times_suffix]
|
188
384
|
def reduce_quantifier_3(aProduction, aRange, theTokens, theChildren)
|