rley 0.5.10 → 0.5.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/LICENSE.txt +1 -1
- data/README.md +2 -1
- data/appveyor.yml +6 -5
- data/examples/NLP/engtagger.rb +176 -0
- data/examples/general/SRL/lib/ast_builder.rb +217 -21
- data/examples/general/SRL/lib/grammar.rb +33 -5
- data/examples/general/SRL/lib/regex/alternation.rb +30 -0
- data/examples/general/SRL/lib/regex/char_class.rb +28 -22
- data/examples/general/SRL/lib/regex/char_shorthand.rb +50 -0
- data/examples/general/SRL/lib/regex/character.rb +5 -3
- data/examples/general/SRL/lib/regex/concatenation.rb +32 -0
- data/examples/general/SRL/lib/regex/non_capturing_group.rb +29 -0
- data/examples/general/SRL/lib/regex/wildcard.rb +26 -0
- data/examples/general/SRL/lib/regex_repr.rb +5 -0
- data/examples/general/SRL/lib/tokenizer.rb +28 -3
- data/examples/general/SRL/spec/integration_spec.rb +151 -8
- data/examples/general/SRL/spec/tokenizer_spec.rb +12 -0
- data/examples/general/left.rb +36 -0
- data/examples/general/right.rb +36 -0
- data/lib/rley/constants.rb +1 -1
- data/lib/rley/gfg/edge.rb +12 -1
- data/lib/rley/gfg/grm_flow_graph.rb +21 -1
- data/lib/rley/gfg/item_vertex.rb +1 -1
- data/lib/rley/gfg/non_terminal_vertex.rb +1 -1
- data/lib/rley/gfg/start_vertex.rb +1 -0
- data/lib/rley/gfg/vertex.rb +27 -0
- data/lib/rley/lexical/token.rb +1 -0
- data/lib/rley/parser/error_reason.rb +2 -1
- data/lib/rley/parser/gfg_chart.rb +14 -0
- data/lib/rley/parser/gfg_earley_parser.rb +0 -1
- data/lib/rley/parser/gfg_parsing.rb +4 -3
- data/lib/rley/parser/parse_entry.rb +33 -3
- data/lib/rley/parser/parse_entry_set.rb +14 -2
- data/lib/rley/parser/parse_tree_builder.rb +1 -1
- data/lib/rley/parser/parse_walker_factory.rb +0 -1
- data/lib/rley/syntax/grm_symbol.rb +2 -0
- data/lib/rley/syntax/production.rb +15 -3
- data/lib/rley/syntax/symbol_seq.rb +16 -1
- data/spec/rley/gfg/end_vertex_spec.rb +9 -1
- data/spec/rley/gfg/grm_flow_graph_spec.rb +9 -0
- data/spec/rley/gfg/item_vertex_spec.rb +9 -0
- data/spec/rley/gfg/start_vertex_spec.rb +9 -1
- data/spec/rley/parser/gfg_parsing_spec.rb +0 -1
- data/spec/rley/parser/parse_entry_set_spec.rb +15 -0
- data/spec/rley/parser/parse_entry_spec.rb +24 -13
- data/spec/rley/parser/parse_tracer_spec.rb +1 -1
- data/spec/rley/syntax/production_spec.rb +10 -0
- data/spec/rley/syntax/symbol_seq_spec.rb +5 -0
- metadata +10 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0ec06612f299302b861fbaeb04b75c0040a026cf
|
4
|
+
data.tar.gz: d68438efcbacceb2ae4319ac268492e93db35265
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ec3be765a424028c986ea4812cf6f1485f04285beb2b9d8fffc774fc0b61108d4d6758a09a648132562752ab25904fb38f8ee57ecff90d0a70bca253150ed130
|
7
|
+
data.tar.gz: 2463def65eecbefed2bbfffc61e63e88dca2d0498078e83bc742811e540718e95e75f3896fa31b5bdc9068f5420906f389615470a86831dbcb5025824645775d
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,18 @@
|
|
1
|
+
### 0.5.11 / 2018-01-25
|
2
|
+
* [NEW] File `left.rb` added in `examples/general` folder for showing use of left-recursive rules.
|
3
|
+
* [NEW] File `right.rb` added in `examples/general` folder for showing use of right-recursive rules (less performant).
|
4
|
+
* [NEW] File `examples/general/SRL/lib/alternation.rb Added support for alternation in regular expressions (|).
|
5
|
+
* [NEW] File `examples/general/SRL/lib/character.rb Added support for single character in regular expressions.
|
6
|
+
* [NEW] File `examples/general/SRL/lib/char_class.rb Added support for character class in regular expressions.
|
7
|
+
* [NEW] File `examples/general/SRL/lib/shorthand.rb Added support for character class shorthand in regular expressions.
|
8
|
+
* [NEW] File `examples/general/SRL/lib/concatenation.rb Added support for concatenation in regular expressions.
|
9
|
+
* [NEW] File `examples/general/SRL/lib/non_capturing_group.rb Added support for non-capturing groups in regular expressions.
|
10
|
+
* [NEW] File `examples/general/SRL/lib/wildcard.rb` Added support for wildcards in regular expressions.
|
11
|
+
* [CHANGE] File `examples/general/SRL/grammar.rb increased coverage of Simple Regex Language parser.
|
12
|
+
* [CHANGE] File `examples/general/SRL/ast_builder.rb Added transformation rules for constructing regular expressions.
|
13
|
+
* [CHANGE] File `examples/general/SRL/spec/integration_spec.rb` Added tests for SRL expressions.
|
14
|
+
* [FIX] Added a custom `inspect` method to several core classes. This was necessary because the default implementation from Ruby got lost with object graphs.
|
15
|
+
|
1
16
|
### 0.5.10 / 2017-12-02
|
2
17
|
* [CHANGE] Dir `examples/general/SRL/ Added support for digit range to Simple Regex Language parser.
|
3
18
|
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -440,6 +440,7 @@ actively curated by Andrei Beliankou (aka arbox).
|
|
440
440
|
|
441
441
|
## Thanks to:
|
442
442
|
* Professor Keshav Pingali, one of the creators of the Grammar Flow Graph parsing approach for his encouraging e-mail exchange.
|
443
|
+
* [Arjun Menon](https://github.com/arjunmenon) for his NLP example that uses `engtagger` gem.
|
443
444
|
|
444
445
|
## Grammar Flow Graph
|
445
446
|
Since the Grammar Flow Graph parsing approach is quite new, it has not yet taken a place in
|
@@ -452,5 +453,5 @@ standard parser textbooks. Here are a few references (and links) of papers on GF
|
|
452
453
|
|
453
454
|
Copyright
|
454
455
|
---------
|
455
|
-
Copyright (c) 2014-
|
456
|
+
Copyright (c) 2014-2018, Dimitri Geshef.
|
456
457
|
__Rley__ is released under the MIT License see [LICENSE.txt](https://github.com/famished-tiger/Rley/blob/master/LICENSE.txt) for details.
|
data/appveyor.yml
CHANGED
@@ -11,12 +11,13 @@ environment:
|
|
11
11
|
- Ruby_version: 23
|
12
12
|
- Ruby_version: 23-x64
|
13
13
|
- Ruby_version: 24
|
14
|
-
- Ruby_version: 24-x64
|
14
|
+
- Ruby_version: 24-x64
|
15
|
+
|
15
16
|
install:
|
16
|
-
-
|
17
|
-
|
17
|
+
- set PATH=C:\Ruby%Ruby_version%\bin;%PATH%
|
18
|
+
- bundle install --retry=3 --clean --force
|
18
19
|
|
19
|
-
bundle install --retry=3 --clean
|
20
20
|
build: off
|
21
|
+
|
21
22
|
test_script:
|
22
|
-
-
|
23
|
+
- bundle exec rake
|
@@ -0,0 +1,176 @@
|
|
1
|
+
require "rley"
|
2
|
+
require "engtagger"
|
3
|
+
require "pp"
|
4
|
+
|
5
|
+
# REGEX to remove XML tags from Engtagger output
|
6
|
+
GET_TAG = /<(.+?)>(.*?)<.+?>/
|
7
|
+
|
8
|
+
# Text tokenizer
|
9
|
+
# Taken directly from Engtagger, will ensure uniform indexing while parsing
|
10
|
+
def clean_text(text)
|
11
|
+
return false unless valid_text(text)
|
12
|
+
text = text.toutf8
|
13
|
+
cleaned_text = text
|
14
|
+
tokenized = []
|
15
|
+
# Tokenize the text (splitting on punctuation as you go)
|
16
|
+
cleaned_text.split(/\s+/).each do |line|
|
17
|
+
tokenized += split_punct(line)
|
18
|
+
end
|
19
|
+
words = split_sentences(tokenized)
|
20
|
+
return words
|
21
|
+
end
|
22
|
+
|
23
|
+
def valid_text(text)
|
24
|
+
if !text
|
25
|
+
# there's nothing to parse
|
26
|
+
"method call on uninitialized variable" if @conf[:debug]
|
27
|
+
return false
|
28
|
+
elsif /\A\s*\z/ =~ text
|
29
|
+
# text is an empty string, nothing to parse
|
30
|
+
return false
|
31
|
+
else
|
32
|
+
# $text is valid
|
33
|
+
return true
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def split_sentences(array)
|
38
|
+
tokenized = array
|
39
|
+
people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
|
40
|
+
supt det mssrs rev)
|
41
|
+
army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
|
42
|
+
inst = %w(dept univ assn bros ph.d)
|
43
|
+
place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
|
44
|
+
hwy hway la pde pd plz pl rd st tce)
|
45
|
+
comp = %w(mfg inc ltd co corp)
|
46
|
+
state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
|
47
|
+
ind ia kans kan ken ky la me md is mass mich minn miss mo mont
|
48
|
+
neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
|
49
|
+
va wash wis wisc wy wyo usafa alta man ont que sask yuk)
|
50
|
+
month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
|
51
|
+
misc = %w(vs etc no esp)
|
52
|
+
abbr = Hash.new
|
53
|
+
[people, army, inst, place, comp, state, month, misc].flatten.each do |i|
|
54
|
+
abbr[i] = true
|
55
|
+
end
|
56
|
+
words = Array.new
|
57
|
+
tokenized.each_with_index do |t, i|
|
58
|
+
if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
|
59
|
+
w = $1
|
60
|
+
# Don't separate the period off words that
|
61
|
+
# meet any of the following conditions:
|
62
|
+
#
|
63
|
+
# 1. It is defined in one of the lists above
|
64
|
+
# 2. It is only one letter long: Alfred E. Sloan
|
65
|
+
# 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
|
66
|
+
unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
|
67
|
+
words << w
|
68
|
+
words << '.'
|
69
|
+
next
|
70
|
+
end
|
71
|
+
end
|
72
|
+
words << tokenized[i]
|
73
|
+
end
|
74
|
+
# If the final word ends in a period..
|
75
|
+
if words[-1] and words[-1] =~ /\A(.*\w)\.\z/
|
76
|
+
words[-1] = $1
|
77
|
+
words.push '.'
|
78
|
+
end
|
79
|
+
return words
|
80
|
+
end
|
81
|
+
|
82
|
+
# Separate punctuation from words, where appropriate. This leaves trailing
|
83
|
+
# periods in place to be dealt with later. Called by the clean_text method.
|
84
|
+
def split_punct(text)
|
85
|
+
# If there's no punctuation, return immediately
|
86
|
+
return [text] if /\A\w+\z/ =~ text
|
87
|
+
# Sanity checks
|
88
|
+
text = text.gsub(/\W{10,}/o, " ")
|
89
|
+
|
90
|
+
# Put quotes into a standard format
|
91
|
+
text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
|
92
|
+
text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
|
93
|
+
text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
|
94
|
+
text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
|
95
|
+
text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
|
96
|
+
|
97
|
+
# Handle all other punctuation
|
98
|
+
text = text.gsub(/--+/o, " - ") # Convert and separate dashes
|
99
|
+
text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
|
100
|
+
text = text.gsub(/:/o, " :") # Shift semicolons off
|
101
|
+
text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
|
102
|
+
text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
|
103
|
+
text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
|
104
|
+
|
105
|
+
# English-specific contractions
|
106
|
+
text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
|
107
|
+
text = text.gsub(/n't\b/o, " n't") # Separate off n't
|
108
|
+
text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
|
109
|
+
result = text.split(' ')
|
110
|
+
return result
|
111
|
+
end
|
112
|
+
|
113
|
+
|
114
|
+
# Instantiate a builder object that will build the grammar for us
|
115
|
+
builder = Rley::Syntax::GrammarBuilder.new do
|
116
|
+
|
117
|
+
add_terminals('NN', 'NNP')
|
118
|
+
add_terminals('DET', 'IN', 'VBD')
|
119
|
+
|
120
|
+
# Here we define the productions (= grammar rules)
|
121
|
+
rule 'S' => %w[NP VP]
|
122
|
+
rule 'NP' => 'NNP'
|
123
|
+
rule 'NP' => %w[DET NN]
|
124
|
+
rule 'NP' => %w[DET NN PP]
|
125
|
+
rule 'VP' => %w[VBD NP]
|
126
|
+
rule 'VP' => %w[VBD NP PP]
|
127
|
+
rule 'PP' => %w[IN NP]
|
128
|
+
end
|
129
|
+
|
130
|
+
# And now, let's build the grammar...
|
131
|
+
grammar = builder.grammar
|
132
|
+
|
133
|
+
parser = Rley::Parser::GFGEarleyParser.new(grammar)
|
134
|
+
|
135
|
+
# text = "Yo I'm not done with you"
|
136
|
+
text= "John saw Mary with a telescope"
|
137
|
+
pp "Input text --> #{text}"
|
138
|
+
|
139
|
+
tgr = EngTagger.new
|
140
|
+
|
141
|
+
# Generate POS
|
142
|
+
tagged = tgr.add_tags(text)
|
143
|
+
|
144
|
+
# Generate tokenized lexicon of input text
|
145
|
+
# Instead of creating a lexicon dictionary, we would simply generate one each time on the fly for the current text only.
|
146
|
+
lexicon = clean_text(text)
|
147
|
+
|
148
|
+
# Generate POS tokens in [[word, pos], ..] format
|
149
|
+
tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
|
150
|
+
|
151
|
+
def tokenizer(lexicon, grammar, tokens)
|
152
|
+
rley_tokens = []
|
153
|
+
lexicon.each_with_index do |word, i|
|
154
|
+
term_name = tokens[i].last
|
155
|
+
terminal = grammar.name2symbol[term_name]
|
156
|
+
rley_tokens << Rley::Tokens::Token.new(word, terminal)
|
157
|
+
end
|
158
|
+
return rley_tokens
|
159
|
+
end
|
160
|
+
|
161
|
+
# Convert input text into a sequence of rley token objects...
|
162
|
+
rley_tokens = tokenizer(lexicon, grammar, tokens)
|
163
|
+
|
164
|
+
result = parser.parse(rley_tokens)
|
165
|
+
|
166
|
+
pp "Parsing successful? #{result.success?}" # => Parsing successful? true
|
167
|
+
pp result.failure_reason.message unless result.success?
|
168
|
+
|
169
|
+
ptree = result.parse_tree
|
170
|
+
|
171
|
+
visitor = Rley::ParseTreeVisitor.new(ptree)
|
172
|
+
|
173
|
+
renderer = Rley::Formatter::Asciitree.new($stdout)
|
174
|
+
|
175
|
+
# Subscribe the formatter to the visitor's event and launch the visit
|
176
|
+
pp renderer.render(visitor)
|
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'stringio'
|
1
2
|
require_relative 'ast_building'
|
2
3
|
require_relative 'regex_repr'
|
3
4
|
|
@@ -35,27 +36,54 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
35
36
|
# @param theChildren [Array] Children nodes (one per rhs symbol)
|
36
37
|
def new_parent_node(aProduction, aRange, theTokens, theChildren)
|
37
38
|
node = case aProduction.name
|
38
|
-
when 'srl_0' # rule 'srl' => '
|
39
|
+
when 'srl_0' # rule 'srl' => 'pattern'
|
40
|
+
return_first_child(aRange, theTokens, theChildren)
|
41
|
+
|
42
|
+
when 'pattern_0' # rule 'pattern' => %w[pattern COMMA quantifiable]
|
43
|
+
reduce_pattern_0(aProduction, aRange, theTokens, theChildren)
|
44
|
+
|
45
|
+
when 'pattern_1' # rule 'pattern' => %w[pattern quantifiable]
|
46
|
+
reduce_pattern_1(aProduction, aRange, theTokens, theChildren)
|
47
|
+
|
48
|
+
when 'pattern_2' # rule 'pattern' => 'quantifiable'
|
49
|
+
return_first_child(aRange, theTokens, theChildren)
|
50
|
+
|
51
|
+
when 'quantifiable_0' # rule 'quantifiable' => 'term'
|
39
52
|
return_first_child(aRange, theTokens, theChildren)
|
40
53
|
|
54
|
+
when 'quantifiable_1' # rule 'quantifiable' = %w[term quantifier]
|
55
|
+
reduce_quantifiable_1(aProduction, aRange, theTokens, theChildren)
|
56
|
+
|
41
57
|
when 'term_0' # rule 'term' => 'atom'
|
42
58
|
return_first_child(aRange, theTokens, theChildren)
|
43
59
|
|
44
|
-
when 'term_1' # rule 'term' =>
|
45
|
-
|
60
|
+
when 'term_1' # rule 'term' => 'alternation'
|
61
|
+
return_first_child(aRange, theTokens, theChildren)
|
62
|
+
|
63
|
+
when 'term_2' # rule 'term' => 'grouping'
|
64
|
+
return_first_child(aRange, theTokens, theChildren)
|
46
65
|
|
47
66
|
when 'atom_0' # rule 'atom' => 'letter_range'
|
48
67
|
return_first_child(aRange, theTokens, theChildren)
|
49
|
-
|
68
|
+
|
50
69
|
when 'atom_1' # rule 'atom' => 'digit_range'
|
51
70
|
return_first_child(aRange, theTokens, theChildren)
|
52
|
-
|
71
|
+
|
72
|
+
when 'atom_2' # rule 'atom' => 'character_class'
|
73
|
+
return_first_child(aRange, theTokens, theChildren)
|
74
|
+
|
75
|
+
when 'atom_3' # rule 'atom' => 'special_char'
|
76
|
+
return_first_child(aRange, theTokens, theChildren)
|
77
|
+
|
78
|
+
when 'atom_4' # rule 'atom' => 'literal'
|
79
|
+
return_first_child(aRange, theTokens, theChildren)
|
80
|
+
|
53
81
|
# rule 'letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]
|
54
|
-
when 'letter_range_0'
|
82
|
+
when 'letter_range_0'
|
55
83
|
reduce_letter_range_0(aProduction, aRange, theTokens, theChildren)
|
56
84
|
|
57
|
-
#rule 'letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]
|
58
|
-
when 'letter_range_1'
|
85
|
+
#rule 'letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]
|
86
|
+
when 'letter_range_1'
|
59
87
|
reduce_letter_range_1(aProduction, aRange, theTokens, theChildren)
|
60
88
|
|
61
89
|
when 'letter_range_2' # rule 'letter_range' => 'LETTER'
|
@@ -65,12 +93,60 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
65
93
|
reduce_letter_range_3(aProduction, aRange, theTokens, theChildren)
|
66
94
|
|
67
95
|
# rule 'digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]
|
68
|
-
when 'digit_range_0'
|
96
|
+
when 'digit_range_0'
|
69
97
|
reduce_digit_range_0(aProduction, aRange, theTokens, theChildren)
|
70
98
|
|
71
|
-
when 'digit_range_1' #rule 'digit_range' => 'digit_or_number'
|
99
|
+
when 'digit_range_1' # rule 'digit_range' => 'digit_or_number'
|
72
100
|
reduce_digit_range_1(aProduction, aRange, theTokens, theChildren)
|
73
101
|
|
102
|
+
when 'character_class_0' # rule 'character_class' => %w[ANY CHARACTER]
|
103
|
+
reduce_character_class_0(aProduction, aRange, theTokens, theChildren)
|
104
|
+
|
105
|
+
when 'character_class_1' # rule 'character_class' => %w[NO CHARACTER]
|
106
|
+
reduce_character_class_1(aProduction, aRange, theTokens, theChildren)
|
107
|
+
|
108
|
+
when 'character_class_2' # rule 'character_class' => 'WHITESPACE'
|
109
|
+
reduce_character_class_2(aProduction, aRange, theTokens, theChildren)
|
110
|
+
|
111
|
+
when 'character_class_3' # rule 'character_class' => %w[NO WHITESPACE]
|
112
|
+
reduce_character_class_3(aProduction, aRange, theTokens, theChildren)
|
113
|
+
|
114
|
+
when 'character_class_4' # rule 'character_class' => 'ANYTHING'
|
115
|
+
reduce_character_class_4(aProduction, aRange, theTokens, theChildren)
|
116
|
+
|
117
|
+
when 'character_class_5' # rule 'character_class' => %w[ONE OF STRING_LIT]
|
118
|
+
reduce_character_class_5(aProduction, aRange, theTokens, theChildren)
|
119
|
+
|
120
|
+
when 'special_char_0' # rule 'special_char' => 'TAB'
|
121
|
+
reduce_special_char_0(aProduction, aRange, theTokens, theChildren)
|
122
|
+
|
123
|
+
when 'special_char_1' # rule 'special_char' => 'BACKSLASH'
|
124
|
+
reduce_special_char_1(aProduction, aRange, theTokens, theChildren)
|
125
|
+
|
126
|
+
when 'special_char_2' # rule 'special_char' => %w[NEW LINE]
|
127
|
+
reduce_special_char_2(aProduction, aRange, theTokens, theChildren)
|
128
|
+
|
129
|
+
when 'literal_0' # rule 'literal' => %[LITERALLY STRING_LIT]
|
130
|
+
reduce_literal_0(aProduction, aRange, theTokens, theChildren)
|
131
|
+
|
132
|
+
# rule 'alternation' => %w[ANY OF LPAREN alternatives RPAREN]
|
133
|
+
when 'alternation_0'
|
134
|
+
reduce_alternation_0(aProduction, aRange, theTokens, theChildren)
|
135
|
+
|
136
|
+
# rule 'alternatives' => %w[alternatives COMMA quantifiable]
|
137
|
+
when 'alternatives_0'
|
138
|
+
reduce_alternatives_0(aProduction, aRange, theTokens, theChildren)
|
139
|
+
|
140
|
+
# rule 'alternatives' => %w[alternatives quantifiable]
|
141
|
+
when 'alternatives_1'
|
142
|
+
reduce_alternatives_1(aProduction, aRange, theTokens, theChildren)
|
143
|
+
|
144
|
+
when 'alternatives_2' # rule 'alternatives' => 'quantifiable'
|
145
|
+
reduce_alternatives_2(aProduction, aRange, theTokens, theChildren)
|
146
|
+
|
147
|
+
when 'grouping' # rule 'grouping' => %w[LPAREN pattern RPAREN]
|
148
|
+
reduce_grouping_0(aProduction, aRange, theTokens, theChildren)
|
149
|
+
|
74
150
|
when 'quantifier_0' # rule 'quantifier' => 'ONCE'
|
75
151
|
multiplicity(1, 1)
|
76
152
|
|
@@ -81,7 +157,7 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
81
157
|
reduce_quantifier_2(aProduction, aRange, theTokens, theChildren)
|
82
158
|
|
83
159
|
# rule 'quantifier' => %w[BETWEEN count AND count times_suffix]
|
84
|
-
when 'quantifier_3'
|
160
|
+
when 'quantifier_3'
|
85
161
|
reduce_quantifier_3(aProduction, aRange, theTokens, theChildren)
|
86
162
|
|
87
163
|
when 'quantifier_4' # rule 'quantifier' => 'OPTIONAL'
|
@@ -95,10 +171,10 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
95
171
|
|
96
172
|
when 'quantifier_7' # rule 'quantifier' => %w[AT LEAST count TIMES]
|
97
173
|
reduce_quantifier_7(aProduction, aRange, theTokens, theChildren)
|
98
|
-
|
174
|
+
|
99
175
|
# rule 'digit_or_number' => 'DIGIT'
|
100
176
|
# rule 'digit_or_number' => 'NUMER'
|
101
|
-
when 'digit_or_number_0', 'digit_or_number_1'
|
177
|
+
when 'digit_or_number_0', 'digit_or_number_1'
|
102
178
|
return_first_child(aRange, theTokens, theChildren)
|
103
179
|
|
104
180
|
when 'count_0', 'count_1'
|
@@ -117,6 +193,28 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
117
193
|
return SRL::Regex::Multiplicity.new(lowerBound, upperBound, :greedy)
|
118
194
|
end
|
119
195
|
|
196
|
+
def string_literal(aString, to_escape = true)
|
197
|
+
if aString.size > 1
|
198
|
+
chars = []
|
199
|
+
aString.each_char do |ch|
|
200
|
+
if to_escape && Regex::Character::MetaChars.include?(ch)
|
201
|
+
chars << Regex::Character.new("\\")
|
202
|
+
end
|
203
|
+
chars << Regex::Character.new(ch)
|
204
|
+
end
|
205
|
+
result = Regex::Concatenation.new(*chars)
|
206
|
+
else
|
207
|
+
if to_escape && Regex::Character::MetaChars.include?(aString)
|
208
|
+
result = Regex::Concatenation.new(Regex::Character.new("\\"),
|
209
|
+
Regex::Character.new(aString))
|
210
|
+
else
|
211
|
+
result = Regex::Character.new(aString)
|
212
|
+
end
|
213
|
+
end
|
214
|
+
|
215
|
+
return result
|
216
|
+
end
|
217
|
+
|
120
218
|
def char_range(lowerBound, upperBound)
|
121
219
|
# TODO fix module nesting
|
122
220
|
lower = Regex::Character.new(lowerBound)
|
@@ -128,15 +226,33 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
128
226
|
Regex::CharClass.new(toNegate, *theChildren)
|
129
227
|
end
|
130
228
|
|
229
|
+
def char_shorthand(shortName)
|
230
|
+
Regex::CharShorthand.new(shortName)
|
231
|
+
end
|
232
|
+
|
233
|
+
def wildcard()
|
234
|
+
Regex::Wildcard.new
|
235
|
+
end
|
236
|
+
|
131
237
|
def repetition(expressionToRepeat, aMultiplicity)
|
132
238
|
return Regex::Repetition.new(expressionToRepeat, aMultiplicity)
|
133
239
|
end
|
240
|
+
|
241
|
+
# rule 'pattern' => %w[pattern COMMA quantifiable]
|
242
|
+
def reduce_pattern_0(aProduction, aRange, theTokens, theChildren)
|
243
|
+
return Regex::Concatenation.new(theChildren[0], theChildren[2])
|
244
|
+
end
|
245
|
+
|
246
|
+
# rule 'pattern' => %w[pattern quantifiable]
|
247
|
+
def reduce_pattern_1(aProduction, aRange, theTokens, theChildren)
|
248
|
+
return Regex::Concatenation.new(theChildren[0], theChildren[1])
|
249
|
+
end
|
134
250
|
|
135
|
-
# rule '
|
136
|
-
def
|
251
|
+
# rule 'quantifiable' => %w[term quantifier]
|
252
|
+
def reduce_quantifiable_1(aProduction, aRange, theTokens, theChildren)
|
137
253
|
quantifier = theChildren.last
|
138
|
-
|
139
|
-
repetition(
|
254
|
+
term = theChildren.first
|
255
|
+
repetition(term, quantifier)
|
140
256
|
end
|
141
257
|
|
142
258
|
# rule 'letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]
|
@@ -166,7 +282,7 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
166
282
|
ch_range = char_range('A', 'Z')
|
167
283
|
char_class(false, ch_range)
|
168
284
|
end
|
169
|
-
|
285
|
+
|
170
286
|
# rule 'digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]
|
171
287
|
def reduce_digit_range_0(aProduction, aRange, theTokens, theChildren)
|
172
288
|
reduce_letter_range_0(aProduction, aRange, theTokens, theChildren)
|
@@ -174,15 +290,95 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
174
290
|
|
175
291
|
# rule 'digit_range' => 'digit_or_number'
|
176
292
|
def reduce_digit_range_1(aProduction, aRange, theTokens, theChildren)
|
177
|
-
|
178
|
-
|
293
|
+
char_shorthand('d')
|
294
|
+
end
|
295
|
+
|
296
|
+
# rule 'character_class' => %w[ANY CHARACTER]
|
297
|
+
def reduce_character_class_0(aProduction, aRange, theTokens, theChildren)
|
298
|
+
char_shorthand('w')
|
299
|
+
end
|
300
|
+
|
301
|
+
# rule 'character_class' => %w[NO CHARACTER]
|
302
|
+
def reduce_character_class_1(aProduction, aRange, theTokens, theChildren)
|
303
|
+
char_shorthand('W')
|
304
|
+
end
|
305
|
+
|
306
|
+
# rule 'character_class' => 'WHITESPACE'
|
307
|
+
def reduce_character_class_2(aProduction, aRange, theTokens, theChildren)
|
308
|
+
char_shorthand('s')
|
309
|
+
end
|
310
|
+
|
311
|
+
# rule 'character_class' => %w[NO WHITESPACE]
|
312
|
+
def reduce_character_class_3(aProduction, aRange, theTokens, theChildren)
|
313
|
+
char_shorthand('S')
|
314
|
+
end
|
315
|
+
|
316
|
+
# rule 'character_class' => 'ANYTHING'
|
317
|
+
def reduce_character_class_4(aProduction, aRange, theTokens, theChildren)
|
318
|
+
wildcard
|
319
|
+
end
|
320
|
+
|
321
|
+
# rule 'character_class' => %w[ONE OF STRING_LIT]
|
322
|
+
def reduce_character_class_5(aProduction, aRange, theTokens, theChildren)
|
323
|
+
raw_literal = theChildren[-1].token.lexeme.dup
|
324
|
+
alternatives = raw_literal.chars.map { |ch| Regex::Character.new(ch) }
|
325
|
+
return Regex::CharClass.new(false, *alternatives) # TODO check other implementations
|
179
326
|
end
|
180
327
|
|
328
|
+
# rule 'special_char' => 'TAB'
|
329
|
+
def reduce_special_char_0(aProduction, aRange, theTokens, theChildren)
|
330
|
+
Regex::Character.new('\t')
|
331
|
+
end
|
332
|
+
|
333
|
+
# rule 'special_char' => 'BACKSLASH'
|
334
|
+
def reduce_special_char_1(aProduction, aRange, theTokens, theChildren)
|
335
|
+
Regex::Character.new('\\')
|
336
|
+
end
|
337
|
+
|
338
|
+
# rule 'special_char' => %w[NEW LINE]
|
339
|
+
def reduce_special_char_2(aProduction, aRange, theTokens, theChildren)
|
340
|
+
# TODO: control portability
|
341
|
+
Regex::Character.new('\n')
|
342
|
+
end
|
343
|
+
|
344
|
+
# rule 'literal' => %[LITERALLY STRING_LIT]
|
345
|
+
def reduce_literal_0(aProduction, aRange, theTokens, theChildren)
|
346
|
+
# What if literal is empty?...
|
347
|
+
|
348
|
+
raw_literal = theChildren[-1].token.lexeme.dup
|
349
|
+
return string_literal(raw_literal)
|
350
|
+
end
|
351
|
+
|
352
|
+
# rule 'alternation' => %w[ANY OF LPAREN alternatives RPAREN]
|
353
|
+
def reduce_alternation_0(aProduction, aRange, theTokens, theChildren)
|
354
|
+
return Regex::Alternation.new(*theChildren[3])
|
355
|
+
end
|
356
|
+
|
357
|
+
# rule 'alternatives' => %w[alternatives COMMA quantifiable]
|
358
|
+
def reduce_alternatives_0(aProduction, aRange, theTokens, theChildren)
|
359
|
+
return theChildren[0] << theChildren[-1]
|
360
|
+
end
|
361
|
+
|
362
|
+
# rule 'alternatives' => %w[alternatives quantifiable]
|
363
|
+
def reduce_alternatives_1(aProduction, aRange, theTokens, theChildren)
|
364
|
+
return theChildren[0] << theChildren[-1]
|
365
|
+
end
|
366
|
+
|
367
|
+
# rule 'alternatives' => 'quantifiable'
|
368
|
+
def reduce_alternatives_2(aProduction, aRange, theTokens, theChildren)
|
369
|
+
return [theChildren.last]
|
370
|
+
end
|
371
|
+
|
372
|
+
# rule 'grouping' => %w[LPAREN pattern RPAREN]
|
373
|
+
def reduce_grouping_0(aProduction, aRange, theTokens, theChildren)
|
374
|
+
return Regex::NonCapturingGroup.new(theChildren[1])
|
375
|
+
end
|
376
|
+
|
181
377
|
# rule 'quantifier' => %w[EXACTLY count TIMES]
|
182
378
|
def reduce_quantifier_2(aProduction, aRange, theTokens, theChildren)
|
183
379
|
count = theChildren[1].token.lexeme.to_i
|
184
380
|
multiplicity(count, count)
|
185
|
-
end
|
381
|
+
end
|
186
382
|
|
187
383
|
# rule 'quantifier' => %w[BETWEEN count AND count times_suffix]
|
188
384
|
def reduce_quantifier_3(aProduction, aRange, theTokens, theChildren)
|