rley 0.5.10 → 0.5.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +15 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +2 -1
  5. data/appveyor.yml +6 -5
  6. data/examples/NLP/engtagger.rb +176 -0
  7. data/examples/general/SRL/lib/ast_builder.rb +217 -21
  8. data/examples/general/SRL/lib/grammar.rb +33 -5
  9. data/examples/general/SRL/lib/regex/alternation.rb +30 -0
  10. data/examples/general/SRL/lib/regex/char_class.rb +28 -22
  11. data/examples/general/SRL/lib/regex/char_shorthand.rb +50 -0
  12. data/examples/general/SRL/lib/regex/character.rb +5 -3
  13. data/examples/general/SRL/lib/regex/concatenation.rb +32 -0
  14. data/examples/general/SRL/lib/regex/non_capturing_group.rb +29 -0
  15. data/examples/general/SRL/lib/regex/wildcard.rb +26 -0
  16. data/examples/general/SRL/lib/regex_repr.rb +5 -0
  17. data/examples/general/SRL/lib/tokenizer.rb +28 -3
  18. data/examples/general/SRL/spec/integration_spec.rb +151 -8
  19. data/examples/general/SRL/spec/tokenizer_spec.rb +12 -0
  20. data/examples/general/left.rb +36 -0
  21. data/examples/general/right.rb +36 -0
  22. data/lib/rley/constants.rb +1 -1
  23. data/lib/rley/gfg/edge.rb +12 -1
  24. data/lib/rley/gfg/grm_flow_graph.rb +21 -1
  25. data/lib/rley/gfg/item_vertex.rb +1 -1
  26. data/lib/rley/gfg/non_terminal_vertex.rb +1 -1
  27. data/lib/rley/gfg/start_vertex.rb +1 -0
  28. data/lib/rley/gfg/vertex.rb +27 -0
  29. data/lib/rley/lexical/token.rb +1 -0
  30. data/lib/rley/parser/error_reason.rb +2 -1
  31. data/lib/rley/parser/gfg_chart.rb +14 -0
  32. data/lib/rley/parser/gfg_earley_parser.rb +0 -1
  33. data/lib/rley/parser/gfg_parsing.rb +4 -3
  34. data/lib/rley/parser/parse_entry.rb +33 -3
  35. data/lib/rley/parser/parse_entry_set.rb +14 -2
  36. data/lib/rley/parser/parse_tree_builder.rb +1 -1
  37. data/lib/rley/parser/parse_walker_factory.rb +0 -1
  38. data/lib/rley/syntax/grm_symbol.rb +2 -0
  39. data/lib/rley/syntax/production.rb +15 -3
  40. data/lib/rley/syntax/symbol_seq.rb +16 -1
  41. data/spec/rley/gfg/end_vertex_spec.rb +9 -1
  42. data/spec/rley/gfg/grm_flow_graph_spec.rb +9 -0
  43. data/spec/rley/gfg/item_vertex_spec.rb +9 -0
  44. data/spec/rley/gfg/start_vertex_spec.rb +9 -1
  45. data/spec/rley/parser/gfg_parsing_spec.rb +0 -1
  46. data/spec/rley/parser/parse_entry_set_spec.rb +15 -0
  47. data/spec/rley/parser/parse_entry_spec.rb +24 -13
  48. data/spec/rley/parser/parse_tracer_spec.rb +1 -1
  49. data/spec/rley/syntax/production_spec.rb +10 -0
  50. data/spec/rley/syntax/symbol_seq_spec.rb +5 -0
  51. metadata +10 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ce33adee17693fccafcc29ce8340f57694c229fb
4
- data.tar.gz: 8acb3af15c3bd7c292209e3e1cc2e81b20ba3d9f
3
+ metadata.gz: 0ec06612f299302b861fbaeb04b75c0040a026cf
4
+ data.tar.gz: d68438efcbacceb2ae4319ac268492e93db35265
5
5
  SHA512:
6
- metadata.gz: dba9c01e5cb72954185ec5b2a973a7983202bc1733808637bd887feb6a473a57c6b3ef155090dec47f848b0e367ee558f04d0f7463d50b3494dad627ed7101fb
7
- data.tar.gz: ffb5e41e0325f51c1ae74cc2bb7739eb21b94dca87611f528067e70295a0cd03c956b7b3a75e32593b611abafd1007853917d32db9691d89ea4948df666ad7cc
6
+ metadata.gz: ec3be765a424028c986ea4812cf6f1485f04285beb2b9d8fffc774fc0b61108d4d6758a09a648132562752ab25904fb38f8ee57ecff90d0a70bca253150ed130
7
+ data.tar.gz: 2463def65eecbefed2bbfffc61e63e88dca2d0498078e83bc742811e540718e95e75f3896fa31b5bdc9068f5420906f389615470a86831dbcb5025824645775d
@@ -1,3 +1,18 @@
1
+ ### 0.5.11 / 2018-01-25
2
+ * [NEW] File `left.rb` added in `examples/general` folder for showing use of left-recursive rules.
3
+ * [NEW] File `right.rb` added in `examples/general` folder for showing use of right-recursive rules (less performant).
4
+ * [NEW] File `examples/general/SRL/lib/alternation.rb` Added support for alternation in regular expressions (|).
5
+ * [NEW] File `examples/general/SRL/lib/character.rb` Added support for single character in regular expressions.
6
+ * [NEW] File `examples/general/SRL/lib/char_class.rb` Added support for character class in regular expressions.
7
+ * [NEW] File `examples/general/SRL/lib/shorthand.rb` Added support for character class shorthand in regular expressions.
8
+ * [NEW] File `examples/general/SRL/lib/concatenation.rb` Added support for concatenation in regular expressions.
9
+ * [NEW] File `examples/general/SRL/lib/non_capturing_group.rb` Added support for non-capturing groups in regular expressions.
10
+ * [NEW] File `examples/general/SRL/lib/wildcard.rb` Added support for wildcards in regular expressions.
11
+ * [CHANGE] File `examples/general/SRL/grammar.rb` Increased coverage of Simple Regex Language parser.
12
+ * [CHANGE] File `examples/general/SRL/ast_builder.rb` Added transformation rules for constructing regular expressions.
13
+ * [CHANGE] File `examples/general/SRL/spec/integration_spec.rb` Added tests for SRL expressions.
14
+ * [FIX] Added a custom `inspect` method to several core classes. This was necessary because the default implementation from Ruby got lost with object graphs.
15
+
1
16
  ### 0.5.10 / 2017-12-02
2
17
  * [CHANGE] Dir `examples/general/SRL/` Added support for digit range to Simple Regex Language parser.
3
18
 
@@ -1,4 +1,4 @@
1
- Copyright (c) 2014-2017 Dimitri Geshef
1
+ Copyright (c) 2014-2018 Dimitri Geshef
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining a copy
4
4
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -440,6 +440,7 @@ actively curated by Andrei Beliankou (aka arbox).
440
440
 
441
441
  ## Thanks to:
442
442
  * Professor Keshav Pingali, one of the creators of the Grammar Flow Graph parsing approach for his encouraging e-mail exchange.
443
+ * [Arjun Menon](https://github.com/arjunmenon) for his NLP example that uses `engtagger` gem.
443
444
 
444
445
  ## Grammar Flow Graph
445
446
  Since the Grammar Flow Graph parsing approach is quite new, it has not yet taken a place in
@@ -452,5 +453,5 @@ standard parser textbooks. Here are a few references (and links) of papers on GF
452
453
 
453
454
  Copyright
454
455
  ---------
455
- Copyright (c) 2014-2017, Dimitri Geshef.
456
+ Copyright (c) 2014-2018, Dimitri Geshef.
456
457
  __Rley__ is released under the MIT License see [LICENSE.txt](https://github.com/famished-tiger/Rley/blob/master/LICENSE.txt) for details.
@@ -11,12 +11,13 @@ environment:
11
11
  - Ruby_version: 23
12
12
  - Ruby_version: 23-x64
13
13
  - Ruby_version: 24
14
- - Ruby_version: 24-x64
14
+ - Ruby_version: 24-x64
15
+
15
16
  install:
16
- - cmd: >-
17
- SET PATH=C:\Ruby%Ruby_version%\bin;%PATH%
17
+ - set PATH=C:\Ruby%Ruby_version%\bin;%PATH%
18
+ - bundle install --retry=3 --clean --force
18
19
 
19
- bundle install --retry=3 --clean
20
20
  build: off
21
+
21
22
  test_script:
22
- - cmd: bundle exec rake
23
+ - bundle exec rake
@@ -0,0 +1,176 @@
1
+ require "rley"
2
+ require "engtagger"
3
+ require "pp"
4
+
5
+ # REGEX to remove XML tags from Engtagger output
6
+ GET_TAG = /<(.+?)>(.*?)<.+?>/
7
+
8
+ # Text tokenizer
9
+ # Taken directly from Engtagger, will ensure uniform indexing while parsing
10
+ def clean_text(text)
11
+ return false unless valid_text(text)
12
+ text = text.toutf8
13
+ cleaned_text = text
14
+ tokenized = []
15
+ # Tokenize the text (splitting on punctuation as you go)
16
+ cleaned_text.split(/\s+/).each do |line|
17
+ tokenized += split_punct(line)
18
+ end
19
+ words = split_sentences(tokenized)
20
+ return words
21
+ end
22
+
23
+ def valid_text(text)
24
+ if !text
25
+ # there's nothing to parse
26
+ "method call on uninitialized variable" if @conf[:debug]
27
+ return false
28
+ elsif /\A\s*\z/ =~ text
29
+ # text is an empty string, nothing to parse
30
+ return false
31
+ else
32
+ # $text is valid
33
+ return true
34
+ end
35
+ end
36
+
37
+ def split_sentences(array)
38
+ tokenized = array
39
+ people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
40
+ supt det mssrs rev)
41
+ army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
42
+ inst = %w(dept univ assn bros ph.d)
43
+ place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
44
+ hwy hway la pde pd plz pl rd st tce)
45
+ comp = %w(mfg inc ltd co corp)
46
+ state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
47
+ ind ia kans kan ken ky la me md is mass mich minn miss mo mont
48
+ neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
49
+ va wash wis wisc wy wyo usafa alta man ont que sask yuk)
50
+ month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
51
+ misc = %w(vs etc no esp)
52
+ abbr = Hash.new
53
+ [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
54
+ abbr[i] = true
55
+ end
56
+ words = Array.new
57
+ tokenized.each_with_index do |t, i|
58
+ if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
59
+ w = $1
60
+ # Don't separate the period off words that
61
+ # meet any of the following conditions:
62
+ #
63
+ # 1. It is defined in one of the lists above
64
+ # 2. It is only one letter long: Alfred E. Sloan
65
+ # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
66
+ unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
67
+ words << w
68
+ words << '.'
69
+ next
70
+ end
71
+ end
72
+ words << tokenized[i]
73
+ end
74
+ # If the final word ends in a period..
75
+ if words[-1] and words[-1] =~ /\A(.*\w)\.\z/
76
+ words[-1] = $1
77
+ words.push '.'
78
+ end
79
+ return words
80
+ end
81
+
82
+ # Separate punctuation from words, where appropriate. This leaves trailing
83
+ # periods in place to be dealt with later. Called by the clean_text method.
84
+ def split_punct(text)
85
+ # If there's no punctuation, return immediately
86
+ return [text] if /\A\w+\z/ =~ text
87
+ # Sanity checks
88
+ text = text.gsub(/\W{10,}/o, " ")
89
+
90
+ # Put quotes into a standard format
91
+ text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
92
+ text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
93
+ text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
94
+ text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
95
+ text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
96
+
97
+ # Handle all other punctuation
98
+ text = text.gsub(/--+/o, " - ") # Convert and separate dashes
99
+ text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
100
+ text = text.gsub(/:/o, " :") # Shift semicolons off
101
+ text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
102
+ text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
103
+ text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
104
+
105
+ # English-specific contractions
106
+ text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
107
+ text = text.gsub(/n't\b/o, " n't") # Separate off n't
108
+ text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
109
+ result = text.split(' ')
110
+ return result
111
+ end
112
+
113
+
114
+ # Instantiate a builder object that will build the grammar for us
115
+ builder = Rley::Syntax::GrammarBuilder.new do
116
+
117
+ add_terminals('NN', 'NNP')
118
+ add_terminals('DET', 'IN', 'VBD')
119
+
120
+ # Here we define the productions (= grammar rules)
121
+ rule 'S' => %w[NP VP]
122
+ rule 'NP' => 'NNP'
123
+ rule 'NP' => %w[DET NN]
124
+ rule 'NP' => %w[DET NN PP]
125
+ rule 'VP' => %w[VBD NP]
126
+ rule 'VP' => %w[VBD NP PP]
127
+ rule 'PP' => %w[IN NP]
128
+ end
129
+
130
+ # And now, let's build the grammar...
131
+ grammar = builder.grammar
132
+
133
+ parser = Rley::Parser::GFGEarleyParser.new(grammar)
134
+
135
+ # text = "Yo I'm not done with you"
136
+ text= "John saw Mary with a telescope"
137
+ pp "Input text --> #{text}"
138
+
139
+ tgr = EngTagger.new
140
+
141
+ # Generte POS
142
+ tagged = tgr.add_tags(text)
143
+
144
+ # Generate tokenized lexicon of input text
145
+ # Instead of creating a lexicon dictionary, we would simply generate one each time on the fly for the current text only.
146
+ lexicon = clean_text(text)
147
+
148
+ # Generate POS tokens in [[word, pos], ..] format
149
+ tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
150
+
151
+ def tokenizer(lexicon, grammar, tokens)
152
+ rley_tokens = []
153
+ lexicon.each_with_index do |word, i|
154
+ term_name = tokens[i].last
155
+ terminal = grammar.name2symbol[term_name]
156
+ rley_tokens << Rley::Tokens::Token.new(word, terminal)
157
+ end
158
+ return rley_tokens
159
+ end
160
+
161
+ # Convert input text into a sequence of rley token objects...
162
+ rley_tokens = tokenizer(lexicon, grammar, tokens)
163
+
164
+ result = parser.parse(rley_tokens)
165
+
166
+ pp "Parsing successful? #{result.success?}" # => Parsing successful? true
167
+ pp result.failure_reason.message unless result.success?
168
+
169
+ ptree = result.parse_tree
170
+
171
+ visitor = Rley::ParseTreeVisitor.new(ptree)
172
+
173
+ renderer = Rley::Formatter::Asciitree.new($stdout)
174
+
175
+ # Subscribe the formatter to the visitor's event and launch the visit
176
+ pp renderer.render(visitor)
@@ -1,3 +1,4 @@
1
+ require 'stringio'
1
2
  require_relative 'ast_building'
2
3
  require_relative 'regex_repr'
3
4
 
@@ -35,27 +36,54 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
35
36
  # @param theChildren [Array] Children nodes (one per rhs symbol)
36
37
  def new_parent_node(aProduction, aRange, theTokens, theChildren)
37
38
  node = case aProduction.name
38
- when 'srl_0' # rule 'srl' => 'term'
39
+ when 'srl_0' # rule 'srl' => 'pattern'
40
+ return_first_child(aRange, theTokens, theChildren)
41
+
42
+ when 'pattern_0' # rule 'pattern' => %w[pattern COMMA quantifiable]
43
+ reduce_pattern_0(aProduction, aRange, theTokens, theChildren)
44
+
45
+ when 'pattern_1' # rule 'pattern' => %w[pattern quantifiable]
46
+ reduce_pattern_1(aProduction, aRange, theTokens, theChildren)
47
+
48
+ when 'pattern_2' # rule 'pattern' => 'quantifiable'
49
+ return_first_child(aRange, theTokens, theChildren)
50
+
51
+ when 'quantifiable_0' # rule 'quantifiable' => 'term'
39
52
  return_first_child(aRange, theTokens, theChildren)
40
53
 
54
+ when 'quantifiable_1' # rule 'quantifiable' = %w[term quantifier]
55
+ reduce_quantifiable_1(aProduction, aRange, theTokens, theChildren)
56
+
41
57
  when 'term_0' # rule 'term' => 'atom'
42
58
  return_first_child(aRange, theTokens, theChildren)
43
59
 
44
- when 'term_1' # rule 'term' => %w[atom quantifier]
45
- reduce_term_1(aProduction, aRange, theTokens, theChildren)
60
+ when 'term_1' # rule 'term' => 'alternation'
61
+ return_first_child(aRange, theTokens, theChildren)
62
+
63
+ when 'term_2' # rule 'term' => 'grouping'
64
+ return_first_child(aRange, theTokens, theChildren)
46
65
 
47
66
  when 'atom_0' # rule 'atom' => 'letter_range'
48
67
  return_first_child(aRange, theTokens, theChildren)
49
-
68
+
50
69
  when 'atom_1' # rule 'atom' => 'digit_range'
51
70
  return_first_child(aRange, theTokens, theChildren)
52
-
71
+
72
+ when 'atom_2' # rule 'atom' => 'character_class'
73
+ return_first_child(aRange, theTokens, theChildren)
74
+
75
+ when 'atom_3' # rule 'atom' => 'special_char'
76
+ return_first_child(aRange, theTokens, theChildren)
77
+
78
+ when 'atom_4' # rule 'atom' => 'literal'
79
+ return_first_child(aRange, theTokens, theChildren)
80
+
53
81
  # rule 'letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]
54
- when 'letter_range_0'
82
+ when 'letter_range_0'
55
83
  reduce_letter_range_0(aProduction, aRange, theTokens, theChildren)
56
84
 
57
- #rule 'letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]
58
- when 'letter_range_1'
85
+ #rule 'letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]
86
+ when 'letter_range_1'
59
87
  reduce_letter_range_1(aProduction, aRange, theTokens, theChildren)
60
88
 
61
89
  when 'letter_range_2' # rule 'letter_range' => 'LETTER'
@@ -65,12 +93,60 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
65
93
  reduce_letter_range_3(aProduction, aRange, theTokens, theChildren)
66
94
 
67
95
  # rule 'digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]
68
- when 'digit_range_0'
96
+ when 'digit_range_0'
69
97
  reduce_digit_range_0(aProduction, aRange, theTokens, theChildren)
70
98
 
71
- when 'digit_range_1' #rule 'digit_range' => 'digit_or_number'
99
+ when 'digit_range_1' # rule 'digit_range' => 'digit_or_number'
72
100
  reduce_digit_range_1(aProduction, aRange, theTokens, theChildren)
73
101
 
102
+ when 'character_class_0' # rule 'character_class' => %w[ANY CHARACTER]
103
+ reduce_character_class_0(aProduction, aRange, theTokens, theChildren)
104
+
105
+ when 'character_class_1' # rule 'character_class' => %w[NO CHARACTER]
106
+ reduce_character_class_1(aProduction, aRange, theTokens, theChildren)
107
+
108
+ when 'character_class_2' # rule 'character_class' => 'WHITESPACE'
109
+ reduce_character_class_2(aProduction, aRange, theTokens, theChildren)
110
+
111
+ when 'character_class_3' # rule 'character_class' => %w[NO WHITESPACE]
112
+ reduce_character_class_3(aProduction, aRange, theTokens, theChildren)
113
+
114
+ when 'character_class_4' # rule 'character_class' => 'ANYTHING'
115
+ reduce_character_class_4(aProduction, aRange, theTokens, theChildren)
116
+
117
+ when 'character_class_5' # rule 'character_class' => %w[ONE OF STRING_LIT]
118
+ reduce_character_class_5(aProduction, aRange, theTokens, theChildren)
119
+
120
+ when 'special_char_0' # rule 'special_char' => 'TAB'
121
+ reduce_special_char_0(aProduction, aRange, theTokens, theChildren)
122
+
123
+ when 'special_char_1' # rule 'special_char' => 'BACKSLASH'
124
+ reduce_special_char_1(aProduction, aRange, theTokens, theChildren)
125
+
126
+ when 'special_char_2' # rule 'special_char' => %w[NEW LINE]
127
+ reduce_special_char_2(aProduction, aRange, theTokens, theChildren)
128
+
129
+ when 'literal_0' # rule 'literal' => %[LITERALLY STRING_LIT]
130
+ reduce_literal_0(aProduction, aRange, theTokens, theChildren)
131
+
132
+ # rule 'alternation' => %w[ANY OF LPAREN alternatives RPAREN]
133
+ when 'alternation_0'
134
+ reduce_alternation_0(aProduction, aRange, theTokens, theChildren)
135
+
136
+ # rule 'alternatives' => %w[alternatives COMMA quantifiable]
137
+ when 'alternatives_0'
138
+ reduce_alternatives_0(aProduction, aRange, theTokens, theChildren)
139
+
140
+ # rule 'alternatives' => %w[alternatives quantifiable]
141
+ when 'alternatives_1'
142
+ reduce_alternatives_1(aProduction, aRange, theTokens, theChildren)
143
+
144
+ when 'alternatives_2' # rule 'alternatives' => 'quantifiable'
145
+ reduce_alternatives_2(aProduction, aRange, theTokens, theChildren)
146
+
147
+ when 'grouping' # rule 'grouping' => %w[LPAREN pattern RPAREN]
148
+ reduce_grouping_0(aProduction, aRange, theTokens, theChildren)
149
+
74
150
  when 'quantifier_0' # rule 'quantifier' => 'ONCE'
75
151
  multiplicity(1, 1)
76
152
 
@@ -81,7 +157,7 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
81
157
  reduce_quantifier_2(aProduction, aRange, theTokens, theChildren)
82
158
 
83
159
  # rule 'quantifier' => %w[BETWEEN count AND count times_suffix]
84
- when 'quantifier_3'
160
+ when 'quantifier_3'
85
161
  reduce_quantifier_3(aProduction, aRange, theTokens, theChildren)
86
162
 
87
163
  when 'quantifier_4' # rule 'quantifier' => 'OPTIONAL'
@@ -95,10 +171,10 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
95
171
 
96
172
  when 'quantifier_7' # rule 'quantifier' => %w[AT LEAST count TIMES]
97
173
  reduce_quantifier_7(aProduction, aRange, theTokens, theChildren)
98
-
174
+
99
175
  # rule 'digit_or_number' => 'DIGIT'
100
176
  # rule 'digit_or_number' => 'NUMER'
101
- when 'digit_or_number_0', 'digit_or_number_1'
177
+ when 'digit_or_number_0', 'digit_or_number_1'
102
178
  return_first_child(aRange, theTokens, theChildren)
103
179
 
104
180
  when 'count_0', 'count_1'
@@ -117,6 +193,28 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
117
193
  return SRL::Regex::Multiplicity.new(lowerBound, upperBound, :greedy)
118
194
  end
119
195
 
196
+ def string_literal(aString, to_escape = true)
197
+ if aString.size > 1
198
+ chars = []
199
+ aString.each_char do |ch|
200
+ if to_escape && Regex::Character::MetaChars.include?(ch)
201
+ chars << Regex::Character.new("\\")
202
+ end
203
+ chars << Regex::Character.new(ch)
204
+ end
205
+ result = Regex::Concatenation.new(*chars)
206
+ else
207
+ if to_escape && Regex::Character::MetaChars.include?(aString)
208
+ result = Regex::Concatenation.new(Regex::Character.new("\\"),
209
+ Regex::Character.new(aString))
210
+ else
211
+ result = Regex::Character.new(aString)
212
+ end
213
+ end
214
+
215
+ return result
216
+ end
217
+
120
218
  def char_range(lowerBound, upperBound)
121
219
  # TODO fix module nesting
122
220
  lower = Regex::Character.new(lowerBound)
@@ -128,15 +226,33 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
128
226
  Regex::CharClass.new(toNegate, *theChildren)
129
227
  end
130
228
 
229
+ def char_shorthand(shortName)
230
+ Regex::CharShorthand.new(shortName)
231
+ end
232
+
233
+ def wildcard()
234
+ Regex::Wildcard.new
235
+ end
236
+
131
237
  def repetition(expressionToRepeat, aMultiplicity)
132
238
  return Regex::Repetition.new(expressionToRepeat, aMultiplicity)
133
239
  end
240
+
241
+ # rule 'pattern' => %w[pattern COMMA quantifiable]
242
+ def reduce_pattern_0(aProduction, aRange, theTokens, theChildren)
243
+ return Regex::Concatenation.new(theChildren[0], theChildren[2])
244
+ end
245
+
246
+ # rule 'pattern' => %w[pattern quantifiable]
247
+ def reduce_pattern_1(aProduction, aRange, theTokens, theChildren)
248
+ return Regex::Concatenation.new(theChildren[0], theChildren[1])
249
+ end
134
250
 
135
- # rule 'term' => %w[atom quantifier]
136
- def reduce_term_1(aProduction, aRange, theTokens, theChildren)
251
+ # rule 'quantifiable' => %w[term quantifier]
252
+ def reduce_quantifiable_1(aProduction, aRange, theTokens, theChildren)
137
253
  quantifier = theChildren.last
138
- atom = theChildren.first
139
- repetition(atom, quantifier)
254
+ term = theChildren.first
255
+ repetition(term, quantifier)
140
256
  end
141
257
 
142
258
  # rule 'letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]
@@ -166,7 +282,7 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
166
282
  ch_range = char_range('A', 'Z')
167
283
  char_class(false, ch_range)
168
284
  end
169
-
285
+
170
286
  # rule 'digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]
171
287
  def reduce_digit_range_0(aProduction, aRange, theTokens, theChildren)
172
288
  reduce_letter_range_0(aProduction, aRange, theTokens, theChildren)
@@ -174,15 +290,95 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
174
290
 
175
291
  # rule 'digit_range' => 'digit_or_number'
176
292
  def reduce_digit_range_1(aProduction, aRange, theTokens, theChildren)
177
- ch_range = char_range('0', '9')
178
- char_class(false, ch_range)
293
+ char_shorthand('d')
294
+ end
295
+
296
+ # rule 'character_class' => %w[ANY CHARACTER]
297
+ def reduce_character_class_0(aProduction, aRange, theTokens, theChildren)
298
+ char_shorthand('w')
299
+ end
300
+
301
+ # rule 'character_class' => %w[NO CHARACTER]
302
+ def reduce_character_class_1(aProduction, aRange, theTokens, theChildren)
303
+ char_shorthand('W')
304
+ end
305
+
306
+ # rule 'character_class' => 'WHITESPACE'
307
+ def reduce_character_class_2(aProduction, aRange, theTokens, theChildren)
308
+ char_shorthand('s')
309
+ end
310
+
311
+ # rule 'character_class' => %w[NO WHITESPACE]
312
+ def reduce_character_class_3(aProduction, aRange, theTokens, theChildren)
313
+ char_shorthand('S')
314
+ end
315
+
316
+ # rule 'character_class' => 'ANYTHING'
317
+ def reduce_character_class_4(aProduction, aRange, theTokens, theChildren)
318
+ wildcard
319
+ end
320
+
321
+ # rule 'character_class' => %w[ONE OF STRING_LIT]
322
+ def reduce_character_class_5(aProduction, aRange, theTokens, theChildren)
323
+ raw_literal = theChildren[-1].token.lexeme.dup
324
+ alternatives = raw_literal.chars.map { |ch| Regex::Character.new(ch) }
325
+ return Regex::CharClass.new(false, *alternatives) # TODO check other implementations
179
326
  end
180
327
 
328
+ # rule 'special_char' => 'TAB'
329
+ def reduce_special_char_0(aProduction, aRange, theTokens, theChildren)
330
+ Regex::Character.new('\t')
331
+ end
332
+
333
+ # rule 'special_char' => 'BACKSLASH'
334
+ def reduce_special_char_1(aProduction, aRange, theTokens, theChildren)
335
+ Regex::Character.new('\\')
336
+ end
337
+
338
+ # rule 'special_char' => %w[NEW LINE]
339
+ def reduce_special_char_2(aProduction, aRange, theTokens, theChildren)
340
+ # TODO: control portability
341
+ Regex::Character.new('\n')
342
+ end
343
+
344
+ # rule 'literal' => %[LITERALLY STRING_LIT]
345
+ def reduce_literal_0(aProduction, aRange, theTokens, theChildren)
346
+ # What if literal is empty?...
347
+
348
+ raw_literal = theChildren[-1].token.lexeme.dup
349
+ return string_literal(raw_literal)
350
+ end
351
+
352
+ # rule 'alternation' => %w[ANY OF LPAREN alternatives RPAREN]
353
+ def reduce_alternation_0(aProduction, aRange, theTokens, theChildren)
354
+ return Regex::Alternation.new(*theChildren[3])
355
+ end
356
+
357
+ # rule 'alternatives' => %w[alternatives COMMA quantifiable]
358
+ def reduce_alternatives_0(aProduction, aRange, theTokens, theChildren)
359
+ return theChildren[0] << theChildren[-1]
360
+ end
361
+
362
+ # rule 'alternatives' => %w[alternatives quantifiable]
363
+ def reduce_alternatives_1(aProduction, aRange, theTokens, theChildren)
364
+ return theChildren[0] << theChildren[-1]
365
+ end
366
+
367
+ # rule 'alternatives' => 'quantifiable'
368
+ def reduce_alternatives_2(aProduction, aRange, theTokens, theChildren)
369
+ return [theChildren.last]
370
+ end
371
+
372
+ # rule 'grouping' => %w[LPAREN pattern RPAREN]
373
+ def reduce_grouping_0(aProduction, aRange, theTokens, theChildren)
374
+ return Regex::NonCapturingGroup.new(theChildren[1])
375
+ end
376
+
181
377
  # rule 'quantifier' => %w[EXACTLY count TIMES]
182
378
  def reduce_quantifier_2(aProduction, aRange, theTokens, theChildren)
183
379
  count = theChildren[1].token.lexeme.to_i
184
380
  multiplicity(count, count)
185
- end
381
+ end
186
382
 
187
383
  # rule 'quantifier' => %w[BETWEEN count AND count times_suffix]
188
384
  def reduce_quantifier_3(aProduction, aRange, theTokens, theChildren)