rley 0.5.10 → 0.5.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +15 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +2 -1
  5. data/appveyor.yml +6 -5
  6. data/examples/NLP/engtagger.rb +176 -0
  7. data/examples/general/SRL/lib/ast_builder.rb +217 -21
  8. data/examples/general/SRL/lib/grammar.rb +33 -5
  9. data/examples/general/SRL/lib/regex/alternation.rb +30 -0
  10. data/examples/general/SRL/lib/regex/char_class.rb +28 -22
  11. data/examples/general/SRL/lib/regex/char_shorthand.rb +50 -0
  12. data/examples/general/SRL/lib/regex/character.rb +5 -3
  13. data/examples/general/SRL/lib/regex/concatenation.rb +32 -0
  14. data/examples/general/SRL/lib/regex/non_capturing_group.rb +29 -0
  15. data/examples/general/SRL/lib/regex/wildcard.rb +26 -0
  16. data/examples/general/SRL/lib/regex_repr.rb +5 -0
  17. data/examples/general/SRL/lib/tokenizer.rb +28 -3
  18. data/examples/general/SRL/spec/integration_spec.rb +151 -8
  19. data/examples/general/SRL/spec/tokenizer_spec.rb +12 -0
  20. data/examples/general/left.rb +36 -0
  21. data/examples/general/right.rb +36 -0
  22. data/lib/rley/constants.rb +1 -1
  23. data/lib/rley/gfg/edge.rb +12 -1
  24. data/lib/rley/gfg/grm_flow_graph.rb +21 -1
  25. data/lib/rley/gfg/item_vertex.rb +1 -1
  26. data/lib/rley/gfg/non_terminal_vertex.rb +1 -1
  27. data/lib/rley/gfg/start_vertex.rb +1 -0
  28. data/lib/rley/gfg/vertex.rb +27 -0
  29. data/lib/rley/lexical/token.rb +1 -0
  30. data/lib/rley/parser/error_reason.rb +2 -1
  31. data/lib/rley/parser/gfg_chart.rb +14 -0
  32. data/lib/rley/parser/gfg_earley_parser.rb +0 -1
  33. data/lib/rley/parser/gfg_parsing.rb +4 -3
  34. data/lib/rley/parser/parse_entry.rb +33 -3
  35. data/lib/rley/parser/parse_entry_set.rb +14 -2
  36. data/lib/rley/parser/parse_tree_builder.rb +1 -1
  37. data/lib/rley/parser/parse_walker_factory.rb +0 -1
  38. data/lib/rley/syntax/grm_symbol.rb +2 -0
  39. data/lib/rley/syntax/production.rb +15 -3
  40. data/lib/rley/syntax/symbol_seq.rb +16 -1
  41. data/spec/rley/gfg/end_vertex_spec.rb +9 -1
  42. data/spec/rley/gfg/grm_flow_graph_spec.rb +9 -0
  43. data/spec/rley/gfg/item_vertex_spec.rb +9 -0
  44. data/spec/rley/gfg/start_vertex_spec.rb +9 -1
  45. data/spec/rley/parser/gfg_parsing_spec.rb +0 -1
  46. data/spec/rley/parser/parse_entry_set_spec.rb +15 -0
  47. data/spec/rley/parser/parse_entry_spec.rb +24 -13
  48. data/spec/rley/parser/parse_tracer_spec.rb +1 -1
  49. data/spec/rley/syntax/production_spec.rb +10 -0
  50. data/spec/rley/syntax/symbol_seq_spec.rb +5 -0
  51. metadata +10 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ce33adee17693fccafcc29ce8340f57694c229fb
4
- data.tar.gz: 8acb3af15c3bd7c292209e3e1cc2e81b20ba3d9f
3
+ metadata.gz: 0ec06612f299302b861fbaeb04b75c0040a026cf
4
+ data.tar.gz: d68438efcbacceb2ae4319ac268492e93db35265
5
5
  SHA512:
6
- metadata.gz: dba9c01e5cb72954185ec5b2a973a7983202bc1733808637bd887feb6a473a57c6b3ef155090dec47f848b0e367ee558f04d0f7463d50b3494dad627ed7101fb
7
- data.tar.gz: ffb5e41e0325f51c1ae74cc2bb7739eb21b94dca87611f528067e70295a0cd03c956b7b3a75e32593b611abafd1007853917d32db9691d89ea4948df666ad7cc
6
+ metadata.gz: ec3be765a424028c986ea4812cf6f1485f04285beb2b9d8fffc774fc0b61108d4d6758a09a648132562752ab25904fb38f8ee57ecff90d0a70bca253150ed130
7
+ data.tar.gz: 2463def65eecbefed2bbfffc61e63e88dca2d0498078e83bc742811e540718e95e75f3896fa31b5bdc9068f5420906f389615470a86831dbcb5025824645775d
@@ -1,3 +1,18 @@
1
+ ### 0.5.11 / 2018-01-25
2
+ * [NEW] File `left.rb` added in `examples/general` folder for showing use of left-recursive rules.
3
+ * [NEW] File `right.rb` added in `examples/general` folder for showing use of right-recursive rules (less performant).
4
+ * [NEW] File `examples/general/SRL/lib/alternation.rb Added support for alternation in regular expressions (|).
5
+ * [NEW] File `examples/general/SRL/lib/character.rb Added support for single character in regular expressions.
6
+ * [NEW] File `examples/general/SRL/lib/char_class.rb Added support for character class in regular expressions.
7
+ * [NEW] File `examples/general/SRL/lib/shorthand.rb Added support for character class shorthand in regular expressions.
8
+ * [NEW] File `examples/general/SRL/lib/concatenation.rb Added support for concatenation in regular expressions.
9
+ * [NEW] File `examples/general/SRL/lib/non_capturing_group.rb Added support for non-capturing groups in regular expressions.
10
+ * [NEW] File `examples/general/SRL/lib/wildcard.rb Added support for wildcards in regular expressions.
11
+ * [CHANGE] File `examples/general/SRL/grammar.rb increased coverage of Simple Regex Language parser.
12
+ * [CHANGE] File `examples/general/SRL/ast_builder.rb Added transformation rules for constructing regular expressions.
13
+ * [CHANGE] File `examples/general/SRL/spec/integration_spec.rb Added tests for SRL expressions.
14
+ * [FIX] Added a custom `inspect` method to several core classes. This was necessary because default implementation from Ruby got lost with object graphs.
15
+
1
16
  ### 0.5.10 / 2017-12-02
2
17
  * [CHANGE] Dir `examples/general/SRL/ Added support for digit range to Simple Regex Language parser.
3
18
 
@@ -1,4 +1,4 @@
1
- Copyright (c) 2014-2017 Dimitri Geshef
1
+ Copyright (c) 2014-2018 Dimitri Geshef
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining a copy
4
4
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -440,6 +440,7 @@ actively curated by Andrei Beliankou (aka arbox).
440
440
 
441
441
  ## Thanks to:
442
442
  * Professor Keshav Pingali, one of the creators of the Grammar Flow Graph parsing approach for his encouraging e-mail exchange.
443
+ * [Arjun Menon](https://github.com/arjunmenon) for his NLP example that uses `engtagger` gem.
443
444
 
444
445
  ## Grammar Flow Graph
445
446
  Since the Grammar Flow Graph parsing approach is quite new, it has not yet taken a place in
@@ -452,5 +453,5 @@ standard parser textbooks. Here are a few references (and links) of papers on GF
452
453
 
453
454
  Copyright
454
455
  ---------
455
- Copyright (c) 2014-2017, Dimitri Geshef.
456
+ Copyright (c) 2014-2018, Dimitri Geshef.
456
457
  __Rley__ is released under the MIT License see [LICENSE.txt](https://github.com/famished-tiger/Rley/blob/master/LICENSE.txt) for details.
@@ -11,12 +11,13 @@ environment:
11
11
  - Ruby_version: 23
12
12
  - Ruby_version: 23-x64
13
13
  - Ruby_version: 24
14
- - Ruby_version: 24-x64
14
+ - Ruby_version: 24-x64
15
+
15
16
  install:
16
- - cmd: >-
17
- SET PATH=C:\Ruby%Ruby_version%\bin;%PATH%
17
+ - set PATH=C:\Ruby%Ruby_version%\bin;%PATH%
18
+ - bundle install --retry=3 --clean --force
18
19
 
19
- bundle install --retry=3 --clean
20
20
  build: off
21
+
21
22
  test_script:
22
- - cmd: bundle exec rake
23
+ - bundle exec rake
@@ -0,0 +1,176 @@
1
+ require "rley"
2
+ require "engtagger"
3
+ require "pp"
4
+
5
+ # REGEX to remove XML tags from Engtagger output
6
+ GET_TAG = /<(.+?)>(.*?)<.+?>/
7
+
8
+ # Text tokenizer
9
+ # Taken directly from Engtagger, will ensure uniform indexing while parsing
10
+ def clean_text(text)
11
+ return false unless valid_text(text)
12
+ text = text.toutf8
13
+ cleaned_text = text
14
+ tokenized = []
15
+ # Tokenize the text (splitting on punctuation as you go)
16
+ cleaned_text.split(/\s+/).each do |line|
17
+ tokenized += split_punct(line)
18
+ end
19
+ words = split_sentences(tokenized)
20
+ return words
21
+ end
22
+
23
+ def valid_text(text)
24
+ if !text
25
+ # there's nothing to parse
26
+ "method call on uninitialized variable" if @conf[:debug]
27
+ return false
28
+ elsif /\A\s*\z/ =~ text
29
+ # text is an empty string, nothing to parse
30
+ return false
31
+ else
32
+ # $text is valid
33
+ return true
34
+ end
35
+ end
36
+
37
+ def split_sentences(array)
38
+ tokenized = array
39
+ people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
40
+ supt det mssrs rev)
41
+ army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
42
+ inst = %w(dept univ assn bros ph.d)
43
+ place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
44
+ hwy hway la pde pd plz pl rd st tce)
45
+ comp = %w(mfg inc ltd co corp)
46
+ state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
47
+ ind ia kans kan ken ky la me md is mass mich minn miss mo mont
48
+ neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
49
+ va wash wis wisc wy wyo usafa alta man ont que sask yuk)
50
+ month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
51
+ misc = %w(vs etc no esp)
52
+ abbr = Hash.new
53
+ [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
54
+ abbr[i] = true
55
+ end
56
+ words = Array.new
57
+ tokenized.each_with_index do |t, i|
58
+ if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
59
+ w = $1
60
+ # Don't separate the period off words that
61
+ # meet any of the following conditions:
62
+ #
63
+ # 1. It is defined in one of the lists above
64
+ # 2. It is only one letter long: Alfred E. Sloan
65
+ # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
66
+ unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
67
+ words << w
68
+ words << '.'
69
+ next
70
+ end
71
+ end
72
+ words << tokenized[i]
73
+ end
74
+ # If the final word ends in a period..
75
+ if words[-1] and words[-1] =~ /\A(.*\w)\.\z/
76
+ words[-1] = $1
77
+ words.push '.'
78
+ end
79
+ return words
80
+ end
81
+
82
+ # Separate punctuation from words, where appropriate. This leaves trailing
83
+ # periods in place to be dealt with later. Called by the clean_text method.
84
+ def split_punct(text)
85
+ # If there's no punctuation, return immediately
86
+ return [text] if /\A\w+\z/ =~ text
87
+ # Sanity checks
88
+ text = text.gsub(/\W{10,}/o, " ")
89
+
90
+ # Put quotes into a standard format
91
+ text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
92
+ text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
93
+ text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
94
+ text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
95
+ text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
96
+
97
+ # Handle all other punctuation
98
+ text = text.gsub(/--+/o, " - ") # Convert and separate dashes
99
+ text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
100
+ text = text.gsub(/:/o, " :") # Shift semicolons off
101
+ text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
102
+ text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
103
+ text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
104
+
105
+ # English-specific contractions
106
+ text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
107
+ text = text.gsub(/n't\b/o, " n't") # Separate off n't
108
+ text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
109
+ result = text.split(' ')
110
+ return result
111
+ end
112
+
113
+
114
+ # Instantiate a builder object that will build the grammar for us
115
+ builder = Rley::Syntax::GrammarBuilder.new do
116
+
117
+ add_terminals('NN', 'NNP')
118
+ add_terminals('DET', 'IN', 'VBD')
119
+
120
+ # Here we define the productions (= grammar rules)
121
+ rule 'S' => %w[NP VP]
122
+ rule 'NP' => 'NNP'
123
+ rule 'NP' => %w[DET NN]
124
+ rule 'NP' => %w[DET NN PP]
125
+ rule 'VP' => %w[VBD NP]
126
+ rule 'VP' => %w[VBD NP PP]
127
+ rule 'PP' => %w[IN NP]
128
+ end
129
+
130
+ # And now, let's build the grammar...
131
+ grammar = builder.grammar
132
+
133
+ parser = Rley::Parser::GFGEarleyParser.new(grammar)
134
+
135
+ # text = "Yo I'm not done with you"
136
+ text= "John saw Mary with a telescope"
137
+ pp "Input text --> #{text}"
138
+
139
+ tgr = EngTagger.new
140
+
141
+ # Generte POS
142
+ tagged = tgr.add_tags(text)
143
+
144
+ # Generte tokenied lexicon of input text
145
+ # Instead of creating a lexicon dictionary, we would simply generate one each time on the fly for the current text only.
146
+ lexicon = clean_text(text)
147
+
148
+ # Generte POS tokens in [[word, pos], ..] format
149
+ tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
150
+
151
+ def tokenizer(lexicon, grammar, tokens)
152
+ rley_tokens = []
153
+ lexicon.each_with_index do |word, i|
154
+ term_name = tokens[i].last
155
+ terminal = grammar.name2symbol[term_name]
156
+ rley_tokens << Rley::Tokens::Token.new(word, terminal)
157
+ end
158
+ return rley_tokens
159
+ end
160
+
161
+ # Convert input text into a sequence of rley token objects...
162
+ rley_tokens = tokenizer(lexicon, grammar, tokens)
163
+
164
+ result = parser.parse(rley_tokens)
165
+
166
+ pp "Parsing successful? #{result.success?}" # => Parsing successful? true
167
+ pp result.failure_reason.message unless result.success?
168
+
169
+ ptree = result.parse_tree
170
+
171
+ visitor = Rley::ParseTreeVisitor.new(ptree)
172
+
173
+ renderer = Rley::Formatter::Asciitree.new($stdout)
174
+
175
+ # Subscribe the formatter to the visitor's event and launch the visit
176
+ pp renderer.render(visitor)
@@ -1,3 +1,4 @@
1
+ require 'stringio'
1
2
  require_relative 'ast_building'
2
3
  require_relative 'regex_repr'
3
4
 
@@ -35,27 +36,54 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
35
36
  # @param theChildren [Array] Children nodes (one per rhs symbol)
36
37
  def new_parent_node(aProduction, aRange, theTokens, theChildren)
37
38
  node = case aProduction.name
38
- when 'srl_0' # rule 'srl' => 'term'
39
+ when 'srl_0' # rule 'srl' => 'pattern'
40
+ return_first_child(aRange, theTokens, theChildren)
41
+
42
+ when 'pattern_0' # rule 'pattern' => %w[pattern COMMA quantifiable]
43
+ reduce_pattern_0(aProduction, aRange, theTokens, theChildren)
44
+
45
+ when 'pattern_1' # rule 'pattern' => %w[pattern quantifiable]
46
+ reduce_pattern_1(aProduction, aRange, theTokens, theChildren)
47
+
48
+ when 'pattern_2' # rule 'pattern' => 'quantifiable'
49
+ return_first_child(aRange, theTokens, theChildren)
50
+
51
+ when 'quantifiable_0' # rule 'quantifiable' => 'term'
39
52
  return_first_child(aRange, theTokens, theChildren)
40
53
 
54
+ when 'quantifiable_1' # rule 'quantifiable' = %w[term quantifier]
55
+ reduce_quantifiable_1(aProduction, aRange, theTokens, theChildren)
56
+
41
57
  when 'term_0' # rule 'term' => 'atom'
42
58
  return_first_child(aRange, theTokens, theChildren)
43
59
 
44
- when 'term_1' # rule 'term' => %w[atom quantifier]
45
- reduce_term_1(aProduction, aRange, theTokens, theChildren)
60
+ when 'term_1' # rule 'term' => 'alternation'
61
+ return_first_child(aRange, theTokens, theChildren)
62
+
63
+ when 'term_2' # rule 'term' => 'grouping'
64
+ return_first_child(aRange, theTokens, theChildren)
46
65
 
47
66
  when 'atom_0' # rule 'atom' => 'letter_range'
48
67
  return_first_child(aRange, theTokens, theChildren)
49
-
68
+
50
69
  when 'atom_1' # rule 'atom' => 'digit_range'
51
70
  return_first_child(aRange, theTokens, theChildren)
52
-
71
+
72
+ when 'atom_2' # rule 'atom' => 'character_class'
73
+ return_first_child(aRange, theTokens, theChildren)
74
+
75
+ when 'atom_3' # rule 'atom' => 'special_char'
76
+ return_first_child(aRange, theTokens, theChildren)
77
+
78
+ when 'atom_4' # rule 'atom' => 'literal'
79
+ return_first_child(aRange, theTokens, theChildren)
80
+
53
81
  # rule 'letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]
54
- when 'letter_range_0'
82
+ when 'letter_range_0'
55
83
  reduce_letter_range_0(aProduction, aRange, theTokens, theChildren)
56
84
 
57
- #rule 'letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]
58
- when 'letter_range_1'
85
+ #rule 'letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]
86
+ when 'letter_range_1'
59
87
  reduce_letter_range_1(aProduction, aRange, theTokens, theChildren)
60
88
 
61
89
  when 'letter_range_2' # rule 'letter_range' => 'LETTER'
@@ -65,12 +93,60 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
65
93
  reduce_letter_range_3(aProduction, aRange, theTokens, theChildren)
66
94
 
67
95
  # rule 'digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]
68
- when 'digit_range_0'
96
+ when 'digit_range_0'
69
97
  reduce_digit_range_0(aProduction, aRange, theTokens, theChildren)
70
98
 
71
- when 'digit_range_1' #rule 'digit_range' => 'digit_or_number'
99
+ when 'digit_range_1' # rule 'digit_range' => 'digit_or_number'
72
100
  reduce_digit_range_1(aProduction, aRange, theTokens, theChildren)
73
101
 
102
+ when 'character_class_0' # rule 'character_class' => %w[ANY CHARACTER]
103
+ reduce_character_class_0(aProduction, aRange, theTokens, theChildren)
104
+
105
+ when 'character_class_1' # rule 'character_class' => %w[NO CHARACTER]
106
+ reduce_character_class_1(aProduction, aRange, theTokens, theChildren)
107
+
108
+ when 'character_class_2' # rule 'character_class' => 'WHITESPACE'
109
+ reduce_character_class_2(aProduction, aRange, theTokens, theChildren)
110
+
111
+ when 'character_class_3' # rule 'character_class' => %w[NO WHITESPACE]
112
+ reduce_character_class_3(aProduction, aRange, theTokens, theChildren)
113
+
114
+ when 'character_class_4' # rule 'character_class' => 'ANYTHING'
115
+ reduce_character_class_4(aProduction, aRange, theTokens, theChildren)
116
+
117
+ when 'character_class_5' # rule 'character_class' => %w[ONE OF STRING_LIT]
118
+ reduce_character_class_5(aProduction, aRange, theTokens, theChildren)
119
+
120
+ when 'special_char_0' # rule 'special_char' => 'TAB'
121
+ reduce_special_char_0(aProduction, aRange, theTokens, theChildren)
122
+
123
+ when 'special_char_1' # rule 'special_char' => 'BACKSLASH'
124
+ reduce_special_char_1(aProduction, aRange, theTokens, theChildren)
125
+
126
+ when 'special_char_2' # rule 'special_char' => %w[NEW LINE]
127
+ reduce_special_char_2(aProduction, aRange, theTokens, theChildren)
128
+
129
+ when 'literal_0' # rule 'literal' => %[LITERALLY STRING_LIT]
130
+ reduce_literal_0(aProduction, aRange, theTokens, theChildren)
131
+
132
+ # rule 'alternation' => %w[ANY OF LPAREN alternatives RPAREN]
133
+ when 'alternation_0'
134
+ reduce_alternation_0(aProduction, aRange, theTokens, theChildren)
135
+
136
+ # rule 'alternatives' => %w[alternatives COMMA quantifiable]
137
+ when 'alternatives_0'
138
+ reduce_alternatives_0(aProduction, aRange, theTokens, theChildren)
139
+
140
+ # rule 'alternatives' => %w[alternatives quantifiable]
141
+ when 'alternatives_1'
142
+ reduce_alternatives_1(aProduction, aRange, theTokens, theChildren)
143
+
144
+ when 'alternatives_2' # rule 'alternatives' => 'quantifiable'
145
+ reduce_alternatives_2(aProduction, aRange, theTokens, theChildren)
146
+
147
+ when 'grouping' # rule 'grouping' => %w[LPAREN pattern RPAREN]
148
+ reduce_grouping_0(aProduction, aRange, theTokens, theChildren)
149
+
74
150
  when 'quantifier_0' # rule 'quantifier' => 'ONCE'
75
151
  multiplicity(1, 1)
76
152
 
@@ -81,7 +157,7 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
81
157
  reduce_quantifier_2(aProduction, aRange, theTokens, theChildren)
82
158
 
83
159
  # rule 'quantifier' => %w[BETWEEN count AND count times_suffix]
84
- when 'quantifier_3'
160
+ when 'quantifier_3'
85
161
  reduce_quantifier_3(aProduction, aRange, theTokens, theChildren)
86
162
 
87
163
  when 'quantifier_4' # rule 'quantifier' => 'OPTIONAL'
@@ -95,10 +171,10 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
95
171
 
96
172
  when 'quantifier_7' # rule 'quantifier' => %w[AT LEAST count TIMES]
97
173
  reduce_quantifier_7(aProduction, aRange, theTokens, theChildren)
98
-
174
+
99
175
  # rule 'digit_or_number' => 'DIGIT'
100
176
  # rule 'digit_or_number' => 'NUMER'
101
- when 'digit_or_number_0', 'digit_or_number_1'
177
+ when 'digit_or_number_0', 'digit_or_number_1'
102
178
  return_first_child(aRange, theTokens, theChildren)
103
179
 
104
180
  when 'count_0', 'count_1'
@@ -117,6 +193,28 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
117
193
  return SRL::Regex::Multiplicity.new(lowerBound, upperBound, :greedy)
118
194
  end
119
195
 
196
+ def string_literal(aString, to_escape = true)
197
+ if aString.size > 1
198
+ chars = []
199
+ aString.each_char do |ch|
200
+ if to_escape && Regex::Character::MetaChars.include?(ch)
201
+ chars << Regex::Character.new("\\")
202
+ end
203
+ chars << Regex::Character.new(ch)
204
+ end
205
+ result = Regex::Concatenation.new(*chars)
206
+ else
207
+ if to_escape && Regex::Character::MetaChars.include?(aString)
208
+ result = Regex::Concatenation.new(Regex::Character.new("\\"),
209
+ Regex::Character.new(aString))
210
+ else
211
+ result = Regex::Character.new(aString)
212
+ end
213
+ end
214
+
215
+ return result
216
+ end
217
+
120
218
  def char_range(lowerBound, upperBound)
121
219
  # TODO fix module nesting
122
220
  lower = Regex::Character.new(lowerBound)
@@ -128,15 +226,33 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
128
226
  Regex::CharClass.new(toNegate, *theChildren)
129
227
  end
130
228
 
229
+ def char_shorthand(shortName)
230
+ Regex::CharShorthand.new(shortName)
231
+ end
232
+
233
+ def wildcard()
234
+ Regex::Wildcard.new
235
+ end
236
+
131
237
  def repetition(expressionToRepeat, aMultiplicity)
132
238
  return Regex::Repetition.new(expressionToRepeat, aMultiplicity)
133
239
  end
240
+
241
+ # rule 'pattern' => %w[pattern COMMA quantifiable]
242
+ def reduce_pattern_0(aProduction, aRange, theTokens, theChildren)
243
+ return Regex::Concatenation.new(theChildren[0], theChildren[2])
244
+ end
245
+
246
+ # rule 'pattern' => %w[pattern quantifiable]
247
+ def reduce_pattern_1(aProduction, aRange, theTokens, theChildren)
248
+ return Regex::Concatenation.new(theChildren[0], theChildren[1])
249
+ end
134
250
 
135
- # rule 'term' => %w[atom quantifier]
136
- def reduce_term_1(aProduction, aRange, theTokens, theChildren)
251
+ # rule 'quantifiable' => %w[term quantifier]
252
+ def reduce_quantifiable_1(aProduction, aRange, theTokens, theChildren)
137
253
  quantifier = theChildren.last
138
- atom = theChildren.first
139
- repetition(atom, quantifier)
254
+ term = theChildren.first
255
+ repetition(term, quantifier)
140
256
  end
141
257
 
142
258
  # rule 'letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]
@@ -166,7 +282,7 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
166
282
  ch_range = char_range('A', 'Z')
167
283
  char_class(false, ch_range)
168
284
  end
169
-
285
+
170
286
  # rule 'digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]
171
287
  def reduce_digit_range_0(aProduction, aRange, theTokens, theChildren)
172
288
  reduce_letter_range_0(aProduction, aRange, theTokens, theChildren)
@@ -174,15 +290,95 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
174
290
 
175
291
  # rule 'digit_range' => 'digit_or_number'
176
292
  def reduce_digit_range_1(aProduction, aRange, theTokens, theChildren)
177
- ch_range = char_range('0', '9')
178
- char_class(false, ch_range)
293
+ char_shorthand('d')
294
+ end
295
+
296
+ # rule 'character_class' => %w[ANY CHARACTER]
297
+ def reduce_character_class_0(aProduction, aRange, theTokens, theChildren)
298
+ char_shorthand('w')
299
+ end
300
+
301
+ # rule 'character_class' => %w[NO CHARACTER]
302
+ def reduce_character_class_1(aProduction, aRange, theTokens, theChildren)
303
+ char_shorthand('W')
304
+ end
305
+
306
+ # rule 'character_class' => 'WHITESPACE'
307
+ def reduce_character_class_2(aProduction, aRange, theTokens, theChildren)
308
+ char_shorthand('s')
309
+ end
310
+
311
+ # rule 'character_class' => %w[NO WHITESPACE]
312
+ def reduce_character_class_3(aProduction, aRange, theTokens, theChildren)
313
+ char_shorthand('S')
314
+ end
315
+
316
+ # rule 'character_class' => 'ANYTHING'
317
+ def reduce_character_class_4(aProduction, aRange, theTokens, theChildren)
318
+ wildcard
319
+ end
320
+
321
+ # rule 'character_class' => %w[ONE OF STRING_LIT]
322
+ def reduce_character_class_5(aProduction, aRange, theTokens, theChildren)
323
+ raw_literal = theChildren[-1].token.lexeme.dup
324
+ alternatives = raw_literal.chars.map { |ch| Regex::Character.new(ch) }
325
+ return Regex::CharClass.new(false, *alternatives) # TODO check other implementations
179
326
  end
180
327
 
328
+ # rule 'special_char' => 'TAB'
329
+ def reduce_special_char_0(aProduction, aRange, theTokens, theChildren)
330
+ Regex::Character.new('\t')
331
+ end
332
+
333
+ # rule 'special_char' => 'BACKSLASH'
334
+ def reduce_special_char_1(aProduction, aRange, theTokens, theChildren)
335
+ Regex::Character.new('\\')
336
+ end
337
+
338
+ # rule 'special_char' => %w[NEW LINE]
339
+ def reduce_special_char_2(aProduction, aRange, theTokens, theChildren)
340
+ # TODO: control portability
341
+ Regex::Character.new('\n')
342
+ end
343
+
344
+ # rule 'literal' => %[LITERALLY STRING_LIT]
345
+ def reduce_literal_0(aProduction, aRange, theTokens, theChildren)
346
+ # What if literal is empty?...
347
+
348
+ raw_literal = theChildren[-1].token.lexeme.dup
349
+ return string_literal(raw_literal)
350
+ end
351
+
352
+ # rule 'alternation' => %w[ANY OF LPAREN alternatives RPAREN]
353
+ def reduce_alternation_0(aProduction, aRange, theTokens, theChildren)
354
+ return Regex::Alternation.new(*theChildren[3])
355
+ end
356
+
357
+ # rule 'alternatives' => %w[alternatives COMMA quantifiable]
358
+ def reduce_alternatives_0(aProduction, aRange, theTokens, theChildren)
359
+ return theChildren[0] << theChildren[-1]
360
+ end
361
+
362
+ # rule 'alternatives' => %w[alternatives quantifiable]
363
+ def reduce_alternatives_1(aProduction, aRange, theTokens, theChildren)
364
+ return theChildren[0] << theChildren[-1]
365
+ end
366
+
367
+ # rule 'alternatives' => 'quantifiable'
368
+ def reduce_alternatives_2(aProduction, aRange, theTokens, theChildren)
369
+ return [theChildren.last]
370
+ end
371
+
372
+ # rule 'grouping' => %w[LPAREN pattern RPAREN]
373
+ def reduce_grouping_0(aProduction, aRange, theTokens, theChildren)
374
+ return Regex::NonCapturingGroup.new(theChildren[1])
375
+ end
376
+
181
377
  # rule 'quantifier' => %w[EXACTLY count TIMES]
182
378
  def reduce_quantifier_2(aProduction, aRange, theTokens, theChildren)
183
379
  count = theChildren[1].token.lexeme.to_i
184
380
  multiplicity(count, count)
185
- end
381
+ end
186
382
 
187
383
  # rule 'quantifier' => %w[BETWEEN count AND count times_suffix]
188
384
  def reduce_quantifier_3(aProduction, aRange, theTokens, theChildren)