rley 0.7.08 → 0.8.00

Files changed (71)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/README.md +4 -5
  4. data/examples/NLP/nano_eng/nano_en_demo.rb +7 -11
  5. data/examples/NLP/nano_eng/nano_grammar.rb +18 -18
  6. data/examples/NLP/pico_en_demo.rb +2 -2
  7. data/examples/data_formats/JSON/json_ast_builder.rb +9 -18
  8. data/examples/data_formats/JSON/json_demo.rb +1 -2
  9. data/examples/data_formats/JSON/json_grammar.rb +11 -11
  10. data/examples/general/calc_iter1/calc_grammar.rb +5 -4
  11. data/examples/general/calc_iter2/calc_grammar.rb +9 -9
  12. data/examples/general/left.rb +1 -1
  13. data/examples/general/right.rb +1 -1
  14. data/lib/rley.rb +1 -1
  15. data/lib/rley/base/dotted_item.rb +5 -0
  16. data/lib/rley/base/grm_items_builder.rb +6 -0
  17. data/lib/rley/constants.rb +1 -1
  18. data/lib/rley/engine.rb +2 -2
  19. data/lib/rley/interface.rb +16 -0
  20. data/lib/rley/notation/all_notation_nodes.rb +2 -0
  21. data/lib/rley/notation/ast_builder.rb +191 -0
  22. data/lib/rley/notation/ast_node.rb +44 -0
  23. data/lib/rley/notation/ast_visitor.rb +113 -0
  24. data/lib/rley/notation/grammar.rb +49 -0
  25. data/lib/rley/notation/grammar_builder.rb +451 -0
  26. data/lib/rley/notation/grouping_node.rb +23 -0
  27. data/lib/rley/notation/parser.rb +56 -0
  28. data/lib/rley/notation/sequence_node.rb +35 -0
  29. data/lib/rley/notation/symbol_node.rb +29 -0
  30. data/lib/rley/notation/tokenizer.rb +192 -0
  31. data/lib/rley/parse_rep/ast_base_builder.rb +13 -0
  32. data/lib/rley/parser/gfg_chart.rb +100 -6
  33. data/lib/rley/parser/gfg_parsing.rb +5 -3
  34. data/lib/rley/parser/parse_entry_set.rb +1 -1
  35. data/lib/rley/syntax/{grammar_builder.rb → base_grammar_builder.rb} +45 -15
  36. data/lib/rley/syntax/grm_symbol.rb +1 -1
  37. data/lib/rley/syntax/match_closest.rb +43 -0
  38. data/lib/rley/syntax/production.rb +6 -0
  39. data/spec/rley/engine_spec.rb +6 -6
  40. data/spec/rley/gfg/grm_flow_graph_spec.rb +2 -2
  41. data/spec/rley/notation/grammar_builder_spec.rb +295 -0
  42. data/spec/rley/notation/parser_spec.rb +184 -0
  43. data/spec/rley/notation/tokenizer_spec.rb +370 -0
  44. data/spec/rley/parse_rep/ast_builder_spec.rb +0 -1
  45. data/spec/rley/parse_rep/groucho_spec.rb +1 -1
  46. data/spec/rley/parse_rep/parse_forest_builder_spec.rb +1 -1
  47. data/spec/rley/parse_rep/parse_forest_factory_spec.rb +2 -2
  48. data/spec/rley/parse_rep/parse_tree_factory_spec.rb +1 -1
  49. data/spec/rley/parser/dangling_else_spec.rb +445 -0
  50. data/spec/rley/parser/gfg_earley_parser_spec.rb +95 -9
  51. data/spec/rley/parser/gfg_parsing_spec.rb +1 -1
  52. data/spec/rley/parser/parse_walker_factory_spec.rb +2 -2
  53. data/spec/rley/support/ambiguous_grammar_helper.rb +2 -2
  54. data/spec/rley/support/grammar_abc_helper.rb +2 -2
  55. data/spec/rley/support/grammar_ambig01_helper.rb +2 -2
  56. data/spec/rley/support/grammar_arr_int_helper.rb +2 -2
  57. data/spec/rley/support/grammar_b_expr_helper.rb +2 -2
  58. data/spec/rley/support/grammar_int_seq_helper.rb +51 -0
  59. data/spec/rley/support/grammar_l0_helper.rb +2 -2
  60. data/spec/rley/support/grammar_pb_helper.rb +2 -2
  61. data/spec/rley/support/grammar_sppf_helper.rb +2 -2
  62. data/spec/rley/syntax/{grammar_builder_spec.rb → base_grammar_builder_spec.rb} +30 -11
  63. data/spec/rley/syntax/match_closest_spec.rb +46 -0
  64. data/spec/rley/syntax/production_spec.rb +4 -0
  65. metadata +29 -14
  66. data/lib/rley/parser/parse_state.rb +0 -78
  67. data/lib/rley/parser/parse_state_tracker.rb +0 -59
  68. data/lib/rley/parser/state_set.rb +0 -100
  69. data/spec/rley/parser/parse_state_spec.rb +0 -125
  70. data/spec/rley/parser/parse_tracer_spec.rb +0 -200
  71. data/spec/rley/parser/state_set_spec.rb +0 -130
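The centerpiece of this release is the new `Rley::Notation` layer (items 19-30 above): a small grammar-definition language with `?`, `*` and `+` modifiers, its own tokenizer, parser and AST builder, plus the new `match_closest` constraint used by `dangling_else_spec.rb`. A minimal sketch of how a grammar could be declared through the `Rley.grammar_builder` facade added in `lib/rley/interface.rb`; the terminal names and rules below are illustrative assumptions (they loosely mirror the new `grammar_int_seq_helper.rb` spec helper), not excerpts from the gem.

```ruby
require 'rley' # assumes gem 'rley', '~> 0.8'

# Hedged sketch of the new notation DSL; terminal names and rules are invented here.
# Rley.grammar_builder is the facade method added by lib/rley/interface.rb.
builder = Rley.grammar_builder do
  add_terminals('INTEGER', 'COMMA')

  rule 'start'   => 'int_seq?'                 # '?' : the sequence is optional
  rule 'int_seq' => 'INTEGER (COMMA INTEGER)*' # '*' : zero or more repetitions of the group
end

grammar = builder.grammar
```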
data/lib/rley/notation/grouping_node.rb
@@ -0,0 +1,23 @@
+ # frozen_string_literal: true
+
+ require_relative 'sequence_node'
+
+ module Rley
+   module Notation
+     # A syntax node representing an expression bracketed by parentheses.
+     class GroupingNode < SequenceNode
+       # @param aPosition [Rley::Lexical::Position] Start position.
+       # @param sequence [Array<ASTNode>] sequence of AST nodes
+       # @param theRepetition [Symbol] indicates how many times the symbol can be repeated
+       def initialize(aPosition, sequence, theRepetition = nil)
+         super(aPosition, sequence, theRepetition)
+       end
+
+       # Part of the 'visitee' role in Visitor design pattern.
+       # @param visitor [Notation::ASTVisitor] the visitor
+       def accept(visitor)
+         visitor.visit_grouping_node(self)
+       end
+     end # class
+   end # module
+ end # module
data/lib/rley/notation/parser.rb
@@ -0,0 +1,56 @@
+ # frozen_string_literal: true
+
+ require_relative 'tokenizer'
+ require_relative 'grammar'
+ require_relative 'ast_builder'
+
+ module Rley
+   module Notation
+     # A parser for the Rley grammar notation that produces concrete parse trees.
+     # Concrete parse trees are the default kind of parse tree
+     # generated by the Rley library.
+     # They consist of two node types only:
+     # - NonTerminalNode
+     # - TerminalNode
+     # A NonTerminalNode has zero or more child nodes (called subnodes).
+     # A TerminalNode is a leaf node, that is, it has no child node.
+     # While concrete parse tree nodes can be generated out of the box,
+     # they have the following drawbacks:
+     # - Generic node classes that aren't always suited to the needs of
+     #   the language being processed.
+     # - Concrete parse trees tend to be deeply nested, which may complicate
+     #   further processing.
+     class Parser
+       # @return [Rley::Engine] A facade object for the Rley parsing library
+       attr_reader(:engine)
+
+       def initialize
+         # Create a Rley facade object
+         @engine = Rley::Engine.new do |cfg|
+           cfg.diagnose = true
+           cfg.repr_builder = Notation::ASTBuilder
+         end
+
+         # Step 1. Load RGN grammar
+         @engine.use_grammar(Rley::Notation::RGNGrammar)
+       end
+
+       # Parse the given grammar notation expression into a parse tree.
+       # @param source [String] grammar notation text to parse
+       # @return [Rley::ParseTree] A parse tree equivalent to the given input.
+       def parse(source)
+         lexer = Tokenizer.new(source)
+         result = engine.parse(lexer.tokens)
+
+         unless result.success?
+           # Stop if the parse failed...
+           line1 = "Parsing failed\n"
+           line2 = "Reason: #{result.failure_reason.message}"
+           raise SyntaxError, line1 + line2
+         end
+
+         return engine.convert(result) # engine.to_ptree(result)
+       end
+     end # class
+   end # module
+ end # module
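Based only on the two public methods shown above (`#initialize` and `#parse`), a `Notation::Parser` is used roughly as follows; the sample right-hand-side text is an assumption made for illustration, not taken from the gem's specs.

```ruby
require 'rley'

# Hedged usage sketch of Rley::Notation::Parser; the input string is illustrative.
parser = Rley::Notation::Parser.new
ptree  = parser.parse("INTEGER (COMMA INTEGER)*")

# On failure, #parse raises SyntaxError carrying the failure reason
# collected from the engine (see the `unless result.success?` branch above).
```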
data/lib/rley/notation/sequence_node.rb
@@ -0,0 +1,35 @@
+ # frozen_string_literal: true
+
+ require_relative 'ast_node'
+
+ module Rley
+   module Notation
+     # A syntax node for a sequence of AST nodes
+     class SequenceNode < ASTNode
+       # @return [Array<ASTNode>]
+       attr_reader :subnodes
+
+       attr_accessor :constraints
+
+       # @param aPosition [Rley::Lexical::Position] Start position.
+       # @param sequence [Array<ASTNode>] sequence of AST nodes
+       # @param theRepetition [Symbol] indicates how many times the symbol can be repeated
+       def initialize(aPosition, sequence, theRepetition = nil)
+         super(aPosition)
+         @subnodes = sequence
+         self.repetition = theRepetition if theRepetition
+         @constraints = []
+       end
+
+       def size
+         subnodes.size
+       end
+
+       # Part of the 'visitee' role in Visitor design pattern.
+       # @param visitor [Notation::ASTVisitor] the visitor
+       def accept(visitor)
+         visitor.visit_sequence_node(self)
+       end
+     end # class
+   end # module
+ end # module
data/lib/rley/notation/symbol_node.rb
@@ -0,0 +1,29 @@
+ # frozen_string_literal: true
+
+ require_relative 'ast_node'
+
+ module Rley
+   module Notation
+     # A syntax node for a grammar symbol occurring in the rhs of a rule
+     class SymbolNode < ASTNode
+       # @return [String] name of grammar symbol
+       attr_reader :name
+
+       # @param aPosition [Rley::Lexical::Position] Position of the entry in the input stream.
+       # @param aName [String] name of grammar symbol
+       # @param theRepetition [Symbol] indicates how many times the symbol can be repeated
+       def initialize(aPosition, aName, theRepetition = nil)
+         super(aPosition)
+         @name = aName
+         self.repetition = theRepetition if theRepetition
+       end
+
+       # Part of the 'visitee' role in Visitor design pattern.
+       # Concrete implementation of the abstract method inherited from ASTNode.
+       # @param visitor [Notation::ASTVisitor] the visitor
+       def accept(visitor)
+         visitor.visit_symbol_node(self)
+       end
+     end # class
+   end # module
+ end # module
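`GroupingNode`, `SequenceNode` and `SymbolNode` all expose the same `accept` double dispatch, so a visitor only needs one `visit_*` method per node class. A minimal sketch of the visitor side of that contract; `RhsLister` is hypothetical (it is not part of the gem and does not subclass the gem's `Notation::ASTVisitor`), and it relies only on the `accept`, `name` and `subnodes` members shown above.

```ruby
# Hedged sketch of the double dispatch used by the notation AST nodes.
# RhsLister is a hypothetical visitor that collects the symbol names of a rhs.
class RhsLister
  attr_reader :names

  def initialize
    @names = []
  end

  def visit_symbol_node(node)
    @names << node.name
  end

  def visit_sequence_node(node)
    node.subnodes.each { |child| child.accept(self) }
  end

  # GroupingNode shares SequenceNode's structure; only its visit hook differs.
  def visit_grouping_node(node)
    visit_sequence_node(node)
  end
end
```

Usage would be `ast.accept(RhsLister.new)` on a notation AST root; the node decides which `visit_*` method runs, so adding a new analysis never requires touching the node classes.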
data/lib/rley/notation/tokenizer.rb
@@ -0,0 +1,192 @@
+ # frozen_string_literal: true
+
+ require 'strscan'
+ require_relative '../lexical/token'
+
+ module Rley
+   module Notation
+     # A tokenizer for the Rley notation language.
+     # Responsibility: break input into a sequence of token objects.
+     # The tokenizer should recognize:
+     # Identifiers,
+     # Number literals including single digit
+     # String literals (quote delimited)
+     # Delimiters: e.g. parentheses '(', ')'
+     # Separators: e.g. comma
+     class Tokenizer
+       # @return [StringScanner] Low-level input scanner
+       attr_reader(:scanner)
+
+       # @return [Integer] The current line number
+       attr_reader(:lineno)
+
+       # @return [Integer] Position of last start of line in the input
+       attr_reader(:line_start)
+
+       # One or two special character tokens.
+       @@lexeme2name = {
+         '(' => 'LEFT_PAREN',
+         ')' => 'RIGHT_PAREN',
+         '{' => 'LEFT_BRACE',
+         '}' => 'RIGHT_BRACE',
+         ',' => 'COMMA',
+         '+' => 'PLUS',
+         '?' => 'QUESTION_MARK',
+         '*' => 'STAR',
+         '..' => 'ELLIPSIS'
+       }.freeze
+
+       # Here are all the implemented Rley notation keywords
+       @@keywords = %w[
+         match_closest repeat
+       ].map { |x| [x, x] }.to_h
+
+       # Constructor. Initialize a tokenizer for Rley notation text.
+       # @param source [String] Rley notation text to tokenize.
+       def initialize(source = nil)
+         @scanner = StringScanner.new('')
+         start_with(source) if source
+       end
+
+       # Reset the tokenizer and make the given text the current input.
+       # @param source [String] Rley notation text to tokenize.
+       def start_with(source)
+         @scanner.string = source
+         @lineno = 1
+         @line_start = 0
+       end
+
+       # Scan the source and return an array of tokens.
+       # @return [Array<Rley::Lexical::Token>] Returns a sequence of tokens
+       def tokens
+         tok_sequence = []
+         until @scanner.eos?
+           token = _next_token
+           tok_sequence << token unless token.nil?
+         end
+
+         return tok_sequence
+       end
+
+       private
+
+       def _next_token
+         pos_before = scanner.pos
+         skip_intertoken_spaces
+         ws_found = true if scanner.pos > pos_before
+         curr_ch = scanner.peek(1)
+         return nil if curr_ch.nil? || curr_ch.empty?
+
+         token = nil
+
+         if '(){},'.include? curr_ch
+           # Single delimiter, separator or character
+           token = build_token(@@lexeme2name[curr_ch], scanner.getch)
+         elsif '?*+,'.include? curr_ch # modifier character
+           # modifiers without prefix text are symbols
+           symb = ws_found ? 'SYMBOL' : @@lexeme2name[curr_ch]
+           token = build_token(symb, scanner.getch)
+         elsif (lexeme = scanner.scan(/\.\./))
+           # One or two special character tokens
+           token = build_token(@@lexeme2name[lexeme], lexeme)
+         elsif scanner.check(/"|'/) # Start of string detected...
+           token = build_string_token
+         elsif (lexeme = scanner.scan(/\d+/))
+           token = build_token('INT_LIT', lexeme)
+         elsif (lexeme = scanner.scan(/[a-zA-Z_][a-zA-Z_0-9]*:/))
+           keyw = @@keywords[lexeme.chop!]
+           token = build_token('KEY', lexeme) if keyw
+           # ... error case
+         elsif (lexeme = scanner.scan(/[^?*+,:(){}\s]+/))
+           token = build_token('SYMBOL', lexeme)
+         else # Unknown token
+           col = scanner.pos - @line_start + 1
+           _erroneous = curr_ch.nil? ? '' : scanner.scan(/./)
+           raise ScanError, "Error: [line #{lineno}:#{col}]: Unexpected character."
+         end
+
+         return token
+       end
+
+       def build_token(aSymbolName, aLexeme)
+         begin
+           lex_length = aLexeme ? aLexeme.size : 0
+           col = scanner.pos - lex_length - @line_start + 1
+           pos = Rley::Lexical::Position.new(@lineno, col)
+           token = Rley::Lexical::Token.new(aLexeme.dup, aSymbolName, pos)
+
+         rescue StandardError => e
+           puts "Failing with '#{aSymbolName}' and '#{aLexeme}'"
+           raise e
+         end
+
+         return token
+       end
+
+       # precondition: current position at leading quote
+       def build_string_token
+         delimiter = scanner.scan(/./)
+         scan_pos = scanner.pos
+         line = @lineno
+         column_start = scan_pos - @line_start
+         literal = +''
+         loop do
+           substr = scanner.scan(/[^"'\\\r\n]*/)
+           if scanner.eos?
+             pos_start = "line #{line}:#{column_start}"
+             raise ScanError, "Error: [#{pos_start}]: Unterminated string."
+           else
+             literal << substr
+             special = scanner.scan(/["'\\\r\n]/)
+             case special
+             when delimiter # Terminating quote found
+               break
+             when "\r"
+               next_line
+               special << scanner.scan(/./) if scanner.match?(/\n/)
+               literal << special
+             when "\n"
+               next_line
+               literal << special
+             # when '\\'
+             #   ch = scanner.scan(/./)
+             #   next unless ch
+
+             #   escaped = @@escape_chars[ch]
+             #   if escaped
+             #     literal << escaped
+             #   else
+             #     literal << ch
+             #   end
+             end
+           end
+         end
+         pos = Rley::Lexical::Position.new(line, column_start)
+         lexeme = scanner.string[scan_pos - 1..scanner.pos - 1]
+         Rley::Lexical::Token.new(literal, 'STR_LIT', pos)
+       end
+
+       # Skip non-significant whitespaces and comments.
+       # Advance the scanner until something significant is found.
+       def skip_intertoken_spaces
+         loop do
+           ws_found = scanner.skip(/[ \t\f]+/) ? true : false
+           nl_found = scanner.skip(/(?:\r\n)|\r|\n/)
+           if nl_found
+             ws_found = true
+             next_line
+           end
+
+           break unless ws_found
+         end
+
+         scanner.pos
+       end
+
+       def next_line
+         @lineno += 1
+         @line_start = scanner.pos
+       end
+     end # class
+   end # module
+ end # module
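Tracing the branches of `_next_token` above, a short input yields the token kinds listed below. This is a hedged walk-through of the code as shown, not output copied from the gem's specs; `Rley::Lexical::Token#terminal` is assumed to return the terminal name passed to the constructor.

```ruby
require 'rley'

# Hedged walk-through of the tokenizer branches shown above.
tokenizer = Rley::Notation::Tokenizer.new("expression ( COMMA expression )*")
tokenizer.tokens.map(&:terminal)
# Expected, by reading _next_token:
#   "expression" => SYMBOL       (catch-all /[^?*+,:(){}\s]+/ branch)
#   "("          => LEFT_PAREN   (single delimiter branch)
#   "COMMA"      => SYMBOL
#   "expression" => SYMBOL
#   ")"          => RIGHT_PAREN
#   "*"          => STAR         (no whitespace before it, so it stays a modifier)
```

Note the whitespace sensitivity: a `*`, `+` or `?` preceded by whitespace is reported as a plain SYMBOL, so only a modifier glued to the preceding symbol or group acts as a repetition marker.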
data/lib/rley/parse_rep/ast_base_builder.rb
@@ -123,6 +123,19 @@ module Rley # This module is used as a namespace
          end
          return node
        end
+
+       # Standard method for handling one or more modifier: symbol+
+       # rule('symbol_plus' => 'symbol_plus symbol')
+       def reduce_base_plus_more(_production, _range, _tokens, theChildren)
+         theChildren[0] << theChildren[1]
+       end
+
+       # Standard rule method handling one or more modifier: symbol+
+       # rule('symbol_plus' => 'symbol')
+       def reduce_base_plus_last(_production, _range, _tokens, theChildren)
+         [theChildren[0]]
+       end
+
      end # class
    end # module
  end # module
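These two helpers implement the usual left-recursive accumulation for a `+` (one or more) repetition: the `..._last` reduction wraps the first child in an array, and each `..._more` reduction appends the next child to that array. A contrived, hedged illustration of how the reductions compose; `DemoBuilder` is hypothetical, the `nil` arguments stand in for the production, range and tokens that the parser normally supplies, and `allocate` skips full construction because only these two helpers are exercised.

```ruby
require 'rley'

# Hedged demo of the accumulation performed by the two helpers above.
# DemoBuilder exists only for this example; real builders also implement
# the other hooks expected by ASTBaseBuilder.
class DemoBuilder < Rley::ParseRep::ASTBaseBuilder; end

builder = DemoBuilder.allocate

list = builder.send(:reduce_base_plus_last, nil, nil, nil, [:digit1])
# => [:digit1]
list = builder.send(:reduce_base_plus_more, nil, nil, nil, [list, :digit2])
# => [:digit1, :digit2]
list = builder.send(:reduce_base_plus_more, nil, nil, nil, [list, :digit3])
# => [:digit1, :digit2, :digit3]
```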
data/lib/rley/parser/gfg_chart.rb
@@ -12,11 +12,15 @@ module Rley # This module is used as a namespace
      # the chart is an array with n + 1 entry sets.
      class GFGChart
        # @return [Array<ParseEntrySet>] entry sets (one per input token + 1)
-       attr_reader(:sets)
+       attr_reader :sets
+
+       # @return [Array<Array<Syntax::MatchClosest>>]
+       attr_reader :constraints

        # @param aGFGraph [GFG::GrmFlowGraph] The GFG for the grammar in use.
        def initialize(aGFGraph)
          @sets = [ParseEntrySet.new]
+         @constraints = [[]]
          push_entry(aGFGraph.start_vertex, 0, 0, :start_rule)
        end

@@ -42,6 +46,18 @@ module Rley # This module is used as a namespace
          end
        end

+       # If an entry corresponds to a dotted item with a constraint,
+       # make this constraint active for this index.
+       #   :before 'IF'
+       #   search backwards to find the nearest 'IF' scan rule;
+       #   in set n+1, retrieve all items with the pattern: IF .
+       #   create a lambda;
+       #   for every subsequent push_entry with the same index,
+       #   the lambda checks the condition (i.e. the pattern: ELSE . );
+       #   if the condition is false, then push the new entry;
+       #   if the condition is true but the consequent is false, then discard the push action
+       #   consequent: candidate refers to the same dotted_item and same origin, then condition is false
+
        # Push a parse entry for the chart entry with given index
        # @param anIndex [Integer] The rank of the token in the input stream.
        # @return [ParseEntry] the passed parse entry if it is pushed
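The design notes above describe the dangling-else disambiguation exercised by the new `spec/rley/parser/dangling_else_spec.rb`: an `ELSE` occurrence can be constrained to attach to the closest preceding `IF`. In the 0.8 notation a terminal in a right-hand side carries that constraint as a `{match_closest: ...}` annotation, which the tokenizer above recognizes via `LEFT_BRACE`, the `match_closest` keyword and a string literal. A hedged sketch of such a grammar; the terminal names and rules are invented for this example, only the annotation form follows the tokenizer shown earlier.

```ruby
require 'rley'

# Hedged sketch: binding ELSE to the closest preceding IF, as the notes above describe.
builder = Rley.grammar_builder do
  add_terminals('IF', 'THEN', 'ELSE', 'EXPR', 'OTHER')

  rule 'stmt' => 'OTHER'
  rule 'stmt' => 'IF EXPR THEN stmt'
  rule 'stmt' => "IF EXPR THEN stmt ELSE{match_closest: 'IF'} stmt"
end
```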
@@ -51,14 +67,48 @@ module Rley # This module is used as a namespace
          # puts " anOrigin: #{anOrigin}"
          # puts " anIndex: #{anIndex}"
          # puts " _reason: #{_reason}"
-         new_entry = ParseEntry.new(aVertex, anOrigin)
          if anIndex == sets.size
-           err_msg = "Internal error: unexpected push reason #{reason}"
-           raise StandardError, err_msg if reason != :scan_rule
+           if reason == :scan_rule
+             add_entry_set
+           else
+             err_msg = "Internal error: unexpected push reason #{reason}"
+             raise StandardError, err_msg
+           end
+         end

-           add_entry_set
+         reject = false
+         unless constraints[anIndex].empty?
+           constraints[anIndex].each do |ct|
+             case ct
+             when Syntax::MatchClosest
+               not_found = sets[anIndex][0].prev_symbol != aVertex.prev_symbol
+               next if not_found
+
+               some_mismatch = ct.entries.find do |en|
+                 (en.vertex.dotted_item.production == aVertex.dotted_item.production) &&
+                   (en.origin != anOrigin)
+               end
+               reject = true if some_mismatch
+             end
+           end
+         end
+
+         return nil if reject
+
+         new_entry = ParseEntry.new(aVertex, anOrigin)
+         result = self[anIndex].push_entry(new_entry)
+
+         if aVertex.kind_of?(GFG::ItemVertex) && aVertex.dotted_item.constraint
+           ct = aVertex.dotted_item.constraint
+
+           case ct
+           when Syntax::MatchClosest
+             update_match_closest(ct, anIndex)
+           end
+           constraints[anIndex] << ct
          end
-         self[anIndex].push_entry(new_entry)
+
+         result
        end

        # Retrieve the first parse entry added to this chart
@@ -113,6 +163,25 @@ module Rley # This module is used as a namespace
        end
        # rubocop: enable Lint/UselessAssignment

+       # Retrieve all entries that have a given terminal before the dot.
+       # @param criteria [Hash{Symbol => String}]
+       def search_entries(atIndex, criteria)
+         entries = sets[atIndex].entries
+         keyword = criteria.keys[0]
+         found = []
+         entries.each do |e|
+           case keyword
+           when :before # terminal before dot
+             term_name = criteria[keyword]
+             if e.dotted_entry? && e.vertex.dotted_item.position > -2
+               found << e if e.prev_symbol&.name == term_name
+             end
+           end
+         end
+
+         found
+       end
+
        # @return [String] A human-readable representation of the chart.
        def to_s
          result = +''
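`search_entries` complements the constraint machinery: given a set index, it collects the entries whose dot sits right after a particular terminal. A hedged usage sketch; the chart variable, the index and the terminal name are placeholders.

```ruby
# Hedged usage sketch of the new GFGChart#search_entries helper.
# `chart` is assumed to be a GFGChart built during a parse; 4 and 'IF' are placeholders.
entries_after_if = chart.search_entries(4, { before: 'IF' })
entries_after_if.each { |entry| puts entry.inspect }
```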
@@ -130,6 +199,31 @@ module Rley # This module is used as a namespace

        def add_entry_set
          @sets << ParseEntrySet.new
+         @constraints << []
+       end
+
+       def update_match_closest(aConstraint, anIndex)
+         # Locate in the chart the closest matching terminal...
+         i = anIndex - 1
+         loop do
+           first_entry = sets[i][0]
+           prev_symbol = first_entry.prev_symbol
+           break if prev_symbol.name == aConstraint.closest_symb
+           i -= 1
+           break if i < 0
+         end
+
+         # Retrieve all entries of the kind: closest_symb .
+         if i > 0
+           entries = sets[i].entries.select do |en|
+             if en.prev_symbol
+               en.prev_symbol.name == aConstraint.closest_symb
+             else
+               false
+             end
+           end
+           aConstraint.entries = entries
+         end
        end
      end # class
    end # module