dendroid 0.0.11 → 0.1.00

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2564f1269225e08732a9f995b10ebbbbf4710b0a1b0aea73e7fe4b486c34a1aa
4
- data.tar.gz: db15f965e9365276ffc576435d514cd6c9170a8727c7fafe1425a9de7ed3e0cd
3
+ metadata.gz: 722b27a6f20e87c43de339b3f0c45e2bcc77c464d5dd9ecd56bbb686c4857b61
4
+ data.tar.gz: ce6ffd0c100ea7b7c336044e2877617eebb85c7bb306d5de9d1d4395200320aa
5
5
  SHA512:
6
- metadata.gz: 2517fd57cca364571e19ddd183d53fcd4fd642f0cf83ecc58ef0f62e5c7512c343cc3db4f31ee621dad5009386db0161e7c2f67944820dd36cb2a253a4d7af80
7
- data.tar.gz: 24b77e7c0c5e97df315102c3434dddd251eacab96efaa3d194006c3874f6d260aeafa076ec5b6bd1bed296fbc675bef00e2fcf49a1e4516c20cc6b3e3b0aefdb
6
+ metadata.gz: 69870ade1f77e7fe0b9faf20b7943a500abdf2b41d383a4e048438e431e6f65bf4b418806ec4ba325a6839ee4eb1337085772fe7fa5c594b59663cd653cdeac6
7
+ data.tar.gz: 137bbf46a71dcb603f3866f51b5be25a8557fb574937e2e1b3f8b40980cd6e7fd6a851c84b229b7b7ecb692d2899d721e3bcc041fe09e213318f46efdb041ea7
data/.rubocop.yml CHANGED
@@ -12,7 +12,7 @@ Metrics/BlockLength:
12
12
 
13
13
  Metrics/ClassLength:
14
14
  Enabled: true
15
- Max: 200
15
+ Max: 300
16
16
 
17
17
  Metrics/CyclomaticComplexity:
18
18
  Enabled: true
data/CHANGELOG.md CHANGED
@@ -2,6 +2,16 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [0.1.00] - 2023-11-03
6
+ Version bump: the Earley recognizer is functional.
7
+
8
+ ## [0.0.12] - 2023-11-02
9
+ Added more tests.
10
+
11
+ ### Added
12
+ - Added more tests to spec file of `Grammar` class.
13
+ - Added more tests to spec file of `Recognizer` class.
14
+
5
15
  ## [0.0.11] - 2023-11-02
6
16
  Added Earley recognizer and its ancillary classes.
7
17
 
@@ -13,7 +13,6 @@ module Dendroid
13
13
  attr_reader :grammar
14
14
  attr_reader :items
15
15
  attr_reader :production2items
16
- attr_reader :symbol2productions
17
16
 
18
17
  # @return [Dendroid::Syntax::Terminal] The pseudo-terminal `__epsilon` (for empty string)
19
18
  attr_reader :epsilon
@@ -37,7 +36,6 @@ module Dendroid
37
36
  @grammar = aGrammar
38
37
  @items = []
39
38
  @production2items = {}
40
- @symbol2productions = {}
41
39
  @epsilon = Syntax::Terminal.new(:__epsilon)
42
40
  @endmarker = Syntax::Terminal.new(:"$$")
43
41
  @first_sets = {}
@@ -56,14 +54,14 @@ module Dendroid
56
54
  prod.next_item(aDottedItem)
57
55
  end
58
56
 
57
+ def symbol2production(sym)
58
+ grammar.nonterm2production[sym]
59
+ end
60
+
59
61
  private
60
62
 
61
63
  def build_dotted_items
62
64
  grammar.rules.each do |prod|
63
- lhs = prod.head
64
- symbol2productions[lhs] = [] unless symbol2productions.include? lhs
65
- symbol2productions[lhs] << prod
66
- # production2items[prod] = []
67
65
  mixin = prod.choice? ? ChoiceItems : ProductionItems
68
66
  prod.extend(mixin)
69
67
  prod.build_items
@@ -76,33 +74,31 @@ module Dendroid
76
74
  def build_first_sets
77
75
  initialize_first_sets
78
76
 
79
- begin
77
+ loop do
80
78
  changed = false
81
79
  grammar.rules.each do |prod|
82
80
  head = prod.head
83
81
  first_head = first_sets[head]
84
82
  pre_first_size = first_head.size
85
- if prod.choice?
86
- prod.alternatives.each do |alt|
87
- first_head.merge(sequence_first(alt.members))
88
- end
89
- else
90
- first_head.merge(sequence_first(prod.body.members))
83
+ prod.rhs.each do |seq|
84
+ first_head.merge(sequence_first(seq.members))
91
85
  end
92
86
  changed = true if first_head.size > pre_first_size
93
87
  end
94
- end until !changed
88
+ break unless changed
89
+ end
95
90
  end
96
91
 
97
92
  def initialize_first_sets
98
93
  grammar.symbols.each do |symb|
99
- if symb.terminal?
100
- first_sets[symb] = Set.new([symb])
101
- elsif symb.nullable?
102
- first_sets[symb] = Set.new([epsilon])
103
- else
104
- first_sets[symb] = Set.new
105
- end
94
+ set_arg = if symb.terminal?
95
+ [symb]
96
+ elsif symb.nullable?
97
+ [epsilon]
98
+ else
99
+ nil
100
+ end
101
+ first_sets[symb] = Set.new(set_arg)
106
102
  end
107
103
  end
108
104
 
@@ -122,43 +118,11 @@ module Dendroid
122
118
  def build_follow_sets
123
119
  initialize_follow_sets
124
120
 
125
- begin
121
+ loop do
126
122
  changed = false
127
123
  grammar.rules.each do |prod|
128
- if prod.choice?
129
- prod.alternatives.each do |alt|
130
- body = alt.members
131
- next if body.empty?
132
-
133
- head = prod.head
134
- head_follow = follow_sets[head]
135
- # trailer = Set.new
136
- last = true
137
- last_index = body.size - 1
138
- last_index.downto(0) do |i|
139
- symbol = body[i]
140
- next if symbol.terminal?
141
-
142
- follow_symbol = follow_sets[symbol]
143
- size_before = follow_symbol.size
144
- if last
145
- # Rule: if last non-terminal member (symbol) is nullable
146
- # then add FOLLOW(head) to FOLLOW(symbol)
147
- follow_sets[symbol].merge(head_follow) if symbol.nullable?
148
- last = false
149
- else
150
- symbol_seq = body.slice(i + 1, last_index - i)
151
- trailer_first = sequence_first(symbol_seq)
152
- contains_epsilon = trailer_first.include? epsilon
153
- trailer_first.delete(epsilon) if contains_epsilon
154
- follow_sets[symbol].merge(trailer_first)
155
- follow_sets[symbol].merge(head_follow) if contains_epsilon
156
- end
157
- changed = true if follow_sets[symbol].size > size_before
158
- end
159
- end
160
- else
161
- body = prod.body.members
124
+ prod.rhs.each do |alt|
125
+ body = alt.members
162
126
  next if body.empty?
163
127
 
164
128
  head = prod.head
@@ -189,7 +153,8 @@ module Dendroid
189
153
  end
190
154
  end
191
155
  end
192
- end until !changed
156
+ break unless changed
157
+ end
193
158
  end
194
159
 
195
160
  def initialize_follow_sets
@@ -4,9 +4,11 @@ require_relative 'item_set'
4
4
 
5
5
  module Dendroid
6
6
  module Recognizer
7
- # Also called a parse table.
8
- # Assuming that n == number of input tokens,
9
- # then the chart is an array with n + 1 entry sets.
7
+ # Also called a parse table. It records the progress of the
8
+ # Earley recognizer when it verifies the compliance of the input text
9
+ # to the language grammar rules.
10
+ # It essentially consists of an array of item sets.
11
+ # If n is the number of input tokens then the chart has n + 1 entry sets.
10
12
  class Chart
11
13
  extend Forwardable
12
14
 
@@ -17,10 +19,10 @@ module Dendroid
17
19
  attr_writer :success
18
20
 
19
21
  # @return [StandardError] The exception class in case of an error found by the recognizer
20
- attr_accessor :failure_class
22
+ attr_reader :failure_class
21
23
 
22
24
  # @return [String] The error message
23
- attr_accessor :failure_reason
25
+ attr_reader :failure_reason
24
26
 
25
27
  def_delegators :@item_sets, :[], :last, :size
26
28
 
@@ -33,7 +35,7 @@ module Dendroid
33
35
  end
34
36
 
35
37
  # Add a new empty item set at the end of the array of item sets
36
- def append_new_set()
38
+ def append_new_set
37
39
  item_sets << ItemSet.new
38
40
  end
39
41
 
@@ -48,6 +50,14 @@ module Dendroid
48
50
  def successful?
49
51
  @success
50
52
  end
53
+
54
+ # Set the error cause.
55
+ # @param exception_class [StandardError] Exception class
56
+ # @param message [String] Error message
57
+ def failure(exception_class, message)
58
+ @failure_class = exception_class
59
+ @failure_reason = message
60
+ end
51
61
  end # class
52
62
  end # module
53
63
  end # module
@@ -45,4 +45,3 @@ module Dendroid
45
45
  end # class
46
46
  end # module
47
47
  end # module
48
-
@@ -8,6 +8,7 @@ module Dendroid
8
8
 
9
9
  # @return [Recognizer::EItem]
10
10
  attr_reader :items
11
+
11
12
  def_delegators :@items, :clear, :each, :empty?, :select, :size
12
13
 
13
14
  def initialize
@@ -16,24 +16,30 @@ module Dendroid
16
16
  # @return [Object]
17
17
  attr_reader :tokenizer
18
18
 
19
+ # @param grammar [Dendroid::Syntax::Grammar]
20
+ # @param tokenizer [Object]
19
21
  def initialize(grammar, tokenizer)
20
22
  @grm_analysis = GrmAnalysis::GrmAnalyzer.new(grammar)
21
23
  @tokenizer = tokenizer
22
24
  end
23
25
 
26
+ # Try to read the `source` text and verify that it is syntactically correct.
27
+ # @param source [String] Input text to recognize
28
+ # @return [Dendroid::Recognizer::Chart]
24
29
  def run(source)
25
30
  tokenizer.input = source
26
31
  tok = tokenizer.next_token
27
32
  if tok.nil? && !grm_analysis.grammar.start_symbol.nullable?
28
33
  chart = new_chart
29
- chart.failure_class = StandardError
30
- chart.failure_reason = 'Error: Input may not be empty nor blank.'
34
+ chart.failure(StandardError, 'Error: Input may not be empty nor blank.')
31
35
  chart
32
36
  else
33
37
  earley_parse(tok)
34
38
  end
35
39
  end
36
40
 
41
+ # Run the Earley algorithm
42
+ # @param initial_token [Dendroid::Lexical::Token]
37
43
  def earley_parse(initial_token)
38
44
  chart = new_chart
39
45
  tokens = [initial_token]
@@ -42,7 +48,7 @@ module Dendroid
42
48
  rank = 0
43
49
 
44
50
  loop do
45
- eos_reached = advance_next_token(tokens, predicted_symbols) unless eos_reached
51
+ eos_reached ||= advance_next_token(tokens, predicted_symbols)
46
52
 
47
53
  advance = false
48
54
  curr_rank = rank
@@ -55,7 +61,7 @@ module Dendroid
55
61
 
56
62
  rank += 1 if advance
57
63
  break if eos_reached && !advance
58
- break if ! advance
64
+ break unless advance
59
65
  end
60
66
 
61
67
  determine_outcome(chart, tokens)
@@ -67,13 +73,10 @@ module Dendroid
67
73
  def new_chart
68
74
  top_symbol = grm_analysis.grammar.start_symbol
69
75
 
70
- # Reminder: there might be multiple rules for the start symbol
71
- prods = grm_analysis.grammar.nonterm2productions[top_symbol]
76
+ prd = grm_analysis.grammar.nonterm2production[top_symbol]
72
77
  chart = Chart.new
73
- prods.each do |prd|
74
- seed_items = prd.predicted_items
75
- seed_items.each { |item| chart.seed_last_set(EItem.new(item, 0)) }
76
- end
78
+ seed_items = prd.predicted_items
79
+ seed_items.each { |item| chart.seed_last_set(EItem.new(item, 0)) }
77
80
 
78
81
  chart
79
82
  end
@@ -96,25 +99,22 @@ module Dendroid
96
99
 
97
100
  if entry.completed?
98
101
  completer(chart, entry, rank, tokens, mode)
102
+ elsif entry.next_symbol.terminal?
103
+ advance = scanner(chart, entry, rank, tokens)
99
104
  else
100
- if entry.next_symbol.terminal?
101
- advance = scanner(chart, entry, rank, tokens)
102
- else
103
- predictor(chart, entry, rank, tokens, mode, predicted_symbols)
104
- end
105
+ predictor(chart, entry, rank, tokens, mode, predicted_symbols)
105
106
  end
106
107
 
107
108
  advance
108
109
  end
109
- =begin
110
- procedure PREDICTOR((A → α•Bβ, j), k)
111
- for each (B → γ) in GRAMMAR_RULES_FOR(B) do
112
- ADD_TO_SET((B → •γ, k), S[k])
113
- end
114
- Assuming next symbol is a non-terminal
115
110
 
116
- Error case: next actual token matches none of the expected tokens.
117
- =end
111
+ # procedure PREDICTOR((A → α•Bβ, j), k)
112
+ # for each (B → γ) in GRAMMAR_RULES_FOR(B) do
113
+ # ADD_TO_SET((B → •γ, k), S[k])
114
+ # end
115
+ # Assuming next symbol is a non-terminal
116
+ #
117
+ # Error case: next actual token matches none of the expected tokens.
118
118
  def predictor(chart, item, rank, tokens, mode, predicted_symbols)
119
119
  next_symbol = item.next_symbol
120
120
  if mode == :genuine
@@ -125,40 +125,35 @@ module Dendroid
125
125
  predicted.add(next_symbol)
126
126
  end
127
127
 
128
- prods = grm_analysis.symbol2productions[next_symbol]
129
128
  curr_set = chart[rank]
130
129
  next_token = tokens[rank]
131
- prods.each do |prd|
132
- entry_items = prd.predicted_items
133
- entry_items.each do |entry|
134
- member = entry.next_symbol
135
- if member&.terminal?
136
- next unless next_token
137
- next if (member.name != next_token.terminal) && mode == :genuine
138
- end
139
-
140
- new_item = EItem.new(entry, rank)
141
- curr_set.add_item(new_item)
130
+ prd = grm_analysis.symbol2production(next_symbol)
131
+ entry_items = prd.predicted_items
132
+ entry_items.each do |entry|
133
+ member = entry.next_symbol
134
+ if member&.terminal?
135
+ next unless next_token
136
+ next if (member.name != next_token.terminal) && mode == :genuine
142
137
  end
143
- end
144
138
 
145
- # Use trick from paper John Aycock and R. Nigel Horspool: "Practical Earley Parsing"
146
- if next_symbol.nullable?
147
- next_item = grm_analysis.next_item(item.dotted_item)
148
- if next_item
149
- new_item = EItem.new(next_item, item.origin)
150
- curr_set.add_item(new_item)
151
- end
139
+ new_item = EItem.new(entry, rank)
140
+ curr_set.add_item(new_item)
152
141
  end
142
+ # Use trick from paper John Aycock and R. Nigel Horspool: "Practical Earley Parsing"
143
+ return unless next_symbol.nullable?
144
+
145
+ next_item = grm_analysis.next_item(item.dotted_item)
146
+ return unless next_item
147
+
148
+ new_item = EItem.new(next_item, item.origin)
149
+ curr_set.add_item(new_item)
153
150
  end
154
151
 
155
- =begin
156
- procedure SCANNER((A α•aβ, j), k, words)
157
- if j < LENGTH(words) and a PARTS_OF_SPEECH(words[k]) then
158
- ADD_TO_SET((A → αa•β, j), S[k+1])
159
- end
160
- Assuming next symbol is a terminal
161
- =end
152
+ # procedure SCANNER((A → α•aβ, j), k, words)
153
+ # if j < LENGTH(words) and a ∈ PARTS_OF_SPEECH(words[k]) then
154
+ # ADD_TO_SET((A → αa•β, j), S[k+1])
155
+ # end
156
+ # Assuming next symbol is a terminal
162
157
  def scanner(chart, scan_item, rank, tokens)
163
158
  advance = false
164
159
  dit = scan_item.dotted_item
@@ -174,12 +169,10 @@ module Dendroid
174
169
  advance
175
170
  end
176
171
 
177
- =begin
178
- procedure COMPLETER((Bγ•, x), k)
179
- for each (A → α•Bβ, j) in S[x] do
180
- ADD_TO_SET((A → αB•β, j), S[k])
181
- end
182
- =end
172
+ # procedure COMPLETER((B → γ•, x), k)
173
+ # for each (A → α•Bβ, j) in S[x] do
174
+ # ADD_TO_SET((A → αB•β, j), S[k])
175
+ # end
183
176
  def completer(chart, item, rank, tokens, mode)
184
177
  origin = item.origin
185
178
 
@@ -190,6 +183,7 @@ module Dendroid
190
183
  callers.each do |call_item|
191
184
  return_item = grm_analysis.next_item(call_item.dotted_item)
192
185
  next unless return_item
186
+
193
187
  member = return_item.next_symbol
194
188
  if member&.terminal? && (mode == :genuine)
195
189
  next unless next_token
@@ -216,18 +210,17 @@ module Dendroid
216
210
  success = false
217
211
  if chart.size == tokens.size + 1
218
212
  top_symbol = grm_analysis.grammar.start_symbol
219
- top_rules = grm_analysis.grammar.nonterm2productions[top_symbol]
220
- final_items = top_rules.reduce([]) do |items, rule|
221
- items.concat(rule.reduce_items)
222
- end
213
+ top_rule = grm_analysis.grammar.nonterm2production[top_symbol]
214
+ final_items = top_rule.reduce_items
223
215
  last_set = chart.item_sets.last
224
216
  last_set.each do |entry|
225
- next if ((!entry.origin.zero?) || ! final_items.include?(entry.dotted_item))
217
+ next if !entry.origin.zero? || !final_items.include?(entry.dotted_item)
218
+
226
219
  success = true
227
220
  end
228
221
  end
229
222
 
230
- if !success
223
+ unless success
231
224
  # Error detected...
232
225
  replay_last_set(chart, tokens)
233
226
  if chart.size < tokens.size + 1
@@ -235,39 +228,37 @@ module Dendroid
235
228
  offending_token = tokens[chart.size - 1]
236
229
  pos = offending_token.position
237
230
  (line, col) = [pos.lineno, pos.column]
238
- last_set = chart.last
239
- terminals = last_set.items.reduce([]) do |result, ent|
240
- result << ent.next_symbol if ent.pre_scan?
241
- result
242
- end
243
- terminals.uniq!
231
+ terminals = expected_terminals(chart)
244
232
  prefix = "Syntax error at or near token line #{line}, column #{col} >>>#{offending_token.source}<<<"
245
- expectation = terminals.size == 1 ? "#{terminals[0].name}" : "one of: [#{terminals.map(&:name).join(', ')}]"
233
+ expectation = terminals.size == 1 ? terminals[0].name.to_s : "one of: [#{terminals.map(&:name).join(', ')}]"
246
234
  err_msg = "#{prefix} Expected #{expectation}, found a #{offending_token.terminal} instead."
247
- chart.failure_class = StandardError
248
- chart.failure_reason = err_msg
235
+ chart.failure(StandardError, err_msg)
249
236
  elsif chart.size == tokens.size + 1
250
237
  # EOS unexpected...
251
238
  last_token = tokens.last
252
239
  pos = last_token.position
253
240
  (line, col) = [pos.lineno, pos.column]
254
- last_set = chart.last
255
- terminals = last_set.items.reduce([]) do |result, ent|
256
- result << ent.next_symbol if ent.pre_scan?
257
- result
258
- end
259
- terminals.uniq!
260
-
241
+ terminals = expected_terminals(chart)
261
242
  prefix = "Line #{line}, column #{col}: Premature end of input after '#{last_token.source}'"
262
- expectation = terminals.size == 1 ? "#{terminals[0].name}" : "one of: [#{terminals.map(&:name).join(', ')}]"
243
+ expectation = terminals.size == 1 ? terminals[0].name.to_s : "one of: [#{terminals.map(&:name).join(', ')}]"
263
244
  err_msg = "#{prefix}, expected: #{expectation}."
264
- chart.failure_class = StandardError
265
- chart.failure_reason = err_msg
245
+ chart.failure(StandardError, err_msg)
266
246
  end
267
247
  end
268
248
  chart.success = success
269
249
  end
270
250
 
251
+ def expected_terminals(chart)
252
+ last_set = chart.last
253
+ terminals = last_set.items.reduce([]) do |result, ent|
254
+ result << ent.next_symbol if ent.pre_scan?
255
+ result
256
+ end
257
+ terminals.uniq!
258
+
259
+ terminals
260
+ end
261
+
271
262
  def replay_last_set(chart, tokens)
272
263
  rank = chart.size - 1
273
264
  seed_set(chart, rank) # Re-initialize last set with scan entries
@@ -279,4 +270,4 @@ module Dendroid
279
270
  end
280
271
  end # class
281
272
  end # module
282
- end # module
273
+ end # module