dendroid 0.0.11 → 0.1.00
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -1
- data/CHANGELOG.md +10 -0
- data/lib/dendroid/grm_analysis/grm_analyzer.rb +22 -57
- data/lib/dendroid/recognizer/chart.rb +16 -6
- data/lib/dendroid/recognizer/e_item.rb +0 -1
- data/lib/dendroid/recognizer/item_set.rb +1 -0
- data/lib/dendroid/recognizer/recognizer.rb +73 -82
- data/lib/dendroid/syntax/grammar.rb +71 -59
- data/spec/dendroid/recognizer/chart_spec.rb +0 -1
- data/spec/dendroid/recognizer/e_item_spec.rb +4 -0
- data/spec/dendroid/recognizer/item_set_spec.rb +1 -1
- data/spec/dendroid/recognizer/recognizer_spec.rb +594 -19
- data/spec/dendroid/support/sample_grammars.rb +249 -6
- data/spec/dendroid/syntax/grammar_spec.rb +165 -5
- data/version.txt +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 722b27a6f20e87c43de339b3f0c45e2bcc77c464d5dd9ecd56bbb686c4857b61
|
4
|
+
data.tar.gz: ce6ffd0c100ea7b7c336044e2877617eebb85c7bb306d5de9d1d4395200320aa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 69870ade1f77e7fe0b9faf20b7943a500abdf2b41d383a4e048438e431e6f65bf4b418806ec4ba325a6839ee4eb1337085772fe7fa5c594b59663cd653cdeac6
|
7
|
+
data.tar.gz: 137bbf46a71dcb603f3866f51b5be25a8557fb574937e2e1b3f8b40980cd6e7fd6a851c84b229b7b7ecb692d2899d721e3bcc041fe09e213318f46efdb041ea7
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,16 @@
|
|
2
2
|
|
3
3
|
## [Unreleased]
|
4
4
|
|
5
|
+
## [0.1.00] - 2023-11-03
|
6
|
+
Version bump: the Earley recognizer is functional.
|
7
|
+
|
8
|
+
## [0.0.12] - 2023-11-02
|
9
|
+
Added more tests.
|
10
|
+
|
11
|
+
### Added
|
12
|
+
- Added more tests to spec file of `Grammar` class.
|
13
|
+
- Added more tests to spec file of `Recognizer` class.
|
14
|
+
|
5
15
|
## [0.0.11] - 2023-11-02
|
6
16
|
Added Earley recognizer and its ancillary classes.
|
7
17
|
|
@@ -13,7 +13,6 @@ module Dendroid
|
|
13
13
|
attr_reader :grammar
|
14
14
|
attr_reader :items
|
15
15
|
attr_reader :production2items
|
16
|
-
attr_reader :symbol2productions
|
17
16
|
|
18
17
|
# @return [Dendroid::Syntax::Terminal] The pseudo-terminal `__epsilon` (for empty string)
|
19
18
|
attr_reader :epsilon
|
@@ -37,7 +36,6 @@ module Dendroid
|
|
37
36
|
@grammar = aGrammar
|
38
37
|
@items = []
|
39
38
|
@production2items = {}
|
40
|
-
@symbol2productions = {}
|
41
39
|
@epsilon = Syntax::Terminal.new(:__epsilon)
|
42
40
|
@endmarker = Syntax::Terminal.new(:"$$")
|
43
41
|
@first_sets = {}
|
@@ -56,14 +54,14 @@ module Dendroid
|
|
56
54
|
prod.next_item(aDottedItem)
|
57
55
|
end
|
58
56
|
|
57
|
+
def symbol2production(sym)
|
58
|
+
grammar.nonterm2production[sym]
|
59
|
+
end
|
60
|
+
|
59
61
|
private
|
60
62
|
|
61
63
|
def build_dotted_items
|
62
64
|
grammar.rules.each do |prod|
|
63
|
-
lhs = prod.head
|
64
|
-
symbol2productions[lhs] = [] unless symbol2productions.include? lhs
|
65
|
-
symbol2productions[lhs] << prod
|
66
|
-
# production2items[prod] = []
|
67
65
|
mixin = prod.choice? ? ChoiceItems : ProductionItems
|
68
66
|
prod.extend(mixin)
|
69
67
|
prod.build_items
|
@@ -76,33 +74,31 @@ module Dendroid
|
|
76
74
|
def build_first_sets
|
77
75
|
initialize_first_sets
|
78
76
|
|
79
|
-
|
77
|
+
loop do
|
80
78
|
changed = false
|
81
79
|
grammar.rules.each do |prod|
|
82
80
|
head = prod.head
|
83
81
|
first_head = first_sets[head]
|
84
82
|
pre_first_size = first_head.size
|
85
|
-
|
86
|
-
|
87
|
-
first_head.merge(sequence_first(alt.members))
|
88
|
-
end
|
89
|
-
else
|
90
|
-
first_head.merge(sequence_first(prod.body.members))
|
83
|
+
prod.rhs.each do |seq|
|
84
|
+
first_head.merge(sequence_first(seq.members))
|
91
85
|
end
|
92
86
|
changed = true if first_head.size > pre_first_size
|
93
87
|
end
|
94
|
-
|
88
|
+
break unless changed
|
89
|
+
end
|
95
90
|
end
|
96
91
|
|
97
92
|
def initialize_first_sets
|
98
93
|
grammar.symbols.each do |symb|
|
99
|
-
if symb.terminal?
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
94
|
+
set_arg = if symb.terminal?
|
95
|
+
[symb]
|
96
|
+
elsif symb.nullable?
|
97
|
+
[epsilon]
|
98
|
+
else
|
99
|
+
nil
|
100
|
+
end
|
101
|
+
first_sets[symb] = Set.new(set_arg)
|
106
102
|
end
|
107
103
|
end
|
108
104
|
|
@@ -122,43 +118,11 @@ module Dendroid
|
|
122
118
|
def build_follow_sets
|
123
119
|
initialize_follow_sets
|
124
120
|
|
125
|
-
|
121
|
+
loop do
|
126
122
|
changed = false
|
127
123
|
grammar.rules.each do |prod|
|
128
|
-
|
129
|
-
|
130
|
-
body = alt.members
|
131
|
-
next if body.empty?
|
132
|
-
|
133
|
-
head = prod.head
|
134
|
-
head_follow = follow_sets[head]
|
135
|
-
# trailer = Set.new
|
136
|
-
last = true
|
137
|
-
last_index = body.size - 1
|
138
|
-
last_index.downto(0) do |i|
|
139
|
-
symbol = body[i]
|
140
|
-
next if symbol.terminal?
|
141
|
-
|
142
|
-
follow_symbol = follow_sets[symbol]
|
143
|
-
size_before = follow_symbol.size
|
144
|
-
if last
|
145
|
-
# Rule: if last non-terminal member (symbol) is nullable
|
146
|
-
# then add FOLLOW(head) to FOLLOW(symbol)
|
147
|
-
follow_sets[symbol].merge(head_follow) if symbol.nullable?
|
148
|
-
last = false
|
149
|
-
else
|
150
|
-
symbol_seq = body.slice(i + 1, last_index - i)
|
151
|
-
trailer_first = sequence_first(symbol_seq)
|
152
|
-
contains_epsilon = trailer_first.include? epsilon
|
153
|
-
trailer_first.delete(epsilon) if contains_epsilon
|
154
|
-
follow_sets[symbol].merge(trailer_first)
|
155
|
-
follow_sets[symbol].merge(head_follow) if contains_epsilon
|
156
|
-
end
|
157
|
-
changed = true if follow_sets[symbol].size > size_before
|
158
|
-
end
|
159
|
-
end
|
160
|
-
else
|
161
|
-
body = prod.body.members
|
124
|
+
prod.rhs.each do |alt|
|
125
|
+
body = alt.members
|
162
126
|
next if body.empty?
|
163
127
|
|
164
128
|
head = prod.head
|
@@ -189,7 +153,8 @@ module Dendroid
|
|
189
153
|
end
|
190
154
|
end
|
191
155
|
end
|
192
|
-
|
156
|
+
break unless changed
|
157
|
+
end
|
193
158
|
end
|
194
159
|
|
195
160
|
def initialize_follow_sets
|
@@ -4,9 +4,11 @@ require_relative 'item_set'
|
|
4
4
|
|
5
5
|
module Dendroid
|
6
6
|
module Recognizer
|
7
|
-
# Also called a parse table.
|
8
|
-
#
|
9
|
-
#
|
7
|
+
# Also called a parse table. It records the progress of the
|
8
|
+
# Earley recognizer when it verifies the compliance of the input text
|
9
|
+
# to the language grammar rules.
|
10
|
+
# It essentially consists of an array of item sets.
|
11
|
+
# If n is the number of input tokens then the chart has n + 1 entry sets.
|
10
12
|
class Chart
|
11
13
|
extend Forwardable
|
12
14
|
|
@@ -17,10 +19,10 @@ module Dendroid
|
|
17
19
|
attr_writer :success
|
18
20
|
|
19
21
|
# @return [StandardError] The exception class in case of an error found by the recognizer
|
20
|
-
|
22
|
+
attr_reader :failure_class
|
21
23
|
|
22
24
|
# @return [String] The error message
|
23
|
-
|
25
|
+
attr_reader :failure_reason
|
24
26
|
|
25
27
|
def_delegators :@item_sets, :[], :last, :size
|
26
28
|
|
@@ -33,7 +35,7 @@ module Dendroid
|
|
33
35
|
end
|
34
36
|
|
35
37
|
# Add a new empty item set at the end of the array of item sets
|
36
|
-
def append_new_set
|
38
|
+
def append_new_set
|
37
39
|
item_sets << ItemSet.new
|
38
40
|
end
|
39
41
|
|
@@ -48,6 +50,14 @@ module Dendroid
|
|
48
50
|
def successful?
|
49
51
|
@success
|
50
52
|
end
|
53
|
+
|
54
|
+
# Set the error cause.
|
55
|
+
# @param exception_class [StandardError] Exception class
|
56
|
+
# @param message [String] Error message
|
57
|
+
def failure(exception_class, message)
|
58
|
+
@failure_class = exception_class
|
59
|
+
@failure_reason = message
|
60
|
+
end
|
51
61
|
end # class
|
52
62
|
end # module
|
53
63
|
end # module
|
@@ -16,24 +16,30 @@ module Dendroid
|
|
16
16
|
# @return [Object]
|
17
17
|
attr_reader :tokenizer
|
18
18
|
|
19
|
+
# @param grammar [Dendroid::Syntax::Grammar]
|
20
|
+
# @param tokenizer [Object]
|
19
21
|
def initialize(grammar, tokenizer)
|
20
22
|
@grm_analysis = GrmAnalysis::GrmAnalyzer.new(grammar)
|
21
23
|
@tokenizer = tokenizer
|
22
24
|
end
|
23
25
|
|
26
|
+
# Try to read the `source` text and verify that it is syntactically correct.
|
27
|
+
# @param source [String] Input text to recognize
|
28
|
+
# @return [Dendroid::Recognizer::Chart]
|
24
29
|
def run(source)
|
25
30
|
tokenizer.input = source
|
26
31
|
tok = tokenizer.next_token
|
27
32
|
if tok.nil? && !grm_analysis.grammar.start_symbol.nullable?
|
28
33
|
chart = new_chart
|
29
|
-
chart.
|
30
|
-
chart.failure_reason = 'Error: Input may not be empty nor blank.'
|
34
|
+
chart.failure(StandardError, 'Error: Input may not be empty nor blank.')
|
31
35
|
chart
|
32
36
|
else
|
33
37
|
earley_parse(tok)
|
34
38
|
end
|
35
39
|
end
|
36
40
|
|
41
|
+
# Run the Earley algorithm
|
42
|
+
# @param initial_token [Dendroid::Lexical::Token]
|
37
43
|
def earley_parse(initial_token)
|
38
44
|
chart = new_chart
|
39
45
|
tokens = [initial_token]
|
@@ -42,7 +48,7 @@ module Dendroid
|
|
42
48
|
rank = 0
|
43
49
|
|
44
50
|
loop do
|
45
|
-
eos_reached
|
51
|
+
eos_reached ||= advance_next_token(tokens, predicted_symbols)
|
46
52
|
|
47
53
|
advance = false
|
48
54
|
curr_rank = rank
|
@@ -55,7 +61,7 @@ module Dendroid
|
|
55
61
|
|
56
62
|
rank += 1 if advance
|
57
63
|
break if eos_reached && !advance
|
58
|
-
break
|
64
|
+
break unless advance
|
59
65
|
end
|
60
66
|
|
61
67
|
determine_outcome(chart, tokens)
|
@@ -67,13 +73,10 @@ module Dendroid
|
|
67
73
|
def new_chart
|
68
74
|
top_symbol = grm_analysis.grammar.start_symbol
|
69
75
|
|
70
|
-
|
71
|
-
prods = grm_analysis.grammar.nonterm2productions[top_symbol]
|
76
|
+
prd = grm_analysis.grammar.nonterm2production[top_symbol]
|
72
77
|
chart = Chart.new
|
73
|
-
|
74
|
-
|
75
|
-
seed_items.each { |item| chart.seed_last_set(EItem.new(item, 0)) }
|
76
|
-
end
|
78
|
+
seed_items = prd.predicted_items
|
79
|
+
seed_items.each { |item| chart.seed_last_set(EItem.new(item, 0)) }
|
77
80
|
|
78
81
|
chart
|
79
82
|
end
|
@@ -96,25 +99,22 @@ module Dendroid
|
|
96
99
|
|
97
100
|
if entry.completed?
|
98
101
|
completer(chart, entry, rank, tokens, mode)
|
102
|
+
elsif entry.next_symbol.terminal?
|
103
|
+
advance = scanner(chart, entry, rank, tokens)
|
99
104
|
else
|
100
|
-
|
101
|
-
advance = scanner(chart, entry, rank, tokens)
|
102
|
-
else
|
103
|
-
predictor(chart, entry, rank, tokens, mode, predicted_symbols)
|
104
|
-
end
|
105
|
+
predictor(chart, entry, rank, tokens, mode, predicted_symbols)
|
105
106
|
end
|
106
107
|
|
107
108
|
advance
|
108
109
|
end
|
109
|
-
=begin
|
110
|
-
procedure PREDICTOR((A → α•Bβ, j), k)
|
111
|
-
for each (B → γ) in GRAMMAR_RULES_FOR(B) do
|
112
|
-
ADD_TO_SET((B → •γ, k), S[k])
|
113
|
-
end
|
114
|
-
Assuming next symbol is a non-terminal
|
115
110
|
|
116
|
-
|
117
|
-
|
111
|
+
# procedure PREDICTOR((A → α•Bβ, j), k)
|
112
|
+
# for each (B → γ) in GRAMMAR_RULES_FOR(B) do
|
113
|
+
# ADD_TO_SET((B → •γ, k), S[k])
|
114
|
+
# end
|
115
|
+
# Assuming next symbol is a non-terminal
|
116
|
+
#
|
117
|
+
# Error case: next actual token matches none of the expected tokens.
|
118
118
|
def predictor(chart, item, rank, tokens, mode, predicted_symbols)
|
119
119
|
next_symbol = item.next_symbol
|
120
120
|
if mode == :genuine
|
@@ -125,40 +125,35 @@ module Dendroid
|
|
125
125
|
predicted.add(next_symbol)
|
126
126
|
end
|
127
127
|
|
128
|
-
prods = grm_analysis.symbol2productions[next_symbol]
|
129
128
|
curr_set = chart[rank]
|
130
129
|
next_token = tokens[rank]
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
end
|
139
|
-
|
140
|
-
new_item = EItem.new(entry, rank)
|
141
|
-
curr_set.add_item(new_item)
|
130
|
+
prd = grm_analysis.symbol2production(next_symbol)
|
131
|
+
entry_items = prd.predicted_items
|
132
|
+
entry_items.each do |entry|
|
133
|
+
member = entry.next_symbol
|
134
|
+
if member&.terminal?
|
135
|
+
next unless next_token
|
136
|
+
next if (member.name != next_token.terminal) && mode == :genuine
|
142
137
|
end
|
143
|
-
end
|
144
138
|
|
145
|
-
|
146
|
-
|
147
|
-
next_item = grm_analysis.next_item(item.dotted_item)
|
148
|
-
if next_item
|
149
|
-
new_item = EItem.new(next_item, item.origin)
|
150
|
-
curr_set.add_item(new_item)
|
151
|
-
end
|
139
|
+
new_item = EItem.new(entry, rank)
|
140
|
+
curr_set.add_item(new_item)
|
152
141
|
end
|
142
|
+
# Use the trick from the paper by John Aycock and R. Nigel Horspool: "Practical Earley Parsing"
|
143
|
+
return unless next_symbol.nullable?
|
144
|
+
|
145
|
+
next_item = grm_analysis.next_item(item.dotted_item)
|
146
|
+
return unless next_item
|
147
|
+
|
148
|
+
new_item = EItem.new(next_item, item.origin)
|
149
|
+
curr_set.add_item(new_item)
|
153
150
|
end
|
154
151
|
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
Assuming next symbol is a terminal
|
161
|
-
=end
|
152
|
+
# procedure SCANNER((A → α•aβ, j), k, words)
|
153
|
+
# if j < LENGTH(words) and a ⊂ PARTS_OF_SPEECH(words[k]) then
|
154
|
+
# ADD_TO_SET((A → αa•β, j), S[k+1])
|
155
|
+
# end
|
156
|
+
# Assuming next symbol is a terminal
|
162
157
|
def scanner(chart, scan_item, rank, tokens)
|
163
158
|
advance = false
|
164
159
|
dit = scan_item.dotted_item
|
@@ -174,12 +169,10 @@ module Dendroid
|
|
174
169
|
advance
|
175
170
|
end
|
176
171
|
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
end
|
182
|
-
=end
|
172
|
+
# procedure COMPLETER((B → γ•, x), k)
|
173
|
+
# for each (A → α•Bβ, j) in S[x] do
|
174
|
+
# ADD_TO_SET((A → αB•β, j), S[k])
|
175
|
+
# end
|
183
176
|
def completer(chart, item, rank, tokens, mode)
|
184
177
|
origin = item.origin
|
185
178
|
|
@@ -190,6 +183,7 @@ module Dendroid
|
|
190
183
|
callers.each do |call_item|
|
191
184
|
return_item = grm_analysis.next_item(call_item.dotted_item)
|
192
185
|
next unless return_item
|
186
|
+
|
193
187
|
member = return_item.next_symbol
|
194
188
|
if member&.terminal? && (mode == :genuine)
|
195
189
|
next unless next_token
|
@@ -216,18 +210,17 @@ module Dendroid
|
|
216
210
|
success = false
|
217
211
|
if chart.size == tokens.size + 1
|
218
212
|
top_symbol = grm_analysis.grammar.start_symbol
|
219
|
-
|
220
|
-
final_items =
|
221
|
-
items.concat(rule.reduce_items)
|
222
|
-
end
|
213
|
+
top_rule = grm_analysis.grammar.nonterm2production[top_symbol]
|
214
|
+
final_items = top_rule.reduce_items
|
223
215
|
last_set = chart.item_sets.last
|
224
216
|
last_set.each do |entry|
|
225
|
-
next if
|
217
|
+
next if !entry.origin.zero? || !final_items.include?(entry.dotted_item)
|
218
|
+
|
226
219
|
success = true
|
227
220
|
end
|
228
221
|
end
|
229
222
|
|
230
|
-
|
223
|
+
unless success
|
231
224
|
# Error detected...
|
232
225
|
replay_last_set(chart, tokens)
|
233
226
|
if chart.size < tokens.size + 1
|
@@ -235,39 +228,37 @@ module Dendroid
|
|
235
228
|
offending_token = tokens[chart.size - 1]
|
236
229
|
pos = offending_token.position
|
237
230
|
(line, col) = [pos.lineno, pos.column]
|
238
|
-
|
239
|
-
terminals = last_set.items.reduce([]) do |result, ent|
|
240
|
-
result << ent.next_symbol if ent.pre_scan?
|
241
|
-
result
|
242
|
-
end
|
243
|
-
terminals.uniq!
|
231
|
+
terminals = expected_terminals(chart)
|
244
232
|
prefix = "Syntax error at or near token line #{line}, column #{col} >>>#{offending_token.source}<<<"
|
245
|
-
expectation = terminals.size == 1 ?
|
233
|
+
expectation = terminals.size == 1 ? terminals[0].name.to_s : "one of: [#{terminals.map(&:name).join(', ')}]"
|
246
234
|
err_msg = "#{prefix} Expected #{expectation}, found a #{offending_token.terminal} instead."
|
247
|
-
chart.
|
248
|
-
chart.failure_reason = err_msg
|
235
|
+
chart.failure(StandardError, err_msg)
|
249
236
|
elsif chart.size == tokens.size + 1
|
250
237
|
# EOS unexpected...
|
251
238
|
last_token = tokens.last
|
252
239
|
pos = last_token.position
|
253
240
|
(line, col) = [pos.lineno, pos.column]
|
254
|
-
|
255
|
-
terminals = last_set.items.reduce([]) do |result, ent|
|
256
|
-
result << ent.next_symbol if ent.pre_scan?
|
257
|
-
result
|
258
|
-
end
|
259
|
-
terminals.uniq!
|
260
|
-
|
241
|
+
terminals = expected_terminals(chart)
|
261
242
|
prefix = "Line #{line}, column #{col}: Premature end of input after '#{last_token.source}'"
|
262
|
-
expectation = terminals.size == 1 ?
|
243
|
+
expectation = terminals.size == 1 ? terminals[0].name.to_s : "one of: [#{terminals.map(&:name).join(', ')}]"
|
263
244
|
err_msg = "#{prefix}, expected: #{expectation}."
|
264
|
-
chart.
|
265
|
-
chart.failure_reason = err_msg
|
245
|
+
chart.failure(StandardError, err_msg)
|
266
246
|
end
|
267
247
|
end
|
268
248
|
chart.success = success
|
269
249
|
end
|
270
250
|
|
251
|
+
def expected_terminals(chart)
|
252
|
+
last_set = chart.last
|
253
|
+
terminals = last_set.items.reduce([]) do |result, ent|
|
254
|
+
result << ent.next_symbol if ent.pre_scan?
|
255
|
+
result
|
256
|
+
end
|
257
|
+
terminals.uniq!
|
258
|
+
|
259
|
+
terminals
|
260
|
+
end
|
261
|
+
|
271
262
|
def replay_last_set(chart, tokens)
|
272
263
|
rank = chart.size - 1
|
273
264
|
seed_set(chart, rank) # Re-initialize last set with scan entries
|
@@ -279,4 +270,4 @@ module Dendroid
|
|
279
270
|
end
|
280
271
|
end # class
|
281
272
|
end # module
|
282
|
-
end # module
|
273
|
+
end # module
|