dhaka 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/evaluator/evaluator.rb +18 -17
- data/lib/grammar/grammar.rb +4 -5
- data/lib/lexer/dfa.rb +63 -13
- data/lib/lexer/lexeme.rb +3 -4
- data/lib/lexer/lexer.rb +12 -3
- data/lib/lexer/lexer_run.rb +22 -10
- data/lib/lexer/regex_grammar.rb +88 -14
- data/lib/lexer/regex_parser.rb +1523 -1401
- data/lib/lexer/specification.rb +29 -3
- data/lib/lexer/state.rb +32 -9
- data/lib/lexer/state_machine.rb +2 -2
- data/lib/parser/channel.rb +4 -4
- data/lib/parser/parser.rb +17 -12
- data/lib/parser/parser_state.rb +3 -1
- data/test/chittagong/chittagong_lexer.rb +63 -63
- data/test/chittagong/chittagong_lexer.rb.rej +189 -0
- data/test/chittagong/chittagong_lexer_specification.rb +6 -8
- data/test/chittagong/chittagong_parser.rb +659 -659
- data/test/chittagong/chittagong_parser.rb.rej +1623 -0
- data/test/{another_lalr_but_not_slr_grammar.rb → core/another_lalr_but_not_slr_grammar.rb} +1 -1
- data/test/{compiled_parser_test.rb → core/compiled_parser_test.rb} +1 -1
- data/test/core/dfa_test.rb +170 -0
- data/test/{evaluator_test.rb → core/evaluator_test.rb} +3 -3
- data/test/{grammar_test.rb → core/grammar_test.rb} +3 -3
- data/test/{lalr_but_not_slr_grammar.rb → core/lalr_but_not_slr_grammar.rb} +0 -0
- data/test/core/lexer_test.rb +139 -0
- data/test/{malformed_grammar.rb → core/malformed_grammar.rb} +0 -0
- data/test/{malformed_grammar_test.rb → core/malformed_grammar_test.rb} +1 -1
- data/test/{nullable_grammar.rb → core/nullable_grammar.rb} +0 -0
- data/test/{parse_result_test.rb → core/parse_result_test.rb} +1 -1
- data/test/{parser_state_test.rb → core/parser_state_test.rb} +1 -1
- data/test/{parser_test.rb → core/parser_test.rb} +2 -2
- data/test/{precedence_grammar.rb → core/precedence_grammar.rb} +0 -0
- data/test/{precedence_grammar_test.rb → core/precedence_grammar_test.rb} +1 -1
- data/test/{rr_conflict_grammar.rb → core/rr_conflict_grammar.rb} +0 -0
- data/test/{simple_grammar.rb → core/simple_grammar.rb} +0 -0
- data/test/{sr_conflict_grammar.rb → core/sr_conflict_grammar.rb} +0 -0
- metadata +25 -22
- data/test/lexer_test.rb +0 -215
data/lib/evaluator/evaluator.rb
CHANGED
@@ -55,24 +55,24 @@ module Dhaka
|
|
55
55
|
#
|
56
56
|
# end
|
57
57
|
|
58
|
-
class Evaluator
|
58
|
+
class Evaluator < SimpleDelegator
|
59
59
|
class << self
|
60
|
-
#
|
61
|
-
#
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
65
|
-
#
|
66
|
-
#
|
67
|
-
# continue.
|
68
|
-
def define_evaluation_rules
|
60
|
+
# Define evaluation rules within a block passed to this method. The evaluator will define
|
61
|
+
# default evaluation rules for pass-through productions (i.e. productions with expansions
|
62
|
+
# consisting of exactly one grammar symbol). The default evaluation rule for such productions
|
63
|
+
# is to simply return the result of calling +evaluate+ on the unique child node. Setting the
|
64
|
+
# <tt>:raise_error</tt> option to true tells the evaluator to throw an exception if you neglect
|
65
|
+
# to define a rule for a non-pass-through production (one where the expansion consists of
|
66
|
+
# multiple symbols), listing all the productions that absolutely need to be defined before you
|
67
|
+
# can continue.
|
68
|
+
def define_evaluation_rules(options = {})
|
69
69
|
yield
|
70
|
-
check_definitions
|
70
|
+
check_definitions(options)
|
71
71
|
end
|
72
72
|
|
73
73
|
private
|
74
74
|
|
75
|
-
def check_definitions
|
75
|
+
def check_definitions(options)
|
76
76
|
filter = lambda {|productions| productions.map {|production| production.name} - actions}
|
77
77
|
pass_through_productions_without_rules = filter[grammar.productions.select {|production| production.expansion.size == 1}]
|
78
78
|
pass_through_productions_without_rules.each do |rule_name|
|
@@ -81,7 +81,7 @@ module Dhaka
|
|
81
81
|
end
|
82
82
|
end
|
83
83
|
non_trivial_productions_with_rules_undefined = filter[grammar.productions.select {|production| production.expansion.size != 1}]
|
84
|
-
raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty?
|
84
|
+
raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty? || !options[:raise_error]
|
85
85
|
end
|
86
86
|
|
87
87
|
def inherited(evaluator)
|
@@ -107,15 +107,16 @@ module Dhaka
|
|
107
107
|
# Evaluate a parse tree node.
|
108
108
|
def evaluate node
|
109
109
|
@node_stack ||= []
|
110
|
-
@node_stack << node
|
110
|
+
@node_stack << node
|
111
|
+
__setobj__(@node_stack.last)
|
111
112
|
result = send(node.production.name)
|
112
113
|
@node_stack.pop
|
114
|
+
__setobj__(@node_stack.last)
|
113
115
|
result
|
114
116
|
end
|
115
117
|
|
116
|
-
|
117
|
-
|
118
|
-
@node_stack.last
|
118
|
+
def initialize
|
119
|
+
|
119
120
|
end
|
120
121
|
end
|
121
122
|
|
data/lib/grammar/grammar.rb
CHANGED
@@ -131,17 +131,16 @@ module Dhaka
|
|
131
131
|
end
|
132
132
|
|
133
133
|
def closure(kernel) #:nodoc:
|
134
|
-
channels = Set.new
|
135
|
-
|
134
|
+
channels = Hash.new {|hash, start_item| hash[start_item] = Set.new}
|
136
135
|
result = compute_closure(kernel) do |hash, item|
|
137
136
|
if item.next_symbol and item.next_symbol.non_terminal
|
138
137
|
productions_by_symbol[item.next_symbol].each do |production|
|
139
|
-
|
138
|
+
new_channel = spontaneous_channel(item, hash[Item.new(production, 0)])
|
139
|
+
channels[item] << new_channel
|
140
140
|
end
|
141
141
|
end
|
142
142
|
end
|
143
|
-
|
144
|
-
[channels, result]
|
143
|
+
[result, channels]
|
145
144
|
end
|
146
145
|
|
147
146
|
def passive_channel(start_item, end_item) #:nodoc:
|
data/lib/lexer/dfa.rb
CHANGED
@@ -5,6 +5,22 @@ module Dhaka
|
|
5
5
|
# in a LexerSpecification
|
6
6
|
class InvalidRegexException < StandardError
|
7
7
|
end
|
8
|
+
|
9
|
+
class CheckpointAction
|
10
|
+
attr_reader :pattern
|
11
|
+
def initialize(pattern)
|
12
|
+
@pattern = pattern
|
13
|
+
end
|
14
|
+
|
15
|
+
def call(lexer_run)
|
16
|
+
lexer_run.save_checkpoint(pattern)
|
17
|
+
end
|
18
|
+
|
19
|
+
def compile_to_ruby_source
|
20
|
+
"add_checkpoint(#{pattern.inspect})"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
8
24
|
|
9
25
|
class DFA < StateMachine #:nodoc:
|
10
26
|
def initialize(regex)
|
@@ -12,7 +28,7 @@ module Dhaka
|
|
12
28
|
|
13
29
|
tokenize_result = RegexTokenizer.tokenize(@regex)
|
14
30
|
raise InvalidRegexException.new(tokenize_error_message(tokenize_result)) if tokenize_result.has_error?
|
15
|
-
|
31
|
+
|
16
32
|
parse_result = RegexParser.parse(tokenize_result)
|
17
33
|
raise InvalidRegexException.new(parse_error_message(parse_result)) if parse_result.has_error?
|
18
34
|
|
@@ -44,28 +60,62 @@ module Dhaka
|
|
44
60
|
end
|
45
61
|
|
46
62
|
def new_state_for_key key
|
47
|
-
|
48
|
-
|
63
|
+
accepting = key.detect {|position| position.accepting}
|
64
|
+
if accepting
|
65
|
+
new_state = State.new(self, accepting.action(@regex))
|
66
|
+
else
|
67
|
+
new_state = State.new(self)
|
68
|
+
end
|
69
|
+
if key.any? {|position| position.checkpoint}
|
70
|
+
new_state.checkpoint_actions << CheckpointAction.new(@regex)
|
71
|
+
end
|
72
|
+
new_state
|
49
73
|
end
|
50
74
|
|
51
75
|
def transition_characters key
|
52
76
|
result = Set.new
|
53
77
|
key.each do |node|
|
54
|
-
result << node.character unless node.accepting
|
78
|
+
result << node.character unless (node.accepting || node.checkpoint)
|
55
79
|
end
|
56
80
|
result
|
57
81
|
end
|
58
82
|
|
59
|
-
def
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
83
|
+
def match(input)
|
84
|
+
DFARun.new(self, input).match
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
class DFARun
|
89
|
+
def initialize(dfa, input)
|
90
|
+
@dfa, @input = dfa, input
|
91
|
+
@matched = ""
|
92
|
+
@not_yet_accepted = ""
|
93
|
+
@curr_state = @dfa.start_state
|
94
|
+
end
|
95
|
+
|
96
|
+
def match
|
97
|
+
@input.unpack("C*").each do |i|
|
98
|
+
break unless dest_state = @curr_state.transitions[i.chr]
|
99
|
+
@not_yet_accepted << i.chr
|
100
|
+
@curr_state = dest_state
|
101
|
+
@curr_state.process(self)
|
65
102
|
end
|
66
|
-
|
103
|
+
@matched
|
104
|
+
end
|
105
|
+
|
106
|
+
def save_checkpoint(pattern)
|
107
|
+
@last_saved_checkpoint = @matched + @not_yet_accepted
|
108
|
+
end
|
109
|
+
|
110
|
+
def accept(pattern)
|
111
|
+
@matched.concat @not_yet_accepted
|
112
|
+
@not_yet_accepted = ""
|
113
|
+
end
|
114
|
+
|
115
|
+
def accept_last_saved_checkpoint(pattern)
|
116
|
+
@matched = @last_saved_checkpoint
|
117
|
+
@not_yet_accepted = ""
|
67
118
|
end
|
68
119
|
end
|
69
|
-
|
70
120
|
end
|
71
|
-
end
|
121
|
+
end
|
data/lib/lexer/lexeme.rb
CHANGED
@@ -2,11 +2,10 @@ module Dhaka
|
|
2
2
|
# Represents a portion of the input string that has been recognized as matching a given lexer pattern.
|
3
3
|
class Lexeme
|
4
4
|
# The pattern matched by this lexeme.
|
5
|
-
attr_accessor :pattern
|
5
|
+
attr_accessor :pattern, :characters
|
6
6
|
|
7
7
|
# +input_position+ is the index in the input stream that this lexeme starts at.
|
8
8
|
attr_reader :input_position
|
9
|
-
attr_reader :characters
|
10
9
|
|
11
10
|
def initialize(input_position) #:nodoc:
|
12
11
|
@input_position = input_position
|
@@ -23,11 +22,11 @@ module Dhaka
|
|
23
22
|
end
|
24
23
|
|
25
24
|
def << char #:nodoc:
|
26
|
-
|
25
|
+
characters << char
|
27
26
|
end
|
28
27
|
|
29
28
|
def concat chars #:nodoc:
|
30
|
-
|
29
|
+
characters.concat chars
|
31
30
|
end
|
32
31
|
end
|
33
32
|
end
|
data/lib/lexer/lexer.rb
CHANGED
@@ -41,8 +41,17 @@ module Dhaka
|
|
41
41
|
|
42
42
|
private
|
43
43
|
def new_state_for_key key
|
44
|
-
|
45
|
-
|
44
|
+
accepting_states = key.select {|state| state.accepting?}
|
45
|
+
unless accepting_states.empty?
|
46
|
+
highest_precedence_state = accepting_states.min {|a, b| @specification.items[a.action.pattern] <=> @specification.items[b.action.pattern]}
|
47
|
+
new_state = LexerSupport::State.new(self, highest_precedence_state.action)
|
48
|
+
else
|
49
|
+
new_state = LexerSupport::State.new(self)
|
50
|
+
end
|
51
|
+
key.select {|state| !state.checkpoint_actions.empty?}.each do |state|
|
52
|
+
new_state.checkpoint_actions.concat state.checkpoint_actions
|
53
|
+
end
|
54
|
+
new_state
|
46
55
|
end
|
47
56
|
|
48
57
|
def transition_characters states
|
@@ -58,4 +67,4 @@ module Dhaka
|
|
58
67
|
result
|
59
68
|
end
|
60
69
|
end
|
61
|
-
end
|
70
|
+
end
|
data/lib/lexer/lexer_run.rb
CHANGED
@@ -8,11 +8,12 @@ module Dhaka
|
|
8
8
|
@lexer, @input = lexer, input
|
9
9
|
@input_position = 0
|
10
10
|
@not_yet_accepted_chars = []
|
11
|
+
@last_saved_checkpoints = {}
|
11
12
|
end
|
12
13
|
|
13
14
|
# Constructs a token of type +symbol_name+ from the +current_lexeme+.
|
14
|
-
def create_token(symbol_name)
|
15
|
-
Token.new(symbol_name,
|
15
|
+
def create_token(symbol_name, value = current_lexeme.characters.join)
|
16
|
+
Token.new(symbol_name, value, current_lexeme.input_position)
|
16
17
|
end
|
17
18
|
|
18
19
|
# Yields each token as it is recognized. Returns a TokenizerErrorResult if an error occurs during tokenization.
|
@@ -29,20 +30,31 @@ module Dhaka
|
|
29
30
|
reset_and_rewind
|
30
31
|
else
|
31
32
|
@curr_state = dest_state
|
32
|
-
|
33
|
-
|
34
|
-
@current_lexeme.concat @not_yet_accepted_chars
|
35
|
-
@not_yet_accepted_chars = []
|
36
|
-
@current_lexeme << c
|
37
|
-
else
|
38
|
-
@not_yet_accepted_chars << c
|
39
|
-
end
|
33
|
+
@not_yet_accepted_chars << c
|
34
|
+
@curr_state.process(self)
|
40
35
|
advance
|
41
36
|
end
|
42
37
|
end
|
43
38
|
yield Token.new(END_SYMBOL_NAME, nil, nil)
|
44
39
|
end
|
45
40
|
|
41
|
+
def accept(pattern) #:nodoc:
|
42
|
+
@current_lexeme.pattern = pattern
|
43
|
+
@current_lexeme.concat @not_yet_accepted_chars
|
44
|
+
@not_yet_accepted_chars = []
|
45
|
+
end
|
46
|
+
|
47
|
+
def save_checkpoint(pattern) #:nodoc:
|
48
|
+
@last_saved_checkpoints[pattern] = (@current_lexeme.characters + @not_yet_accepted_chars)
|
49
|
+
end
|
50
|
+
|
51
|
+
def accept_last_saved_checkpoint(pattern) #:nodoc:
|
52
|
+
@current_lexeme.pattern = pattern
|
53
|
+
@current_lexeme.concat @not_yet_accepted_chars
|
54
|
+
@not_yet_accepted_chars = @current_lexeme.characters[(@last_saved_checkpoints[pattern].size)..-1]
|
55
|
+
@current_lexeme.characters = @last_saved_checkpoints[pattern].dup
|
56
|
+
end
|
57
|
+
|
46
58
|
private
|
47
59
|
def reset_and_rewind
|
48
60
|
@input_position -= @not_yet_accepted_chars.size
|
data/lib/lexer/regex_grammar.rb
CHANGED
@@ -4,7 +4,7 @@ module Dhaka
|
|
4
4
|
LOWERCASE_LETTERS = ('a'..'z').to_a
|
5
5
|
UPPERCASE_LETTERS = ('A'..'Z').to_a
|
6
6
|
LETTERS = LOWERCASE_LETTERS + UPPERCASE_LETTERS
|
7
|
-
WHITESPACE = [" ", "\n", "\t"]
|
7
|
+
WHITESPACE = [" ", "\r", "\n", "\t"]
|
8
8
|
SYMBOLS = %w| ~ ` ! @ # % & _ = : ; " ' < , > - |
|
9
9
|
CLASSES = {'d' => DIGITS, 'w' => LETTERS, 's' => WHITESPACE}
|
10
10
|
|
@@ -22,9 +22,10 @@ module Dhaka
|
|
22
22
|
class RegexGrammar < Dhaka::Grammar
|
23
23
|
|
24
24
|
for_symbol(Dhaka::START_SYMBOL_NAME) do
|
25
|
-
regex %w| Disjunction | do RootNode.new(child_nodes[0]) end
|
26
|
-
|
27
|
-
|
25
|
+
regex %w| Disjunction | do RootNode.new(child_nodes[0], AcceptingNode.new) end
|
26
|
+
regex_with_lookahead %w| Disjunction / Disjunction | do RootNode.new(LookaheadNode.new(child_nodes[0], child_nodes[2]), LookaheadAcceptingNode.new) end
|
27
|
+
end
|
28
|
+
|
28
29
|
for_symbol('Disjunction') do
|
29
30
|
disjunction %w| Alternative \| Disjunction | do OrNode.new(child_nodes[0], child_nodes[2]) end
|
30
31
|
alternative %w| Alternative | do child_nodes[0] end
|
@@ -45,7 +46,7 @@ module Dhaka
|
|
45
46
|
for_symbol('Atom') do
|
46
47
|
group %w| ( Disjunction ) | do child_nodes[1] end
|
47
48
|
char %w| Character | do LeafNode.new(child_nodes[0]) end
|
48
|
-
anything %w| . | do OrNode.new(*(ALL_CHARACTERS - ["\n"]).collect {|char| LeafNode.new(char)}) end
|
49
|
+
anything %w| . | do OrNode.new(*(ALL_CHARACTERS - ["\r", "\n"]).collect {|char| LeafNode.new(char)}) end
|
49
50
|
positive_set %w| [ SetContents ] | do OrNode.new(*child_nodes[1].collect{|char| LeafNode.new(char)}) end
|
50
51
|
negative_set %w| [ ^ SetContents ] | do OrNode.new(*(ALL_CHARACTERS - child_nodes[2]).collect {|char| LeafNode.new(char)}) end
|
51
52
|
|
@@ -145,6 +146,10 @@ module Dhaka
|
|
145
146
|
|
146
147
|
|
147
148
|
class ASTNode
|
149
|
+
def checkpoint
|
150
|
+
false
|
151
|
+
end
|
152
|
+
|
148
153
|
def accepting
|
149
154
|
false
|
150
155
|
end
|
@@ -184,15 +189,19 @@ module Dhaka
|
|
184
189
|
end
|
185
190
|
|
186
191
|
def first
|
187
|
-
|
188
|
-
|
192
|
+
result = Set.new
|
193
|
+
children.each do |child|
|
194
|
+
result.merge child.first
|
189
195
|
end
|
196
|
+
result
|
190
197
|
end
|
191
198
|
|
192
199
|
def last
|
193
|
-
|
194
|
-
|
200
|
+
result = Set.new
|
201
|
+
children.each do |child|
|
202
|
+
result.merge child.last
|
195
203
|
end
|
204
|
+
result
|
196
205
|
end
|
197
206
|
|
198
207
|
def to_dot(graph)
|
@@ -234,6 +243,19 @@ module Dhaka
|
|
234
243
|
end
|
235
244
|
end
|
236
245
|
end
|
246
|
+
|
247
|
+
class LookaheadNode < CatNode
|
248
|
+
def label
|
249
|
+
"/"
|
250
|
+
end
|
251
|
+
|
252
|
+
def calculate_follow_sets
|
253
|
+
super
|
254
|
+
left.last.each do |leaf_node|
|
255
|
+
leaf_node.follow_set.merge(Set.new([CheckpointNode.new]))
|
256
|
+
end
|
257
|
+
end
|
258
|
+
end
|
237
259
|
|
238
260
|
class UnaryNode < ASTNode
|
239
261
|
attr_reader :child
|
@@ -265,10 +287,6 @@ module Dhaka
|
|
265
287
|
end
|
266
288
|
|
267
289
|
class RootNode < CatNode
|
268
|
-
def initialize(left)
|
269
|
-
super(left, AcceptingNode.new())
|
270
|
-
end
|
271
|
-
|
272
290
|
def label
|
273
291
|
"start"
|
274
292
|
end
|
@@ -344,6 +362,19 @@ module Dhaka
|
|
344
362
|
def calculate_follow_sets
|
345
363
|
end
|
346
364
|
end
|
365
|
+
|
366
|
+
class CheckpointNode < ASTNode
|
367
|
+
def to_dot(graph)
|
368
|
+
graph.node(self, :label => "lookahead")
|
369
|
+
end
|
370
|
+
|
371
|
+
def character
|
372
|
+
end
|
373
|
+
|
374
|
+
def checkpoint
|
375
|
+
true
|
376
|
+
end
|
377
|
+
end
|
347
378
|
|
348
379
|
class AcceptingNode < ASTNode
|
349
380
|
def accepting
|
@@ -352,6 +383,10 @@ module Dhaka
|
|
352
383
|
|
353
384
|
def character
|
354
385
|
end
|
386
|
+
|
387
|
+
def action(pattern)
|
388
|
+
AcceptAction.new(pattern)
|
389
|
+
end
|
355
390
|
|
356
391
|
def first
|
357
392
|
Set.new([self])
|
@@ -364,5 +399,44 @@ module Dhaka
|
|
364
399
|
graph.node(self, :label => '#')
|
365
400
|
end
|
366
401
|
end
|
402
|
+
|
403
|
+
class LookaheadAcceptingNode < AcceptingNode
|
404
|
+
def action(pattern)
|
405
|
+
LookaheadAcceptAction.new(pattern)
|
406
|
+
end
|
407
|
+
end
|
408
|
+
|
409
|
+
class AcceptAction
|
410
|
+
attr_reader :pattern
|
411
|
+
def initialize(pattern)
|
412
|
+
@pattern = pattern
|
413
|
+
end
|
414
|
+
|
415
|
+
def call(lexer_run)
|
416
|
+
lexer_run.accept(pattern)
|
417
|
+
end
|
418
|
+
|
419
|
+
def compile_to_ruby_source
|
420
|
+
"accept(#{pattern.inspect})"
|
421
|
+
end
|
422
|
+
|
423
|
+
def to_dot
|
424
|
+
"Accept #{pattern.inspect}"
|
425
|
+
end
|
426
|
+
end
|
427
|
+
|
428
|
+
class LookaheadAcceptAction < AcceptAction
|
429
|
+
def call(lexer_run)
|
430
|
+
lexer_run.accept_last_saved_checkpoint(pattern)
|
431
|
+
end
|
432
|
+
|
433
|
+
def compile_to_ruby_source
|
434
|
+
"accept_with_lookahead(#{pattern.inspect})"
|
435
|
+
end
|
436
|
+
|
437
|
+
def to_dot
|
438
|
+
"Accept With Lookahead #{pattern.inspect}"
|
439
|
+
end
|
440
|
+
end
|
367
441
|
end
|
368
|
-
end
|
442
|
+
end
|