dhaka 2.1.0 → 2.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/evaluator/evaluator.rb +18 -17
- data/lib/grammar/grammar.rb +4 -5
- data/lib/lexer/dfa.rb +63 -13
- data/lib/lexer/lexeme.rb +3 -4
- data/lib/lexer/lexer.rb +12 -3
- data/lib/lexer/lexer_run.rb +22 -10
- data/lib/lexer/regex_grammar.rb +88 -14
- data/lib/lexer/regex_parser.rb +1523 -1401
- data/lib/lexer/specification.rb +29 -3
- data/lib/lexer/state.rb +32 -9
- data/lib/lexer/state_machine.rb +2 -2
- data/lib/parser/channel.rb +4 -4
- data/lib/parser/parser.rb +17 -12
- data/lib/parser/parser_state.rb +3 -1
- data/test/chittagong/chittagong_lexer.rb +63 -63
- data/test/chittagong/chittagong_lexer.rb.rej +189 -0
- data/test/chittagong/chittagong_lexer_specification.rb +6 -8
- data/test/chittagong/chittagong_parser.rb +659 -659
- data/test/chittagong/chittagong_parser.rb.rej +1623 -0
- data/test/{another_lalr_but_not_slr_grammar.rb → core/another_lalr_but_not_slr_grammar.rb} +1 -1
- data/test/{compiled_parser_test.rb → core/compiled_parser_test.rb} +1 -1
- data/test/core/dfa_test.rb +170 -0
- data/test/{evaluator_test.rb → core/evaluator_test.rb} +3 -3
- data/test/{grammar_test.rb → core/grammar_test.rb} +3 -3
- data/test/{lalr_but_not_slr_grammar.rb → core/lalr_but_not_slr_grammar.rb} +0 -0
- data/test/core/lexer_test.rb +139 -0
- data/test/{malformed_grammar.rb → core/malformed_grammar.rb} +0 -0
- data/test/{malformed_grammar_test.rb → core/malformed_grammar_test.rb} +1 -1
- data/test/{nullable_grammar.rb → core/nullable_grammar.rb} +0 -0
- data/test/{parse_result_test.rb → core/parse_result_test.rb} +1 -1
- data/test/{parser_state_test.rb → core/parser_state_test.rb} +1 -1
- data/test/{parser_test.rb → core/parser_test.rb} +2 -2
- data/test/{precedence_grammar.rb → core/precedence_grammar.rb} +0 -0
- data/test/{precedence_grammar_test.rb → core/precedence_grammar_test.rb} +1 -1
- data/test/{rr_conflict_grammar.rb → core/rr_conflict_grammar.rb} +0 -0
- data/test/{simple_grammar.rb → core/simple_grammar.rb} +0 -0
- data/test/{sr_conflict_grammar.rb → core/sr_conflict_grammar.rb} +0 -0
- metadata +25 -22
- data/test/lexer_test.rb +0 -215
data/lib/evaluator/evaluator.rb
CHANGED
@@ -55,24 +55,24 @@ module Dhaka
|
|
55
55
|
#
|
56
56
|
# end
|
57
57
|
|
58
|
-
class Evaluator
|
58
|
+
class Evaluator < SimpleDelegator
|
59
59
|
class << self
|
60
|
-
#
|
61
|
-
#
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
65
|
-
#
|
66
|
-
#
|
67
|
-
# continue.
|
68
|
-
def define_evaluation_rules
|
60
|
+
# Define evaluation rules within a block passed to this method. The evaluator will define
|
61
|
+
# default evaluation rules for pass-through productions (i.e. productions with expansions
|
62
|
+
# consisting of exactly one grammar symbol). The default evaluation rule for such productions
|
63
|
+
# is to simply return the result of calling +evaluate+ on the unique child node. Setting the
|
64
|
+
# <tt>:raise_error</tt> option to true tells the evaluator to raise an exception if you neglect
|
65
|
+
# to define a rule for a non-pass-through production (one where the expansion consists of
|
66
|
+
# multiple symbols), listing all the productions that absolutely need to be defined before you
|
67
|
+
# can continue.
|
68
|
+
def define_evaluation_rules(options = {})
|
69
69
|
yield
|
70
|
-
check_definitions
|
70
|
+
check_definitions(options)
|
71
71
|
end
|
72
72
|
|
73
73
|
private
|
74
74
|
|
75
|
-
def check_definitions
|
75
|
+
def check_definitions(options)
|
76
76
|
filter = lambda {|productions| productions.map {|production| production.name} - actions}
|
77
77
|
pass_through_productions_without_rules = filter[grammar.productions.select {|production| production.expansion.size == 1}]
|
78
78
|
pass_through_productions_without_rules.each do |rule_name|
|
@@ -81,7 +81,7 @@ module Dhaka
|
|
81
81
|
end
|
82
82
|
end
|
83
83
|
non_trivial_productions_with_rules_undefined = filter[grammar.productions.select {|production| production.expansion.size != 1}]
|
84
|
-
raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty?
|
84
|
+
raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty? || !options[:raise_error]
|
85
85
|
end
|
86
86
|
|
87
87
|
def inherited(evaluator)
|
@@ -107,15 +107,16 @@ module Dhaka
|
|
107
107
|
# Evaluate a parse tree node.
|
108
108
|
def evaluate node
|
109
109
|
@node_stack ||= []
|
110
|
-
@node_stack << node
|
110
|
+
@node_stack << node
|
111
|
+
__setobj__(@node_stack.last)
|
111
112
|
result = send(node.production.name)
|
112
113
|
@node_stack.pop
|
114
|
+
__setobj__(@node_stack.last)
|
113
115
|
result
|
114
116
|
end
|
115
117
|
|
116
|
-
|
117
|
-
|
118
|
-
@node_stack.last
|
118
|
+
def initialize
|
119
|
+
|
119
120
|
end
|
120
121
|
end
|
121
122
|
|
data/lib/grammar/grammar.rb
CHANGED
@@ -131,17 +131,16 @@ module Dhaka
|
|
131
131
|
end
|
132
132
|
|
133
133
|
def closure(kernel) #:nodoc:
|
134
|
-
channels = Set.new
|
135
|
-
|
134
|
+
channels = Hash.new {|hash, start_item| hash[start_item] = Set.new}
|
136
135
|
result = compute_closure(kernel) do |hash, item|
|
137
136
|
if item.next_symbol and item.next_symbol.non_terminal
|
138
137
|
productions_by_symbol[item.next_symbol].each do |production|
|
139
|
-
|
138
|
+
new_channel = spontaneous_channel(item, hash[Item.new(production, 0)])
|
139
|
+
channels[item] << new_channel
|
140
140
|
end
|
141
141
|
end
|
142
142
|
end
|
143
|
-
|
144
|
-
[channels, result]
|
143
|
+
[result, channels]
|
145
144
|
end
|
146
145
|
|
147
146
|
def passive_channel(start_item, end_item) #:nodoc:
|
data/lib/lexer/dfa.rb
CHANGED
@@ -5,6 +5,22 @@ module Dhaka
|
|
5
5
|
# in a LexerSpecification
|
6
6
|
class InvalidRegexException < StandardError
|
7
7
|
end
|
8
|
+
|
9
|
+
class CheckpointAction
|
10
|
+
attr_reader :pattern
|
11
|
+
def initialize(pattern)
|
12
|
+
@pattern = pattern
|
13
|
+
end
|
14
|
+
|
15
|
+
def call(lexer_run)
|
16
|
+
lexer_run.save_checkpoint(pattern)
|
17
|
+
end
|
18
|
+
|
19
|
+
def compile_to_ruby_source
|
20
|
+
"add_checkpoint(#{pattern.inspect})"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
8
24
|
|
9
25
|
class DFA < StateMachine #:nodoc:
|
10
26
|
def initialize(regex)
|
@@ -12,7 +28,7 @@ module Dhaka
|
|
12
28
|
|
13
29
|
tokenize_result = RegexTokenizer.tokenize(@regex)
|
14
30
|
raise InvalidRegexException.new(tokenize_error_message(tokenize_result)) if tokenize_result.has_error?
|
15
|
-
|
31
|
+
|
16
32
|
parse_result = RegexParser.parse(tokenize_result)
|
17
33
|
raise InvalidRegexException.new(parse_error_message(parse_result)) if parse_result.has_error?
|
18
34
|
|
@@ -44,28 +60,62 @@ module Dhaka
|
|
44
60
|
end
|
45
61
|
|
46
62
|
def new_state_for_key key
|
47
|
-
|
48
|
-
|
63
|
+
accepting = key.detect {|position| position.accepting}
|
64
|
+
if accepting
|
65
|
+
new_state = State.new(self, accepting.action(@regex))
|
66
|
+
else
|
67
|
+
new_state = State.new(self)
|
68
|
+
end
|
69
|
+
if key.any? {|position| position.checkpoint}
|
70
|
+
new_state.checkpoint_actions << CheckpointAction.new(@regex)
|
71
|
+
end
|
72
|
+
new_state
|
49
73
|
end
|
50
74
|
|
51
75
|
def transition_characters key
|
52
76
|
result = Set.new
|
53
77
|
key.each do |node|
|
54
|
-
result << node.character unless node.accepting
|
78
|
+
result << node.character unless (node.accepting || node.checkpoint)
|
55
79
|
end
|
56
80
|
result
|
57
81
|
end
|
58
82
|
|
59
|
-
def
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
83
|
+
def match(input)
|
84
|
+
DFARun.new(self, input).match
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
class DFARun
|
89
|
+
def initialize(dfa, input)
|
90
|
+
@dfa, @input = dfa, input
|
91
|
+
@matched = ""
|
92
|
+
@not_yet_accepted = ""
|
93
|
+
@curr_state = @dfa.start_state
|
94
|
+
end
|
95
|
+
|
96
|
+
def match
|
97
|
+
@input.unpack("C*").each do |i|
|
98
|
+
break unless dest_state = @curr_state.transitions[i.chr]
|
99
|
+
@not_yet_accepted << i.chr
|
100
|
+
@curr_state = dest_state
|
101
|
+
@curr_state.process(self)
|
65
102
|
end
|
66
|
-
|
103
|
+
@matched
|
104
|
+
end
|
105
|
+
|
106
|
+
def save_checkpoint(pattern)
|
107
|
+
@last_saved_checkpoint = @matched + @not_yet_accepted
|
108
|
+
end
|
109
|
+
|
110
|
+
def accept(pattern)
|
111
|
+
@matched.concat @not_yet_accepted
|
112
|
+
@not_yet_accepted = ""
|
113
|
+
end
|
114
|
+
|
115
|
+
def accept_last_saved_checkpoint(pattern)
|
116
|
+
@matched = @last_saved_checkpoint
|
117
|
+
@not_yet_accepted = ""
|
67
118
|
end
|
68
119
|
end
|
69
|
-
|
70
120
|
end
|
71
|
-
end
|
121
|
+
end
|
data/lib/lexer/lexeme.rb
CHANGED
@@ -2,11 +2,10 @@ module Dhaka
|
|
2
2
|
# Represents a portion of the input string that has been recognized as matching a given lexer pattern.
|
3
3
|
class Lexeme
|
4
4
|
# The pattern matched by this lexeme.
|
5
|
-
attr_accessor :pattern
|
5
|
+
attr_accessor :pattern, :characters
|
6
6
|
|
7
7
|
# +input_position+ is the index in the input stream that this lexeme starts at.
|
8
8
|
attr_reader :input_position
|
9
|
-
attr_reader :characters
|
10
9
|
|
11
10
|
def initialize(input_position) #:nodoc:
|
12
11
|
@input_position = input_position
|
@@ -23,11 +22,11 @@ module Dhaka
|
|
23
22
|
end
|
24
23
|
|
25
24
|
def << char #:nodoc:
|
26
|
-
|
25
|
+
characters << char
|
27
26
|
end
|
28
27
|
|
29
28
|
def concat chars #:nodoc:
|
30
|
-
|
29
|
+
characters.concat chars
|
31
30
|
end
|
32
31
|
end
|
33
32
|
end
|
data/lib/lexer/lexer.rb
CHANGED
@@ -41,8 +41,17 @@ module Dhaka
|
|
41
41
|
|
42
42
|
private
|
43
43
|
def new_state_for_key key
|
44
|
-
|
45
|
-
|
44
|
+
accepting_states = key.select {|state| state.accepting?}
|
45
|
+
unless accepting_states.empty?
|
46
|
+
highest_precedence_state = accepting_states.min {|a, b| @specification.items[a.action.pattern] <=> @specification.items[b.action.pattern]}
|
47
|
+
new_state = LexerSupport::State.new(self, highest_precedence_state.action)
|
48
|
+
else
|
49
|
+
new_state = LexerSupport::State.new(self)
|
50
|
+
end
|
51
|
+
key.select {|state| !state.checkpoint_actions.empty?}.each do |state|
|
52
|
+
new_state.checkpoint_actions.concat state.checkpoint_actions
|
53
|
+
end
|
54
|
+
new_state
|
46
55
|
end
|
47
56
|
|
48
57
|
def transition_characters states
|
@@ -58,4 +67,4 @@ module Dhaka
|
|
58
67
|
result
|
59
68
|
end
|
60
69
|
end
|
61
|
-
end
|
70
|
+
end
|
data/lib/lexer/lexer_run.rb
CHANGED
@@ -8,11 +8,12 @@ module Dhaka
|
|
8
8
|
@lexer, @input = lexer, input
|
9
9
|
@input_position = 0
|
10
10
|
@not_yet_accepted_chars = []
|
11
|
+
@last_saved_checkpoints = {}
|
11
12
|
end
|
12
13
|
|
13
14
|
# Constructs a token of type +symbol_name+ from the +current_lexeme+.
|
14
|
-
def create_token(symbol_name)
|
15
|
-
Token.new(symbol_name,
|
15
|
+
def create_token(symbol_name, value = current_lexeme.characters.join)
|
16
|
+
Token.new(symbol_name, value, current_lexeme.input_position)
|
16
17
|
end
|
17
18
|
|
18
19
|
# Yields each token as it is recognized. Returns a TokenizerErrorResult if an error occurs during tokenization.
|
@@ -29,20 +30,31 @@ module Dhaka
|
|
29
30
|
reset_and_rewind
|
30
31
|
else
|
31
32
|
@curr_state = dest_state
|
32
|
-
|
33
|
-
|
34
|
-
@current_lexeme.concat @not_yet_accepted_chars
|
35
|
-
@not_yet_accepted_chars = []
|
36
|
-
@current_lexeme << c
|
37
|
-
else
|
38
|
-
@not_yet_accepted_chars << c
|
39
|
-
end
|
33
|
+
@not_yet_accepted_chars << c
|
34
|
+
@curr_state.process(self)
|
40
35
|
advance
|
41
36
|
end
|
42
37
|
end
|
43
38
|
yield Token.new(END_SYMBOL_NAME, nil, nil)
|
44
39
|
end
|
45
40
|
|
41
|
+
def accept(pattern) #:nodoc:
|
42
|
+
@current_lexeme.pattern = pattern
|
43
|
+
@current_lexeme.concat @not_yet_accepted_chars
|
44
|
+
@not_yet_accepted_chars = []
|
45
|
+
end
|
46
|
+
|
47
|
+
def save_checkpoint(pattern) #:nodoc:
|
48
|
+
@last_saved_checkpoints[pattern] = (@current_lexeme.characters + @not_yet_accepted_chars)
|
49
|
+
end
|
50
|
+
|
51
|
+
def accept_last_saved_checkpoint(pattern) #:nodoc:
|
52
|
+
@current_lexeme.pattern = pattern
|
53
|
+
@current_lexeme.concat @not_yet_accepted_chars
|
54
|
+
@not_yet_accepted_chars = @current_lexeme.characters[(@last_saved_checkpoints[pattern].size)..-1]
|
55
|
+
@current_lexeme.characters = @last_saved_checkpoints[pattern].dup
|
56
|
+
end
|
57
|
+
|
46
58
|
private
|
47
59
|
def reset_and_rewind
|
48
60
|
@input_position -= @not_yet_accepted_chars.size
|
data/lib/lexer/regex_grammar.rb
CHANGED
@@ -4,7 +4,7 @@ module Dhaka
|
|
4
4
|
LOWERCASE_LETTERS = ('a'..'z').to_a
|
5
5
|
UPPERCASE_LETTERS = ('A'..'Z').to_a
|
6
6
|
LETTERS = LOWERCASE_LETTERS + UPPERCASE_LETTERS
|
7
|
-
WHITESPACE = [" ", "\n", "\t"]
|
7
|
+
WHITESPACE = [" ", "\r", "\n", "\t"]
|
8
8
|
SYMBOLS = %w| ~ ` ! @ # % & _ = : ; " ' < , > - |
|
9
9
|
CLASSES = {'d' => DIGITS, 'w' => LETTERS, 's' => WHITESPACE}
|
10
10
|
|
@@ -22,9 +22,10 @@ module Dhaka
|
|
22
22
|
class RegexGrammar < Dhaka::Grammar
|
23
23
|
|
24
24
|
for_symbol(Dhaka::START_SYMBOL_NAME) do
|
25
|
-
regex %w| Disjunction | do RootNode.new(child_nodes[0]) end
|
26
|
-
|
27
|
-
|
25
|
+
regex %w| Disjunction | do RootNode.new(child_nodes[0], AcceptingNode.new) end
|
26
|
+
regex_with_lookahead %w| Disjunction / Disjunction | do RootNode.new(LookaheadNode.new(child_nodes[0], child_nodes[2]), LookaheadAcceptingNode.new) end
|
27
|
+
end
|
28
|
+
|
28
29
|
for_symbol('Disjunction') do
|
29
30
|
disjunction %w| Alternative \| Disjunction | do OrNode.new(child_nodes[0], child_nodes[2]) end
|
30
31
|
alternative %w| Alternative | do child_nodes[0] end
|
@@ -45,7 +46,7 @@ module Dhaka
|
|
45
46
|
for_symbol('Atom') do
|
46
47
|
group %w| ( Disjunction ) | do child_nodes[1] end
|
47
48
|
char %w| Character | do LeafNode.new(child_nodes[0]) end
|
48
|
-
anything %w| . | do OrNode.new(*(ALL_CHARACTERS - ["\n"]).collect {|char| LeafNode.new(char)}) end
|
49
|
+
anything %w| . | do OrNode.new(*(ALL_CHARACTERS - ["\r", "\n"]).collect {|char| LeafNode.new(char)}) end
|
49
50
|
positive_set %w| [ SetContents ] | do OrNode.new(*child_nodes[1].collect{|char| LeafNode.new(char)}) end
|
50
51
|
negative_set %w| [ ^ SetContents ] | do OrNode.new(*(ALL_CHARACTERS - child_nodes[2]).collect {|char| LeafNode.new(char)}) end
|
51
52
|
|
@@ -145,6 +146,10 @@ module Dhaka
|
|
145
146
|
|
146
147
|
|
147
148
|
class ASTNode
|
149
|
+
def checkpoint
|
150
|
+
false
|
151
|
+
end
|
152
|
+
|
148
153
|
def accepting
|
149
154
|
false
|
150
155
|
end
|
@@ -184,15 +189,19 @@ module Dhaka
|
|
184
189
|
end
|
185
190
|
|
186
191
|
def first
|
187
|
-
|
188
|
-
|
192
|
+
result = Set.new
|
193
|
+
children.each do |child|
|
194
|
+
result.merge child.first
|
189
195
|
end
|
196
|
+
result
|
190
197
|
end
|
191
198
|
|
192
199
|
def last
|
193
|
-
|
194
|
-
|
200
|
+
result = Set.new
|
201
|
+
children.each do |child|
|
202
|
+
result.merge child.last
|
195
203
|
end
|
204
|
+
result
|
196
205
|
end
|
197
206
|
|
198
207
|
def to_dot(graph)
|
@@ -234,6 +243,19 @@ module Dhaka
|
|
234
243
|
end
|
235
244
|
end
|
236
245
|
end
|
246
|
+
|
247
|
+
class LookaheadNode < CatNode
|
248
|
+
def label
|
249
|
+
"/"
|
250
|
+
end
|
251
|
+
|
252
|
+
def calculate_follow_sets
|
253
|
+
super
|
254
|
+
left.last.each do |leaf_node|
|
255
|
+
leaf_node.follow_set.merge(Set.new([CheckpointNode.new]))
|
256
|
+
end
|
257
|
+
end
|
258
|
+
end
|
237
259
|
|
238
260
|
class UnaryNode < ASTNode
|
239
261
|
attr_reader :child
|
@@ -265,10 +287,6 @@ module Dhaka
|
|
265
287
|
end
|
266
288
|
|
267
289
|
class RootNode < CatNode
|
268
|
-
def initialize(left)
|
269
|
-
super(left, AcceptingNode.new())
|
270
|
-
end
|
271
|
-
|
272
290
|
def label
|
273
291
|
"start"
|
274
292
|
end
|
@@ -344,6 +362,19 @@ module Dhaka
|
|
344
362
|
def calculate_follow_sets
|
345
363
|
end
|
346
364
|
end
|
365
|
+
|
366
|
+
class CheckpointNode < ASTNode
|
367
|
+
def to_dot(graph)
|
368
|
+
graph.node(self, :label => "lookahead")
|
369
|
+
end
|
370
|
+
|
371
|
+
def character
|
372
|
+
end
|
373
|
+
|
374
|
+
def checkpoint
|
375
|
+
true
|
376
|
+
end
|
377
|
+
end
|
347
378
|
|
348
379
|
class AcceptingNode < ASTNode
|
349
380
|
def accepting
|
@@ -352,6 +383,10 @@ module Dhaka
|
|
352
383
|
|
353
384
|
def character
|
354
385
|
end
|
386
|
+
|
387
|
+
def action(pattern)
|
388
|
+
AcceptAction.new(pattern)
|
389
|
+
end
|
355
390
|
|
356
391
|
def first
|
357
392
|
Set.new([self])
|
@@ -364,5 +399,44 @@ module Dhaka
|
|
364
399
|
graph.node(self, :label => '#')
|
365
400
|
end
|
366
401
|
end
|
402
|
+
|
403
|
+
class LookaheadAcceptingNode < AcceptingNode
|
404
|
+
def action(pattern)
|
405
|
+
LookaheadAcceptAction.new(pattern)
|
406
|
+
end
|
407
|
+
end
|
408
|
+
|
409
|
+
class AcceptAction
|
410
|
+
attr_reader :pattern
|
411
|
+
def initialize(pattern)
|
412
|
+
@pattern = pattern
|
413
|
+
end
|
414
|
+
|
415
|
+
def call(lexer_run)
|
416
|
+
lexer_run.accept(pattern)
|
417
|
+
end
|
418
|
+
|
419
|
+
def compile_to_ruby_source
|
420
|
+
"accept(#{pattern.inspect})"
|
421
|
+
end
|
422
|
+
|
423
|
+
def to_dot
|
424
|
+
"Accept #{pattern.inspect}"
|
425
|
+
end
|
426
|
+
end
|
427
|
+
|
428
|
+
class LookaheadAcceptAction < AcceptAction
|
429
|
+
def call(lexer_run)
|
430
|
+
lexer_run.accept_last_saved_checkpoint(pattern)
|
431
|
+
end
|
432
|
+
|
433
|
+
def compile_to_ruby_source
|
434
|
+
"accept_with_lookahead(#{pattern.inspect})"
|
435
|
+
end
|
436
|
+
|
437
|
+
def to_dot
|
438
|
+
"Accept With Lookahead #{pattern.inspect}"
|
439
|
+
end
|
440
|
+
end
|
367
441
|
end
|
368
|
-
end
|
442
|
+
end
|