dhaka 2.1.0 → 2.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. data/lib/evaluator/evaluator.rb +18 -17
  2. data/lib/grammar/grammar.rb +4 -5
  3. data/lib/lexer/dfa.rb +63 -13
  4. data/lib/lexer/lexeme.rb +3 -4
  5. data/lib/lexer/lexer.rb +12 -3
  6. data/lib/lexer/lexer_run.rb +22 -10
  7. data/lib/lexer/regex_grammar.rb +88 -14
  8. data/lib/lexer/regex_parser.rb +1523 -1401
  9. data/lib/lexer/specification.rb +29 -3
  10. data/lib/lexer/state.rb +32 -9
  11. data/lib/lexer/state_machine.rb +2 -2
  12. data/lib/parser/channel.rb +4 -4
  13. data/lib/parser/parser.rb +17 -12
  14. data/lib/parser/parser_state.rb +3 -1
  15. data/test/chittagong/chittagong_lexer.rb +63 -63
  16. data/test/chittagong/chittagong_lexer.rb.rej +189 -0
  17. data/test/chittagong/chittagong_lexer_specification.rb +6 -8
  18. data/test/chittagong/chittagong_parser.rb +659 -659
  19. data/test/chittagong/chittagong_parser.rb.rej +1623 -0
  20. data/test/{another_lalr_but_not_slr_grammar.rb → core/another_lalr_but_not_slr_grammar.rb} +1 -1
  21. data/test/{compiled_parser_test.rb → core/compiled_parser_test.rb} +1 -1
  22. data/test/core/dfa_test.rb +170 -0
  23. data/test/{evaluator_test.rb → core/evaluator_test.rb} +3 -3
  24. data/test/{grammar_test.rb → core/grammar_test.rb} +3 -3
  25. data/test/{lalr_but_not_slr_grammar.rb → core/lalr_but_not_slr_grammar.rb} +0 -0
  26. data/test/core/lexer_test.rb +139 -0
  27. data/test/{malformed_grammar.rb → core/malformed_grammar.rb} +0 -0
  28. data/test/{malformed_grammar_test.rb → core/malformed_grammar_test.rb} +1 -1
  29. data/test/{nullable_grammar.rb → core/nullable_grammar.rb} +0 -0
  30. data/test/{parse_result_test.rb → core/parse_result_test.rb} +1 -1
  31. data/test/{parser_state_test.rb → core/parser_state_test.rb} +1 -1
  32. data/test/{parser_test.rb → core/parser_test.rb} +2 -2
  33. data/test/{precedence_grammar.rb → core/precedence_grammar.rb} +0 -0
  34. data/test/{precedence_grammar_test.rb → core/precedence_grammar_test.rb} +1 -1
  35. data/test/{rr_conflict_grammar.rb → core/rr_conflict_grammar.rb} +0 -0
  36. data/test/{simple_grammar.rb → core/simple_grammar.rb} +0 -0
  37. data/test/{sr_conflict_grammar.rb → core/sr_conflict_grammar.rb} +0 -0
  38. metadata +25 -22
  39. data/test/lexer_test.rb +0 -215
data/lib/evaluator/evaluator.rb CHANGED
@@ -55,24 +55,24 @@ module Dhaka
55
55
  #
56
56
  # end
57
57
 
58
- class Evaluator
58
+ class Evaluator < SimpleDelegator
59
59
  class << self
60
- # Defining evaluation rules within a block passed to this method tells the evaluator to carry out a
61
- # rudimentary check of your definitions and define default evaluation rules for pass-through
62
- # productions (i.e. productions with expansions consisting of exactly one grammar symbol). The
63
- # default evaluation rule for such productions is to simply return the result of calling +evaluate+
64
- # on the unique child node. If you neglect to define a rule for a non-pass-through production (one
65
- # where the expansion consists of multiple symbols), the evaluator will raise an exception
66
- # at loading time, listing all the productions that absolutely need to be defined before you can
67
- # continue.
68
- def define_evaluation_rules
60
+ # Define evaluation rules within a block passed to this method. The evaluator will define
61
+ # default evaluation rules for pass-through productions (i.e. productions with expansions
62
+ # consisting of exactly one grammar symbol). The default evaluation rule for such productions
63
+ # is to simply return the result of calling +evaluate+ on the unique child node. Setting the
64
+ # <tt>:raise_error</tt> option to true tells the evaluator to throw an exception if you neglect
65
+ # to define a rule for a non-pass-through production (one where the expansion consists of
66
+ # multiple symbols), listing all the productions that absolutely need to be defined before you
67
+ # can continue.
68
+ def define_evaluation_rules(options = {})
69
69
  yield
70
- check_definitions
70
+ check_definitions(options)
71
71
  end
72
72
 
73
73
  private
74
74
 
75
- def check_definitions
75
+ def check_definitions(options)
76
76
  filter = lambda {|productions| productions.map {|production| production.name} - actions}
77
77
  pass_through_productions_without_rules = filter[grammar.productions.select {|production| production.expansion.size == 1}]
78
78
  pass_through_productions_without_rules.each do |rule_name|
@@ -81,7 +81,7 @@ module Dhaka
81
81
  end
82
82
  end
83
83
  non_trivial_productions_with_rules_undefined = filter[grammar.productions.select {|production| production.expansion.size != 1}]
84
- raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty?
84
+ raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty? || !options[:raise_error]
85
85
  end
86
86
 
87
87
  def inherited(evaluator)
@@ -107,15 +107,16 @@ module Dhaka
107
107
  # Evaluate a parse tree node.
108
108
  def evaluate node
109
109
  @node_stack ||= []
110
- @node_stack << node.child_nodes
110
+ @node_stack << node
111
+ __setobj__(@node_stack.last)
111
112
  result = send(node.production.name)
112
113
  @node_stack.pop
114
+ __setobj__(@node_stack.last)
113
115
  result
114
116
  end
115
117
 
116
- # Returns the array of child nodes of the node being evaluated currently.
117
- def child_nodes
118
- @node_stack.last
118
+ def initialize
119
+
119
120
  end
120
121
  end
121
122
 
data/lib/grammar/grammar.rb CHANGED
@@ -131,17 +131,16 @@ module Dhaka
131
131
  end
132
132
 
133
133
  def closure(kernel) #:nodoc:
134
- channels = Set.new
135
-
134
+ channels = Hash.new {|hash, start_item| hash[start_item] = Set.new}
136
135
  result = compute_closure(kernel) do |hash, item|
137
136
  if item.next_symbol and item.next_symbol.non_terminal
138
137
  productions_by_symbol[item.next_symbol].each do |production|
139
- channels << spontaneous_channel(item, hash[Item.new(production, 0)])
138
+ new_channel = spontaneous_channel(item, hash[Item.new(production, 0)])
139
+ channels[item] << new_channel
140
140
  end
141
141
  end
142
142
  end
143
-
144
- [channels, result]
143
+ [result, channels]
145
144
  end
146
145
 
147
146
  def passive_channel(start_item, end_item) #:nodoc:
data/lib/lexer/dfa.rb CHANGED
@@ -5,6 +5,22 @@ module Dhaka
5
5
  # in a LexerSpecification
6
6
  class InvalidRegexException < StandardError
7
7
  end
8
+
9
+ class CheckpointAction
10
+ attr_reader :pattern
11
+ def initialize(pattern)
12
+ @pattern = pattern
13
+ end
14
+
15
+ def call(lexer_run)
16
+ lexer_run.save_checkpoint(pattern)
17
+ end
18
+
19
+ def compile_to_ruby_source
20
+ "add_checkpoint(#{pattern.inspect})"
21
+ end
22
+ end
23
+
8
24
 
9
25
  class DFA < StateMachine #:nodoc:
10
26
  def initialize(regex)
@@ -12,7 +28,7 @@ module Dhaka
12
28
 
13
29
  tokenize_result = RegexTokenizer.tokenize(@regex)
14
30
  raise InvalidRegexException.new(tokenize_error_message(tokenize_result)) if tokenize_result.has_error?
15
-
31
+
16
32
  parse_result = RegexParser.parse(tokenize_result)
17
33
  raise InvalidRegexException.new(parse_error_message(parse_result)) if parse_result.has_error?
18
34
 
@@ -44,28 +60,62 @@ module Dhaka
44
60
  end
45
61
 
46
62
  def new_state_for_key key
47
- accepting = key.detect {|position| position.accepting}
48
- State.new(self, accepting && @regex)
63
+ accepting = key.detect {|position| position.accepting}
64
+ if accepting
65
+ new_state = State.new(self, accepting.action(@regex))
66
+ else
67
+ new_state = State.new(self)
68
+ end
69
+ if key.any? {|position| position.checkpoint}
70
+ new_state.checkpoint_actions << CheckpointAction.new(@regex)
71
+ end
72
+ new_state
49
73
  end
50
74
 
51
75
  def transition_characters key
52
76
  result = Set.new
53
77
  key.each do |node|
54
- result << node.character unless node.accepting
78
+ result << node.character unless (node.accepting || node.checkpoint)
55
79
  end
56
80
  result
57
81
  end
58
82
 
59
- def matches(string)
60
- curr_state = @start_state
61
- string.unpack("C*").each do |i|
62
- dest_state = curr_state.transitions[i.chr]
63
- return false unless dest_state
64
- curr_state = dest_state
83
+ def match(input)
84
+ DFARun.new(self, input).match
85
+ end
86
+ end
87
+
88
+ class DFARun
89
+ def initialize(dfa, input)
90
+ @dfa, @input = dfa, input
91
+ @matched = ""
92
+ @not_yet_accepted = ""
93
+ @curr_state = @dfa.start_state
94
+ end
95
+
96
+ def match
97
+ @input.unpack("C*").each do |i|
98
+ break unless dest_state = @curr_state.transitions[i.chr]
99
+ @not_yet_accepted << i.chr
100
+ @curr_state = dest_state
101
+ @curr_state.process(self)
65
102
  end
66
- return curr_state.accepting?
103
+ @matched
104
+ end
105
+
106
+ def save_checkpoint(pattern)
107
+ @last_saved_checkpoint = @matched + @not_yet_accepted
108
+ end
109
+
110
+ def accept(pattern)
111
+ @matched.concat @not_yet_accepted
112
+ @not_yet_accepted = ""
113
+ end
114
+
115
+ def accept_last_saved_checkpoint(pattern)
116
+ @matched = @last_saved_checkpoint
117
+ @not_yet_accepted = ""
67
118
  end
68
119
  end
69
-
70
120
  end
71
- end
121
+ end
data/lib/lexer/lexeme.rb CHANGED
@@ -2,11 +2,10 @@ module Dhaka
2
2
  # Represents a portion of the input string that has been recognized as matching a given lexer pattern.
3
3
  class Lexeme
4
4
  # The pattern matched by this lexeme.
5
- attr_accessor :pattern
5
+ attr_accessor :pattern, :characters
6
6
 
7
7
  # +input_position+ is the index in the input stream that this lexeme starts at.
8
8
  attr_reader :input_position
9
- attr_reader :characters
10
9
 
11
10
  def initialize(input_position) #:nodoc:
12
11
  @input_position = input_position
@@ -23,11 +22,11 @@ module Dhaka
23
22
  end
24
23
 
25
24
  def << char #:nodoc:
26
- @characters << char
25
+ characters << char
27
26
  end
28
27
 
29
28
  def concat chars #:nodoc:
30
- @characters.concat chars
29
+ characters.concat chars
31
30
  end
32
31
  end
33
32
  end
data/lib/lexer/lexer.rb CHANGED
@@ -41,8 +41,17 @@ module Dhaka
41
41
 
42
42
  private
43
43
  def new_state_for_key key
44
- item = key.select {|state| state.accepting?}.collect {|state| @specification.items[state.pattern]}.min
45
- LexerSupport::State.new(self, item && item.pattern)
44
+ accepting_states = key.select {|state| state.accepting?}
45
+ unless accepting_states.empty?
46
+ highest_precedence_state = accepting_states.min {|a, b| @specification.items[a.action.pattern] <=> @specification.items[b.action.pattern]}
47
+ new_state = LexerSupport::State.new(self, highest_precedence_state.action)
48
+ else
49
+ new_state = LexerSupport::State.new(self)
50
+ end
51
+ key.select {|state| !state.checkpoint_actions.empty?}.each do |state|
52
+ new_state.checkpoint_actions.concat state.checkpoint_actions
53
+ end
54
+ new_state
46
55
  end
47
56
 
48
57
  def transition_characters states
@@ -58,4 +67,4 @@ module Dhaka
58
67
  result
59
68
  end
60
69
  end
61
- end
70
+ end
data/lib/lexer/lexer_run.rb CHANGED
@@ -8,11 +8,12 @@ module Dhaka
8
8
  @lexer, @input = lexer, input
9
9
  @input_position = 0
10
10
  @not_yet_accepted_chars = []
11
+ @last_saved_checkpoints = {}
11
12
  end
12
13
 
13
14
  # Constructs a token of type +symbol_name+ from the +current_lexeme+.
14
- def create_token(symbol_name)
15
- Token.new(symbol_name, @current_lexeme.characters.join, @current_lexeme.input_position)
15
+ def create_token(symbol_name, value = current_lexeme.characters.join)
16
+ Token.new(symbol_name, value, current_lexeme.input_position)
16
17
  end
17
18
 
18
19
  # Yields each token as it is recognized. Returns a TokenizerErrorResult if an error occurs during tokenization.
@@ -29,20 +30,31 @@ module Dhaka
29
30
  reset_and_rewind
30
31
  else
31
32
  @curr_state = dest_state
32
- if @curr_state.accepting?
33
- @current_lexeme.pattern = @curr_state.pattern
34
- @current_lexeme.concat @not_yet_accepted_chars
35
- @not_yet_accepted_chars = []
36
- @current_lexeme << c
37
- else
38
- @not_yet_accepted_chars << c
39
- end
33
+ @not_yet_accepted_chars << c
34
+ @curr_state.process(self)
40
35
  advance
41
36
  end
42
37
  end
43
38
  yield Token.new(END_SYMBOL_NAME, nil, nil)
44
39
  end
45
40
 
41
+ def accept(pattern) #:nodoc:
42
+ @current_lexeme.pattern = pattern
43
+ @current_lexeme.concat @not_yet_accepted_chars
44
+ @not_yet_accepted_chars = []
45
+ end
46
+
47
+ def save_checkpoint(pattern) #:nodoc:
48
+ @last_saved_checkpoints[pattern] = (@current_lexeme.characters + @not_yet_accepted_chars)
49
+ end
50
+
51
+ def accept_last_saved_checkpoint(pattern) #:nodoc:
52
+ @current_lexeme.pattern = pattern
53
+ @current_lexeme.concat @not_yet_accepted_chars
54
+ @not_yet_accepted_chars = @current_lexeme.characters[(@last_saved_checkpoints[pattern].size)..-1]
55
+ @current_lexeme.characters = @last_saved_checkpoints[pattern].dup
56
+ end
57
+
46
58
  private
47
59
  def reset_and_rewind
48
60
  @input_position -= @not_yet_accepted_chars.size
data/lib/lexer/regex_grammar.rb CHANGED
@@ -4,7 +4,7 @@ module Dhaka
4
4
  LOWERCASE_LETTERS = ('a'..'z').to_a
5
5
  UPPERCASE_LETTERS = ('A'..'Z').to_a
6
6
  LETTERS = LOWERCASE_LETTERS + UPPERCASE_LETTERS
7
- WHITESPACE = [" ", "\n", "\t"]
7
+ WHITESPACE = [" ", "\r", "\n", "\t"]
8
8
  SYMBOLS = %w| ~ ` ! @ # % & _ = : ; " ' < , > - |
9
9
  CLASSES = {'d' => DIGITS, 'w' => LETTERS, 's' => WHITESPACE}
10
10
 
@@ -22,9 +22,10 @@ module Dhaka
22
22
  class RegexGrammar < Dhaka::Grammar
23
23
 
24
24
  for_symbol(Dhaka::START_SYMBOL_NAME) do
25
- regex %w| Disjunction | do RootNode.new(child_nodes[0]) end
26
- end
27
-
25
+ regex %w| Disjunction | do RootNode.new(child_nodes[0], AcceptingNode.new) end
26
+ regex_with_lookahead %w| Disjunction / Disjunction | do RootNode.new(LookaheadNode.new(child_nodes[0], child_nodes[2]), LookaheadAcceptingNode.new) end
27
+ end
28
+
28
29
  for_symbol('Disjunction') do
29
30
  disjunction %w| Alternative \| Disjunction | do OrNode.new(child_nodes[0], child_nodes[2]) end
30
31
  alternative %w| Alternative | do child_nodes[0] end
@@ -45,7 +46,7 @@ module Dhaka
45
46
  for_symbol('Atom') do
46
47
  group %w| ( Disjunction ) | do child_nodes[1] end
47
48
  char %w| Character | do LeafNode.new(child_nodes[0]) end
48
- anything %w| . | do OrNode.new(*(ALL_CHARACTERS - ["\n"]).collect {|char| LeafNode.new(char)}) end
49
+ anything %w| . | do OrNode.new(*(ALL_CHARACTERS - ["\r", "\n"]).collect {|char| LeafNode.new(char)}) end
49
50
  positive_set %w| [ SetContents ] | do OrNode.new(*child_nodes[1].collect{|char| LeafNode.new(char)}) end
50
51
  negative_set %w| [ ^ SetContents ] | do OrNode.new(*(ALL_CHARACTERS - child_nodes[2]).collect {|char| LeafNode.new(char)}) end
51
52
 
@@ -145,6 +146,10 @@ module Dhaka
145
146
 
146
147
 
147
148
  class ASTNode
149
+ def checkpoint
150
+ false
151
+ end
152
+
148
153
  def accepting
149
154
  false
150
155
  end
@@ -184,15 +189,19 @@ module Dhaka
184
189
  end
185
190
 
186
191
  def first
187
- children.inject(Set.new([])) do |result, child|
188
- result | child.first
192
+ result = Set.new
193
+ children.each do |child|
194
+ result.merge child.first
189
195
  end
196
+ result
190
197
  end
191
198
 
192
199
  def last
193
- children.inject(Set.new([])) do |result, child|
194
- result | child.last
200
+ result = Set.new
201
+ children.each do |child|
202
+ result.merge child.last
195
203
  end
204
+ result
196
205
  end
197
206
 
198
207
  def to_dot(graph)
@@ -234,6 +243,19 @@ module Dhaka
234
243
  end
235
244
  end
236
245
  end
246
+
247
+ class LookaheadNode < CatNode
248
+ def label
249
+ "/"
250
+ end
251
+
252
+ def calculate_follow_sets
253
+ super
254
+ left.last.each do |leaf_node|
255
+ leaf_node.follow_set.merge(Set.new([CheckpointNode.new]))
256
+ end
257
+ end
258
+ end
237
259
 
238
260
  class UnaryNode < ASTNode
239
261
  attr_reader :child
@@ -265,10 +287,6 @@ module Dhaka
265
287
  end
266
288
 
267
289
  class RootNode < CatNode
268
- def initialize(left)
269
- super(left, AcceptingNode.new())
270
- end
271
-
272
290
  def label
273
291
  "start"
274
292
  end
@@ -344,6 +362,19 @@ module Dhaka
344
362
  def calculate_follow_sets
345
363
  end
346
364
  end
365
+
366
+ class CheckpointNode < ASTNode
367
+ def to_dot(graph)
368
+ graph.node(self, :label => "lookahead")
369
+ end
370
+
371
+ def character
372
+ end
373
+
374
+ def checkpoint
375
+ true
376
+ end
377
+ end
347
378
 
348
379
  class AcceptingNode < ASTNode
349
380
  def accepting
@@ -352,6 +383,10 @@ module Dhaka
352
383
 
353
384
  def character
354
385
  end
386
+
387
+ def action(pattern)
388
+ AcceptAction.new(pattern)
389
+ end
355
390
 
356
391
  def first
357
392
  Set.new([self])
@@ -364,5 +399,44 @@ module Dhaka
364
399
  graph.node(self, :label => '#')
365
400
  end
366
401
  end
402
+
403
+ class LookaheadAcceptingNode < AcceptingNode
404
+ def action(pattern)
405
+ LookaheadAcceptAction.new(pattern)
406
+ end
407
+ end
408
+
409
+ class AcceptAction
410
+ attr_reader :pattern
411
+ def initialize(pattern)
412
+ @pattern = pattern
413
+ end
414
+
415
+ def call(lexer_run)
416
+ lexer_run.accept(pattern)
417
+ end
418
+
419
+ def compile_to_ruby_source
420
+ "accept(#{pattern.inspect})"
421
+ end
422
+
423
+ def to_dot
424
+ "Accept #{pattern.inspect}"
425
+ end
426
+ end
427
+
428
+ class LookaheadAcceptAction < AcceptAction
429
+ def call(lexer_run)
430
+ lexer_run.accept_last_saved_checkpoint(pattern)
431
+ end
432
+
433
+ def compile_to_ruby_source
434
+ "accept_with_lookahead(#{pattern.inspect})"
435
+ end
436
+
437
+ def to_dot
438
+ "Accept With Lookahead #{pattern.inspect}"
439
+ end
440
+ end
367
441
  end
368
- end
442
+ end