dhaka 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. data/lib/evaluator/evaluator.rb +18 -17
  2. data/lib/grammar/grammar.rb +4 -5
  3. data/lib/lexer/dfa.rb +63 -13
  4. data/lib/lexer/lexeme.rb +3 -4
  5. data/lib/lexer/lexer.rb +12 -3
  6. data/lib/lexer/lexer_run.rb +22 -10
  7. data/lib/lexer/regex_grammar.rb +88 -14
  8. data/lib/lexer/regex_parser.rb +1523 -1401
  9. data/lib/lexer/specification.rb +29 -3
  10. data/lib/lexer/state.rb +32 -9
  11. data/lib/lexer/state_machine.rb +2 -2
  12. data/lib/parser/channel.rb +4 -4
  13. data/lib/parser/parser.rb +17 -12
  14. data/lib/parser/parser_state.rb +3 -1
  15. data/test/chittagong/chittagong_lexer.rb +63 -63
  16. data/test/chittagong/chittagong_lexer.rb.rej +189 -0
  17. data/test/chittagong/chittagong_lexer_specification.rb +6 -8
  18. data/test/chittagong/chittagong_parser.rb +659 -659
  19. data/test/chittagong/chittagong_parser.rb.rej +1623 -0
  20. data/test/{another_lalr_but_not_slr_grammar.rb → core/another_lalr_but_not_slr_grammar.rb} +1 -1
  21. data/test/{compiled_parser_test.rb → core/compiled_parser_test.rb} +1 -1
  22. data/test/core/dfa_test.rb +170 -0
  23. data/test/{evaluator_test.rb → core/evaluator_test.rb} +3 -3
  24. data/test/{grammar_test.rb → core/grammar_test.rb} +3 -3
  25. data/test/{lalr_but_not_slr_grammar.rb → core/lalr_but_not_slr_grammar.rb} +0 -0
  26. data/test/core/lexer_test.rb +139 -0
  27. data/test/{malformed_grammar.rb → core/malformed_grammar.rb} +0 -0
  28. data/test/{malformed_grammar_test.rb → core/malformed_grammar_test.rb} +1 -1
  29. data/test/{nullable_grammar.rb → core/nullable_grammar.rb} +0 -0
  30. data/test/{parse_result_test.rb → core/parse_result_test.rb} +1 -1
  31. data/test/{parser_state_test.rb → core/parser_state_test.rb} +1 -1
  32. data/test/{parser_test.rb → core/parser_test.rb} +2 -2
  33. data/test/{precedence_grammar.rb → core/precedence_grammar.rb} +0 -0
  34. data/test/{precedence_grammar_test.rb → core/precedence_grammar_test.rb} +1 -1
  35. data/test/{rr_conflict_grammar.rb → core/rr_conflict_grammar.rb} +0 -0
  36. data/test/{simple_grammar.rb → core/simple_grammar.rb} +0 -0
  37. data/test/{sr_conflict_grammar.rb → core/sr_conflict_grammar.rb} +0 -0
  38. metadata +25 -22
  39. data/test/lexer_test.rb +0 -215
@@ -55,24 +55,24 @@ module Dhaka
55
55
  #
56
56
  # end
57
57
 
58
- class Evaluator
58
+ class Evaluator < SimpleDelegator
59
59
  class << self
60
- # Defining evaluation rules within a block passed to this method tells the evaluator to carry out a
61
- # rudimentary check of your definitions and define default evaluation rules for pass-through
62
- # productions (i.e. productions with expansions consisting of exactly one grammar symbol). The
63
- # default evaluation rule for such productions is to simply return the result of calling +evaluate+
64
- # on the unique child node. If you neglect to define a rule for a non-pass-through production (one
65
- # where the expansion consists of multiple symbols), the evaluator will raise an exception
66
- # at loading time, listing all the productions that absolutely need to be defined before you can
67
- # continue.
68
- def define_evaluation_rules
60
+ # Define evaluation rules within a block passed to this method. The evaluator will define
61
+ # default evaluation rules for pass-through productions (i.e. productions with expansions
62
+ # consisting of exactly one grammar symbol). The default evaluation rule for such productions
63
+ # is to simply return the result of calling +evaluate+ on the unique child node. Setting the
64
+ # <tt>:raise_error</tt> option to true tells the evaluator to throw an exception if you neglect
65
+ # to define a rule for a non-pass-through production (one where the expansion consists of
66
+ # multiple symbols), listing all the productions that absolutely need to be defined before you
67
+ # can continue.
68
+ def define_evaluation_rules(options = {})
69
69
  yield
70
- check_definitions
70
+ check_definitions(options)
71
71
  end
72
72
 
73
73
  private
74
74
 
75
- def check_definitions
75
+ def check_definitions(options)
76
76
  filter = lambda {|productions| productions.map {|production| production.name} - actions}
77
77
  pass_through_productions_without_rules = filter[grammar.productions.select {|production| production.expansion.size == 1}]
78
78
  pass_through_productions_without_rules.each do |rule_name|
@@ -81,7 +81,7 @@ module Dhaka
81
81
  end
82
82
  end
83
83
  non_trivial_productions_with_rules_undefined = filter[grammar.productions.select {|production| production.expansion.size != 1}]
84
- raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty?
84
+ raise EvaluatorDefinitionError.new(non_trivial_productions_with_rules_undefined) unless non_trivial_productions_with_rules_undefined.empty? || !options[:raise_error]
85
85
  end
86
86
 
87
87
  def inherited(evaluator)
@@ -107,15 +107,16 @@ module Dhaka
107
107
  # Evaluate a parse tree node.
108
108
  def evaluate node
109
109
  @node_stack ||= []
110
- @node_stack << node.child_nodes
110
+ @node_stack << node
111
+ __setobj__(@node_stack.last)
111
112
  result = send(node.production.name)
112
113
  @node_stack.pop
114
+ __setobj__(@node_stack.last)
113
115
  result
114
116
  end
115
117
 
116
- # Returns the array of child nodes of the node being evaluated currently.
117
- def child_nodes
118
- @node_stack.last
118
+ def initialize
119
+
119
120
  end
120
121
  end
121
122
 
@@ -131,17 +131,16 @@ module Dhaka
131
131
  end
132
132
 
133
133
  def closure(kernel) #:nodoc:
134
- channels = Set.new
135
-
134
+ channels = Hash.new {|hash, start_item| hash[start_item] = Set.new}
136
135
  result = compute_closure(kernel) do |hash, item|
137
136
  if item.next_symbol and item.next_symbol.non_terminal
138
137
  productions_by_symbol[item.next_symbol].each do |production|
139
- channels << spontaneous_channel(item, hash[Item.new(production, 0)])
138
+ new_channel = spontaneous_channel(item, hash[Item.new(production, 0)])
139
+ channels[item] << new_channel
140
140
  end
141
141
  end
142
142
  end
143
-
144
- [channels, result]
143
+ [result, channels]
145
144
  end
146
145
 
147
146
  def passive_channel(start_item, end_item) #:nodoc:
data/lib/lexer/dfa.rb CHANGED
@@ -5,6 +5,22 @@ module Dhaka
5
5
  # in a LexerSpecification
6
6
  class InvalidRegexException < StandardError
7
7
  end
8
+
9
+ class CheckpointAction
10
+ attr_reader :pattern
11
+ def initialize(pattern)
12
+ @pattern = pattern
13
+ end
14
+
15
+ def call(lexer_run)
16
+ lexer_run.save_checkpoint(pattern)
17
+ end
18
+
19
+ def compile_to_ruby_source
20
+ "add_checkpoint(#{pattern.inspect})"
21
+ end
22
+ end
23
+
8
24
 
9
25
  class DFA < StateMachine #:nodoc:
10
26
  def initialize(regex)
@@ -12,7 +28,7 @@ module Dhaka
12
28
 
13
29
  tokenize_result = RegexTokenizer.tokenize(@regex)
14
30
  raise InvalidRegexException.new(tokenize_error_message(tokenize_result)) if tokenize_result.has_error?
15
-
31
+
16
32
  parse_result = RegexParser.parse(tokenize_result)
17
33
  raise InvalidRegexException.new(parse_error_message(parse_result)) if parse_result.has_error?
18
34
 
@@ -44,28 +60,62 @@ module Dhaka
44
60
  end
45
61
 
46
62
  def new_state_for_key key
47
- accepting = key.detect {|position| position.accepting}
48
- State.new(self, accepting && @regex)
63
+ accepting = key.detect {|position| position.accepting}
64
+ if accepting
65
+ new_state = State.new(self, accepting.action(@regex))
66
+ else
67
+ new_state = State.new(self)
68
+ end
69
+ if key.any? {|position| position.checkpoint}
70
+ new_state.checkpoint_actions << CheckpointAction.new(@regex)
71
+ end
72
+ new_state
49
73
  end
50
74
 
51
75
  def transition_characters key
52
76
  result = Set.new
53
77
  key.each do |node|
54
- result << node.character unless node.accepting
78
+ result << node.character unless (node.accepting || node.checkpoint)
55
79
  end
56
80
  result
57
81
  end
58
82
 
59
- def matches(string)
60
- curr_state = @start_state
61
- string.unpack("C*").each do |i|
62
- dest_state = curr_state.transitions[i.chr]
63
- return false unless dest_state
64
- curr_state = dest_state
83
+ def match(input)
84
+ DFARun.new(self, input).match
85
+ end
86
+ end
87
+
88
+ class DFARun
89
+ def initialize(dfa, input)
90
+ @dfa, @input = dfa, input
91
+ @matched = ""
92
+ @not_yet_accepted = ""
93
+ @curr_state = @dfa.start_state
94
+ end
95
+
96
+ def match
97
+ @input.unpack("C*").each do |i|
98
+ break unless dest_state = @curr_state.transitions[i.chr]
99
+ @not_yet_accepted << i.chr
100
+ @curr_state = dest_state
101
+ @curr_state.process(self)
65
102
  end
66
- return curr_state.accepting?
103
+ @matched
104
+ end
105
+
106
+ def save_checkpoint(pattern)
107
+ @last_saved_checkpoint = @matched + @not_yet_accepted
108
+ end
109
+
110
+ def accept(pattern)
111
+ @matched.concat @not_yet_accepted
112
+ @not_yet_accepted = ""
113
+ end
114
+
115
+ def accept_last_saved_checkpoint(pattern)
116
+ @matched = @last_saved_checkpoint
117
+ @not_yet_accepted = ""
67
118
  end
68
119
  end
69
-
70
120
  end
71
- end
121
+ end
data/lib/lexer/lexeme.rb CHANGED
@@ -2,11 +2,10 @@ module Dhaka
2
2
  # Represents a portion of the input string that has been recognized as matching a given lexer pattern.
3
3
  class Lexeme
4
4
  # The pattern matched by this lexeme.
5
- attr_accessor :pattern
5
+ attr_accessor :pattern, :characters
6
6
 
7
7
  # +input_position+ is the index in the input stream that this lexeme starts at.
8
8
  attr_reader :input_position
9
- attr_reader :characters
10
9
 
11
10
  def initialize(input_position) #:nodoc:
12
11
  @input_position = input_position
@@ -23,11 +22,11 @@ module Dhaka
23
22
  end
24
23
 
25
24
  def << char #:nodoc:
26
- @characters << char
25
+ characters << char
27
26
  end
28
27
 
29
28
  def concat chars #:nodoc:
30
- @characters.concat chars
29
+ characters.concat chars
31
30
  end
32
31
  end
33
32
  end
data/lib/lexer/lexer.rb CHANGED
@@ -41,8 +41,17 @@ module Dhaka
41
41
 
42
42
  private
43
43
  def new_state_for_key key
44
- item = key.select {|state| state.accepting?}.collect {|state| @specification.items[state.pattern]}.min
45
- LexerSupport::State.new(self, item && item.pattern)
44
+ accepting_states = key.select {|state| state.accepting?}
45
+ unless accepting_states.empty?
46
+ highest_precedence_state = accepting_states.min {|a, b| @specification.items[a.action.pattern] <=> @specification.items[b.action.pattern]}
47
+ new_state = LexerSupport::State.new(self, highest_precedence_state.action)
48
+ else
49
+ new_state = LexerSupport::State.new(self)
50
+ end
51
+ key.select {|state| !state.checkpoint_actions.empty?}.each do |state|
52
+ new_state.checkpoint_actions.concat state.checkpoint_actions
53
+ end
54
+ new_state
46
55
  end
47
56
 
48
57
  def transition_characters states
@@ -58,4 +67,4 @@ module Dhaka
58
67
  result
59
68
  end
60
69
  end
61
- end
70
+ end
@@ -8,11 +8,12 @@ module Dhaka
8
8
  @lexer, @input = lexer, input
9
9
  @input_position = 0
10
10
  @not_yet_accepted_chars = []
11
+ @last_saved_checkpoints = {}
11
12
  end
12
13
 
13
14
  # Constructs a token of type +symbol_name+ from the +current_lexeme+.
14
- def create_token(symbol_name)
15
- Token.new(symbol_name, @current_lexeme.characters.join, @current_lexeme.input_position)
15
+ def create_token(symbol_name, value = current_lexeme.characters.join)
16
+ Token.new(symbol_name, value, current_lexeme.input_position)
16
17
  end
17
18
 
18
19
  # Yields each token as it is recognized. Returns a TokenizerErrorResult if an error occurs during tokenization.
@@ -29,20 +30,31 @@ module Dhaka
29
30
  reset_and_rewind
30
31
  else
31
32
  @curr_state = dest_state
32
- if @curr_state.accepting?
33
- @current_lexeme.pattern = @curr_state.pattern
34
- @current_lexeme.concat @not_yet_accepted_chars
35
- @not_yet_accepted_chars = []
36
- @current_lexeme << c
37
- else
38
- @not_yet_accepted_chars << c
39
- end
33
+ @not_yet_accepted_chars << c
34
+ @curr_state.process(self)
40
35
  advance
41
36
  end
42
37
  end
43
38
  yield Token.new(END_SYMBOL_NAME, nil, nil)
44
39
  end
45
40
 
41
+ def accept(pattern) #:nodoc:
42
+ @current_lexeme.pattern = pattern
43
+ @current_lexeme.concat @not_yet_accepted_chars
44
+ @not_yet_accepted_chars = []
45
+ end
46
+
47
+ def save_checkpoint(pattern) #:nodoc:
48
+ @last_saved_checkpoints[pattern] = (@current_lexeme.characters + @not_yet_accepted_chars)
49
+ end
50
+
51
+ def accept_last_saved_checkpoint(pattern) #:nodoc:
52
+ @current_lexeme.pattern = pattern
53
+ @current_lexeme.concat @not_yet_accepted_chars
54
+ @not_yet_accepted_chars = @current_lexeme.characters[(@last_saved_checkpoints[pattern].size)..-1]
55
+ @current_lexeme.characters = @last_saved_checkpoints[pattern].dup
56
+ end
57
+
46
58
  private
47
59
  def reset_and_rewind
48
60
  @input_position -= @not_yet_accepted_chars.size
@@ -4,7 +4,7 @@ module Dhaka
4
4
  LOWERCASE_LETTERS = ('a'..'z').to_a
5
5
  UPPERCASE_LETTERS = ('A'..'Z').to_a
6
6
  LETTERS = LOWERCASE_LETTERS + UPPERCASE_LETTERS
7
- WHITESPACE = [" ", "\n", "\t"]
7
+ WHITESPACE = [" ", "\r", "\n", "\t"]
8
8
  SYMBOLS = %w| ~ ` ! @ # % & _ = : ; " ' < , > - |
9
9
  CLASSES = {'d' => DIGITS, 'w' => LETTERS, 's' => WHITESPACE}
10
10
 
@@ -22,9 +22,10 @@ module Dhaka
22
22
  class RegexGrammar < Dhaka::Grammar
23
23
 
24
24
  for_symbol(Dhaka::START_SYMBOL_NAME) do
25
- regex %w| Disjunction | do RootNode.new(child_nodes[0]) end
26
- end
27
-
25
+ regex %w| Disjunction | do RootNode.new(child_nodes[0], AcceptingNode.new) end
26
+ regex_with_lookahead %w| Disjunction / Disjunction | do RootNode.new(LookaheadNode.new(child_nodes[0], child_nodes[2]), LookaheadAcceptingNode.new) end
27
+ end
28
+
28
29
  for_symbol('Disjunction') do
29
30
  disjunction %w| Alternative \| Disjunction | do OrNode.new(child_nodes[0], child_nodes[2]) end
30
31
  alternative %w| Alternative | do child_nodes[0] end
@@ -45,7 +46,7 @@ module Dhaka
45
46
  for_symbol('Atom') do
46
47
  group %w| ( Disjunction ) | do child_nodes[1] end
47
48
  char %w| Character | do LeafNode.new(child_nodes[0]) end
48
- anything %w| . | do OrNode.new(*(ALL_CHARACTERS - ["\n"]).collect {|char| LeafNode.new(char)}) end
49
+ anything %w| . | do OrNode.new(*(ALL_CHARACTERS - ["\r", "\n"]).collect {|char| LeafNode.new(char)}) end
49
50
  positive_set %w| [ SetContents ] | do OrNode.new(*child_nodes[1].collect{|char| LeafNode.new(char)}) end
50
51
  negative_set %w| [ ^ SetContents ] | do OrNode.new(*(ALL_CHARACTERS - child_nodes[2]).collect {|char| LeafNode.new(char)}) end
51
52
 
@@ -145,6 +146,10 @@ module Dhaka
145
146
 
146
147
 
147
148
  class ASTNode
149
+ def checkpoint
150
+ false
151
+ end
152
+
148
153
  def accepting
149
154
  false
150
155
  end
@@ -184,15 +189,19 @@ module Dhaka
184
189
  end
185
190
 
186
191
  def first
187
- children.inject(Set.new([])) do |result, child|
188
- result | child.first
192
+ result = Set.new
193
+ children.each do |child|
194
+ result.merge child.first
189
195
  end
196
+ result
190
197
  end
191
198
 
192
199
  def last
193
- children.inject(Set.new([])) do |result, child|
194
- result | child.last
200
+ result = Set.new
201
+ children.each do |child|
202
+ result.merge child.last
195
203
  end
204
+ result
196
205
  end
197
206
 
198
207
  def to_dot(graph)
@@ -234,6 +243,19 @@ module Dhaka
234
243
  end
235
244
  end
236
245
  end
246
+
247
+ class LookaheadNode < CatNode
248
+ def label
249
+ "/"
250
+ end
251
+
252
+ def calculate_follow_sets
253
+ super
254
+ left.last.each do |leaf_node|
255
+ leaf_node.follow_set.merge(Set.new([CheckpointNode.new]))
256
+ end
257
+ end
258
+ end
237
259
 
238
260
  class UnaryNode < ASTNode
239
261
  attr_reader :child
@@ -265,10 +287,6 @@ module Dhaka
265
287
  end
266
288
 
267
289
  class RootNode < CatNode
268
- def initialize(left)
269
- super(left, AcceptingNode.new())
270
- end
271
-
272
290
  def label
273
291
  "start"
274
292
  end
@@ -344,6 +362,19 @@ module Dhaka
344
362
  def calculate_follow_sets
345
363
  end
346
364
  end
365
+
366
+ class CheckpointNode < ASTNode
367
+ def to_dot(graph)
368
+ graph.node(self, :label => "lookahead")
369
+ end
370
+
371
+ def character
372
+ end
373
+
374
+ def checkpoint
375
+ true
376
+ end
377
+ end
347
378
 
348
379
  class AcceptingNode < ASTNode
349
380
  def accepting
@@ -352,6 +383,10 @@ module Dhaka
352
383
 
353
384
  def character
354
385
  end
386
+
387
+ def action(pattern)
388
+ AcceptAction.new(pattern)
389
+ end
355
390
 
356
391
  def first
357
392
  Set.new([self])
@@ -364,5 +399,44 @@ module Dhaka
364
399
  graph.node(self, :label => '#')
365
400
  end
366
401
  end
402
+
403
+ class LookaheadAcceptingNode < AcceptingNode
404
+ def action(pattern)
405
+ LookaheadAcceptAction.new(pattern)
406
+ end
407
+ end
408
+
409
+ class AcceptAction
410
+ attr_reader :pattern
411
+ def initialize(pattern)
412
+ @pattern = pattern
413
+ end
414
+
415
+ def call(lexer_run)
416
+ lexer_run.accept(pattern)
417
+ end
418
+
419
+ def compile_to_ruby_source
420
+ "accept(#{pattern.inspect})"
421
+ end
422
+
423
+ def to_dot
424
+ "Accept #{pattern.inspect}"
425
+ end
426
+ end
427
+
428
+ class LookaheadAcceptAction < AcceptAction
429
+ def call(lexer_run)
430
+ lexer_run.accept_last_saved_checkpoint(pattern)
431
+ end
432
+
433
+ def compile_to_ruby_source
434
+ "accept_with_lookahead(#{pattern.inspect})"
435
+ end
436
+
437
+ def to_dot
438
+ "Accept With Lookahead #{pattern.inspect}"
439
+ end
440
+ end
367
441
  end
368
- end
442
+ end