dhaka 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. data/Rakefile +64 -0
  2. data/lib/dhaka.rb +12 -0
  3. data/lib/dot/dot.rb +29 -0
  4. data/lib/evaluator/evaluator.rb +35 -26
  5. data/lib/grammar/grammar.rb +42 -17
  6. data/lib/grammar/grammar_symbol.rb +4 -3
  7. data/lib/grammar/production.rb +9 -3
  8. data/lib/lexer/compiled_lexer.rb +46 -0
  9. data/lib/lexer/dfa.rb +71 -0
  10. data/lib/lexer/lexeme.rb +33 -0
  11. data/lib/lexer/lexer.rb +61 -0
  12. data/lib/lexer/lexer_run.rb +66 -0
  13. data/lib/lexer/regex_grammar.rb +368 -0
  14. data/lib/lexer/regex_parser.rb +1888 -0
  15. data/lib/lexer/regex_tokenizer.rb +14 -0
  16. data/lib/lexer/specification.rb +69 -0
  17. data/lib/lexer/state.rb +45 -0
  18. data/lib/lexer/state_machine.rb +37 -0
  19. data/lib/parser/action.rb +3 -3
  20. data/lib/parser/compiled_parser.rb +11 -3
  21. data/lib/parser/parse_result.rb +3 -5
  22. data/lib/parser/parse_tree.rb +6 -17
  23. data/lib/parser/parser.rb +15 -14
  24. data/lib/parser/parser_run.rb +4 -2
  25. data/lib/parser/parser_state.rb +16 -8
  26. data/lib/tokenizer/tokenizer.rb +5 -3
  27. data/test/arithmetic_precedence/arithmetic_precedence_lexer_specification.rb +23 -0
  28. data/test/arithmetic_precedence/arithmetic_precedence_parser_test.rb +4 -2
  29. data/test/chittagong/chittagong_driver.rb +12 -13
  30. data/test/chittagong/chittagong_driver_test.rb +18 -11
  31. data/test/chittagong/chittagong_evaluator.rb +7 -16
  32. data/test/chittagong/chittagong_evaluator_test.rb +7 -4
  33. data/test/chittagong/chittagong_grammar.rb +0 -6
  34. data/test/chittagong/chittagong_lexer.rb +109 -0
  35. data/test/chittagong/chittagong_lexer_specification.rb +39 -0
  36. data/test/chittagong/{chittagong_tokenizer_test.rb → chittagong_lexer_test.rb} +12 -6
  37. data/test/chittagong/chittagong_parser.rb +879 -0
  38. data/test/chittagong/chittagong_parser_test.rb +8 -10
  39. data/test/chittagong/chittagong_test.rb +17 -13
  40. data/test/compiled_parser_test.rb +7 -2
  41. data/test/evaluator_test.rb +0 -1
  42. data/test/grammar_test.rb +19 -1
  43. data/test/lexer_test.rb +215 -0
  44. data/test/parse_result_test.rb +8 -8
  45. data/test/parser_state_test.rb +0 -12
  46. metadata +21 -5
  47. data/test/arithmetic_precedence/arithmetic_precedence_tokenizer.rb +0 -39
  48. data/test/chittagong/chittagong_tokenizer.rb +0 -88
@@ -0,0 +1,71 @@
1
+ module Dhaka
2
+ module LexerSupport
3
+
4
# Error raised by DFA#initialize when a pattern from a lexer specification
# cannot be tokenized or parsed as a regular expression.
class InvalidRegexException < StandardError; end
8
+
9
class DFA < StateMachine #:nodoc:
  # Tokenizes and parses +regex+, computes the follow sets of the resulting
  # syntax tree and hands the tree's first-set to the StateMachine
  # constructor as the start key.
  #
  # Raises InvalidRegexException when tokenizing or parsing reports an error.
  def initialize(regex)
    @regex = regex

    tokens = RegexTokenizer.tokenize(@regex)
    if tokens.has_error?
      raise InvalidRegexException, tokenize_error_message(tokens)
    end

    parsed = RegexParser.parse(tokens)
    if parsed.has_error?
      raise InvalidRegexException, parse_error_message(parsed)
    end

    # The successful parse result is used directly as the syntax tree.
    syntax_tree = parsed
    syntax_tree.calculate_follow_sets

    super(ItemSet.new(syntax_tree.first))
  end

  # Message for a tokenizer failure: names the offending character and
  # echoes the regex with '>>>' inserted in front of it.
  def tokenize_error_message(tokenize_result)
    index = tokenize_result.unexpected_char_index
    "Invalid character #{@regex[index].chr}: #{@regex.dup.insert(index, '>>>')}"
  end

  # Message for a parser failure: either reports a premature end of the
  # regex, or names the unexpected token and marks its position with '>>>'.
  def parse_error_message(parse_result)
    token = parse_result.unexpected_token
    return "Unexpected end of regex." if token.symbol_name == END_SYMBOL_NAME

    "Unexpected token #{token.symbol_name}: #{@regex.dup.insert(token.input_position, '>>>')}"
  end

  # Key of the state reached from +key+ on +char+: the merged follow sets of
  # every position in +key+ whose character equals +char+.
  def dest_key_for key, char
    key.inject(ItemSet.new) do |destination, position|
      destination.merge(position.follow_set) if position.character == char
      destination
    end
  end

  # Creates the state for +key+. The state is given the regex as its second
  # argument when +key+ contains an accepting position, and nil otherwise.
  def new_state_for_key key
    accepting_position = key.detect { |position| position.accepting }
    State.new(self, accepting_position ? @regex : nil)
  end

  # The set of characters carried by the non-accepting positions in +key+.
  def transition_characters key
    Set.new(key.reject { |node| node.accepting }.collect { |node| node.character })
  end

  # Runs +string+ through the automaton byte by byte. Returns false as soon
  # as a byte has no transition; otherwise returns whether the final state
  # is accepting.
  def matches(string)
    state = @start_state
    string.each_byte do |byte|
      state = state.transitions[byte.chr]
      return false unless state
    end
    state.accepting?
  end
end
69
+
70
+ end
71
+ end
@@ -0,0 +1,33 @@
1
+ module Dhaka
2
# A stretch of the input string that has been recognized as matching one of
# the lexer's patterns.
class Lexeme
  # The pattern matched by this lexeme (nil until one has been assigned).
  attr_accessor :pattern

  # +input_position+ is the index in the input stream at which this lexeme
  # starts; +characters+ is the array of characters collected so far.
  attr_reader :input_position, :characters

  def initialize(input_position) #:nodoc:
    @characters = []
    @input_position = input_position
  end

  # The substring of the input stream that this lexeme is comprised of.
  def value
    @characters.join
  end

  # Truthy (the pattern itself) once a pattern has been assigned.
  def accepted? #:nodoc:
    @pattern
  end

  # Appends a single character.
  def <<(char) #:nodoc:
    @characters.push(char)
  end

  # Appends every character in +chars+.
  def concat(chars) #:nodoc:
    @characters.concat(chars)
  end
end
33
+ end
@@ -0,0 +1,61 @@
1
+ module Dhaka
2
# The lexer generator. To generate a lexer from a lexer specification +MyLexerSpecification+:
#   lexer = Dhaka::Lexer.new(MyLexerSpecification)
#
# To compile this lexer as +MyLexer+ to a string of Ruby source:
#   lexer.compile_to_ruby_source_as(:MyLexer)
class Lexer < LexerSupport::StateMachine
  attr_reader :specification

  # Creates a new lexer from a given specification. One DFA is built per
  # pattern in the specification; the start states of those DFAs together
  # form the key of this lexer's own start state.
  def initialize(specification)
    @specification = specification
    start_states = {}
    specification.items.each do |pattern, _item|
      start_states[pattern] = LexerSupport::DFA.new(pattern).start_state
    end
    super(ItemSet.new(start_states.values))
  end

  # Compiles the lexer to Ruby code that when executed, reloads all the states and actions of the lexer
  # into a class named +lexer_class_name+.
  def compile_to_ruby_source_as lexer_class_name
    source = "class #{lexer_class_name} < Dhaka::CompiledLexer\n\n"
    source << " self.specification = #{specification.name}\n\n"
    source << " start_with #{start_state.object_id}\n\n"
    @states.each do |_key, state|
      source << "#{state.compile_to_ruby_source}\n\n"
    end
    source << "end"
  end

  # Returns a LexerRun that tokenizes +input+.
  def lex input
    LexerRun.new(self, input)
  end

  # The action registered in the specification for +pattern+.
  def action_for_pattern pattern #:nodoc:
    item = @specification.items[pattern]
    item.action
  end

  private

  # Builds the state for +key+ (a collection of DFA states). Among the
  # specification items whose DFA state is accepting, the minimum one (by
  # the items' own ordering) supplies the new state's pattern; when none is
  # accepting the pattern is nil.
  def new_state_for_key key
    accepting_states = key.select { |state| state.accepting? }
    candidates = accepting_states.collect { |state| @specification.items[state.pattern] }
    winner = candidates.min
    LexerSupport::State.new(self, winner ? winner.pattern : nil)
  end

  # Every distinct character on which at least one of +states+ has a transition.
  def transition_characters states
    states.inject([]) { |characters, state| characters | state.transitions.keys }
  end

  # Key of the state reached from +states+ on +char+: the destinations of
  # those states that have a transition on +char+.
  def dest_key_for states, char
    states.inject(ItemSet.new) do |destination, state|
      target = state.transitions[char]
      destination << target if target
      destination
    end
  end
end
61
+ end
@@ -0,0 +1,66 @@
1
+ module Dhaka
2
# A single run of a lexer over a given input string. Tokens are produced
# lazily through #each, so every Enumerable method is available as well.
class LexerRun
  include Enumerable

  attr_reader :current_lexeme

  def initialize(lexer, input)
    @lexer = lexer
    @input = input
    @input_position = 0
    @not_yet_accepted_chars = []
  end

  # Constructs a token of type +symbol_name+ from the +current_lexeme+.
  def create_token(symbol_name)
    lexeme = @current_lexeme
    Token.new(symbol_name, lexeme.characters.join, lexeme.input_position)
  end

  # Yields each token as it is recognized, followed by a final token whose
  # symbol name is END_SYMBOL_NAME. Returns a TokenizerErrorResult (carrying
  # the current input position) if the input cannot be tokenized.
  def each
    reset_and_rewind
    loop do
      char = curr_char
      # "\0" is what curr_char reports past the end of the input; stop once
      # nothing is pending and no lexeme is waiting to be emitted.
      finished = char == "\0" && @not_yet_accepted_chars.empty? && !@current_lexeme.accepted?
      break if finished

      next_state = @curr_state.transitions[char]
      if next_state
        @curr_state = next_state
        if next_state.accepting?
          # Everything read since the last accepting state now belongs to
          # the lexeme, together with the current character.
          @current_lexeme.pattern = next_state.pattern
          @current_lexeme.concat(@not_yet_accepted_chars)
          @not_yet_accepted_chars = []
          @current_lexeme << char
        else
          @not_yet_accepted_chars << char
        end
        advance
      else
        # Dead end: either emit the longest lexeme accepted so far, or fail.
        return TokenizerErrorResult.new(@input_position) unless @current_lexeme.accepted?

        token = get_token
        yield token if token
        reset_and_rewind
      end
    end
    yield Token.new(END_SYMBOL_NAME, nil, nil)
  end

  private

  # Steps back over the characters read after the last accepting state and
  # starts a fresh lexeme from the lexer's start state at that position.
  def reset_and_rewind
    @input_position -= @not_yet_accepted_chars.size
    @curr_state = @lexer.start_state
    @current_lexeme = Lexeme.new(@input_position)
    @not_yet_accepted_chars = []
  end

  # The character at the current position, or "\0" past the end of the input.
  def curr_char
    element = @input[@input_position]
    element ? element.chr : 0.chr
  end

  def advance
    @input_position += 1
  end

  # Evaluates the action registered for the current lexeme's pattern in the
  # context of this run, so the action can call #create_token.
  def get_token
    action = @lexer.action_for_pattern(@current_lexeme.pattern)
    instance_eval(&action)
  end
end
66
+ end
@@ -0,0 +1,368 @@
1
+ module Dhaka
2
+ module LexerSupport #:nodoc:all
3
# Character tables shared by the regex grammar. They are lookup tables only,
# so each collection is frozen to guard against accidental in-place
# modification of shared state.
DIGITS = ('0'..'9').to_a.freeze
LOWERCASE_LETTERS = ('a'..'z').to_a.freeze
UPPERCASE_LETTERS = ('A'..'Z').to_a.freeze
LETTERS = (LOWERCASE_LETTERS + UPPERCASE_LETTERS).freeze
WHITESPACE = [" ", "\n", "\t"].freeze
SYMBOLS = %w| ~ ` ! @ # % & _ = : ; " ' < , > - |.freeze

# Expansion of the escaped character classes \d, \w and \s.
CLASSES = {'d' => DIGITS, 'w' => LETTERS, 's' => WHITESPACE}.freeze

# Characters that act as regex operators, mapped to the production name used
# for their backslash-escaped (literal) form.
OPERATOR_CHARACTERS = {'(' => 'open_parenth', ')' => 'close_parenth', '[' => 'open_square_bracket',
                       ']' => 'close_square_bracket', '+' => 'plus', '*' => 'asterisk',
                       '?' => 'question_mark', '.' => 'period', '\\' => 'back_slash',
                       '|' => 'pipe', '{' => 'left_curly_brace', '}' => 'right_curly_brace',
                       '/' => 'forward_slash', '^' => 'caret', '$' => 'dollar'}.freeze

# Characters that must be backslash-escaped inside a [...] set.
SET_OPERATOR_CHARACTERS = %w| - ^ [ ] \\ |.freeze

# Every character the regex grammar knows about.
ALL_CHARACTERS = (DIGITS + LETTERS + SYMBOLS + WHITESPACE + OPERATOR_CHARACTERS.keys).freeze
20
+
21
+
22
# Grammar for the regular-expression syntax accepted in lexer patterns. Each
# production carries a block that builds the corresponding syntax-tree node
# (or plain character / character array) from +child_nodes+.
#
# Several production names embed the byte code of the character they stand
# for. The code is taken with <tt>unpack('C')[0]</tt> rather than
# <tt>string[0]</tt>: the latter returns an Integer on Ruby 1.8 but a
# one-character String on Ruby 1.9 and later, which would silently change
# the generated production names between interpreter versions.
#
# NOTE(review): production names are presumably referenced by the generated
# RegexParser; do not rename any of them without regenerating that parser.
class RegexGrammar < Dhaka::Grammar

  for_symbol(Dhaka::START_SYMBOL_NAME) do
    regex %w| Disjunction | do RootNode.new(child_nodes[0]) end
  end

  for_symbol('Disjunction') do
    disjunction %w| Alternative \| Disjunction | do OrNode.new(child_nodes[0], child_nodes[2]) end
    alternative %w| Alternative | do child_nodes[0] end
  end

  for_symbol('Alternative') do
    concatenation %w| Alternative Term | do CatNode.new(child_nodes[0], child_nodes[1]) end
    term %w| Term | do child_nodes[0] end
  end

  for_symbol('Term') do
    zero_or_more %w| Atom * | do ZeroOrMoreNode.new(child_nodes[0]) end
    one_or_more %w| Atom + | do OneOrMoreNode.new(child_nodes[0]) end
    zero_or_one %w| Atom ? | do ZeroOrOneNode.new(child_nodes[0]) end
    atom %w| Atom | do child_nodes[0] end
  end

  for_symbol('Atom') do
    group %w| ( Disjunction ) | do child_nodes[1] end
    char %w| Character | do LeafNode.new(child_nodes[0]) end
    # '.' matches any known character except a newline.
    anything %w| . | do OrNode.new(*(ALL_CHARACTERS - ["\n"]).collect {|char| LeafNode.new(char)}) end
    positive_set %w| [ SetContents ] | do OrNode.new(*child_nodes[1].collect{|char| LeafNode.new(char)}) end
    negative_set %w| [ ^ SetContents ] | do OrNode.new(*(ALL_CHARACTERS - child_nodes[2]).collect {|char| LeafNode.new(char)}) end

    # Escaped character classes: \d, \w, \s.
    CLASSES.each do |char, expansion|
      send("character_class_#{char}", ['\\', char]) do
        OrNode.new(*CLASSES[char].collect {|c| LeafNode.new(c)})
      end
    end

    # Escaped operator characters stand for themselves.
    OPERATOR_CHARACTERS.each do |char, method_name|
      send(method_name, ['\\', char]) do
        LeafNode.new(char)
      end
    end
  end

  for_symbol('Character') do
    letter_character %w| Letter | do child_nodes[0] end
    digit_character %w| Digit | do child_nodes[0] end
    white_space_character %w| Whitespace | do child_nodes[0] end
    symbol_character %w| Symbol | do child_nodes[0] end
  end

  # Set contents evaluate to a flat array of characters.
  for_symbol('SetContents') do
    single_item %w| SetItem | do child_nodes[0] end
    multiple_items %w| SetContents SetItem | do child_nodes[0].concat child_nodes[1] end
  end

  for_symbol('SetItem') do
    single_char_item %w| SetCharacter | do [child_nodes[0]] end
    lower_case_letter_range %w| LowercaseLetter - LowercaseLetter | do (child_nodes[0]..child_nodes[2]).to_a end
    upper_case_letter_range %w| UppercaseLetter - UppercaseLetter | do (child_nodes[0]..child_nodes[2]).to_a end
    digit_range %w| Digit - Digit | do (child_nodes[0]..child_nodes[2]).to_a end
  end

  for_symbol('Letter') do
    lower_case_letter %w| LowercaseLetter | do child_nodes[0] end
    upper_case_letter %w| UppercaseLetter | do child_nodes[0] end
  end

  for_symbol('LowercaseLetter') do
    LOWERCASE_LETTERS.each do |letter|
      send("lower_char_letter_#{letter}", letter) do
        letter
      end
    end
  end

  for_symbol('UppercaseLetter') do
    UPPERCASE_LETTERS.each do |letter|
      send("upper_case_letter_#{letter}", letter) do
        letter
      end
    end
  end

  for_symbol('Digit') do
    DIGITS.each do |digit|
      send("digit_#{digit}", digit) do
        digit
      end
    end
  end

  for_symbol('Whitespace') do
    WHITESPACE.each do |whitespace_char|
      # Name by byte code (e.g. whitespace_32) on every Ruby version.
      send("whitespace_#{whitespace_char.unpack('C')[0]}", whitespace_char) do
        whitespace_char
      end
    end
  end

  for_symbol('Symbol') do
    SYMBOLS.each do |symbol_char|
      send("symbol_char_#{symbol_char.unpack('C')[0]}", symbol_char) do
        symbol_char
      end
    end
  end

  for_symbol('SetCharacter') do
    # Ordinary characters appear literally inside a set ...
    (ALL_CHARACTERS - SET_OPERATOR_CHARACTERS).each do |char|
      send("set_character_#{char.unpack('C')[0]}", char) do
        char
      end
    end
    # ... while set operators must be preceded by a backslash.
    SET_OPERATOR_CHARACTERS.each do |char|
      send("set_operator_character_#{char.unpack('C')[0]}", ['\\', char]) do
        char
      end
    end
  end
end
145
+
146
+
147
# Base class of the regex syntax-tree nodes.
class ASTNode
  # Nodes are non-accepting unless a subclass overrides this
  # (AcceptingNode does).
  def accepting; false; end
end
152
+
153
# A syntax-tree node with exactly two operands, +left+ and +right+.
# Subclasses supply +label+.
class BinaryNode < ASTNode
  attr_reader :left, :right

  def initialize(left, right)
    @left = left
    @right = right
  end

  # Adds this node and its edges to +graph+, then lets the left and the
  # right operand add themselves.
  def to_dot(graph)
    graph.node(self, :label => label)
    [left, right].each { |operand| graph.edge(self, operand) }
    left.to_dot(graph)
    right.to_dot(graph)
  end

  # Computes follow sets in the left subtree, then in the right subtree.
  def calculate_follow_sets
    left.calculate_follow_sets
    right.calculate_follow_sets
  end
end
172
+
173
# Alternation over any number of child nodes.
class OrNode < ASTNode
  attr_reader :children

  def initialize(*children)
    @children = children
  end

  def label
    "|"
  end

  # Nullable when at least one alternative is nullable.
  def nullable
    children.any? { |alternative| alternative.nullable }
  end

  # Union of the first-sets of all alternatives.
  def first
    union_of { |alternative| alternative.first }
  end

  # Union of the last-sets of all alternatives.
  def last
    union_of { |alternative| alternative.last }
  end

  # Adds this node to +graph+, then for each child an edge followed by the
  # child's own subtree.
  def to_dot(graph)
    graph.node(self, :label => label)
    children.each do |alternative|
      graph.edge(self, alternative)
      alternative.to_dot(graph)
    end
  end

  def calculate_follow_sets
    children.each { |alternative| alternative.calculate_follow_sets }
  end

  private

  # Collects into a fresh Set whatever the block returns for each child.
  def union_of
    children.inject(Set.new) { |union, alternative| union.merge(yield(alternative)) }
  end
end
212
+
213
# Concatenation of +left+ followed by +right+.
class CatNode < BinaryNode
  def label
    "cat"
  end

  # Nullable only when both operands are nullable.
  def nullable
    left.nullable && right.nullable
  end

  # First-set of +left+, extended with that of +right+ when +left+ can
  # match the empty string.
  def first
    return left.first unless left.nullable

    left.first | right.first
  end

  # Last-set of +right+, extended with that of +left+ when +right+ can
  # match the empty string.
  def last
    return right.last unless right.nullable

    left.last | right.last
  end

  # After the operands have computed their own follow sets, every position
  # that can end +left+ may be followed by any position that can start
  # +right+.
  def calculate_follow_sets
    super
    successors = right.first
    left.last.each { |leaf_node| leaf_node.follow_set.merge(successors) }
  end
end
237
+
238
# A syntax-tree node wrapping a single +child+. By default nullability,
# first-set, last-set and follow-set calculation are all delegated to the
# child; subclasses supply +label+.
class UnaryNode < ASTNode
  attr_reader :child

  def initialize(child)
    @child = child
  end

  # Adds this node and the edge to its child to +graph+, then the child's
  # own subtree.
  def to_dot(graph)
    graph.node(self, :label => label)
    graph.edge(self, child)
    child.to_dot(graph)
  end

  def nullable; child.nullable; end

  def first; child.first; end

  def last; child.last; end

  def calculate_follow_sets; child.calculate_follow_sets; end
end
266
+
267
# Root of a regex syntax tree: the expression concatenated with a fresh
# AcceptingNode that marks the end of a successful match.
class RootNode < CatNode
  def initialize(left)
    end_marker = AcceptingNode.new
    super(left, end_marker)
  end

  def label; "start"; end

  def head_node?; true; end
end
280
+
281
# Repetition of the child zero or more times ("*").
class ZeroOrMoreNode < UnaryNode
  def label; "*"; end

  # Always nullable: zero repetitions match the empty string.
  def nullable; true; end

  # Besides the child's own follow sets, every position that can end the
  # child may be followed by any position that can start it again.
  def calculate_follow_sets
    super
    restart_positions = first
    last.each { |leaf_node| leaf_node.follow_set.merge(restart_positions) }
  end
end
297
+
298
# Optional child ("?"): behaves like the child but is always nullable.
class ZeroOrOneNode < UnaryNode
  def label; "?"; end

  def nullable; true; end
end
307
+
308
# Repetition of the child one or more times ("+"). Nullability is inherited
# from UnaryNode, i.e. it is that of the child.
class OneOrMoreNode < UnaryNode
  def label; "+"; end

  # Besides the child's own follow sets, every position that can end the
  # child may be followed by any position that can start it again.
  def calculate_follow_sets
    super
    restart_positions = first
    last.each { |leaf_node| leaf_node.follow_set.merge(restart_positions) }
  end
end
320
+
321
# A position in the regex that matches one specific +character+. Its
# +follow_set+ starts out empty and is filled in by the enclosing nodes'
# calculate_follow_sets.
class LeafNode < ASTNode
  attr_reader :character, :follow_set

  def initialize(character)
    @follow_set = Set.new
    @character = character
  end

  def to_dot(graph)
    graph.node(self, :label => character)
  end

  # A leaf always consumes exactly one character.
  def nullable; false; end

  # A leaf is both the only first and the only last position of itself.
  def first; Set[self]; end

  def last; Set[self]; end

  # Nothing to do: a leaf has no children.
  def calculate_follow_sets; end
end
347
+
348
# End-of-match marker that RootNode appends to the expression. It is the
# only node type whose +accepting+ is true, and it carries no character.
class AcceptingNode < ASTNode
  def accepting; true; end

  # The marker matches no input character.
  def character; nil; end

  def first; Set[self]; end

  # Nothing to do: the marker has no children.
  def calculate_follow_sets; end

  def to_dot(graph)
    graph.node(self, :label => '#')
  end
end
367
+ end
368
+ end