dhaka 2.0.1 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. data/Rakefile +64 -0
  2. data/lib/dhaka.rb +12 -0
  3. data/lib/dot/dot.rb +29 -0
  4. data/lib/evaluator/evaluator.rb +35 -26
  5. data/lib/grammar/grammar.rb +42 -17
  6. data/lib/grammar/grammar_symbol.rb +4 -3
  7. data/lib/grammar/production.rb +9 -3
  8. data/lib/lexer/compiled_lexer.rb +46 -0
  9. data/lib/lexer/dfa.rb +71 -0
  10. data/lib/lexer/lexeme.rb +33 -0
  11. data/lib/lexer/lexer.rb +61 -0
  12. data/lib/lexer/lexer_run.rb +66 -0
  13. data/lib/lexer/regex_grammar.rb +368 -0
  14. data/lib/lexer/regex_parser.rb +1888 -0
  15. data/lib/lexer/regex_tokenizer.rb +14 -0
  16. data/lib/lexer/specification.rb +69 -0
  17. data/lib/lexer/state.rb +45 -0
  18. data/lib/lexer/state_machine.rb +37 -0
  19. data/lib/parser/action.rb +3 -3
  20. data/lib/parser/compiled_parser.rb +11 -3
  21. data/lib/parser/parse_result.rb +3 -5
  22. data/lib/parser/parse_tree.rb +6 -17
  23. data/lib/parser/parser.rb +15 -14
  24. data/lib/parser/parser_run.rb +4 -2
  25. data/lib/parser/parser_state.rb +16 -8
  26. data/lib/tokenizer/tokenizer.rb +5 -3
  27. data/test/arithmetic_precedence/arithmetic_precedence_lexer_specification.rb +23 -0
  28. data/test/arithmetic_precedence/arithmetic_precedence_parser_test.rb +4 -2
  29. data/test/chittagong/chittagong_driver.rb +12 -13
  30. data/test/chittagong/chittagong_driver_test.rb +18 -11
  31. data/test/chittagong/chittagong_evaluator.rb +7 -16
  32. data/test/chittagong/chittagong_evaluator_test.rb +7 -4
  33. data/test/chittagong/chittagong_grammar.rb +0 -6
  34. data/test/chittagong/chittagong_lexer.rb +109 -0
  35. data/test/chittagong/chittagong_lexer_specification.rb +39 -0
  36. data/test/chittagong/{chittagong_tokenizer_test.rb → chittagong_lexer_test.rb} +12 -6
  37. data/test/chittagong/chittagong_parser.rb +879 -0
  38. data/test/chittagong/chittagong_parser_test.rb +8 -10
  39. data/test/chittagong/chittagong_test.rb +17 -13
  40. data/test/compiled_parser_test.rb +7 -2
  41. data/test/evaluator_test.rb +0 -1
  42. data/test/grammar_test.rb +19 -1
  43. data/test/lexer_test.rb +215 -0
  44. data/test/parse_result_test.rb +8 -8
  45. data/test/parser_state_test.rb +0 -12
  46. metadata +21 -5
  47. data/test/arithmetic_precedence/arithmetic_precedence_tokenizer.rb +0 -39
  48. data/test/chittagong/chittagong_tokenizer.rb +0 -88
module Dhaka
  module LexerSupport
    # Raised when an invalid regular expression pattern is encountered
    # in a LexerSpecification
    class InvalidRegexException < StandardError
    end

    # A deterministic finite automaton recognizing a single regular
    # expression, built with the follow-set construction over the regex
    # syntax tree (positions are LeafNode/AcceptingNode instances).
    class DFA < StateMachine #:nodoc:
      # Tokenizes and parses +regex+ (raising InvalidRegexException on
      # either failure), computes follow sets on the resulting syntax
      # tree, and seeds the state machine with the tree's first set.
      def initialize(regex)
        @regex = regex

        tokenize_result = RegexTokenizer.tokenize(@regex)
        raise InvalidRegexException.new(tokenize_error_message(tokenize_result)) if tokenize_result.has_error?

        parse_result = RegexParser.parse(tokenize_result)
        raise InvalidRegexException.new(parse_error_message(parse_result)) if parse_result.has_error?

        syntax_tree = parse_result
        syntax_tree.calculate_follow_sets

        super(ItemSet.new(syntax_tree.first))
      end

      # Error message for a tokenization failure, with '>>>' inserted
      # before the offending character.
      def tokenize_error_message(tokenize_result)
        "Invalid character #{@regex[tokenize_result.unexpected_char_index].chr}: #{@regex.dup.insert(tokenize_result.unexpected_char_index, '>>>')}"
      end

      # Error message for a parse failure, with '>>>' inserted before the
      # offending token.
      def parse_error_message(parse_result)
        unexpected_token = parse_result.unexpected_token
        return "Unexpected end of regex." if unexpected_token.symbol_name == END_SYMBOL_NAME
        "Unexpected token #{parse_result.unexpected_token.symbol_name}: #{@regex.dup.insert(parse_result.unexpected_token.input_position, '>>>')}"
      end

      # Destination item set reached from +key+ on +char+: the union of
      # the follow sets of every position in +key+ labeled with +char+.
      def dest_key_for(key, char)
        key.inject(ItemSet.new) do |destination, position|
          destination.merge(position.follow_set) if position.character == char
          destination
        end
      end

      # A new state for +key+; it is accepting (and carries the source
      # regex) iff the set contains an accepting position.
      def new_state_for_key(key)
        accepting_position = key.detect { |position| position.accepting }
        State.new(self, accepting_position && @regex)
      end

      # The characters labeling the non-accepting positions in +key+.
      def transition_characters(key)
        key.reject { |node| node.accepting }.inject(Set.new) do |characters, node|
          characters << node.character
        end
      end

      # True if +string+ is matched by this automaton, i.e. following the
      # transition for each input byte ends in an accepting state.
      def matches(string)
        state = @start_state
        string.unpack("C*").each do |byte|
          state = state.transitions[byte.chr]
          return false unless state
        end
        state.accepting?
      end
    end

  end
end
module Dhaka
  # Represents a portion of the input string that has been recognized as matching a given lexer pattern.
  class Lexeme
    # The pattern matched by this lexeme.
    attr_accessor :pattern

    # +input_position+ is the index in the input stream that this lexeme starts at.
    attr_reader :input_position
    attr_reader :characters

    def initialize(input_position) #:nodoc:
      @input_position = input_position
      @characters = []
    end

    # The substring of the input stream that this lexeme is comprised of.
    def value
      @characters.join
    end

    # Truthy once a pattern has been assigned, i.e. some lexer state has
    # accepted this lexeme.
    def accepted? #:nodoc:
      @pattern
    end

    # Appends a single character.
    def << char #:nodoc:
      @characters << char
    end

    # Appends a run of characters at once.
    def concat chars #:nodoc:
      @characters.concat chars
    end
  end
end
module Dhaka
  # The lexer generator. To generate a lexer from a lexer specification +MyLexerSpecification+:
  #   lexer = Dhaka::Lexer.new(MyLexerSpecification)
  #
  # To compile this lexer as +MyLexer+ to a string of Ruby source:
  #   lexer.compile_to_ruby_source_as(:MyLexer)
  class Lexer < LexerSupport::StateMachine
    attr_reader :specification

    # Creates a new lexer from a given specification: builds one DFA per
    # pattern and merges all their start states into a single start set.
    def initialize(specification)
      @specification = specification
      dfas = {}
      specification.items.each_key do |pattern|
        dfas[pattern] = LexerSupport::DFA.new(pattern)
      end
      super(ItemSet.new(dfas.values.collect { |dfa| dfa.start_state }))
    end

    # Compiles the lexer to Ruby code that when executed, reloads all the states and actions of the lexer
    # into a class named +lexer_class_name+.
    def compile_to_ruby_source_as(lexer_class_name)
      source = "class #{lexer_class_name} < Dhaka::CompiledLexer\n\n"
      source << "  self.specification = #{specification.name}\n\n"
      source << "  start_with #{start_state.object_id}\n\n"
      @states.each_value do |state|
        source << "#{state.compile_to_ruby_source}\n\n"
      end
      source << "end"
      source
    end

    # Returns a LexerRun that tokenizes +input+.
    def lex(input)
      LexerRun.new(self, input)
    end

    # The user-supplied action block registered for +pattern+.
    def action_for_pattern pattern #:nodoc
      @specification.items[pattern].action
    end

    private

    # Accepting state for a merged key: of all accepting member states,
    # the one whose specification item ranks lowest (highest priority)
    # supplies the pattern.
    def new_state_for_key(key)
      accepting_states = key.select { |state| state.accepting? }
      winning_item = accepting_states.collect { |state| @specification.items[state.pattern] }.min
      LexerSupport::State.new(self, winning_item && winning_item.pattern)
    end

    # The union (without duplicates) of all transition characters of the
    # member states.
    def transition_characters(states)
      states.inject([]) { |characters, state| characters | state.transitions.keys }
    end

    # Destination key on +char+: the set of member-state destinations
    # that define a transition for it.
    def dest_key_for(states, char)
      states.inject(ItemSet.new) do |destinations, state|
        next_state = state.transitions[char]
        destinations << next_state if next_state
        destinations
      end
    end
  end
end
module Dhaka
  # Represents a run of a lexer on a given input string.
  class LexerRun
    include Enumerable

    attr_reader :current_lexeme

    def initialize(lexer, input)
      @lexer, @input = lexer, input
      @input_position = 0
      @not_yet_accepted_chars = []
    end

    # Constructs a token of type +symbol_name+ from the +current_lexeme+.
    def create_token(symbol_name)
      Token.new(symbol_name, @current_lexeme.characters.join, @current_lexeme.input_position)
    end

    # Yields each token as it is recognized. Returns a TokenizerErrorResult if an error occurs during tokenization.
    def each
      reset_and_rewind
      loop do
        char = curr_char
        # End of input, nothing pending and nothing matched: we are done.
        break if char == "\0" && @not_yet_accepted_chars.empty? && !@current_lexeme.accepted?
        dest_state = @curr_state.transitions[char]
        if dest_state
          @curr_state = dest_state
          if @curr_state.accepting?
            # Longest-match bookkeeping: fold any provisionally consumed
            # characters into the lexeme now that a pattern accepts them.
            @current_lexeme.pattern = @curr_state.pattern
            @current_lexeme.concat @not_yet_accepted_chars
            @not_yet_accepted_chars = []
            @current_lexeme << char
          else
            @not_yet_accepted_chars << char
          end
          advance
        else
          # Dead end: emit the last accepted lexeme (or fail if none).
          return TokenizerErrorResult.new(@input_position) unless @current_lexeme.accepted?
          token = get_token
          yield token if token
          reset_and_rewind
        end
      end
      yield Token.new(END_SYMBOL_NAME, nil, nil)
    end

    private

    # Rewinds the cursor past any provisionally consumed characters and
    # starts a fresh lexeme from the lexer's start state.
    def reset_and_rewind
      @input_position -= @not_yet_accepted_chars.size
      @current_lexeme = Lexeme.new(@input_position)
      @curr_state = @lexer.start_state
      @not_yet_accepted_chars = []
    end

    # Character at the cursor; "\0" once past the end of the input.
    def curr_char
      (@input[@input_position] || 0).chr
    end

    def advance
      @input_position += 1
    end

    # Runs the user action registered for the accepted pattern; the
    # action may return nil (e.g. for skipped whitespace).
    def get_token
      instance_eval(&@lexer.action_for_pattern(@current_lexeme.pattern))
    end
  end
end
module Dhaka
  module LexerSupport #:nodoc:all
    # Character repertoires used to expand classes, sets and the '.'
    # wildcard in the regex grammar.
    DIGITS            = ('0'..'9').to_a
    LOWERCASE_LETTERS = ('a'..'z').to_a
    UPPERCASE_LETTERS = ('A'..'Z').to_a
    LETTERS           = LOWERCASE_LETTERS + UPPERCASE_LETTERS
    WHITESPACE        = [" ", "\n", "\t"]
    SYMBOLS           = %w| ~ ` ! @ # % & _ = : ; " ' < , > - |
    # Shorthand classes: \d, \w and \s.
    CLASSES           = {'d' => DIGITS, 'w' => LETTERS, 's' => WHITESPACE}

    # Regex metacharacters, mapped to the production names used when they
    # appear escaped (matching themselves literally).
    OPERATOR_CHARACTERS = {'(' => 'open_parenth', ')' => 'close_parenth', '[' => 'open_square_bracket',
                           ']' => 'close_square_bracket', '+' => 'plus', '*' => 'asterisk',
                           '?' => 'question_mark', '.' => 'period', '\\' => 'back_slash',
                           '|' => 'pipe', '{' => 'left_curly_brace', '}' => 'right_curly_brace',
                           '/' => 'forward_slash', '^' => 'caret', '$' => 'dollar'}

    # Characters that are special inside [...] sets and must be escaped there.
    SET_OPERATOR_CHARACTERS = %w| - ^ [ ] \\ |

    # Every character the lexer generator knows about.
    ALL_CHARACTERS = DIGITS + LETTERS + SYMBOLS + WHITESPACE + OPERATOR_CHARACTERS.keys
  end
end
21
+
22
module Dhaka
  module LexerSupport #:nodoc:all
    # Grammar for the regular-expression language accepted by the lexer
    # generator. Each production action builds a node of the syntax tree
    # that DFA construction later walks.
    class RegexGrammar < Dhaka::Grammar

      for_symbol(Dhaka::START_SYMBOL_NAME) do
        regex %w| Disjunction | do RootNode.new(child_nodes[0]) end
      end

      for_symbol('Disjunction') do
        disjunction %w| Alternative \| Disjunction | do OrNode.new(child_nodes[0], child_nodes[2]) end
        alternative %w| Alternative | do child_nodes[0] end
      end

      for_symbol('Alternative') do
        concatenation %w| Alternative Term | do CatNode.new(child_nodes[0], child_nodes[1]) end
        term %w| Term | do child_nodes[0] end
      end

      for_symbol('Term') do
        zero_or_more %w| Atom * | do ZeroOrMoreNode.new(child_nodes[0]) end
        one_or_more %w| Atom + | do OneOrMoreNode.new(child_nodes[0]) end
        zero_or_one %w| Atom ? | do ZeroOrOneNode.new(child_nodes[0]) end
        atom %w| Atom | do child_nodes[0] end
      end

      for_symbol('Atom') do
        group %w| ( Disjunction ) | do child_nodes[1] end
        char %w| Character | do LeafNode.new(child_nodes[0]) end
        # '.' expands to an alternation over every known character except newline.
        anything %w| . | do OrNode.new(*(ALL_CHARACTERS - ["\n"]).collect { |set_char| LeafNode.new(set_char) }) end
        positive_set %w| [ SetContents ] | do OrNode.new(*child_nodes[1].collect { |set_char| LeafNode.new(set_char) }) end
        negative_set %w| [ ^ SetContents ] | do OrNode.new(*(ALL_CHARACTERS - child_nodes[2]).collect { |set_char| LeafNode.new(set_char) }) end

        # Shorthand classes (\d, \w, \s) expand to an alternation over the
        # characters in the class.
        CLASSES.each do |class_char, expansion|
          send("character_class_#{class_char}", ['\\', class_char]) do
            OrNode.new(*CLASSES[class_char].collect { |c| LeafNode.new(c) })
          end
        end

        # An escaped metacharacter matches itself literally.
        OPERATOR_CHARACTERS.each do |operator_char, production_name|
          send(production_name, ['\\', operator_char]) do
            LeafNode.new(operator_char)
          end
        end
      end

      for_symbol('Character') do
        letter_character %w| Letter | do child_nodes[0] end
        digit_character %w| Digit | do child_nodes[0] end
        white_space_character %w| Whitespace | do child_nodes[0] end
        symbol_character %w| Symbol | do child_nodes[0] end
      end

      for_symbol('SetContents') do
        single_item %w| SetItem | do child_nodes[0] end
        multiple_items %w| SetContents SetItem | do child_nodes[0].concat child_nodes[1] end
      end

      # A set item is a single character or a homogeneous range (a-z, A-Z, 0-9).
      for_symbol('SetItem') do
        single_char_item %w| SetCharacter | do [child_nodes[0]] end
        lower_case_letter_range %w| LowercaseLetter - LowercaseLetter | do (child_nodes[0]..child_nodes[2]).to_a end
        upper_case_letter_range %w| UppercaseLetter - UppercaseLetter | do (child_nodes[0]..child_nodes[2]).to_a end
        digit_range %w| Digit - Digit | do (child_nodes[0]..child_nodes[2]).to_a end
      end

      for_symbol('Letter') do
        lower_case_letter %w| LowercaseLetter | do child_nodes[0] end
        upper_case_letter %w| UppercaseLetter | do child_nodes[0] end
      end

      # One production per concrete character, generated with +send+ so the
      # production name embeds the character (byte in Ruby 1.8).
      for_symbol('LowercaseLetter') do
        LOWERCASE_LETTERS.each do |letter|
          send("lower_char_letter_#{letter}", letter) do
            letter
          end
        end
      end

      for_symbol('UppercaseLetter') do
        UPPERCASE_LETTERS.each do |letter|
          send("upper_case_letter_#{letter}", letter) do
            letter
          end
        end
      end

      for_symbol('Digit') do
        DIGITS.each do |digit|
          send("digit_#{digit}", digit) do
            digit
          end
        end
      end

      for_symbol('Whitespace') do
        WHITESPACE.each do |whitespace_char|
          send("whitespace_#{whitespace_char[0]}", whitespace_char) do
            whitespace_char
          end
        end
      end

      for_symbol('Symbol') do
        SYMBOLS.each do |symbol_char|
          send("symbol_char_#{symbol_char[0]}", symbol_char) do
            symbol_char
          end
        end
      end

      for_symbol('SetCharacter') do
        (ALL_CHARACTERS - SET_OPERATOR_CHARACTERS).each do |set_char|
          send("set_character_#{set_char[0]}", set_char) do
            set_char
          end
        end
        # Set metacharacters must appear escaped inside a set.
        SET_OPERATOR_CHARACTERS.each do |set_char|
          send("set_operator_character_#{set_char[0]}", ['\\', set_char]) do
            set_char
          end
        end
      end
    end
  end
end
145
+
146
+
147
module Dhaka
  module LexerSupport #:nodoc:all
    # Base class for regex syntax tree nodes. The hierarchy implements
    # the classic nullable/first/last/follow-set computation used to
    # build a DFA directly from a regular expression syntax tree.
    class ASTNode
      # Only AcceptingNode positions are accepting.
      def accepting
        false
      end
    end

    # A node with exactly two ordered children.
    class BinaryNode < ASTNode
      attr_reader :left, :right
      def initialize left, right
        @left, @right = left, right
      end

      # Emits this node and both subtrees into a dot +graph+.
      def to_dot(graph)
        graph.node(self, :label => label)
        graph.edge(self, left)
        graph.edge(self, right)
        left.to_dot(graph)
        right.to_dot(graph)
      end

      def calculate_follow_sets
        left.calculate_follow_sets
        right.calculate_follow_sets
      end
    end

    # Alternation: matches if any child matches.
    class OrNode < ASTNode
      attr_reader :children
      def initialize(*children)
        @children = children
      end
      def label
        "|"
      end

      # Nullable if any alternative is nullable.
      def nullable
        children.any? {|child| child.nullable}
      end

      # Union of the children's first sets.
      def first
        children.inject(Set.new([])) do |result, child|
          result | child.first
        end
      end

      # Union of the children's last sets.
      def last
        children.inject(Set.new([])) do |result, child|
          result | child.last
        end
      end

      def to_dot(graph)
        graph.node(self, :label => label)
        children.each do |child|
          graph.edge(self, child)
          child.to_dot(graph)
        end
      end

      def calculate_follow_sets
        children.each do |child|
          child.calculate_follow_sets
        end
      end
    end

    # Concatenation: +left+ followed by +right+.
    class CatNode < BinaryNode
      def label
        "cat"
      end

      # Nullable only if both halves are nullable.
      def nullable
        left.nullable && right.nullable
      end

      def first
        left.nullable ? (left.first | right.first) : left.first
      end

      def last
        right.nullable ? (left.last | right.last) : right.last
      end

      # Every position that can end +left+ may be followed by any
      # position that can start +right+.
      def calculate_follow_sets
        super
        left.last.each do |leaf_node|
          leaf_node.follow_set.merge right.first
        end
      end
    end

    # A node with a single child.
    class UnaryNode < ASTNode
      attr_reader :child
      def initialize child
        @child = child
      end

      def to_dot(graph)
        graph.node(self, :label => label)
        graph.edge(self, child)
        child.to_dot(graph)
      end

      def nullable
        child.nullable
      end

      def first
        child.first
      end

      def last
        child.last
      end

      def calculate_follow_sets
        child.calculate_follow_sets
      end
    end

    # Root of every tree: the parsed regex concatenated with an
    # AcceptingNode sentinel, so reaching the sentinel means a match.
    class RootNode < CatNode
      def initialize(left)
        super(left, AcceptingNode.new())
      end

      def label
        "start"
      end

      def head_node?
        true
      end
    end

    # Kleene star: zero or more repetitions of the child.
    class ZeroOrMoreNode < UnaryNode
      def label
        "*"
      end

      def nullable
        true
      end

      # Repetition: the last positions of the child loop back to its
      # first positions.
      def calculate_follow_sets
        super
        last.each do |leaf_node|
          leaf_node.follow_set.merge first
        end
      end
    end

    # Optional: zero or one occurrence of the child.
    class ZeroOrOneNode < UnaryNode
      def label
        "?"
      end

      def nullable
        true
      end
    end

    # One or more repetitions: like '*' for follow sets, but not nullable
    # (nullability is inherited from the child via UnaryNode).
    class OneOrMoreNode < UnaryNode
      def label
        "+"
      end

      def calculate_follow_sets
        super
        last.each do |leaf_node|
          leaf_node.follow_set.merge first
        end
      end
    end

    # A single-character position in the regex; carries the follow set
    # that DFA construction reads.
    class LeafNode < ASTNode
      attr_reader :character, :follow_set
      def initialize character
        @character = character
        @follow_set = Set.new
      end

      def to_dot(graph)
        graph.node(self, :label => character)
      end

      def nullable
        false
      end

      def first
        Set.new([self])
      end

      def last
        Set.new([self])
      end

      def calculate_follow_sets
      end
    end

    # Sentinel position appended by RootNode; reaching it means the whole
    # pattern has matched.
    class AcceptingNode < ASTNode
      def accepting
        true
      end

      # The sentinel has no character of its own.
      def character
      end

      # Fix: previously missing. RootNode#nullable (inherited from
      # CatNode) queries right.nullable whenever the regex itself is
      # nullable, which raised NoMethodError. The sentinel never matches
      # the empty string by itself, so it is not nullable.
      def nullable
        false
      end

      def first
        Set.new([self])
      end

      # Fix: previously missing. RootNode#last (inherited from CatNode)
      # always queries right.nullable/right.last, which raised
      # NoMethodError before this method existed.
      def last
        Set.new([self])
      end

      def calculate_follow_sets
      end

      def to_dot(graph)
        graph.node(self, :label => '#')
      end
    end
  end
end