aurum 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +29 -0
- data/examples/dangling_else/grammar.rb +23 -0
- data/examples/expression/grammar.rb +28 -0
- data/examples/smalltalk/grammar.rb +151 -0
- data/examples/smalltalk/interpreter.rb +70 -0
- data/examples/yacc/grammar.rb +72 -0
- data/lib/aurum.rb +1 -9
- data/lib/aurum/engine.rb +39 -175
- data/lib/aurum/engine/parsing_facility.rb +107 -0
- data/lib/aurum/engine/tokenization_facility.rb +86 -0
- data/lib/aurum/grammar.rb +52 -219
- data/lib/aurum/grammar/automata.rb +194 -0
- data/lib/aurum/grammar/builder/augmented_grammar.rb +83 -0
- data/lib/aurum/grammar/builder/dot_logger.rb +66 -0
- data/lib/aurum/grammar/builder/lexical_table_builder.rb +55 -0
- data/lib/aurum/grammar/builder/parsing_table_builder.rb +238 -0
- data/lib/aurum/grammar/builder/set_of_items.rb +190 -0
- data/lib/aurum/grammar/compiled_tables.rb +20 -0
- data/lib/aurum/grammar/dsl/lexical_definition.rb +94 -0
- data/lib/aurum/grammar/dsl/syntax_definition.rb +79 -0
- data/lib/aurum/grammar/lexical_rules.rb +224 -0
- data/lib/aurum/grammar/metalang/grammar.rb +47 -0
- data/lib/aurum/grammar/syntax_rules.rb +95 -0
- data/spec/builder/dsl_definition/aurum_grammar_spec.rb +33 -0
- data/spec/engine/lexer_spec.rb +59 -0
- data/spec/engine/parser_spec.rb +90 -0
- data/spec/examples/dangling_else_example.rb +30 -0
- data/spec/examples/expression_example.rb +48 -0
- data/spec/examples/smalltalk_example.rb +50 -0
- data/spec/examples/yacc_spec.rb +30 -0
- data/spec/grammar/builder/lexical_table/automata_spec.rb +55 -0
- data/spec/grammar/builder/lexical_table/builder_spec.rb +78 -0
- data/spec/grammar/builder/lexical_table/character_set_spec.rb +100 -0
- data/spec/grammar/builder/lexical_table/pattern_spec.rb +11 -0
- data/spec/grammar/builder/lexical_table/regular_expression.rb +40 -0
- data/spec/grammar/builder/parsing_table/augmented_grammar_spec.rb +36 -0
- data/spec/grammar/builder/parsing_table/builder_spec.rb +152 -0
- data/spec/grammar/builder/parsing_table/digraph_traverser_spec.rb +42 -0
- data/spec/grammar/builder/parsing_table/item_spec.rb +51 -0
- data/spec/grammar/builder/parsing_table/sources_spec.rb +66 -0
- data/spec/grammar/builder/parsing_table/state_spec.rb +82 -0
- data/spec/grammar/dsl/character_classes_builder_spec.rb +50 -0
- data/spec/grammar/dsl/lexical_rules_builder_spec.rb +181 -0
- data/spec/grammar/dsl/precedence_builder_spec.rb +64 -0
- data/spec/grammar/dsl/productions_builder_spec.rb +78 -0
- data/spec/grammar/metalang/metalang_spec.rb +0 -0
- data/spec/grammar/precedence_spec.rb +42 -0
- data/spec/grammar/syntax_rules_spec.rb +31 -0
- data/spec/parser_matcher.rb +69 -0
- data/spec/pattern_matcher.rb +123 -0
- data/spec/spec_helper.rb +133 -0
- metadata +70 -36
- data/example/expression/expression.rb +0 -35
- data/example/expression/lisp.rb +0 -26
- data/lib/aurum/lexical_table_generator.rb +0 -429
- data/lib/aurum/parsing_table_generator.rb +0 -464
- data/test/engine/lexer_test.rb +0 -59
- data/test/engine/semantic_attributes_test.rb +0 -15
- data/test/grammar_definition/character_class_definition_test.rb +0 -28
- data/test/grammar_definition/grammar_definition_test.rb +0 -55
- data/test/grammar_definition/lexical_definition_test.rb +0 -56
- data/test/grammar_definition/operator_precedence_definition_test.rb +0 -35
- data/test/grammar_definition/production_definition_test.rb +0 -60
- data/test/lexical_table_generator/automata_test.rb +0 -74
- data/test/lexical_table_generator/character_set_test.rb +0 -73
- data/test/lexical_table_generator/interval_test.rb +0 -36
- data/test/lexical_table_generator/pattern_test.rb +0 -115
- data/test/lexical_table_generator/subset_determinizer_test.rb +0 -19
- data/test/lexical_table_generator/table_generator_test.rb +0 -126
- data/test/parsing_table_generator/augmented_grammar_test.rb +0 -45
- data/test/parsing_table_generator/lalr_n_computation_test.rb +0 -92
- data/test/parsing_table_generator/lr_0_automata_test.rb +0 -94
- data/test/parsing_table_generator/lr_item_test.rb +0 -27
- data/test/parsing_table_generator/parsing_table_state_test.rb +0 -39
- data/test/parsing_table_generator/precedence_table_test.rb +0 -28
- data/test/parsing_table_generator/production_test.rb +0 -9
- data/test/test_helper.rb +0 -103
@@ -0,0 +1,190 @@
|
|
1
|
+
require 'delegate'
|
2
|
+
|
3
|
+
module Aurum
|
4
|
+
module Builder
|
5
|
+
# An LR item: a production plus the position of the "dot" inside its
# right-hand side. Equality and hashing come from the two Struct members.
#
# Derived facts are computed once at construction time:
#   dot_symbol - the symbol right after the dot (nil when the dot is at the end)
#   is_handle  - true when the dot is at (or past) the end of the production
#   is_kernel  - true for handles, for items whose dot is not at position 0,
#                and for items whose production belongs to StartSymbol
#   remaining  - the symbols from the dot onwards ([] for handles)
class LRItem < Struct.new(:production, :position)
  attr_reader :dot_symbol, :is_kernel, :is_handle, :remaining

  def initialize(production, position = 0)
    super(production, position)
    rhs = production.symbols
    @is_handle = position >= rhs.size
    @is_kernel = @is_handle || position != 0 || production.nonterminal == StartSymbol
    @dot_symbol = rhs[position]
    @remaining = @is_handle ? [] : rhs[position..-1]
  end

  # Returns a new item for the same production with the dot moved one symbol right.
  def next
    LRItem.new(production, position + 1)
  end

  # True when the dot sits before the last symbol and that symbol is +symbol+.
  def reducable?(symbol)
    return false unless position == production.symbols.size - 1
    @dot_symbol == symbol
  end

  # Renders the item as "name -> a .b c " with '.' marking the dot position.
  def inspect
    pieces = []
    production.symbols.each_with_index do |symbol, index|
      marker = index == position ? '.' : ''
      pieces << "#{marker}#{symbol.inspect} "
    end
    pieces << '.' if position >= production.symbols.length
    "#{production.nonterminal.name} -> #{pieces.join}"
  end

  alias_method :kernel?, :is_kernel
  alias_method :handle?, :is_handle
  alias_method :to_s, :inspect
end
|
36
|
+
|
37
|
+
# A state of the LR automaton. The object delegates to an Array holding the
# kernel items returned by kernels_of_closure; the complete item set
# (kernel items plus the items added while closing) is kept in +all_items+.
#
# Besides the items, a state records:
#   actions        - symbol => list of ParsingTable::Action
#   handles        - kernel items whose dot is at the end
#   expect_symbols - dot symbols of every non-handle item in +all_items+
# and, privately, goto targets and predecessor states per symbol.
class State < DelegateClass(Array)
  attr_accessor :index
  attr_reader :actions, :handles, :all_items, :expect_symbols

  # augmented_grammar - must answer #productions, #nullable? and #first_set
  # items             - the seed LR items of this state
  def initialize(augmented_grammar, items)
    @augmented_grammar = augmented_grammar
    super(kernels_of_closure(items))
    @handles = select { |item| item.handle? }
    # NOTE: every missing key shares this one default array; add_action
    # always stores a fresh array before appending, so it is only ever read.
    @actions = Hash.new([])
    @goto_states = {}
    @predecessors = {}
    @expect_symbols = @all_items.reject { |item| item.handle? }.map { |item| item.dot_symbol }
  end

  # True when the state has several handles, or a handle mixed with other kernel items.
  def inconsistent?
    handle_count = @handles.size
    return true if handle_count > 1
    handle_count != 0 && handle_count != size
  end

  # True when at least one symbol has more than one recorded action.
  def conflict?
    @actions.any? { |_symbol, candidates| candidates.size > 1 }
  end

  # Symbols that have more than one recorded action.
  def conflicts
    @actions.keys.select { |symbol| @actions[symbol].size > 1 }
  end

  # The production of the only kernel item when that item is a handle, else nil.
  def read_reduce
    return nil unless @handles.size == 1 && size == 1
    first.production
  end

  # Builds the state reached over +symbol+, or nil when no item has its dot before it.
  def goto(symbol)
    advanced = @all_items.select { |item| item.dot_symbol == symbol }.map { |item| item.next }
    return nil if advanced.empty?
    State.new(@augmented_grammar, advanced)
  end

  def add_reduce(symbol, production)
    action = Aurum::ParsingTable::Action.new(:reduce, production)
    add_action(symbol, action)
  end

  def add_read_reduce(symbol, production)
    action = Aurum::ParsingTable::Action.new(:read_reduce, production)
    add_action(symbol, action)
  end

  # Records a shift to +state+ and links the two states over +symbol+.
  def add_shift(symbol, state)
    action = Aurum::ParsingTable::Action.new(:shift, state.index)
    add_action(symbol, action)
    add_goto_state(symbol, state)
  end

  # Links to +state+ and replaces every action on +symbol+ with a single lookahead shift.
  def add_lookahead_shift(symbol, state)
    add_goto_state(symbol, state)
    @actions[symbol] = [Aurum::ParsingTable::Action.new(:lookahead_shift, state.index)]
  end

  # Appends +action+ to the actions of +symbol+ unless an equal action is already there.
  def add_action(symbol, action)
    bucket = @actions.has_key?(symbol) ? @actions[symbol] : (@actions[symbol] = [])
    bucket << action unless bucket.include?(action)
  end

  # Without arguments: the symbol => predecessor-states hash.
  # With +symbols+: the states from which this state is reached by reading
  # +symbols+ in order (walks the predecessor links backwards).
  def predecessors(symbols = nil)
    return @predecessors unless symbols
    frontier = [self]
    symbols.reverse_each do |symbol|
      frontier = frontier.inject([]) do |found, state|
        sources = state.predecessors[symbol]
        sources ? found | sources : found
      end
    end
    frontier
  end

  # Union of the first sets of the remainders of the items in the goto target
  # of +symbol+; [] when +symbol+ is not expected in this state.
  def read_set(symbol)
    return [] unless @expect_symbols.include?(symbol)
    target = @goto_states[symbol]
    lookaheads = [].to_set
    return lookaheads unless target
    target.inject(lookaheads) { |collected, item| collected | first_set_of(item.remaining) }
  end

  # For every item with its dot before +nonterminal+ and a nullable rest,
  # yields each state reached by walking back over the symbols before the dot,
  # together with the nonterminal of the item's production.
  def include_each(nonterminal)
    return unless @expect_symbols.include?(nonterminal)
    @all_items.each do |item|
      next unless item.dot_symbol == nonterminal && nullable?(item.remaining[1..-1])
      prefix = item.production.symbols[0, item.position]
      predecessors(prefix).each do |predecessor|
        yield predecessor, item.production.nonterminal
      end
    end
  end

  # For every shift-like action on +nonterminal+, yields the goto target with
  # its expected terminal symbols and its expected nullable symbols.
  def read(nonterminal)
    return unless @expect_symbols.include?(nonterminal)
    @actions[nonterminal].each do |action|
      next unless action.shift_action?
      target = @goto_states[nonterminal]
      direct = []
      indirect = []
      target.expect_symbols.each do |symbol|
        direct << symbol if symbol.is_terminal
        indirect << symbol if @augmented_grammar.nullable?(symbol)
      end
      yield(target, direct, indirect)
    end
  end

  # Yields (production, position) for each item that becomes complete after reading +nonterminal+.
  def read_reduce_items(nonterminal)
    return unless @expect_symbols.include?(nonterminal)
    @all_items.each do |item|
      next unless item.reducable?(nonterminal)
      yield(item.production, item.position)
    end
  end

  # Yields (production, position) for each item whose dot is before
  # +nonterminal+ and whose symbols after it are all nullable.
  def reducable_items(nonterminal)
    return unless @expect_symbols.include?(nonterminal)
    @all_items.each do |item|
      next unless item.dot_symbol == nonterminal
      yield(item.production, item.position) if nullable?(item.remaining[1..-1])
    end
  end

  private

  # Closes +items+ (stored in @all_items) and returns only the kernel items.
  def kernels_of_closure(items)
    @all_items = items.dup
    kernels = @all_items.select { |item| item.kernel? }
    Builder.working_list(items.dup) do |unvisited, visiting|
      symbol = visiting.dot_symbol
      next if !symbol || symbol.is_terminal
      @augmented_grammar.productions(symbol).each do |production|
        candidate = LRItem.new(production)
        next if @all_items.include?(candidate)
        @all_items << candidate
        unvisited << candidate
        kernels << candidate if candidate.kernel?
      end
    end
    kernels
  end

  # First set of a symbol sequence: accumulates first sets up to and
  # including the first symbol that is not nullable.
  def first_set_of(symbols)
    symbols.inject([].to_set) do |collected, symbol|
      collected |= @augmented_grammar.first_set(symbol)
      break collected unless @augmented_grammar.nullable?(symbol)
      collected
    end
  end

  # True when +symbols+ is empty or every symbol in it is nullable.
  def nullable?(symbols)
    symbols.empty? || symbols.all? { |symbol| @augmented_grammar.nullable?(symbol) }
  end

  # Registers +state+ as the goto target over +symbol+ and this state as its predecessor.
  def add_goto_state(symbol, state)
    @goto_states[symbol] = state
    (state.predecessors[symbol] ||= []) << self
  end
end
|
189
|
+
end
|
190
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Aurum
  # Compiled parsing table: the productions, the action table and the lookahead table.
  class ParsingTable < Struct.new(:productions, :actions, :lookahead)
    # One parser action; +type+ is :shift, :reduce, :lookahead_shift or :read_reduce.
    class Action < Struct.new(:type, :value)
      # Exact-type predicates.
      def shift?
        type == :shift
      end

      def reduce?
        type == :reduce
      end

      def lookahead_shift?
        type == :lookahead_shift
      end

      def read_reduce?
        type == :read_reduce
      end

      # True for both kinds of shift (:shift and :lookahead_shift).
      def shift_action?
        shift? || lookahead_shift?
      end

      # True for both kinds of reduction (:reduce and :read_reduce).
      def reduce_action?
        reduce? || read_reduce?
      end
    end
  end

  # Compiled lexical table: automaton states, lexical states and lexer actions.
  class LexicalTable < Struct.new(:states, :lexical_states, :actions)
    # What the lexer does on a match: token to recognize, lexical state to
    # shift to, event to raise and the user-supplied semantic action.
    Action = Struct.new(:token, :state, :event, :action)
  end
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
module Aurum
|
2
|
+
class Grammar
|
3
|
+
module DSL
|
4
|
+
# DSL receiver for declaring named character classes. Any unknown message
# is treated either as a lookup (`digit`) or as a definition
# (`vowel 'aeiou'`), so nearly every inherited public method is removed.
class CharacterClassesBuilder
  # Method names are compared as Strings: Module#instance_methods returns
  # Strings on Ruby 1.8 but Symbols on 1.9+, where a Symbol == String
  # comparison is always false and instance_eval would be undefined too.
  instance_methods.each do |m|
    method_name = m.to_s
    undef_method m unless method_name =~ /^__/ || method_name == 'new' || method_name == 'instance_eval'
  end
  attr_reader :__named_character_classes__

  # Seeds the table with the predefined classes.
  def initialize
    @__named_character_classes__ = {'any' => Aurum::Grammar::LexicalRules::CharacterSet.any,
      'underscore' => enum('_'), 'single_quote' => enum("'"), 'double_quote' => enum('"'),
      'ascii' => range(0, 255), 'decimal_number' => range(?0, ?9),
      'ascii_punctuation' => range(33, 47) + range(58, 64) + range(91, 96) + range(123, 126)}
  end

  # Character set for the inclusive range first..last (a single character by default).
  def range first, last = first
    Grammar.range(first, last)
  end

  # Character set made of the characters of +literal+.
  def enum literal
    Grammar.enum(literal)
  end

  # `name`          - returns the class registered under +name+ (nil if none)
  # `name char_set` - registers a new class; anything that is not a
  #                   CharacterSet is converted with enum(char_set.to_s).
  # Raises RuntimeError when +name+ is already defined.
  def method_missing name, char_set = nil
    name = name.to_s
    return @__named_character_classes__[name] unless char_set
    raise "already defined character class: #{name}" if @__named_character_classes__.has_key? name
    @__named_character_classes__[name] = char_set.kind_of?(Aurum::Grammar::LexicalRules::CharacterSet) ? char_set : enum(char_set.to_s)
  end
end
|
30
|
+
|
31
|
+
# DSL receiver for declaring lexical rules. Unknown messages define or look
# up named patterns (see method_missing), so nearly every inherited public
# method is removed.
class LexicalRulesBuilder
  # Method names are compared as Strings: Module#instance_methods returns
  # Strings on Ruby 1.8 but Symbols on 1.9+, where comparing a Symbol with
  # 'instance_eval' is always false and instance_eval would be removed.
  instance_methods.each do |m|
    method_name = m.to_s
    undef_method m unless method_name =~ /^__/ || method_name == 'instance_eval'
  end

  # lexical_rules     - receives add_lexical_action(state, pattern, action)
  # character_classes - answers __named_character_classes__ (name => set)
  def initialize lexical_rules, character_classes
    @lexical_rules, @character_classes = lexical_rules, character_classes
    @named_patterns, @current_states = {}, [['initial']]
  end

  # Pattern matching one character in the inclusive range first..last.
  def range first, last = first
    Aurum::Grammar::LexicalRules::Pattern.character_set(Grammar.range(first, last))
  end

  # Pattern matching any one character of +literal+.
  def enum literal
    Aurum::Grammar::LexicalRules::Pattern.enum(literal)
  end

  # Pattern matching +literal+ exactly.
  def string literal
    Aurum::Grammar::LexicalRules::Pattern.string literal
  end

  # Concatenation of +patterns+; non-Pattern arguments become string patterns.
  def concat *patterns
    parts = patterns.map do |pattern|
      pattern.is_a?(Aurum::Grammar::LexicalRules::Pattern) ? pattern : string(pattern.to_s)
    end
    Aurum::Grammar::LexicalRules::Pattern.concat(parts)
  end

  # Evaluates +definition+ with +states+ as the current lexical states.
  # Patterns named inside the block are discarded afterwards. The snapshot
  # is a local variable so nested `within` blocks each restore their own
  # scope, and the restore also runs when the block raises.
  def within *states, &definition
    saved_named_patterns = @named_patterns.dup
    @current_states << states.map {|state| state.to_s}
    begin
      instance_eval(&definition)
    ensure
      @current_states.pop
      @named_patterns = saved_named_patterns
    end
    @named_patterns
  end

  # Like match, but the token is always '$ignored'. The caller's +options+
  # hash is left untouched.
  def ignore pattern, options = {}, &semantic_action
    match(pattern, options.merge(:recognize => '$ignored'), &semantic_action)
  end

  # Registers +pattern+ in every current lexical state.
  # Options: :recognize (token name), :shift_to (next lexical state),
  # :event; each is converted with to_s when present.
  def match pattern, options = {}, &semantic_action
    event = options[:event].to_s if options.has_key?(:event)
    token = options[:recognize].to_s if options.has_key?(:recognize)
    lexical_state = options[:shift_to].to_s if options.has_key?(:shift_to)
    action = Aurum::LexicalTable::Action.new(token, lexical_state, event, semantic_action)
    pattern = string(pattern.to_s) unless pattern.is_a?(Aurum::Grammar::LexicalRules::Pattern)
    @current_states.last.each do |state|
      @lexical_rules.add_lexical_action(state, pattern, action)
    end
  end

  # `_name patterns...` - matches the concatenated patterns as token _name
  # `name`              - looks up a named pattern, then a character class;
  #                       raises RuntimeError when neither exists
  # `name patterns...`  - names the concatenation of the patterns
  # The concatenation is built only on the paths that use it.
  def method_missing name, *patterns, &semantic_action
    name = name.to_s
    return match(concat(*patterns), :recognize => name, &semantic_action) if name =~ /^_/
    if patterns.empty?
      return @named_patterns[name] if @named_patterns.has_key? name
      classes = @character_classes.__named_character_classes__
      return Aurum::Grammar::LexicalRules::Pattern.character_set(classes[name]) if classes.has_key? name
      raise "can not find pattern named '#{name}'"
    end
    @named_patterns[name] = concat(*patterns)
  end
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
module Aurum
|
2
|
+
class Grammar
|
3
|
+
module DSL
|
4
|
+
# DSL receiver for declaring operator precedences. Each non-empty
# declaration takes the current level and then lowers it by one, so every
# later declaration is registered with a smaller level number.
class PrecedencesBuilder
  def initialize(syntax_rules)
    @syntax_rules = syntax_rules
    @level = 0
  end

  # Declares non-associative operators at the current level.
  def nonassoc(*operators)
    operator(:non_associative, *operators)
  end

  # Declares left-associative operators at the current level.
  def left(*operators)
    operator(:left_associative, *operators)
  end

  # Declares right-associative operators at the current level.
  def right(*operators)
    operator(:right_associative, *operators)
  end

  # Registers every operator with the given associativity at the current
  # level. String operators are stored under "$literal_<operator>", anything
  # else under its to_s. Returns the new level, or nil when no operator was given.
  def operator(associative, *operators)
    operators.each do |op|
      symbol_name = op.is_a?(String) ? "$literal_#{op}" : op.to_s
      precedence = Aurum::Grammar.precedence(associative, @level)
      @syntax_rules.add_operator_precedence(symbol_name, precedence)
    end
    return nil if operators.empty?
    @level -= 1
  end
end
|
29
|
+
|
30
|
+
# DSL receiver for declaring productions. Every unknown message is a grammar
# symbol: without arguments it is a reference to the symbol, with arguments
# it defines a production for it (see method_missing).
class ProductionsBuilder
  # Method names are compared as Strings: Module#instance_methods returns
  # Strings on Ruby 1.8 but Symbols on 1.9+, where comparing a Symbol with
  # 'instance_eval' is always false and instance_eval would be removed.
  instance_methods.each do |m|
    method_name = m.to_s
    undef_method m unless method_name =~ /^__/ || method_name == 'instance_eval'
  end

  # syntax_rules - receives add_syntax_rule and add_literal
  # actions      - hash filled with production name => semantic action
  def initialize syntax_rules, actions
    @syntax_rules, @actions, @symbols = syntax_rules, actions, {}
  end

  private

  # `name` / `name { ... }` - returns the symbol called +name+ (Epsilon when
  #                           no symbol is registered for it, e.g. `_`) after
  #                           setting its action to the block, or to nil.
  # `name sym, 'lit', ..., :production_name { ... }`
  #                         - adds a production for +name+. A trailing Ruby
  #                           Symbol names the production; non-symbol
  #                           arguments become literals; the symbol list
  #                           stops at Epsilon. The action is the block, or
  #                           else the action attached to the last argument.
  def method_missing name, *arguments, &action
    name = name.to_s
    __add_symbol__ name
    symbol = (@symbols[name] || Aurum::Grammar::Epsilon)
    if arguments.empty?
      symbol.action = block_given? ? action : nil
      return symbol
    end
    production_name = arguments.pop.to_s if arguments.last.is_a?(Symbol)
    production_action = action || __action_of_last_grammar_symbol__(arguments)
    symbols = arguments.inject([]) do |result, arg|
      break result if arg == Aurum::Grammar::Epsilon
      result << (arg.is_a?(Aurum::Grammar::SyntaxRules::Symbol) ? arg : __add_literal__(arg))
    end
    production = Aurum::Grammar.production(symbol, symbols)
    production.name = production_name || "$production_#{production.object_id}"
    result = @syntax_rules.add_syntax_rule(production)
    @actions[production.name] = production_action if result && production_action
  end

  # Registers +literal+ with the syntax rules and returns its terminal,
  # cached under "$literal_<literal>".
  def __add_literal__ literal
    @syntax_rules.add_literal literal.to_s
    literal = "$literal_#{literal}"
    @symbols.has_key?(literal) ? @symbols[literal] : @symbols[literal] = Aurum::Grammar.terminal(literal)
  end

  # Creates the symbol for +name+ on first use: names starting with a
  # lower-case letter are nonterminals; names starting with '_' (plus at
  # least one more character) or containing "$literal_" are terminals.
  # `then` replaces the Ruby 1.8-only `when ... :` form, which no longer
  # parses on Ruby 1.9 and later.
  def __add_symbol__ name
    @symbols[name] = case name
      when /^[a-z]/ then Aurum::Grammar.nonterminal(name)
      when /^_.+/ then Aurum::Grammar.terminal(name)
      when /\$literal_/ then Aurum::Grammar.terminal(name)
    end unless @symbols.has_key?(name) or name == '_'
  end

  # The action attached to the last element of +symbols+ when it is a grammar symbol, else nil.
  def __action_of_last_grammar_symbol__ symbols
    last = symbols.last
    last.is_a?(Aurum::Grammar::SyntaxRules::Symbol) ? last.action : nil
  end
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
@@ -0,0 +1,224 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'automata')
|
2
|
+
|
3
|
+
module Aurum
|
4
|
+
class Grammar
|
5
|
+
class LexicalRules
|
6
|
+
attr_reader :patterns
|
7
|
+
# Starts with no patterns; add_lexical_action fills the table as
# lexical state name => { pattern => action }.
def initialize
  @patterns = Hash.new
end
|
10
|
+
|
11
|
+
# Associates +pattern+ with +action+ inside the lexical state +state+
# (identified by state.to_s), replacing any action already stored for an
# equal pattern. Returns +action+.
def add_lexical_action(state, pattern, action)
  rules_for_state = (@patterns[state.to_s] ||= {})
  rules_for_state[pattern] = action
end
|
16
|
+
|
17
|
+
# A regular-expression fragment: an automaton plus the index of its
# accepting state. State 0 is used as the entry state by every operation
# below. All operators return new Pattern objects.
class Pattern
  # Pattern matching +literal+ byte by byte through a chain of states.
  def self.string(literal)
    automata = Automata.new(literal.size + 1)
    state = 0
    literal.each_byte do |byte|
      automata.connect(state, CharacterSet::Interval.new(byte).to_char_set, state + 1)
      state += 1
    end
    Pattern.new(automata, state)
  end

  # Pattern matching any single byte of +literal+.
  def self.enum(literal)
    char_set = CharacterSet.enum(literal)
    character_set(char_set)
  end

  # Pattern matching one character of +char_set+ (state 0 to state 1).
  def self.character_set(char_set)
    automata = Automata.new(2)
    accept = automata.connect(0, char_set, 1)
    Pattern.new(automata, accept)
  end

  # Pattern matching +patterns+ one after another: each automaton is merged
  # in and linked from the previous accepting state by an Epsilon transition.
  def self.concat(patterns)
    automata = Automata.new
    accept = patterns.inject(0) do |tail, pattern|
      offset = automata.merge!(pattern.automata)
      automata.connect(tail, Epsilon, offset) + pattern.accept
    end
    Pattern.new(automata, accept)
  end

  attr_reader :automata, :accept

  def initialize(automata, accept)
    @automata = automata
    @accept = accept
  end

  # Adds Epsilon links entry -> accept and accept -> entry.
  def zero_or_more
    extended_with([0, @accept], [@accept, 0])
  end

  # Adds an Epsilon link accept -> entry.
  def one_or_more
    extended_with([@accept, 0])
  end

  # Adds an Epsilon link entry -> accept.
  def zero_or_one
    extended_with([0, @accept])
  end

  # Complement: determinizes the automaton, sends every character a state
  # has no transition for to a new sink state, then links every state that
  # is not in the determinizer's accept table to a fresh accepting state.
  def not
    deterministic, accepts = SubsetDeterminizer.new(@automata, [@accept]).determinize
    sink = deterministic.new_state
    deterministic.connect(sink, CharacterSet.any, sink)
    (0...sink).each do |state|
      uncovered = deterministic.table[state].inject(CharacterSet.any) do |rest, transition|
        rest - transition.character_set
      end
      deterministic.connect(state, uncovered, sink) unless uncovered.empty?
    end
    accept = deterministic.new_state
    (0...accept).each do |state|
      next if accepts.has_key?(state)
      deterministic.connect(state, Epsilon, accept)
    end
    Pattern.new(deterministic, accept)
  end

  # Repetition: +least+ mandatory copies followed by (most - least) optional ones.
  def [](least, most = least)
    required = [self] * least
    optional = [self.zero_or_one] * (most - least)
    Pattern.concat(required + optional)
  end

  # Alternation: both automata are merged between a common entry (0) and a
  # common accepting state (1).
  def |(other)
    automata = Automata.new(2)
    [self, other].each do |pattern|
      offset = automata.merge!(pattern.automata)
      branch_accept = automata.connect(0, Epsilon, offset) + pattern.accept
      automata.connect(branch_accept, Epsilon, 1)
    end
    Pattern.new(automata, 1)
  end

  # Concatenates the complement of "anything, self, anything" with self.
  def ~
    anything = Pattern.character_set(CharacterSet.any).zero_or_more
    containing_self = Pattern.concat([anything, self, anything])
    Pattern.concat([containing_self.not, self])
  end

  private

  # Copies the automaton, adds an Epsilon transition for each [from, to]
  # pair and wraps the copy with the same accepting state.
  def extended_with(*links)
    copy = @automata.dup
    links.each { |from, to| copy.connect(from, Epsilon, to) }
    Pattern.new(copy, @accept)
  end
end
|
96
|
+
|
97
|
+
class CharacterSet
|
98
|
+
def self.enum(literal)
|
99
|
+
intervals = []
|
100
|
+
literal.each_byte {|char| intervals << Interval.new(char)}
|
101
|
+
CharacterSet.new(intervals)
|
102
|
+
end
|
103
|
+
|
104
|
+
def self.range(a, b=a)
|
105
|
+
Interval.new(a, b).to_char_set
|
106
|
+
end
|
107
|
+
|
108
|
+
def self.any
|
109
|
+
range(0, 65535)
|
110
|
+
end
|
111
|
+
attr_reader :intervals
|
112
|
+
def initialize(intervals = [])
|
113
|
+
@intervals = intervals
|
114
|
+
merge_intervals
|
115
|
+
end
|
116
|
+
|
117
|
+
def include? char
|
118
|
+
@intervals.any? {|interval| interval.include?(char)}
|
119
|
+
end
|
120
|
+
|
121
|
+
def empty?
|
122
|
+
@intervals.empty?
|
123
|
+
end
|
124
|
+
|
125
|
+
def + other
|
126
|
+
CharacterSet.new(@intervals + other.intervals)
|
127
|
+
end
|
128
|
+
|
129
|
+
def - other
|
130
|
+
intervals = @intervals.dup
|
131
|
+
for interval in other.intervals
|
132
|
+
next unless to_be_replaced = intervals.find {|x| x.include?(interval.first) || x.include?(interval.last)}
|
133
|
+
intervals.delete to_be_replaced
|
134
|
+
intervals << Interval.new(to_be_replaced.first, interval.first-1) if to_be_replaced.first <= interval.first-1
|
135
|
+
intervals << Interval.new(interval.last + 1, to_be_replaced.last) if interval.last + 1 <= to_be_replaced.last
|
136
|
+
end
|
137
|
+
CharacterSet.new(intervals)
|
138
|
+
end
|
139
|
+
|
140
|
+
def to_points destination
|
141
|
+
@intervals.inject [] do |points, interval|
|
142
|
+
points << Point.new(interval.first, true, destination)
|
143
|
+
points << Point.new(interval.last, false, destination)
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
def == other
|
148
|
+
return false unless other.is_a?(Aurum::Grammar::LexicalRules::CharacterSet)
|
149
|
+
other.intervals == @intervals
|
150
|
+
end
|
151
|
+
|
152
|
+
def inspect
|
153
|
+
@intervals.map{|interval| interval.inspect}.join(',')
|
154
|
+
end
|
155
|
+
|
156
|
+
private
|
157
|
+
def merge_intervals
|
158
|
+
@intervals.sort!
|
159
|
+
merging = nil
|
160
|
+
for interval in @intervals.dup
|
161
|
+
if merging and merging.merge!(interval)
|
162
|
+
@intervals.delete(interval)
|
163
|
+
else
|
164
|
+
merging = interval
|
165
|
+
end
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
class Interval < Struct.new(:first, :last)
|
170
|
+
include Comparable
|
171
|
+
def initialize first, last = first
|
172
|
+
super first, last
|
173
|
+
end
|
174
|
+
def include? char
|
175
|
+
char = char[0] if char.kind_of? String
|
176
|
+
first <= char && char <= last
|
177
|
+
end
|
178
|
+
|
179
|
+
def merge! other
|
180
|
+
return nil unless include?(other.first) || include?(other.last) || other.first - last == 1 || first - other.last == 1
|
181
|
+
self.first, self.last = [first, other.first].min, [last, other.last].max
|
182
|
+
self
|
183
|
+
end
|
184
|
+
|
185
|
+
def <=> other
|
186
|
+
return first <=> other.first unless first == other.first
|
187
|
+
(last - first) <=> (other.last - other.first)
|
188
|
+
end
|
189
|
+
|
190
|
+
def to_char_set
|
191
|
+
CharacterSet.new([self])
|
192
|
+
end
|
193
|
+
|
194
|
+
def inspect
|
195
|
+
first == last ? chr_of(first) : "#{chr_of(first)}-#{chr_of(last)}"
|
196
|
+
end
|
197
|
+
private
|
198
|
+
def chr_of integer
|
199
|
+
return "##{integer};" if integer < 33
|
200
|
+
integer.chr rescue integer
|
201
|
+
end
|
202
|
+
end
|
203
|
+
end
|
204
|
+
Epsilon = CharacterSet.new
|
205
|
+
|
206
|
+
# One end of a character interval: the character code, whether it is the
# start (true) or the end (false) of the interval, and a destination.
class Point < Struct.new(:char, :is_start, :destination)
  include Comparable

  # Orders points by character code; at the same code a start point sorts
  # before an end point. +destination+ takes no part in the comparison.
  def <=>(other)
    return char <=> other.char unless char == other.char
    return 0 if !is_start == !other.is_start
    is_start ? -1 : 1
  end
end
|
212
|
+
end
|
213
|
+
|
214
|
+
# Builds the character set for the inclusive range first..last (a single
# character when +last+ is omitted). Either bound may be a character code
# or a String, in which case the code of its first byte is used.
def Grammar.range first, last = first
  # String#[] with an integer index returns a byte value on Ruby 1.8 but a
  # one-character String on 1.9+ (where ?a literals are Strings as well);
  # unpack('C') yields the first byte value on every version.
  first = first.unpack('C').first if first.kind_of? String
  last = last.unpack('C').first if last.kind_of? String
  Aurum::Grammar::LexicalRules::CharacterSet.range(first, last)
end
|
219
|
+
|
220
|
+
# Builds the character set made of the bytes of +literal+ (converted with to_s).
def Grammar.enum(literal)
  text = literal.to_s
  Aurum::Grammar::LexicalRules::CharacterSet.enum(text)
end
|
223
|
+
end
|
224
|
+
end
|