pegex 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemspec +21 -0
- data/CHANGELOG.yaml +3 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +16 -0
- data/LICENSE +21 -0
- data/README.rdoc +78 -0
- data/Rakefile +64 -0
- data/lib/pegex/compiler.rb +91 -0
- data/lib/pegex/grammar/atoms.rb +96 -0
- data/lib/pegex/grammar.rb +21 -0
- data/lib/pegex/input.rb +41 -0
- data/lib/pegex/parser.rb +287 -0
- data/lib/pegex/pegex/ast.rb +148 -0
- data/lib/pegex/pegex/grammar.rb +414 -0
- data/lib/pegex/receiver.rb +7 -0
- data/lib/pegex/tree/wrap.rb +13 -0
- data/lib/pegex/tree.rb +17 -0
- data/lib/pegex.rb +18 -0
- data/test/compiler-checks.rb +271 -0
- data/test/compiler-checks.tml +271 -0
- data/test/compiler-equivalence.rb +79 -0
- data/test/compiler.rb +42 -0
- data/test/compiler.tml +111 -0
- data/test/error.rb +161 -0
- data/test/export_ok.rb +36 -0
- data/test/grammar-api.rb +21 -0
- data/test/lib/recursive_sort.rb +17 -0
- data/test/lib/test_pegex.rb +33 -0
- data/test/lib/testast.rb +15 -0
- data/test/lib/xxx.rb +13 -0
- data/test/tree-pegex.tml +35 -0
- data/test/tree.rb +47 -0
- data/test/tree.tml +449 -0
- metadata +99 -0
data/lib/pegex/parser.rb
ADDED
@@ -0,0 +1,287 @@
|
|
1
|
+
require 'pegex/input'
|
2
|
+
|
3
|
+
# Sentinel array: match_next and match_ref compare results against this exact
# object with `equal?` (identity), not by value.
$pegex_nil = []
|
4
|
+
# Placeholder result that match_ref returns when the matched rule has no action.
$dummy = [1]
|
5
|
+
|
6
|
+
class Pegex::Parser
|
7
|
+
attr_accessor :grammar
|
8
|
+
attr_accessor :receiver
|
9
|
+
attr_accessor :parent
|
10
|
+
attr_accessor :rule
|
11
|
+
attr_accessor :debug
|
12
|
+
|
13
|
+
# Create a parser positioned at the start of input, with grammar
# optimization pending, tracing off and errors raised rather than returned.
# When a block is given it receives the new parser for further setup.
def initialize
  @position = @farthest = 0
  @optimized = @debug = false
  @throw_on_error = true
  yield(self) if block_given?
end
|
22
|
+
|
23
|
+
# Parse +input+ using the configured grammar and receiver.
#
# input - a String (wrapped in a Pegex::Input) or an object responding to
#         open?, open, read and close.
# start - optional name of the rule to start from. Defaults to the grammar
#         tree's '+toprule' entry, then to 'TOP' when that rule exists.
#
# Returns the first element of the top-level match, passed through the
# receiver's `final` hook when the receiver defines one. Returns nil when the
# parse fails and throw_error does not raise.
# Raises RuntimeError when the grammar, the receiver or a start rule is missing.
def parse input, start=nil
  # Validate the configuration before touching the input, so a misconfigured
  # parser neither leaves the input open nor marks the grammar as optimized
  # without a receiver to bind actions to.
  fail "No 'grammar'. Can't parse" unless @grammar
  fail "No 'receiver'. Can't parse" unless @receiver

  @position = 0
  if input.kind_of? String
    text = input
    input = Pegex::Input.new { |i| i.string = text }
  end
  @input = input
  @input.open unless @input.open?

  begin
    @buffer = @input.read
    @length = @buffer.length

    @tree = @grammar.tree ||= @grammar.make_tree

    start_rule_ref = start ||
      @tree['+toprule'] ||
      (@tree['TOP'] ? 'TOP' : nil) or
      fail "No starting rule for Pegex::Parser::parse"

    optimize_grammar start_rule_ref

    # The receiver keeps a back-reference to this parser (circular reference).
    @receiver.parser = self

    if @receiver.respond_to? 'initial'
      # Expose the start rule as context, then run the hook before matching.
      @rule, @parent = start_rule_ref, {}
      @receiver.initial
    end

    match = match_ref start_rule_ref, {}
  ensure
    # Close the input even when matching raises (e.g. from an error rule).
    @input.close
  end

  # A parse succeeds only if the start rule matched and consumed everything.
  if !match || @position < @length
    throw_error "Parse document failed for some reason"
    return
  end

  if @receiver.respond_to? 'final'
    @rule, @parent = start_rule_ref, {}
    match = [ @receiver.final(match.first) ]
  end

  match.first
end
|
70
|
+
|
71
|
+
# Prepare every rule node in @tree for matching, once per parser.
#
# start - name of the start rule; a synthetic reference node to it is
#         optimized as well.
#
# Does nothing when the grammar has already been optimized.
def optimize_grammar start
  return if @optimized
  @tree.each_value do |node|
    # Only Hash entries are rule nodes. Meta entries are Strings, or Arrays
    # when a meta key is repeated, and must not be handed to optimize_node.
    next unless node.kind_of? Hash
    optimize_node node
  end
  optimize_node '.ref' => start
  @optimized = true
end
|
80
|
+
|
81
|
+
# Annotate one grammar node, in place, with what the matcher needs.
#
# Sets 'kind', 'rule' and 'method' from whichever of the '.ref', '.rgx',
# '.all', '.any', '.err' or '.code' keys is present, fills in integer '+min'
# and '+max' quantifier bounds, makes sure '+asr' exists, and recurses into
# group members and the '.sep' node.
#
# Raises RuntimeError when the node has none of the known kind keys.
def optimize_node node
  kind = %w[ref rgx all any err code].find { |k| node[".#{k}"] }
  fail "Can't optimize node of unknown kind: #{node.inspect}" unless kind

  node['rule'] = node[".#{kind}"]
  node['kind'] = kind
  node['method'] = method "match_#{kind}"

  # A missing bound defaults to 1 when neither bound is given; when only one
  # bound is given the other defaults to 0.
  min, max = node.values_at '+min', '+max'
  node['+min'] ||= max.nil? ? 1 : 0
  node['+max'] ||= min.nil? ? 1 : 0
  node['+asr'] ||= nil
  node['+min'] = node['+min'].to_i
  node['+max'] = node['+max'].to_i

  case kind
  when 'any', 'all'
    node['rule'].each { |elem| optimize_node elem }
  when 'ref'
    ref = node['rule']
    rule = @tree[ref]
    # Bind the receiver callback: a rule-specific got_<name> wins over the
    # generic gotrule.
    if @receiver.respond_to? "got_#{ref}"
      rule['action'] = @receiver.method "got_#{ref}"
    elsif @receiver.respond_to? 'gotrule'
      rule['action'] = @receiver.method 'gotrule'
    end
    node['method'] = method 'match_ref_trace' if @debug
  when 'rgx'
    # Group the pattern so the \A anchor applies to every alternative,
    # not just the first one of "a|b".
    node['rule'] = Regexp.new "\\A(?:#{node['.rgx']})"
  end

  sep = node['.sep']
  optimize_node sep if sep
end
|
117
|
+
|
118
|
+
# Match one optimized node, applying its quantifier and assertion flags.
#
# next_ - node hash carrying 'rule', 'method', '+min', '+max', '+asr' and
#         optionally '-skip' and '.sep' (delegated to match_next_with_sep).
#
# Returns false when the node does not match; otherwise the collected match
# array (wrapped in one more array unless '+max' is 1), or [] for '-skip'.
# On failure, and always for assertions, @position is put back.
def match_next next_
  return match_next_with_sep next_ if next_['.sep']

  rule, matcher, min, max, assertion =
    next_.values_at 'rule', 'method', '+min', '+max', '+asr'

  position, match, count = @position, [], 0

  loop do
    before = @position
    return_ = matcher.call(rule, next_) or break
    position = @position unless assertion
    count += 1
    match.concat return_ unless return_.equal? $pegex_nil
    # Stop after a single match when '+max' is 1. Also stop when a successful
    # match consumed nothing: repeating it would never terminate.
    break if max == 1 || @position == before
  end
  if max != 1
    match = [match]
    @farthest = position if (@position = position) > @farthest
  end
  # A '+max' of 0 means unbounded; a negative assertion inverts the outcome.
  result = (count >= min && (max == 0 || count <= max)) ^ (assertion == -1)
  if !result || assertion
    @farthest = position if (@position = position) > @farthest
  end

  return false unless result
  next_['-skip'] ? [] : match
end
|
143
|
+
|
144
|
+
# Match a node whose repetitions are separated by its '.sep' node.
#
# next_ - node hash carrying 'rule', 'method', '+min', '+max', '.sep' and
#         optionally '-skip'.
#
# Returns false when the element count is outside the '+min'/'+max' bounds
# ('+max' of 0 means unbounded); otherwise the collected match array (wrapped
# in one more array unless '+max' is 1), or [] for '-skip'.
def match_next_with_sep next_
  rule, matcher, min, max, sep =
    next_.values_at 'rule', 'method', '+min', '+max', '.sep'

  position, match, count, scount = @position, [], 0, 0

  while return_ = matcher.call(rule, next_)
    position = @position
    count += 1
    match.concat return_
    return_ = match_next(sep) or break
    match.concat return_
    scount += 1
  end
  match = [match] if max != 1

  # Use && / || so the upper bound takes part in the result; with `and`/`or`
  # the assignment binds first and the bound is silently ignored.
  result = count >= min && (max == 0 || count <= max)

  # A trailing separator without a following element is only allowed when
  # the separator is marked '+eok'; otherwise back up to the last element.
  if count == scount && !sep['+eok']
    @farthest = position if (@position = position) > @farthest
  end

  return false unless result
  next_['-skip'] ? [] : match
end
|
169
|
+
|
170
|
+
# Match the named rule and run its receiver action, if one is bound.
#
# ref    - name of the rule to look up in @tree.
# parent - the node that referenced the rule; stored in @parent while the
#          action runs.
#
# Returns false when the rule does not match, $dummy when the rule has no
# action, the $pegex_nil sentinel when the action returned exactly that
# object, and otherwise the action result wrapped in an array.
# Raises RuntimeError when no rule with that name exists.
def match_ref ref, parent
  rule = @tree[ref] or fail "No rule defined for '#{ref}'"
  match = match_next(rule) or return false
  action = rule['action'] or return $dummy
  # Expose the current rule and its parent node while the action runs.
  @rule, @parent = ref, parent
  result = action.call(match.first)
  result.equal?($pegex_nil) ? result : [result]
end
|
178
|
+
|
179
|
+
# Try +regexp+ against the unread part of @buffer.
#
# Returns false when it does not match. On a match, advances @position past
# the matched text (raising @farthest if needed) and returns the capture
# groups; two or more captures are wrapped in one extra array.
def match_rgx regexp, parent=nil
  found = @buffer[@position .. -1].match(regexp)
  return false unless found

  captures = found[1..-1]
  captures = [ captures ] if found.length > 2

  reached = @position + found[0].length
  @position = reached
  @farthest = reached if reached > @farthest
  captures
end
|
189
|
+
|
190
|
+
# Match every node of +list+ in order.
#
# Returns false as soon as one node fails, after putting @position back to
# where the sequence started. Otherwise returns the concatenated matches of
# the nodes that are neither assertions ('+asr') nor skipped ('-skip'),
# wrapped in one extra array when more than one such node contributed.
def match_all list, parent=nil
  start = @position
  collected = []
  kept = 0

  list.each do |elem|
    found = match_next(elem)
    unless found
      @position = start
      @farthest = start if start > @farthest
      return false
    end
    next if elem['+asr'] || elem['-skip']
    collected.concat found
    kept += 1
  end

  kept > 1 ? [collected] : collected
end
|
206
|
+
|
207
|
+
# Try the nodes of +list+ in order and return the match of the first one
# that succeeds, or false when none does.
def match_any list, parent=nil
  list.each do |alternative|
    found = match_next(alternative)
    return found if found
  end
  false
end
|
215
|
+
|
216
|
+
# Matcher for error nodes: reports +error+ through throw_error and returns
# whatever that returns.
def match_err error, parent=nil
  throw_error(error)
end
|
219
|
+
|
220
|
+
# Debug variant of match_ref: emits try_/got_/not_ trace lines around the
# real match. Rules flagged with '+asr' are matched without tracing.
# Returns exactly what match_ref returns.
def match_ref_trace ref, parent
  silent = @tree[ref]['+asr']
  trace "try_#{ref}" unless silent
  result = match_ref(ref, parent)
  unless silent
    trace(result ? "got_#{ref}" : "not_#{ref}")
  end
  result
end
|
232
|
+
|
233
|
+
# Write one trace line for +action+ to $stderr.
#
# "try_*" actions are printed at the current indent, followed by a snippet of
# the unread input, and then deepen the indent; every other action first
# reduces the indent and is printed without a snippet.
def trace action
  indent = !!action.match(/^try_/)
  @indent ||= 0
  @indent -= 1 unless indent
  $stderr.print ' ' * @indent
  @indent += 1 if indent
  snippet = @buffer[@position..-1]
  # Keep exactly 30 characters before the ellipsis ([0..30] kept 31).
  snippet = snippet[0, 30] + '...' if snippet.length > 30
  snippet = snippet.gsub(/\n/, "\\n")
  $stderr.printf "%-30s", action
  $stderr.print indent ? " >#{snippet}<\n" : "\n"
end
|
245
|
+
|
246
|
+
# Error raised by throw_error when a parse fails.
class PegexParseError < RuntimeError
end

# Record a formatted parse error in @error and, unless @throw_on_error is
# off, raise it as a PegexParseError.
#
# This is the only definition: an earlier `throw_error` that simply raised
# +msg+ was dead code, because it was redefined further down the class.
#
# Returns nil when @throw_on_error is false.
def throw_error msg
  @error = format_error msg
  return nil unless @throw_on_error
  raise PegexParseError, @error
end
|
259
|
+
|
260
|
+
# Build the multi-line text of a parse error.
#
# msg - the error message to embed in the report.
#
# Returns a String giving the message, the line and column computed from
# @farthest, a snippet of the input around that position with a caret line
# under it, and both @farthest and the current @position.
def format_error msg
|
261
|
+
buffer = @buffer
|
262
|
+
# Report at the farthest position reached, not at the current position.
position = @farthest
|
263
|
+
real_pos = @position
|
264
|
+
|
265
|
+
# Line number: count of newlines before the position, plus one.
line = buffer[0, position].scan(/\n/).size + 1
|
266
|
+
# Column: distance from the last newline found searching backwards from the position.
column = position - (buffer.rindex("\n", position) || -1)
|
267
|
+
|
268
|
+
# Up to 50 characters of input before the position.
pretext = @buffer[
|
269
|
+
position < 50 ? 0 : position - 50,
|
270
|
+
position < 50 ? position : 50
|
271
|
+
]
|
272
|
+
# Up to 50 characters of input starting at the position.
context = @buffer[position, 50]
|
273
|
+
# Keep only the part of pretext after its last newline.
pretext.gsub! /.*\n/m, ''
|
274
|
+
# Show newlines in the context as a literal backslash-n.
context.gsub! /\n/, "\\n"
|
275
|
+
|
276
|
+
return <<"..."
|
277
|
+
Error parsing Pegex document:
|
278
|
+
msg: #{msg}
|
279
|
+
line: #{line}
|
280
|
+
column: #{column}
|
281
|
+
context: #{pretext}#{context}
|
282
|
+
#{' ' * (pretext.length + 10)}^
|
283
|
+
position: #{position} (#{real_pos} pre-lookahead)
|
284
|
+
...
|
285
|
+
end
|
286
|
+
|
287
|
+
end
|
@@ -0,0 +1,148 @@
|
|
1
|
+
require 'pegex/tree'
|
2
|
+
require 'pegex/grammar/atoms'
|
3
|
+
|
4
|
+
class Pegex::Pegex::AST < Pegex::Tree
|
5
|
+
# Set up the state used while building a grammar tree: the table of
# predefined atoms, the extra rules collected for referenced atoms, and the
# mapping from rule prefix characters to node flags.
def initialize
  assertions = { '!' => ['+asr', -1], '=' => ['+asr', 1] }
  flags = { '.' => '-skip', '-' => '-pass', '+' => '-wrap' }

  @atoms = Pegex::Grammar::Atoms.new.atoms
  @extra_rules = {}
  @prefixes = assertions.merge(flags)
end
|
16
|
+
|
17
|
+
# Assemble the final grammar hash.
#
# got - a two-element array: the meta section hash and the list of
#       single-entry { name => definition } rule hashes.
#
# Returns a hash starting with '+toprule', followed by the extra atom rules,
# the meta entries and finally every rule; later sources override earlier ones.
def got_grammar got
  meta_section, rule_section = got

  grammar = { '+toprule' => @toprule }
  grammar.update(@extra_rules)
  grammar.update(meta_section)

  rule_section.each do |rule|
    name, definition = rule.first
    grammar[name] = definition
  end
  grammar
end
|
27
|
+
|
28
|
+
# Collect the meta directives into a hash keyed by "+<name>".
#
# got - list of [name, value] pairs.
#
# A name seen once maps to its value; a repeated name maps to an array of
# its values in order of appearance.
def got_meta_section got
  got.each_with_object({}) do |(name, val), meta|
    key = "+#{name}"
    existing = meta[key]
    if existing.nil?
      meta[key] = val
    elsif existing.kind_of? Array
      existing << val
    else
      meta[key] = [ existing, val ]
    end
  end
end
|
46
|
+
|
47
|
+
# Turn a [name, value] rule definition into a single-entry hash and keep
# track of the top rule: a rule named 'TOP' always becomes the top rule,
# otherwise the first rule seen is used.
def got_rule_definition got
  name, value = got
  @toprule = name if name == 'TOP' || !@toprule
  { name => value }
end
|
53
|
+
|
54
|
+
# Apply an optional prefix and quantifier suffix to a bracketed group node.
#
# got - [prefix, group, suffix]; prefix and suffix are empty strings when
#       absent.
#
# Returns the group node, modified in place.
def got_bracketed_group got
  prefix, group, suffix = got
  unless prefix.empty?
    # A prefix maps either to a plain flag name (stored with value 1) or to a
    # [key, value] pair such as ['+asr', -1]; the pair must be unpacked
    # rather than used as the hash key itself.
    key, val = @prefixes[prefix]
    group[key] = val || 1
  end
  set_quantity group, suffix unless suffix.empty?
  group
end
|
64
|
+
|
65
|
+
# Build the node for a sequence group: a single member is returned as is,
# several members are wrapped in a '.all' node. Fails on an empty group.
def got_all_group got
  members = get_group(got)
  fail if members.empty?
  members.length == 1 ? members.first : { '.all' => members }
end
|
71
|
+
|
72
|
+
# Build the node for an alternation group: a single member is returned as
# is, several members are wrapped in a '.any' node. Fails on an empty group.
def got_any_group got
  members = get_group(got)
  fail if members.empty?
  members.length == 1 ? members.first : { '.any' => members }
end
|
78
|
+
|
79
|
+
# Flatten the nested arrays of a parsed group into one flat list of members.
def get_group group
  group.flatten
end
|
82
|
+
|
83
|
+
# Attach an optional separator to a rule node.
#
# got - [rule, sep_op, sep_rule]. When sep_rule is present it is stored
#       under the rule's '.sep' key; the '%%' operator additionally marks
#       the separator with '+eok'.
#
# Returns the rule node.
def got_rule_part got
  rule, sep_op, sep_rule = got
  return rule unless sep_rule

  sep_rule['+eok'] = true if sep_op == '%%'
  rule['.sep'] = sep_rule
  rule
end
|
91
|
+
|
92
|
+
# Build a '.ref' node for a rule reference.
#
# got - [prefix, ref1, ref2, suffix]; the rule name arrives in either ref1
#       or ref2, and prefix/suffix are empty strings when absent.
#
# When the name is a predefined atom, its regex is registered in
# @extra_rules. Returns the new node with quantifier and prefix flags applied.
def got_rule_reference got
  prefix, ref1, ref2, suffix = got
  # An empty string is truthy in Ruby, so `ref1 || ref2` would keep an empty
  # first capture; treat nil and "" alike and fall back to ref2.
  ref = ref1.to_s.empty? ? ref2 : ref1
  node = { '.ref' => ref }
  if (regex = @atoms[ref])
    @extra_rules[ref] = { '.rgx' => regex }
  end
  set_quantity node, suffix unless suffix.empty?
  unless prefix.empty?
    # A prefix maps either to a plain flag name (stored with value 1) or to
    # a [key, value] pair such as ['+asr', -1].
    key, val = @prefixes[prefix]
    node[key] = val || 1
  end
  node
end
|
112
|
+
|
113
|
+
# Normalize the body of a grammar regex into a '.rgx' node.
#
# Strips "# ..." comments that run to the end of a line, removes all
# whitespace, and expands the group shorthands "(:", "(=" and "(!" into
# "(?:", "(?=" and "(?!".
#
# The argument is left unmodified.
def got_regular_expression got
  regex = got.gsub(/\s*#.*\n/, '')
  regex = regex.gsub(/\s+/, '')
  # Use a \1 backreference in the replacement: interpolating $1 into the
  # replacement string would be evaluated before the substitution runs.
  regex = regex.gsub(/\((:|=|!)/, '(?\1')
  { '.rgx' => regex }
end
|
119
|
+
|
120
|
+
# Build the '.rgx' node for a whitespace token: a "<wsN>" placeholder where
# N is the length of the token text.
def got_whitespace_token got
  { '.rgx' => "<ws#{got.length}>" }
end
|
123
|
+
|
124
|
+
# Wrap an error message in an '.err' node.
def got_error_message got
  { '.err' => got }
end
|
127
|
+
|
128
|
+
# Store the bounds expressed by +quantifier+ on +object+.
#
#   *    '+min' = 0            N+   '+min' = "N"
#   +    '+min' = 1            N-M  '+min' = "N", '+max' = "M"
#   ?    '+max' = 1            N    '+min' = '+max' = "N"
#
# Numeric bounds are stored as the matched strings.
# Raises RuntimeError for any other quantifier.
def set_quantity object, quantifier
  case quantifier
  when '*'
    object['+min'] = 0
  when '+'
    object['+min'] = 1
  when '?'
    object['+max'] = 1
  # \A and \z anchor the whole string; ^ and $ would also accept a
  # quantifier followed by a newline.
  when /\A(\d+)\+\z/
    object['+min'] = $1
  when /\A(\d+)-(\d+)\z/
    object['+min'], object['+max'] = $1, $2
  when /\A(\d+)\z/
    object['+min'] = object['+max'] = $1
  else
    fail "Invalid quantifier: '#{quantifier}'"
  end
end
|
148
|
+
end
|