rpeg 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +7 -0
- data/README.md +155 -0
- data/Rakefile +9 -0
- data/lib/rpeg/captures.rb +702 -0
- data/lib/rpeg/parsing_machine.rb +457 -0
- data/lib/rpeg/re.rb +233 -0
- data/lib/rpeg/rpeg.rb +1622 -0
- data/lib/rpeg.rb +5 -0
- metadata +81 -0
data/lib/rpeg/rpeg.rb
ADDED
@@ -0,0 +1,1622 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# See the README file for a little context. And also:
|
4
|
+
#
|
5
|
+
# http://www.inf.puc-rio.br/~roberto/lpeg/#func
|
6
|
+
|
7
|
+
# require 'byebug'
|
8
|
+
require 'set'
|
9
|
+
require 'must_be'
|
10
|
+
|
11
|
+
# TODO: disable this. For now I get a whole lot of warning messages
|
12
|
+
# MustBe.disable
|
13
|
+
|
14
|
+
require_relative 'captures'
|
15
|
+
require_relative 'parsing_machine'
|
16
|
+
|
17
|
+
# This is intended to play the same role as LPEG's lpeg module.
|
18
|
+
#
|
19
|
+
# Top-level differences from LPEG:
|
20
|
+
#
|
21
|
+
# - AND patterns in LPEG are written as #patt (&patt in the first version) but +patt here
|
22
|
+
# - unary & apparently can't be overloaded in Ruby
|
23
|
+
# - I tried using the "&" operator by overriding #to_proc but the Ruby parser rejects the &patt expression.
|
24
|
+
# - +patt doesn't read well, even though -patt does. I think this is because binary plus is so much more common when building
|
25
|
+
# patterns than binary minus is.
|
26
|
+
# - There doesn't seem to be another workable option. According to https://stackoverflow.com/a/21060235/1299011 the unary
|
27
|
+
# operators are !, ~, +, and -. We use - already and ! needs to be avoided because of idiomatic Ruby checks like !foo for
|
28
|
+
# existence. Operator ~ works, but that character is easy to mistake for a - when reading, which is bad. Ruby uses & as a
|
29
|
+
# unary (for to_proc) but the Ruby parser doesn't allow its use in general. So I think we're stuck with +.
|
30
|
+
#
|
31
|
+
# - repeating patterns still use exponentiation, but it now looks like patt**n rather than patt^n because of Ruby's syntax
|
32
|
+
#
|
33
|
+
# - grammars are represented by hashes or arrays. LPEG uses Lua tables (which are mashups of hashtables and arrays)
|
34
|
+
#
|
35
|
+
# If an array is given then the nonterminals aren't named and all open calls must use numeric indices. The first element of the
|
36
|
+
# array is either
|
37
|
+
# - a non-negative integer 0, 1, 2, ... and specifies the (rule of the) initial nonterminal among the remaining elements with
|
38
|
+
# indices reckoned _without_ that initial integer
|
39
|
+
# - something else, which is interpreted as the pattern for the initial nonterminal
|
40
|
+
#
|
41
|
+
# Otherwise the grammar is defined with a Hash. The keys are the nonterminal symbols and the values are the rule patterns.
|
42
|
+
# - the keys must be symbols or strings (which are converted to symbols). No rule can use :initial or "initial" as
|
43
|
+
# nonterminal.
|
44
|
+
# - the open calls can refer either to the nonterminals (as strings or symbols) or to rule indices as they appear in the hash,
|
45
|
+
# ignoring the :initial key (if present)
|
46
|
+
# - :initial/"initial" can appear as a key in the hash and its value specifies the initial nonterminal.
|
47
|
+
# - if it is a non-zero integer it gives the index of the initial nonterminal's rule, reckoned without the presence of the :initial
|
48
|
+
# key itself.
|
49
|
+
# - if it is a symbol or a string it specifies the initial nonterminal directly
|
50
|
+
#
|
51
|
+
# - "Table" captures return an instance of TableCapture, which implements a little bit of Lua's table functionality
|
52
|
+
# - other formats haven't worked out well
|
53
|
+
#
|
54
|
+
# Function captures
|
55
|
+
#
|
56
|
+
# Various kinds of captures involve calling a function (proc) provided by client code. For example, the construction (patt / fn)
|
57
|
+
# takes the captures made by patt and passes them as arguments to fn. Then the values returned by fn become the captures of the
|
58
|
+
# expression. Lua is better than Ruby at distinguishing between a function that returns multiple values and one that returns a
|
59
|
+
# single value that is an array. In RPEG, returns from function in contexts like this are treated as follows:
|
60
|
+
#
|
61
|
+
# - [1, 2, 3]: multiple captures, 1, 2, 3.
|
62
|
+
# - this is the natural interpretation as it's the standard way that a Ruby function returns multiple values
|
63
|
+
# - [[1, 2, 3]]: a single capture that is an array
|
64
|
+
# - nil: no captures
|
65
|
+
# - even if the function says something like "return nil", the capture code has no way to distinguish between that and a
|
66
|
+
# function that returns nothing
|
67
|
+
# - [nil]: a single capture with value nil
|
68
|
+
# - the weirdest case, but I don't see an alternative
|
69
|
+
#
|
70
|
+
# TODO:
|
71
|
+
# - program generation optimizations
|
72
|
+
# - other pattern-based optimizations: need to scan through the LPEG code again
|
73
|
+
# - I think I've done them all now
|
74
|
+
# - profiling
|
75
|
+
# - LPEG's locale support
|
76
|
+
# - what would this look like in Ruby?
|
77
|
+
module RPEG
|
78
|
+
extend self
|
79
|
+
|
80
|
+
# Match any character in string (regarded as a set of characters), range, or Set
|
81
|
+
#
|
82
|
+
# If the set is empty we have NFALSE, which always fails
|
83
|
+
# If the set has a single element we have CHAR pattern, which is a little faster in the VM
|
84
|
+
# Otherwise we have a CHARSET pattern
|
85
|
+
# Build a pattern that matches one character drawn from +charset+.
#
# @param charset [Set, String] the allowed characters; a String is converted to the Set of its characters
# @return [Pattern] P(false) for an empty set, P(element) for a one-element set, P(1) when the set equals
#   Pattern::FULL_CHAR_SET, otherwise a CHARSET node holding the set
# @raise [RuntimeError] if +charset+ is neither a Set nor a String
def S(charset)
  case charset
  when Set
    size = charset.size
    return P(false) if size.zero?
    return P(charset.first) if size == 1
    return P(1) if charset == Pattern::FULL_CHAR_SET

    Pattern.new(Pattern::CHARSET, data: charset)
  when String
    S(Set.new(charset.chars))
  else
    # Interpolate the actual argument. The message previously referenced an undefined local, so bad
    # input surfaced as a NameError rather than this error.
    raise "Cannot create a character set pattern from #{charset}"
  end
end
|
100
|
+
|
101
|
+
# Take argument and turn it into a pattern
|
102
|
+
# Coerce +arg+ into a Pattern.
#
# - Pattern: returned unchanged
# - String: a sequence of CHAR nodes, one per character
# - Integer n: n ANY nodes in sequence; a negative n yields the negation of P(-n)
# - true / false: the memoized NTRUE / NFALSE node
# - Hash or Array: a GRAMMAR node carrying the argument as data
# - Proc: a RUNTIME node over the empty pattern
#
# @raise [RuntimeError] for any other kind of argument
def P(arg)
  case arg
  when Pattern then arg
  when String
    # Fold from the right so that each CHAR node is prepended to the tail built so far.
    arg.chars.reverse_each.reduce(P(true)) do |tail, ch|
      Pattern.new(Pattern::CHAR, data: ch) * tail
    end
  when Integer
    if arg.negative?
      -P(-arg)
    else
      # One ANY node per character: as in LPEG, ANY takes no argument and matches a single char.
      arg.times.reduce(P(true)) { |tail, _| Pattern.new(Pattern::ANY) * tail }
    end
  when FalseClass then @false_tree ||= Pattern.new(Pattern::NFALSE)
  when TrueClass then @true_tree ||= Pattern.new(Pattern::NTRUE)
  when Hash, Array then Pattern.new(Pattern::GRAMMAR, data: arg)
  when Proc then Pattern.new(Pattern::RUNTIME, P(true), data: arg)
  else raise "RPEG.P does not support argument #{arg}"
  end
end
|
138
|
+
|
139
|
+
# Given a 2-char string xy, the ASCII range x..y. Each argument gives a range and we match on their union.
|
140
|
+
#
|
141
|
+
# Always represent with a Set.
|
142
|
+
# Build a pattern matching any character in the union of the given ranges.
#
# @param ranges [Array<String>] two-character strings "xy", each standing for the range x..y
# @return [Pattern] P(false) when no ranges are given, otherwise S applied to the union of the ranges
# @raise [RuntimeError] if an argument is not a String of size 2
def R(*ranges)
  return P(false) if ranges.empty?

  members = Set.new
  ranges.each do |spec|
    well_formed = spec.is_a?(String) && spec.size == 2
    raise "Bad data #{spec} for Pattern#R" unless well_formed

    members.merge(spec[0]..spec[1])
  end
  S(members)
end
|
154
|
+
|
155
|
+
# An "open call" reference to a rule in a grammar. As we don't have the grammar yet - it is available in full only when we are
|
156
|
+
# ready to compile - we remember it this way.
|
157
|
+
#
|
158
|
+
# ref should be either
|
159
|
+
# - a non-negative integer n, referring to the n-th rule in the grammar (0-based) or
|
160
|
+
# - a value that will be the key in the final grammar - a Hash or Array - of the rule being referenced
|
161
|
+
# - strings are turned into symbols
|
162
|
+
# Create an OPEN_CALL node that refers to a grammar rule by +ref+.
#
# @param ref [Object] the rule reference; a String is converted to a Symbol, anything else is stored as given
# @return [Pattern]
def V(ref)
  key = ref.is_a?(String) ? ref.to_sym : ref
  Pattern.new(Pattern::OPEN_CALL, data: key)
end
|
166
|
+
|
167
|
+
# LPEG: Creates a simple capture, which captures the substring of the subject that matches patt. The captured value is a
|
168
|
+
# string. If patt has other captures, their values are returned after this one.
|
169
|
+
#
|
170
|
+
# Note: it appears that when a simple capture is over another simple capture - like C(C(patt)) - we squeeze out the
|
171
|
+
# duplication. See this test at l.216 of test.lua:
|
172
|
+
#
|
173
|
+
# assert(#m.match(m.C(m.C(i)), string.rep('a', i)) == i)
|
174
|
+
#
|
175
|
+
# (In lua the # operator gives the size of a string or table)
|
176
|
+
# Wrap +pattern+ in a simple capture.
#
# A pattern that is already a simple capture is returned as is, so C(C(p)) does not capture twice.
#
# @param pattern [Object] anything accepted by P
# @return [Pattern]
def C(pattern)
  inner = P(pattern)
  already_simple = inner.type == Pattern::CAPTURE && inner.capture == Capture::SIMPLE
  already_simple ? inner : Pattern.new(Pattern::CAPTURE, inner, capture: Capture::SIMPLE)
end
|
182
|
+
|
183
|
+
# Capture the n-th extra argument provided to #match. The first extra argument is n=1, etc.
|
184
|
+
#
|
185
|
+
# We accept a missing argument to match some LPEG test cases but an error is raised
|
186
|
+
# Create an argument capture for the num-th extra argument passed to #match (1-based).
#
# @param num [Integer] a positive index; the default of nil exists only so the call can be made and rejected
# @return [Pattern]
# @raise [RuntimeError] unless +num+ is a positive Integer
def Carg(num = nil)
  valid = num.is_a?(Integer) && num.positive?
  raise "Invalid argument for Carg: #{num || 'nil'}" unless valid

  Pattern.new(Pattern::CAPTURE, P(true), data: num, capture: Capture::ARGUMENT)
end
|
191
|
+
|
192
|
+
# From LPEG docs:
|
193
|
+
#
|
194
|
+
# Creates a back capture. This pattern matches the empty string and produces the values produced by the most recent group
|
195
|
+
# capture named name (where name can be any Lua value).
|
196
|
+
#
|
197
|
+
# Most recent means the last complete outermost group capture with the given name. A Complete capture means that the entire
|
198
|
+
# pattern corresponding to the capture has matched. An Outermost capture means that the capture is not inside another complete
|
199
|
+
# capture.
|
200
|
+
#
|
201
|
+
# nil is not allowed as a name
|
202
|
+
# Create a back capture referring to the group capture called +name+.
#
# @param name [Object] the group name; must be truthy
# @return [Pattern]
# @raise [RuntimeError] if +name+ is nil or false
def Cb(name)
  if name
    Pattern.new(Pattern::CAPTURE, P(true), data: name, capture: Capture::BACKREF)
  else
    raise "Back capture must specify name of group"
  end
end
|
207
|
+
|
208
|
+
# LPEG: Creates a constant capture. This pattern matches the empty string and produces all given values as its captured values.
|
209
|
+
#
|
210
|
+
# No value at all - Cc() - adds nothing to the result, which is different from a value of nil.
|
211
|
+
#
|
212
|
+
# We capture several values with individual captures.
|
213
|
+
# Create a constant capture producing +values+.
#
# @param values [Array<Object>] the constants to capture
# @return [Pattern] P(true) when no values are given; a single CONST capture for one value; otherwise an
#   anonymous group (Cg) over a sequence of single-value CONST captures
def Cc(*values)
  return P(true) if values.empty?

  constants = values.map do |value|
    Pattern.new(Pattern::CAPTURE, P(true), data: value, capture: Capture::CONST)
  end
  return constants.first if constants.size == 1

  # Chain the individual captures left to right and group them, as LPEG does.
  Cg(constants.reduce(:*))
end
|
225
|
+
|
226
|
+
# From LPEG docs:
|
227
|
+
#
|
228
|
+
# Creates a fold capture. If patt produces a list of captures C1 C2 ... Cn, this capture will produce the value
|
229
|
+
# func(...func(func(C1, C2), C3)..., Cn), that is, it will fold (or accumulate, or reduce) the captures from patt using
|
230
|
+
# function func.
|
231
|
+
#
|
232
|
+
# The second argument should be a lambda taking two arguments and returning one in the standard way.
|
233
|
+
#
|
234
|
+
# The first nested capture is examined. If there isn't one or it captures no values there is an error. If this capture contains
|
235
|
+
# more than one value all but the first are discarded. This is the initial value for the fold. Then we extract the remaining
|
236
|
+
# nested captures and use their values C2, ..., Cn in the fold as described.
|
237
|
+
#
|
238
|
+
# If Ci (i >= 2) produces k values then the lambda will receive k+1 arguments: the accumulator and the k captured values.
|
239
|
+
# an array.
|
240
|
+
# Create a fold capture over +pattern+ that accumulates with +lambda+.
#
# @param pattern [Object] anything accepted by P
# @param lambda [Object] the accumulation callable; must be truthy
# @return [Pattern]
# @raise [RuntimeError] if +lambda+ is nil or false
def Cf(pattern, lambda)
  folder = lambda || raise("Fold capture must have an accumulation lambda")
  Pattern.new(Pattern::CAPTURE, P(pattern), data: folder, capture: Capture::FOLD)
end
|
245
|
+
|
246
|
+
# From LPEG docs:
|
247
|
+
#
|
248
|
+
# Creates a group capture. It groups all values returned by patt into a single capture. The group may be anonymous (if no name
|
249
|
+
# is given) or named with the given name (which can be any non-nil Lua value).
|
250
|
+
#
|
251
|
+
# An anonymous group serves to join values from several captures into a single capture. A named group has a different
|
252
|
+
# behavior. In most situations, a named group returns no values at all. Its values are only relevant for a following back
|
253
|
+
# capture or when used inside a table capture.
|
254
|
+
#
|
255
|
+
# The name doesn't have to be string. It can be anything other than nil, because nil means it's an anonymous group.
|
256
|
+
# Create a group capture over +pattern+.
#
# @param pattern [Object] anything accepted by P
# @param name [Object, nil] the group name; nil makes the group anonymous
# @return [Pattern]
def Cg(pattern, name = nil)
  grouped = P(pattern)
  Pattern.new(Pattern::CAPTURE, grouped, data: name, capture: Capture::GROUP)
end
|
259
|
+
|
260
|
+
# LPEG: Creates a position capture. It matches the empty string and captures the position in the subject where the match
|
261
|
+
# occurs. The captured value is a number.
|
262
|
+
# Create a position capture over the empty pattern.
#
# @return [Pattern]
def Cp
  empty_match = P(true)
  Pattern.new(Pattern::CAPTURE, empty_match, capture: Capture::POSITION)
end
|
265
|
+
|
266
|
+
# From LPEG:
|
267
|
+
#
|
268
|
+
# Creates a substitution capture, which captures the substring of the subject that matches patt, with substitutions. For any
|
269
|
+
# capture inside patt with a value, the substring that matched the capture is replaced by the capture value (which should be a
|
270
|
+
# string). The final captured value is the string resulting from all replacements.
|
271
|
+
# Create a substitution capture over +patt+.
#
# @param patt [Object] anything accepted by P
# @return [Pattern]
def Cs(patt)
  substituted = P(patt)
  Pattern.new(Pattern::CAPTURE, substituted, capture: Capture::SUBST)
end
|
274
|
+
|
275
|
+
# From LPEG:
|
276
|
+
#
|
277
|
+
# Creates a table capture. This capture returns a table with all values from all anonymous captures made by patt inside this
|
278
|
+
# table in successive integer keys, starting at 1. Moreover, for each named capture group created by patt, the first value of
|
279
|
+
# the group is put into the table with the group name as its key. The captured value is only the table.
|
280
|
+
#
|
281
|
+
# For us the capture takes the form of a custom class, TableCapture. It is intended to mimic a little bit of the functionality
|
282
|
+
# of Lua's tables, which are a combination Array/Hashtable
|
283
|
+
# - indexing is by 0, 1, 2, ... for the anonymous captures
|
284
|
+
# - indexing by group names otherwise
|
285
|
+
# - #unpack gives an array of the anonymous captures.
|
286
|
+
#
|
287
|
+
# See the class definition (captures.rb) for more details.
|
288
|
+
#
|
289
|
+
# Other things tried:
|
290
|
+
# - returning a hash when there are named captures and an array when there are not
|
291
|
+
# - in the hash, anonymous captures are at keys 0, 1, 2, ...
|
292
|
+
# - this turned out to be somewhat frustrating in unit tests.
|
293
|
+
# - returning a hash with group names as keys and a special key of :anon for the array of anonymous captures
|
294
|
+
# - I thought this would work better, but actually turned out to be much worse to work with
|
295
|
+
# Create a table capture over +patt+.
#
# @param patt [Object] anything accepted by P
# @return [Pattern]
def Ct(patt)
  tabled = P(patt)
  Pattern.new(Pattern::CAPTURE, tabled, capture: Capture::TABLE)
end
|
298
|
+
|
299
|
+
# From LPEG:
|
300
|
+
#
|
301
|
+
# Creates a match-time capture. Unlike all other captures, this one is evaluated immediately when a match occurs (even if it
|
302
|
+
# is part of a larger pattern that fails later). It forces the immediate evaluation of all its nested captures and then calls
|
303
|
+
# function.
|
304
|
+
#
|
305
|
+
# The given function gets as arguments the entire subject, the current position (after the match of patt), plus any capture
|
306
|
+
# values produced by patt.
|
307
|
+
#
|
308
|
+
# The first value returned by function defines how the match happens. If the call returns a number, the match succeeds and the
|
309
|
+
# returned number becomes the new current position. (Assuming a subject s and current position i, the returned number must be
|
310
|
+
# in the range [i, len(s) + 1].) If the call returns true, the match succeeds without consuming any input. (So, to return true
|
311
|
+
# is equivalent to return i.) If the call returns false, nil, or no value, the match fails.
|
312
|
+
#
|
313
|
+
# Any extra values returned by the function become the values produced by the capture.
|
314
|
+
# Create a match-time capture: a RUNTIME node over +patt+ carrying +function+ as its data.
#
# @param patt [Object] anything accepted by P
# @param function [Object] the callable stored on the node
# @return [Pattern]
def Cmt(patt, function)
  # LPEG gives this its own RUNTIME node type rather than reusing CAPTURE, because functions such as
  # hascaptures and fixedlen need special behavior for it. LPEG also says "runtime" internally where
  # its documentation says "match-time"; the same naming is followed here.
  body = P(patt)
  Pattern.new(Pattern::RUNTIME, body, data: function)
end
|
321
|
+
|
322
|
+
# Returns a pattern that matches only if the input string at the current position is preceded by patt. Pattern patt must match
|
323
|
+
# only strings with some fixed length, and it cannot contain captures.
|
324
|
+
# Create a look-behind pattern: a BEHIND node over +patt+ whose data is the pattern's fixed length.
#
# @param patt [Object] anything accepted by P
# @return [Pattern]
# @raise [RuntimeError] if the pattern has no fixed length, or if it contains captures
def B(patt)
  target = P(patt)
  width = target.fixed_len
  raise "Behind match: pattern may not have fixed length" unless width
  raise "Behind match: pattern has captures" if target.has_captures?

  # Unlike LPEG (MAXBEHIND = 255) no upper bound is placed on the width.
  Pattern.new(Pattern::BEHIND, target, data: width)
end
|
334
|
+
|
335
|
+
# See the instance method #match for the arguments
|
336
|
+
# Convert +thing+ to a pattern with P and delegate to that pattern's #match.
#
# @param thing [Object] anything accepted by P
# @param string [String] the subject
# @param init [Integer] passed through to Pattern#match (default 0)
# @param extra_args [Array] passed through to Pattern#match
# @return whatever Pattern#match returns
def match(thing, string, init = 0, *extra_args)
  compiled = P(thing)
  compiled.match(string, init, *extra_args)
end
|
339
|
+
|
340
|
+
# The class representing "patterns" and containing the logic to turn them into programs for the virtual machine.
|
341
|
+
#
|
342
|
+
# Very roughly, this is where the LPEG code in lptree.c and lpcode.c lives
|
343
|
+
class Pattern
|
344
|
+
# Every node type a Pattern can have. Each symbol is also published as an upper-case constant of the
# same name (for example :ordered_choice as ORDERED_CHOICE) whose value is the symbol itself.
NODE_TYPES = %i[
  charset char any seq ordered_choice repeated not and
  ntrue nfalse grammar open_call rule call capture runtime behind
]
NODE_TYPES.each { |node_type| const_set(node_type.upcase, node_type) }
|
350
|
+
|
351
|
+
# We assume we have UTF-8 input with no multibyte characters.
|
352
|
+
# The 256 one-character strings for code points 0 through 255, each encoded as UTF-8.
FULL_CHAR_SET = Set.new(0..255) { |codepoint| codepoint.chr(Encoding::UTF_8) }
|
353
|
+
|
354
|
+
# Node type, the two child slots, and the capture kind are read-only from outside.
attr_reader :type
attr_reader :left, :right
attr_reader :capture
# The node payload is writable because it sometimes needs to be adjusted after construction.
attr_accessor :data
|
356
|
+
|
357
|
+
# Return the index just after the matching prefix of str or nil if there is no match
|
358
|
+
#
|
359
|
+
# str: the string to match against
|
360
|
+
# init: the string index to start at, defaulting to 0
|
361
|
+
# extra_args: used by Argument Captures
|
362
|
+
# Run this pattern against +str+.
#
# The compiled program does not depend on the arguments, so it is built once and cached in @program.
#
# @param str [String] the subject
# @param init [Integer] the starting index handed to the parsing machine (default 0)
# @param extra_args [Array] extra arguments handed to the parsing machine
# @return the machine's captures when the machine reports success, otherwise nil
def match(str, init = 0, *extra_args)
  @program ||= begin
    body = code(follow_set: FULL_CHAR_SET)
    optimize_jumps(body + [Instruction.new(i::OP_END)])
  end

  machine = ParsingMachine.new(@program, str, init, extra_args)
  machine.run
  machine.success? ? machine.captures : nil
end
|
371
|
+
|
372
|
+
# If left is defined and right is nil - so we have a unary op - we can get child here
|
373
|
+
# The single child of a unary node, which is stored in the left slot.
#
# @return the value of +left+
# @raise [RuntimeError] if the node also has a right child
def child
  return left unless right

  raise 'Pattern is not unary'
end
|
378
|
+
|
379
|
+
########################################
|
380
|
+
# Operator overloading
|
381
|
+
#
|
382
|
+
# The LPEG library makes heavy use of operator overriding in Lua to combine patterns in a convenient way. We will follow.
|
383
|
+
|
384
|
+
# This only happens if other is a Numeric type, which is annoying. See the monkeypatching below for other cases.
|
385
|
+
# Ruby's numeric coercion hook: lets a Numeric on the left-hand side of an operator be combined with a Pattern.
#
# @param other [Object] the left-hand operand
# @return [Array] +other+ converted with RPEG.P, followed by self
def coerce(other)
  converted = RPEG.P(other)
  [converted, self]
end
|
388
|
+
|
389
|
+
# p1 * p2 means p1 followed by p2
|
390
|
+
# Sequence: self followed by +other+.
#
# @param other [Object] a Pattern or anything fix_type can convert
# @return [Pattern]
def *(other)
  rhs = fix_type(other)

  # NTRUE is the identity element for sequencing.
  return self if rhs.type == NTRUE
  return rhs if type == NTRUE

  if type == SEQ
    # Re-associate to the right, as LPEG does.
    left * (right * rhs)
  else
    Pattern.new(SEQ, self, rhs)
  end
end
|
402
|
+
|
403
|
+
# p1 + p2 is ordered choice: if p1 matches we match and never consider p2, otherwise try matching on p2
|
404
|
+
# Ordered choice: try self and fall back to +other+ only if self fails.
#
# @param other [Object] a Pattern or anything fix_type can convert
# @return [Pattern]
def +(other)
  rhs = fix_type(other)

  # Two charset-like operands collapse into a single character set (their union).
  return RPEG.S(charset + rhs.charset) if charsetlike? && rhs.charsetlike?
  # NFALSE on either side contributes nothing to the choice.
  return rhs if type == NFALSE
  return self if rhs.type == NFALSE
  # Re-associate to the right, which yields more efficient compiled code (Ierusalimschy 4.2).
  return left + (right + rhs) if type == ORDERED_CHOICE

  Pattern.new(ORDERED_CHOICE, self, rhs)
end
|
421
|
+
|
422
|
+
# pat ** n means "n or more occurrences of pat" when n is non-negative, and "up to -n occurrences of pat" if n is negative.
|
423
|
+
# Repetition: for n >= 0, n or more occurrences of self; for n < 0, at most -n occurrences.
#
# @param other [Integer] the repetition count (checked with must_be_a)
# @return [Pattern]
# @raise [RuntimeError] if n >= 0 and this pattern is nullable
def **(other)
  count = other.must_be_a(Integer)

  if count.negative?
    # Each step wraps the pattern built so far in "self followed by it, or nothing".
    (-count).times.reduce(RPEG.P(true)) { |acc, _| self * acc + true }
  else
    raise "Pattern may match 0-length string so repetition may lead to an infinite loop" if nullable?

    # Start from "zero or more" and prepend one mandatory occurrence per count.
    count.times.reduce(Pattern.new(REPEATED, self)) { |acc, _| self * acc }
  end
end
|
444
|
+
|
445
|
+
# Unary negation represents "does not match". So -patt says that there is no match at the current position and we don't consume
|
446
|
+
# any of the string
|
447
|
+
# Negative predicate: wraps self in a NOT node.
#
# @return [Pattern]
def -@
  Pattern.new(NOT, self)
end
|
450
|
+
|
451
|
+
# Unary "and": pattern matches here (without consuming any input)
|
452
|
+
#
|
453
|
+
# Ierusalimschy points out that &patt can be implemented as --patt, but there is an optimization for the VM, so we preserve it
|
454
|
+
# Positive predicate: wraps self in an AND node. This is kept as its own node type rather than being
# expressed as a double negation because the VM has an optimization for it.
#
# @return [Pattern]
def +@
  Pattern.new(AND, self)
end
|
457
|
+
|
458
|
+
# Difference is "this but not that". So p1 - p2 matches if p1 does and p2 doesn't. Here, p2 doesn't consume any input but p1
|
459
|
+
# does. The pattern is equivalent to -p2 * p1.
|
460
|
+
#
|
461
|
+
# Special case: if both patterns are charsets we replace with a single charset
|
462
|
+
# Difference: matches self provided +other+ does not match at the same position.
#
# @param other [Object] a Pattern or anything fix_type can convert
# @return [Pattern]
def -(other)
  excluded = fix_type(other)

  if charsetlike? && excluded.charsetlike?
    # Two charset-like operands reduce to a single character set (set difference).
    RPEG.S(charset - excluded.charset)
  else
    # "excluded does not match here", then match and consume self.
    (-excluded) * self
  end
end
|
470
|
+
|
471
|
+
# Replacement captures of various kinds
|
472
|
+
#
|
473
|
+
# From the LPEG docs:
|
474
|
+
#
|
475
|
+
# patt / string
|
476
|
+
#
|
477
|
+
# Creates a string capture. It creates a capture string based on string. The captured value is a copy of string, except that
|
478
|
+
# the character % works as an escape character: any sequence in string of the form %n, with n between 1 and 9, stands for
|
479
|
+
# the match of the n-th capture in patt. The sequence %0 stands for the whole match. The sequence %% stands for a single %.
|
480
|
+
#
|
481
|
+
# patt / number
|
482
|
+
#
|
483
|
+
# Creates a numbered capture. For a non-zero number, the captured value is the n-th value captured by patt. When number is
|
484
|
+
# zero, there are no captured values.
|
485
|
+
#
|
486
|
+
# patt / table [for this we accept a Hash]
|
487
|
+
#
|
488
|
+
# Creates a query capture. It indexes the given table using as key the first value captured by patt, or the whole match if
|
489
|
+
# patt produced no value. The value at that index is the final value of the capture. If the table does not have that key,
|
490
|
+
# there is no captured value.
|
491
|
+
#
|
492
|
+
# patt / function
|
493
|
+
#
|
494
|
+
# Creates a function capture. It calls the given function passing all captures made by patt as arguments, or the whole match
|
495
|
+
# if patt made no capture. The values returned by the function are the final values of the capture. In particular, if
|
496
|
+
# function returns no value, there is no captured value.
|
497
|
+
# Replacement capture; the capture kind is chosen from the class of +other+.
#
# - String: string capture
# - Integer: numbered capture (must not be negative)
# - Hash: query capture
# - Proc: function capture
#
# @param other [String, Integer, Hash, Proc]
# @return [Pattern] a CAPTURE node over self with +other+ as its data
# @raise [RuntimeError] for a negative Integer or an unsupported argument
def /(other)
  kind =
    case other
    when String then Capture::STRING
    when Integer
      raise "Cannot use negative number for numbered capture" if other.negative?

      Capture::NUM
    when Hash then Capture::QUERY
    when Proc then Capture::FUNCTION
    else raise "Replacement capture is not supported for #{other}"
    end

  Pattern.new(CAPTURE, self, data: other, capture: kind)
end
|
513
|
+
|
514
|
+
# Make sure an operand is a Pattern.
#
# @param other [Object]
# @return [Pattern] +other+ itself when it already is a Pattern, otherwise RPEG.P(other)
def fix_type(other)
  other.is_a?(Pattern) ? other : RPEG.P(other)
end
|
519
|
+
|
520
|
+
# Render the pattern tree as a multi-line string, one node per line, with the lines of child nodes
# prefixed by "| ". The result is memoized in @to_s.
#
# @return [String]
# @raise [RuntimeError] if the node type is not handled
def to_s
  return @to_s if @to_s

  result = []
  # Appends the rendering of a sub-pattern, marking every one of its lines as nested.
  do_sub_pattern = lambda do |sub_patt|
    sub_patt.to_s.split("\n").each do |line|
      result << "| #{line}"
    end
  end

  type_s = type.to_s.capitalize

  case type
  when CHARSET
    result << "Charset: #{data.join.dump}"
  when ANY
    result << "#{type_s}: #{data}"
  when CHAR
    result << "#{type_s}: #{data.dump}"
  when NTRUE
    result << "TRUE"
  when NFALSE
    result << "FALSE"
  when OPEN_CALL
    result << "OpenCall: #{data}"
  when CALL
    result << "Call: #{data}"
  when SEQ, ORDERED_CHOICE
    result << (type == SEQ ? "Seq:" : "Ordered Choice:")
    do_sub_pattern.call(left)
    do_sub_pattern.call(right)
  when RULE
    result << "nonterminal: #{data}"
  when REPEATED, NOT, AND, BEHIND
    result << "#{type_s}: "
    do_sub_pattern.call(child)
  when CAPTURE
    result << "Capture: #{capture} #{data.inspect}"
    do_sub_pattern.call(child)
  when RUNTIME
    result << "Runtime: #{capture} #{data.inspect}"
    do_sub_pattern.call(child)
  when GRAMMAR
    result << "Grammar:"
    child.each do |entry|
      # Elsewhere in this class the grammar's child is treated as a list of Pattern nodes
      # (child.first.check_pred, child.any?(&:has_captures?)). For a RULE node the name is its data
      # and the body is its child. Any other entry is destructured as a [name, pattern] pair.
      # NOTE(review): confirm that grammar children are always RULE nodes once fixed up.
      nonterminal, rule_pattern =
        if entry.is_a?(Pattern) && entry.type == RULE
          [entry.data, entry.child]
        else
          entry
        end

      prefix = " #{nonterminal}: "
      rule_pattern.to_s.split("\n").each_with_index do |line, index|
        # Only the first line of a rule carries its name; continuation lines are padded to align.
        line_prefix = index.zero? ? prefix : (" " * prefix.length)
        result << "#{line_prefix}#{line}"
      end
    end
  else
    raise "Unhandled type for to_s: #{type}"
  end

  @to_s = result.join("\n")
end
|
579
|
+
|
580
|
+
########################################
|
581
|
+
# Pattern properties
|
582
|
+
|
583
|
+
# Whether this node matches exactly one character from some set, i.e. its type is CHARSET, CHAR or ANY.
#
# @return [Boolean]
def charsetlike?
  [CHARSET, CHAR, ANY].include?(type)
end
|
586
|
+
|
587
|
+
# The set of characters a charset-like node can match.
#
# @return [Set] the node's data for CHARSET, a one-element set for CHAR, FULL_CHAR_SET for ANY
# @raise [RuntimeError] if the node is not CHARSET, CHAR or ANY
def charset
  case type
  when CHARSET then data
  when CHAR then Set.new([data])
  when ANY then FULL_CHAR_SET
  else raise "Pattern #{type} isn't charset-like"
  end
end
|
599
|
+
|
600
|
+
# Memoized check_pred(:nullable). The cache is keyed on whether @nullable is defined, so a cached
# false is reused as well.
#
# @return the result of check_pred(:nullable)
def nullable?
  @nullable = check_pred(:nullable) unless defined?(@nullable)
  @nullable
end
|
605
|
+
|
606
|
+
# Memoized check_pred(:nofail). The cache is keyed on whether @nofail is defined, so a cached false
# is reused as well.
#
# @return the result of check_pred(:nofail)
def nofail?
  @nofail = check_pred(:nofail) unless defined?(@nofail)
  @nofail
end
|
611
|
+
|
612
|
+
# This is LPEG's checkaux from lpcode.c. Comment from that function (reformatted):
|
613
|
+
#
|
614
|
+
# /*
|
615
|
+
# ** Checks how a pattern behaves regarding the empty string, in one of two different ways:
|
616
|
+
#
|
617
|
+
# ** - A pattern is *nullable* if it can match without consuming any character;
|
618
|
+
# ** - A pattern is *nofail* if it never fails for any string (including the empty string).
|
619
|
+
#
|
620
|
+
# ** The difference is only for predicates and run-time captures; for other patterns, the two properties are equivalent. (With
|
621
|
+
# ** predicates, &'a' is nullable but not nofail. Of course, nofail => nullable.)
|
622
|
+
#
|
623
|
+
# ** These functions are all conservative in the following way:
|
624
|
+
# ** p is nullable => nullable(p)
|
625
|
+
# ** nofail(p) => p cannot fail
|
626
|
+
#
|
627
|
+
# ** The function assumes that TOpenCall is not nullable; this will be checked again when the grammar is fixed.
|
628
|
+
#
|
629
|
+
# ** Run-time captures can do whatever they want, so the result is conservative.
|
630
|
+
# */
|
631
|
+
# Structural check of how this pattern treats the empty string.
#
# @param pred [Symbol] :nullable (can match without consuming input) or :nofail (never fails)
# @return the verdict for this node, computed recursively from its children where needed
# @raise [RuntimeError] for an unknown predicate or an unhandled node type
def check_pred(pred)
  known = pred == :nullable || pred == :nofail
  raise "Bad check predicate #{pred}" unless known

  wants_nofail = pred == :nofail

  case type
  when CHAR, CHARSET, ANY, OPEN_CALL, NFALSE
    # Never nullable; for OPEN_CALL this is only an assumption.
    false
  when NTRUE, REPEATED then true
  when NOT, BEHIND
    # Can match the empty string but can also fail.
    !wants_nofail
  when AND
    # Always nullable; fails exactly when its body can.
    wants_nofail ? child.check_pred(pred) : true
  when RUNTIME
    # Can always fail; nullable exactly when its body is.
    wants_nofail ? false : child.check_pred(pred)
  when SEQ then left.check_pred(pred) && right.check_pred(pred)
  when ORDERED_CHOICE then left.check_pred(pred) || right.check_pred(pred)
  when GRAMMAR
    # The initial nonterminal, stored first, decides for the whole grammar.
    child.first.check_pred(pred)
  when CALL, RULE, CAPTURE then child.check_pred(pred)
  else raise "Unhandled pattern type #{type}"
  end
end
|
666
|
+
|
667
|
+
# fixedlen from LPEG's lpcode.h
|
668
|
+
#
|
669
|
+
# /*
|
670
|
+
# ** number of characters to match a pattern (or -1 if variable)
|
671
|
+
# */
|
672
|
+
#
|
673
|
+
# We return nil if the node's matches are not all of the same length
|
674
|
+
# The number of characters every match of this pattern consumes.
#
# @return [Integer, nil] the common match length, or nil when it is not fixed
# @raise [RuntimeError] for an unhandled node type
def fixed_len
  case type
  when CHARSET, CHAR, ANY then 1
  when NOT, AND, NTRUE, NFALSE, BEHIND then 0
  when REPEATED, OPEN_CALL, RUNTIME then nil
  when CAPTURE, RULE then child.fixed_len
  when GRAMMAR
    # The first rule is the initial nonterminal.
    child.first.fixed_len
  when CALL then call_recursive(:fixed_len, nil)
  when SEQ
    # Fixed only if both halves are; the right half is not examined when the left is variable.
    head = left.fixed_len
    tail = head && right.fixed_len
    tail && head + tail
  when ORDERED_CHOICE
    # Fixed only if both alternatives have the same fixed length.
    first_len = left.fixed_len
    first_len if first_len && right.fixed_len == first_len
  else
    raise "Unhandled node type #{type}"
  end
end
|
706
|
+
|
707
|
+
# From checkloops in lptree.c
|
708
|
+
#
|
709
|
+
# /*
|
710
|
+
# ** Check whether a tree has potential infinite loops
|
711
|
+
# */
|
712
|
+
# Truthy when the tree contains a repetition over a nullable pattern, which could loop forever.
def loops?
  return true if type == REPEATED && child.nullable?

  # /* sub-grammars already checked */, i.e., by verify_grammar
  return false if [GRAMMAR, CALL].include?(type)

  arity = num_children
  if arity == 1
    child.loops?
  elsif arity == 2
    left.loops? || right.loops?
  end
  # leaf nodes fall through and yield nil (falsy)
end
|
725
|
+
|
726
|
+
# From callrecursive in LPEG's lpcode.c
|
727
|
+
#
|
728
|
+
# /*
|
729
|
+
# ** Visit a TCall node taking care to stop recursion. If node not yet
|
730
|
+
# ** visited, return 'f(sib2(tree))', otherwise return 'def' (default
|
731
|
+
# ** value)
|
732
|
+
# */
|
733
|
+
#
|
734
|
+
# This method acts as a circuit breaker for structural recursion that might otherwise get in a loop among mutually recursive
|
735
|
+
# grammar rules.
|
736
|
+
#
|
737
|
+
# It's janky, but we follow LPEG's approach of hijacking the key field (which we call data) to keep track of the recursion
|
738
|
+
# Apply +func+ to the rule referenced by this CALL node, guarding against endless structural recursion.
#
# func    - Symbol naming a no-argument method (e.g. :fixed_len, :has_captures?) to invoke on the called rule
# default - value returned when this CALL node is already being visited further up the stack
#
# While the rule is being traversed, @data temporarily holds a marker so that reaching this same CALL node
# again returns +default+. As in LPEG's callrecursive, the function is applied to the called rule (our
# child), not to the CALL node itself. @data is restored even if +func+ raises.
def call_recursive(func, default)
  type.must_be CALL
  child.type.must_be RULE

  visited_marker = :already_visited
  return default if @data == visited_marker

  saved_data = @data
  @data = visited_marker # first visit: mark this node so a recursive visit stops here
  begin
    child.send(func)
  ensure
    @data = saved_data
  end
end
|
756
|
+
|
757
|
+
# From hascaptures in LPEG's lpcode.c
|
758
|
+
# /*
|
759
|
+
# ** Check whether a pattern tree has captures
|
760
|
+
# */
|
761
|
+
# Does this pattern tree contain any capture (including match-time captures)?
def has_captures?
  return true if [CAPTURE, RUNTIME].include?(type)
  return call_recursive(:has_captures?, false) if type == CALL
  return child.any?(&:has_captures?) if type == GRAMMAR

  case num_children
  when 0 then false
  when 1 then child.has_captures?
  when 2 then left.has_captures? || right.has_captures?
  end
end
|
780
|
+
|
781
|
+
# LPEG's getfirst
|
782
|
+
#
|
783
|
+
# static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
|
784
|
+
#
|
785
|
+
# /*
|
786
|
+
# ** Computes the 'first set' of a pattern.
|
787
|
+
# ** The result is a conservative aproximation:
|
788
|
+
# ** match p ax -> x (for some x) ==> a belongs to first(p)
|
789
|
+
# ** or
|
790
|
+
# ** a not in first(p) ==> match p ax -> fail (for all x)
|
791
|
+
# [So we want to know the set of characters that can make the pattern succeed, at least on the first characters]
|
792
|
+
# **
|
793
|
+
# ** The set 'follow' is the first set of what follows the
|
794
|
+
# ** pattern (full set if nothing follows it).
|
795
|
+
# **
|
796
|
+
# ** The function returns 0 when this resulting set can be used for
|
797
|
+
# ** test instructions that avoid the pattern altogether.
|
798
|
+
# ** A non-zero return can happen for two reasons:
|
799
|
+
# ** 1) match p '' -> '' ==> return has bit 1 set
|
800
|
+
# ** (tests cannot be used because they would always fail for an empty input);
|
801
|
+
# ** 2) there is a match-time capture ==> return has bit 2 set
|
802
|
+
# ** (optimizations should not bypass match-time captures).
|
803
|
+
# */
|
804
|
+
#
|
805
|
+
# I don't really understand what is going on here. I'm hoping it will make more sense as I port it. I think we pass in follow
|
806
|
+
# and return the int and firstset.
|
807
|
+
# Compute a conservative "first set" for this pattern.
#
# follow_set - the first set of whatever follows this pattern (defaults to every character)
#
# Returns a two-element array [e, set]:
#   - set is the set of characters on which a match of this pattern may start
#   - e is an Integer flag word, following the LPEG comment above: 0 means the set can be used for a test
#     instruction that skips the pattern; bit 1 set means the pattern can match the empty string; bit 2 set
#     means a match-time capture is involved
#
# Raises for node types not listed below (OPEN_CALL among them).
def first_set(follow_set = FULL_CHAR_SET)
  case type
  when CHAR, CHARSET, ANY
    [0, charset]
  when NTRUE
    [1, follow_set.clone] # /* accepts the empty string */
  when NFALSE
    [0, Set.new]
  when ORDERED_CHOICE
    # either alternative may start the match: union the sets and OR the flags
    e1, first1 = left.first_set(follow_set)
    e2, first2 = right.first_set(follow_set)
    [e1 | e2, first1 | first2]
  when SEQ
    if !left.nullable?
      # /* when p1 is not nullable, p2 has nothing to contribute;
      #  return getfirst(sib1(tree), fullset, firstset); */
      left.first_set(FULL_CHAR_SET)
    else
      # the left side may match nothing, so the right side's first set acts as its follow set
      e2, first2 = right.first_set(follow_set)
      e1, first1 = left.first_set(first2)
      return [0, first1] if e1.zero? # /* 'e1' ensures that first can be used */
      return [2, first1] if (e1 | e2) & 2 == 2 # /* one of the children has a matchtime? */

      [e2, first1] # /* else depends on 'e2' */
    end
  when REPEATED
    # zero repetitions are allowed, so whatever follows may also come first
    _, first_cs = child.first_set(follow_set)
    [1, first_cs | follow_set] # /* accept the empty string */
  when CAPTURE, RULE
    child.first_set(follow_set)
  when GRAMMAR
    # the first rule is the initial nonterminal
    child.first.first_set(follow_set)
  when RUNTIME
    # /* function invalidates any follow info. */
    # The child is therefore analysed against the full character set. Any non-zero flag from the child is
    # reported as 2, so that optimizations do not bypass the match-time capture.
    e, first_set = child.first_set(FULL_CHAR_SET)
    if e.positive?
      [2, first_set] # /* function is not "protected"? */
    else
      [0, first_set] # /* pattern inside capture ensures first can be used */
    end
  when CALL
    child.first_set(follow_set)
  when AND
    # a predicate consumes nothing, so a starting character must also be acceptable to what follows
    e, first_set = child.first_set(follow_set)
    [e, first_set & follow_set]
  when NOT, BEHIND
    if type == NOT && child.charsetlike?
      # negating a character set: any other character (or the empty string) is accepted
      [1, FULL_CHAR_SET - child.charset]
    else
      # /* instruction gives no new information */
      # /* call 'getfirst' only to check for match-time captures */
      e, = child.first_set(follow_set)
      [e | 1, follow_set] # /* always can accept the empty string */
    end
  else
    raise "Unhandled node type #{type}"
  end
end
|
867
|
+
|
868
|
+
# LPEG's headfail
|
869
|
+
#
|
870
|
+
# /*
|
871
|
+
# ** If 'headfail(tree)' true, then 'tree' can fail only depending on the
|
872
|
+
# ** next character of the subject.
|
873
|
+
# */
|
874
|
+
# True when this pattern can fail only depending on the next character of the subject.
#
# Returns true/false (or whatever the delegated child call returns).
# Raises for node types that are not listed below (e.g. OPEN_CALL).
def head_fail?
  case type
  when CHAR, CHARSET, ANY, NFALSE
    true
  when NTRUE, REPEATED, RUNTIME, NOT, BEHIND
    # BEHIND is included as in LPEG's headfail: a look-behind's outcome depends on earlier input,
    # not on the next character
    false
  when CAPTURE, RULE, AND, CALL
    child.head_fail?
  when GRAMMAR
    child.first.head_fail? # the first rule is the initial nonterminal
  when SEQ
    # if the right side can fail, failure is no longer decided by the first character alone
    return false unless right.nofail?

    left.head_fail?
  when ORDERED_CHOICE
    left.head_fail? && right.head_fail?
  else
    raise "Unhandled node type #{type}"
  end
end
|
894
|
+
|
895
|
+
# TODO: consider recursing via a #children method. Then we can handle the necessary rules in a GRAMMAR just once.
|
896
|
+
# How many subpatterns (0, 1 or 2) a node of this type carries. GRAMMAR nodes hold a list of rules
# instead, so asking them is an error.
def num_children
  case type
  when GRAMMAR
    raise "#num_children isn't meaningful for GRAMMAR nodes"
  when SEQ, ORDERED_CHOICE then 2
  when REPEATED, AND, NOT, CALL, RULE, CAPTURE, RUNTIME, BEHIND then 1
  when CHARSET, CHAR, ANY, NTRUE, NFALSE, OPEN_CALL then 0
  else
    raise "Unhandled pattern type #{type}"
  end
end
|
910
|
+
|
911
|
+
########################################
|
912
|
+
# Code generation
|
913
|
+
|
914
|
+
# shorthand
|
915
|
+
# Shorthand for the Instruction class, so op codes can be written as i::JUMP etc.
def i = Instruction
|
918
|
+
|
919
|
+
# follow_set
|
920
|
+
# - the set of first characters accepted by whatever comes after us, or the full set of characters if nothing follows us
|
921
|
+
#
|
922
|
+
# dominating_test
|
923
|
+
# - a TEST_CHAR, TEST_CHARSET, or TEST_ANY instruction that we can assume has succeeded and which might save us a little time
|
924
|
+
# - see the tt argument sprinkled through the functions in LPEG's lpcode.c
|
925
|
+
#
|
926
|
+
# active_choice
|
927
|
+
# - there is a CHOICE instruction still "active" in the code already generated
|
928
|
+
# - certain pattern types can take advantage of this to avoid another CHOICE instruction
|
929
|
+
# - this is called 'opt' in lpcode.c
|
930
|
+
#
|
931
|
+
# NOTE: don't cache the results as we did before, because the code depends on the arguments
|
932
|
+
# Generate the list of parsing-machine instructions for this pattern.
#
# follow_set      - first set of whatever follows this pattern (full character set if nothing follows)
# dominating_test - a test instruction already known to have succeeded, or nil
# active_choice   - true when a CHOICE instruction emitted earlier is still active
#
# Returns an Array of Instruction objects (empty for NTRUE).
# Raises on OPEN_CALL nodes (they must have been resolved inside a grammar) and on unknown node types.
def code(follow_set: FULL_CHAR_SET, dominating_test: nil, active_choice: false)
  code = []
  case type
  when CHARSET
    code << charset_code(data, dominating_test)
  when CHAR
    code << char_code(data, dominating_test)
  when ANY
    code << Instruction.new(i::ANY)
  when SEQ
    code.concat seq_code(follow_set, dominating_test)
  when NTRUE
    # we always succeed, which means we don't have to do anything at all
  when NFALSE
    code << Instruction.new(i::FAIL)
  when OPEN_CALL
    # we resolved these to CALL when the grammar node was created. So if we see one now it is because it was not
    # contained in a grammar.
    raise 'OPEN_CALL node appears outside of a grammar'
  when CALL
    # This is symbolic target for now. It will be converted to a numeric offset during GRAMMAR analysis
    code << Instruction.new(i::CALL, offset: data)
  when ORDERED_CHOICE
    code.concat choice_code(follow_set, active_choice)
  when REPEATED
    code.concat repeated_code(follow_set, active_choice)
  when NOT
    code.concat not_code
  when AND
    code.concat and_code(dominating_test)
  when BEHIND
    # step back data characters (when there is anything to step back over), then match the child
    code << Instruction.new(i::BEHIND, aux: data) if data.positive?
    code.concat child.code
  when CAPTURE
    c = child.code(follow_set:, dominating_test:)
    len = fixed_len
    if len && !child.has_captures?
      # fixed-length body without nested captures: a single FULL_CAPTURE after the body is enough
      code.concat c
      code << Instruction.new(i::FULL_CAPTURE, data:, aux: { capture_length: len, kind: capture })
    else
      code << Instruction.new(i::OPEN_CAPTURE, data:, aux: { kind: capture })
      code.concat c
      code << Instruction.new(i::CLOSE_CAPTURE, aux: { kind: Capture::CLOSE })
    end
  when RUNTIME
    code << Instruction.new(i::OPEN_CAPTURE, data:, aux: { kind: Capture::GROUP })
    code.concat child.code(follow_set: FULL_CHAR_SET, dominating_test:)
    code << Instruction.new(i::CLOSE_RUN_TIME, aux: { kind: Capture::CLOSE })
  when RULE
    code = child.code(follow_set:)
    # A rule whose body needs no instructions (e.g. it is just NTRUE) has nothing to decorate.
    unless code.empty?
      # decorate with the nonterminal, but clone first to avoid unexpected mutations
      code[0] = code.first.clone
      code.first.dec = data
    end
  when GRAMMAR
    code.concat grammar_code
  else
    raise "Unhandled pattern type #{type}"
  end

  code
end
|
992
|
+
|
993
|
+
# LPEG's codeseq1
|
994
|
+
#
|
995
|
+
# /*
|
996
|
+
# ** Code first child of a sequence
|
997
|
+
# ** (second child is called in-place to allow tail call)
|
998
|
+
# ** Return 'tt' for second child
|
999
|
+
# */
|
1000
|
+
#
|
1001
|
+
# We do both parts of the sequence as we don't do TCO
|
1002
|
+
# Code for a SEQ node: the left part followed by the right part.
#
# The left part is given the right part's first set as its follow set, but only when its code generation
# can make use of one. The dominating test stays valid for the right part only when the left part is known
# to consume nothing.
private def seq_code(follow_set, dominating_test)
  left_options = { dominating_test: }
  left_options[:follow_set] = right.first_set(follow_set)[1] if left.need_follow?
  program = left.code(**left_options)

  # /* can 'p1' consume anything? */ if so, /* invalidate test */; /* else 'tt' still protects sib2 */
  right_test = left.fixed_len == 0 ? dominating_test : nil

  program.concat right.code(follow_set:, dominating_test: right_test)
end
|
1018
|
+
|
1019
|
+
# LPEG's codechoice (lpcode.c)
|
1020
|
+
#
|
1021
|
+
# /*
|
1022
|
+
# ** Choice; optimizations:
|
1023
|
+
# ** - when p1 is headfail or
|
1024
|
+
# ** when first(p1) and first(p2) are disjoint, than
|
1025
|
+
# ** a character not in first(p1) cannot go to p1, and a character
|
1026
|
+
# ** in first(p1) cannot go to p2 (at it is not in first(p2)).
|
1027
|
+
# ** (The optimization is not valid if p1 accepts the empty string,
|
1028
|
+
# ** as then there is no character at all...)
|
1029
|
+
# ** - when p2 is empty and opt is true; a IPartialCommit can reuse
|
1030
|
+
# ** the Choice already active in the stack.
|
1031
|
+
# */
|
1032
|
+
# Code for an ORDERED_CHOICE node (left / right).
#
# follow_set    - first set of whatever follows the choice
# active_choice - true when an earlier CHOICE instruction is still active and can be reused
#
# Three shapes are produced:
#   1. left is head-fail, or the first sets of left and right are disjoint (and left cannot match empty):
#        test -> L1; <left>; jump L2; L1: <right>; L2:
#   2. right is empty (NTRUE) and a choice is already active:
#        partial_commit (to the next instruction); <left>
#   3. otherwise:
#        [test -> L1;] choice L1; <left>; commit L2; L1: <right>; L2:
#
# Returns an Array of instructions.
private def choice_code(follow_set, active_choice)
  raise "Not an ORDERED_CHOICE pattern" unless type == ORDERED_CHOICE

  code = []
  right_empty = (right.type == NTRUE)
  e1, left_first_set = left.first_set

  # evaluated lazily: the right side's first set is only needed when the cheaper checks are inconclusive
  firsts_disjoint = lambda do
    _, right_first_set = right.first_set(follow_set)
    left_first_set.disjoint?(right_first_set)
  end

  if left.head_fail? || (e1.zero? && firsts_disjoint.call)
    # We can optimize here. See the comment at ParsingMachine#test_char about how the LPEG approach (which we copy)
    # differs from what is described in Ierusalimschy's paper.
    test = testset_code(left_first_set)

    left_code = left.code(follow_set:, dominating_test: test)
    # skip the test itself, the left code and, when there is a right side, the JUMP that follows the left code
    test.offset = 1 + left_code.size + (right_empty ? 0 : 1)

    code << test
    code.concat left_code
    unless right_empty
      right_code = right.code(follow_set:, active_choice:)
      code << Instruction.new(i::JUMP, offset: 1 + right_code.size)
      code.concat right_code
    end
  elsif active_choice && right_empty
    # reuse the choice already on the stack; the partial commit just continues with the next instruction
    code << Instruction.new(i::PARTIAL_COMMIT, offset: 1)
    code.concat left.code(active_choice: true)
  else
    test = testset_code(left_first_set) if e1.zero?

    left_code = left.code(dominating_test: test, active_choice: right_empty)
    right_code = right.code(follow_set:, active_choice:)

    if test
      # on a failed test jump past CHOICE, the left code and COMMIT, straight to the right code
      test.offset = 3 + left_code.size
      code << test
    end

    code << Instruction.new(i::CHOICE, offset: 2 + left_code.size)
    code.concat left_code
    code << Instruction.new(i::COMMIT, offset: 1 + right_code.size)
    code.concat right_code
  end
  code
end
|
1079
|
+
|
1080
|
+
# LPEG's coderep (lpcode.c)
|
1081
|
+
# /*
|
1082
|
+
# ** Repetion; optimizations:
|
1083
|
+
# ** When pattern is a charset, can use special instruction ISpan.
|
1084
|
+
# ** When pattern is head fail, or if it starts with characters that
|
1085
|
+
# ** are disjoint from what follows the repetions, a simple test
|
1086
|
+
# ** is enough (a fail inside the repetition would backtrack to fail
|
1087
|
+
# ** again in the following pattern, so there is no need for a choice).
|
1088
|
+
# ** When 'opt' is true, the repetion can reuse the Choice already
|
1089
|
+
# ** active in the stack.
|
1090
|
+
# */
|
1091
|
+
# Code for a REPEATED node (zero or more repetitions of the child).
#
# follow_set    - first set of whatever follows the repetition
# active_choice - true when an earlier CHOICE instruction is still active and can be reused
#
# Shapes produced:
#   - child is charset-like: a single SPAN instruction
#   - child is head-fail, or its first set is disjoint from the follow set (and it cannot match empty):
#        L1: test -> L2; <child>; jump L1; L2:
#   - otherwise:
#        [test -> L2;] choice L2; L1: <child>; partial_commit L1; L2:
#     where the CHOICE is replaced by a partial commit to the next instruction when a choice is already active
#
# Returns an Array of instructions.
private def repeated_code(follow_set, active_choice)
  raise "Not a REPEATED pattern" unless type == REPEATED

  # Special, quicker handling when the thing we are repeated over is a charset. See Ierusalimschy 4.3
  return [Instruction.new(i::SPAN, data: child.charset)] if child.charsetlike?

  code = []
  e1, first_set = child.first_set(follow_set)
  if child.head_fail? || (e1.zero? && first_set.disjoint?(follow_set))
    test = testset_code(first_set)
    body = child.code(dominating_test: test)
    test.offset = 2 + body.size # past the body and the JUMP
    code << test
    code.concat body
    code << Instruction.new(i::JUMP, offset: -(1 + body.size)) # back to the test
  else
    body = child.code
    code << testset_code(first_set, 3 + body.size) if e1.zero?
    code << if active_choice
              # reuse the choice already on the stack; continue with the next instruction
              Instruction.new(i::PARTIAL_COMMIT, offset: 1)
            else
              Instruction.new(i::CHOICE, offset: 2 + body.size)
            end
    code.concat body
    code << Instruction.new(i::PARTIAL_COMMIT, offset: -body.size) # back to the start of the body
  end
  code
end
|
1119
|
+
|
1120
|
+
# Code for a GRAMMAR node.
#
# Layout of the generated program:
#   0: call <initial rule>
#   1: jump <first line after the grammar>
#   2: <code of the initial rule>; return
#      <code of every other rule>; return
#
# CALL instructions produced by the rules still carry the nonterminal as a symbolic offset; they are
# resolved here to relative numeric offsets. A call that is directly followed (possibly via jumps) by a
# RETURN is a tail call and becomes a JUMP.
#
# Returns an Array of instructions.
# Raises if the initial rule, or the rule for any called nonterminal, cannot be found.
private def grammar_code
  raise "Not a GRAMMAR pattern" unless type == GRAMMAR

  # we need to put the initial nonterminal's rules first
  initial_rule = child.find { |rule| rule.data == data }
  raise "Cannot find initial rule, for #{data}" unless initial_rule

  the_rules = [initial_rule] + child.reject { |rule| rule == initial_rule }

  start_line_of_nonterminal = {}
  full_rule_code = []
  the_rules.each do |rule|
    # the rules start after the leading CALL and JUMP
    start_line_of_nonterminal[rule.data] = 2 + full_rule_code.size
    full_rule_code.concat rule.code(follow_set: FULL_CHAR_SET)
    full_rule_code << Instruction.new(i::RETURN)
  end

  code = [
    Instruction.new(i::CALL, offset: data), # call the nonterminal, in @data by fix_up_grammar
    Instruction.new(i::JUMP, offset: 1 + full_rule_code.size) # we are done: jump to the line after the grammar's code
  ]
  code.concat full_rule_code

  # Now close the CALL instructions.
  code.each_with_index do |instr, idx|
    next unless instr.op_code == i::CALL

    nonterminal = instr.offset # still symbolic
    next if nonterminal.is_a?(Integer) # ... except when we're in a subgrammar, since it has already been fixed up.

    start_line = start_line_of_nonterminal[nonterminal]
    raise "Nonterminal #{nonterminal} does not have a rule in grammar" unless start_line

    # If the following instruction is a RETURN this is a tail call and we can eliminate the stack push by using a
    # JUMP instead of the CALL. The following RETURN must remain, as we may reach there via another jump/commit/etc
    offset = start_line - idx
    dec = "->#{nonterminal}"
    tail_call = code[finaltarget(code, idx + 1)]&.op_code == i::RETURN
    code[idx] = Instruction.new(tail_call ? i::JUMP : i::CALL, offset:, dec:)
  end

  code
end
|
1165
|
+
|
1166
|
+
# LPEG's codeand (lpcode.c)
|
1167
|
+
# /*
|
1168
|
+
# ** And predicate
|
1169
|
+
# ** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n
|
1170
|
+
# ** (valid only when 'p' has no captures)
|
1171
|
+
# */
|
1172
|
+
# Code for an AND predicate (match the child without consuming input).
#
# When the child has a fixed length and no captures we simply match it and step back that many
# characters; otherwise the general choice / back_commit / fail sequence is used.
private def and_code(dominating_test)
  body = child.code(dominating_test:)
  len = child.fixed_len

  if len && !child.has_captures?
    rewind = len.positive? ? [Instruction.new(i::BEHIND, aux: len, dec: :and)] : []
    body + rewind
  else
    [
      Instruction.new(i::CHOICE, offset: 2 + body.size),
      *body,
      Instruction.new(i::BACK_COMMIT, offset: 2),
      Instruction.new(i::FAIL)
    ]
  end
end
|
1187
|
+
|
1188
|
+
# LPEG's codenot (lpcode.c)
|
1189
|
+
#
|
1190
|
+
# /*
|
1191
|
+
# ** Not predicate; optimizations:
|
1192
|
+
# ** In any case, if first test fails, 'not' succeeds, so it can jump to
|
1193
|
+
# ** the end. If pattern is headfail, that is all (it cannot fail
|
1194
|
+
# ** in other parts); this case includes 'not' of simple sets. Otherwise,
|
1195
|
+
# ** use the default code (a choice plus a failtwice).
|
1196
|
+
# */
|
1197
|
+
# Code for a NOT predicate (succeed, consuming nothing, exactly when the child fails).
#
# A head-fail child that cannot match the empty string only needs a test followed by FAIL; anything else
# uses the default choice / fail_twice sequence.
private def not_code
  e, first_set = child.first_set

  if e.zero? && child.head_fail?
    # if the test fails the predicate succeeds, so jump over the FAIL
    [testset_code(first_set, 2), Instruction.new(i::FAIL)]
  else
    body = child.code
    [Instruction.new(i::CHOICE, offset: 2 + body.size), *body, Instruction.new(i::FAIL_TWICE)]
  end
end
|
1212
|
+
|
1213
|
+
# LPEG's codetestset (lpcode.c)
|
1214
|
+
#
|
1215
|
+
# /*
|
1216
|
+
# ** code a test set, optimizing unit sets for ITestChar, "complete"
|
1217
|
+
# ** sets for ITestAny, and empty sets for IJmp (always fails).
|
1218
|
+
# ** 'e' is true iff test should accept the empty string. (Test
|
1219
|
+
# ** instructions in the current VM never accept the empty string.)
|
1220
|
+
# */
|
1221
|
+
#
|
1222
|
+
# first_set is the set of first-chars that we want to match on.
|
1223
|
+
# Offset is where to jump to if we don't match one of them.
|
1224
|
+
#
|
1225
|
+
# If offset is not given we don't mind: client code is responsible for that.
|
1226
|
+
# Build the test instruction guarding a pattern whose first set is first_set; offset is where to jump when
# the test fails (callers may fill it in later).
#
# An empty set always fails, so it becomes a plain JUMP; a singleton uses TEST_CHAR; a set as large as the
# full character set uses TEST_ANY; everything else uses TEST_CHARSET.
private def testset_code(first_set, offset = nil)
  count = first_set.size
  if count.zero?
    Instruction.new(i::JUMP, offset:) # we will always fail, so just jump
  elsif count == 1
    Instruction.new(i::TEST_CHAR, offset:, data: first_set.first)
  elsif count == FULL_CHAR_SET.size
    Instruction.new(i::TEST_ANY, offset:)
  else
    Instruction.new(i::TEST_CHARSET, offset:, data: first_set)
  end
end
|
1238
|
+
|
1239
|
+
# Instruction matching one character from charset. Singletons are delegated to char_code; when the
# dominating test already checked this very set, ANY is enough.
private def charset_code(charset, dominating_test)
  return char_code(charset.first, dominating_test) if charset.size == 1

  already_tested = dominating_test &&
                   dominating_test.op_code == i::TEST_CHARSET &&
                   dominating_test.data == charset
  # the "dominating test" has already checked for us so we can use ANY, which is quicker
  already_tested ? Instruction.new(i::ANY) : Instruction.new(i::CHARSET, data: charset)
end
|
1249
|
+
|
1250
|
+
# Instruction matching the single character char; ANY suffices when the dominating test already
# checked for exactly this character.
private def char_code(char, dominating_test)
  already_tested = dominating_test &&
                   dominating_test.op_code == i::TEST_CHAR &&
                   dominating_test.data == char
  already_tested ? Instruction.new(i::ANY) : Instruction.new(i::CHAR, data: char)
end
|
1257
|
+
|
1258
|
+
# LPEG's peephole (lpcode.c)
|
1259
|
+
#
|
1260
|
+
# /*
|
1261
|
+
# ** Optimize jumps and other jump-like instructions.
|
1262
|
+
# ** * Update labels of instructions with labels to their final
|
1263
|
+
# ** destinations (e.g., choice L1; ... L1: jmp L2: becomes
|
1264
|
+
# ** choice L2)
|
1265
|
+
# ** * Jumps to other instructions that do jumps become those
|
1266
|
+
# ** instructions (e.g., jump to return becomes a return; jump
|
1267
|
+
# ** to commit becomes a commit)
|
1268
|
+
# */
|
1269
|
+
# Peephole pass over a generated program (an Array of instructions), modified in place.
#
# - Instructions carrying a label (CHOICE, CALL, COMMIT, PARTIAL_COMMIT, BACK_COMMIT) are re-pointed at the
#   final destination reached after following any chain of JUMPs.
# - A JUMP whose final destination is RETURN, FAIL, FAIL_TWICE or OP_END is replaced by a copy of that
#   instruction.
# - A JUMP whose final destination is a COMMIT-like instruction is replaced by a copy of it with a corrected
#   offset, and the same slot is then processed again.
# - Any other JUMP is simply re-pointed at its final destination.
#
# Returns whatever each_with_index returns (the program).
private def optimize_jumps(program)
  i = Instruction # shorthand

  program.each_with_index do |instr, idx|
    case instr.op_code
    when i::CHOICE, i::CALL, i::COMMIT, i::PARTIAL_COMMIT, i::BACK_COMMIT
      # offsets are relative, so convert the absolute final label back into an offset from idx
      n_off = finallabel(program, idx) - idx
      instr.offset = n_off
    when i::JUMP
      final_t = finaltarget(program, idx)
      case program[final_t].op_code
      when i::RETURN, i::FAIL, i::FAIL_TWICE, i::OP_END
        # instructions with unconditional implicit jumps. The jump just becomes that instruction
        program[idx] = program[final_t].clone
      when i::COMMIT, i::PARTIAL_COMMIT, i::BACK_COMMIT
        # instruction with unconditional explicit jumps
        final_final_t = finallabel(program, final_t)
        instr = program[final_t].clone # The jump becomes that instruction...
        instr.offset = final_final_t - idx # ... but we must correct the offset
        program[idx] = instr
        # redo re-runs the block body for this same idx; instr now refers to the replacement assigned above,
        # so the outer case sees its op code
        redo # "reoptimize the label"
      else
        # just optimize the label
        program[idx].offset = final_t - idx
      end
    end
  end
end
|
1297
|
+
|
1298
|
+
# LPEG's needfollow (lpcode.c)
|
1299
|
+
#
|
1300
|
+
# /*
|
1301
|
+
# ** Check whether the code generation for the given tree can benefit
|
1302
|
+
# ** from a follow set (to avoid computing the follow set when it is
|
1303
|
+
# ** not needed)
|
1304
|
+
# */
|
1305
|
+
# Can code generation for this pattern benefit from knowing the follow set? Lets seq_code avoid
# computing a follow set nobody uses.
def need_follow?
  case type
  when ORDERED_CHOICE, REPEATED then true
  when CAPTURE then child.need_follow?
  when SEQ then right.need_follow?
  when CHAR, CHARSET, ANY, NFALSE, NTRUE, AND, NOT, RUNTIME, GRAMMAR, CALL, BEHIND then false
  else
    raise "Unhandled case #{type} in need_follow?"
  end
end
|
1319
|
+
|
1320
|
+
# Validate every rule of a GRAMMAR node: none may be left-recursive and none may contain a potentially
# infinite loop.
private def verify_grammar
  raise "Not a grammar!" unless type == GRAMMAR

  child.each do |rule|
    rule.verify_rule
    # /* check for infinite loops inside rules */
    next unless rule.loops?

    raise "Grammar has potential infinite loop in rule '#{rule.data}'"
  end
end
|
1329
|
+
|
1330
|
+
# We check if a rule can be left-recursive, i.e., whether we can return to the rule without consuming any input. The plan is to
|
1331
|
+
# walk the tree into subtrees whenever we see we can do so without consuming any input.
|
1332
|
+
#
|
1333
|
+
# LPEG comment follows. Note that we check for nullability directly for sanity's sake.
|
1334
|
+
#
|
1335
|
+
# /*
|
1336
|
+
# ** Check whether a rule can be left recursive; raise an error in that
|
1337
|
+
# ** case; otherwise return 1 iff pattern is nullable.
|
1338
|
+
# ** The return value is used to check sequences, where the second pattern
|
1339
|
+
# ** is only relevant if the first is nullable.
|
1340
|
+
# ** Parameter 'nb' works as an accumulator, to allow tail calls in
|
1341
|
+
# ** choices. ('nb' true makes function returns true.)
|
1342
|
+
# ** Parameter 'passed' is a list of already visited rules, 'npassed'
|
1343
|
+
# ** counts the elements in 'passed'.
|
1344
|
+
# ** Assume ktable at the top of the stack.
|
1345
|
+
# */
|
1346
|
+
# Check that this RULE cannot be left-recursive, i.e. that the rule cannot be re-entered without consuming
# input.
#
# The tree is walked along every path that can be taken without consuming input. rules_seen holds the rules
# entered on the current path; a rule's slot is its depth on that path, so sibling branches simply overwrite
# each other's entries. Entering a rule that is already on the path means left recursion.
#
# Raises if called on a node that is not a RULE, if left recursion is possible, or on an unknown node type.
def verify_rule
  raise "verify_rule called on something that isn't a rule" unless type == RULE

  rules_seen = []

  local_rec = lambda do |pattern, num_rules_seen|
    case pattern.type
    when CHAR, CHARSET, ANY, NTRUE, NFALSE, BEHIND
      # no op
    when NOT, AND, REPEATED, CAPTURE, RUNTIME
      # nullable, so keep going
      local_rec.call(pattern.child, num_rules_seen)
    when CALL
      local_rec.call(pattern.child, num_rules_seen)
    when SEQ
      local_rec.call(pattern.left, num_rules_seen)
      # only check 2nd child if first is nullable
      local_rec.call(pattern.right, num_rules_seen) if pattern.left.nullable?
    when ORDERED_CHOICE
      # must check both children
      local_rec.call(pattern.left, num_rules_seen)
      local_rec.call(pattern.right, num_rules_seen)
    when RULE
      raise "rule '#{pattern.data}' may be left-recursive" if rules_seen.first(num_rules_seen).include?(pattern)

      # record the rule in the slot for the current depth *before* descending, so that the check above
      # covers every rule on the path, including the one entered most recently
      rules_seen[num_rules_seen] = pattern
      local_rec.call(pattern.child, num_rules_seen + 1)
    when GRAMMAR
      # LPEG says: /* sub-grammar cannot be left recursive */
      # Presumably because it would have been rejected when it was created.
    else
      raise "Unhandled case #{pattern.type} in verify_rule"
    end
  end

  local_rec.call(self, 0)
end
|
1384
|
+
|
1385
|
+
# LPEG's target (lpcode.c)
|
1386
|
+
#
|
1387
|
+
# The absolute target of the instruction at index idx
|
1388
|
+
# Absolute index targeted by the (relative) offset of the instruction at idx.
private def target(program, idx)
  relative_jump = program[idx].offset
  idx + relative_jump
end
|
1391
|
+
|
1392
|
+
# LPEG's finaltarget (lpcode.c)
|
1393
|
+
#
|
1394
|
+
# Find the final [absolute] destination of a sequence of jumps
|
1395
|
+
# Follow a chain of JUMP instructions starting at idx and return the absolute index of the first
# position that is not a JUMP (which may lie past the end of the program).
private def finaltarget(program, idx)
  position = idx
  position = target(program, position) while program[position]&.op_code == i::JUMP
  position
end
|
1399
|
+
|
1400
|
+
# LPEG's finallabel (lpcode.c)
|
1401
|
+
#
|
1402
|
+
# final label (after traversing any jumps)
|
1403
|
+
# Absolute destination of the instruction at idx once any jumps at its target have been followed.
private def finallabel(program, idx)
  first_hop = target(program, idx)
  finaltarget(program, first_hop)
end
|
1406
|
+
|
1407
|
+
########################################
|
1408
|
+
# Misc
|
1409
|
+
|
1410
|
+
# Left and right are subpatterns.
|
1411
|
+
# data is other relevant data
|
1412
|
+
# capture is used for Capture patterns
|
1413
|
+
# Build a pattern node.
#
# type        - one of NODE_TYPES
# left, right - subpatterns, as required by the node type
# data        - other relevant data; its meaning depends on the type
# capture     - only used for CAPTURE patterns
#
# The node is sanity-checked; GRAMMAR nodes are additionally fixed up and verified.
def initialize(type, left = nil, right = nil, data: nil, capture: nil)
  raise "Bad node type #{type}" unless NODE_TYPES.include?(type)

  @type, @left, @right = type, left, right
  @data = data
  @capture = capture
  sanity_check

  if type == GRAMMAR
    fix_up_grammar
    verify_grammar
  end
end
|
1427
|
+
|
1428
|
+
# Special operation when closing open calls
|
1429
|
+
# Turn this OPEN_CALL node, in place, into a CALL node targeting rule, with ref as the nonterminal stored
# in @data.
def convert_open_call_to_call!(rule, ref)
  raise "Cannot convert pattern to CALL" unless type == OPEN_CALL
  raise "Must give rule and nonterminal symbol to CALL pattern" unless rule && ref
  raise "Rule for CALL pattern must be a rule, got #{rule.type}" unless rule.type == RULE

  @type, @left, @data = CALL, rule, ref

  # We must check these again when needed rather than use the memoized values
  memoized = %i[@nullable @nofail]
  memoized.each { |ivar| remove_instance_variable(ivar) if instance_variable_defined?(ivar) }
end
|
1443
|
+
|
1444
|
+
# Assert (via the must_be helpers) that the node's data, capture and children are consistent with its type.
private def sanity_check
  capture.must_be nil unless type == CAPTURE

  case type
  when CHARSET
    data.must_be_a Set

    # assume we only worry about 8-bit ascii characters. Note that empty set should have been converted to NFALSE
    # and singletons to CHAR.
    data.size.must_be_in(2..255)
  when NTRUE, NFALSE, ANY
    data.must_be nil
  when GRAMMAR
    data.must_be_a(Hash, Array)
    data.must_not.empty?
  when OPEN_CALL
    # The reference lives in data. left must be nil for OPEN_CALL nodes (see the child checks below), so
    # testing left here would never fire.
    data.must_not.negative? if data.is_a?(Integer)
  when CALL
    data.must_be
  when RULE
    data.must_be
  when CAPTURE
    capture.must_be
  when BEHIND
    data.must_be
  when RUNTIME
    data.must_be_a(Proc)
  when CHAR
    data.must_be_a(String)
    data.length.must_be 1
  end

  # a grammar keeps its rules in a different shape, so the child checks don't apply
  return if type == GRAMMAR

  case num_children
  when 0
    left.must_be nil
    right.must_be nil
  when 1
    left.must_be_a(Pattern)
    right.must_be nil
  when 2
    left.must_be_a(Pattern)
    right.must_be_a(Pattern)
  end
end
|
1490
|
+
|
1491
|
+
# The grammar is currently in @data. It can be either a Hash or an Array
|
1492
|
+
#
|
1493
|
+
# We do several things
|
1494
|
+
# - make sure each rule pattern is actually a pattern.
|
1495
|
+
# - since we can specify rules as strings, say, or subgrammars (as hash) we need to step in here
|
1496
|
+
# - the hash/array in @data is replaced with an array of RULE patterns in @left
|
1497
|
+
# - the initial nonterminal (a symbol) is put into @data
|
1498
|
+
# - :opencall(v) nodes are replaced with CALL(rule) nodes
|
1499
|
+
#
|
1500
|
+
# We set up
|
1501
|
+
# @nonterminal_indices: map nonterminal symbols to their index (0, 1, ...)
|
1502
|
+
# @nonterminal_by_index: map indices to the corresponding nonterminal
|
1503
|
+
private def fix_up_grammar
|
1504
|
+
raise "Bad type for #fix_up_grammar" unless type == GRAMMAR
|
1505
|
+
|
1506
|
+
@nonterminal_indices = {}
|
1507
|
+
@nonterminal_by_index = []
|
1508
|
+
|
1509
|
+
if data.is_a?(Array)
|
1510
|
+
# We replace it with an equivalent Hash
|
1511
|
+
initial_nonterminal = nil
|
1512
|
+
if data.first.is_a?(Integer)
|
1513
|
+
initial_rule_idx = data.shift # discard the Integer
|
1514
|
+
raise "Bad index for initial nonterminal in grammar" if initial_rule_idx.negative?
|
1515
|
+
else
|
1516
|
+
initial_rule_idx = 0
|
1517
|
+
end
|
1518
|
+
# Convert to a hash with sythentic keys
|
1519
|
+
as_hash = {}
|
1520
|
+
data.each_with_index do |p, i|
|
1521
|
+
key = "__#{i}".to_sym
|
1522
|
+
as_hash[key] = p
|
1523
|
+
initial_nonterminal = key if i == initial_rule_idx
|
1524
|
+
end
|
1525
|
+
|
1526
|
+
unless initial_nonterminal
|
1527
|
+
raise "Bad grammar: no rule correspnds to an index of #{initial_rule_dix} for initial nonterminal"
|
1528
|
+
end
|
1529
|
+
|
1530
|
+
as_hash[:initial] = initial_nonterminal
|
1531
|
+
@data = as_hash
|
1532
|
+
end
|
1533
|
+
|
1534
|
+
# Canonical representations of keys (symbols) and values (patterns)
|
1535
|
+
grammar_hash = {}
|
1536
|
+
data.each do |nonterminal, pattern|
|
1537
|
+
nonterminal = nonterminal.to_sym if nonterminal.is_a?(String)
|
1538
|
+
raise "Nonterminal symbol can be only a string or a symbol" unless nonterminal.is_a?(Symbol)
|
1539
|
+
|
1540
|
+
# the only case in which we don't specify a rule pattern
|
1541
|
+
if nonterminal == :initial
|
1542
|
+
pattern.must_be_a(String, Symbol)
|
1543
|
+
grammar_hash[:initial] = pattern.to_sym
|
1544
|
+
else
|
1545
|
+
grammar_hash[nonterminal] = RPEG.P(pattern)
|
1546
|
+
end
|
1547
|
+
end
|
1548
|
+
|
1549
|
+
initial_symbol = grammar_hash.delete(:initial)
|
1550
|
+
initial_symbol ||= grammar_hash.keys.first
|
1551
|
+
|
1552
|
+
rule_hash = {}
|
1553
|
+
rule_list = []
|
1554
|
+
|
1555
|
+
grammar_hash.each_with_index do |rule, idx|
|
1556
|
+
nonterminal, rule_pattern = rule
|
1557
|
+
raise "Nonterminal #{nonterminal} appears twice in grammar" if @nonterminal_indices[nonterminal]
|
1558
|
+
|
1559
|
+
rule = Pattern.new(RULE, rule_pattern, data: nonterminal)
|
1560
|
+
rule_list << rule
|
1561
|
+
rule_hash[nonterminal] = rule
|
1562
|
+
@nonterminal_indices[nonterminal] = idx
|
1563
|
+
@nonterminal_by_index[idx] = nonterminal
|
1564
|
+
end
|
1565
|
+
|
1566
|
+
@left = rule_list
|
1567
|
+
@data = initial_symbol.must_be # we don't need the Hash any more
|
1568
|
+
|
1569
|
+
# Traverse a rule rules and fix open calls. Do it in-line so we don't risk traversing the tree(s) via a generic visitor while
|
1570
|
+
# modifying the tree
|
1571
|
+
fix_it = lambda do |node|
|
1572
|
+
return if node.type == GRAMMAR # subgrammars already fixed
|
1573
|
+
return if node.type == CALL # already done
|
1574
|
+
|
1575
|
+
if node.type == OPEN_CALL
|
1576
|
+
ref = node.data
|
1577
|
+
if ref.is_a?(Integer) && ref >= 0
|
1578
|
+
symb_ref = @nonterminal_by_index[ref]
|
1579
|
+
raise "bad grammar index for rule '#{ref}'" unless symb_ref
|
1580
|
+
|
1581
|
+
ref = symb_ref
|
1582
|
+
end
|
1583
|
+
raise "bad grammar reference for rule '#{ref}'" unless @nonterminal_indices[ref]
|
1584
|
+
|
1585
|
+
rule = rule_hash[ref].must_be
|
1586
|
+
node.convert_open_call_to_call!(rule, ref)
|
1587
|
+
return
|
1588
|
+
end
|
1589
|
+
|
1590
|
+
return if node.num_children.zero?
|
1591
|
+
|
1592
|
+
fix_it.call(node.left)
|
1593
|
+
fix_it.call(node.right) if node.num_children == 2
|
1594
|
+
end
|
1595
|
+
|
1596
|
+
rule_list.each { |rule| fix_it.call(rule) }
|
1597
|
+
end
|
1598
|
+
end
|
1599
|
+
|
1600
|
+
########################################
|
1601
|
+
# Monkeypatching
|
1602
|
+
#
|
1603
|
+
# Very annoyingly, Ruby's #coerce mechanism is only used by the Numeric types. This means it doesn't help with things like the
|
1604
|
+
# convenient "a" + P(-1). The only way I can think to make it work is to monkeypatch.
|
1605
|
+
|
1606
|
+
# Technique from https://stackoverflow.com/a/61438012/1299011
|
1607
|
+
# Overloads of the binary operators +, * and - for non-Pattern receivers.
#
# When the right-hand operand is a Pattern, the receiver is converted with RPEG.P and the operator is applied to the
# resulting pattern. For any other operand the call falls through to the receiver's own implementation of the operator,
# so the module is meant to be prepended to a class.
module NonNumericOverloadExtension
  %i[+ * -].each do |operator|
    define_method(operator) do |other|
      # public_send: only the pattern's public operators are reachable from here
      return RPEG.P(self).public_send(operator, other) if other.is_a?(Pattern)

      # Explicit argument: implicit-argument super is not allowed inside define_method
      super(other)
    end
  end
end
|
1616
|
+
|
1617
|
+
# Prepend the operator overloads to each of these core classes, so that the overloads are consulted before the classes'
# own +, * and - methods.
[::String, ::TrueClass, ::FalseClass, ::Hash, ::Array].each do |core_class|
  core_class.prepend(NonNumericOverloadExtension)
end
|
1622
|
+
end
|