rpeg 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +7 -0
- data/README.md +155 -0
- data/Rakefile +9 -0
- data/lib/rpeg/captures.rb +702 -0
- data/lib/rpeg/parsing_machine.rb +457 -0
- data/lib/rpeg/re.rb +233 -0
- data/lib/rpeg/rpeg.rb +1622 -0
- data/lib/rpeg.rb +5 -0
- metadata +81 -0
@@ -0,0 +1,457 @@
|
|
1
|
+
require_relative 'captures'
|
2
|
+
|
3
|
+
# Instances are generated during program generation in Pattern and consumed in the ParsingMachine
|
4
|
+
#
|
5
|
+
# - op_code: the instruction op
|
6
|
+
# - offset: the address offset used in jumps, calls, etc.
|
7
|
+
# - aux: extra information used by instructions like capture
|
8
|
+
# - in LPEG this is used to carefully pack data by bit-twiddling, etc., but we can use anything, such as structs, etc., as needed
|
9
|
+
# - data: this is called "key" in LPEG and is used to store pointers to Lua-based objects, etc.
|
10
|
+
# - we will just store Ruby objects here.
|
11
|
+
# - it contains things like the set of characters for Charset instructions, etc.
|
12
|
+
# - dec: "decorations" for other things like labels that might be useful later in debugging, etc.
|
13
|
+
# - it is ignored by the VM
|
14
|
+
class Instruction
  # Every op code the VM understands. Each one is also exposed as a constant of the same (upcased) name whose value
  # is the symbol itself, e.g. Instruction::TEST_CHAR == :test_char.
  OP_CODES = %i[
    char charset any jump choice call return commit back_commit
    partial_commit span op_end fail fail_twice
    open_capture close_capture close_run_time full_capture behind
    test_char test_charset test_any
  ].freeze
  OP_CODES.each { |op| const_set op.upcase, op }

  # Column widths used by #to_s when lining up a program listing.
  OP_WIDTH = OP_CODES.map(&:length).max
  DECORATION_WIDTH = 15

  attr_reader :op_code, :data, :aux
  attr_accessor :offset, :dec

  # op_code must be one of OP_CODES; anything else raises a RuntimeError. The remaining fields are stored as given.
  def initialize(op_code, offset: nil, data: nil, aux: nil, dec: nil)
    raise "Bad instruction op_code #{op_code}" unless OP_CODES.include?(op_code)

    @op_code = op_code
    @offset = offset
    @data = data
    @aux = aux
    @dec = dec
  end

  # One line of a program listing: the right-justified decoration, the upcased op code, and whichever of
  # offset/data/aux is relevant to that op code.
  #
  # The string is rebuilt on every call. offset and dec are writable, so a cached copy would go stale as soon as
  # either of them is changed after the first call.
  def to_s
    str = (dec || "").to_s.rjust(DECORATION_WIDTH) + " :"
    str << op_code.to_s.upcase.rjust(OP_WIDTH + 1)

    str << " offset: #{offset}" if [TEST_CHAR, TEST_ANY, TEST_CHARSET].include?(op_code)

    case op_code
    when CHAR, TEST_CHAR
      str << " #{data.dump}"
    when BEHIND
      str << " #{aux}"
    when CHARSET, SPAN, TEST_CHARSET
      str << " #{charset_rep(data)}"
    when JUMP, CHOICE, CALL, COMMIT, BACK_COMMIT, PARTIAL_COMMIT
      str << " #{offset}"
    when RETURN, OP_END, FAIL, FAIL_TWICE, ANY, TEST_ANY
      # no-op
    when OPEN_CAPTURE, CLOSE_CAPTURE, FULL_CAPTURE, CLOSE_RUN_TIME
      str << " data:#{data}, aux:#{aux}"
    else
      raise "Unhandled op_code #{op_code} in Instruction#to_s"
    end
    str
  end

  # A shorter representation of a charset: runs of consecutive code points are collapsed to "a..z", lone characters
  # are shown on their own, and the pieces are joined with ", ".
  #
  # Works on the sorted code points rather than an array indexed by code point, so a set containing a single high
  # code point does not allocate an enormous mostly-empty array.
  private def charset_rep(char_set)
    return "" if char_set.empty?

    code_points = char_set.map(&:ord).uniq.sort
    runs = code_points.slice_when { |prev, nxt| nxt != prev + 1 }

    runs.map do |run|
      lo = run.first.chr(Encoding::UTF_8)
      hi = run.last.chr(Encoding::UTF_8)
      (run.size == 1 ? lo : lo + ".." + hi).dump
    end.join(", ")
  end
end
|
108
|
+
|
109
|
+
# The VM used to run the programs generated from the patterns.
|
110
|
+
#
|
111
|
+
# See lpvm.c in the LPEG code.
|
112
|
+
class ParsingMachine
|
113
|
+
# program: the program to run
|
114
|
+
# subject: the string to match against
|
115
|
+
# initial_pos: the position in subject to start the search at
|
116
|
+
# extra_args may have been supplied in the initial #match call. These are consumed by Argument Captures.
|
117
|
+
# Sets up the VM state. Nothing is executed here; call #run (or #step) afterwards.
def initialize(program, subject, initial_pos, extra_args)
  @program = program.clone.freeze.must_only_contain(Instruction)
  # Number of instructions in the program (previously read from @program_size, which is never assigned).
  @prog_len = @program.size

  # When benchmarking with a large subject (4.5 MB) I found that the search took an enormous amount of time (1300 s), and 95% of
  # it was due to 3.6 million calls to String::[]. I don't know why this was so slow, as accessing a large string in a small
  # scratch loop is fast. I am very confused. Converting the string to an array of chars is much faster (1300 s became 9 s).
  # @original_subject = subject.clone.freeze
  @subject = subject.chars.freeze
  @subject_size = @subject.size

  @i_ptr = 0 # index in @program of the next instruction
  @subject_index = initial_pos
  @stack = []
  @breadcrumbs = [].must_only_ever_contain(Capture::Breadcrumb) # the records of the captures we make during parsing
  @bread_count = 0 # the number of breadcrumbs in @breadcrumbs (some in the array may be stale)

  @extra_args = extra_args.clone
end
|
136
|
+
|
137
|
+
# True once the machine has halted on a match, false once it has halted on a failure; nil before either happens.
def success? = @success
|
140
|
+
|
141
|
+
# Flag the machine as halted, which is what stops the #run loop.
private def done! = (@done = true)
|
144
|
+
|
145
|
+
# Whether the machine has halted (set by done!).
private def done? = @done
|
148
|
+
|
149
|
+
# Execute instructions one at a time until the machine flags itself as done.
def run
  until @done
    step
  end
end
|
152
|
+
|
153
|
+
# Execute the single instruction that @i_ptr points at and update the machine state (instruction pointer, subject
# position, backtrack stack, capture breadcrumbs) as that op code requires.
def step
  instr = @program[@i_ptr]

  case instr.op_code
  # TEST_* instructions inspect the current character but never consume it; on a miss they jump by the offset.
  when Instruction::TEST_CHARSET
    test_char(instr.data.include?(@subject[@subject_index]), instr.offset)
  when Instruction::TEST_CHAR
    test_char(instr.data == @subject[@subject_index], instr.offset)
  when Instruction::TEST_ANY
    test_char(@subject_index < @subject_size, instr.offset)
  # The plain character instructions consume one character on a hit and backtrack on a miss.
  when Instruction::ANY
    check_char(@subject_index < @subject_size)
  when Instruction::CHARSET
    check_char(instr.data.include?(@subject[@subject_index]))
  when Instruction::CHAR
    check_char(instr.data == @subject[@subject_index])
  when Instruction::JUMP
    @i_ptr += instr.offset
  when Instruction::CHOICE
    # Remember where the alternative branch lives, then carry on with the first branch.
    push(:state, instr.offset)
    @i_ptr += 1
  when Instruction::CALL
    # Same as a jump except that the address of the following instruction is saved for the matching RETURN.
    push(:instruction, 1)
    @i_ptr += instr.offset
  when Instruction::RETURN
    @i_ptr = pop(:instruction).i_ptr
  when Instruction::COMMIT
    # Drop the saved alternative and jump. The frame is thrown away, but popping with :state still verifies that it
    # really was a full state.
    pop(:state)
    @i_ptr += instr.offset
  when Instruction::PARTIAL_COMMIT
    # A commit immediately followed by a choice, done by refreshing the top frame in place.
    # See Ierusalimschy, sec 4.3
    frame = peek(:state)
    raise "Empty stack for partial commit!" unless frame

    frame.subject_index = @subject_index
    frame.bread_count = @bread_count
    @i_ptr += instr.offset
  when Instruction::BACK_COMMIT
    # Restore the saved subject position and captures as a fail would, but continue at the given offset instead of
    # at the saved alternative. Used by the AND predicate. See Ierusalimschy, 4.4
    frame = pop(:state)
    @subject_index = frame.subject_index
    @bread_count = frame.bread_count
    @i_ptr += instr.offset
  when Instruction::SPAN
    # Repetition over a charset: swallow every matching character. Zero matches is fine, so this cannot fail.
    chars = instr.data
    @subject_index += 1 while chars.include?(@subject[@subject_index])
    @i_ptr += 1
  when Instruction::BEHIND
    distance = instr.aux # the (fixed) length of the pattern we want to match.
    if distance <= @subject_index
      @subject_index -= distance
      @i_ptr += 1
    else
      # There are not that many characters behind us
      handle_fail_ptr
    end
  when Instruction::FAIL
    handle_fail_ptr
  when Instruction::FAIL_TWICE
    # Optimization for NOT: discard the top frame (checking that it is a :state) and then fail as usual.
    # See Ierusalimschy, 4.4
    pop(:state)
    handle_fail_ptr
  when Instruction::CLOSE_RUN_TIME
    # LPEG's version of this is intricate because it has to keep capture records and Lua stack values in step. Here
    # the garbage collector takes care of that, so what is left is close to an ordinary function capture.
    handle_run_time_capture_result(run_time_capture)
  when Instruction::OPEN_CAPTURE
    record_capture(instr, size: 0, subject_index: @subject_index)
  when Instruction::CLOSE_CAPTURE
    # As in LPEG: "if possible, turn capture into a full capture"
    raise "Close capture without an open" unless @bread_count.positive?

    last_crumb = @breadcrumbs[@bread_count - 1].must_be # still on the breadcrumb list
    # TODO: should we care about an upper bound here?
    if last_crumb.size.zero? && (@subject_index - last_crumb.subject_index) < 255
      # The most recent breadcrumb is the matching OPEN, so fold the close into it
      last_crumb.size = @subject_index - last_crumb.subject_index + 1
      @i_ptr += 1
    else
      record_capture(instr, size: 1, subject_index: @subject_index)
    end
  when Instruction::FULL_CAPTURE
    # An all-in-one capture: the "capture length" says how far back in the subject the match began.
    length = (instr.aux[:capture_length] || 0).must_be(Integer)
    record_capture(instr, size: 1 + length, subject_index: @subject_index - length)
  when Instruction::OP_END
    @success = true
    done!
  else
    raise "Unhandled op code #{instr.op_code}"
  end
end
|
255
|
+
|
256
|
+
########################################
|
257
|
+
# Support for a debugger
|
258
|
+
|
259
|
+
# These are internals that aren't useful in the usual case
|
260
|
+
attr_reader :program, :subject, :subject_index, :extra_args, :stack, :i_ptr
|
261
|
+
|
262
|
+
# The live breadcrumbs only: the first @bread_count entries. Anything after them in the array is stale.
def breadcrumbs
  @breadcrumbs.slice(0, @bread_count)
end
|
265
|
+
|
266
|
+
#
|
267
|
+
########################################
|
268
|
+
|
269
|
+
# For this and the handling of the foo_CAPTURE op codes above, see the corresponding LPEG code in lpvm.c
|
270
|
+
#
|
271
|
+
# In that code, captop points to the "next" or "new" capture info, so captop - 1 is the current top.
|
272
|
+
# Append a breadcrumb built from the instruction's data and capture kind, then move on to the next instruction.
private def record_capture(instr, size:, subject_index:)
  crumb = Capture::Breadcrumb.new(size, subject_index, instr.data, instr.aux[:kind].must_be)
  add_capture(crumb)
  @i_ptr += 1
end
|
276
|
+
|
277
|
+
# Store the breadcrumb in the first free slot (overwriting any stale entry there) and bump the live count.
private def add_capture(breadcrumb)
  slot = @bread_count
  @breadcrumbs[slot] = breadcrumb
  @bread_count = slot + 1
end
|
281
|
+
|
282
|
+
# React to a character match or failure
|
283
|
+
# Outcome of a consuming character instruction: on success step past both the instruction and the character,
# otherwise backtrack.
private def check_char(success)
  return handle_fail_ptr unless success

  @i_ptr += 1
  @subject_index += 1
end
|
291
|
+
|
292
|
+
# React to a character match or failure in one of the TestFoo instruction
|
293
|
+
#
|
294
|
+
# IMPORTANT NOTE
|
295
|
+
#
|
296
|
+
# Ierusalimschy's paper describes these as consuming the next character, and code generated for things like char sequences being
|
297
|
+
# tweaked to take this into account. BUT the LPEG code does it differently. These _check_ the current character but do not consume
|
298
|
+
# it: the following test is expected to do so. During code generation the "currently controlling" TEST_FOO is passed along so
|
299
|
+
# followup checks can be optimized. See codechar and codecharset in lpcode.c.
|
300
|
+
# Outcome of a TEST_* instruction: fall through to the next instruction on success, jump by offset otherwise.
# The subject position is left alone either way.
private def test_char(success, offset)
  advance = success ? 1 : offset
  @i_ptr += advance
end
|
303
|
+
|
304
|
+
# We have to backtrack, or fail the match if we can't
|
305
|
+
# Backtrack to the most recent saved state, or fail the whole match if there is none.
#
# Frames of type :instruction (return addresses left by CALL) are discarded along the way. This is done with a loop
# rather than by recursing once per discarded frame, so a deep chain of pending calls cannot overflow the Ruby stack
# while backtracking.
private def handle_fail_ptr
  until @stack.empty?
    frame = pop
    next if frame.type == :instruction

    # A full state: resume at the saved alternative with the saved subject position and capture count.
    @i_ptr = frame.i_ptr
    @subject_index = frame.subject_index
    @bread_count = frame.bread_count
    return
  end

  # Nothing left to backtrack to
  @success = false
  done!
end
|
319
|
+
|
320
|
+
########################################
|
321
|
+
# Stack manipulation
|
322
|
+
|
323
|
+
Frame = Struct.new :type, :i_ptr, :subject_index, :bread_count
|
324
|
+
|
325
|
+
# We push either
|
326
|
+
# - an instruction pointer, which may later be used to jump, etc, or
|
327
|
+
# - the current state with an offset, which is the [instr ptr + offset, subject_index, breadcrumb list] triple.
|
328
|
+
# Push a frame whose instruction pointer is the current one plus offset. A :state frame also records the current
# subject position and breadcrumb count; an :instruction frame holds the address only.
private def push(type, offset)
  raise "Must push something onto stack" unless offset
  raise "Bad stack frame type" unless %i[instruction state].include?(type)

  saved = type == :state ? [@subject_index, @bread_count] : []
  @stack.push Frame.new(type, @i_ptr + offset, *saved)
end
|
339
|
+
|
340
|
+
# Pop and return the top stack frame. If expected_type is non nil check that the frame has that type
|
341
|
+
#
|
342
|
+
# Raise if stack is empty
|
343
|
+
# Remove the top frame and hand it back. Raises when the stack is empty, and also when expected_type is given and
# the frame is of a different type.
private def pop(expected_type = nil)
  raise "Nothing in stack to pop" if @stack.empty?

  @stack.pop.tap { |frame| check_frame(frame, expected_type) }
end
|
350
|
+
|
351
|
+
# Peek and return the top of the stack without popping it. Return nil if the stack is empty.
|
352
|
+
#
|
353
|
+
# If expecting is given make sure that the top of the stack is of the given type
|
354
|
+
# Look at the top frame without removing it; nil when the stack is empty. When expected_type is given the frame's
# type is verified first.
private def peek(expected_type = nil)
  return if @stack.empty?

  @stack.last.tap { |frame| check_frame(frame, expected_type) }
end
|
361
|
+
|
362
|
+
# Raise unless the frame has the expected type. With no expected type there is nothing to verify.
private def check_frame(frame, expected_type)
  return unless expected_type
  return if frame.type == expected_type

  raise "Top of stack is of type #{frame.type}, not of expected type #{expected_type}"
end
|
367
|
+
|
368
|
+
########################################
|
369
|
+
# Capture extraction code
|
370
|
+
|
371
|
+
# From the ICloseRuntIme main-loop switch statement in lpvm.c
|
372
|
+
# Act on what a match-time capture function returned. The first value is the directive: falsy fails the match,
# true keeps the current subject position, and an Integer becomes the new position. Any further values become
# captures.
def handle_run_time_capture_result(results)
  directive, *dyn_captures = results
  unless directive
    handle_fail_ptr
    return
  end

  unless directive == true
    directive.must_be_a(Integer)
    # The new position may not move backwards or run past the end of the subject
    raise 'invalid position returned by match-time capture' unless directive.between?(@subject_index, @subject_size)

    @subject_index = directive
  end

  if dyn_captures.empty?
    # Nothing was captured, so just drop the OPEN breadcrumb that is still on the list
    @bread_count -= 1
  else
    # This is LPEG's adddyncaptures in lpvm.c
    @breadcrumbs[@bread_count - 1].data = nil # make the group capture an anonymous group
    # LPEG marks these with a dedicated RUNTIME kind so they can be found again if they have to be removed. A CONST
    # capture would seem to do here, but we follow LPEG just in case.
    dyn_captures.each do |value|
      add_capture Capture::Breadcrumb.new(1, @subject_index, value, Capture::RUNTIME)
    end
    add_capture Capture::Breadcrumb.new(1, @subject_index, nil, Capture::CLOSE) # close the group
  end
  @i_ptr += 1
end
|
405
|
+
|
406
|
+
# Returns the captures obtained when we ran the machine.
|
407
|
+
#
|
408
|
+
# If there are no captures we return the final index into the subject string. This is typically one past the matched section.
|
409
|
+
# If there is exactly one capture we return it.
|
410
|
+
# If there are multiple captures we return them in an array.
|
411
|
+
#
|
412
|
+
# The capture code in LPEG (mostly in lpcap.c) looks complicated at first but it is made up of a bunch of pieces that each do one
|
413
|
+
# thing and coordinate well together. Some extra complexity comes from the manual memory management required in C and the need to
|
414
|
+
# interact with Lua values - this appears to be especially the case with the Runtime capture code, which is bewildering at first
|
415
|
+
# view. Porting it one capture kind at a time let me understand it at some level as I went.
|
416
|
+
#
|
417
|
+
# Basic model:
|
418
|
+
#
|
419
|
+
# - We push Breadcrumb objects onto the stack as we run the VM based on the instructions generated from the patterns. We never pop
|
420
|
+
# anything from the stack: the Captures are breadcrumbs that let us work out after the fact what happened. Things do get removed
|
421
|
+
# from the Capture stack but only at backtrack points because a match has failed.
|
422
|
+
# - The End instruction tacks on an unbalanced CloseCapture. This appears to be simply an end-marker like the null string
|
423
|
+
# terminator. We don't do this
|
424
|
+
# - After the VM runs we analyze the Breadcrumbs to calculate the captures. We go back and forth through the data. So it isn't a
|
425
|
+
# stack, but an array.
|
426
|
+
#
|
427
|
+
# This method plays the same role as LPEG's getcaptures (lpcap.c)
|
428
|
+
# Compute the captures of a finished, successful run. Raises if the machine is not done or did not succeed.
# Returns the final subject index when nothing was captured, the lone capture when there is exactly one, and the
# array of captures otherwise.
def captures
  raise "Cannot call #captures unless machine ran sucessfully" unless done? && success?

  @capture_state = new_capture_state
  @capture_state.capture_all

  found = @capture_state.captures
  if found.empty?
    @subject_index
  elsif found.size == 1
    found.first
  else
    found
  end
end
|
441
|
+
|
442
|
+
# This stub needs to be in ParsingMachine and not CaptureState because it must modify @bread_count
|
443
|
+
# Evaluate the match-time capture that has just been closed. Updates @bread_count and returns the capture's result.
def run_time_capture
  # The capture code needs to start on the CLOSE we just reached. In LPEG no such record exists yet: the slot after
  # the last capture is simply read as if it were one. Here the CLOSE has to be added explicitly.
  add_capture Capture::Breadcrumb.new(0, @subject_index, nil, Capture::CLOSE)
  close_index = @bread_count - 1 # the CLOSE we just tacked on

  @bread_count, outcome = new_capture_state(close_index).run_time_capture
  outcome
end
|
453
|
+
|
454
|
+
# Build a CaptureState over the live breadcrumbs, the subject, the current subject position and the extra arguments.
def new_capture_state(starting_index = nil)
  live_crumbs = @breadcrumbs[0, @bread_count]
  CaptureState.new(live_crumbs, @subject, @subject_index, @extra_args, starting_index: starting_index)
end
|
457
|
+
end
|
data/lib/rpeg/re.rb
ADDED
@@ -0,0 +1,233 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative 'rpeg'
|
4
|
+
|
5
|
+
# A straight port of LPEG's re module, though without any support for locales
|
6
|
+
module RPEG::RE
|
7
|
+
extend self
|
8
|
+
|
9
|
+
Pattern = RPEG::Pattern
|
10
|
+
|
11
|
+
@mem = {} # memo space for patterns
|
12
|
+
@fmem = {}
|
13
|
+
@gmem = {}
|
14
|
+
|
15
|
+
# What does "compiled" mean here?
|
16
|
+
#
|
17
|
+
# Oh. Maybe it is the Pattern built from the regexp-y thing.
|
18
|
+
# Turn a pattern written in re syntax into a Pattern by matching it against PATTERN. A Pattern is returned as is.
# The optional defs are forwarded as extra match arguments; with none given a single empty hash is passed.
# Raises "incorrect pattern" when the match yields nothing.
def compile(pattern, *defs)
  return pattern if pattern.is_a?(Pattern)

  defs << {} if defs.empty? # for the sake of p_def, below

  PATTERN.match(pattern, 0, *defs) || raise("incorrect pattern")
end
|
28
|
+
|
29
|
+
# Match str against pattern starting at start_pos. The compiled form of each pattern is kept in @mem and reused.
def match(str, pattern, start_pos = 0)
  compiled = @mem[pattern]
  compiled ||= (@mem[pattern] = compile(pattern))
  compiled.match(str, start_pos)
end
|
33
|
+
|
34
|
+
# Search str for pattern from start_pos onwards. Returns [start, last] with last being one less than the position
# captured after the match, or nil when the search matcher yields nothing. Search matchers are kept in @fmem.
def find(str, pattern, start_pos = 0)
  searcher = @fmem[pattern] || begin
    core = compile(pattern) / 0
    @fmem[pattern] = RPEG.P([RPEG.Cp() * core * RPEG.Cp() + 1 * RPEG.V(0)])
  end

  first, past_end = searcher.match(str, start_pos)

  [first, past_end - 1] if first
end
|
46
|
+
|
47
|
+
# Run the substitution matcher for (pattern, rep) over str and return its result. Matchers are built on first use
# and kept in @gmem, keyed by pattern and then by rep.
def gsub(str, pattern, rep)
  by_rep = (@gmem[pattern] ||= {})
  substituter = (by_rep[rep] ||= RPEG.Cs((compile(pattern) / rep + 1)**0))
  substituter.match(str)
end
|
58
|
+
|
59
|
+
# Builds the pattern that parses re-syntax strings and turns them into Patterns. It is evaluated once, when the
# module body assigns its result to PATTERN, which #compile then matches against.
#
# The structure follows LPEG's re.lua: character-class definitions first, then the small helper lambdas used as
# capture functions, then the grammar itself.
private def internals
  m = RPEG

  any = m.P(1)
  lower = m.R("az")
  upper = m.R("AZ")
  alpha = lower + upper
  digit = m.R("09")
  alnum = alpha + digit
  space = m.S(" \n\t")
  printable = m.R(' ~')
  word = alnum
  predef = {
    alpha:, digit:, lower:, upper:, space:, alnum:, word:,
    graph: printable - space,
    xdigit: digit + m.R("af", "AF"),
    punct: printable - (space + alnum),
    cntrl: any - printable
  }
  # One-letter aliases: the lower-case initial names the class itself, the upper-case initial its complement.
  predef.keys.each do |key|
    short = key.to_s[0].to_sym
    predef[short] = predef[key]
    predef[short.upcase] = any - predef[key]
  end
  predef[:nl] = m.P("\n")

  name = m.R("AZ", "az", "__") * m.R("AZ", "az", "__", "09")**0
  p_space = (predef[:space] + "--" * (any - predef[:nl])**0)**0

  arrow = p_space * "<-"
  seq_follow = m.P("/") + ")" + "}" + ":}" + "~}" + "|}" + (name * arrow) + -1
  name = m.C(name)

  # -- a defined name only have meaning in a given environment
  p_def = name * m.Carg(1)

  num = m.C(m.R("09")**1) * p_space / ->(s) { s.to_i }
  p_spacetring = "'" * m.C((any - "'")**0) * "'" + '"' * m.C((any - '"')**0) * '"'

  # Looks a name up in the caller's definitions (by string or symbol key), falling back to the predefined classes.
  defined = "%" * p_def / lambda do |c, defs|
    cat = (defs && (defs[c] || defs[c.to_sym])) || predef[c.to_sym]
    raise "name '#{c}' undefined" unless cat

    cat
  end

  # Why do we have this as well as defined ?
  # (Unlike defined, this one consults only the caller's definitions, not predef.)
  getdef = lambda do |id, defs|
    c = defs && (defs[id] || defs[id.to_sym])
    raise "undefined name: #{id}" unless c

    c
  end

  p_range = m.Cs(any * (m.P("-") / "") * (any - "]")) / ->(s) { m.R(s) }
  item = (defined + p_range + m.C(any)) / ->(a) { m.P(a) }

  p_class = "[" *
            m.C(m.P("^")**-1) * # -- optional complement symbol
            m.Cf(item * (item - "]")**0, ->(y, z) { y + z }) / ->(c, p) { c == "^" ? any - p : p } *
            "]"

  # Raises with up to 20 characters of the pattern text starting at the offending position.
  patt_error = lambda do |s, i, *|
    msg = s.length < i + 20 ? s[i...] : "#{s[i, 20]}..."
    msg = "pattern error near '#{msg}'"
    raise msg # re.lua has error(msg, 2) but I don't know what that does
  end

  # patt repeated exactly n times, built by repeated squaring.
  mult = lambda do |patt, n|
    np = m.P(true)
    while n >= 1
      np *= patt if n.odd?
      patt *= patt
      n /= 2
    end
    np
  end

  # Match-time check for back references: succeeds, returning the position just past it, only when the subject at
  # position i continues with the previously captured string c.
  equalcap = lambda do |s, i, c|
    return nil unless c.is_a?(String)

    e = c.length + i
    e if s[i..(e - 1)] == c
  end

  # Adds rule k to the grammar table t. A duplicate rule name is an error; this used to call Lua's error(), which
  # does not exist in Ruby and so surfaced as a NoMethodError instead of the message below.
  adddef = lambda do |t, k, exp|
    raise "'#{k}' already defined as a rule" if t[k]

    t[k] = exp
    t
  end

  firstdef = ->(n, r) { adddef.call({}, n, r) }

  f_open_call = lambda do |n, b|
    raise "rule '#{n}' used outside a grammar" unless b

    m.V(n)
  end

  # -- match a name and return a group of its corresponding definition
  # -- and 'f' (to be folded in 'Suffix')
  defwithfunc = ->(f) { m.Cg(p_def / getdef * m.Cc(f)) }

  patt_add = ->(p1, p2) { p1 + p2 }
  patt_mul = ->(p1, p2) { p1 * p2 }
  patt_rpt = ->(p1, n) { p1**n }
  patt_replace = ->(p1, rep) { p1 / rep }
  pos_capture = ->(*) { m.Cp }
  # Base 10 is given explicitly: Integer() alone reads a leading zero as an octal prefix, so "+08" would raise and
  # "+010" would come out as 8.
  tonumber = ->(s) { Integer(s, 10) }

  # A lambda that forwards its arguments to the RPEG function named fun.
  call_patt = lambda do |fun|
    ->(*args) { RPEG.send(fun, *args) }
  end

  exp = m.P(
    {
      initial: :Exp,
      Exp: p_space * (
        m.V("Grammar") +
        m.Cf(m.V("Seq") * ("/" * p_space * m.V("Seq"))**0, patt_add)
      ),
      Seq: m.Cf(m.Cc(m.P("")) * m.V("Prefix")**0, patt_mul) * (+seq_follow + patt_error),
      Prefix: ("&" * p_space * m.V("Prefix") / ->(p) { +p } +
               "!" * p_space * m.V("Prefix") / ->(p) { -p } +
               m.V("Suffix")),
      Suffix: m.Cf(m.V("Primary") * p_space *
                   ((m.P("+") * m.Cc(1, patt_rpt) +
                     m.P("*") * m.Cc(0, patt_rpt) +
                     m.P("?") * m.Cc(-1, patt_rpt) +
                     "^" * (
                       m.Cg(num * m.Cc(mult)) +
                       m.Cg(m.C(m.S("+-") * m.R("09")**1) / tonumber * m.Cc(patt_rpt))
                     ) +
                     "->" * p_space * (
                       m.Cg((p_spacetring + num) * m.Cc(patt_replace)) +
                       m.P("{}") * m.Cc(nil, call_patt[:Ct]) +
                       defwithfunc[patt_replace]
                     ) +
                     "=>" * p_space * defwithfunc[call_patt[:Cmt]] +
                     "~>" * p_space * defwithfunc[call_patt[:Cf]]
                    ) * p_space
                   )**0, ->(a, b, f) { f.call(a, b) }),
      Primary: (
        "(" * m.V("Exp") * ")" +
        p_spacetring / call_patt.call(:P) +
        p_class +
        defined +
        "{:" * (name * ":" + m.Cc(nil)) * m.V("Exp") * ":}" / ->(n, p) { m.Cg(p, n) } +
        "=" * name / ->(n) { m.Cmt(m.Cb(n), ->(*args) { equalcap[*args] }) } +
        m.P("{}") / pos_capture +
        "{~" * m.V("Exp") * "~}" / call_patt.call(:Cs) +
        "{|" * m.V("Exp") * "|}" / call_patt.call(:Ct) +
        "{" * m.V("Exp") * "}" / call_patt.call(:C) +
        m.P(".") * m.Cc(any) +
        (name * -arrow + "<" * name * ">") * m.Cb("G") / ->(*args) { f_open_call.call(*args) }
      ),
      Definition: name * arrow * m.V("Exp"),
      Grammar: (
        m.Cg(m.Cc(true), "G") *
        m.Cf(
          m.V("Definition") / firstdef * m.Cg(m.V("Definition"))**0,
          adddef
        ) / call_patt.call(:P)
      )
    }
  )

  # The named group "G" records whether we are inside a grammar (false at the top level); f_open_call reads it back.
  p_space * m.Cg(m.Cc(false), "G") * exp / call_patt.call(:P) * (-any + patt_error)
end
|
231
|
+
|
232
|
+
PATTERN = internals
|
233
|
+
end
|