rpeg 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,457 @@
1
+ require_relative 'captures'
2
+
3
+ # Instances are generated during program generation in Pattern and consumed in the ParsingMachine
4
+ #
5
+ # - op_code: the instruction op
6
+ # - offset: the address offset used in jumps, calls, etc.
7
+ # - aux: extra information used by instructions like capture
8
+ # - in LPEG this is used to carefully pack data by bit-twiddling, etc., but we can use anything, such as structs, etc., as needed
9
+ # - data: this is called "key" in LPEG and is used to store pointers to Lua-based objects, etc.
10
+ # - we will just store Ruby objects here.
11
+ # - it contains things like the set of characters for Charset instructions, etc.
12
+ # - dec: "decorations" for other things like labels that might be useful later in debugging, etc.
13
+ # - it is ignored by the VM
14
# One instruction of a compiled pattern program.
#
# - op_code: one of OP_CODES (also available as constants, e.g. Instruction::CHAR)
# - offset:  relative address used by jumps, calls, commits and the TEST_* instructions
# - data:    operand payload; #to_s treats it as a String for CHAR/TEST_CHAR and as a collection of
#            single characters for CHARSET/SPAN/TEST_CHARSET
# - aux:     extra per-instruction information (shown for BEHIND and the capture instructions)
# - dec:     a "decoration" (label, etc.) that is only used when rendering the instruction
class Instruction
  OP_CODES = %i[
    char charset any jump choice call return commit back_commit
    partial_commit span op_end fail fail_twice
    open_capture close_capture close_run_time full_capture behind
    test_char test_charset test_any
  ].freeze

  # Expose each op code as a constant: Instruction::CHAR == :char, and so on.
  OP_CODES.each { |op| const_set(op.upcase, op) }

  OP_WIDTH = OP_CODES.map(&:length).max
  DECORATION_WIDTH = 15

  attr_reader :op_code, :data, :aux, :offset, :dec

  # Raises RuntimeError when op_code is not one of OP_CODES.
  def initialize(op_code, offset: nil, data: nil, aux: nil, dec: nil)
    raise "Bad instruction op_code #{op_code}" unless OP_CODES.include?(op_code)

    @op_code = op_code
    @offset = offset
    @data = data
    @aux = aux
    @dec = dec
  end

  # offset and dec are both part of the #to_s rendering, so changing either must drop the memoized string.
  def offset=(value)
    @to_s = nil
    @offset = value
  end

  def dec=(value)
    @to_s = nil
    @dec = value
  end

  # Human-readable, column-aligned rendering. The result is memoized until offset or dec is reassigned.
  #
  # Raises RuntimeError if the op code has no rendering rule.
  def to_s
    return @to_s if @to_s

    str = (dec || "").to_s.rjust(DECORATION_WIDTH) + " :"
    str << op_code.to_s.upcase.rjust(OP_WIDTH + 1)

    str << " offset: #{offset}" if [TEST_CHAR, TEST_ANY, TEST_CHARSET].include?(op_code)

    case op_code
    when CHAR, TEST_CHAR
      str << " #{data.dump}"
    when BEHIND
      str << " #{aux}"
    when CHARSET, SPAN, TEST_CHARSET
      str << " #{charset_rep(data)}"
    when JUMP, CHOICE, CALL, COMMIT, BACK_COMMIT, PARTIAL_COMMIT
      str << " #{offset}"
    when RETURN, OP_END, FAIL, FAIL_TWICE, ANY, TEST_ANY
      # nothing more to show
    when OPEN_CAPTURE, CLOSE_CAPTURE, FULL_CAPTURE, CLOSE_RUN_TIME
      str << " data:#{data}, aux:#{aux}"
    else
      raise "Unhandled op_code #{op_code} in Instruction#to_s"
    end
    @to_s = str
  end

  # A shorter representation of a charset: runs of consecutive code points are collapsed to "a..z".
  #
  # char_set must respond to #empty? and #each, yielding single-character strings.
  private def charset_rep(char_set)
    return "" if char_set.empty?

    code_points = []
    char_set.each { |ch| code_points << ch.ord }

    # Work on the sorted code points rather than an array indexed by code point, so a single high
    # code point does not force a huge sparse array.
    runs = code_points.uniq.sort.slice_when { |prev, nxt| nxt != prev + 1 }

    runs.map do |run|
      low = run.first.chr(Encoding::UTF_8)
      high = run.last.chr(Encoding::UTF_8)
      (run.size == 1 ? low : "#{low}..#{high}").dump
    end.join(", ")
  end
end
108
+
109
+ # The VM used to run the programs generated from the patterns.
110
+ #
111
+ # See lpvm.c in the LPEG code.
112
# The VM that runs the programs generated from patterns. See lpvm.c in the LPEG code.
#
# Typical use: build with a program and subject, call #run, then check #success? and (on success) #captures.
class ParsingMachine
  # program: the program to run (a sequence of Instruction objects)
  # subject: the string to match against
  # initial_pos: the position in subject to start the search at
  # extra_args may have been supplied in the initial #match call. These are consumed by Argument Captures.
  def initialize(program, subject, initial_pos, extra_args)
    @program = program.clone.freeze.must_only_contain(Instruction)
    @prog_len = @program.size

    # When benchmarking with a large subject (4.5 MB) I found that the search took an enormous amount of time (1300 s), and 95% of
    # it was due to 3.6 million calls to String::[]. I don't know why this was so slow, as accessing a large string in a small
    # scratch loop is fast. I am very confused. Converting the string to an array of chars is much faster (1300 s became 9 s).
    # @original_subject = subject.clone.freeze
    @subject = subject.chars.freeze
    @subject_size = @subject.size

    @i_ptr = 0 # index in @program of the next instruction
    @subject_index = initial_pos
    @stack = []
    @breadcrumbs = [].must_only_ever_contain(Capture::Breadcrumb) # the records of the captures we make during parsing
    @bread_count = 0 # the number of breadcrumbs in @breadcrumbs (some in the array may be stale)

    @extra_args = extra_args.clone
  end

  # Truthy once the machine has reached OP_END; false once it has failed with nothing left to backtrack to;
  # nil while the machine has not finished.
  def success?
    @success
  end

  private def done!
    @done = true
  end

  private def done?
    @done
  end

  # Execute instructions until the program either reaches OP_END or fails outright.
  def run
    step until @done
  end

  # Execute the single instruction at @i_ptr.
  #
  # Raises RuntimeError on an unknown op code or on a stack/capture invariant violation.
  def step
    instr = @program[@i_ptr]

    case instr.op_code
    when Instruction::TEST_CHARSET
      test_char(instr.data.include?(@subject[@subject_index]), instr.offset)
    when Instruction::TEST_CHAR
      test_char(instr.data == @subject[@subject_index], instr.offset)
    when Instruction::TEST_ANY
      test_char(@subject_index < @subject_size, instr.offset)
    when Instruction::ANY
      check_char(@subject_index < @subject_size)
    when Instruction::CHARSET
      check_char(instr.data.include?(@subject[@subject_index]))
    when Instruction::CHAR
      check_char(instr.data == @subject[@subject_index])
    when Instruction::JUMP
      @i_ptr += instr.offset
    when Instruction::CHOICE
      # We push the offset for the other side of the choice
      push(:state, instr.offset)
      @i_ptr += 1
    when Instruction::CALL
      # Call is like jump, but we push the return address onto the stack first
      push(:instruction, 1)
      @i_ptr += instr.offset
    when Instruction::RETURN
      @i_ptr = pop(:instruction).i_ptr
    when Instruction::COMMIT
      # We pop and discard the top of the stack (which must be a full state) and then do the jump given by the offset. Even
      # though we are discarding it, check that it was a full state for sanity.
      _ = pop(:state)
      @i_ptr += instr.offset
    when Instruction::PARTIAL_COMMIT
      # Sort of a combination of commit (which pops) and choice (which pushes), but we just tweak the top of the stack. See
      # Ierusalimschy, sec 4.3
      stack_top = peek(:state)
      raise "Empty stack for partial commit!" unless stack_top

      stack_top.subject_index = @subject_index
      stack_top.bread_count = @bread_count
      @i_ptr += instr.offset
    when Instruction::BACK_COMMIT
      # A combination of a fail and a commit. We backtrack, but then jump to the specified instruction rather than using the
      # backtrack label. It's used for the AND pattern. See Ierusalimschy, 4.4
      stack_top = pop(:state)
      @subject_index = stack_top.subject_index
      @bread_count = stack_top.bread_count
      @i_ptr += instr.offset
    when Instruction::SPAN
      # Special instruction for when we are repeating over a charset, which is common. We just consume as many matching
      # characters as there are. This never fails as we can always match at least zero.
      @subject_index += 1 while instr.data.include?(@subject[@subject_index])
      @i_ptr += 1
    when Instruction::BEHIND
      n = instr.aux # the (fixed) length of the pattern we want to match.
      if n > @subject_index
        # We can't jump so far back in the subject
        handle_fail_ptr
      else
        @subject_index -= n
        @i_ptr += 1
      end
    when Instruction::FAIL
      handle_fail_ptr
    when Instruction::FAIL_TWICE
      # An optimization for the NOT implementation. We pop the top of the stack and discard it, and then enter the fail routine
      # again. For sanity's sake we'll check that the thing we are popping is a :state entry. See Ierusalimschy, 4.4
      _ = pop(:state)
      handle_fail_ptr
    when Instruction::CLOSE_RUN_TIME
      # The LPEG code for runtime captures is very complicated. Reading through it, it appears that the complexity comes from
      # needing to carefully manage the capture breadcrumbs wrt to the Lua values living on the Lua stack to avoid memory
      # leaks. We don't have to worry about that here, as everything is in Ruby and we can leave the hard stuff to the garbage
      # collector. The remaining work is little more than we have with a function capture.
      result = run_time_capture
      handle_run_time_capture_result(result)
    when Instruction::OPEN_CAPTURE
      record_capture(instr, size: 0, subject_index: @subject_index)
    when Instruction::CLOSE_CAPTURE
      # As in LPEG: "if possible, turn capture into a full capture"
      raise "Close capture without an open" unless @bread_count.positive?

      lc = @breadcrumbs[@bread_count - 1].must_be # still on the breadcrumb list
      if lc.size.zero? && (@subject_index - lc.subject_index) < 255 # TODO: should we care about an upper bound here?
        # The previous breadcrumb was an OPEN, and we are closing it
        lc.size = @subject_index - lc.subject_index + 1
        @i_ptr += 1
      else
        record_capture(instr, size: 1, subject_index: @subject_index)
      end
    when Instruction::FULL_CAPTURE
      # We have an all-in-one match, and the "capture length" tells us how far back in the subject the match started.
      len = (instr.aux[:capture_length] || 0).must_be(Integer)
      record_capture(instr, size: 1 + len, subject_index: @subject_index - len)
    when Instruction::OP_END
      @success = true
      done!
    else
      raise "Unhandled op code #{instr.op_code}"
    end
  end

  ########################################
  # Support for a debugger

  # These are internals that aren't useful in the usual case
  attr_reader :program, :subject, :subject_index, :extra_args, :stack, :i_ptr

  # Only the live breadcrumbs: entries at or beyond @bread_count may be stale leftovers from backtracking.
  def breadcrumbs
    @breadcrumbs[0, @bread_count]
  end

  #
  ########################################

  # For this and the handling of the foo_CAPTURE op codes above, see the corresponding LPEG code in lpvm.c
  #
  # In that code, captop points to the "next" or "new" capture info, so captop - 1 is the current top.
  private def record_capture(instr, size:, subject_index:)
    add_capture Capture::Breadcrumb.new(size, subject_index, instr.data, instr.aux[:kind].must_be)
    @i_ptr += 1
  end

  # Store breadcrumb in the next live slot, overwriting any stale entry left there by backtracking.
  private def add_capture(breadcrumb)
    @breadcrumbs[@bread_count] = breadcrumb
    @bread_count += 1
  end

  # React to a character match or failure: consume the character on success, backtrack otherwise.
  private def check_char(success)
    if success
      @i_ptr += 1
      @subject_index += 1
    else
      handle_fail_ptr
    end
  end

  # React to a character match or failure in one of the TestFoo instructions
  #
  # IMPORTANT NOTE
  #
  # Ierusalimschy's paper describes these as consuming the next character, and code generated for things like char sequences being
  # tweaked to take this into account. BUT the LPEG code does it differently. These _check_ the current character but do not consume
  # it: the following test is expected to do so. During code generation the "currently controlling" TEST_FOO is passed along so
  # followup checks can be optimized. See codechar and codecharset in lpcode.c.
  private def test_char(success, offset)
    @i_ptr += success ? 1 : offset
  end

  # We have to backtrack, or fail the match if we can't.
  #
  # Return addresses (:instruction frames) are discarded until a full :state frame is found and restored. This is a loop rather
  # than recursion so that a long run of pending calls cannot exhaust the Ruby call stack.
  private def handle_fail_ptr
    until @stack.empty?
      top = pop
      next if top.type == :instruction

      @i_ptr = top.i_ptr
      @subject_index = top.subject_index
      @bread_count = top.bread_count
      return
    end

    # Nothing left to backtrack to: the whole match fails.
    @success = false
    done!
  end

  ########################################
  # Stack manipulation

  Frame = Struct.new :type, :i_ptr, :subject_index, :bread_count

  # We push either
  # - an instruction pointer, which may later be used to jump, etc, or
  # - the current state with an offset, which is the [instr ptr + offset, subject_index, breadcrumb count] triple.
  private def push(type, offset)
    raise "Must push something onto stack" unless offset
    raise "Bad stack frame type" unless %i[instruction state].include?(type)

    frame = if type == :state
              Frame.new(type, @i_ptr + offset, @subject_index, @bread_count)
            else
              Frame.new(type, @i_ptr + offset)
            end
    @stack.push frame
  end

  # Pop and return the top stack frame. If expected_type is non nil check that the frame has that type
  #
  # Raise if stack is empty
  private def pop(expected_type = nil)
    raise "Nothing in stack to pop" if @stack.empty?

    frame = @stack.pop
    check_frame(frame, expected_type)
    frame
  end

  # Peek and return the top of the stack without popping it. Return nil if the stack is empty.
  #
  # If expected_type is given make sure that the top of the stack is of the given type
  private def peek(expected_type = nil)
    return nil if @stack.empty?

    frame = @stack.last
    check_frame(frame, expected_type)
    frame
  end

  # Raise unless frame has the expected type. A nil expected_type disables the check.
  private def check_frame(frame, expected_type)
    return unless expected_type

    raise "Top of stack is of type #{frame.type}, not of expected type #{expected_type}" unless frame.type == expected_type
  end

  ########################################
  # Capture extraction code

  # From the ICloseRunTime main-loop switch statement in lpvm.c
  #
  # results is the array returned by #run_time_capture: a directive followed by any dynamic capture values.
  # - a falsy directive fails the match at this point
  # - true keeps the current subject position
  # - an Integer moves the subject position there; it must not be before the current position or past the end of the subject
  def handle_run_time_capture_result(results)
    directive, *dyn_captures = results
    unless directive
      handle_fail_ptr
      return
    end

    @subject_index = if directive == true
                       @subject_index
                     else
                       directive.must_be_a(Integer)
                       if directive < @subject_index || directive > @subject_size
                         raise 'invalid position returned by match-time capture'
                       end

                       directive
                     end

    if dyn_captures.empty?
      # no dynamic captures. Just get rid of the OPEN capture we still have
      @bread_count -= 1
    else
      # This is LPEG's adddyncaptures in lpvm.c
      @breadcrumbs[@bread_count - 1].data = nil # make the group capture an anonymous group
      dyn_captures.each do |cap_val|
        # LPEG uses a special RUNTIME capture kind here to help find these things later if they need to be removed. We don't appear
        # to need it - we could just use a CONST capture. But let's follow LPEG just in case.
        add_capture Capture::Breadcrumb.new(1, @subject_index, cap_val, Capture::RUNTIME)
      end
      add_capture Capture::Breadcrumb.new(1, @subject_index, nil, Capture::CLOSE) # close the group
    end
    @i_ptr += 1
  end

  # Returns the captures obtained when we ran the machine.
  #
  # If there are no captures we return the final index into the subject string. This is typically one past the matched section.
  # If there is exactly one capture we return it.
  # If there are multiple captures we return them in an array.
  #
  # Raises RuntimeError unless the machine has finished successfully.
  #
  # The capture code in LPEG (mostly in lpcap.c) looks complicated at first but it is made up of a bunch of pieces that each do one
  # thing and coordinate well together. Some extra complexity comes from the manual memory management required in C and the need to
  # interact with Lua values - this appears to be especially the case with the Runtime capture code, which is bewildering at first
  # view. Porting it one capture kind at a time let me understand it at some level as I went.
  #
  # Basic model:
  #
  # - We push Breadcrumb objects onto the stack as we run the VM based on the instructions generated from the patterns. We never pop
  #   anything from the stack: the Captures are breadcrumbs that let us work out after the fact what happened. Things do get removed
  #   from the Capture stack but only at backtrack points because a match has failed.
  # - The End instruction tacks on an unbalanced CloseCapture. This appears to be simply an end-marker like the null string
  #   terminator. We don't do this
  # - After the VM runs we analyze the Breadcrumbs to calculate the captures. We go back and forth through the data. So it isn't a
  #   stack, but an array.
  #
  # This method plays the same role as LPEG's getcaptures (lpcap.c)
  def captures
    raise "Cannot call #captures unless machine ran sucessfully" unless done? && success?

    @capture_state = new_capture_state
    @capture_state.capture_all

    result = @capture_state.captures

    return @subject_index if result.empty?
    return result.first if result.size == 1

    result
  end

  # This stub needs to be in ParsingMachine and not CaptureState because it must modify @bread_count
  def run_time_capture
    # We need to point to the close capture we just hit. LPEG is tricksy here: there isn't actually a CLOSE capture/breadcrumb yet,
    # but the data structure - an array of Capture objects - means that the "next capture" memory is interpreted as a Capture. We
    # have to do something manually that in the C code happens "automatically"
    add_capture Capture::Breadcrumb.new(0, @subject_index, nil, Capture::CLOSE)
    capture_state = new_capture_state(@bread_count - 1) # start on the CLOSE we just tacked on

    @bread_count, result = capture_state.run_time_capture
    result
  end

  # Build a CaptureState over the live breadcrumbs, optionally positioned at starting_index.
  def new_capture_state(starting_index = nil)
    CaptureState.new(@breadcrumbs[0, @bread_count], @subject, @subject_index, @extra_args, starting_index: starting_index)
  end
end
data/lib/rpeg/re.rb ADDED
@@ -0,0 +1,233 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative 'rpeg'
4
+
5
+ # A straight port of LPEG's re module, though without any support for locales
6
# A straight port of LPEG's re module, though without any support for locales.
#
# Patterns written in the re syntax are parsed by PATTERN (itself an RPEG pattern, built once by #internals) into RPEG patterns.
module RPEG::RE
  extend self

  Pattern = RPEG::Pattern

  # Memo tables. These are ordinary Hashes, so compiled patterns are retained for the life of the process.
  @mem = {} # memo space for patterns used by #match
  @fmem = {} # ... by #find
  @gmem = {} # ... by #gsub: pattern => { replacement => compiled pattern }

  # Turn a pattern written in re syntax into an RPEG Pattern. Something that is already a Pattern is returned as is.
  #
  # defs: optional table(s) of named definitions, passed as extra arguments to the match against PATTERN. The first is what
  # %name references are looked up in.
  #
  # Raises RuntimeError if the pattern text cannot be parsed.
  def compile(pattern, *defs)
    return pattern if pattern.is_a?(Pattern)

    defs = [{}] if defs.empty? # for the sake of p_def, below

    cp = PATTERN.match(pattern, 0, *defs)
    raise "incorrect pattern" unless cp

    cp
  end

  # Match str against pattern, starting at start_pos. The compiled pattern is memoized.
  def match(str, pattern, start_pos = 0)
    cp = (@mem[pattern] ||= compile(pattern))
    cp.match(str, start_pos)
  end

  # Search str for pattern, starting at start_pos.
  #
  # Returns [i, e - 1], where i and e are the two position captures taken before and after the match, or nil when there is no match.
  def find(str, pattern, start_pos = 0)
    cp = @fmem[pattern]
    unless cp
      cp = compile(pattern) / 0
      cp = RPEG.P([RPEG.Cp() * cp * RPEG.Cp() + 1 * RPEG.V(0)])
      @fmem[pattern] = cp
    end

    i, e = cp.match(str, start_pos)

    return [i, e - 1] if i
  end

  # Match str against a substitution capture in which every match of pattern is replaced by rep; returns that match result.
  def gsub(str, pattern, rep)
    # re.lua has a note here about keeping gmem[p] from being collected; presumably that concerns Lua weak tables. A Ruby
    # Hash holds strong references, so there is nothing to guard against.
    g = (@gmem[pattern] ||= {})
    cp = g[rep]
    unless cp
      cp = compile(pattern)
      cp = RPEG.Cs((cp / rep + 1)**0)
      g[rep] = cp
    end
    cp.match(str)
  end

  # Build the pattern that parses re syntax. Run once, at load time, to define PATTERN.
  private def internals
    m = RPEG

    any = m.P(1)
    lower = m.R("az")
    upper = m.R("AZ")
    alpha = lower + upper
    digit = m.R("09")
    alnum = alpha + digit
    space = m.S(" \n\t")
    printable = m.R(' ~')
    word = alnum
    predef = {
      alpha:, digit:, lower:, upper:, space:, alnum:, word:,
      graph: printable - space,
      xdigit: digit + m.R("af", "AF"),
      punct: printable - (space + alnum),
      cntrl: any - printable
    }

    # One-letter class names and their upper-case complements. These are spelled out rather than derived from the first
    # letter of each long name: alpha and alnum both start with "a", and deriving them made %a mean alnum. As far as I
    # recall re.lua, %a is alpha and %w is alnum.
    short_names = {
      a: :alpha, c: :cntrl, d: :digit, g: :graph, l: :lower,
      p: :punct, s: :space, u: :upper, w: :alnum, x: :xdigit
    }
    short_names.each do |short, long|
      predef[short] = predef[long]
      predef[short.upcase] = any - predef[long]
    end
    predef[:nl] = m.P("\n")

    name = m.R("AZ", "az", "__") * m.R("AZ", "az", "__", "09")**0
    p_space = (predef[:space] + "--" * (any - predef[:nl])**0)**0

    arrow = p_space * "<-"
    seq_follow = m.P("/") + ")" + "}" + ":}" + "~}" + "|}" + (name * arrow) + -1
    name = m.C(name)

    # -- a defined name only have meaning in a given environment
    p_def = name * m.Carg(1)

    num = m.C(m.R("09")**1) * p_space / ->(s) { s.to_i }
    p_spacetring = "'" * m.C((any - "'")**0) * "'" + '"' * m.C((any - '"')**0) * '"'

    # %name: look in the caller's definitions first (String or Symbol key), then in the predefined classes.
    defined = "%" * p_def / lambda do |c, defs|
      cat = (defs && (defs[c] || defs[c.to_sym])) || predef[c.to_sym]
      raise "name '#{c}' undefined" unless cat

      cat
    end

    # Like the lookup in `defined`, but restricted to the caller's definitions (no predefined classes).
    getdef = lambda do |id, defs|
      c = defs && (defs[id] || defs[id.to_sym])
      raise "undefined name: #{id}" unless c

      c
    end

    p_range = m.Cs(any * (m.P("-") / "") * (any - "]")) / ->(s) { m.R(s) }
    item = (defined + p_range + m.C(any)) / ->(a) { m.P(a) }

    p_class = "[" *
              m.C(m.P("^")**-1) * # -- optional complement symbol
              m.Cf(item * (item - "]")**0, ->(y, z) { y + z }) / ->(c, p) { c == "^" ? any - p : p } *
              "]"

    # Always raises, quoting up to 20 characters of the pattern text from the point of failure.
    patt_error = lambda do |s, i, *|
      msg = s.length < i + 20 ? s[i...] : "#{s[i, 20]}..."
      msg = "pattern error near '#{msg}'"
      raise msg # re.lua has error(msg, 2) but I don't know what that does
    end

    # patt repeated exactly n times, built by repeated squaring.
    mult = lambda do |patt, n|
      np = m.P(true)
      while n >= 1
        np *= patt if n.odd?
        patt *= patt
        n /= 2
      end
      np
    end

    # Match-time check for a back reference: succeeds (returning the new position) only if the subject continues with c.
    equalcap = lambda do |s, i, c|
      return nil unless c.is_a?(String)

      e = c.length + i
      e if s[i..(e - 1)] == c
    end

    # Add rule k to grammar table t. Raises RuntimeError if k is already defined.
    adddef = lambda do |t, k, exp|
      # re.lua calls error() here; Ruby has no such method, so raise.
      raise "'#{k}' already defined as a rule" if t[k]

      t[k] = exp
      t
    end

    firstdef = ->(n, r) { adddef.call({}, n, r) }

    f_open_call = lambda do |n, b|
      raise "rule '#{n}' used outside a grammar" unless b

      m.V(n)
    end

    # -- match a name and return a group of its corresponding definition
    # -- and 'f' (to be folded in 'Suffix')
    defwithfunc = ->(f) { m.Cg(p_def / getdef * m.Cc(f)) }

    patt_add = ->(p1, p2) { p1 + p2 }
    patt_mul = ->(p1, p2) { p1 * p2 }
    patt_rpt = ->(p1, n) { p1**n }
    patt_replace = ->(p1, rep) { p1 / rep }
    pos_capture = ->(*) { m.Cp }
    tonumber = ->(s) { Integer(s) }

    # A lambda that forwards its arguments to the RPEG constructor named fun.
    call_patt = lambda do |fun|
      ->(*args) { RPEG.send(fun, *args) }
    end

    exp = m.P(
      {
        initial: :Exp,
        Exp: p_space * (
          m.V("Grammar") +
          m.Cf(m.V("Seq") * ("/" * p_space * m.V("Seq"))**0, patt_add)
        ),
        Seq: m.Cf(m.Cc(m.P("")) * m.V("Prefix")**0, patt_mul) * (+seq_follow + patt_error),
        Prefix: ("&" * p_space * m.V("Prefix") / ->(p) { +p } +
                 "!" * p_space * m.V("Prefix") / ->(p) { -p } +
                 m.V("Suffix")),
        Suffix: m.Cf(m.V("Primary") * p_space *
                     ((m.P("+") * m.Cc(1, patt_rpt) +
                       m.P("*") * m.Cc(0, patt_rpt) +
                       m.P("?") * m.Cc(-1, patt_rpt) +
                       "^" * (
                         m.Cg(num * m.Cc(mult)) +
                         m.Cg(m.C(m.S("+-") * m.R("09")**1) / tonumber * m.Cc(patt_rpt))
                       ) +
                       "->" * p_space * (
                         m.Cg((p_spacetring + num) * m.Cc(patt_replace)) +
                         m.P("{}") * m.Cc(nil, call_patt[:Ct]) +
                         defwithfunc[patt_replace]
                       ) +
                       "=>" * p_space * defwithfunc[call_patt[:Cmt]] +
                       "~>" * p_space * defwithfunc[call_patt[:Cf]]
                      ) * p_space
                     )**0, ->(a, b, f) { f.call(a, b) }),
        Primary: (
          "(" * m.V("Exp") * ")" +
          p_spacetring / call_patt.call(:P) +
          p_class +
          defined +
          "{:" * (name * ":" + m.Cc(nil)) * m.V("Exp") * ":}" / ->(n, p) { m.Cg(p, n) } +
          "=" * name / ->(n) { m.Cmt(m.Cb(n), ->(*args) { equalcap[*args] }) } +
          m.P("{}") / pos_capture +
          "{~" * m.V("Exp") * "~}" / call_patt.call(:Cs) +
          "{|" * m.V("Exp") * "|}" / call_patt.call(:Ct) +
          "{" * m.V("Exp") * "}" / call_patt.call(:C) +
          m.P(".") * m.Cc(any) +
          (name * -arrow + "<" * name * ">") * m.Cb("G") / ->(*args) { f_open_call.call(*args) }
        ),
        Definition: name * arrow * m.V("Exp"),
        Grammar: (
          m.Cg(m.Cc(true), "G") *
          m.Cf(
            m.V("Definition") / firstdef * m.Cg(m.V("Definition"))**0,
            adddef
          ) / call_patt.call(:P)
        )
      }
    )

    p_space * m.Cg(m.Cc(false), "G") * exp / call_patt.call(:P) * (-any + patt_error)
  end

  PATTERN = internals
end