reg 0.4.8 → 0.5.0a0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -0
- data/COPYING +0 -0
- data/History.txt +14 -0
- data/Makefile +59 -0
- data/README +87 -40
- data/article.txt +838 -0
- data/{assert.rb → lib/assert.rb} +3 -3
- data/{reg.rb → lib/reg.rb} +11 -4
- data/lib/reg/version.rb +21 -0
- data/lib/regarray.rb +455 -0
- data/{regarrayold.rb → lib/regarrayold.rb} +33 -7
- data/lib/regbackref.rb +73 -0
- data/lib/regbind.rb +230 -0
- data/{regcase.rb → lib/regcase.rb} +15 -5
- data/lib/regcompiler.rb +2341 -0
- data/{regcore.rb → lib/regcore.rb} +196 -85
- data/{regdeferred.rb → lib/regdeferred.rb} +35 -4
- data/{regposition.rb → lib/regevent.rb} +36 -38
- data/lib/reggraphpoint.rb +28 -0
- data/lib/reghash.rb +631 -0
- data/lib/reginstrumentation.rb +36 -0
- data/{regitem_that.rb → lib/regitem_that.rb} +32 -11
- data/{regknows.rb → lib/regknows.rb} +4 -2
- data/{reglogic.rb → lib/reglogic.rb} +76 -59
- data/{reglookab.rb → lib/reglookab.rb} +31 -21
- data/lib/regmatchset.rb +323 -0
- data/{regold.rb → lib/regold.rb} +27 -27
- data/{regpath.rb → lib/regpath.rb} +91 -1
- data/lib/regposition.rb +79 -0
- data/lib/regprogress.rb +1522 -0
- data/lib/regrepeat.rb +307 -0
- data/lib/regreplace.rb +254 -0
- data/lib/regslicing.rb +581 -0
- data/lib/regsubseq.rb +72 -0
- data/lib/regsugar.rb +361 -0
- data/lib/regvar.rb +180 -0
- data/lib/regxform.rb +212 -0
- data/{trace.rb → lib/trace_during.rb} +6 -4
- data/lib/warning.rb +37 -0
- data/parser.txt +26 -8
- data/philosophy.txt +18 -0
- data/reg.gemspec +58 -25
- data/regguide.txt +18 -0
- data/test/andtest.rb +46 -0
- data/test/regcompiler_test.rb +346 -0
- data/test/regdemo.rb +20 -0
- data/{item_thattest.rb → test/regitem_thattest.rb} +2 -2
- data/test/regtest.rb +2125 -0
- data/test/test_all.rb +32 -0
- data/test/test_reg.rb +19 -0
- metadata +108 -73
- data/calc.reg +0 -73
- data/forward_to.rb +0 -49
- data/numberset.rb +0 -200
- data/regarray.rb +0 -675
- data/regbackref.rb +0 -126
- data/regbind.rb +0 -74
- data/reggrid.csv +1 -2
- data/reghash.rb +0 -318
- data/regprogress.rb +0 -1054
- data/regreplace.rb +0 -114
- data/regsugar.rb +0 -230
- data/regtest.rb +0 -1078
- data/regvar.rb +0 -76
checksums.yaml
ADDED
@@ -0,0 +1,4 @@
|
|
1
|
+
---
|
2
|
+
SHA512:
|
3
|
+
metadata.gz: b1f48843c59604389cf4051e66948f705f210c19f1b448841b1c757fdb814217caeec7a27abc8ab8eee10866a555f39e9209997f5772170176c55f4fdc15790d
|
4
|
+
data.tar.gz: 888c7196689f1465c52d3d50dff10fc2112926224ef04f83d237a94f6ed4f1a0fc5f2f9f4d52618e37e0421049a475a32072e8d37030651935f1cce585b4eaac
|
data/COPYING
CHANGED
File without changes
|
data/History.txt
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
=== 0.5.0a0 / 15jun2016
|
2
|
+
new much faster backtracking engine (turns patterns into equivalent ruby to eval)
|
3
|
+
but hash matchers in new engine are currently broken in some cases
|
4
|
+
beginnings of search-and-replace (but this doesn't work yet)
|
5
|
+
ported to ruby 1.9+
|
6
|
+
_ and __ are new preferred names for OB and OBS
|
7
|
+
use something more like standard project structure
|
8
|
+
taking a stab at backreference-like functionality
|
9
|
+
support for utilities like inspect, hash, marshal
|
10
|
+
trace operator for debugging
|
11
|
+
|
12
|
+
=== 0.4.8 / 21dec2009
|
13
|
+
* Last stable release
|
14
|
+
|
data/Makefile
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
# reg - the ruby extended grammar
|
2
|
+
# Copyright (C) 2016 Caleb Clausen
|
3
|
+
#
|
4
|
+
# This library is free software; you can redistribute it and/or
|
5
|
+
# modify it under the terms of the GNU Lesser General Public
|
6
|
+
# License as published by the Free Software Foundation; either
|
7
|
+
# version 2.1 of the License, or (at your option) any later version.
|
8
|
+
#
|
9
|
+
# This library is distributed in the hope that it will be useful,
|
10
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
12
|
+
# Lesser General Public License for more details.
|
13
|
+
#
|
14
|
+
# You should have received a copy of the GNU Lesser General Public
|
15
|
+
# License along with this library; if not, write to the Free Software
|
16
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
17
|
+
name=Reg
|
18
|
+
lname=reg
|
19
|
+
gemname=reg
|
20
|
+
|
21
|
+
#everything after this line is generic
|
22
|
+
|
23
|
+
version=$(shell ruby -r ./lib/$(lname)/version.rb -e "puts $(name)::VERSION")
|
24
|
+
filelist=$(shell git ls-files)
|
25
|
+
|
26
|
+
.PHONY: all test docs gem tar pkg email
|
27
|
+
all: test
|
28
|
+
|
29
|
+
test:
|
30
|
+
ruby -Ilib test/test_all.rb
|
31
|
+
|
32
|
+
docs:
|
33
|
+
rdoc lib/*
|
34
|
+
|
35
|
+
pkg: gem tar
|
36
|
+
|
37
|
+
gem:
|
38
|
+
gem build $(lname).gemspec
|
39
|
+
|
40
|
+
tar:
|
41
|
+
tar cf - $(filelist) | ( mkdir $(gemname)-$(version); cd $(gemname)-$(version); tar xf - )
|
42
|
+
tar czf $(gemname)-$(version).tar.gz $(gemname)-$(version)
|
43
|
+
rm -rf $(gemname)-$(version)
|
44
|
+
|
45
|
+
email: README.txt History.txt
|
46
|
+
ruby -e ' \
|
47
|
+
require "rubygems"; \
|
48
|
+
load "./$(lname).gemspec"; \
|
49
|
+
spec= Gem::Specification.list.find{|x| x.name=="$(gemname)"}; \
|
50
|
+
puts "\
|
51
|
+
Subject: [ANN] $(name) #{spec.version} Released \
|
52
|
+
\n\n$(name) version #{spec.version} has been released! \n\n\
|
53
|
+
#{Array(spec.homepage).map{|url| " * #{url}\n" }} \
|
54
|
+
\n\
|
55
|
+
#{$(name)::Description} \
|
56
|
+
\n\nChanges:\n\n \
|
57
|
+
#{$(name)::Latest_changes} \
|
58
|
+
"\
|
59
|
+
'
|
data/README
CHANGED
@@ -13,16 +13,14 @@ want to figure out how to make it work for what you need to do, contact me:
|
|
13
13
|
Reg is a RubyForge project. RubyForge is another good place to send your
|
14
14
|
bug reports or whatever: http://rubyforge.org/projects/reg/
|
15
15
|
|
16
|
-
(There aren't any bug filed against Reg there yet, but don't be afraid
|
17
|
-
that your report will get lonely.)
|
18
16
|
|
19
17
|
|
20
18
|
|
21
19
|
The implementation:
|
22
20
|
The engine (according to what I can tell from Friedl's book,
|
23
|
-
_Mastering_Regular_Expressions_,) is a traditional DFA with non-greedy
|
24
|
-
For performance, I'd like to move to a more NFA-oriented
|
25
|
-
different alternatives in parallel).
|
21
|
+
_Mastering_Regular_Expressions_,) is a traditional NFA with non-greedy
|
22
|
+
alternation. For performance, I'd like to move to a more DFA-oriented
|
23
|
+
approach (trying many different alternatives in parallel).
|
26
24
|
|
27
25
|
Status:
|
28
26
|
The only real (public) matching operator implemented thus far is:
|
@@ -36,6 +34,14 @@ backreferences and substitutions.
|
|
36
34
|
The backtracking engine appears to be completely functional now. Vector
|
37
35
|
Reg::And doesn't work.
|
38
36
|
|
37
|
+
This release should be much faster, for 2 reasons. First, the cursor library
|
38
|
+
has been dropped in favor of sequence, which is much faster. Second, and more
|
39
|
+
important, the interpreted backtracking engine has been replaced with a
|
40
|
+
compiled engine. This means completely new implementations of Reg::Array and
|
41
|
+
all the vector matchers. (I tried to write compilers for Reg::Hash and Reg::
|
42
|
+
Object, but they didn't get completed...) The majority of my concerns about
|
43
|
+
performance are now resolved, although the backtracking algorithm is still
|
44
|
+
very simplistic, and could do with a good dose of fixed match cognizance.
|
39
45
|
|
40
46
|
This table compares syntax of Reg and Regexp for various constructs. Keep
|
41
47
|
in mind that all Regs are ordinary ruby expressions. The special syntax
|
@@ -63,9 +69,9 @@ r-n re{,n} Reg::Repeat #at most n matches
|
|
63
69
|
r+m re{m,} Reg::Repeat #at least m matches
|
64
70
|
OB . Reg::Any #a single item
|
65
71
|
OBS .* Reg::AnyMultiple #zero or more items
|
66
|
-
BR
|
72
|
+
BR(1,2) \1,\2 Reg::Backref #backreference ***
|
67
73
|
r>>x or sub sub,gsub Reg::Transform #search and replace ***
|
68
|
-
|
74
|
+
:a<<r () Reg::Bound #capture into a backreference ***
|
69
75
|
|
70
76
|
here are features of reg that don't have an equivalent in regexp
|
71
77
|
r.la Reg::Lookahead #lookahead ***
|
@@ -177,31 +183,30 @@ a conflict.
|
|
177
183
|
the api (mostly unimplemented):
|
178
184
|
r represents a reg
|
179
185
|
t represents a transform
|
180
|
-
o represents
|
186
|
+
o represents any object
|
181
187
|
a represents an array
|
182
188
|
s represents a string
|
183
189
|
h represents a hash
|
184
190
|
scan represents the entire stringscanner interface...
|
185
191
|
-(scan,skip,match?,check and their unanchored and backward forms)
|
186
|
-
c represents a
|
192
|
+
c represents a ::Sequence
|
187
193
|
! implies in-place modification
|
188
194
|
|
189
195
|
r===o #v
|
190
196
|
r=~o #v
|
191
|
-
|
197
|
+
ach=~r #v-
|
192
198
|
r.match o #result contains changes
|
193
199
|
r.match! o
|
194
|
-
|
195
|
-
|
200
|
+
coah.sub!(r[,t])
|
201
|
+
coah.gsub!(r[,t])
|
196
202
|
oah.sub(r[,t]) #modifies in result
|
197
|
-
oah.gsub(r[,t])
|
198
|
-
oah.sub!(r[,t]) #inplace modify
|
199
|
-
oah.gsub!(r[,t])
|
203
|
+
oah.gsub(r[,t]) #modifies in result
|
200
204
|
a.scan(r) #modifies in result
|
201
205
|
|
202
|
-
c.index/rindex r #
|
203
|
-
c.slice
|
204
|
-
c.
|
206
|
+
c.index/rindex r #no modify
|
207
|
+
c.slice r #no modify
|
208
|
+
c.slice! r #deletes matching elems
|
209
|
+
c.split r #no modify
|
205
210
|
c.find_all r #like String#scan
|
206
211
|
c.find r
|
207
212
|
ho.find_all [r-key,] r-value
|
@@ -217,7 +222,7 @@ s.delete_all r
|
|
217
222
|
s.delete_all! r
|
218
223
|
|
219
224
|
#these require wrapping library methods to also take different args
|
220
|
-
|
225
|
+
ac.slice r
|
221
226
|
ahoc.slice! r
|
222
227
|
o=~r
|
223
228
|
oahc[r]
|
@@ -246,37 +251,38 @@ s.scan(r) #=> rscan... note scan only conflicts; the rest of the stringscan
|
|
246
251
|
Reg::Progress work list:
|
247
252
|
|
248
253
|
phase 1: array only
|
249
|
-
fill out backtrack
|
250
|
-
import asserts from backtrace=>backtrack
|
251
|
-
disable backtrace
|
254
|
+
v fill out backtrack
|
255
|
+
v import asserts from backtrace=>backtrack
|
256
|
+
v disable backtrace
|
252
257
|
backtrack should respect update_di
|
253
|
-
callers of backtrace must use a progress instead
|
254
|
-
call backtrack on progress instead of backtrace...
|
255
|
-
matchsets unmodified as yet (ok, except repeat and subseq matchsets)
|
256
|
-
push_match and push_matchset need to be called in right places in Reg::Array (what else?)
|
258
|
+
v callers of backtrace must use a progress instead
|
259
|
+
v call backtrack on progress instead of backtrace...
|
260
|
+
v matchsets unmodified as yet (ok, except repeat and subseq matchsets)
|
261
|
+
v push_match and push_matchset need to be called in right places in Reg::Array (what else?)
|
257
262
|
note which parts of regarray.rb have been obsoleted by regprogress.rb
|
258
263
|
|
259
264
|
phase 2:
|
260
265
|
eventually, MatchSet#next_match will take a cursor parameter, and return a number of items consumed or progress or nil
|
261
|
-
entering some types of subreg creates a subprogress
|
266
|
+
x entering some types of subreg creates a subprogress
|
262
267
|
arrange for process_deferreds to be called in the right places
|
263
268
|
create Reg::Bound (for vars) and Reg::SideEffect, Reg::Undo, Reg::Eventually with sugar
|
264
269
|
-Reg#bind, Reg#side_effect, Reg#undo, Reg#eventually
|
265
270
|
-and of course Reg::Transform and Reg::Replace
|
266
271
|
-Reg::Reg#>>(Reg::Replace) makes a Transform, and certain things can mix in module Replace
|
267
|
-
|
268
|
-
should Reg::
|
269
|
-
Reg::
|
272
|
+
create Reg::BackRef
|
273
|
+
should Reg::BackRef be a module?
|
274
|
+
should Reg::BackRef be a Deferred?
|
275
|
+
Reg::Transform calls Reg::Progress#eventually?
|
270
276
|
implicit progress needs to be made when doing standalone compare of
|
271
277
|
-Reg::Object, Reg::Hash, Reg::Array, Reg::logicals, Reg::Bound, Reg::Transform, maybe others
|
272
278
|
|
273
279
|
these are stubbed at least now:
|
274
280
|
Backtrace.clean_result and Backtrace.check_result should operate on progresses instead
|
275
|
-
need Reg::Progress#bt_match,last_next_match,to_result,check_result,clean_result
|
276
|
-
need Reg::Progress#deep_copy for use in repeat and subseq matchsets
|
281
|
+
v need Reg::Progress#bt_match,last_next_match,to_result,check_result,clean_result
|
282
|
+
x need Reg::Progress#deep_copy for use in repeat and subseq matchsets
|
277
283
|
need MatchSet#clean_result which delegates to the internal Progress, if any
|
278
|
-
rewrite repeat and subseq to use progress internally? (in progress only...)
|
279
|
-
Reg::(and,repeat,subseq,array) require progress help
|
284
|
+
v rewrite repeat and subseq to use progress internally? (in progress only...)
|
285
|
+
v Reg::(and,repeat,subseq,array) require progress help
|
280
286
|
|
281
287
|
|
282
288
|
varieties of Reg::Replace:
|
@@ -316,8 +322,27 @@ anchors (edge cognizance)
|
|
316
322
|
|
317
323
|
|
318
324
|
todo:
|
325
|
+
v move position_stack into Progress::Context
|
326
|
+
v move matchfail_todo into Progress::Context
|
327
|
+
v move matchset_stack into context
|
328
|
+
v all matchsets should reference a Progress
|
329
|
+
v all matchsets should reference a Context (except maybe SingleMatch_MatchSet?)
|
330
|
+
v MatchSet constructors must take a progress
|
331
|
+
matchset#next_match's should use @progress/@context instead of passed in arr/start
|
332
|
+
v replace subprogress calls with newcontext/endcontext
|
333
|
+
v newcontext/endcontext needs to be used in other contexts too! (Reg::Array, Reg::Object, etc)
|
334
|
+
v need to backtrack in nested Reg::Array
|
335
|
+
when backup_stacks is called (maybe indirectly) in a MatchSet's #next_match, should it affect the
|
336
|
+
-@progress or the @context of that MatchSet?
|
337
|
+
inspect all uses of position_stack and position_inc_stack for similar problems
|
338
|
+
|
339
|
+
|
340
|
+
array_like/hash_like/object_like as aliases for +[]/+{}/-{}
|
341
|
+
why isn't ArrayGraphPoint ever used? it should be.
|
342
|
+
=== sometimes can raise an exception! (eg: ("r".."s")===[])
|
343
|
+
-make sure all calls to === are protected by appending 'rescue false' to them.
|
319
344
|
vector Reg::Proc,Reg::ItemThat,Reg::Block,Reg::Variable,Reg::Constant
|
320
|
-
convert
|
345
|
+
convert mmatch_full to mmatch in another class (or module) in logicals, subseq, repeat, etc?
|
321
346
|
performance
|
322
347
|
variable binding
|
323
348
|
variable tracking... keeping each value assigned to a variable during the match in an array
|
@@ -339,7 +364,8 @@ array matcher should match array-like things like enum or (especially) two-way e
|
|
339
364
|
How should Reg::Array match cursors?
|
340
365
|
arguments (including backref's) in property matchers
|
341
366
|
discontinuous number sets (and reg multipliers for them)
|
342
|
-
lookahead (including negated regmultiples)
|
367
|
+
v? lookahead (including negated regmultiples)
|
368
|
+
lookback
|
343
369
|
laziness
|
344
370
|
inspect (mostly implemented... but maybe needs another name)
|
345
371
|
fix all the warnings
|
@@ -364,7 +390,7 @@ need a way to constrain the types of matcher that are allowed
|
|
364
390
|
Pair and Knows::WithArgs need constraint parameterization this way too.
|
365
391
|
v what is the meaning of :meth[]? no parameters for parameterlessness, use +:meth
|
366
392
|
all reg classes and matchers need to implement #==, #eql?, and #hash
|
367
|
-
-defaults only check object ids, so for instance: +[] != +[]
|
393
|
+
-defaults only check object ids, so for instance, currently: +[] != +[]
|
368
394
|
Reg::Array should be renamed Reg::Sequence (or something...) it's not just for arrays anymore...
|
369
395
|
when extending existing classes, check for func names already existing and chain to them
|
370
396
|
-(or just abort if the wrong ones are defined.)
|
@@ -374,20 +400,41 @@ allow expressions like this in hash and object matchers: +{:foo=>/bar/.-} to mea
|
|
374
400
|
v potentially confusing name conflict: const vs Const (in regsugar.rb vs regdeferred.rb)
|
375
401
|
sugar is too complicated. need to split into many small files in their own
|
376
402
|
-directory, ala the nano gem. (makes set piracy easier too.)
|
403
|
+
add methods to Module/Class to declare which methods are safe/dangerous
|
404
|
+
-then allow only safe methods to be called via item_that/Reg::Object, etc.
|
405
|
+
add lots more instrumentation
|
406
|
+
remove weird eee stuff in regitem_that.rb
|
407
|
+
need an object matcher that takes positional instead of named parameters...
|
408
|
+
-more succinct, but slightly more limited than the current form.
|
409
|
+
I need ArrayMatchSet (like SubseqMatchSet), Hash/ObjectMatchSet (like AndMatchSet)
|
410
|
+
-each of these will have to keep track of how many other matchsets were pushed on
|
411
|
+
-the stack while they were being matched.
|
412
|
+
AndMatchSet still needs a lot of work.
|
413
|
+
need vector analogs to the scalar matchers item_that and reg_that, called items_that and regs_that
|
414
|
+
|
415
|
+
|
416
|
+
infectious modules:
|
417
|
+
Multiple infects every container except Array (not allowed in Hash,Object,RestrictHash,Case)
|
418
|
+
Undoable infects every container (implies HasCmatch or HasBmatch)
|
419
|
+
HasCmatch infects every Multiple container (& infects non-Multiple with HasBmatch)
|
420
|
+
HasBmatch infects every container (unless HasCmatch also present)
|
421
|
+
HasCmatch_And_Bound infects every container (&infects with HasCmatch too)
|
422
|
+
|
423
|
+
|
377
424
|
|
378
425
|
|
379
426
|
known bugs:
|
380
427
|
no backreferences
|
381
428
|
no substitutions
|
382
|
-
vector & and ^ wont work
|
429
|
+
v vector & and ^ won't work
|
383
430
|
explicit duck-typing (on mmatch) is used to distinguish regs and literals... should be is_a? Reg::Reg instead.
|
384
|
-
0*
|
431
|
+
0*Infinity should at least cause a warning
|
385
432
|
some test cases are so slow as to be effectively unusable.
|
386
433
|
|
387
434
|
|
388
435
|
|
389
436
|
reg - the ruby extended grammar
|
390
|
-
Copyright (C) 2005 Caleb Clausen
|
437
|
+
Copyright (C) 2005, 2016 Caleb Clausen
|
391
438
|
|
392
439
|
This library is free software; you can redistribute it and/or
|
393
440
|
modify it under the terms of the GNU Lesser General Public
|
data/article.txt
ADDED
@@ -0,0 +1,838 @@
|
|
1
|
+
=begin copyright
|
2
|
+
reg - the ruby extended grammar
|
3
|
+
Copyright (C) 2016 Caleb Clausen
|
4
|
+
|
5
|
+
This library is free software; you can redistribute it and/or
|
6
|
+
modify it under the terms of the GNU Lesser General Public
|
7
|
+
License as published by the Free Software Foundation; either
|
8
|
+
version 2.1 of the License, or (at your option) any later version.
|
9
|
+
|
10
|
+
This library is distributed in the hope that it will be useful,
|
11
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
13
|
+
Lesser General Public License for more details.
|
14
|
+
|
15
|
+
You should have received a copy of the GNU Lesser General Public
|
16
|
+
License along with this library; if not, write to the Free Software
|
17
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
18
|
+
=end
|
19
|
+
Reg (Ruby Extended Grammar) is a language I've been working on for some
|
20
|
+
time. Some of the different ways I have described it are:
|
21
|
+
+ A pattern matching language
|
22
|
+
+ An interpreter interpreter
|
23
|
+
+ A parser tool in the interpretive style
|
24
|
+
+ A declarative programming language
|
25
|
+
+ A general purpose regular expression engine
|
26
|
+
|
27
|
+
Of all these, I most prefer 'pattern matching language'. It seems to best
|
28
|
+
convey how I think of it.
|
29
|
+
|
30
|
+
Ruby already has Regexp for matching patterns in Strings. Reg complements
|
31
|
+
that capability by providing pattern matching for other types of data,
|
32
|
+
such as Arrays, Hashes, and Objects.
|
33
|
+
|
34
|
+
But let me explain what I mean by that in more detail. What is a pattern?
|
35
|
+
Ruby is an object-oriented language. So, clearly, a pattern in Ruby should
|
36
|
+
be an object. Pattern objects represent predicates or yes-no questions
|
37
|
+
about an as-yet unseen other object. What is matching? Matching, then, is
|
38
|
+
asking the pattern if it 'looks like' that other object. You ask by
|
39
|
+
calling a method. And that method is #===.
|
40
|
+
|
41
|
+
#=== is my favorite Ruby operator. The task of building Reg has largely
|
42
|
+
been one of creating different types of pattern classes (classes that
|
43
|
+
implement === in some interesting way).
|
44
|
+
|
45
|
+
Now, these are not design patterns that I'm talking about. These aren't
|
46
|
+
patterns from the Gang of Four book. These are patterns in the sense that
|
47
|
+
Regexp is a pattern. I'll also use another word, 'matcher'; it also means
|
48
|
+
pattern.
|
49
|
+
|
50
|
+
Interesting matchers
|
51
|
+
Before talking about some of the patterns that I have created, let's look
|
52
|
+
at the pattern classes that already exist in Ruby. Each of these
|
53
|
+
implements #=== in a different way:
|
54
|
+
|
55
|
+
<table 1 -- matcher classes provided by Ruby itself>
|
56
|
+
class | returns true iff the rhs of ===:
|
57
|
+
=======+===========================================
|
58
|
+
Regexp | matches the Regexp
|
59
|
+
Class | is an instance of that Class
|
60
|
+
Module | is an instance of that Module
|
61
|
+
Range | is within the Range
|
62
|
+
Object | is exactly equal to the Object (uninteresting)
|
63
|
+
Set | is a member of the Set (or is equal to a member)
|
64
|
+
</table>
|
65
|
+
|
66
|
+
(Actually, Set requires a little help... the current release of Reg does this:
|
67
|
+
<listing 1 -- Making Set a matcher>
|
68
|
+
class Set
|
69
|
+
alias === include?
|
70
|
+
end
|
71
|
+
</listing>
|
72
|
+
|
73
|
+
But the next release will make Set a matcher in a trickier way, which does
|
74
|
+
not interfere with the standard definition of Set#===.)
|
75
|
+
|
76
|
+
I define 'interesting matcher' to mean one that implements ===.
|
77
|
+
An uninteresting matcher is a matcher which implements === the same as ==.
|
78
|
+
This is the definition inherited from Object. Most objects in ruby are
|
79
|
+
uninteresting matchers. However, this is an important property; it means
|
80
|
+
that most ordinary objects can be used in Reg in contexts that expect a
|
81
|
+
full matcher. They represent themselves, or other objects exactly equal to
|
82
|
+
themselves. This is similar to Regexp, where
|
83
|
+
most characters can be used to
|
84
|
+
represent just themselves.
|
85
|
+
|
86
|
+
[[implementation extracts are simplified to keep exposition uncluttered]]
|
87
|
+
|
88
|
+
The ancestor for most of the matchers in Reg is the module Reg::Reg . (The
|
89
|
+
name (with a repeated Reg) is somewhat unfortunate and may change.) The
|
90
|
+
Reg::Reg module extends pattern classes (classes that implement #===) with
|
91
|
+
useful operators for combining and composing bigger, more complicated
|
92
|
+
patterns. Several existing Ruby pattern classes are re-opened to include
|
93
|
+
Reg::Reg: Regexp Range Module Class Set.
|
94
|
+
|
95
|
+
Reg::Reg has the following operators, among others:
|
96
|
+
<table 2 -- some common Reg operators>
|
97
|
+
operator | meaning
|
98
|
+
==========+===========================
|
99
|
+
& | and
|
100
|
+
| | or
|
101
|
+
^ | xor (one and only one)
|
102
|
+
~ | not
|
103
|
+
*,+,- | repeated / optional match
|
104
|
+
** | create a pair
|
105
|
+
</table>
|
106
|
+
|
107
|
+
These operators allow you to compose patterns into larger patterns.
|
108
|
+
I want to be really clear about this.
|
109
|
+
The above operators (among others) are added to
|
110
|
+
Ruby's existing pattern classes Regexp, Range, Class, Module, and Set.
|
111
|
+
(In the case of Set, some of those
|
112
|
+
operators are already defined, so 'added' is not quite accurate. However, be assured that the existing semantics of Set#+, for instance, are left unchanged when the right operand is an Enumerable.)
|
113
|
+
Ruby 2 selector namespaces should allow Reg to make use of these operators
|
114
|
+
without creating a conflict with other libraries which may have defined
|
115
|
+
them in a different way.
|
116
|
+
|
117
|
+
If anyone objects to Reg re-opening these core classes, please let me know. (So far, no one has objected.)
|
118
|
+
|
119
|
+
Atomic patterns
|
120
|
+
Ruby's few built-in pattern classes are what I call atomic patterns; they
|
121
|
+
contain no other patterns within themselves. The simplest types of
|
122
|
+
patterns provided by Reg are also atomic. Let's briefly discuss the
|
123
|
+
semantics and implementation of three: OB, the method check pattern, and
|
124
|
+
item_that.
|
125
|
+
|
126
|
+
<table 3 -- OB, the universal matcher>
|
127
|
+
expression | what it matches | expr===x equivalent to:
|
128
|
+
=============+========================+==========================
|
129
|
+
OB | everything | Object===x (or true)
|
130
|
+
</table 3>
|
131
|
+
|
132
|
+
OB matches anything; any single item. As such, it is equivalent to the
|
133
|
+
(built-in) pattern Object, just shorter. The simplest implementation of OB
|
134
|
+
is:
|
135
|
+
|
136
|
+
<listing 2 -- OB>
|
137
|
+
OB=Object
|
138
|
+
</listing>
|
139
|
+
|
140
|
+
<table 4 -- the method check matcher>
|
141
|
+
expression | matches objects that... | expr===x equivalent to:
|
142
|
+
===========+=============================+===========================
|
143
|
+
-:foo | respond to the named method | x.respond_to? :foo
|
144
|
+
</table 4>
|
145
|
+
|
146
|
+
Method check patterns match items that respond to the method named by the
|
147
|
+
symbol. For instance, -:reverse would match everything that respond_to?
|
148
|
+
the method #reverse. All strings and arrays would match that pattern, but
|
149
|
+
no hashes or symbols.
|
150
|
+
|
151
|
+
Let's see how it's implemented:
|
152
|
+
<listing 3 -- Reg::Knows, the method check matcher>
|
153
|
+
class Symbol
|
154
|
+
def -@; Reg::Knows.new(self) end
|
155
|
+
end
|
156
|
+
|
157
|
+
module Reg
|
158
|
+
class Knows
|
159
|
+
def initialize(sym)
|
160
|
+
@sym=sym
|
161
|
+
end
|
162
|
+
def ===(other)
|
163
|
+
other.respond_to? @sym
|
164
|
+
end
|
165
|
+
end
|
166
|
+
end
|
167
|
+
</listing>
|
168
|
+
|
169
|
+
Unary minus on symbol creates a wrapper object around the symbol which
|
170
|
+
calls #respond_to? on that symbol when === on it is called. (I wanted to
|
171
|
+
use unary plus for this feature originally, but there seems to be a bug in
|
172
|
+
ruby that prevents that.)
|
173
|
+
|
174
|
+
I'm not going to say a lot about item_that, other than it allows you to
|
175
|
+
write natural-sounding expressions like these, which do pretty much what
|
176
|
+
they seem like they should:
|
177
|
+
|
178
|
+
<table 5 -- item_that>
|
179
|
+
expression | matches objects... | expr===x
|
180
|
+
| | equivalent to:
|
181
|
+
======================+==================================+===============
|
182
|
+
item_that.has_prop? | ...whose #has_prop? returns true | x.has_prop?
|
183
|
+
item_that<40 | ...less than 40 | x<40
|
184
|
+
item_that.is_valid? | ...whose #is_valid? returns true | x.is_valid?
|
185
|
+
item_that.num_cols<55 | ...with property num_cols < 55 | x.num_cols<55
|
186
|
+
</table>
|
187
|
+
|
188
|
+
#item_that returns an object that, for (almost) any method called on it,
|
189
|
+
saves the receiver, name of the method called, and arguments, and block
|
190
|
+
given in the call, returning all in another item_that-like object.
|
191
|
+
|
192
|
+
Think of it as saving up the name, and parameters (including the block and
|
193
|
+
receiver) of every method called on it in the dot-pipeline following the
|
194
|
+
item_that. The saved up methods get called when === is eventually called
|
195
|
+
on the built-up item_that-like expression.
|
196
|
+
|
197
|
+
item_that is made possible by some method_missing magic that is just a little
|
198
|
+
too long to list here. Those who are interested should look at Jim
|
199
|
+
Weirich's Deferred class, on which item_that is based, which he explains
|
200
|
+
in this blog posting:
|
201
|
+
http://onestepback.org/index.cgi/Tech/Ruby/SlowingDownCalculations.rdoc
|
202
|
+
|
203
|
+
Standalone item_that is a little pointless... 'item_that<40===x' is both longer and less clear than simply saying 'x<40'.
|
204
|
+
We will find that it is useful when incorporated into larger patterns.
|
205
|
+
|
206
|
+
Beware of Side Effects!
|
207
|
+
|
208
|
+
Reg has several constructs (most notably replacement expressions) that
|
209
|
+
permit you to embed side effects into query expressions; some of these
|
210
|
+
will be introduced later. Users are strongly encouraged to use these
|
211
|
+
approved mechanisms instead of putting side effects into item_that or
|
212
|
+
other types of query expression that should not be making changes to the
|
213
|
+
data they are querying. Since the language can understand that (eg)
|
214
|
+
replacement expressions have side effects, it can safely handle those side
|
215
|
+
effects, saving them up til the end of the entire match attempt, to ensure
|
216
|
+
they are executed only once. If you do put side effects into something
|
217
|
+
like item_that, you may find that those side effects are executed many
|
218
|
+
more times than you thought they should be.
|
219
|
+
|
220
|
+
For example, 'item_that.chop!'
|
221
|
+
is a really bad idea. The chop! will modify the item being queried (if it
|
222
|
+
is a String, which has a destructive chop!). As a standalone expression,
|
223
|
+
this might work ok, but if you put it inside a Reg::Array (which does
|
224
|
+
backtracking) you may quickly get into trouble without understanding why.
|
225
|
+
In general, query expressions should not modify their data, so the
|
226
|
+
presence of any method ending in a ! should be an indication of possible
|
227
|
+
danger.
|
228
|
+
|
229
|
+
Now let's look at some ways to combine Reg patterns together.
|
230
|
+
|
231
|
+
Logicals: and, or, xor, not
|
232
|
+
|
233
|
+
<table 6 -- the logical operators>
|
234
|
+
expression | what it matches | expression===x
|
235
|
+
| | equivalent to:
|
236
|
+
==============+==========================+================
|
237
|
+
File |\ | Files, and strings with | File===x or
|
238
|
+
/a+/ |\ | 1 or more 'a' and | /a+/===x or
|
239
|
+
(1..5) | numbers between 1 and 5 | (1..5)===x
|
240
|
+
--------------+--------------------------+-------------
|
241
|
+
/fo+/ &\ | strings that have: foo, | /fo+/===x and
|
242
|
+
/ba*[rz]/ &\ | bar/baz, and quux | /ba*[rz]/ and
|
243
|
+
/quu?x/ | (w/ # of vowels varying) | /quu?x/
|
244
|
+
--------------+--------------------------+-------------
|
245
|
+
/^if/ ^\ | strings containing | [/^if/===x, %r{/x}===x,
|
246
|
+
%r{/x} ^\ | 1 and only 1 of: | /rescue/===x
|
247
|
+
/rescue/ | if, rescue, or '/x' | ].select{|b| b}.size==1
|
248
|
+
--------------+--------------------------+-------------
|
249
|
+
~/66/ | all strings w/o '66', + | !(/66/===x)
|
250
|
+
| all non-strings as well |
|
251
|
+
</table>
|
252
|
+
|
253
|
+
The | operator takes 2 sub-patterns as its operands and returns a larger
|
254
|
+
pattern that will match everything which matches either sub-pattern.
|
255
|
+
|
256
|
+
The & operator takes 2 sub-patterns as its operands and returns a larger
|
257
|
+
pattern that will match everything which matches both sub-patterns.
|
258
|
+
|
259
|
+
The ^ operator takes 2 sub-patterns as its operands and returns a larger
|
260
|
+
pattern that will match everything that matches one but not both of the sub-patterns. When ^ operations are chained together (like this: a^b^c^d^e^f), the entire expression matches if one and only one alternative matches.
|
261
|
+
|
262
|
+
Note: Regexp#~ is RE-DEFINED. The original semantics of Regexp#~ (as
|
263
|
+
documented in Pickaxe) are completely destroyed. ~/66/ matches all strings
|
264
|
+
without '66' in them (as well as all non-strings), rather than comparing
|
265
|
+
the Regexp /66/ to the default variable, $_.
|
266
|
+
|
267
|
+
(Unfortunately, there are some nasty bugs
|
268
|
+
in the current release of Reg (0.4.5)
|
269
|
+
that affect the & and | operators;
|
270
|
+
if the data being matched is nil or
|
271
|
+
false, | will always fail, even
|
272
|
+
in cases where it shouldn't. And & doesn't work at all. The next release will fix these.)
|
273
|
+
|
274
|
+
<listing 4 -- implementation of logical operators>
|
275
|
+
module Reg
|
276
|
+
module Reg
|
277
|
+
def |(other)
|
278
|
+
::Reg::Or.new(self,other)
|
279
|
+
end
|
280
|
+
end
|
281
|
+
|
282
|
+
class Or
|
283
|
+
def initialize(left,right)
|
284
|
+
@left,@right=left,right
|
285
|
+
end
|
286
|
+
def ===(other)
|
287
|
+
@left===other || @right===other
|
288
|
+
end
|
289
|
+
end
|
290
|
+
end
|
291
|
+
</listing>
|
292
|
+
|
293
|
+
This pattern should be familiar; it is similar to the implementation of
|
294
|
+
the method check matcher, Reg::Knows above.
|
295
|
+
|
296
|
+
Logical operators return a wrapper object around their arguments, which
|
297
|
+
implements ===. Keep in mind that this is a simplified version of the
|
298
|
+
implementation; the real version has more optimizations, handles
|
299
|
+
backtracking, allows sub-expressions to contain side effects, and to match
|
300
|
+
more than just a single item.
|
301
|
+
|
302
|
+
(De)Composition
|
303
|
+
|
304
|
+
Even with the syntax introduced so far, it is possible to create long and
|
305
|
+
complicated expressions which are hard to follow. Here's an example.
|
306
|
+
|
307
|
+
<example 1 -- a complicated matcher>
|
308
|
+
-:foo|-:bar|(-:baz&-:quux) | (item_that.count%2).zero? === x
|
309
|
+
</example>
|
310
|
+
|
311
|
+
The usual way to deal with this problem is to allow the programmer to
|
312
|
+
break up long expressions into more digestible chunks, each of which is
|
313
|
+
given a (hopefully meaningful) name. We might break up the above
|
314
|
+
expression like this:...
|
315
|
+
|
316
|
+
<example 2 -- a complicated matcher, broken down>
|
317
|
+
knows_foo_or_bar = -:foo|-:bar
|
318
|
+
knows_baz_and_quux = -:baz&-:quux
|
319
|
+
knows_my_methods = knows_foo_or_bar | knows_baz_and_quux
|
320
|
+
|
321
|
+
count_even = (item_that.count%2).zero?
|
322
|
+
|
323
|
+
knows_my_methods | count_even === x
|
324
|
+
</example>
|
325
|
+
|
326
|
+
Note that I didn't have to write any enabling code. These are just normal
|
327
|
+
variable assignments. This is one of the really great things about creating a
|
328
|
+
DSL or mini-language within ruby itself by defining methods; you get a lot
|
329
|
+
of 'core language' features for free! It would have been necessary to
|
330
|
+
invent it if this feature did not exist.
|
331
|
+
|
332
|
+
Data models and matcher models
|
333
|
+
|
334
|
+
Let's talk about the data model of everyone's favorite language, Perl:
|
335
|
+
Scalars are numbers and strings.
|
336
|
+
Vectors are lists of scalars.
|
337
|
+
Hashes are associations (or maps) of scalars to scalars.
|
338
|
+
Objects are special hashes whose keys are always strings.
|
339
|
+
|
340
|
+
The same is more or less true of Ruby as well. Ruby makes Object the
|
341
|
+
central concept; all scalars, vectors, and hashes are different types of
|
342
|
+
Object. Scalars, in this model (and in Perl as well) are extended to
|
343
|
+
include references to any Object. (This allows us to create lists of
|
344
|
+
lists, etc.)
|
345
|
+
|
346
|
+
This is a useful conceptual tool, but a rough model only; some of these
|
347
|
+
'data types' are actually more than one data type.
|
348
|
+
|
349
|
+
Reg provides multiple scalar, hash, and object matchers, but only one
|
350
|
+
array matcher. However, the array matcher is the only one able to contain
|
351
|
+
the various vector matchers. A vector matcher is a matcher that might
|
352
|
+
match more (or less) than one item in sequence.
|
353
|
+
|
354
|
+
The hash matcher
|
355
|
+
|
356
|
+
<example 3 -- Hash matcher>
|
357
|
+
+{:foo=>:bar,
|
358
|
+
1=>/flux/,
|
359
|
+
-:times=>"zork",
|
360
|
+
/^[rs]/=>item_that.reverse,
|
361
|
+
OB=>Integer
|
362
|
+
}
|
363
|
+
|
364
|
+
#equivalent to:
|
365
|
+
x[:foo]==:bar and /flux/===x[1] and
|
366
|
+
(x.keys-[:foo,1]).each{|k|
|
367
|
+
k.respond_to?(:times) and
|
368
|
+
x[k]=="zork"||break or
|
369
|
+
/^[rs]/===k and
|
370
|
+
x[k].reverse||break or
|
371
|
+
Integer===x[k]
|
372
|
+
} rescue false
|
373
|
+
|
374
|
+
#Matches:
|
375
|
+
{ :foo=>:bar, 1=>"flux cap", 3=>"zork",
|
376
|
+
"rat"=>"long string", String=>4**99 }
|
377
|
+
|
378
|
+
#Doesn't match:
|
379
|
+
{ :foo=>:bar, 1=>"flux", 2=>"zork", "r"=>"a string",
|
380
|
+
:rest=>3**99, <red>:fibble=>:foomp</red> }
|
381
|
+
{:foo=><red>:baz</red>, 1=>"flux", 2=>"zork",
|
382
|
+
"r"=>"a string", :rest=>3**99}
|
383
|
+
{:foo=>:bar, 2=>"zork", "r"=>"a string", :rest=>3**99
|
384
|
+
<red>#no entry with key 1</red>
|
385
|
+
}
|
386
|
+
</example>
|
387
|
+
|
388
|
+
In the examples of non-matching objects above, the part of the object
|
389
|
+
that caused the match to fail is colored red.
|
390
|
+
|
391
|
+
Hash#+@ (unary plus) turns any hash into a Reg::Hash. All keys and values
|
392
|
+
in a Reg::Hash are interpreted as matchers. Each key-value pair acts like
|
393
|
+
a filter on potential matching hashes. Every key-value pair in the data
|
394
|
+
must match some key-value filter in the hash matcher. Furthermore, each
|
395
|
+
filter in the hash matcher must have matched something in the hash being
|
396
|
+
tested.
|
397
|
+
|
398
|
+
The filters are prioritized into 3 groups based on the
|
399
|
+
type of key matcher. Each key-value pair in the data is
|
400
|
+
tried against the filters in the matcher in the following order.
|
401
|
+
First, filters with uninteresting key matchers (those that match only themselves) are tried.
|
402
|
+
Then filters with interesting key matchers are tried.
|
403
|
+
Finally, the catchall (with a key matcher of OB) is given a final chance
|
404
|
+
to match.
|
405
|
+
|
406
|
+
Filters are mandatory; if a filter is present in the matcher, it must match something in the data.
|
407
|
+
You can, however, make a filter optional by appending '|nil' to the value
|
408
|
+
matcher. (Assuming the default for the hash being tested is nil.)
|
409
|
+
(Unfortunately, the bug in the | operator which prevents it from ever matching nil prevents you from being able to make filters optional this way.)
|
410
|
+
|
411
|
+
The object matcher
|
412
|
+
|
413
|
+
Just as in the data model, objects behave much like hashes (with instance
|
414
|
+
variable/attribute names being the keys), object matchers behave much like
|
415
|
+
hash matchers. Matchers may be used on the key side of the filters, but
|
416
|
+
since object 'keys' are always strings (names of instance variables and
|
417
|
+
methods), the key matchers must match strings... usually they are strings
|
418
|
+
or regular expressions. Symbols in object key matchers are auto-converted
|
419
|
+
into strings. Unlike the hash matcher, every key (instance var/attribute
|
420
|
+
name) in the object to be matched does not have to be accounted for.
|
421
|
+
However, every filter in the object matcher must match something.
|
422
|
+
|
423
|
+
Currently, any methods called must take an empty parameter list.
|
424
|
+
Once again: beware of side effects when calling methods.
|
425
|
+
|
426
|
+
<example 4 -- an object matcher>
|
427
|
+
-{:f=>1, /^[gh]+$/=>3..4, :@v=>/=[a-z]+$/}
|
428
|
+
|
429
|
+
#Given:
|
430
|
+
class Example
|
431
|
+
attr_reader *%w{f g h v}
|
432
|
+
def initialize(f,g,h,v)
|
433
|
+
@f,@g,@h,@v=f,g,h,v
|
434
|
+
end
|
435
|
+
end
|
436
|
+
|
437
|
+
#Matches:
|
438
|
+
Example.new(1,3,4,"foo=bar")
|
439
|
+
Example.new(1,4,3,"foo=bar")
|
440
|
+
|
441
|
+
#Doesn't match:
|
442
|
+
Example.new(<red>2</red>,3,4,"foo=bar")
|
443
|
+
Example.new(1,<red>33</red>,4,"foo=bar")
|
444
|
+
Example.new(1,3,<red>44</red>,"foo=bar")
|
445
|
+
Example.new(1,3,4,<red>"foo=BAR"</red>)
|
446
|
+
</example>
|
447
|
+
|
448
|
+
|
449
|
+
Sequence matching -- Regexp and Reg
|
450
|
+
|
451
|
+
So far, all the patterns I've presented match exactly one item at a time.
|
452
|
+
Let's meet the matchers that can match more (or less) than one item at
|
453
|
+
once. The most important of these is Reg::Array, which matches a sequence
|
454
|
+
of ruby objects in an array much like Regexp matches a sequence of
|
455
|
+
characters in a string.
|
456
|
+
|
457
|
+
[[maybe this table should be moved up nearer the top??]]
|
458
|
+
<table 7 -- regex-reg equivalence table>
|
459
|
+
Description | Regexp | Reg |
|
460
|
+
===================+=========+===========+
|
461
|
+
sequence | /re/ | +[r] |
|
462
|
+
subsequence | (re) | -[r] |
|
463
|
+
hash matcher | n/a | +{r1=>r2} |
|
464
|
+
object matcher | n/a | -{r1=>r2} |
|
465
|
+
literal | \re | r.lit |
|
466
|
+
dynamic inclusion | #{re} | regproc{r}|
|
467
|
+
alternation (or) | re1|re2 | r1|r2 |
|
468
|
+
conjunction (and) | n/a | r1&r2 |
|
469
|
+
xor | n/a | r1^r2 |
|
470
|
+
negation | [^re] | ~r |
|
471
|
+
any number of | re* | r.* |
|
472
|
+
at least 1 | re+ | r.+ |
|
473
|
+
optional | re? | r.- |
|
474
|
+
exactly n of | re{n} | r*n |
|
475
|
+
n to m of | re{n,m} | r*(n..m) |
|
476
|
+
at most n of | re{,n} | r-n |
|
477
|
+
at least m of | re{m,} | r+m |
|
478
|
+
1 item | . | OB |
|
479
|
+
0 or more items | .* | OBS |
|
480
|
+
capture | (re) | :a<<r |
|
481
|
+
backreference | \1 | BR[:a] |
|
482
|
+
</table>
|
483
|
+
|
484
|
+
Simple Sequences
|
485
|
+
|
486
|
+
Each item in the Reg::Array tries to match the item (or items) at the same
|
487
|
+
relative point in the Array to be matched. Each item in the pattern can
|
488
|
+
match more (or less) than one item in data. Subsequences are a good
|
489
|
+
example of patterns that match more than one item.
|
490
|
+
|
491
|
+
Reg::Array is created by applying the unary plus operator to an Array, like so:
|
492
|
+
|
493
|
+
<example 5 -- sequence matching>
|
494
|
+
+[1,/^a/,item_that.size]===x
|
495
|
+
|
496
|
+
#equivalent to:
|
497
|
+
#(x.size==3 and x[0]==1 and /^a/===x[1] and x[2].size) rescue false
|
498
|
+
|
499
|
+
Matches:
|
500
|
+
[1,"a","b"]
|
501
|
+
[1,"aardvark",[]]
|
502
|
+
|
503
|
+
Doesn't match:
|
504
|
+
[1,"a","b",<red>:foo</red>]
|
505
|
+
[<red>0</red>,1,"a","b"]
|
506
|
+
[<red>2</red>,"a","b"]
|
507
|
+
[1,<red>""</red>,"b"]
|
508
|
+
[1,"a",<red>:b</red>]
|
509
|
+
[1,"a"] <red>#no 3rd element</red>
|
510
|
+
[1,<red>[</red>"a","b"<red>]</red>]
|
511
|
+
</example>
|
512
|
+
|
513
|
+
Because each pattern within this Reg::Array is a scalar matcher, this
|
514
|
+
pattern matches arrays of exactly 3 items, where the first is the number
|
515
|
+
1, the second is a string beginning with 'a', and the third is an object
|
516
|
+
that responds to #size and has a #size that is not nil (or false).
|
517
|
+
|
518
|
+
Subsequences
|
519
|
+
|
520
|
+
Reg subsequences can be thought of as roughly equivalent to parenthesized
|
521
|
+
expressions in Regexp. They contain a series of matchers, which must all
|
522
|
+
match in order at the place where the subsequence occurs in the enclosing
|
523
|
+
Reg::Array. Subsequences must always be contained within a Reg::Array; they
|
524
|
+
cannot be used on their own. However, they need not be directly within the
|
525
|
+
Reg::Array; usually, they will be inside a Reg::Repeat, some kind of
|
526
|
+
Reg::Logical expression, or another Subsequence.
|
527
|
+
|
528
|
+
Here is another array matcher; this one contains 2 patterns. The first
|
529
|
+
matches the number 1, the second is a subsequence, which itself contains 2
|
530
|
+
more patterns. The totality is exactly equivalent to the previous example.
|
531
|
+
|
532
|
+
<example 6 -- subsequence matching>
|
533
|
+
+[1,-[/^a/,item_that.size]]
|
534
|
+
|
535
|
+
#equivalent to:
|
536
|
+
#(x.size==3 and x[0]==1 and /^a/===x[1] and x[2].size) rescue false
|
537
|
+
|
538
|
+
|
539
|
+
#Matches:
|
540
|
+
[1,"a","b"]
|
541
|
+
[1,"aardvark",[]]
|
542
|
+
|
543
|
+
#Doesn't match:
|
544
|
+
[1,"a","b",<red>:foo</red>]
|
545
|
+
[<red>0</red>,1,"a","b"]
|
546
|
+
[<red>2</red>,"a","b"]
|
547
|
+
[1,<red>""</red>,"b"]
|
548
|
+
[1,"a",<red>:b</red>]
|
549
|
+
[1,"a"] <red>#no 3rd element</red>
|
550
|
+
[1,<red>[</red>"a","b"<red>]</red>]
|
551
|
+
</example>
|
552
|
+
|
553
|
+
|
554
|
+
Nested Sequences
|
555
|
+
|
556
|
+
So is Reg::Array a scalar or vector matcher?
|
557
|
+
|
558
|
+
<example 7 -- matching an array containing an array>
|
559
|
+
+[1,+[/^a/,item_that.size]]===x
|
560
|
+
|
561
|
+
#equivalent to:
|
562
|
+
#(x.size==2 and x[0]==1 and
|
563
|
+
# x[1].size==2 and /^a/===x[1][0] and x[1][1].size) rescue false
|
564
|
+
|
565
|
+
#Matches:
|
566
|
+
[1,["a","b"]]
|
567
|
+
[1,["aardvark",[]]]
|
568
|
+
|
569
|
+
#Doesn't match:
|
570
|
+
[1,"a","b"]
|
571
|
+
[1,"aardvark",[]]
|
572
|
+
</example>
|
573
|
+
|
574
|
+
This example helps illustrate the
|
575
|
+
paradox about arrays and array matchers. Is an
|
576
|
+
array a vector, which contains a sequence of items, or a scalar, a single
|
577
|
+
item that can be contained in a vector? Likewise, is an array matcher a
|
578
|
+
vector, which matches a list of
|
579
|
+
scalars, or is it a scalar, which
|
580
|
+
matches a single item?
|
581
|
+
|
582
|
+
The answer is that it is both; it acts as a scalar within the expression
|
583
|
+
that contains it. But Reg::Array sets up a new (vector) context for
|
584
|
+
matching to occur in.
|
585
|
+
|
586
|
+
Repetitions
|
587
|
+
|
588
|
+
Repetitions allow you to match the same pattern multiple times within a
|
589
|
+
sequence. The number of times to match can be constant (please match this
|
590
|
+
pattern 5 times) or can vary (at least 5 times) (at most 5 times) (between
|
591
|
+
5 and 10 times). The +, -, and * operators create repetitions. These work
|
592
|
+
similarly to the +, ?, and * operators of Regexp.
|
593
|
+
|
594
|
+
<example 8>
|
595
|
+
#Repetition:
|
596
|
+
+[1,(2..98)*5,99] #exactly 5
|
597
|
+
+[1,(2..98)+5,99] #at least 5
|
598
|
+
+[1,(2..98)-5,99] #at most 5
|
599
|
+
+[1,(2..98)*(5..10),99] #between 5 and 10
|
600
|
+
+[1,(2..98).*,99] #any number
|
601
|
+
+[1,(2..98).+,99] #at least 1
|
602
|
+
+[1,(2..98).-,99] #at most 1
|
603
|
+
|
604
|
+
</example>
|
605
|
+
|
606
|
+
The pattern repeated can also be a subsequence or other vector matcher,
|
607
|
+
allowing more than one item to be matched on each loop pass.
|
608
|
+
|
609
|
+
Repetitions and subsequences only make sense within an array pattern, though
|
610
|
+
they need not be directly within the array. (They can be nested inside
|
611
|
+
another repetition or subsequence, for instance.)
|
612
|
+
|
613
|
+
The number of times to repeat is determined by the number (or range) to
|
614
|
+
the right of the repetition operator. If no number is given, they default
|
615
|
+
to the count that will allow them to work like the corresponding Regexp
|
616
|
+
repetition operator (with - standing in for ?). +, *, and - each take a
|
617
|
+
sensible default argument, as illustrated here. Now who said ruby doesn't
|
618
|
+
have unary postfix operators? Note: the dot is required when using +, *,
|
619
|
+
and - as postfix operators.
|
620
|
+
|
621
|
+
Backtracking
|
622
|
+
<example 9 -- Regexp Backtracking>
|
623
|
+
|
624
|
+
/<yellow>foo</yellow><red>.*</red><green>bar</green>/==="foo some random stuff bar"
|
625
|
+
|
626
|
+
"<yellow>foo</yellow><red> some random stuff bar</red>"
|
627
|
+
"<yellow>foo</yellow><red> some random stuff ba</red>r"
|
628
|
+
"<yellow>foo</yellow><red> some random stuff b</red>ar"
|
629
|
+
"<yellow>foo</yellow><red> some random stuff </red><green>bar</green>"
|
630
|
+
</example>
|
631
|
+
|
632
|
+
About the regexp: clearly, this matches strings that begin with foo,
|
633
|
+
end with bar, and have anything else in between. However, there's a
|
634
|
+
little extra magic going on under the surface that you may not be aware
|
635
|
+
of. There are basically 3 sub-expressions here, /foo/, /bar/, and /.*/.
|
636
|
+
The latter is the interesting one.
|
637
|
+
|
638
|
+
It's a repetition operator, one of the types that can cause backtracking.
|
639
|
+
Each sub-pattern matches sequentially, so in this case first /foo/
|
640
|
+
matches, then /.*/ matches everything up until the end of the string,
|
641
|
+
_including_the_"bar"_. Then there's nothing left for /bar/ to match, so
|
642
|
+
the /bar/ sub-expression fails. This does not necessarily cause the whole
|
643
|
+
expression to fail; instead, the regexp goes back to previous
|
644
|
+
sub-expressions to see if they have a different way to match that will
|
645
|
+
allow /bar/ to match. /.*/ can match any number of chars, so it gives up
|
646
|
+
the last char it matched, "r". So then the regexp goes on to try to match
|
647
|
+
/bar/ again, but /bar/ still won't match just "r". This process
|
648
|
+
continues twice more until /.*/ has given up the whole of "bar", which the
|
649
|
+
pattern /bar/ can match (finally) and the pattern as a whole can succeed.
|
650
|
+
|
651
|
+
Regexp's | operator can also cause backtracking.
|
652
|
+
|
653
|
+
<example 10 -- Alternation causes backtracking>
|
654
|
+
#alternation can also cause backtracking
|
655
|
+
/<green>eft</green>(<yellow>foobar</yellow>|<red>foo</red>)<blue>bar</blue>/==="eftfoobar"
|
656
|
+
"<green>eft</green><yellow>foobar</yellow>" #first foobar matches, leaving nothing for final bar
|
657
|
+
"<green>eft</green><red>foo</red><blue>bar</blue>" #backtracks to match just foo, letting final bar match
|
658
|
+
</example>
|
659
|
+
|
660
|
+
Here is a Reg expression that uses backtracking:
|
661
|
+
<example 11 -- Backtracking in a Reg matcher>
|
662
|
+
+[1,(2..99).*,99]===x
|
663
|
+
#equivalent to: (with backtracking optimized away)
|
664
|
+
# (x[0]==1 and x[1...-1].all?{|y| (2..99)===y } and x[-1]==99)
|
665
|
+
|
666
|
+
+[<red>1</red>,<yellow>(2..99).*</yellow>,<green>99</green>]===[1,50,99]
|
667
|
+
|
668
|
+
[<red>1</red>,<yellow>50,99</yellow>]
|
669
|
+
[<red>1</red>,<yellow>50</yellow>,<green>99</green>]
|
670
|
+
</example>
|
671
|
+
|
672
|
+
Vector Logicals
|
673
|
+
|
674
|
+
Recall the logical operators we met some time ago? Well, the arguments to
|
675
|
+
them need not be strictly scalar (== matching only one item) as I showed
|
676
|
+
before. The arguments can be vector patterns (matching more or less than 1
|
677
|
+
item), such as a subsequence or repetition, as long as the entire
|
678
|
+
expression is ultimately contained in an array matcher.
|
679
|
+
|
680
|
+
Using the or operator within an array matcher is another way to make
|
681
|
+
backtracking happen, especially if its arguments are vectors.
|
682
|
+
|
683
|
+
When matchers of differing lengths are ored together, the resulting
|
684
|
+
matcher matches whatever was matched by
|
685
|
+
the first sub-expression that happens to match.
|
686
|
+
|
687
|
+
When matchers of differing lengths are anded together, the resulting
|
688
|
+
matcher matches whatever the longest subexpression matched.
|
689
|
+
|
690
|
+
Negation of a scalar pattern is always scalar (still matches just 1 item),
|
691
|
+
but negation of a non-scalar is automatically a lookahead.
|
692
|
+
|
693
|
+
<example 12 -- Vector logic>
|
694
|
+
+[-[1,2,3] | /dd/*(2..8) | :foo]
|
695
|
+
|
696
|
+
#matches
|
697
|
+
[1,2,3]
|
698
|
+
["adduced", "udder"]
|
699
|
+
[:foo]
|
700
|
+
|
701
|
+
|
702
|
+
|
703
|
+
+[-[/a/,/b/,/c/] & (item_that.size < 4) ]
|
704
|
+
|
705
|
+
#Matches:
|
706
|
+
["al", "robert", "chuck"]
|
707
|
+
|
708
|
+
# Doesn't match:
|
709
|
+
["albert", "robert", "chuck"]
|
710
|
+
</example>
|
711
|
+
|
712
|
+
Recursive patterns
|
713
|
+
|
714
|
+
Occasionally, it is necessary to be able to have patterns contain
|
715
|
+
themselves, in order to be able to match (for instance)
|
716
|
+
a parenthesized list which
|
717
|
+
can contain another parenthesized list or to match an array that
|
718
|
+
can contain
|
719
|
+
another array of the same type.
|
720
|
+
|
721
|
+
Suppose you want to match a tree. How would you do it? Let's suppose nodes
|
722
|
+
in our tree are 3-element arrays, the first and third elements of which
|
723
|
+
are the left and right sub-trees, respectively. (Or nil if no sub-tree is
|
724
|
+
present.) The middle element is an integer representing the value of this
|
725
|
+
node. The code to match such a tree would look like this:
|
726
|
+
|
727
|
+
<example 13 -- recursive matchers>
|
728
|
+
tree=Reg.const
|
729
|
+
tree.set! +[tree|nil, Integer, tree|nil]
|
730
|
+
|
731
|
+
#equivalent to:
|
732
|
+
#def treematch(x)
|
733
|
+
# x.size===3 and
|
734
|
+
# x[0]==nil || treematch(x[0]) and
|
735
|
+
# Integer===x[1] and
|
736
|
+
# x[2].nil? || treematch(x[2])
|
737
|
+
#end
|
738
|
+
</example>
|
739
|
+
|
740
|
+
(Unfortunately, the |nil bug in
|
741
|
+
the 0.4.5 release breaks this particular
|
742
|
+
usage as well.)
|
743
|
+
|
744
|
+
This syntax is somewhat clumsy. I apologize; it's the best that
|
745
|
+
I have come up with so far.
|
746
|
+
|
747
|
+
OBS and unanchoring
|
748
|
+
|
749
|
+
I haven't talked explicitly about this, but unlike Regexp, Reg::Array is
|
750
|
+
implicitly anchored on both ends. So, instead of putting special symbols
|
751
|
+
at the edges of an array pattern to anchor it (^ and $ in Regexp), you
|
752
|
+
must put special stuff in if you want it to _not_ be anchored. The special
|
753
|
+
pattern OBS represents 0 or more of any item. To be strictly equivalent to
|
754
|
+
Regexp, you need to use OBS.l. (The 'l' operator makes OBS (or any
|
755
|
+
pattern) lazy. Not working in 0.4.5.)
|
756
|
+
|
757
|
+
<listing 5 -- unanchored matching>
|
758
|
+
OBS=OB.*
|
759
|
+
|
760
|
+
+[1,2,3]===x
|
761
|
+
#equivalent to:
|
762
|
+
#x==[1,2,3]
|
763
|
+
|
764
|
+
+[OBS,1,2,3]===x
|
765
|
+
#equivalent to:
|
766
|
+
#x[-1]==3 and x[-2]==2 and x[-3]==1
|
767
|
+
|
768
|
+
+[OBS,1,2,3,OBS]===x
|
769
|
+
#equivalent to:
|
770
|
+
#x.size.-(3).downto(0) {|i|
|
771
|
+
# break(true) if x[i]==1 and x[i+1]==2 and x[i+2]==3
|
772
|
+
#}
|
773
|
+
|
774
|
+
+[OBS.l,1,2,3,OBS.l]===x
|
775
|
+
#equivalent to:
|
776
|
+
#x.each_with_index{|v,i|
|
777
|
+
# break(true) if v==1 and x[i+1]==2 and x[i+2]==3
|
778
|
+
#}
|
779
|
+
|
780
|
+
+[1,2,3,OBS,4,5,6,OBS,7,8,9]===x
|
781
|
+
#equivalent to:
|
782
|
+
#x[0]==1 and x[1]==2 and x[2]==3 and
|
783
|
+
#x[-1]==9 and x[-2]==8 and x[-3]==7 and
|
784
|
+
#x.size.-(6).downto(3){|i|
|
785
|
+
# break(true) if x[i]==4 and x[i+1]==5 and x[i+2]==6
|
786
|
+
#}
|
787
|
+
</listing>
|
788
|
+
|
789
|
+
Captures and Backreferences
|
790
|
+
|
791
|
+
[[DOESN'T WORK YET]]
|
792
|
+
A backreference allows you to match
|
793
|
+
repeated data. First, you must capture the data that will be repeated
|
794
|
+
using Symbol#<<; then you make a backreference to it at a subsequent
|
795
|
+
point in the larger match.
|
796
|
+
|
797
|
+
Unlike Regexp, in Reg, backreferenced items are always referred to by
|
798
|
+
name instead of number. In Regexp, parentheses capture a value. In Reg, Symbol#<< is the capture operator.
|
799
|
+
|
800
|
+
<example 14 -- backreferences>
|
801
|
+
+[:a<<OB, BR[:a]]
|
802
|
+
#equivalent to:
|
803
|
+
#x.size==2 and (a=x[0]).is_a? Object and x[1]==a
|
804
|
+
</example>
|
805
|
+
|
806
|
+
Suppose you want to match arrays containing exactly two of the same item. Here's how you'd do it. The first matcher matches any item and captures
|
807
|
+
it into the 'variable' named :a. The second matcher is a backreference to
|
808
|
+
the captured item in :a.
|
809
|
+
|
810
|
+
Substitutions
|
811
|
+
|
812
|
+
So far, everything I've introduced only allows you to make queries on
|
813
|
+
your data; let's look at substitutions, which allow you to change
|
814
|
+
(part of) the data once it has been found to match.
|
815
|
+
[[DOESN'T WORK YET]]
|
816
|
+
|
817
|
+
<example 15 -- Substitutions>
|
818
|
+
+[String>>1, OBS]
|
819
|
+
|
820
|
+
#x.size>=1 and String===x[0] and x[0]=1
|
821
|
+
</example>
|
822
|
+
|
823
|
+
This example would match arrays beginning with a string,
|
824
|
+
replacing that string with the number 1.
|
825
|
+
|
826
|
+
=== never changes matched data, even if the matcher has a
|
827
|
+
substitution in it, so #match! must be used instead.
|
828
|
+
|
829
|
+
References:
|
830
|
+
reg rubyforge project
|
831
|
+
blankslate and deferred by jim weirich
|
832
|
+
http://onestepback.org/index.cgi/Tech/Ruby/SlowingDownCalculations.rdoc
|
833
|
+
grammar and syntax by eric mahurin
|
834
|
+
some other parser proj by ???
|
835
|
+
rparsec
|
836
|
+
spirit from boost c++ library
|
837
|
+
mauricio's blog post
|
838
|
+
gema
|