regexp_optimized_union 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/regexp_optimized_union.rb +76 -15
- data/readme.md +2 -1
- metadata +2 -2
@@ -1,3 +1,14 @@
|
|
1
|
+
if RUBY_VERSION < '1.9'
|
2
|
+
require 'enumerator'
|
3
|
+
class String
|
4
|
+
unless defined?(ord)
|
5
|
+
def ord
|
6
|
+
unpack('C').first
|
7
|
+
end
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
1
12
|
class Regexp
|
2
13
|
# trie for optimization
|
3
14
|
class OptimizeTrie < Hash
|
@@ -14,7 +25,7 @@ class Regexp
|
|
14
25
|
def single_char?
|
15
26
|
size == 1 and values[0].empty?
|
16
27
|
end
|
17
|
-
|
28
|
+
|
18
29
|
# prereq: single_branch?
|
19
30
|
def to_chars
|
20
31
|
if empty?
|
@@ -23,7 +34,7 @@ class Regexp
|
|
23
34
|
[keys[0], *values[0].to_chars]
|
24
35
|
end
|
25
36
|
end
|
26
|
-
|
37
|
+
|
27
38
|
# prereq: opt_suffix
|
28
39
|
# returns: regexp src
|
29
40
|
def extract_common_suffix
|
@@ -46,12 +57,12 @@ class Regexp
|
|
46
57
|
break
|
47
58
|
end
|
48
59
|
end
|
49
|
-
|
60
|
+
|
50
61
|
if common_size
|
51
62
|
common = branches[0].take(common_size).reverse.join
|
52
63
|
if branches.all?{|b| b.size == common_size + 1 }
|
53
|
-
diff = branches.map
|
54
|
-
"
|
64
|
+
diff = build_char_group(branches.map &:last)
|
65
|
+
"#{diff}#{common}"
|
55
66
|
else
|
56
67
|
diff = branches.map do |b|
|
57
68
|
b.drop(common_size).reverse.join
|
@@ -61,25 +72,67 @@ class Regexp
|
|
61
72
|
end
|
62
73
|
end
|
63
74
|
|
75
|
+
def build_char_group chars
|
76
|
+
return chars.first if chars.size == 1
|
77
|
+
|
78
|
+
if RUBY_VERSION < '1.9'
|
79
|
+
chars, mb_chars = chars.partition{|c| c.bytesize == 1}
|
80
|
+
else
|
81
|
+
mb_chars = []
|
82
|
+
end
|
83
|
+
|
84
|
+
chars = chars.map(&:ord)
|
85
|
+
chars.sort!
|
86
|
+
first_char = chars.shift
|
87
|
+
groups = [(first_char..first_char)]
|
88
|
+
chars.each do |c|
|
89
|
+
if c == groups.last.end + 1
|
90
|
+
groups[-1] = groups.last.begin..c
|
91
|
+
else
|
92
|
+
groups << (c..c)
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
groups.map! do |range|
|
97
|
+
# only apply range to >= 4 contiguous chars
|
98
|
+
if range.end >= range.begin + 3
|
99
|
+
"#{range.begin.chr}-#{range.end.chr}"
|
100
|
+
elsif range.end > range.begin
|
101
|
+
range.map(&:chr).join
|
102
|
+
else
|
103
|
+
range.begin.chr
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
"[#{groups.join}#{mb_chars.join}]"
|
108
|
+
end
|
109
|
+
|
64
110
|
def to_re_src
|
65
111
|
return '' if empty?
|
66
|
-
|
112
|
+
|
67
113
|
res = extract_common_suffix if opt_suffix
|
114
|
+
char_group = false
|
68
115
|
if !res
|
69
116
|
can_be_branched = true
|
70
|
-
|
117
|
+
branches = map do |key, value|
|
71
118
|
"#{key}#{value.to_re_src}"
|
72
|
-
end
|
119
|
+
end
|
120
|
+
if branches.all?{|b| b.bytesize == 1}
|
121
|
+
char_group = true
|
122
|
+
res = build_char_group branches
|
123
|
+
else
|
124
|
+
res = branches.join '|'
|
125
|
+
end
|
73
126
|
end
|
74
|
-
|
127
|
+
|
75
128
|
if opt_maybe
|
76
|
-
if single_char?
|
129
|
+
if char_group or single_char?
|
77
130
|
"#{res}?"
|
78
131
|
else
|
79
132
|
"(?:#{res})?"
|
80
133
|
end
|
81
134
|
else
|
82
|
-
if can_be_branched and size > 1 and parent
|
135
|
+
if can_be_branched and size > 1 and parent and !char_group
|
83
136
|
"(?:#{res})"
|
84
137
|
else
|
85
138
|
res
|
@@ -125,8 +178,11 @@ class Regexp
|
|
125
178
|
end
|
126
179
|
|
127
180
|
if __FILE__ == $PROGRAM_NAME
|
181
|
+
# NOTE test will fail under ruby 1.8.7 due to hash order, but results should be identical
|
182
|
+
success = true
|
128
183
|
{
|
129
184
|
%w[] => //,
|
185
|
+
%w[a b c d f] => /[a-df]/,
|
130
186
|
%w[foo] => /foo/,
|
131
187
|
%w[foo bar] => /foo|bar/,
|
132
188
|
%w[foo foob bar] => /foob?|bar/,
|
@@ -134,17 +190,22 @@ if __FILE__ == $PROGRAM_NAME
|
|
134
190
|
%w[bazfoo bazfoobar bazbar] => /baz(?:foo(?:bar)?|bar)/,
|
135
191
|
%w[fooabar foobbar] => /foo[ab]bar/,
|
136
192
|
%w[fooabar foobazbar] => /foo(?:a|baz)bar/,
|
137
|
-
%w[foobar fooabar foogabar] => /foo(?:|a|ga)bar/
|
193
|
+
%w[foobar fooabar foogabar] => /foo(?:|a|ga)bar/,
|
194
|
+
%w[vax vcx vbx vdx] => /v[a-d]x/,
|
195
|
+
%w[vax vcx vbx] => /v[abc]x/,
|
196
|
+
%w[xa xc xb x] => /x[abc]?/
|
138
197
|
}.each do |a, r|
|
139
198
|
l = Regexp.optimized_union a
|
140
199
|
a.each do |s|
|
141
200
|
if l.match(s).offset(0) != [0, s.size]
|
142
|
-
|
201
|
+
success = false
|
202
|
+
puts "#{l.inspect} from #{a.inspect} not match #{s.inspect}"
|
143
203
|
end
|
144
204
|
end
|
145
205
|
if r != l
|
146
|
-
|
206
|
+
success = false
|
207
|
+
puts "expected #{r} from #{a.inspect} but got #{l}"
|
147
208
|
end
|
148
209
|
end
|
149
|
-
puts 'test success!'
|
210
|
+
puts 'test success!' if success
|
150
211
|
end
|
data/readme.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
`Regexp.optimized_union(word_list, regexp_options)` generates optimized regexp for matching union of word list.
|
1
|
+
`Regexp.optimized_union(word_list, regexp_options)` generates optimized regexp for matching union of word list.
|
2
2
|
|
3
3
|
### Install:
|
4
4
|
|
@@ -24,6 +24,7 @@ Regexp.optimized_union(%w[foobar fooabar foogabar]) #=> /foo(?:|a|ga)bar/
|
|
24
24
|
|
25
25
|
- Treed common prefix extraction.
|
26
26
|
- Common suffix aggregation.
|
27
|
+
- If 4 or more contiguous chars exist in a char group, they are turned into char range.
|
27
28
|
- Optional leaf to `?`.
|
28
29
|
|
29
30
|
Mostly the same as described in http://search.cpan.org/~dankogai/Regexp-Optimizer-0.15/lib/Regexp/List.pm#IMPLEMENTATION
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: regexp_optimized_union
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-11-
|
12
|
+
date: 2012-11-14 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Regexp.optimized_union(word_list, regexp_options) generates optimized
|
15
15
|
regexp for matching union of word list
|