regexp_optimized_union 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/lib/regexp_optimized_union.rb +76 -15
  2. data/readme.md +2 -1
  3. metadata +2 -2
@@ -1,3 +1,14 @@
1
+ if RUBY_VERSION < '1.9'
2
+ require 'enumerator'
3
+ class String
4
+ unless defined?(ord)
5
+ def ord
6
+ unpack('C').first
7
+ end
8
+ end
9
+ end
10
+ end
11
+
1
12
  class Regexp
2
13
  # trie for optimization
3
14
  class OptimizeTrie < Hash
@@ -14,7 +25,7 @@ class Regexp
14
25
  def single_char?
15
26
  size == 1 and values[0].empty?
16
27
  end
17
-
28
+
18
29
  # prereq: single_branch?
19
30
  def to_chars
20
31
  if empty?
@@ -23,7 +34,7 @@ class Regexp
23
34
  [keys[0], *values[0].to_chars]
24
35
  end
25
36
  end
26
-
37
+
27
38
  # prereq: opt_suffix
28
39
  # returns: regexp src
29
40
  def extract_common_suffix
@@ -46,12 +57,12 @@ class Regexp
46
57
  break
47
58
  end
48
59
  end
49
-
60
+
50
61
  if common_size
51
62
  common = branches[0].take(common_size).reverse.join
52
63
  if branches.all?{|b| b.size == common_size + 1 }
53
- diff = branches.map(&:last).join
54
- "[#{diff}]#{common}"
64
+ diff = build_char_group(branches.map &:last)
65
+ "#{diff}#{common}"
55
66
  else
56
67
  diff = branches.map do |b|
57
68
  b.drop(common_size).reverse.join
@@ -61,25 +72,67 @@ class Regexp
61
72
  end
62
73
  end
63
74
 
75
+ def build_char_group chars
76
+ return chars.first if chars.size == 1
77
+
78
+ if RUBY_VERSION < '1.9'
79
+ chars, mb_chars = chars.partition{|c| c.bytesize == 1}
80
+ else
81
+ mb_chars = []
82
+ end
83
+
84
+ chars = chars.map(&:ord)
85
+ chars.sort!
86
+ first_char = chars.shift
87
+ groups = [(first_char..first_char)]
88
+ chars.each do |c|
89
+ if c == groups.last.end + 1
90
+ groups[-1] = groups.last.begin..c
91
+ else
92
+ groups << (c..c)
93
+ end
94
+ end
95
+
96
+ groups.map! do |range|
97
+ # only apply range to >= 4 contiguous chars
98
+ if range.end >= range.begin + 3
99
+ "#{range.begin.chr}-#{range.end.chr}"
100
+ elsif range.end > range.begin
101
+ range.map(&:chr).join
102
+ else
103
+ range.begin.chr
104
+ end
105
+ end
106
+
107
+ "[#{groups.join}#{mb_chars.join}]"
108
+ end
109
+
64
110
  def to_re_src
65
111
  return '' if empty?
66
-
112
+
67
113
  res = extract_common_suffix if opt_suffix
114
+ char_group = false
68
115
  if !res
69
116
  can_be_branched = true
70
- res = map do |key, value|
117
+ branches = map do |key, value|
71
118
  "#{key}#{value.to_re_src}"
72
- end.join '|'
119
+ end
120
+ if branches.all?{|b| b.bytesize == 1}
121
+ char_group = true
122
+ res = build_char_group branches
123
+ else
124
+ res = branches.join '|'
125
+ end
73
126
  end
74
-
127
+
75
128
  if opt_maybe
76
- if single_char?
129
+ if char_group or single_char?
77
130
  "#{res}?"
78
131
  else
79
132
  "(?:#{res})?"
80
133
  end
81
134
  else
82
- if can_be_branched and size > 1 and parent
135
+ if can_be_branched and size > 1 and parent and !char_group
83
136
  "(?:#{res})"
84
137
  else
85
138
  res
@@ -125,8 +178,11 @@ class Regexp
125
178
  end
126
179
 
127
180
  if __FILE__ == $PROGRAM_NAME
181
+ # NOTE test will fail under ruby 1.8.7 due to hash order, but results should be identical
182
+ success = true
128
183
  {
129
184
  %w[] => //,
185
+ %w[a b c d f] => /[a-df]/,
130
186
  %w[foo] => /foo/,
131
187
  %w[foo bar] => /foo|bar/,
132
188
  %w[foo foob bar] => /foob?|bar/,
@@ -134,17 +190,22 @@ if __FILE__ == $PROGRAM_NAME
134
190
  %w[bazfoo bazfoobar bazbar] => /baz(?:foo(?:bar)?|bar)/,
135
191
  %w[fooabar foobbar] => /foo[ab]bar/,
136
192
  %w[fooabar foobazbar] => /foo(?:a|baz)bar/,
137
- %w[foobar fooabar foogabar] => /foo(?:|a|ga)bar/
193
+ %w[foobar fooabar foogabar] => /foo(?:|a|ga)bar/,
194
+ %w[vax vcx vbx vdx] => /v[a-d]x/,
195
+ %w[vax vcx vbx] => /v[abc]x/,
196
+ %w[xa xc xb x] => /x[abc]?/
138
197
  }.each do |a, r|
139
198
  l = Regexp.optimized_union a
140
199
  a.each do |s|
141
200
  if l.match(s).offset(0) != [0, s.size]
142
- raise "#{l.inspect} from #{a.inspect} not match #{s.inspect}"
201
+ success = false
202
+ puts "#{l.inspect} from #{a.inspect} not match #{s.inspect}"
143
203
  end
144
204
  end
145
205
  if r != l
146
- raise "expected #{r} from #{a.inspect} but got #{l}"
206
+ success = false
207
+ puts "expected #{r} from #{a.inspect} but got #{l}"
147
208
  end
148
209
  end
149
- puts 'test success!'
210
+ puts 'test success!' if success
150
211
  end
data/readme.md CHANGED
@@ -1,4 +1,4 @@
1
- `Regexp.optimized_union(word_list, regexp_options)` generates optimized regexp for matching union of word list. Works similar to `Regexp.union`, but API receives Regexp compile option.
1
+ `Regexp.optimized_union(word_list, regexp_options)` generates an optimized regexp that matches the union of the words in a word list.
2
2
 
3
3
  ### Install:
4
4
 
@@ -24,6 +24,7 @@ Regexp.optimized_union(%w[foobar fooabar foogabar]) #=> /foo(?:|a|ga)bar/
24
24
 
25
25
  - Treed common prefix extraction.
26
26
  - Common suffix aggregation.
27
+ - If a char group contains 4 or more contiguous chars, they are collapsed into a char range.
27
28
  - Optional leaf to `?`.
28
29
 
29
30
  Mostly the same as described in http://search.cpan.org/~dankogai/Regexp-Optimizer-0.15/lib/Regexp/List.pm#IMPLEMENTATION
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: regexp_optimized_union
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-07 00:00:00.000000000 Z
12
+ date: 2012-11-14 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: Regexp.optimized_union(word_list, regexp_options) generates optimized
15
15
  regexp for matching union of word list