regexp_optimized_union 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/lib/regexp_optimized_union.rb +76 -15
  2. data/readme.md +2 -1
  3. metadata +2 -2
@@ -1,3 +1,14 @@
1
+ if RUBY_VERSION < '1.9'
2
+ require 'enumerator'
3
+ class String
4
+ unless defined?(ord)
5
+ def ord
6
+ unpack('C').first
7
+ end
8
+ end
9
+ end
10
+ end
11
+
1
12
  class Regexp
2
13
  # trie for optimization
3
14
  class OptimizeTrie < Hash
@@ -14,7 +25,7 @@ class Regexp
14
25
  def single_char?
15
26
  size == 1 and values[0].empty?
16
27
  end
17
-
28
+
18
29
  # prereq: single_branch?
19
30
  def to_chars
20
31
  if empty?
@@ -23,7 +34,7 @@ class Regexp
23
34
  [keys[0], *values[0].to_chars]
24
35
  end
25
36
  end
26
-
37
+
27
38
  # prereq: opt_suffix
28
39
  # returns: regexp src
29
40
  def extract_common_suffix
@@ -46,12 +57,12 @@ class Regexp
46
57
  break
47
58
  end
48
59
  end
49
-
60
+
50
61
  if common_size
51
62
  common = branches[0].take(common_size).reverse.join
52
63
  if branches.all?{|b| b.size == common_size + 1 }
53
- diff = branches.map(&:last).join
54
- "[#{diff}]#{common}"
64
+ diff = build_char_group(branches.map &:last)
65
+ "#{diff}#{common}"
55
66
  else
56
67
  diff = branches.map do |b|
57
68
  b.drop(common_size).reverse.join
@@ -61,25 +72,67 @@ class Regexp
61
72
  end
62
73
  end
63
74
 
75
+ def build_char_group chars
76
+ return chars.first if chars.size == 1
77
+
78
+ if RUBY_VERSION < '1.9'
79
+ chars, mb_chars = chars.partition{|c| c.bytesize == 1}
80
+ else
81
+ mb_chars = []
82
+ end
83
+
84
+ chars = chars.map(&:ord)
85
+ chars.sort!
86
+ first_char = chars.shift
87
+ groups = [(first_char..first_char)]
88
+ chars.each do |c|
89
+ if c == groups.last.end + 1
90
+ groups[-1] = groups.last.begin..c
91
+ else
92
+ groups << (c..c)
93
+ end
94
+ end
95
+
96
+ groups.map! do |range|
97
+ # only apply range to >= 4 contiguous chars
98
+ if range.end >= range.begin + 3
99
+ "#{range.begin.chr}-#{range.end.chr}"
100
+ elsif range.end > range.begin
101
+ range.map(&:chr).join
102
+ else
103
+ range.begin.chr
104
+ end
105
+ end
106
+
107
+ "[#{groups.join}#{mb_chars.join}]"
108
+ end
109
+
64
110
  def to_re_src
65
111
  return '' if empty?
66
-
112
+
67
113
  res = extract_common_suffix if opt_suffix
114
+ char_group = false
68
115
  if !res
69
116
  can_be_branched = true
70
- res = map do |key, value|
117
+ branches = map do |key, value|
71
118
  "#{key}#{value.to_re_src}"
72
- end.join '|'
119
+ end
120
+ if branches.all?{|b| b.bytesize == 1}
121
+ char_group = true
122
+ res = build_char_group branches
123
+ else
124
+ res = branches.join '|'
125
+ end
73
126
  end
74
-
127
+
75
128
  if opt_maybe
76
- if single_char?
129
+ if char_group or single_char?
77
130
  "#{res}?"
78
131
  else
79
132
  "(?:#{res})?"
80
133
  end
81
134
  else
82
- if can_be_branched and size > 1 and parent
135
+ if can_be_branched and size > 1 and parent and !char_group
83
136
  "(?:#{res})"
84
137
  else
85
138
  res
@@ -125,8 +178,11 @@ class Regexp
125
178
  end
126
179
 
127
180
  if __FILE__ == $PROGRAM_NAME
181
+ # NOTE test will fail under ruby 1.8.7 due to hash order, but results should be identical
182
+ success = true
128
183
  {
129
184
  %w[] => //,
185
+ %w[a b c d f] => /[a-df]/,
130
186
  %w[foo] => /foo/,
131
187
  %w[foo bar] => /foo|bar/,
132
188
  %w[foo foob bar] => /foob?|bar/,
@@ -134,17 +190,22 @@ if __FILE__ == $PROGRAM_NAME
134
190
  %w[bazfoo bazfoobar bazbar] => /baz(?:foo(?:bar)?|bar)/,
135
191
  %w[fooabar foobbar] => /foo[ab]bar/,
136
192
  %w[fooabar foobazbar] => /foo(?:a|baz)bar/,
137
- %w[foobar fooabar foogabar] => /foo(?:|a|ga)bar/
193
+ %w[foobar fooabar foogabar] => /foo(?:|a|ga)bar/,
194
+ %w[vax vcx vbx vdx] => /v[a-d]x/,
195
+ %w[vax vcx vbx] => /v[abc]x/,
196
+ %w[xa xc xb x] => /x[abc]?/
138
197
  }.each do |a, r|
139
198
  l = Regexp.optimized_union a
140
199
  a.each do |s|
141
200
  if l.match(s).offset(0) != [0, s.size]
142
- raise "#{l.inspect} from #{a.inspect} not match #{s.inspect}"
201
+ success = false
202
+ puts "#{l.inspect} from #{a.inspect} not match #{s.inspect}"
143
203
  end
144
204
  end
145
205
  if r != l
146
- raise "expected #{r} from #{a.inspect} but got #{l}"
206
+ success = false
207
+ puts "expected #{r} from #{a.inspect} but got #{l}"
147
208
  end
148
209
  end
149
- puts 'test success!'
210
+ puts 'test success!' if success
150
211
  end
data/readme.md CHANGED
@@ -1,4 +1,4 @@
1
- `Regexp.optimized_union(word_list, regexp_options)` generates optimized regexp for matching union of word list. Works similar to `Regexp.union`, but API receives Regexp compile option.
1
+ `Regexp.optimized_union(word_list, regexp_options)` generates optimized regexp for matching union of word list.
2
2
 
3
3
  ### Install:
4
4
 
@@ -24,6 +24,7 @@ Regexp.optimized_union(%w[foobar fooabar foogabar]) #=> /foo(?:|a|ga)bar/
24
24
 
25
25
  - Treed common prefix extraction.
26
26
  - Common suffix aggregation.
27
+ - If 4 or more contiguous chars exist in a char group, they are turned into char range.
27
28
  - Optional leaf to `?`.
28
29
 
29
30
  Mostly the same as described in http://search.cpan.org/~dankogai/Regexp-Optimizer-0.15/lib/Regexp/List.pm#IMPLEMENTATION
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: regexp_optimized_union
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-07 00:00:00.000000000 Z
12
+ date: 2012-11-14 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: Regexp.optimized_union(word_list, regexp_options) generates optimized
15
15
  regexp for matching union of word list