regexp_optimized_union 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/regexp_optimized_union.rb +150 -0
  2. metadata +53 -0
@@ -0,0 +1,150 @@
1
class Regexp
  # Trie used to build the source of an optimized union regexp.
  #
  # Each key is a single regexp-escaped character and each value is the child
  # OptimizeTrie reached through that character.
  class OptimizeTrie < Hash
    # parent     - node that holds this node as a value (nil for the root)
    # opt_maybe  - true when a word ends at this node, so whatever follows is optional
    # opt_suffix - true when the children should be rendered with their shared
    #              suffix factored out (see #extract_common_suffix)
    attr_accessor :parent, :opt_maybe, :opt_suffix

    # Stores child +v+ under key +k+ and records this node as the child's parent.
    def []=(k, v)
      super(k, v)
      v.parent = self
    end

    # True when the subtree is a plain chain: no forks and no word ending
    # before the final leaf.
    def single_branch?
      empty? || (size == 1 && !opt_maybe && values[0].single_branch?)
    end

    # True when this node has exactly one child and that child is a leaf.
    def single_char?
      size == 1 && values[0].empty?
    end

    # prereq: single_branch?
    # returns: the keys along the chain, from this node down to the leaf
    def to_chars
      return [] if empty?

      [keys[0], *values[0].to_chars]
    end

    # prereq: opt_suffix
    # returns: regexp src with the suffix shared by all branches factored out,
    #          or nil when the branches share no suffix
    def extract_common_suffix
      # Work on reversed branches so that a common suffix becomes a common prefix.
      branches = map { |key, value| [key, *value.to_chars].reverse }
      max_common_size = branches.map(&:size).min

      # Longest length at which every reversed branch starts with the same keys.
      common_size = max_common_size.downto(1).find do |i|
        head = branches[0].take(i)
        branches.all? { |b| b.take(i) == head }
      end
      return unless common_size

      common = branches[0].take(common_size).reverse.join
      if branches.all? { |b| b.size == common_size + 1 }
        # Every branch differs by exactly one leading key: use a character class.
        "[#{branches.map(&:last).join}]#{common}"
      else
        diff = branches.map { |b| b.drop(common_size).reverse.join }.join('|')
        "(?:#{diff})#{common}"
      end
    end

    # returns: regexp src matching every word stored below this node
    def to_re_src
      return '' if empty?

      res = extract_common_suffix if opt_suffix
      can_be_branched = false
      unless res
        # Plain alternation of the children; it may need grouping below.
        can_be_branched = true
        res = map { |key, value| "#{key}#{value.to_re_src}" }.join('|')
      end

      if opt_maybe
        single_char? ? "#{res}?" : "(?:#{res})?"
      elsif can_be_branched && size > 1 && parent
        # A nested alternation must be grouped so it does not leak into the
        # parent's alternation.
        "(?:#{res})"
      else
        res
      end
    end
  end

  # Builds a regexp matching any of the given words, sharing common prefixes
  # and, where possible, common suffixes.
  #
  # a    - enumerable of strings; empty strings are skipped
  # opts - options passed through to Regexp.new (default nil)
  #
  # returns: Regexp
  def self.optimized_union(a, opts = nil)
    trie = OptimizeTrie.new
    # Nodes are Hash subclasses, so a plain Hash would compare them by content
    # and treat every empty leaf as the same key. Compare by identity so that
    # each terminal node is remembered separately.
    term_nodes = {}.compare_by_identity

    # build trie
    a.each do |s|
      next if s.empty?

      t = trie
      s.each_char do |c|
        c = Regexp.escape c
        t[c] = OptimizeTrie.new unless t[c]
        t = t[c]
      end
      term_nodes[t] = true
      t.opt_maybe = true
    end

    # tag opt_suffix nodes: from every leaf walk up to the nearest fork and tag
    # it when all of its children are plain chains
    term_nodes.each_key do |leaf|
      next unless leaf.empty?

      node = leaf
      while (node = node.parent) && !node.opt_suffix && !node.opt_maybe
        if node.size > 1
          node.opt_suffix = true if node.values.all?(&:single_branch?)
          break
        end
      end
    end

    Regexp.new trie.to_re_src, opts
  end
end
126
+
127
# Self-test, executed only when this file is run directly.
if __FILE__ == $PROGRAM_NAME
  # word list => regexp expected from Regexp.optimized_union
  {
    %w[] => //,
    %w[foo] => /foo/,
    %w[foo bar] => /foo|bar/,
    %w[foo foob bar] => /foob?|bar/,
    %w[foo foobar] => /foo(?:bar)?/,
    %w[bazfoo bazfoobar bazbar] => /baz(?:foo(?:bar)?|bar)/,
    %w[fooabar foobbar] => /foo[ab]bar/,
    %w[fooabar foobazbar] => /foo(?:a|baz)bar/,
    %w[foobar fooabar foogabar] => /foo(?:|a|ga)bar/
  }.each do |a, r|
    l = Regexp.optimized_union a
    a.each do |s|
      m = l.match(s)
      # A nil match (no match at all) is reported with the same descriptive
      # error instead of crashing on nil.offset.
      if m.nil? || m.offset(0) != [0, s.size]
        raise "#{l.inspect} from #{a.inspect} not match #{s.inspect}"
      end
    end
    if r != l
      raise "expected #{r} from #{a.inspect} but got #{l}"
    end
  end
  puts 'test success!'
end
metadata ADDED
@@ -0,0 +1,53 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: regexp_optimized_union
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - luikore
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-11-07 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: ! 'Regexp.optimized_union(word_list, regexp_options) generates optimized
15
+ regexp for matching union of word list.
16
+
17
+ Optimizations include: treed common prefix extraction, common suffix aggregation and
18
+ optional leaf to ?.
19
+
20
+ Mostly the same as described in http://search.cpan.org/~dankogai/Regexp-Optimizer-0.15/lib/Regexp/List.pm#IMPLEMENTATION'
21
+ email:
22
+ executables: []
23
+ extensions: []
24
+ extra_rdoc_files: []
25
+ files:
26
+ - lib/regexp_optimized_union.rb
27
+ homepage: https://github.com/luikore/regexp_optimized_union
28
+ licenses:
29
+ - WTFPL
30
+ post_install_message:
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ required_rubygems_version: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: 1.3.6
46
+ requirements: []
47
+ rubyforge_project:
48
+ rubygems_version: 1.8.24
49
+ signing_key:
50
+ specification_version: 3
51
+ summary: Regexp.optimized_union(word_list, regexp_options) generates optimized regexp
52
+ for matching union of word list
53
+ test_files: []