regexp_optimized_union 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. data/lib/regexp_optimized_union.rb +150 -0
  2. metadata +53 -0
@@ -0,0 +1,150 @@
1
# Adds Regexp.optimized_union, which builds a compact regexp matching any
# word of a list by sharing common prefixes (via a trie), folding common
# suffixes, and turning optional tails into `?` groups.
class Regexp
  # Trie node used for the optimization.
  #
  # Keys are regexp source fragments for one character each (already passed
  # through Regexp.escape by the builder); values are child OptimizeTrie nodes.
  #
  # Attributes:
  # * parent     - the node this node was stored into (nil for the root)
  # * opt_maybe  - true when a word ends at this node, so whatever follows
  #                the node is optional
  # * opt_suffix - true when the children of this node should be tried for
  #                common-suffix extraction
  class OptimizeTrie < Hash
    attr_accessor :parent, :opt_maybe, :opt_suffix

    # Stores child +v+ under key +k+ and records this node as its parent.
    def []=(k, v)
      super(k, v)
      v.parent = self
    end

    # True when the subtree below this node is a single chain of characters
    # with no word ending in the middle of the chain.
    def single_branch?
      empty? || (size == 1 && !opt_maybe && values[0].single_branch?)
    end

    # True when exactly one character follows this node and nothing follows
    # that character.
    def single_char?
      size == 1 && values[0].empty?
    end

    # prereq: single_branch?
    # returns: the chain of keys below this node, in order
    def to_chars
      return [] if empty?

      [keys[0], *values[0].to_chars]
    end

    # prereq: opt_suffix
    # returns: regexp src, or nil when the branches share no common suffix
    def extract_common_suffix
      # Work on reversed branches so a common suffix becomes a common prefix.
      branches = map { |key, value| [key, *value.to_chars].reverse }
      shortest = branches.map(&:size).min

      # Longest length (at least 1) at which every reversed branch agrees.
      common_size = shortest.downto(1).find do |i|
        head = branches[0].take(i)
        branches.all? { |b| b.take(i) == head }
      end
      return nil unless common_size

      common = branches[0].take(common_size).reverse.join
      if branches.all? { |b| b.size == common_size + 1 }
        # Every branch differs by exactly one leading character: use a
        # character class instead of an alternation.
        diff = branches.map(&:last).join
        "[#{diff}]#{common}"
      else
        diff = branches.map { |b| b.drop(common_size).reverse.join }.join('|')
        "(?:#{diff})#{common}"
      end
    end

    # returns: regexp src matching everything that may follow this node
    def to_re_src
      return '' if empty?

      res = extract_common_suffix if opt_suffix
      unless res
        # A plain alternation needs grouping when embedded in a parent.
        can_be_branched = true
        res = map { |key, value| "#{key}#{value.to_re_src}" }.join('|')
      end

      if opt_maybe
        single_char? ? "#{res}?" : "(?:#{res})?"
      elsif can_be_branched && size > 1 && parent
        "(?:#{res})"
      else
        res
      end
    end
  end

  # Builds a regexp that matches any of the given words.
  #
  # a    - an enumerable of strings; empty strings are skipped
  # opts - passed as the second argument to Regexp.new
  #
  # Returns a Regexp.
  def self.optimized_union(a, opts = nil)
    trie = OptimizeTrie.new
    # Nodes are Hash subclasses, so a plain Hash would compare them by
    # content and treat every empty leaf as the same key. Identity
    # comparison keeps each terminal node as a distinct entry.
    term_nodes = {}.compare_by_identity

    # build trie
    a.each do |s|
      next if s.empty?

      t = trie
      s.each_char do |c|
        c = Regexp.escape c
        t[c] = OptimizeTrie.new unless t[c]
        t = t[c]
      end
      term_nodes[t] = true
      t.opt_maybe = true
    end

    # tag opt_suffix nodes: from every leaf, climb to the nearest branching
    # ancestor and tag it when all of its children are plain chains
    term_nodes.each_key do |leaf|
      next unless leaf.empty?

      ancestor = leaf.parent
      while ancestor && !ancestor.opt_suffix && !ancestor.opt_maybe
        if ancestor.size > 1
          ancestor.opt_suffix = true if ancestor.values.all?(&:single_branch?)
          break
        end
        ancestor = ancestor.parent
      end
    end

    Regexp.new trie.to_re_src, opts
  end
end
126
+
127
# Self-test, run only when this file is executed directly.
# Each entry maps a word list to the regexp expected from
# Regexp.optimized_union; the generated regexp must also match every word
# of its list in full.
if __FILE__ == $PROGRAM_NAME
  {
    %w[] => //,
    %w[foo] => /foo/,
    %w[foo bar] => /foo|bar/,
    %w[foo foob bar] => /foob?|bar/,
    %w[foo foobar] => /foo(?:bar)?/,
    %w[bazfoo bazfoobar bazbar] => /baz(?:foo(?:bar)?|bar)/,
    %w[fooabar foobbar] => /foo[ab]bar/,
    %w[fooabar foobazbar] => /foo(?:a|baz)bar/,
    %w[foobar fooabar foogabar] => /foo(?:|a|ga)bar/
  }.each do |a, r|
    l = Regexp.optimized_union a
    a.each do |s|
      m = l.match(s)
      # A nil match is reported with the same diagnostic instead of
      # crashing with NoMethodError on nil.
      if m.nil? || m.offset(0) != [0, s.size]
        raise "#{l.inspect} from #{a.inspect} not match #{s.inspect}"
      end
    end
    if r != l
      raise "expected #{r} from #{a.inspect} but got #{l}"
    end
  end
  puts 'test success!'
end
metadata ADDED
@@ -0,0 +1,53 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: regexp_optimized_union
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - luikore
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-11-07 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: ! 'Regexp.optimized_union(word_list, regexp_options) generates optimized
15
+ regexp for matching union of word list.
16
+
17
+ Optimizations include: treed common prefix extraction, common suffix aggregation and
18
+ optional leaf to ?.
19
+
20
+ Mostly the same as described in http://search.cpan.org/~dankogai/Regexp-Optimizer-0.15/lib/Regexp/List.pm#IMPLEMENTATION'
21
+ email:
22
+ executables: []
23
+ extensions: []
24
+ extra_rdoc_files: []
25
+ files:
26
+ - lib/regexp_optimized_union.rb
27
+ homepage: https://github.com/luikore/regexp_optimized_union
28
+ licenses:
29
+ - WTFPL
30
+ post_install_message:
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ required_rubygems_version: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: 1.3.6
46
+ requirements: []
47
+ rubyforge_project:
48
+ rubygems_version: 1.8.24
49
+ signing_key:
50
+ specification_version: 3
51
+ summary: Regexp.optimized_union(word_list, regexp_options) generates optimized regexp
52
+ for matching union of word list
53
+ test_files: []