regexp_optimized_union 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/regexp_optimized_union.rb +150 -0
- metadata +53 -0
@@ -0,0 +1,150 @@
|
|
1
|
+
class Regexp
  # Trie used to build an optimized union regexp.
  #
  # Each key is a single character already passed through Regexp.escape and
  # each value is the child OptimizeTrie reached by consuming that character.
  #
  # Attributes:
  # * parent     - the trie this node was stored into via []= (nil for the root)
  # * opt_maybe  - a word ends at this node, so everything below it is optional
  # * opt_suffix - every child is a plain chain, so a shared suffix may be
  #                factored out of the children by extract_common_suffix
  class OptimizeTrie < Hash
    attr_accessor :parent, :opt_maybe, :opt_suffix

    # Stores child +v+ under key +k+ and links the child back to this node.
    def []=(k, v)
      super(k, v)
      v.parent = self
    end

    # True when the subtree below this node is a single chain of characters
    # with no word ending in the middle of it.
    def single_branch?
      empty? || (size == 1 && !opt_maybe && values.first.single_branch?)
    end

    # True when this node has exactly one child and that child is a leaf.
    def single_char?
      size == 1 && values.first.empty?
    end

    # prereq: single_branch?
    # returns: the (escaped) characters of the chain below this node, in order
    def to_chars
      return [] if empty?

      [keys.first, *values.first.to_chars]
    end

    # prereq: opt_suffix
    # returns: regexp src with the suffix shared by all children factored out,
    #          or nil when the children share no suffix
    def extract_common_suffix
      # Reverse every branch so the shared suffix becomes a shared prefix.
      reversed = map { |key, child| [key, *child.to_chars].reverse }
      shortest = reversed.map(&:size).min
      reference = reversed.first
      common_size = (0...shortest).take_while do |i|
        reversed.all? { |branch| branch[i] == reference[i] }
      end.size
      return nil if common_size.zero?

      common = reference.take(common_size).reverse.join
      if reversed.all? { |branch| branch.size == common_size + 1 }
        # Exactly one differing character per branch: use a character class.
        "[#{reversed.map(&:last).join}]#{common}"
      else
        heads = reversed.map { |branch| branch.drop(common_size).reverse.join }
        "(?:#{heads.join('|')})#{common}"
      end
    end

    # returns: regexp src matching every word stored below this node
    def to_re_src
      return '' if empty?

      suffix_src = extract_common_suffix if opt_suffix
      src = suffix_src || map { |key, child| "#{key}#{child.to_re_src}" }.join('|')

      if opt_maybe
        single_char? ? "#{src}?" : "(?:#{src})?"
      elsif suffix_src.nil? && size > 1 && parent
        # A bare alternation below the root must be grouped so that it does
        # not swallow the prefix emitted by the ancestors.
        "(?:#{src})"
      else
        src
      end
    end
  end

  # Builds a regexp matching any of the words in +a+, factoring out common
  # prefixes, common suffixes and optional tails.
  #
  # a    - enumerable of strings; empty strings are skipped
  # opts - passed through as the second argument of Regexp.new
  #
  # returns: Regexp
  def self.optimized_union(a, opts = nil)
    trie = OptimizeTrie.new
    # Nodes are Hash subclasses and therefore hash/compare by content: with a
    # plain Hash every empty terminal node would collapse into a single key
    # and only one subtree would ever get its suffix factored out.
    term_nodes = {}.compare_by_identity

    # build trie
    a.each do |s|
      next if s.empty?

      t = trie
      s.each_char do |c|
        key = Regexp.escape(c)
        t[key] = OptimizeTrie.new unless t[key]
        t = t[key]
      end
      term_nodes[t] = true
      t.opt_maybe = true
    end

    # tag opt_suffix nodes: starting from each leaf, climb to the nearest
    # ancestor that branches and tag it when all its children are plain chains
    term_nodes.each_key do |node|
      next unless node.empty?

      while (node = node.parent) && !node.opt_suffix && !node.opt_maybe
        if node.size > 1
          node.opt_suffix = true if node.values.all?(&:single_branch?)
          break
        end
      end
    end

    Regexp.new(trie.to_re_src, opts)
  end
end
|
126
|
+
|
127
|
+
# Self-test, run only when this file is executed directly.
# Each entry maps a word list to the regexp Regexp.optimized_union is
# expected to produce for it.
if __FILE__ == $PROGRAM_NAME
  expectations = {
    %w[] => //,
    %w[foo] => /foo/,
    %w[foo bar] => /foo|bar/,
    %w[foo foob bar] => /foob?|bar/,
    %w[foo foobar] => /foo(?:bar)?/,
    %w[bazfoo bazfoobar bazbar] => /baz(?:foo(?:bar)?|bar)/,
    %w[fooabar foobbar] => /foo[ab]bar/,
    %w[fooabar foobazbar] => /foo(?:a|baz)bar/,
    %w[foobar fooabar foogabar] => /foo(?:|a|ga)bar/
  }

  expectations.each do |a, r|
    l = Regexp.optimized_union a

    # Every input word must be matched in full by the generated regexp.
    a.each do |s|
      m = l.match(s)
      # match returns nil on failure; test for it explicitly so the
      # descriptive error below is raised instead of a NoMethodError.
      if m.nil? || m.offset(0) != [0, s.size]
        raise "#{l.inspect} from #{a.inspect} not match #{s.inspect}"
      end
    end

    # The generated regexp must also have exactly the expected shape.
    if r != l
      raise "expected #{r} from #{a.inspect} but got #{l}"
    end
  end

  puts 'test success!'
end
|
metadata
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: regexp_optimized_union
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- luikore
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-11-07 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: ! 'Regexp.optimized_union(word_list, regexp_options) generates optimized
|
15
|
+
regexp for matching union of word list.
|
16
|
+
|
17
|
+
Optimizations include: treed common prefix extraction, common suffix aggregation and
|
18
|
+
optional leaf to ?.
|
19
|
+
|
20
|
+
Mostly the same as described in http://search.cpan.org/~dankogai/Regexp-Optimizer-0.15/lib/Regexp/List.pm#IMPLEMENTATION'
|
21
|
+
email:
|
22
|
+
executables: []
|
23
|
+
extensions: []
|
24
|
+
extra_rdoc_files: []
|
25
|
+
files:
|
26
|
+
- lib/regexp_optimized_union.rb
|
27
|
+
homepage: https://github.com/luikore/regexp_optimized_union
|
28
|
+
licenses:
|
29
|
+
- WTFPL
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
require_paths:
|
33
|
+
- lib
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
+
none: false
|
36
|
+
requirements:
|
37
|
+
- - ! '>='
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 1.3.6
|
46
|
+
requirements: []
|
47
|
+
rubyforge_project:
|
48
|
+
rubygems_version: 1.8.24
|
49
|
+
signing_key:
|
50
|
+
specification_version: 3
|
51
|
+
summary: Regexp.optimized_union(word_list, regexp_options) generates optimized regexp
|
52
|
+
for matching union of word list
|
53
|
+
test_files: []
|