suffix_tree_ruby 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Rakefile +8 -0
- data/lib/suffix_tree.rb +10 -0
- data/lib/suffix_tree/active_point.rb +31 -0
- data/lib/suffix_tree/base.rb +223 -0
- data/lib/suffix_tree/end.rb +11 -0
- data/lib/suffix_tree/node.rb +18 -0
- data/lib/suffix_tree/version.rb +5 -0
- data/test/test_suffix_tree.rb +26 -0
- metadata +52 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: f63b159f00880e92a880c4a86d014312d9b74bbc51b14d2129813378b0e2258f
|
4
|
+
data.tar.gz: bf254242be7c1aac06549891371efb13de2008b72da84477c19eb1f2af1c2255
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7310c3ae440526fad97c07bc02cd9ded850106b3c3ac87ea8aaf1230deb5f4bac75851ff43940037a7da16d38027f298c9c4736caa5682306816f4e0623ee879
|
7
|
+
data.tar.gz: 81f81eb06bd65304a2b4b4013374168b2019a36196a6a8b99cef75d4703a09c4143108d09a8c9451e98786d64b12e7d8bd9e1fd46a6dcac18ebf0d978fc75a73
|
data/Rakefile
ADDED
data/lib/suffix_tree.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SuffixTree
|
4
|
+
# Active point - It is the point from which traversal starts for next extension or next phase.
|
5
|
+
# Active point always starts from root. Other extension will get active point set up
|
6
|
+
# correctly by last extension.
|
7
|
+
|
8
|
+
# Active node - Node from which active point will start
|
9
|
+
# Active Edge - It is used to choose the edge from active node. It has index of character.
|
10
|
+
# Active Length - How far to go on active edge.
|
11
|
+
|
12
|
+
# Active point rules
|
13
|
+
# 1) If rule 3 extension is applied then active length will increment by 1 if active length is
|
14
|
+
# not greater than length of path on edge.
|
15
|
+
# 2) If rule 3 extension is applied and if active length gets greater than length path of edge
|
16
|
+
# then change active node, active edge and active length
|
17
|
+
# 3) If active length is 0 then always start looking for the character from root.
|
18
|
+
# 4) If rule 2 extension is applied and if active node is root then active edge is active edge + 1
|
19
|
+
# and active length is active length - 1
|
20
|
+
# 5) If rule 2 extension is applied and if active node is not root then follow suffix link and
|
21
|
+
# make active node as suffix link and do not change anything else.
|
22
|
+
# Mutable cursor describing where the next suffix-tree extension starts.
#
# node   - the node the traversal starts from
# edge   - index (into the input) of the character that selects the edge
#          leaving +node+; starts at -1
# length - how far along that edge the cursor sits; starts at 0
class ActivePoint
  attr_accessor :length, :edge, :node

  # node - the node the cursor is anchored at initially.
  def initialize(node)
    @length, @edge, @node = 0, -1, node
  end
end
|
31
|
+
end
|
@@ -0,0 +1,223 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SuffixTree
|
4
|
+
# https://en.wikipedia.org/wiki/Ukkonen%27s_algorithm
|
5
|
+
# Generalized suffix tree over a list of words, built with Ukkonen's algorithm
# (https://en.wikipedia.org/wiki/Ukkonen%27s_algorithm).
#
# Every word is followed by its own terminator taken from UNIQ_IDENTIFIERS and
# everything is laid out in one character array (#input). After #build, every
# internal node carries a 0/1 flag per word (c_values) telling whether a suffix
# of that word hangs below it, plus the number of such words (c_value). #l_map
# maps a word count to the longest path label found among the internal nodes
# with exactly that count (ties resolved to the lexicographically smallest).
#
# NOTE(review): a word that itself contains one of the terminators in use is
# not detected and would confuse the word attribution done in #set_index_dfs.
class Base
  # Raised by #next_char when the active point sits at the end of an edge and
  # the node there has no child for the current character. It subclasses
  # RuntimeError so code that rescued the former plain `raise` keeps working.
  class EndOfPathError < RuntimeError; end

  attr_accessor :input, :root, :remaining, :end, :active, :words, :uniq_word_map, :l_map

  # Single-character terminators: byte values 0..96 followed by 135..199.
  UNIQ_IDENTIFIERS = ((0...97).map(&:chr) + (135...200).map(&:chr)).freeze

  # words - the list of words to index; word i is terminated by
  #         UNIQ_IDENTIFIERS[i].
  #
  # Raises ArgumentError when there are more words than terminators, because
  # the extra words would otherwise be appended without any terminator.
  def initialize(words)
    if words.count > UNIQ_IDENTIFIERS.length
      raise ArgumentError,
            "at most #{UNIQ_IDENTIFIERS.length} words are supported (got #{words.count})"
    end

    @words = words
    @input = []
    @l_map = {}
    @uniq_word_map = {}
    words.each_with_index do |word, index|
      id = UNIQ_IDENTIFIERS[index]
      @uniq_word_map[id] = index
      # Append characters directly: no quadratic string rebuilding and no
      # encoding clash between a word and a high-byte terminator.
      @input.concat(word.to_s.chars)
      @input << id
    end
    @remaining = 0
  end

  # Builds the tree (one phase per input character), then annotates the nodes
  # (#set_index_dfs) and fills #l_map (#set_lvalues_dfs).
  def build
    self.root = Node.new(1, End.new(0), words.count) # start > end: empty label
    root.index = -1
    self.active = ActivePoint.new(root)
    self.end = End.new(-1) # shared end of every leaf; grows by one per phase
    input.each_index { |index| start_phase(index) }

    set_index_dfs(root, 0, input.length)
    set_lvalues_dfs(root)
  end

  # Walks the internal nodes (index == -1) depth first and records in #l_map,
  # per c_value, the longest path label; equal lengths keep the smaller string.
  #
  # node - subtree root to visit
  # str  - path label accumulated from the tree root down to node's parent
  def set_lvalues_dfs(node, str = '')
    return unless node
    return if node.index != -1 # leaves are skipped

    # For the root of an empty tree the slice is nil rather than [].
    str += (input[node.start..node.end.end] || []).join('')
    best = l_map[node.c_value]
    if best.nil? || best.length < str.length
      l_map[node.c_value] = str
    elsif best.length == str.length
      l_map[node.c_value] = [str, best].min
    end

    node.child.each_value { |child| set_lvalues_dfs(child, str) }
  end

  # Depth-first annotation pass.
  #
  # Leaves get their suffix start position as index and a flag for the word
  # whose terminator is the first one found at or after that position.
  # Internal nodes (except the root) get the element-wise OR of their own and
  # their children's flags; c_value is the number of flags set.
  #
  # node - subtree root to visit
  # val  - number of characters on the path from the root to node's parent
  # size - total input length
  def set_index_dfs(node, val, size)
    return unless node

    val += node.end.end - node.start + 1
    if node.index != -1
      node.index = size - val
      c_index = node.index

      c_index += 1 until uniq_word_map[input[c_index]]
      node.c_values[uniq_word_map[input[c_index]]] = 1
      return
    end

    nums = [node.c_values]
    node.child.each_value do |child|
      set_index_dfs(child, val, size)
      nums << child.c_values
    end
    if node != root
      node.c_values = nums.transpose.map { |column| column.sum.positive? ? 1 : 0 }
    end
    node.c_value = node.c_values.sum
  end

  # Runs one phase of the construction for input[index]: extends the shared
  # leaf end, then inserts the pending suffixes until the current character is
  # found to be already present (the phase stops early) or none remain.
  def start_phase(index)
    last_created_internal_node = nil
    self.end.end += 1
    self.remaining += 1

    while remaining.positive?
      if active.length.zero?
        existing = select_node(index)
        if existing
          # Character already leaves the active node: just step onto that edge.
          active.edge = existing.start
          active.length += 1
          break
        else
          root.child[input[index]] = Node.new(index, self.end, words.count)
          self.remaining -= 1
        end
      else
        begin
          char = next_char(index)
          if char == input[index]
            # Character already present after the active point.
            last_created_internal_node.suffix_link = selected_node if last_created_internal_node
            walk_down(index)
            break
          else
            # Mismatch inside an edge: split it with a new internal node that
            # keeps the old node and gains a fresh leaf for input[index].
            node = selected_node
            temp_start = node.start
            node.start = node.start + active.length
            new_internal_node = Node.new(temp_start, End.new(temp_start + active.length - 1), words.count)

            new_leaf_node = Node.new(index, self.end, words.count)

            new_internal_node.child[input[new_internal_node.start + active.length]] = node
            new_internal_node.child[input[index]] = new_leaf_node
            new_internal_node.index = -1
            active.node.child[input[new_internal_node.start]] = new_internal_node

            last_created_internal_node.suffix_link = new_internal_node if last_created_internal_node

            last_created_internal_node = new_internal_node
            new_internal_node.suffix_link = root

            shift_active_point_after_insert
          end
        rescue EndOfPathError
          # The active point is exactly at the end of an edge and the node
          # there lacks the character: hang a new leaf off that node.
          node = selected_node
          node.child[input[index]] = Node.new(index, self.end, words.count)
          last_created_internal_node.suffix_link = node if last_created_internal_node
          last_created_internal_node = node

          shift_active_point_after_insert
        end
      end
    end
  end

  # Advances the active point after the current character was found. When the
  # active length exceeds the selected edge, the point moves onto the child
  # node; otherwise it just moves one character further along the edge.
  def walk_down(index)
    node = selected_node

    if diff(node) < active.length
      active.node = node
      active.length = active.length - diff(node)
      active.edge = node.child[input[index]].start
    else
      active.length += 1
    end
  end

  # Returns the character that follows the active point, hopping to child
  # nodes while the active length is longer than the selected edge.
  #
  # Raises EndOfPathError when the point is at the very end of an edge and the
  # node there has no child for input[i].
  def next_char(i)
    node = selected_node
    return input[active.node.child[input[active.edge]].start + active.length] if diff(node) >= active.length

    if diff(node) + 1 == active.length
      return input[i] if node.child[input[i]]
    else
      active.node = node
      active.length = active.length - diff(node) - 1
      active.edge = active.edge + diff(node) + 1
      return next_char(i)
    end
    raise EndOfPathError, 'End Of Path Reached'
  end

  # Edge length minus one (end index - start index) of the edge into node.
  def diff(node)
    node.end.end - node.start
  end

  # Child of the active node selected by the active edge character.
  def selected_node
    active.node.child[input[active.edge]]
  end

  # Child of the active node selected by input[index], or nil.
  def select_node(index)
    active.node.child[input[index]]
  end

  # Returns the longest substring shared by at least k words; among equally
  # long candidates the lexicographically smallest wins. Returns '' when no
  # substring qualifies. For k == 1 this is simply the longest word.
  #
  # The tree is built on first use if #build has not been called yet.
  #
  # Raises RuntimeError when k is not a positive Integer.
  def longest_common_substring(k = words.length)
    raise 'Input has to be integer' unless k.is_a?(Integer)
    raise 'Invalid Input' if k <= 0

    return base_case if k == 1

    build unless root

    candidates = l_map.select { |count, _label| count >= k }.values
    candidates.min_by { |label| [-label.length, label] } || ''
  end

  # Longest word of the list (lexicographically smallest on ties), or '' for
  # an empty list.
  def base_case
    words.min_by { |word| [-word.length, word] } || ''
  end

  private

  # Bookkeeping shared by both insertion paths of #start_phase: follow the
  # suffix link when away from the root, otherwise shorten the active point by
  # one character; either way one pending suffix has been handled.
  def shift_active_point_after_insert
    if active.node != root
      active.node = active.node.suffix_link
    else
      active.edge = active.edge + 1
      active.length -= 1
    end
    self.remaining -= 1
  end
end
|
223
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SuffixTree
|
4
|
+
# One vertex of the suffix tree.
#
# child       - Hash of outgoing edges, keyed by the edge's first character
# suffix_link - linked node, nil until assigned
# start / end - start index and end holder of the incoming edge label
# index       - starts at 0
# c_values    - one 0 flag per word
# c_value     - starts at 0
class Node
  attr_accessor :child, :suffix_link, :start, :end, :index, :c_values, :c_value

  # start      - index where the incoming edge label begins
  # end_p      - object holding the index where the label ends
  # word_count - number of words; sizes the c_values flag array
  def initialize(start, end_p, word_count)
    @child = {}
    @suffix_link = nil
    @start, @end = start, end_p
    @index, @depth, @c_value = 0, 0, 0
    @c_values = [0] * word_count
  end
end
|
18
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'suffix_tree'
|
3
|
+
|
4
|
+
# Specs for SuffixTree::Base.
#
# Values are compared with assert_equal: `assert value, "text"` only checks
# that value is truthy and uses the second argument as the failure message,
# so it can never detect a wrong result.
describe "SuffixTreeTest" do
  let(:words) { ["sandollar", "sandlot", "handler", "grand", "pantry"] }
  let(:tree) { SuffixTree::Base.new(words) }

  it 'initializes with input' do
    assert_equal words, tree.words
  end

  it 'builds suffix tree' do
    assert tree.build
  end

  describe "find longest commom substring" do
    # The lookup table is only filled by #build.
    before { tree.build }

    it 'finds longest common subtring for all the strings' do
      assert_equal "an", tree.longest_common_substring
    end

    it 'finds longest common subtring for at least k substrings' do
      # "sand" (sandollar, sandlot) and "andl" (sandlot, handler) are both four
      # characters long; ties go to the lexicographically smaller string.
      assert_equal "andl", tree.longest_common_substring(2)
      assert_equal "and", tree.longest_common_substring(3)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: suffix_tree_ruby
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Sudheer Meka
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2021-04-23 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Construct Generalized Suffix tree using Ukkonen's algorithm
|
14
|
+
email: reachme@sudheer-meka.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- Rakefile
|
20
|
+
- lib/suffix_tree.rb
|
21
|
+
- lib/suffix_tree/active_point.rb
|
22
|
+
- lib/suffix_tree/base.rb
|
23
|
+
- lib/suffix_tree/end.rb
|
24
|
+
- lib/suffix_tree/node.rb
|
25
|
+
- lib/suffix_tree/version.rb
|
26
|
+
- test/test_suffix_tree.rb
|
27
|
+
homepage: https://rubygems.org/gems/suffix_tree
|
28
|
+
licenses:
|
29
|
+
- MIT
|
30
|
+
metadata:
|
31
|
+
source_code_uri: https://github.com/sudheer-meka/suffix_tree
|
32
|
+
post_install_message:
|
33
|
+
rdoc_options: []
|
34
|
+
require_paths:
|
35
|
+
- lib
|
36
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
requirements: []
|
47
|
+
rubygems_version: 3.0.9
|
48
|
+
signing_key:
|
49
|
+
specification_version: 4
|
50
|
+
summary: Suffix Tree Implementaion and operation using ruby
|
51
|
+
test_files:
|
52
|
+
- test/test_suffix_tree.rb
|