suffix_tree_ruby 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Rakefile +8 -0
- data/lib/suffix_tree.rb +10 -0
- data/lib/suffix_tree/active_point.rb +31 -0
- data/lib/suffix_tree/base.rb +223 -0
- data/lib/suffix_tree/end.rb +11 -0
- data/lib/suffix_tree/node.rb +18 -0
- data/lib/suffix_tree/version.rb +5 -0
- data/test/test_suffix_tree.rb +26 -0
- metadata +52 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: f63b159f00880e92a880c4a86d014312d9b74bbc51b14d2129813378b0e2258f
|
4
|
+
data.tar.gz: bf254242be7c1aac06549891371efb13de2008b72da84477c19eb1f2af1c2255
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7310c3ae440526fad97c07bc02cd9ded850106b3c3ac87ea8aaf1230deb5f4bac75851ff43940037a7da16d38027f298c9c4736caa5682306816f4e0623ee879
|
7
|
+
data.tar.gz: 81f81eb06bd65304a2b4b4013374168b2019a36196a6a8b99cef75d4703a09c4143108d09a8c9451e98786d64b12e7d8bd9e1fd46a6dcac18ebf0d978fc75a73
|
data/Rakefile
ADDED
data/lib/suffix_tree.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SuffixTree
|
4
|
+
# Active point - It is the point from which traversal starts for next extension or next phase.
|
5
|
+
# Active point always starts from root. Other extension will get active point set up
|
6
|
+
# correctly by last extension.
|
7
|
+
|
8
|
+
# Active node - Node from which active point will start
|
9
|
+
# Active Edge - It is used to choose the edge from active node. It has index of character.
|
10
|
+
# Active Length - How far to go on active edge.
|
11
|
+
|
12
|
+
# Active point rules
|
13
|
+
# 1) If rule 3 extension is applied then active length will increment by 1 if active length is
|
14
|
+
# not greater than length of path on edge.
|
15
|
+
# 2) If rule 3 extension is applied and if active length gets greater than length path of edge
|
16
|
+
# then change active node, active edge and active length
|
17
|
+
# 3) If active length is 0 then always start looking for the character from root.
|
18
|
+
# 4) If rule 2 extension is applied and if active node is root then active edge is active edge + 1
|
19
|
+
# and active length is active length - 1
|
20
|
+
# 5) If rule 2 extension is applied and if active node is not root then follow suffix link and
|
21
|
+
# make active node as suffix link and do not change anything else.
|
22
|
+
# Mutable cursor used while extending the suffix tree.
#
# node   - the node the next traversal starts from
# edge   - index into the input of the character selecting the outgoing edge
#          (-1 until an edge has been chosen)
# length - how many characters along that edge have already been matched
class ActivePoint
  attr_accessor :node
  attr_accessor :edge
  attr_accessor :length

  # Starts at +node+ with no edge selected and nothing matched yet.
  def initialize(node)
    @node = node
    @edge = -1
    @length = 0
  end
end
|
31
|
+
end
|
@@ -0,0 +1,223 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SuffixTree
|
4
|
+
# https://en.wikipedia.org/wiki/Ukkonen%27s_algorithm
|
5
|
+
# Generalized suffix tree over a list of words, built with Ukkonen's algorithm.
#
# Each word is followed by its own terminator character taken from
# UNIQ_IDENTIFIERS and everything is concatenated into one character array
# (+input+). After #build, +l_map+ maps "number of distinct words that share a
# substring" to the longest such substring (the lexicographically smallest one
# on ties); #longest_common_substring answers its queries from that map.
class Base
  # Raised by #next_char when the active point sits at the end of an edge and
  # no child continues with the current character. It subclasses RuntimeError
  # and keeps the historical message, so existing rescuers still match, while
  # #start_phase can rescue exactly this condition instead of every
  # StandardError.
  EndOfPath = Class.new(RuntimeError)

  attr_accessor :input, :root, :remaining, :end, :active, :words, :uniq_word_map, :l_map

  UNIQ_IDENTIFIERS = ((0...97).map(&:chr) + (135...200).map(&:chr)).freeze

  # words - Array of Strings to index.
  #
  # Raises ArgumentError when there are more words than available terminator
  # characters, or when a word already contains one of the terminators that
  # will be used (either case would silently corrupt the tree).
  def initialize(words)
    if words.count > UNIQ_IDENTIFIERS.length
      raise ArgumentError, "at most #{UNIQ_IDENTIFIERS.length} words are supported, got #{words.count}"
    end

    terminators = UNIQ_IDENTIFIERS.first(words.count)
    clash = words.find { |word| terminators.any? { |t| word.include?(t) } }
    raise ArgumentError, "word #{clash.inspect} contains a reserved terminator character" if clash

    @words = words
    @input = ''
    @l_map = {}
    @uniq_word_map = {}
    words.each_with_index do |word, index|
      id = UNIQ_IDENTIFIERS[index]
      @uniq_word_map[id] = index
      @input += "#{word}#{id}"
    end
    @input = @input.split('')
    @remaining = 0
  end

  # Builds the tree (one Ukkonen phase per input character), then annotates
  # leaves with their suffix index / owning word and fills +l_map+.
  def build
    self.root = Node.new(1, End.new(0), words.count)
    root.index = -1
    self.active = ActivePoint.new(root)
    self.end = End.new(-1)
    input.each_with_index do |_elm, index|
      start_phase(index)
    end

    set_index_dfs(root, 0, input.length)
    set_lvalues_dfs(root)
  end

  # Walks internal nodes (index == -1) depth first, accumulating the path
  # label in +str+, and records in l_map[c_value] the longest label seen for
  # that word count (smallest string on equal length).
  def set_lvalues_dfs(node, str = '')
    return unless node
    return if node.index != -1

    # The slice is nil when the range starts past the end of +input+ (the
    # root of an empty input); treat that as an empty label.
    str += (input[node.start..node.end.end] || []).join('')
    best = l_map[node.c_value]
    if best.nil? || best.length < str.length
      l_map[node.c_value] = str
    elsif best.length == str.length
      l_map[node.c_value] = [str, best].min
    end

    node.child.each do |_key, child|
      set_lvalues_dfs(child, str)
    end
  end

  # Depth-first pass that
  # * gives every leaf its suffix start index (size - path length) and flags
  #   the word that suffix belongs to (the first terminator at or after it);
  # * gives every internal node the union of its children's word flags and
  #   stores the number of flagged words in c_value.
  def set_index_dfs(node, val, size)
    return unless node

    val += node.end.end - node.start + 1
    if node.index != -1
      node.index = size - val
      c_index = node.index

      c_index += 1 until uniq_word_map[input[c_index]]
      node.c_values[uniq_word_map[input[c_index]]] = 1
      return
    end

    nums = [node.c_values]
    node.child.each do |_key, child|
      set_index_dfs(child, val, size)
      nums << child.c_values
    end
    if node != root
      node.c_values = nums.transpose.map(&:sum).map { |e| e.positive? ? 1 : 0 }
    end
    node.c_value = node.c_values.sum
  end

  # Runs one phase of Ukkonen's algorithm for input[index]: extends the shared
  # leaf end, then inserts the pending suffixes until one is already present.
  def start_phase(index)
    last_created_internal_node = nil
    self.end.end += 1
    self.remaining += 1

    while remaining.positive?
      if active.length.zero?
        existing = select_node(index)
        if existing
          # Character already starts an edge: just remember where we are.
          active.edge = existing.start
          active.length += 1
          break
        end

        root.child[input[index]] = Node.new(index, self.end, words.count)
        self.remaining -= 1
        next
      end

      begin
        char = next_char(index)
      rescue EndOfPath
        # Active point is exactly at the end of an edge whose node has no
        # child for this character: hang a new leaf off that node.
        node = selected_node
        node.child[input[index]] = Node.new(index, self.end, words.count)
        last_created_internal_node.suffix_link = node if last_created_internal_node
        last_created_internal_node = node
        advance_to_next_suffix
        next
      end

      if char == input[index]
        last_created_internal_node.suffix_link = selected_node if last_created_internal_node
        walk_down(index)
        break
      end

      # Mismatch in the middle of an edge: split it with a new internal node
      # that keeps the matched prefix and gets the old node plus a new leaf.
      node = selected_node
      temp_start = node.start
      node.start = node.start + active.length
      new_internal_node = Node.new(temp_start, End.new(temp_start + active.length - 1), words.count)

      new_leaf_node = Node.new(index, self.end, words.count)

      new_internal_node.child[input[new_internal_node.start + active.length]] = node
      new_internal_node.child[input[index]] = new_leaf_node
      new_internal_node.index = -1
      active.node.child[input[new_internal_node.start]] = new_internal_node

      last_created_internal_node.suffix_link = new_internal_node if last_created_internal_node

      last_created_internal_node = new_internal_node
      new_internal_node.suffix_link = root

      advance_to_next_suffix
    end
  end

  # After a successful match: either step one character further along the
  # current edge, or hop onto the child node when the edge is exhausted.
  def walk_down(index)
    node = selected_node

    if diff(node) < active.length
      active.node = node
      active.length = active.length - diff(node)
      active.edge = node.child[input[index]].start
    else
      active.length += 1
    end
  end

  # Returns the character that follows the active point, hopping across
  # nodes (and mutating the active point) when active.length overshoots the
  # current edge.
  #
  # Raises EndOfPath (a RuntimeError, message 'End Of Path Reached') when the
  # active point is at the end of an edge and no child starts with input[i].
  def next_char(i)
    node = selected_node
    return input[node.start + active.length] if diff(node) >= active.length

    if diff(node) + 1 == active.length
      return input[i] if node.child[input[i]]
    else
      active.node = node
      active.length = active.length - diff(node) - 1
      active.edge = active.edge + diff(node) + 1
      return next_char(i)
    end
    raise EndOfPath, 'End Of Path Reached'
  end

  # Edge length minus one (end index - start index) of the edge into +node+.
  def diff(node)
    node.end.end - node.start
  end

  # Child of the active node selected by the active edge character.
  def selected_node
    active.node.child[input[active.edge]]
  end

  # Child of the active node whose edge starts with input[index], if any.
  def select_node(index)
    active.node.child[input[index]]
  end

  # Longest substring that occurs in at least +k+ of the words (defaults to
  # all of them); ties are resolved to the lexicographically smallest string.
  # Returns '' when no such substring exists. Builds the tree on first use if
  # #build has not been called yet.
  #
  # Raises RuntimeError when +k+ is not a positive Integer.
  def longest_common_substring(k = words.length)
    raise 'Input has to be integer' unless k.is_a? Integer
    raise 'Invalid Input' if k <= 0

    return base_case if k == 1

    build if root.nil?

    candidates = l_map.reject { |shared_by, _substring| shared_by < k }.values
    candidates.min_by { |substring| [-substring.length, substring] } || ''
  end

  # k == 1: the answer is simply the longest word (smallest on ties).
  def base_case
    words.min_by { |word| [-word.length, word] } || ''
  end

  private

  # Moves the active point to the next shorter suffix after an insertion:
  # follow the suffix link from a non-root node, otherwise shrink the active
  # edge by one character. Also counts the suffix as handled.
  def advance_to_next_suffix
    if active.node != root
      active.node = active.node.suffix_link
    else
      active.edge = active.edge + 1
      active.length -= 1
    end
    self.remaining -= 1
  end
end
|
223
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SuffixTree
|
4
|
+
# A node of the suffix tree together with the edge that leads into it.
#
# child       - Hash of outgoing edges (Base keys it by the first character
#               of each child's edge)
# suffix_link - another node, or nil until one is assigned
# start, end  - edge label bounds in the input; +end+ is an object responding
#               to #end, so leaves can share one growing end marker
# index       - 0 on creation; Base sets -1 on internal nodes and the suffix
#               start position on leaves
# c_values    - one 0/1 flag per word, set by Base for words found below this node
# c_value     - number of flagged words (filled in by Base)
class Node
  attr_accessor :child, :suffix_link, :start, :end, :index, :c_values, :c_value

  # start      - index of the first character of the incoming edge
  # end_p      - end marker object for the incoming edge
  # word_count - number of words in the tree; sizes the c_values flag array
  #
  # The former @depth instance variable was assigned but never read and had no
  # accessor, so it is no longer set.
  def initialize(start, end_p, word_count)
    @child = {}
    @suffix_link = nil
    @start = start
    @end = end_p
    @index = 0
    @c_value = 0
    @c_values = Array.new(word_count, 0)
  end
end
|
18
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'suffix_tree'
|
3
|
+
|
4
|
+
# Specs for SuffixTree::Base.
#
# Note: `assert value, message` only checks that +value+ is truthy (the second
# argument is the failure message), so comparisons must use `assert_equal`.
describe "SuffixTreeTest" do
  let(:words) { ["sandollar", "sandlot", "handler", "grand", "pantry"] }
  let(:tree) { SuffixTree::Base.new(words) }
  # Queries read data that is only populated by #build, so build first.
  let(:built_tree) { tree.tap(&:build) }

  it 'initializes with input' do
    assert_equal words, tree.words
  end

  it 'builds suffix tree' do
    assert tree.build
  end

  describe "find longest commom substring" do
    it 'finds longest common subtring for all the strings' do
      assert_equal "an", built_tree.longest_common_substring
    end

    it 'finds longest common subtring for at least k substrings' do
      # "sand" (sandollar, sandlot) and "andl" (sandlot, handler) are both
      # four characters long; ties are resolved to the smaller string.
      assert_equal "andl", built_tree.longest_common_substring(2)
      assert_equal "and", built_tree.longest_common_substring(3)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: suffix_tree_ruby
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Sudheer Meka
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2021-04-23 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Construct Generalized Suffix tree using Ukkonen's algorithm
|
14
|
+
email: reachme@sudheer-meka.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- Rakefile
|
20
|
+
- lib/suffix_tree.rb
|
21
|
+
- lib/suffix_tree/active_point.rb
|
22
|
+
- lib/suffix_tree/base.rb
|
23
|
+
- lib/suffix_tree/end.rb
|
24
|
+
- lib/suffix_tree/node.rb
|
25
|
+
- lib/suffix_tree/version.rb
|
26
|
+
- test/test_suffix_tree.rb
|
27
|
+
homepage: https://rubygems.org/gems/suffix_tree
|
28
|
+
licenses:
|
29
|
+
- MIT
|
30
|
+
metadata:
|
31
|
+
source_code_uri: https://github.com/sudheer-meka/suffix_tree
|
32
|
+
post_install_message:
|
33
|
+
rdoc_options: []
|
34
|
+
require_paths:
|
35
|
+
- lib
|
36
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
requirements: []
|
47
|
+
rubygems_version: 3.0.9
|
48
|
+
signing_key:
|
49
|
+
specification_version: 4
|
50
|
+
summary: Suffix Tree Implementaion and operation using ruby
|
51
|
+
test_files:
|
52
|
+
- test/test_suffix_tree.rb
|