suffix_tree_ruby 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: f63b159f00880e92a880c4a86d014312d9b74bbc51b14d2129813378b0e2258f
4
+ data.tar.gz: bf254242be7c1aac06549891371efb13de2008b72da84477c19eb1f2af1c2255
5
+ SHA512:
6
+ metadata.gz: 7310c3ae440526fad97c07bc02cd9ded850106b3c3ac87ea8aaf1230deb5f4bac75851ff43940037a7da16d38027f298c9c4736caa5682306816f4e0623ee879
7
+ data.tar.gz: 81f81eb06bd65304a2b4b4013374168b2019a36196a6a8b99cef75d4703a09c4143108d09a8c9451e98786d64b12e7d8bd9e1fd46a6dcac18ebf0d978fc75a73
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
require 'rake/testtask'

# Register the test task and put the test/ directory on its load path.
Rake::TestTask.new { |test_task| test_task.libs << 'test' }

# Running plain `rake` runs the test task.
desc 'Run tests'
task default: :test
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'suffix_tree/version'
4
+
5
+ module SuffixTree
6
+ autoload :Node, 'suffix_tree/node'
7
+ autoload :ActivePoint, 'suffix_tree/active_point'
8
+ autoload :End, 'suffix_tree/end'
9
+ end
10
+ require 'suffix_tree/base'
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
module SuffixTree
  # The active point is the position in the tree from which traversal starts
  # for the next extension or the next phase. It always starts at the root;
  # every later extension finds it already positioned by the previous one.
  #
  # Components:
  # * node   - the active node, i.e. the node the active point starts from
  # * edge   - selects the edge leaving the active node; it holds the index
  #            of a character in the input
  # * length - how far to go along the active edge
  #
  # Update rules:
  # 1. When a rule 3 extension is applied, length is incremented by 1 as long
  #    as it does not become greater than the length of the path on the edge.
  # 2. When a rule 3 extension is applied and length becomes greater than the
  #    length of the path on the edge, node, edge and length are all changed.
  # 3. When length is 0, always start looking for the character from the root.
  # 4. When a rule 2 extension is applied and the active node is the root,
  #    edge becomes edge + 1 and length becomes length - 1.
  # 5. When a rule 2 extension is applied and the active node is not the
  #    root, follow the suffix link: the active node becomes its suffix link
  #    and nothing else is changed.
  class ActivePoint
    attr_accessor :node, :edge, :length

    # node - the node the active point initially sits on.
    # The edge starts out as -1 and the length as 0.
    def initialize(node)
      @node = node
      @edge = -1
      @length = 0
    end
  end
end
@@ -0,0 +1,223 @@
1
+ # frozen_string_literal: true
2
+
3
module SuffixTree
  # Generalized suffix tree over a list of words, built with Ukkonen's
  # algorithm (https://en.wikipedia.org/wiki/Ukkonen%27s_algorithm).
  #
  # Every word is followed by its own separator character in the
  # concatenated input, so each leaf can be attributed to the word in which
  # its suffix starts.
  #
  #   tree = SuffixTree::Base.new(%w[sandollar sandlot handler])
  #   tree.longest_common_substring     # shared by all words
  #   tree.longest_common_substring(2)  # shared by at least 2 words
  #
  # Attributes:
  # * input         - Array of single-character strings: every word followed
  #                   by its separator
  # * uniq_word_map - separator character => index of the word it terminates
  # * l_map         - word count => longest (ties: smallest) path label among
  #                   the internal nodes shared by exactly that many words;
  #                   filled by #build
  # * root, active, end, remaining - construction state used by #build
  class Base
    attr_accessor :input, :root, :remaining, :end, :active, :words, :uniq_word_map, :l_map

    # Candidate separator characters, tried in this order.
    UNIQ_IDENTIFIERS = ((0...97).map(&:chr) + (135...200).map(&:chr)).freeze

    # Raised by #next_char when the active point sits at the end of an edge
    # and the current character does not continue below it. It subclasses
    # RuntimeError because that is what the same condition raised before.
    class EndOfPathError < RuntimeError; end

    # words - the words to index.
    #
    # Raises ArgumentError when there are not enough separator characters
    # that are absent from the words to give every word its own separator.
    def initialize(words)
      @words = words
      @l_map = {}
      @uniq_word_map = {}
      @input = []
      separators = pick_separators(words)
      words.each_with_index do |word, index|
        id = separators[index]
        @uniq_word_map[id] = index
        @input.concat(word.to_s.chars) << id
      end
      @remaining = 0
    end

    # Builds the tree over the whole input, then computes the per-node word
    # membership and fills l_map.
    def build
      # The root gets an empty edge (start 1, end 0) so that it contributes
      # nothing to path lengths; index -1 marks a node as internal.
      self.root = Node.new(1, End.new(0), words.count)
      root.index = -1
      self.active = ActivePoint.new(root)
      self.end = End.new(-1)
      input.each_index { |index| start_phase(index) }

      set_index_dfs(root, 0, input.length)
      set_lvalues_dfs(root)
    end

    # Walks the internal nodes depth first and records in l_map, per word
    # count, the longest path label seen (ties go to the smaller string).
    #
    # node - the node to visit; leaves (index other than -1) are skipped.
    # str  - the path label accumulated from the root down to node's parent.
    def set_lvalues_dfs(node, str = '')
      return unless node

      return if node.index != -1

      str += input[node.start..node.end.end].join('')
      best = l_map[node.c_value]
      if best.nil? || best.length < str.length || (best.length == str.length && str < best)
        l_map[node.c_value] = str
      end

      node.child.each_value { |child| set_lvalues_dfs(child, str) }
    end

    # Walks the tree depth first. Each leaf gets the input index at which its
    # suffix starts and is marked as belonging to the word that suffix starts
    # in. Each internal node gets the union of its children's words in
    # c_values and the number of such words in c_value.
    #
    # node - the node to visit.
    # val  - number of characters on the path from the root to node's parent.
    # size - length of the input.
    def set_index_dfs(node, val, size)
      return unless node

      val += node.end.end - node.start + 1
      if node.index != -1
        node.index = size - val
        # The first separator at or after the suffix start names the word.
        c_index = node.index
        c_index += 1 until uniq_word_map[input[c_index]]
        node.c_values[uniq_word_map[input[c_index]]] = 1
        return
      end

      nums = [node.c_values]
      node.child.each_value do |child|
        set_index_dfs(child, val, size)
        nums << child.c_values
      end
      if node != root
        node.c_values = nums.transpose.map(&:sum).map { |e| e.positive? ? 1 : 0 }
      end
      node.c_value = node.c_values.sum
    end

    # Runs one phase of the construction: extends the tree with the
    # character at input[index].
    def start_phase(index)
      last_created_internal_node = nil
      self.end.end += 1
      self.remaining += 1

      while remaining.positive?
        if active.length.zero?
          existing = select_node(index)
          if existing
            # The character is already present: just step onto its edge.
            active.edge = existing.start
            active.length += 1
            break
          end

          root.child[input[index]] = Node.new(index, self.end, words.count)
          self.remaining -= 1
          next
        end

        begin
          if next_char(index) == input[index]
            # The character is already on the path: the phase ends here.
            last_created_internal_node.suffix_link = selected_node if last_created_internal_node
            walk_down(index)
            break
          end

          new_internal_node = split_active_edge(index)
          last_created_internal_node.suffix_link = new_internal_node if last_created_internal_node
          last_created_internal_node = new_internal_node
        rescue EndOfPathError
          # The active point is at the end of the edge and no child continues
          # with the character: hang a new leaf off the node at that end.
          node = selected_node
          node.child[input[index]] = Node.new(index, self.end, words.count)
          last_created_internal_node.suffix_link = node if last_created_internal_node
          last_created_internal_node = node
        end
        shift_active_point
      end
    end

    # Moves the active point past the character just matched: onto the child
    # node when the active length exceeds the edge, otherwise one step
    # further along the current edge.
    def walk_down(index)
      node = selected_node

      if diff(node) < active.length
        active.node = node
        active.length = active.length - diff(node)
        active.edge = node.child[input[index]].start
      else
        active.length += 1
      end
    end

    # Returns the character that follows the active point, descending through
    # nodes (and updating the active point) while the active length runs past
    # the end of the active edge.
    #
    # Raises EndOfPathError when the active point is exactly at the end of an
    # edge and the node there has no child for input[i].
    def next_char(i)
      node = selected_node
      return input[node.start + active.length] if diff(node) >= active.length

      if diff(node) + 1 == active.length
        return input[i] if node.child[input[i]]
      else
        active.node = node
        active.length = active.length - diff(node) - 1
        active.edge = active.edge + diff(node) + 1
        return next_char(i)
      end
      raise EndOfPathError, 'End Of Path Reached'
    end

    # Number of characters on the edge leading to node, minus one.
    def diff(node)
      node.end.end - node.start
    end

    # The child of the active node on the active edge.
    def selected_node
      active.node.child[input[active.edge]]
    end

    # The child of the active node whose edge starts with input[index].
    def select_node(index)
      active.node.child[input[index]]
    end

    # Returns the longest substring shared by at least k of the words; among
    # equally long candidates the smallest string wins. Returns '' when
    # there is no such substring. The tree is built on first use if #build
    # has not been called yet.
    #
    # k - minimum number of words that must contain the substring
    #     (defaults to all words).
    #
    # Raises RuntimeError when k is not a positive Integer.
    def longest_common_substring(k = words.length)
      raise 'Input has to be integer' unless k.is_a? Integer
      raise 'Invalid Input' if k <= 0

      return base_case if k == 1

      # Without a built tree l_map is empty and every query would yield ''.
      build if root.nil?

      longest_then_smallest(l_map.select { |word_count, _label| word_count >= k }.values)
    end

    # Answer for k == 1: the longest word (ties: the smallest one).
    def base_case
      longest_then_smallest(words)
    end

    private

    # Returns the usable separators: UNIQ_IDENTIFIERS, in order, minus every
    # character that occurs in one of the words. A separator must not occur
    # inside a word, because word attribution looks for the first separator
    # after a suffix start.
    def pick_separators(words)
      taken = {}
      words.each { |word| word.to_s.each_char { |char| taken[char] = true } }
      free = UNIQ_IDENTIFIERS.reject { |id| taken.key?(id) }
      return free if free.length >= words.length

      raise ArgumentError,
            "cannot give #{words.length} words distinct separators (only #{free.length} available)"
    end

    # Splits the active edge at the active point, hangs a new leaf for
    # input[index] off the new internal node, and returns that node.
    def split_active_edge(index)
      lower = selected_node
      split_start = lower.start
      lower.start = split_start + active.length

      upper = Node.new(split_start, End.new(split_start + active.length - 1), words.count)
      upper.index = -1
      upper.suffix_link = root
      upper.child[input[lower.start]] = lower
      upper.child[input[index]] = Node.new(index, self.end, words.count)
      active.node.child[input[split_start]] = upper
      upper
    end

    # After a suffix has been inserted: at the root, move on to the next
    # shorter suffix; elsewhere follow the suffix link. Either way one fewer
    # suffix remains to be inserted.
    def shift_active_point
      if active.node == root
        active.edge += 1
        active.length -= 1
      else
        active.node = active.node.suffix_link
      end
      self.remaining -= 1
    end

    # The longest candidate, ties resolved towards the smallest; '' if none.
    def longest_then_smallest(candidates)
      candidates.min_by { |candidate| [-candidate.length, candidate] } || ''
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
module SuffixTree
  # Holds an edge's end position in a mutable object, so the nodes that are
  # handed the same End instance all see an update made through it.
  class End
    attr_reader :end
    attr_writer :end

    # end_p - the initial end position.
    def initialize(end_p)
      self.end = end_p
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
module SuffixTree
  # One node of the suffix tree.
  #
  # * start, end  - the start position and the end holder passed in at
  #                 construction
  # * child       - hash of child nodes, initially empty
  # * suffix_link - initially nil
  # * index       - initially 0
  # * c_values    - one slot per word, all initially 0
  # * c_value     - initially 0
  class Node
    attr_accessor :start, :end, :child, :suffix_link, :index, :c_value, :c_values

    # start      - start position of the node's edge.
    # end_p      - object holding the end position of the node's edge.
    # word_count - number of slots to allocate in c_values.
    def initialize(start, end_p, word_count)
      @start = start
      @end = end_p
      @child = {}
      @suffix_link = nil
      @index = 0
      @depth = 0
      @c_values = [0] * word_count
      @c_value = 0
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module SuffixTree
  # Version string of the gem.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,26 @@
1
+ require 'minitest/autorun'
2
+ require 'suffix_tree'
3
+
4
# Specs for SuffixTree::Base. Comparisons use assert_equal: `assert a, b`
# only checks that `a` is truthy and treats `b` as the failure message.
describe "SuffixTreeTest" do
  let(:words) { ["sandollar", "sandlot", "handler", "grand", "pantry"] }
  let(:tree) { SuffixTree::Base.new(words) }

  it 'initializes with input' do
    assert_equal words, tree.words
  end

  it 'builds suffix tree' do
    assert tree.build
  end

  describe "find longest commom substring" do
    # Queries read the data computed by build, so build the tree first.
    before { tree.build }

    it 'finds longest common subtring for all the strings' do
      assert_equal "an", tree.longest_common_substring
    end

    it 'finds longest common subtring for at least k substrings' do
      # "sand" and "andl" are both shared by two words and equally long;
      # ties are resolved towards the lexicographically smaller string.
      assert_equal "andl", tree.longest_common_substring(2)
      assert_equal "and", tree.longest_common_substring(3)
    end
  end
end
metadata ADDED
@@ -0,0 +1,52 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: suffix_tree_ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Sudheer Meka
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2021-04-23 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Construct Generalized Suffix tree using Ukkonen's algorithm
14
+ email: reachme@sudheer-meka.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - Rakefile
20
+ - lib/suffix_tree.rb
21
+ - lib/suffix_tree/active_point.rb
22
+ - lib/suffix_tree/base.rb
23
+ - lib/suffix_tree/end.rb
24
+ - lib/suffix_tree/node.rb
25
+ - lib/suffix_tree/version.rb
26
+ - test/test_suffix_tree.rb
27
+ homepage: https://rubygems.org/gems/suffix_tree
28
+ licenses:
29
+ - MIT
30
+ metadata:
31
+ source_code_uri: https://github.com/sudheer-meka/suffix_tree
32
+ post_install_message:
33
+ rdoc_options: []
34
+ require_paths:
35
+ - lib
36
+ required_ruby_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ requirements: []
47
+ rubygems_version: 3.0.9
48
+ signing_key:
49
+ specification_version: 4
50
+ summary: Suffix Tree Implementation and operation using ruby
51
+ test_files:
52
+ - test/test_suffix_tree.rb