suffix_tree_ruby 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: f63b159f00880e92a880c4a86d014312d9b74bbc51b14d2129813378b0e2258f
4
+ data.tar.gz: bf254242be7c1aac06549891371efb13de2008b72da84477c19eb1f2af1c2255
5
+ SHA512:
6
+ metadata.gz: 7310c3ae440526fad97c07bc02cd9ded850106b3c3ac87ea8aaf1230deb5f4bac75851ff43940037a7da16d38027f298c9c4736caa5682306816f4e0623ee879
7
+ data.tar.gz: 81f81eb06bd65304a2b4b4013374168b2019a36196a6a8b99cef75d4703a09c4143108d09a8c9451e98786d64b12e7d8bd9e1fd46a6dcac18ebf0d978fc75a73
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ require 'rake/testtask'
2
+
3
+ Rake::TestTask.new do |t|
4
+ t.libs << 'test'
5
+ end
6
+
7
+ desc "Run tests"
8
+ task :default => :test
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'suffix_tree/version'
4
+
5
+ module SuffixTree
6
+ autoload :Node, 'suffix_tree/node'
7
+ autoload :ActivePoint, 'suffix_tree/active_point'
8
+ autoload :End, 'suffix_tree/end'
9
+ end
10
+ require 'suffix_tree/base'
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SuffixTree
4
+ # Active point - It is the point from which traversal starts for next extension or next phase.
5
+ # Active point always starts from root. Other extension will get active point set up
6
+ # correctly by last extension.
7
+
8
+ # Active node - Node from which active point will start
9
+ # Active Edge - It is used to choose the edge from active node. It has index of character.
10
+ # Active Length - How far to go on active edge.
11
+
12
+ # Active point rules
13
+ # 1) If rule 3 extension is applied then active length will increment by 1 if active length is
14
+ # not greater than length of path on edge.
15
+ # 2) If rule 3 extension is applied and if active length gets greater than length path of edge
16
+ # then change active node, active edge and active length
17
+ # 3) If active length is 0 then always start looking for the character from root.
18
+ # 4) If rule 2 extension is applied and if active node is root then active edge is active edge + 1
19
+ # and active length is active length - 1
20
+ # 5) If rule 2 extension is applied and if active node is not root then follow suffix link and
21
+ # make active node as suffix link and do not change anything else.
22
+ class ActivePoint
23
+ attr_accessor :length, :edge, :node
24
+
25
+ def initialize(node)
26
+ @length = 0
27
+ @edge = -1
28
+ @node = node
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,223 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SuffixTree
4
+ # https://en.wikipedia.org/wiki/Ukkonen%27s_algorithm
5
+ class Base
6
+ attr_accessor :input, :root, :remaining, :end, :active, :words, :uniq_word_map, :l_map
7
+
8
+ UNIQ_IDENTIFIERS = ((0...97).map(&:chr) + (135...200).map(&:chr)).freeze
9
+
10
+ def initialize(words)
11
+ @words = words
12
+ @input = ''
13
+ @l_map = {}
14
+ @uniq_word_map = {}
15
+ words.each_with_index do |word, index|
16
+ id = UNIQ_IDENTIFIERS[index]
17
+ @uniq_word_map[id] = index
18
+ @input += "#{word}#{id}"
19
+ end
20
+ @input = @input.split('')
21
+ @remaining = 0
22
+ end
23
+
24
+ def build
25
+ self.root = Node.new(1, End.new(0), words.count)
26
+ root.index = -1
27
+ self.active = ActivePoint.new(root)
28
+ self.end = End.new(-1)
29
+ input.each_with_index do |_elm, index|
30
+ start_phase(index)
31
+ end
32
+
33
+ set_index_dfs(root, 0, input.length)
34
+ set_lvalues_dfs(root)
35
+ end
36
+
37
+ def set_lvalues_dfs(node, str = '')
38
+ return unless node
39
+
40
+ return if node.index != -1
41
+
42
+ str += input[node.start..node.end.end].join('')
43
+ if l_map[node.c_value]
44
+ if l_map[node.c_value].length < str.length
45
+ l_map[node.c_value] = str
46
+ elsif l_map[node.c_value].length == str.length
47
+ l_map[node.c_value] = [str, l_map[node.c_value]].min
48
+ end
49
+ else
50
+ l_map[node.c_value] = str
51
+ end
52
+
53
+ node.child.each do |_key, child|
54
+ set_lvalues_dfs(child, str)
55
+ end
56
+ end
57
+
58
+ def set_index_dfs(node, val, size)
59
+ return unless node
60
+
61
+ val += node.end.end - node.start + 1
62
+ if node.index != -1
63
+ node.index = size - val
64
+ c_index = node.index
65
+
66
+ c_index += 1 until uniq_word_map[input[c_index]]
67
+ node.c_values[uniq_word_map[input[c_index]]] = 1
68
+ return
69
+ end
70
+ nums = [node.c_values]
71
+ node.child.each do |_key, child|
72
+ set_index_dfs(child, val, size)
73
+ nums << child.c_values
74
+ end
75
+ if node != root
76
+ node.c_values = nums.transpose.map(&:sum).map { |e| e.positive? ? 1 : 0 }
77
+ end
78
+ node.c_value = node.c_values.sum
79
+ end
80
+
81
+ def start_phase(index)
82
+ last_created_internal_node = nil
83
+ self.end.end += 1
84
+ self.remaining += 1
85
+
86
+ while remaining.positive?
87
+ if active.length.zero?
88
+ if select_node(index)
89
+ active.edge = select_node(index).start
90
+ active.length += 1
91
+ break
92
+ else
93
+ root.child[input[index]] = Node.new(index, self.end, words.count)
94
+ self.remaining -= 1
95
+ end
96
+ else
97
+ begin
98
+ char = next_char(index)
99
+ if char == input[index]
100
+ last_created_internal_node.suffix_link = selected_node if last_created_internal_node
101
+ walk_down(index)
102
+ break
103
+ else
104
+ node = selected_node
105
+ temp_start = node.start
106
+ node.start = node.start + active.length
107
+ new_internal_node = Node.new(temp_start, End.new(temp_start + active.length - 1), words.count)
108
+
109
+ new_leaf_node = Node.new(index, self.end, words.count)
110
+
111
+ new_internal_node.child[input[new_internal_node.start + active.length]] = node
112
+ new_internal_node.child[input[index]] = new_leaf_node
113
+ new_internal_node.index = -1
114
+ active.node.child[input[new_internal_node.start]] = new_internal_node
115
+
116
+ last_created_internal_node.suffix_link = new_internal_node if last_created_internal_node
117
+
118
+ last_created_internal_node = new_internal_node
119
+ new_internal_node.suffix_link = root
120
+
121
+ if active.node != root
122
+ active.node = active.node.suffix_link
123
+ else
124
+ active.edge = active.edge + 1
125
+ active.length -= 1
126
+ end
127
+ self.remaining -= 1
128
+ end
129
+ rescue StandardError
130
+ node = selected_node
131
+ node.child[input[index]] = Node.new(index, self.end, words.count)
132
+ last_created_internal_node.suffix_link = node if last_created_internal_node
133
+ last_created_internal_node = node
134
+
135
+ if active.node != root
136
+ active.node = active.node.suffix_link
137
+ else
138
+ active.edge = active.edge + 1
139
+ active.length -= 1
140
+ end
141
+ self.remaining -= 1
142
+ end
143
+ end
144
+ end
145
+ end
146
+
147
+ def walk_down(index)
148
+ node = selected_node
149
+
150
+ if diff(node) < active.length
151
+ active.node = node
152
+ active.length = active.length - diff(node)
153
+ active.edge = node.child[input[index]].start
154
+ else
155
+ active.length += 1
156
+ end
157
+ end
158
+
159
+ def next_char(i)
160
+ node = selected_node
161
+ return input[active.node.child[input[active.edge]].start + active.length] if diff(node) >= active.length
162
+
163
+ if diff(node) + 1 == active.length
164
+ return input[i] if node.child[input[i]]
165
+ else
166
+ active.node = node
167
+ active.length = active.length - diff(node) - 1
168
+ active.edge = active.edge + diff(node) + 1
169
+ return next_char(i)
170
+ end
171
+ raise 'End Of Path Reached'
172
+ end
173
+
174
+ def diff(node)
175
+ node.end.end - node.start
176
+ end
177
+
178
+ def selected_node
179
+ active.node.child[input[active.edge]]
180
+ end
181
+
182
+ def select_node(index)
183
+ active.node.child[input[index]]
184
+ end
185
+
186
+ def longest_common_substring(k = words.length)
187
+ raise 'Input has to be integer' unless k.is_a? Integer
188
+ raise 'Invalid Input' if k <= 0
189
+
190
+ return base_case if k == 1
191
+
192
+ max_length = -1
193
+ answer = ''
194
+
195
+ l_map.each do |key, v|
196
+ next if key < k
197
+
198
+ if v.length > max_length
199
+ max_length = v.length
200
+ answer = v
201
+ elsif v.length == max_length
202
+ answer = [v, answer].min
203
+ end
204
+ end
205
+ answer
206
+ end
207
+
208
+ def base_case
209
+ max_length = -1
210
+ answer = ''
211
+
212
+ words.each do |word|
213
+ if max_length < word.length
214
+ max_length = word.length
215
+ answer = word
216
+ elsif max_length == word.length
217
+ answer = [answer, word].min
218
+ end
219
+ end
220
+ answer
221
+ end
222
+ end
223
+ end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SuffixTree
4
+ class End
5
+ attr_accessor :end
6
+
7
+ def initialize(end_p)
8
+ @end = end_p
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SuffixTree
4
+ class Node
5
+ attr_accessor :child, :suffix_link, :start, :end, :index, :c_values, :c_value
6
+
7
+ def initialize(start, end_p, word_count)
8
+ @child = {}
9
+ @suffix_link = nil
10
+ @start = start
11
+ @end = end_p
12
+ @index = 0
13
+ @depth = 0
14
+ @c_value = 0
15
+ @c_values = [0] * word_count
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SuffixTree
4
+ VERSION = '0.0.1'
5
+ end
@@ -0,0 +1,26 @@
1
+ require 'minitest/autorun'
2
+ require 'suffix_tree'
3
+
4
+ describe "SuffixTreeTest" do
5
+ let(:words) { ["sandollar", "sandlot", "handler", "grand", "pantry"] }
6
+ let(:tree) { SuffixTree::Base.new(words) }
7
+
8
+ it 'initializes with input' do
9
+ assert tree.words, words
10
+ end
11
+
12
+ it 'builds suffix tree' do
13
+ assert tree.build
14
+ end
15
+
16
+ describe "find longest commom substring" do
17
+ it 'finds longest common subtring for all the strings' do
18
+ assert tree.longest_common_substring, "an"
19
+ end
20
+
21
+ it 'finds longest common subtring for at least k substrings' do
22
+ assert tree.longest_common_substring(2), "sand"
23
+ assert tree.longest_common_substring(3), "and"
24
+ end
25
+ end
26
+ end
metadata ADDED
@@ -0,0 +1,52 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: suffix_tree_ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Sudheer Meka
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2021-04-23 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Construct Generalized Suffix tree using Ukkonen's algorithm
14
+ email: reachme@sudheer-meka.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - Rakefile
20
+ - lib/suffix_tree.rb
21
+ - lib/suffix_tree/active_point.rb
22
+ - lib/suffix_tree/base.rb
23
+ - lib/suffix_tree/end.rb
24
+ - lib/suffix_tree/node.rb
25
+ - lib/suffix_tree/version.rb
26
+ - test/test_suffix_tree.rb
27
+ homepage: https://rubygems.org/gems/suffix_tree
28
+ licenses:
29
+ - MIT
30
+ metadata:
31
+ source_code_uri: https://github.com/sudheer-meka/suffix_tree
32
+ post_install_message:
33
+ rdoc_options: []
34
+ require_paths:
35
+ - lib
36
+ required_ruby_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ requirements: []
47
+ rubygems_version: 3.0.9
48
+ signing_key:
49
+ specification_version: 4
50
+ summary: Suffix Tree Implementaion and operation using ruby
51
+ test_files:
52
+ - test/test_suffix_tree.rb