suffix_tree 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/data/base_data_source.rb +44 -0
- data/lib/data/data_source_factory.rb +16 -0
- data/lib/data/file_data_source.rb +29 -0
- data/lib/data/line_state_machine.rb +86 -0
- data/lib/data/string_data_source.rb +31 -0
- data/lib/data/word_data_source.rb +229 -0
- data/lib/location.rb +165 -0
- data/lib/node.rb +63 -0
- data/lib/node_factory.rb +169 -0
- data/lib/persist/suffix_tree_db.rb +148 -0
- data/lib/search/searcher.rb +68 -0
- data/lib/suffix_linker.rb +16 -0
- data/lib/suffix_tree.rb +122 -0
- data/lib/visitor/base_visitor.rb +17 -0
- data/lib/visitor/bfs.rb +22 -0
- data/lib/visitor/data_source_visitor.rb +15 -0
- data/lib/visitor/dfs.rb +34 -0
- data/lib/visitor/k_common_visitor.rb +71 -0
- data/lib/visitor/leaf_count_visitor.rb +15 -0
- data/lib/visitor/node_count_visitor.rb +16 -0
- data/lib/visitor/numbering_visitor.rb +230 -0
- data/lib/visitor/suffix_offset_visitor.rb +23 -0
- data/lib/visitor/tree_print_visitor.rb +44 -0
- data/lib/visitor/value_depth_visitor.rb +34 -0
- data/spec/constant_lca_spec.rb +27 -0
- data/spec/data_source_spec.rb +51 -0
- data/spec/fixtures/arizona.txt +1 -0
- data/spec/fixtures/chapter1.txt +371 -0
- data/spec/fixtures/chapter1.txt.summary +3 -0
- data/spec/fixtures/chapter1.txt.values +0 -0
- data/spec/fixtures/chapter1.txt.words +1329 -0
- data/spec/fixtures/mississippi.txt +1 -0
- data/spec/fixtures/singlePara.txt +41 -0
- data/spec/fixtures/smallFile.txt +3 -0
- data/spec/fixtures/smallFile.txt.summary +2 -0
- data/spec/fixtures/smallFile.txt.values +0 -0
- data/spec/fixtures/smallFile.txt.words +14 -0
- data/spec/fixtures/testbook.txt +5414 -0
- data/spec/location_spec.rb +149 -0
- data/spec/node_factory_spec.rb +199 -0
- data/spec/search_spec.rb +182 -0
- data/spec/suffix_tree_spec.rb +270 -0
- data/spec/util_spec.rb +47 -0
- data/spec/visitor_spec.rb +310 -0
- metadata +87 -0
@@ -0,0 +1,230 @@
|
|
1
|
+
require_relative '../node'
|
2
|
+
require_relative 'base_visitor'
|
3
|
+
|
4
|
+
# monkey patching dfsNumber and numberNodesInSubtree
|
5
|
+
# Bookkeeping attributes that the least-common-ancestor preprocessing visitors
# in this file attach to every node. All of them are plain read/write accessors.
module NodeExtensions
  # First pass (NumberingVisitor):
  #   dfsNumber            - value of the visitor's pre-order counter at the node
  #   binaryTreeHeight     - BitUtil#rightBit of that counter; required for building runs
  #   numberNodesInSubtree - detects whether a node is a proper ancestor of
  #                          another node (in which case that ancestor is the lca)
  attr_accessor :dfsNumber, :binaryTreeHeight, :numberNodesInSubtree

  # Second pass (RunDefiningVisitor): a run is the path with a single
  # (lowest in tree) node with the greatest binaryTreeHeight. runHead and
  # runTail are the nodes that span the run.
  attr_accessor :runHead, :runTail

  # Third pass (RunBitVisitor): one bit set for each ancestor run.
  attr_accessor :runBits
end
|
21
|
+
|
22
|
+
# Reopen Node so that every node carries the NodeExtensions accessors.
# The module is prepended, so it sits ahead of Node itself in method lookup.
class Node
  prepend(NodeExtensions)
end
|
25
|
+
|
26
|
+
# Bit-position helpers used by the least-common-ancestor preprocessing.
#
# Bit positions are 1-based: position 1 is the least significant bit and a
# result of 0 means "no bit set". Values are assumed to fit in 64 bits.
class BitUtil
  # Width, in bits, of the values these helpers are written for.
  WORD_BITS = 64

  # Builds two 256-entry lookup tables indexed by byte value: the position of
  # the lowest set bit and the position of the highest set bit of that byte.
  # Entry 0 of both tables is 0.
  def initialize
    @rightTable = [0]
    @leftTable = [0]

    (1..255).each do |val|
      @rightTable << rightOneBit(val)
      @leftTable << leftOneBit(val)
    end
  end

  # Returns the 1-based position of the lowest set bit of n, or 0 when n is 0.
  def rightBit(n)
    # For 0 every "low half is empty" test in shiftToByteRight succeeds, so
    # without this guard the accumulated shift count (56) would be returned
    # as if it were a bit position.
    return 0 if n.zero?

    shiftCount, n = shiftToByteRight(n)
    @rightTable[n & 0xff] + shiftCount
  end

  # Returns the 1-based position of the highest set bit of n, or 0 when n is 0.
  def leftBit(n)
    shiftCount, n = shiftToByteLeft(n)
    @leftTable[n & 0xff] + shiftCount
  end

  # Returns the smallest bit position >= startOffset that is set in both v1
  # and v2, or nil when there is no such position up to bit 64.
  def bitGreaterThanOrEqualTo(startOffset, v1, v2)
    common = v1 & v2
    map = 1 << (startOffset - 1)
    # Position 64 is a valid position (leftBit can return it), so the scan
    # has to include it.
    while startOffset <= WORD_BITS
      return startOffset if (common & map) != 0

      startOffset += 1
      map <<= 1
    end
    nil
  end

  # Clears the top (bitNumber + 1) bits of the 64-bit word in n and returns
  # leftBit of what remains.
  # NOTE(review): the name suggests "highest set bit below position
  # bitNumber", but the mask counts bits from the most significant end
  # (e.g. leftMostBitToRightOf(3, 0b0111) is 3). Behaviour is left unchanged;
  # confirm the intended bit numbering against the callers.
  def leftMostBitToRightOf(bitNumber, n)
    leftBit(n & getMask(bitNumber + 1))
  end

  private

  # All-ones 64-bit word shifted right by bitNumber (assumes 64 bit numbers).
  def getMask(bitNumber)
    0xffffffffffffffff >> bitNumber
  end

  # Drops empty low 32/16/8-bit groups so the lowest set bit of n ends up in
  # the low byte. Returns [number of bits shifted out, shifted n].
  def shiftToByteRight(n)
    shiftCount = 0
    if (n & 0xffffffff) == 0
      n >>= 32
      shiftCount += 32
    end
    if (n & 0xffff) == 0
      n >>= 16
      shiftCount += 16
    end
    if (n & 0xff) == 0
      n >>= 8
      shiftCount += 8
    end
    return shiftCount, n
  end

  # Shifts n right by 32/16/8 bits while the upper group is non-empty so the
  # highest set bit of n ends up in the low byte. Returns
  # [number of bits shifted out, shifted n].
  def shiftToByteLeft(n)
    shiftCount = 0
    if (n & 0xffffffff00000000) != 0
      n >>= 32
      shiftCount += 32
    end
    if (n & 0xffff0000) != 0
      n >>= 16
      shiftCount += 16
    end
    if (n & 0xff00) != 0
      n >>= 8
      shiftCount += 8
    end
    return shiftCount, n
  end

  # Position (1..8) of the lowest set bit of a non-zero byte.
  # Only called for 1..255; a zero argument would never terminate.
  def rightOneBit(n)
    mask = 1
    result = 1
    while (n & mask) == 0
      mask <<= 1
      result += 1
    end
    result
  end

  # Position (1..8) of the highest set bit of a non-zero byte.
  # Only called for 1..255; a zero argument would never terminate.
  def leftOneBit(n)
    mask = 1 << 7
    result = 8
    while (n & mask) == 0
      mask >>= 1
      result -= 1
    end
    result
  end
end
|
126
|
+
|
127
|
+
# use BaseVisitor counters to set the values
|
128
|
+
# First least-common-ancestor preprocessing pass: stamps each node with its
# dfsNumber, binaryTreeHeight and numberNodesInSubtree.
# Relies on @preCounter, which is not assigned in this class
# (presumably maintained by BaseVisitor#preVisit - verify there).
class NumberingVisitor < BaseVisitor
  def initialize
    @bitCalculator = BitUtil.new
    super
  end

  # Records the current pre-order counter as the node's dfsNumber and the
  # position of that number's lowest set bit as its binaryTreeHeight.
  # Always returns true.
  def preVisit(node)
    super(node)
    number = @preCounter
    node.dfsNumber = number
    node.binaryTreeHeight = @bitCalculator.rightBit(number)
    true
  end

  # Stores the difference between the current pre-order counter and the
  # node's own dfsNumber, plus one, as the node's subtree size.
  def postVisit(node)
    node.numberNodesInSubtree = 1 + @preCounter - node.dfsNumber
  end
end
|
145
|
+
|
146
|
+
# set the height of the complete binary tree node that each node maps to
|
147
|
+
# Second least-common-ancestor preprocessing pass: groups nodes into runs.
# Reads binaryTreeHeight, so that value must already be set on every node.
#
# After the traversal every node's runTail is correct. runHead is ONLY valid
# in the runTail node of a run.
class RunDefiningVisitor
  # Starts every node as a run of its own. Always returns true.
  def preVisit(node)
    node.runHead = node.runTail = node
    true
  end

  # The child whose runTail has the greatest binaryTreeHeight, provided it is
  # larger than the current node's binaryTreeHeight, contributes the runTail;
  # the current node then becomes the runHead of that run.
  def postVisit(node)
    return if node.children.nil?

    tallest = nil
    tallestHeight = node.binaryTreeHeight
    node.children.each_value do |child|
      height = child.runTail.binaryTreeHeight
      # strictly greater: on ties the earlier child (or the node itself) wins
      next unless height > tallestHeight

      tallestHeight = height
      tallest = child
    end
    return if tallest.nil?

    node.runTail = tallest.runTail

    # runHead is ONLY valid in the runTail node,
    # the alternative is to traverse from runTail to node whenever runHead changes
    # (or to do this only on final change)
    node.runTail.runHead = node
  end
end
|
180
|
+
|
181
|
+
# Third least-common-ancestor preprocessing pass: gives every node a bit set
# (runBits) holding its parent's runBits plus one bit for the
# binaryTreeHeight of the node's own runTail.
class RunBitVisitor
  # startNode is the node the traversal begins at; its runBits start empty.
  def initialize(startNode)
    startNode.runBits = 0
  end

  # Copies the parent's runBits (when there is a parent) and adds this
  # node's run bit. Always returns true.
  def preVisit(node)
    parent = node.parent
    node.runBits = parent.runBits unless parent.nil?
    node.runBits |= getBit(node.runTail.binaryTreeHeight)
    true
  end

  # Nothing to do on the way back up.
  def postVisit(node)
  end

  private

  # Value with only the bit at 1-based position n set.
  def getBit(n)
    1 << (n - 1)
  end
end
|
202
|
+
|
203
|
+
# Visitor that builds a map from suffixOffset to node for every node that has
# no children.
class LeafNodeCollector
  # Hash of suffixOffset => node, filled in during traversal.
  attr_reader :suffixToLeaf

  def initialize
    @suffixToLeaf = {}
  end

  # Records the node when its children are nil. Always returns true.
  def preVisit(node)
    @suffixToLeaf[node.suffixOffset] = node if node.children.nil?
    true
  end

  # Nothing to do on the way back up.
  def postVisit(node)
  end
end
|
220
|
+
|
221
|
+
# Runs the three preprocessing traversals, in order, over the tree that
# starts at startNode: numbering, run definition, then run bits.
class LeastCommonAncestorPreprocessing
  def initialize(startNode)
    # the first two passes use OrderedDFS, the last one a plain DFS
    OrderedDFS.new(NumberingVisitor.new).traverse(startNode)
    OrderedDFS.new(RunDefiningVisitor.new).traverse(startNode)
    DFS.new(RunBitVisitor.new(startNode)).traverse(startNode)
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#
|
2
|
+
# Only makes sense from OrderedDFS
|
3
|
+
#
|
4
|
+
# Collects the suffixOffset of every leaf, in the order the leaves are
# visited, into `result`.
class SuffixOffsetVisitor
  # Array of suffix offsets gathered so far.
  attr_reader :result

  def initialize
    @result = []
  end

  # Appends the node's suffixOffset when the node reports itself as a leaf.
  # Always returns true.
  def preVisit(node)
    @result << node.suffixOffset if node.isLeaf
    true
  end

  # Nothing to do on the way back up.
  def postVisit(node)
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# Visitor that prints one line per node to an IO-like object, indenting each
# line by one space per level of depth.
class TreePrintVisitor
  # Sentinel for `level` meaning "do not limit the depth".
  ALL_LEVELS = -1

  # dataSource - must respond to toString(startOffset, endOffset)
  # io         - must respond to print
  # level      - deepest indentation to descend below, or ALL_LEVELS
  def initialize(dataSource, io, level=ALL_LEVELS)
    @dataSource = dataSource
    @io = io
    @level = level
    @indentation = 0
  end

  # Text shown for a node: "ROOT" for the root, otherwise the data source's
  # rendering of the node's incoming edge.
  def nodeToStr(node)
    return "ROOT" if node.isRoot

    edgeText = @dataSource.toString(node.incomingEdgeStartOffset, node.incomingEdgeEndOffset)
    "#{edgeText}"
  end

  # Prints the node, then returns true (after indenting one level further)
  # while the level limit has not been reached, and false once it has.
  def preVisit(node)
    padding = " " * @indentation
    @io.print "#{padding}#{nodeToStr(node)}\n"
    return false unless @level == ALL_LEVELS || @indentation < @level

    @indentation += 1
    true
  end

  # Steps the indentation back out by one level.
  def postVisit(node)
    @indentation -= 1
  end
end
|
33
|
+
|
34
|
+
# Tree printer whose node label is prefixed with the node's dfsNumber and
# suffixOffset and with its runTail's binaryTreeHeight/dfsNumber.
class DfsTreePrintVisitor < TreePrintVisitor
  def nodeToStr(node)
    ownPart = "#{node.dfsNumber} #{node.suffixOffset}"
    tailPart = "#{node.runTail.binaryTreeHeight}/#{node.runTail.dfsNumber}"
    "#{ownPart}, #{tailPart} #{super}"
  end
end
|
39
|
+
|
40
|
+
# Tree printer whose node label is prefixed with the node's dfsNumber and
# suffixOffset.
class BasicDfsTreePrintVisitor < TreePrintVisitor
  def nodeToStr(node)
    numbering = "#{node.dfsNumber} #{node.suffixOffset}"
    "#{numbering}, #{super}"
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require_relative '../node'
|
2
|
+
|
3
|
+
# Visitor that assigns valueDepth to internal and leaf nodes on the way down.
# NOTE(review): BaseVisitor is used here but this file only requires '../node';
# confirm BaseVisitor is loaded before this file is.
class ValueDepthVisitor < BaseVisitor
  def initialize
    super
  end

  # Internal nodes get their parent's valueDepth plus the length of their
  # incoming edge; leaves get Node::LEAF_DEPTH; any other node is left as is.
  # Always returns true.
  def preVisit(node)
    if node.isInternal
      node.valueDepth = node.parent.valueDepth + node.incomingEdgeLength
    elsif node.isLeaf
      node.valueDepth = Node::LEAF_DEPTH
    end
    true
  end
end
|
17
|
+
|
18
|
+
|
19
|
+
# Visitor that remembers the node with the largest valueDepth seen so far.
class DeepestValueDepthVisitor < BaseVisitor
  # deepestValueDepth     - largest valueDepth seen (0 until a deeper node is visited)
  # deepestValueDepthNode - the node that had it (nil until then)
  attr_reader :deepestValueDepth, :deepestValueDepthNode

  def initialize
    @deepestValueDepthNode = nil
    @deepestValueDepth = 0
    super
  end

  # Keeps the node when its valueDepth is strictly greater than the best so
  # far, so the first node reaching a given depth wins.
  def postVisit(node)
    depth = node.valueDepth
    return unless depth > @deepestValueDepth

    @deepestValueDepth = depth
    @deepestValueDepthNode = node
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require_relative '../lib/data/string_data_source'
|
3
|
+
require_relative '../lib/suffix_tree'
|
4
|
+
require_relative '../lib/visitor/dfs'
|
5
|
+
require_relative '../lib/visitor/numbering_visitor'
|
6
|
+
|
7
|
+
describe 'Preprocesses suffix tree to allow constant time least-common-ancestor' do

  let(:dataSource) { StringDataSource.new("mississippi$") }

  # Runs the three preprocessing passes over the tree, collects the leaves
  # and prints "<suffix offset> <dfsNumber>" for each one in offset order.
  # There are no expectations here; the example only checks that the
  # pipeline runs without raising.
  it 'should find least-common-ancestor of any two leaf' do
    tree = SuffixTree.new
    tree.addDataSource(dataSource)

    OrderedDFS.new(NumberingVisitor.new).traverse(tree.root)
    OrderedDFS.new(RunDefiningVisitor.new).traverse(tree.root)
    DFS.new(RunBitVisitor.new(tree.root)).traverse(tree.root)

    collector = LeafNodeCollector.new
    DFS.new(collector).traverse(tree.root)

    collector.suffixToLeaf.keys.sort.each do |offset|
      print "#{offset} #{collector.suffixToLeaf[offset].dfsNumber}\n"
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require_relative '../lib/data/file_data_source'
|
3
|
+
require_relative '../lib/data/string_data_source'
|
4
|
+
require_relative '../lib/data/word_data_source'
|
5
|
+
|
6
|
+
describe 'reads data sources' do

  describe 'WordDataSource' do
    # Spot-checks individual word positions of the singlePara.txt fixture.
    it "should read words" do
      wordDataSource = WordDataSource.new(File.join('spec', 'fixtures', "singlePara.txt"))
      expect(wordDataSource.valueAt(0)).to eq "i"
      expect(wordDataSource.valueAt(1)).to eq "was"
      expect(wordDataSource.valueAt(2)).to eq "born"
      expect(wordDataSource.valueAt(8)).to eq "angora"
      expect(wordDataSource.valueAt(16)).to eq "silky-haired"
      expect(wordDataSource.valueAt(19)).to eq "goats"
    end
  end

  describe "#extendWith on a StringDataSource" do
    it "allows a data source to be extended with another" do
      # this is used for making generalized suffix tree from multiple data sources
      # we need ability to treat them as belonging to one large data source
      sd1 = StringDataSource.new "abc"
      sd2 = StringDataSource.new "def"
      sd1.extendWith(sd2, 3)
      # offsets 0..2 come from the first source
      expect(sd1.valueAt(0)).to eq "a"
      expect(sd1.valueAt(1)).to eq "b"
      expect(sd1.valueAt(2)).to eq "c"
      # offsets 3..5 come from the second source, and nothing follows them
      expect(sd1.valueAt(3)).to eq "d"
      expect(sd1.valueAt(4)).to eq "e"
      expect(sd1.valueAt(5)).to eq "f"
      expect(sd1.valueAt(6)).to eq nil
    end
  end

  describe "#extendWith on a FileDataSource" do
    it "allows file data sources to be extended" do
      fd1 = FileDataSource.new(File.join('spec', 'fixtures', "mississippi.txt"))
      fd2 = FileDataSource.new(File.join('spec', 'fixtures', "arizona.txt"))
      # the second file is attached at offset 11
      fd1.extendWith(fd2, 11)
      expect(fd1.valueAt(2)).to eq "s"
      expect(fd1.valueAt(10)).to eq "i"
      expect(fd1.valueAt(11)).to eq "a"
      expect(fd1.valueAt(12)).to eq "r"
      expect(fd1.valueAt(17)).to eq "a"
      expect(fd1.valueAt(18)).to eq nil
    end
  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
arizona
|