suffix_tree 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/lib/data/base_data_source.rb +44 -0
  3. data/lib/data/data_source_factory.rb +16 -0
  4. data/lib/data/file_data_source.rb +29 -0
  5. data/lib/data/line_state_machine.rb +86 -0
  6. data/lib/data/string_data_source.rb +31 -0
  7. data/lib/data/word_data_source.rb +229 -0
  8. data/lib/location.rb +165 -0
  9. data/lib/node.rb +63 -0
  10. data/lib/node_factory.rb +169 -0
  11. data/lib/persist/suffix_tree_db.rb +148 -0
  12. data/lib/search/searcher.rb +68 -0
  13. data/lib/suffix_linker.rb +16 -0
  14. data/lib/suffix_tree.rb +122 -0
  15. data/lib/visitor/base_visitor.rb +17 -0
  16. data/lib/visitor/bfs.rb +22 -0
  17. data/lib/visitor/data_source_visitor.rb +15 -0
  18. data/lib/visitor/dfs.rb +34 -0
  19. data/lib/visitor/k_common_visitor.rb +71 -0
  20. data/lib/visitor/leaf_count_visitor.rb +15 -0
  21. data/lib/visitor/node_count_visitor.rb +16 -0
  22. data/lib/visitor/numbering_visitor.rb +230 -0
  23. data/lib/visitor/suffix_offset_visitor.rb +23 -0
  24. data/lib/visitor/tree_print_visitor.rb +44 -0
  25. data/lib/visitor/value_depth_visitor.rb +34 -0
  26. data/spec/constant_lca_spec.rb +27 -0
  27. data/spec/data_source_spec.rb +51 -0
  28. data/spec/fixtures/arizona.txt +1 -0
  29. data/spec/fixtures/chapter1.txt +371 -0
  30. data/spec/fixtures/chapter1.txt.summary +3 -0
  31. data/spec/fixtures/chapter1.txt.values +0 -0
  32. data/spec/fixtures/chapter1.txt.words +1329 -0
  33. data/spec/fixtures/mississippi.txt +1 -0
  34. data/spec/fixtures/singlePara.txt +41 -0
  35. data/spec/fixtures/smallFile.txt +3 -0
  36. data/spec/fixtures/smallFile.txt.summary +2 -0
  37. data/spec/fixtures/smallFile.txt.values +0 -0
  38. data/spec/fixtures/smallFile.txt.words +14 -0
  39. data/spec/fixtures/testbook.txt +5414 -0
  40. data/spec/location_spec.rb +149 -0
  41. data/spec/node_factory_spec.rb +199 -0
  42. data/spec/search_spec.rb +182 -0
  43. data/spec/suffix_tree_spec.rb +270 -0
  44. data/spec/util_spec.rb +47 -0
  45. data/spec/visitor_spec.rb +310 -0
  46. metadata +87 -0
@@ -0,0 +1,16 @@
1
# Visitor that tallies how many nodes a traversal passes to preVisit.
class NodeCountVisitor
  # Number of preVisit calls received so far.
  attr_reader :count

  def initialize
    @count = 0
  end

  # Counts the node and always answers true.
  # NOTE(review): true is presumably the "descend into children" signal for
  # the traversal classes, which are not visible here.
  def preVisit(node)
    @count = @count.succ
    true
  end

  # Intentionally empty: counting happens on the way down only.
  def postVisit(node); end
end
@@ -0,0 +1,230 @@
1
+ require_relative '../node'
2
+ require_relative 'base_visitor'
3
+
4
# Extra per-node state needed by the least-common-ancestor preprocessing.
# The accessors live in a module that is prepended to Node (below), so Node
# gains them without node.rb being edited.
module NodeExtensions
  # Written by NumberingVisitor#preVisit: the node's depth-first number and the
  # 1-based position of the lowest set bit of that number.
  # binaryTreeHeight is what the run-building pass compares.
  attr_accessor :dfsNumber, :binaryTreeHeight

  # Written by NumberingVisitor#postVisit. Used to detect whether one node is a
  # proper ancestor of another (in which case that ancestor is the lca).
  attr_accessor :numberNodesInSubtree

  # Written by RunDefiningVisitor. A run is the path that ends in the single
  # (lowest in tree) node with the greatest binaryTreeHeight; runHead and
  # runTail are the nodes that span the run.
  attr_accessor :runHead, :runTail

  # Written by RunBitVisitor: one bit per ancestor run.
  attr_accessor :runBits
end

# Reopen Node and put the accessors above in front of its own methods.
class Node
  prepend NodeExtensions
end
25
+
26
# Bit-position helpers for non-negative integers treated as 64-bit words.
#
# rightBit/leftBit/bitGreaterThanOrEqualTo number bits from 1 (least
# significant) up to 64; a result of 0 means "no bit set".
class BitUtil
  # Highest bit position handled (the code assumes 64-bit values).
  WORD_BITS = 64

  # Widths used to narrow a 64-bit value down to a single byte.
  HALVING_WIDTHS = [32, 16, 8].freeze

  # Precomputes, for every byte value 0..255, the position of its lowest and
  # highest set bit so that the public lookups need no per-call bit loop.
  def initialize
    # Entry 0 stays 0: a zero byte has no set bit.
    @rightTable = [0]
    @leftTable = [0]

    (1..255).each do |val|
      @rightTable << rightOneBit(val)
      @leftTable << leftOneBit(val)
    end
  end

  # Position of the lowest set bit of n.
  #
  # @param n [Integer] non-negative value that fits in 64 bits
  # @return [Integer] 1..64, or 0 when n is 0
  def rightBit(n)
    # Without this guard a zero passes through every shift stage and would be
    # reported as bit 56, which disagrees with leftBit(0) == 0.
    return 0 if n.zero?

    shiftCount, n = shiftToByteRight(n)
    @rightTable[n & 0xff] + shiftCount
  end

  # Position of the highest set bit of n.
  #
  # @param n [Integer] non-negative value that fits in 64 bits
  # @return [Integer] 1..64, or 0 when n is 0
  def leftBit(n)
    shiftCount, n = shiftToByteLeft(n)
    @leftTable[n & 0xff] + shiftCount
  end

  # Lowest bit position at or above startOffset that is set in both v1 and v2.
  #
  # @param startOffset [Integer] 1-based position to start searching from
  # @param v1 [Integer]
  # @param v2 [Integer]
  # @return [Integer, nil] the position (startOffset..64), or nil when the two
  #   values share no set bit in that range
  def bitGreaterThanOrEqualTo(startOffset, v1, v2)
    common = v1 & v2
    map = 1 << (startOffset - 1)
    # The bound is inclusive: bit 64 is a legal position (rightBit and leftBit
    # can both return it), so it has to be tested as well.
    while startOffset <= WORD_BITS
      return startOffset if (common & map) != 0

      startOffset += 1
      map <<= 1
    end
    nil
  end

  # Masks n with (0xffff_ffff_ffff_ffff >> (bitNumber + 1)) and returns leftBit
  # of what remains.
  #
  # NOTE(review): that mask clears the top (bitNumber + 1) bits of the 64-bit
  # word, i.e. it treats bitNumber as counted from the most significant end,
  # unlike the other methods here. Confirm the intended numbering against the
  # callers/specs before relying on it.
  #
  # @param bitNumber [Integer]
  # @param n [Integer]
  # @return [Integer] leftBit of the masked value (0 when nothing remains)
  def leftMostBitToRightOf(bitNumber, n)
    leftBit(n & getMask(bitNumber + 1))
  end

  private

  # All-ones 64-bit word shifted right by bitNumber.
  def getMask(bitNumber)
    # assume 64 bit numbers
    0xffff_ffff_ffff_ffff >> bitNumber
  end

  # Drops all-zero low halves (32, then 16, then 8 bits) so the lowest set bit
  # ends up in the low byte. Returns [bits shifted out, shifted value].
  def shiftToByteRight(n)
    shiftCount = 0
    HALVING_WIDTHS.each do |width|
      low_half = (1 << width) - 1
      next unless (n & low_half).zero?

      n >>= width
      shiftCount += width
    end
    [shiftCount, n]
  end

  # Shifts right by 32, 16, then 8 whenever the upper half is non-zero, so the
  # highest set bit ends up in the low byte. Returns [bits shifted out,
  # shifted value].
  def shiftToByteLeft(n)
    shiftCount = 0
    HALVING_WIDTHS.each do |width|
      high_half = ((1 << width) - 1) << width
      next if (n & high_half).zero?

      n >>= width
      shiftCount += width
    end
    [shiftCount, n]
  end

  # 1-based position of the lowest set bit of a byte.
  # n must be non-zero (only called with 1..255); a zero would never terminate.
  def rightOneBit(n)
    mask = 1
    result = 1
    while (n & mask).zero?
      mask <<= 1
      result += 1
    end
    result
  end

  # 1-based position of the highest set bit of a byte.
  # n must be non-zero (only called with 1..255); a zero would never terminate.
  def leftOneBit(n)
    mask = 1 << 7
    result = 8
    while (n & mask).zero?
      mask >>= 1
      result -= 1
    end
    result
  end
end
126
+
127
# First preprocessing pass: stamps each node with dfsNumber, binaryTreeHeight
# and numberNodesInSubtree.
#
# Reads @preCounter, which this class never assigns.
# NOTE(review): presumably BaseVisitor#preVisit maintains that counter;
# BaseVisitor is not visible here, confirm.
class NumberingVisitor < BaseVisitor
  def initialize
    @bitCalculator = BitUtil.new
    super
  end

  # Lets the superclass handle the visit first, then records the current
  # counter as the node's dfsNumber and the position of that number's lowest
  # set bit as its binaryTreeHeight. Always returns true.
  def preVisit(node)
    super
    number = @preCounter
    node.dfsNumber = number
    node.binaryTreeHeight = @bitCalculator.rightBit(number)
    true
  end

  # Stores (current counter - node's own dfsNumber + 1) as the subtree size.
  def postVisit(node)
    node.numberNodesInSubtree = 1 + @preCounter - node.dfsNumber
  end
end
145
+
146
# Pass that groups nodes into runs by setting runHead/runTail on each node.
# It reads binaryTreeHeight, so that value must already be set on every node
# (NumberingVisitor does this).
class RunDefiningVisitor
  # Starts every node as a run of its own.
  #
  # every node gets the runTail set correctly;
  # runHead is ONLY valid in the runTail node
  #
  # @return [true] always
  def preVisit(node)
    node.runHead = node.runTail = node
    true
  end

  # Extends a child's run up to this node when that child's run tail has a
  # greater binaryTreeHeight than this node. Among several such children the
  # one with the greatest height wins (the first one seen on a tie).
  #
  # The winning run's tail becomes this node's runTail, and this node is
  # recorded as that tail's runHead.
  def postVisit(node)
    children = node.children
    return if children.nil?

    best_height = node.binaryTreeHeight
    best_child = nil
    children.each_value do |child|
      child_height = child.runTail.binaryTreeHeight
      next unless child_height > best_height

      best_height = child_height
      best_child = child
    end
    return if best_child.nil?

    node.runTail = best_child.runTail

    # runHead is ONLY valid in the runTail node,
    # the alternative is to traverse from runTail to node whenever runHead changes
    # (or to do this only on final change)
    node.runTail.runHead = node
  end
end
180
+
181
# Pass that fills in runBits: each node inherits its parent's bits and adds the
# bit for the binaryTreeHeight of its own run tail.
class RunBitVisitor
  # Seeds the traversal's start node with an empty bit set, because that node
  # has no parent to inherit from.
  def initialize(startNode)
    startNode.runBits = 0
  end

  # Copies the parent's runBits (when there is a parent) and ORs in this
  # node's own run bit. Always returns true.
  def preVisit(node)
    parent = node.parent
    node.runBits = parent.runBits unless parent.nil?
    node.runBits |= getBit(node.runTail.binaryTreeHeight)
    true
  end

  # Nothing to do on the way back up.
  def postVisit(node); end

  private

  # Single-bit mask for 1-based bit position n.
  def getBit(n)
    1 << (n - 1)
  end
end
202
+
203
# Visitor that indexes childless nodes by their suffixOffset.
class LeafNodeCollector
  # Hash of suffixOffset => node, one entry per node without children.
  attr_reader :suffixToLeaf

  def initialize
    @suffixToLeaf = {}
  end

  # Records the node when its children collection is nil.
  # Always returns true.
  def preVisit(node)
    @suffixToLeaf[node.suffixOffset] = node if node.children.nil?
    true
  end

  # Nothing to do on the way back up.
  def postVisit(node); end
end
220
+
221
# Runs the three preprocessing passes, in order, over the tree below startNode.
# All work happens in the constructor; the instance keeps no state.
class LeastCommonAncestorPreprocessing
  # @param startNode the node the three traversals start from
  def initialize(startNode)
    # pass 1: dfsNumber, binaryTreeHeight, numberNodesInSubtree
    OrderedDFS.new(NumberingVisitor.new).traverse(startNode)
    # pass 2: runHead / runTail
    OrderedDFS.new(RunDefiningVisitor.new).traverse(startNode)
    # pass 3: runBits (the visitor also seeds startNode.runBits with 0)
    DFS.new(RunBitVisitor.new(startNode)).traverse(startNode)
  end
end
@@ -0,0 +1,23 @@
1
#
# Collects the suffixOffset of every leaf, in visiting order.
# Only makes sense from OrderedDFS
#
class SuffixOffsetVisitor
  # Array of suffix offsets gathered so far.
  attr_reader :result

  def initialize
    @result = []
  end

  # Appends the node's suffixOffset when node.isLeaf is truthy.
  # Always returns true.
  def preVisit(node)
    @result.push(node.suffixOffset) if node.isLeaf
    true
  end

  # Nothing to do on the way back up.
  def postVisit(node); end
end
@@ -0,0 +1,44 @@
1
# Visitor that writes one line per visited node to an IO, indented by one
# space per level, optionally stopping at a maximum depth.
class TreePrintVisitor
  # Sentinel for the level argument meaning "no depth limit".
  ALL_LEVELS = -1

  # @param dataSource object answering toString(startOffset, endOffset)
  # @param io         object answering print
  # @param level      deepest indentation to descend past, or ALL_LEVELS
  def initialize(dataSource, io, level=ALL_LEVELS)
    @dataSource = dataSource
    @io = io
    @level = level
    @indentation = 0
  end

  # Label for a node: "ROOT" for the root, otherwise the text of the node's
  # incoming edge as rendered by the data source.
  def nodeToStr(node)
    return "ROOT" if node.isRoot

    "#{@dataSource.toString(node.incomingEdgeStartOffset, node.incomingEdgeEndOffset)}"
  end

  # Prints the node's line, then returns true (after deepening the
  # indentation) while below the level limit, or false once it is reached.
  def preVisit(node)
    @io.print "#{" " * @indentation}#{nodeToStr(node)}\n"
    return false unless @level == ALL_LEVELS || @indentation < @level

    @indentation += 1
    true
  end

  # Undoes the indentation step taken in preVisit.
  # NOTE(review): preVisit does not indent when it returns false, so this
  # assumes the traversal skips postVisit for such nodes; confirm in DFS.
  def postVisit(node)
    @indentation -= 1
  end
end
33
+
34
# TreePrintVisitor whose labels are prefixed with the node's dfsNumber and
# suffixOffset, plus the binaryTreeHeight and dfsNumber of its run tail.
class DfsTreePrintVisitor < TreePrintVisitor
  # Format: "<dfsNumber> <suffixOffset>, <tail height>/<tail dfsNumber> <base label>"
  def nodeToStr(node)
    tail = node.runTail
    base_label = super
    "#{node.dfsNumber} #{node.suffixOffset}, #{tail.binaryTreeHeight}/#{tail.dfsNumber} #{base_label}"
  end
end
39
+
40
# TreePrintVisitor whose labels are prefixed with the node's dfsNumber and
# suffixOffset.
class BasicDfsTreePrintVisitor < TreePrintVisitor
  # Format: "<dfsNumber> <suffixOffset>, <base label>"
  def nodeToStr(node)
    base_label = super
    "#{node.dfsNumber} #{node.suffixOffset}, #{base_label}"
  end
end
@@ -0,0 +1,34 @@
1
+ require_relative '../node'
2
+
3
# Visitor that assigns valueDepth to nodes on the way down the tree.
class ValueDepthVisitor < BaseVisitor
  def initialize
    super
  end

  # Internal nodes get their parent's valueDepth plus the length of their
  # incoming edge; leaves get the Node::LEAF_DEPTH marker. A node that is
  # neither is left untouched. Always returns true.
  def preVisit(node)
    if node.isInternal
      parent_depth = node.parent.valueDepth
      node.valueDepth = parent_depth + node.incomingEdgeLength
    elsif node.isLeaf
      node.valueDepth = Node::LEAF_DEPTH
    end
    true
  end
end
17
+
18
+
19
# Visitor that remembers the node with the greatest valueDepth it has seen.
class DeepestValueDepthVisitor < BaseVisitor
  # Greatest valueDepth seen so far (0 until a deeper node is visited) and the
  # node that had it (nil until then).
  attr_reader :deepestValueDepth, :deepestValueDepthNode

  def initialize
    @deepestValueDepthNode = nil
    @deepestValueDepth = 0
    super
  end

  # Keeps the node when its valueDepth is strictly greater than the current
  # record, so the first node seen at a given depth wins ties.
  def postVisit(node)
    depth = node.valueDepth
    return unless depth > @deepestValueDepth

    @deepestValueDepth = depth
    @deepestValueDepthNode = node
  end
end
@@ -0,0 +1,27 @@
1
+ require 'rspec'
2
+ require_relative '../lib/data/string_data_source'
3
+ require_relative '../lib/suffix_tree'
4
+ require_relative '../lib/visitor/dfs'
5
+ require_relative '../lib/visitor/numbering_visitor'
6
+
7
describe 'Preprocesses suffix tree to allow constant time least-common-ancestor' do
  let(:dataSource) { StringDataSource.new("mississippi$") }

  # NOTE(review): this example only runs the passes and prints each leaf's
  # suffix offset with its dfsNumber; it contains no expectations.
  it 'should find least-common-ancestor of any two leaf' do
    st = SuffixTree.new
    st.addDataSource(dataSource)
    root = st.root

    # the three preprocessing passes, in the order they depend on each other
    OrderedDFS.new(NumberingVisitor.new).traverse(root)
    OrderedDFS.new(RunDefiningVisitor.new).traverse(root)
    DFS.new(RunBitVisitor.new(root)).traverse(root)

    leafNodeCollector = LeafNodeCollector.new
    DFS.new(leafNodeCollector).traverse(root)

    leaves = leafNodeCollector.suffixToLeaf
    leaves.keys.sort.each do |offset|
      print "#{offset} #{leaves[offset].dfsNumber}\n"
    end
  end
end
@@ -0,0 +1,51 @@
1
+ require 'rspec'
2
+ require_relative '../lib/data/file_data_source'
3
+ require_relative '../lib/data/string_data_source'
4
+ require_relative '../lib/data/word_data_source'
5
+
6
describe 'reads data sources' do

  describe 'WordDataSource' do
    # Spot-checks individual word positions read from the singlePara.txt fixture.
    it "should read words" do
      wordDataSource = WordDataSource.new(File.join('spec', 'fixtures', "singlePara.txt"))
      expect(wordDataSource.valueAt(0)).to eq "i"
      expect(wordDataSource.valueAt(1)).to eq "was"
      expect(wordDataSource.valueAt(2)).to eq "born"
      expect(wordDataSource.valueAt(8)).to eq "angora"
      expect(wordDataSource.valueAt(16)).to eq "silky-haired"
      expect(wordDataSource.valueAt(19)).to eq "goats"
    end
  end

  describe "#extendWith on a StringDataSource" do
    it "allows a data source to be extended with another" do
      # this is used for making generalized suffix tree from multiple data sources
      # we need ability to treat them as belonging to one large data source
      sd1 = StringDataSource.new "abc"
      sd2 = StringDataSource.new "def"
      sd1.extendWith(sd2, 3)
      expect(sd1.valueAt(0)).to eq "a"
      expect(sd1.valueAt(1)).to eq "b"
      expect(sd1.valueAt(2)).to eq "c"
      # offsets 3..5 are served by the second source
      expect(sd1.valueAt(3)).to eq "d"
      expect(sd1.valueAt(4)).to eq "e"
      expect(sd1.valueAt(5)).to eq "f"
      # one past the end of the combined data
      expect(sd1.valueAt(6)).to eq nil
    end
  end

  describe "#extendWith on a FileDataSource" do
    it "allows file data sources to be extended" do
      fd1 = FileDataSource.new(File.join('spec', 'fixtures', "mississippi.txt"))
      fd2 = FileDataSource.new(File.join('spec', 'fixtures', "arizona.txt"))
      # the second file is attached at offset 11
      fd1.extendWith(fd2, 11)
      expect(fd1.valueAt(2)).to eq "s"
      expect(fd1.valueAt(10)).to eq "i"
      expect(fd1.valueAt(11)).to eq "a"
      expect(fd1.valueAt(12)).to eq "r"
      expect(fd1.valueAt(17)).to eq "a"
      # one past the end of the combined data
      expect(fd1.valueAt(18)).to eq nil
    end
  end
end
@@ -0,0 +1 @@
1
+ arizona