suffix_tree 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/data/base_data_source.rb +44 -0
- data/lib/data/data_source_factory.rb +16 -0
- data/lib/data/file_data_source.rb +29 -0
- data/lib/data/line_state_machine.rb +86 -0
- data/lib/data/string_data_source.rb +31 -0
- data/lib/data/word_data_source.rb +229 -0
- data/lib/location.rb +165 -0
- data/lib/node.rb +63 -0
- data/lib/node_factory.rb +169 -0
- data/lib/persist/suffix_tree_db.rb +148 -0
- data/lib/search/searcher.rb +68 -0
- data/lib/suffix_linker.rb +16 -0
- data/lib/suffix_tree.rb +122 -0
- data/lib/visitor/base_visitor.rb +17 -0
- data/lib/visitor/bfs.rb +22 -0
- data/lib/visitor/data_source_visitor.rb +15 -0
- data/lib/visitor/dfs.rb +34 -0
- data/lib/visitor/k_common_visitor.rb +71 -0
- data/lib/visitor/leaf_count_visitor.rb +15 -0
- data/lib/visitor/node_count_visitor.rb +16 -0
- data/lib/visitor/numbering_visitor.rb +230 -0
- data/lib/visitor/suffix_offset_visitor.rb +23 -0
- data/lib/visitor/tree_print_visitor.rb +44 -0
- data/lib/visitor/value_depth_visitor.rb +34 -0
- data/spec/constant_lca_spec.rb +27 -0
- data/spec/data_source_spec.rb +51 -0
- data/spec/fixtures/arizona.txt +1 -0
- data/spec/fixtures/chapter1.txt +371 -0
- data/spec/fixtures/chapter1.txt.summary +3 -0
- data/spec/fixtures/chapter1.txt.values +0 -0
- data/spec/fixtures/chapter1.txt.words +1329 -0
- data/spec/fixtures/mississippi.txt +1 -0
- data/spec/fixtures/singlePara.txt +41 -0
- data/spec/fixtures/smallFile.txt +3 -0
- data/spec/fixtures/smallFile.txt.summary +2 -0
- data/spec/fixtures/smallFile.txt.values +0 -0
- data/spec/fixtures/smallFile.txt.words +14 -0
- data/spec/fixtures/testbook.txt +5414 -0
- data/spec/location_spec.rb +149 -0
- data/spec/node_factory_spec.rb +199 -0
- data/spec/search_spec.rb +182 -0
- data/spec/suffix_tree_spec.rb +270 -0
- data/spec/util_spec.rb +47 -0
- data/spec/visitor_spec.rb +310 -0
- metadata +87 -0
data/lib/node_factory.rb
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
require_relative 'node'
|
2
|
+
|
3
|
+
#
# Creates the nodes of a suffix tree, wires parents to children, and hands
# every touched node to an optional persister.
#
# Optional per-node properties (leafCount, valueDepth, previousValue,
# dataSourceBit) are switched on through the configuration hash.
#
class NodeFactory
  attr_reader :nextNodeId, :root, :dataSource, :configuration, :db

  # dataSource:: object answering valueAt(offset); may be nil and supplied
  #              later through extendDataSource
  # persister::  optional object answering persist(node)
  def initialize(dataSource, persister=nil)
    @dataSource = dataSource
    @suffixOffset = 0
    @configuration = {
      :leafCount => false,
      :valueDepth => false,
      :previousValue => false,
      :dataSourceBit => false
    }
    @db = persister
    reset
  end

  # Restart node numbering at 1 (the id handed to the next node created).
  def reset
    @nextNodeId = 1
  end

  # Shift the current data source bit one position to the left, but only when
  # data source bits are configured and a data source is already present.
  def nextDataSourceBit
    return unless @configuration[:dataSourceBit] && !@dataSource.nil?
    @dataSourceBit <<= 1
  end

  # Register another data source: the first one becomes the factory's data
  # source, later ones are appended to it through extendWith.
  def extendDataSource(dataSource, startOffset)
    nextDataSourceBit
    return @dataSource = dataSource if @dataSource.nil?
    @dataSource.extendWith(dataSource, startOffset)
  end

  # After every modForSwitch leaves, addLeaf advances the data source bit.
  def nextDataSourceSetSize(modForSwitch)
    @nextDataSourceSwitch = modForSwitch
  end

  # Overlay the given settings on the current configuration; returns self so
  # calls can be chained.
  def setConfiguration configurationHash
    configurationHash.each { |name, setting| @configuration[name] = setting }
    self
  end

  # Build a fresh root node (id 1) with an empty child table, define the
  # accessors for every enabled configuration property, and persist it.
  def newRoot
    reset
    @root = newNode
    @root.children = {}
    @configuration.each do |name, enabled|
      @root.createAccessor(name.to_s) if enabled
    end

    # seed the configuration controlled properties on the root
    @root.valueDepth = 0 if @configuration[:valueDepth]
    @root.leafCount = 0 if @configuration[:leafCount]
    if @configuration[:dataSourceBit]
      @dataSourceBit = 1
      @root.dataSourceBit = @dataSourceBit
    end

    persist(@root)
  end

  #
  # Leaves are added in suffix order: each new leaf takes the factory's
  # running suffix offset, which is then advanced by one.
  #
  def addLeaf(node, value, offset)
    leaf = newChild(node, value, @suffixOffset, offset, Node::CURRENT_ENDING_OFFSET)

    # optional configuration based properties
    leaf.leafCount = 1 if @configuration[:leafCount]
    if @suffixOffset > 0 && @configuration[:previousValue]
      leaf.previousValue = @dataSource.valueAt(@suffixOffset - 1)
    end
    leaf.dataSourceBit = @dataSourceBit if @configuration[:dataSourceBit]

    @suffixOffset += 1
    switchEvery = @nextDataSourceSwitch
    nextDataSourceBit if !switchEvery.nil? && (@suffixOffset % switchEvery).zero?

    persist(leaf)
  end

  # Split the edge leading into node just before incomingEdgeOffset. A new
  # internal node takes over the first part of the edge under node's parent,
  # and node is re-attached beneath it. Returns the new internal node.
  def splitEdgeAt(node, incomingEdgeOffset)
    edgeStart = node.incomingEdgeStartOffset
    upper = newChild(node.parent, @dataSource.valueAt(edgeStart), node.suffixOffset, edgeStart, incomingEdgeOffset - 1)
    node.incomingEdgeStartOffset = incomingEdgeOffset
    addChild(upper, @dataSource.valueAt(incomingEdgeOffset), node)

    # optional configuration based properties
    if @configuration[:valueDepth]
      upper.valueDepth = upper.parent.valueDepth + upper.incomingEdgeLength
    end
    if @configuration[:dataSourceBit]
      upper.dataSourceBit = node.dataSourceBit | @dataSourceBit
    end

    persist(node)
    persist(upper)
  end

  #
  # Join, with delimiter, the values found on the edges from the root down to
  # node. An edge ending in Node::CURRENT_ENDING_OFFSET contributes only its
  # first value.
  #
  def valuePath(node, delimiter=' ')
    collected = []
    until node.parent.nil?
      reverseAddValues(collected, node.incomingEdgeStartOffset, node.incomingEdgeEndOffset)
      node = node.parent
    end
    collected.reverse.join(delimiter)
  end

  #
  # internal helpers
  #
  private

  # Append the values of one edge to result, last value first (the caller
  # walks towards the root and reverses the whole list at the end).
  def reverseAddValues(result, startOffset, endOffset)
    if endOffset == Node::CURRENT_ENDING_OFFSET
      result << @dataSource.valueAt(startOffset)
    else
      endOffset.downto(startOffset) do |position|
        result << @dataSource.valueAt(position)
      end
    end
  end

  # Create a node with the given edge/suffix offsets and hang it under node
  # using key.
  def newChild(node, key, suffixOffset, incomingEdgeStartOffset, incomingEdgeEndOffset)
    newNode.tap do |child|
      child.suffixOffset = suffixOffset
      child.incomingEdgeStartOffset = incomingEdgeStartOffset
      child.incomingEdgeEndOffset = incomingEdgeEndOffset
      addChild(node, key, child)
      child.valueDepth = 0 if @configuration[:valueDepth]
    end
  end

  # Allocate the next node id. The root (id 1) is skipped for the optional
  # properties because newRoot defines those accessors only after the root
  # node exists.
  def newNode
    created = Node.new(@nextNodeId)
    pastRoot = @nextNodeId > 1
    created.leafCount = 0 if pastRoot && @configuration[:leafCount]
    created.dataSourceBit = @dataSourceBit if pastRoot && @configuration[:dataSourceBit]
    @nextNodeId += 1
    created
  end

  # Link childNode under parentNode at value, creating the child table on
  # demand, then persist both ends of the link.
  def addChild(parentNode, value, childNode)
    parentNode.children = {} if parentNode.children.nil?
    parentNode.children[value] = childNode
    childNode.parent = parentNode
    persist(parentNode)
    persist(childNode)
  end

  # Forward node to the persister when one was supplied; always returns node.
  def persist(node)
    @db.persist(node) unless @db.nil?
    node
  end
end
|
@@ -0,0 +1,148 @@
|
|
1
|
+
require_relative '../node'
|
2
|
+
|
3
|
+
#
# Plain-text store for suffix tree nodes.
#
# Each persisted node becomes one line of whitespace separated integers:
#
#   nodeId parentId edgeStart edgeEnd suffixOffset suffixLinkId childId... 0
#
# A missing parent or suffix link is written as 0, and the trailing 0 ends
# the child list.
#
class SuffixTreeDB
  # textFile:: path of the backing file
  # mode::     File.open mode. The default "w" keeps the original behaviour
  #            (create/truncate for writing); pass "r" to read a previously
  #            written file back with readInt.
  def initialize(textFile, mode = "w")
    @textFile = File.open(textFile, mode)
    @dataValues = []
    @dataValueIdx = 0
  end

  # Flush and release the underlying file handle. Safe to call repeatedly.
  def close
    @textFile.close unless @textFile.closed?
  end

  # Id used on disk for a node reference: 0 stands for "no node".
  def val(node)
    node.nil? ? 0 : node.nodeId
  end

  # Append one line describing node (see the class comment for the layout).
  def persist(node)
    @textFile.print "#{node.nodeId} #{val(node.parent)} #{node.incomingEdgeStartOffset} #{node.incomingEdgeEndOffset} #{node.suffixOffset} #{val(node.suffixLink)}"
    unless node.children.nil?
      node.children.values.each do |childNode|
        @textFile.print " #{childNode.nodeId}"
      end
    end
    @textFile.print " 0\n"
  end

  # Return the next integer from the file, reading a new line whenever the
  # current one is exhausted. Returns 0 at end of file; a blank line also
  # yields a single 0.
  def readInt()
    if @dataValueIdx >= @dataValues.length
      # gets returns nil at end of file (readline would raise EOFError)
      line = @textFile.gets
      return 0 if line.nil?

      @dataValueIdx = 0
      @dataValues = line.split
    end

    result = @dataValues[@dataValueIdx].to_i
    @dataValueIdx += 1
    result
  end
end
|
48
|
+
|
49
|
+
#
# Rebuilds an in-memory tree from the integer stream produced by a
# SuffixTreeDB-like reader (anything answering readInt).
#
# Records may mention a node id before the record defining that node has been
# read; such references are parked in the "unresolved" tables and completed
# when the node finally shows up.
#
class SuffixTreeBuilder
  # suffix offset read from the most recently built node record
  attr_reader :suffixCount

  # stdb::       reader answering readInt
  # dataSource:: object answering valueAt(offset), used to key child tables
  def initialize(stdb, dataSource)
    @suffxTreeDB = stdb
    @dataSource = dataSource
    @root = nil
    # each table maps a not-yet-seen node id to the list of nodes waiting on it
    @unresolvedParents = {}
    @unresolvedSuffixLinks = {}
    @unresolvedChildren = {}
    @allNodes = {}
  end

  # Read one node record. Returns the node, or false when the stream yields a
  # node id of 0 (readInt returns 0 once the input is exhausted).
  def buildNode
    nodeId = @suffxTreeDB.readInt()
    return false unless nodeId > 0

    node = resolveNodeId(nodeId)
    @allNodes[nodeId] = node
    @root = node if @root.nil?
    resolveParent(node, @suffxTreeDB.readInt())
    node.incomingEdgeStartOffset = @suffxTreeDB.readInt()
    node.incomingEdgeEndOffset = @suffxTreeDB.readInt()
    @suffixCount = node.suffixOffset = @suffxTreeDB.readInt()
    resolveSuffixLink(node, @suffxTreeDB.readInt())

    # Complete pending references only now: a waiting parent files this node
    # under the value at its incoming edge start offset, which is not known
    # until the record's offsets have been read.
    resolve(nodeId, node)

    childNodeId = @suffxTreeDB.readInt()
    while childNodeId != 0
      resolveChild(node, childNodeId)
      childNodeId = @suffxTreeDB.readInt()
    end
    node
  end

  private

  # Point node at its parent, or park it until the parent record is read.
  def resolveParent(node, nodeId)
    if @allNodes.has_key?(nodeId)
      node.parent = @allNodes[nodeId]
    else
      resolveEntry(node, nodeId, @unresolvedParents)
    end
  end

  # Point node at its suffix link target, or park it until that record is read.
  def resolveSuffixLink(node, nodeId)
    if @allNodes.has_key?(nodeId)
      node.suffixLink = @allNodes[nodeId]
    else
      resolveEntry(node, nodeId, @unresolvedSuffixLinks)
    end
  end

  # Add the child with the given id to node's child table, or park node until
  # the child record is read.
  def resolveChild(node, nodeId)
    if @allNodes.has_key?(nodeId)
      attachChild(node, @allNodes[nodeId])
    else
      resolveEntry(node, nodeId, @unresolvedChildren)
    end
  end

  # Remember that node is waiting for nodeId. Id 0 means "no node" and is
  # ignored. Several nodes may wait for the same id, so waiters are kept in a
  # list rather than overwriting one another.
  def resolveEntry(node, nodeId, theList)
    (theList[nodeId] ||= []) << node if nodeId > 0
  end

  # Reuse the node already built for nodeId, or create a new one.
  def resolveNodeId(nodeId)
    if @allNodes.has_key?(nodeId)
      @allNodes[nodeId]
    else
      Node.new(nodeId)
    end
  end

  # File child in parent's child table under the value at the start of the
  # child's incoming edge.
  def attachChild(parent, child)
    parent.children = {} if parent.children.nil?
    parent.children[@dataSource.valueAt(child.incomingEdgeStartOffset)] = child
  end

  # Complete every reference that was waiting for nodeId to be defined.
  def resolve(nodeId, node)
    if @unresolvedParents.has_key?(nodeId)
      print "Unresolved parent value #{nodeId}\n"
      @unresolvedParents.delete(nodeId).each { |waiting| waiting.parent = node }
    end
    if @unresolvedChildren.has_key?(nodeId) && (node.incomingEdgeStartOffset >= 0)
      @unresolvedChildren.delete(nodeId).each { |waiting| attachChild(waiting, node) }
    end
    if @unresolvedSuffixLinks.has_key?(nodeId)
      @unresolvedSuffixLinks.delete(nodeId).each { |waiting| waiting.suffixLink = node }
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require_relative '../visitor/bfs'
|
2
|
+
require_relative '../visitor/suffix_offset_visitor'
|
3
|
+
require_relative '../data/string_data_source'
|
4
|
+
|
5
|
+
#
|
6
|
+
# Searcher finds matches in a tree
|
7
|
+
#
|
8
|
+
# It needs the tree root, and the data source used to make the tree
|
9
|
+
# This assumes the tree was made with a single data source.
|
10
|
+
#
|
11
|
+
# "find" really should be finding matches from a different data source (not a string)
|
12
|
+
#
|
13
|
+
class Searcher
  # treeDataSource:: data source the tree was built from
  # treeRoot::       root node of that tree
  def initialize(treeDataSource, treeRoot)
    @dataSource = treeDataSource
    @root = treeRoot
  end

  #
  # Match the values of dataSource from the root; the returned Location is
  # wherever the match stopped.
  #
  def matchDataSource(dataSource)
    Location.new(@root).tap do |location|
      location.matchDataSource(@dataSource, dataSource)
    end
  end

  # Node reached when every value of dataSource matched (match depth equals
  # dataSource.numberValues), otherwise nil.
  def findNode(dataSource)
    location = Location.new(@root)
    matched = location.matchDataSource(@dataSource, dataSource)
    matched.depth == dataSource.numberValues ? location.node : nil
  end

  #
  # Sorted suffix offsets at which searchString occurs ([] when absent).
  #
  def findString(searchString)
    findResults(findNode(StringDataSource.new(searchString)))
  end

  # Same as findString, but the search text is wrapped in a
  # SingleWordDataSource.
  def findWord(searchString)
    findResults(findNode(SingleWordDataSource.new(searchString)))
  end

  # Collect, via a breadth first walk with a SuffixOffsetVisitor, the result
  # gathered below node and return it sorted; [] for a nil node.
  def findResults(node)
    return [] if node.nil?

    collector = SuffixOffsetVisitor.new
    BFS.new(collector).traverse(node)
    collector.result.sort
  end

  # Continue matching the string s from an existing location and report the
  # depth the location ends up at.
  def findAtLocation(location, s)
    location.matchString(@dataSource, s)
    location.depth
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
#
# Remembers at most one node that still lacks a suffix link and fills the
# link in as soon as a target becomes known.
#
class SuffixLinker

  # If a node is waiting and location sits on a different node, link the
  # waiting node to location's node and stop waiting. Always returns nil.
  def update(location)
    waiting = @nodeNeedingSuffixLink
    return if waiting.nil? || location.node == waiting || !location.onNode

    waiting.suffixLink = location.node
    @nodeNeedingSuffixLink = nil
  end

  # Register node as the one waiting for a suffix link. A node that was
  # already waiting is linked to the newcomer first.
  def nodeNeedingSuffixLink(node)
    waiting = @nodeNeedingSuffixLink
    waiting.suffixLink = node unless waiting.nil?
    @nodeNeedingSuffixLink = node
  end
end
|
data/lib/suffix_tree.rb
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
require_relative 'location'
|
2
|
+
require_relative 'node_factory'
|
3
|
+
require_relative 'suffix_linker'
|
4
|
+
|
5
|
+
#
|
6
|
+
# Builds a suffix tree from one or more DataSource instances
|
7
|
+
#
|
8
|
+
class SuffixTree
  NO_SUFFIX_OFFSET = -1

  # first data source we use
  attr_reader :rootDataSource

  # when there are a sequence of data sources, treat them as one long one, this is where next source starts
  attr_reader :startOffset

  # where we are in the implicit tree building process
  attr_reader :location

  attr_reader :nodeFactory

  # the root of the tree, and the terminal value (for making implicit trees explicit)
  attr_reader :root, :terminalValue

  # keep track of which nodes need suffix links
  attr_reader :suffixLinker

  # terminalValue:: when non-nil, added after the values of every data source
  #                 passed to addDataSource
  # configuration:: optional hash forwarded to NodeFactory#setConfiguration
  # persister::     optional persister forwarded to the NodeFactory
  def initialize(terminalValue = nil, configuration = nil, persister = nil)
    @nextNodeId = 0
    @nodeFactory = NodeFactory.new(nil, persister)
    @nodeFactory.setConfiguration(configuration) if (configuration != nil)
    @root = @nodeFactory.newRoot()
    @rootDataSource = nil
    @location = Location.new(@root)
    @startOffset = 0
    @suffixOffset = 0
    @suffixLinker = SuffixLinker.new
    @terminalValue = terminalValue
    # Nothing has been added yet. Starting at -1 keeps the arithmetic in
    # addDataSource valid when the very first data source yields no values.
    @lastOffsetAdded = -1
  end

  #
  # Set the data source, but do not add any values from the data source
  #
  def setDataSource(dataSource)
    @rootDataSource = dataSource if @rootDataSource.nil?
    @nodeFactory.extendDataSource(dataSource, @startOffset)
  end

  #
  # Add all values in a given dataSource, followed by the terminal value when
  # one was configured. Afterwards startOffset points just past the last
  # offset added.
  #
  def addDataSource(dataSource)
    @suffixOffset = 0
    self.setDataSource(dataSource)
    dataSource.each_with_index(@startOffset) do |value, offset|
      self.addValue(value, offset)
    end
    if (@terminalValue != nil) then
      @lastOffsetAdded += 1
      self.addValue(@terminalValue, @lastOffsetAdded)
    end
    @startOffset = @lastOffsetAdded + 1
  end

  #
  # Adding one value at a time, rootDataSource must be set for this to work
  #
  def addValue(value, offset)
    # keep extending until extend reports there is no further suffix to handle
    while (extend(value, offset)) do
      @suffixLinker.update(@location)
    end
    @lastOffsetAdded = offset
  end

  #
  # Finish building the tree by adding a value that is not part of the data source.
  # Does nothing when no data source has been set yet.
  #
  def finish()
    return if @rootDataSource.nil?

    if (@rootDataSource.has_terminator?) then
      self.addValue(@rootDataSource.terminator, @startOffset)
    end
  end

  #
  # Extend a single suffix at the current location, returns true if there is another
  # suffix to extend.
  #
  # Handles these cases:
  #
  #  On a node:
  #    if there is a child starting with the extension value, traverse down that one value, return FALSE
  #    if no child has the extension value, add a leaf,
  #        if we are at root, return FALSE,
  #        otherwise traverse to the next suffix and return TRUE
  #
  #  On an edge:
  #    if next character has the value, traverse past it, return FALSE
  #    if next character is not the value, split edge at that location, locate at the new node, and return TRUE
  #
  # NOTE(review): this method shadows Object#extend for SuffixTree instances.
  #
  def extend(value,offset)
    if (@location.onNode)
      if (@location.node.children.has_key?(value)) then
        @location.traverseDownChildValue(value)
        return false  # rule 3
      else
        @nodeFactory.addLeaf(@location.node, value, offset)
        return @location.traverseToNextSuffix(@rootDataSource)   # rule 1, traverse returns false when at root
      end
    elsif (@rootDataSource.valueAt(@location.incomingEdgeOffset) == value) then
      @location.traverseDownEdgeValue()
      return false    # found value on edge, rule 3
    else
      newNode = @nodeFactory.splitEdgeAt(@location.node, @location.incomingEdgeOffset)
      @suffixLinker.nodeNeedingSuffixLink(newNode)
      @location.jumpToNode(newNode)
      return true    # rule 2
    end
  end

end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#
# Default visitor: counts how many times each hook was invoked.
#
class BaseVisitor
  attr_accessor :preCounter, :postCounter

  def initialize
    @preCounter = @postCounter = 0
  end

  # Called before a node's children are processed. Returning true tells the
  # traversal to continue below this node.
  def preVisit(node)
    @preCounter += 1
    true
  end

  # Called after a node's children were processed; returns the updated count.
  def postVisit(node)
    @postCounter += 1
  end
end
|
data/lib/visitor/bfs.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#
# Breadth first traversal. Only the visitor's preVisit hook is invoked; a
# false result stops the walk from descending below that node.
#
class BFS
  def initialize(visitor)
    @visitor = visitor
    @q = []
  end

  # Visit node and everything below it, level by level.
  def traverse(node)
    # new entries go to the front and the next node comes off the back: FIFO
    @q.unshift(node)

    until @q.empty?
      current = @q.pop
      next unless @visitor.preVisit(current)

      offspring = current.children
      next if offspring.nil?

      offspring.values.each { |child| @q.unshift(child) }
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require_relative 'base_visitor'
|
2
|
+
|
3
|
+
#
# Post-order visitor that ORs every child's dataSourceBit into its parent's.
# Construction is inherited unchanged from BaseVisitor.
#
class DataSourceVisitor < BaseVisitor
  # Merge the children's bits into node; nodes without a child table are left
  # alone.
  def postVisit(node)
    offspring = node.children
    return if offspring.nil?

    offspring.values.each { |child| node.dataSourceBit |= child.dataSourceBit }
  end
end
|
data/lib/visitor/dfs.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
#
# Depth first traversal driving a visitor's preVisit/postVisit hooks.
#
class DFS
  def initialize(visitor)
    @visitor = visitor
  end

  # Traverse each child in the table's own iteration order; a nil table is
  # skipped.
  def traverseChildren(children)
    return if children.nil?

    children.each { |_key, child| traverse(child) }
  end

  # preVisit decides whether node is entered at all. When it is, the children
  # are walked first and postVisit is called afterwards; its result is
  # returned.
  def traverse(node)
    return unless @visitor.preVisit(node)

    traverseChildren(node.children)
    @visitor.postVisit(node)
  end
end
|
21
|
+
|
22
|
+
#
# Depth first traversal that visits children in sorted key order.
# Construction is inherited unchanged from DFS.
#
class OrderedDFS < DFS
  # Traverse the children by ascending key; a nil table is skipped.
  def traverseChildren(children)
    return if children.nil?

    children.keys.sort.each { |key| traverse(children[key]) }
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
#
# An inclusive [startOffset, endOffset] pair of offsets.
#
class ValueRange

  attr_accessor :startOffset, :endOffset

  def initialize(startOffset, endOffset)
    @startOffset, @endOffset = startOffset, endOffset
  end

  # Number of offsets covered, both ends included (so start 0 / end -1 is an
  # empty range of length 0).
  def length
    (@endOffset - @startOffset) + 1
  end

end
|
15
|
+
|
16
|
+
#
# Post-order visitor that tracks, for each count k, the deepest node whose
# dataSourceBit has exactly k bits set.
#
# NOTE(review): presumably each set bit of dataSourceBit stands for one data
# source containing the node's value sequence - confirm against the code that
# assigns dataSourceBit.
#
class KCommonVisitor < BaseVisitor

  # dataSource:: object answering valueSequence(startOffset, endOffset)
  def initialize(dataSource)
    @dataSource = dataSource

    #
    # key = common to at least this many (2, 3, ...)
    # value = ValueRange holding [ startOffset, endOffset ] of the value sequence
    #
    # Entries are created on first use as empty ranges (0, -1), so any count
    # can be looked up no matter how many bits a dataSourceBit carries.
    #
    @commonTo = Hash.new { |ranges, count| ranges[count] = ValueRange.new(0, -1) }
    super()
  end

  # Record node's value sequence when it is deeper than the best one known
  # for its bit count. For counts above 2 the new range is also copied into
  # every smaller count whose current range is shorter.
  def postVisit(node)
    nCommon = self.countCommon(node.dataSourceBit)
    best = @commonTo[nCommon]
    return unless node.valueDepth > best.length

    best.startOffset = node.incomingEdgeEndOffset - node.valueDepth + 1
    best.endOffset = node.incomingEdgeEndOffset
    return unless nCommon > 2

    (1..(nCommon - 1)).each do |fewer|
      range = @commonTo[fewer]
      if range.length < node.valueDepth
        range.startOffset = best.startOffset
        range.endOffset = best.endOffset
      end
    end
  end

  # Returns [length, value sequence] of the range recorded for numberInCommon.
  def longestStringCommonTo(numberInCommon)
    range = @commonTo[numberInCommon]
    return range.length, @dataSource.valueSequence(range.startOffset, range.endOffset)
  end

  # Number of set bits in bits (nil counts as 0). Every bit of the integer is
  # examined, not just the low 32.
  def countCommon(bits)
    bits.to_i.to_s(2).count('1')
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require_relative 'base_visitor'
|
2
|
+
|
3
|
+
#
# Post-order visitor that adds every child's leafCount to its parent's.
# Construction is inherited unchanged from BaseVisitor.
#
class LeafCountVisitor < BaseVisitor
  # Accumulate the children's leaf counts into node; nodes without a child
  # table are left alone.
  def postVisit(node)
    offspring = node.children
    return if offspring.nil?

    offspring.values.each { |child| node.leafCount += child.leafCount }
  end
end
|