suffix_tree 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/data/base_data_source.rb +44 -0
- data/lib/data/data_source_factory.rb +16 -0
- data/lib/data/file_data_source.rb +29 -0
- data/lib/data/line_state_machine.rb +86 -0
- data/lib/data/string_data_source.rb +31 -0
- data/lib/data/word_data_source.rb +229 -0
- data/lib/location.rb +165 -0
- data/lib/node.rb +63 -0
- data/lib/node_factory.rb +169 -0
- data/lib/persist/suffix_tree_db.rb +148 -0
- data/lib/search/searcher.rb +68 -0
- data/lib/suffix_linker.rb +16 -0
- data/lib/suffix_tree.rb +122 -0
- data/lib/visitor/base_visitor.rb +17 -0
- data/lib/visitor/bfs.rb +22 -0
- data/lib/visitor/data_source_visitor.rb +15 -0
- data/lib/visitor/dfs.rb +34 -0
- data/lib/visitor/k_common_visitor.rb +71 -0
- data/lib/visitor/leaf_count_visitor.rb +15 -0
- data/lib/visitor/node_count_visitor.rb +16 -0
- data/lib/visitor/numbering_visitor.rb +230 -0
- data/lib/visitor/suffix_offset_visitor.rb +23 -0
- data/lib/visitor/tree_print_visitor.rb +44 -0
- data/lib/visitor/value_depth_visitor.rb +34 -0
- data/spec/constant_lca_spec.rb +27 -0
- data/spec/data_source_spec.rb +51 -0
- data/spec/fixtures/arizona.txt +1 -0
- data/spec/fixtures/chapter1.txt +371 -0
- data/spec/fixtures/chapter1.txt.summary +3 -0
- data/spec/fixtures/chapter1.txt.values +0 -0
- data/spec/fixtures/chapter1.txt.words +1329 -0
- data/spec/fixtures/mississippi.txt +1 -0
- data/spec/fixtures/singlePara.txt +41 -0
- data/spec/fixtures/smallFile.txt +3 -0
- data/spec/fixtures/smallFile.txt.summary +2 -0
- data/spec/fixtures/smallFile.txt.values +0 -0
- data/spec/fixtures/smallFile.txt.words +14 -0
- data/spec/fixtures/testbook.txt +5414 -0
- data/spec/location_spec.rb +149 -0
- data/spec/node_factory_spec.rb +199 -0
- data/spec/search_spec.rb +182 -0
- data/spec/suffix_tree_spec.rb +270 -0
- data/spec/util_spec.rb +47 -0
- data/spec/visitor_spec.rb +310 -0
- metadata +87 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 188a9ad0d6fdd0a21bd58f8715abf6b0fb8643e5
  data.tar.gz: 64f9ef00d6bba6b8491c73e64e314b7b9e9b6003
SHA512:
  metadata.gz: 3101607dbc019a2a6e8ccc0ca271b909d5e2d2ce14ee23ed59e2725a0ee174d513ad2e5c427b7f6fcd7abfb5e617d719b38c7a95379aab65a5f0f2ae13ad5f95
  data.tar.gz: c960dd1df4823fdfbe089642846db70016a30a1adda1f9ddd39858937937980686f135a0004348f54c0a22eea72881f29e619a834db45a081e3d41b37e0af0b6
data/lib/data/base_data_source.rb
ADDED
@@ -0,0 +1,44 @@
class BaseDataSource
  attr_accessor :startOffset

  def initialize(startOffset = 0)
    @nextDataSource = nil
    @startOffset = startOffset
  end

  def each_with_index(offset = 0)
    while ((value = self.valueAt(offset)) != nil) do
      yield value, offset
      offset += 1
    end
  end

  def extendWith(dataSource, startOffset)
    if (@nextDataSource == nil) then
      @nextDataSource = dataSource
      dataSource.startOffset = startOffset
    else
      @nextDataSource.extendWith(dataSource, startOffset)
    end
  end

  def has_terminator?
    false
  end

  def nextDataSourceValueAt(offset)
    if (@nextDataSource != nil) then
      return @nextDataSource.valueAt(offset)
    else
      return nil
    end
  end

  def valueSequence(startOffset, endOffset)
    result = ""
    (startOffset..endOffset).each do |offset|
      result += self.valueAt(offset)
    end
    result
  end
end
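BaseDataSource is the shared superclass: subclasses only need to supply valueAt, and extendWith chains a second source onto the first so offsets past the end of one source fall through to the next. A minimal usage sketch of that chaining (not part of the gem), assuming the StringDataSource subclass shown later in this diff is loadable:

require_relative 'string_data_source'   # path assumed relative to lib/data

first  = StringDataSource.new("abc")
second = StringDataSource.new("xyz")
first.extendWith(second, 3)              # offsets 3..5 now resolve through the second source

first.each_with_index do |value, offset|
  puts "#{offset}: #{value}"             # a, b, c, then x, y, z at offsets 0..5
end

puts first.valueSequence(1, 4)           # "bcxy"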
data/lib/data/data_source_factory.rb
ADDED
@@ -0,0 +1,16 @@
require_relative 'string_data_source'
require_relative 'file_data_source'

class DataSourceFactory

  STRING_DATA_SOURCE = 'string'
  FILE_DATA_SOURCE = 'file'

  def newDataSource(dataSourceType, dataSourceValue)
    if (dataSourceType == STRING_DATA_SOURCE) then
      return StringDataSource.new(dataSourceValue)
    elsif (dataSourceType == FILE_DATA_SOURCE) then
      return FileDataSource.new(dataSourceValue)
    end
  end
end
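A small usage sketch for the factory (illustrative only); the 'string' and 'file' keys come from the constants above, and the fixture path is one of the spec fixtures listed in this release:

factory = DataSourceFactory.new

stringSource = factory.newDataSource('string', "mississippi")
puts stringSource.valueAt(0)     # "m"

fileSource = factory.newDataSource('file', "data/spec/fixtures/mississippi.txt")
puts fileSource.valueAt(0)       # first byte of the fixture file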
data/lib/data/file_data_source.rb
ADDED
@@ -0,0 +1,29 @@
require_relative 'base_data_source'

class FileDataSource < BaseDataSource
  def initialize(path)
    @inFile = File.open(path, "rb")
    @checkFile = File.open(path, "rb")
    super(0)
  end

  def valueAt(offset)
    @checkFile.seek(offset - @startOffset, IO::SEEK_SET)
    result = @checkFile.getc
    if (result == nil) then
      return self.nextDataSourceValueAt(offset)
    end
    result
  end

  # substring
  def toString(startOffset, endOffset)
    @checkFile.seek(startOffset - @startOffset, IO::SEEK_SET)
    if (endOffset >= startOffset) then
      return @checkFile.read(endOffset - startOffset + 1)
    else
      return @checkFile.read()
    end
  end

end
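FileDataSource answers valueAt by seeking in @checkFile and reading one character, so random access is a seek plus a getc. A hedged sketch against one of the fixture files shipped with the specs:

ds = FileDataSource.new("data/spec/fixtures/mississippi.txt")

puts ds.valueAt(0)        # first character of the file
puts ds.toString(0, 3)    # characters 0..3 as a substring
puts ds.valueAt(10_000)   # nil once past end of file (unless another source is chained on)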
data/lib/data/line_state_machine.rb
ADDED
@@ -0,0 +1,86 @@
require 'state_machine'

#
# First pass at state machine for converting sequence of formatted lines into a different
# set of word values, in this case "<N>, blank, |, blank, <footer title>" get converted
# into [ "END_OF_PAGE", "<page number>", "<title as a single word>"]
#
class LineStateMachine
  attr_accessor :bucket, :pages

  def initialize
    @bucket = "Page 0"
    @pages = {}
    @dataQueue = []
    super
  end

  def resetState(data)
    self.reset
    result = []
    result << @dataQueue
    result << data
    @dataQueue = []
    return result.flatten
  end

  def process(line, wordIndex)
    data = line.split

    # we are looking for a blank, a pipe, or a page number
    if (data.length == 0) then
      if (self.foundBlank) then
        return []
      end
    end
    if (data.length == 1) then
      if (data[0] == "|") then
        if (self.foundPipe) then
          return []
        end
      end
      ival = data[0].to_i
      if (ival > 0) then
        if (self.foundN) then
          @potentialPageNumber = ival
          @dataQueue << data   # in case this really isn't it
          return []
        end
      end
    end

    # if we are looking for the title, the entire line is the title
    if (data.length > 0) then
      if (self.foundTitle) then
        @dataQueue = []
        @bucket = "Page #{@potentialPageNumber}"
        @pages[@bucket] = wordIndex
        return []
      end
    end

    resetState(data)
  end

  state_machine :state, :initial => :lookingForN do
    event :foundN do
      transition :lookingForN => :lookingForFirstBlank
    end

    event :foundBlank do
      transition :lookingForFirstBlank => :lookingForPipe, :lookingForSecondBlank => :lookingForTitle
    end

    event :foundPipe do
      transition :lookingForPipe => :lookingForSecondBlank
    end

    event :foundTitle do
      transition :lookingForTitle => :lookingForN
    end

    event :reset do
      transition all => :lookingForN
    end
  end
end
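The header comment describes the footer pattern the machine looks for: a page number, a blank line, a pipe, another blank line, then the footer title. A rough sketch of driving it by hand, assuming the state_machine gem is installed; the sample lines below are illustrative, not taken from the package:

lsm = LineStateMachine.new

footer = ["42", "", "|", "", "Chapter One"]   # <N>, blank, |, blank, <footer title>
footer.each do |line|
  lsm.process(line, 100)   # 100 stands in for the current word index
end

puts lsm.bucket   # "Page 42"
puts lsm.pages    # {"Page 42" => 100}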
data/lib/data/string_data_source.rb
ADDED
@@ -0,0 +1,31 @@
require_relative 'base_data_source'

class StringDataSource < BaseDataSource

  def initialize(s)
    @s = s
    super()
  end

  def numberValues
    return @s.length
  end

  def valueAt(offset)
    value = @s[ offset - @startOffset ]
    if (value == nil) then
      return self.nextDataSourceValueAt(offset)
    else
      return value
    end
  end

  # substring
  def toString(startOffset, endOffset)
    if (endOffset >= startOffset) then
      return @s[startOffset..endOffset]
    else
      return @s[startOffset..(@s.length - 1)]
    end
  end
end
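A brief sketch of StringDataSource on its own; valueAt falls through to a chained source only when the offset runs past the string:

ds = StringDataSource.new("mississippi")

puts ds.numberValues      # 11
puts ds.valueAt(4)        # "i"
puts ds.toString(0, 3)    # "miss"
puts ds.valueAt(99)       # nil, since no further data source has been chained on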
data/lib/data/word_data_source.rb
ADDED
@@ -0,0 +1,229 @@
require_relative 'base_data_source'

class WordDataSource < BaseDataSource
  attr_reader :words, :numberWordsInFile

  def initialize(filePath, regex = "/[^a-z0-9\-\s]/i")
    @filePath = filePath
    @words = []
    @regex = regex
    File.open(filePath, "r") do |file|
      file.each_line do |line|
        line.chomp!
        if (self.process(line)) then
          break
        end
      end
    end
    @numberWordsInFile = @words.length
  end

  def numberValues
    return @words.length
  end

  def process(line)
    line = self.preprocessLine(line)
    return self.processData(line.split)
  end

  def processData(data)
    data.each do |word|
      word = word.chomp(",")
      @words << word
    end
    return false
  end

  def preprocessLine(line)
    line.downcase.gsub(@regex, ' ')
  end

  def valueAt(offset)
    return @words[offset] if (offset < @numberWordsInFile)
    return nil
  end

  def toString(startOffset, endOffset)
    if (endOffset == -1) then
      result = "#{@words[startOffset]} ..*"
    else
      result = ""
      (startOffset..endOffset).each do |offset|
        result += "#{@words[offset]} "
      end
    end
    result
  end
end

class SingleWordDataSource < BaseDataSource
  def initialize(word)
    @word = word
  end

  def numberValues
    return 1
  end

  def valueAt(offset)
    return nil if (offset > 0)
    return @word
  end
end

class ArrayWordDataSource
  attr_reader :wordCounts

  def initialize(wordList, offsetList, size)
    @wordList = wordList
    @offsetList = offsetList
    @size = size
    @wordCounts = createWordCounts
  end

  def valueAt(offset)
    if (offset < @size) then
      return @wordList[@offsetList[offset]]
    else
      return nil
    end
  end

  def verify(word, count)
    if (@wordCounts == nil) then
      createWordCounts
    end
    @wordCounts[word] == count
  end

  def each_word(offset = 0)
    while ((value = self.valueAt(offset)) != nil) do
      yield value
      offset += 1
    end
  end

  private
  def createWordCounts()
    wordCounts = {}
    @wordList.each do |word|
      if (!wordCounts.has_key?(word)) then
        wordCounts[word] = 0
      end
      wordCounts[word] += 1
    end
    wordCounts
  end
end

class DelimitedWordDataSource < WordDataSource
  attr_reader :buckets, :wordCounts, :wordAsEncountered, :wordValueSequence

  def initialize(filePath, lineStateMachine, limit)
    @lineStateMachine = lineStateMachine
    @limit = limit
    @count = 0
    @buckets = {}
    @wordCounts = {}
    @wordValueSequence = []      # list of words in file in terms of index into @wordAsEncountered
    @wordAsEncounteredIndex = {} # key is word, value is number as encountered
    @wordAsEncountered = []      # array entry added only when a new word is encountered
    @nextWordEncounteredIndex = 0
    super(filePath,"/[^[:print:]]/")
  end

  def bucket
    @lineStateMachine.bucket
  end

  def save
    File.open("#{@filePath}.words", 'w') do |file|
      @wordAsEncountered.each do |word|
        file.write("#{word}\n")
      end
    end
    File.open("#{@filePath}.values", 'wb') do |file|
      file << @wordValueSequence.pack("N*")
    end
    File.open("#{@filePath}.summary", "w") do |file|
      file << "#{@numberWordsInFile} words in file\n"
      file << "#{@nextWordEncounteredIndex} distinct words\n"
      file << "Metadata\n"

      # uh-oh, this seems to reverse the hash in place!
      @lineStateMachine.pages.sort_by(&:reverse).each do |page, wordOffset|
        file << "#{wordOffset} #{page}\n"
      end
    end
  end

  # TODO: fix this, linear metadata search, O(N) should be O(lg N)
  def metaDataFor(offset)
    previousMetadata = "unknown"
    @lineStateMachine.pages.sort_by(&:reverse).each do |metadata, wordOffset|
      if (wordOffset < offset) then
        previousMetadata = metadata
      else
        return previousMetadata
      end
    end
    return previousMetadata
  end

  def wordCount(word)
    return @wordCounts[word] if @wordCounts.has_key?(word)
    return 0
  end

  def processData(data,bucket)
    data.each do |word|
      word = word.chomp(",")
      word = word.chomp(".")
      if (word.length > 0) then
        @words << word
        if (!@wordCounts.has_key?(word)) then
          # we have a new word
          @wordAsEncounteredIndex[word] = @nextWordEncounteredIndex
          @wordAsEncountered << word
          @nextWordEncounteredIndex += 1
          @wordCounts[word] = 0
        end
        @wordCounts[word] += 1
        if (!@buckets[bucket].has_key?(word)) then
          @buckets[bucket][word] = 0
        end
        @buckets[bucket][word] += 1
        @wordValueSequence << @wordAsEncounteredIndex[word]
        @count += 1
        if ((@limit > 0) && (@count >= @limit)) then
          return true
        end
      end
    end
    return false
  end

  def process(line)
    line = self.preprocessLine(line)
    data = @lineStateMachine.process(line, @wordValueSequence.length)
    if (data.length > 0) then
      bucket = @lineStateMachine.bucket
      @buckets[bucket] = {} if (!@buckets.has_key?(bucket))
      return self.processData(data,bucket)
    end
    return false
  end

  def verify(word, count)
    @wordCounts[word] == count
  end

  def has_terminator?
    true
  end

  def terminator
    "END_OF_DOCUMENT"
  end
end
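WordDataSource treats whole words rather than characters as the values a suffix tree indexes when it is built over text files. A hedged sketch over one of the spec fixtures in this release; the exact output depends on the fixture's contents:

ds = WordDataSource.new("data/spec/fixtures/smallFile.txt")

puts ds.numberValues   # number of words parsed from the file
puts ds.valueAt(0)     # first word, lower-cased by preprocessLine
puts ds.toString(0, 2) # first three words joined with spaces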
data/lib/location.rb
ADDED
@@ -0,0 +1,165 @@
require_relative 'node'

#
# This class keeps track of the next value to check in a suffix tree
#
# If we are located at a node, there are several options for the next value
# which are in the map of value-to-node.
#
# If we are not on a node, there is an incoming edge with at least one value
# so we store the offset of that value in the data source
#
# The location can never be onNode at a leaf, but can be at a leaf with
# an incomingEdgeOffset at or past the leaf's incomingEdgeStartOffset
#
class Location
  attr_reader :node, :onNode, :incomingEdgeOffset

  #
  # optional parameters needed for testing
  #
  def initialize(node, onNode = true, incomingEdgeOffset = Node::UNSPECIFIED_OFFSET)
    @node = node
    @onNode = onNode
    @incomingEdgeOffset = incomingEdgeOffset
  end

  #
  # traverse to parent, return the range of characters covered
  #
  def traverseUp
    incomingEdgeStart = @node.incomingEdgeStartOffset
    if (@onNode) then
      incomingEdgeEnd = @node.incomingEdgeEndOffset
    else
      incomingEdgeEnd = @incomingEdgeOffset - 1
    end
    @node = @node.parent
    @incomingEdgeOffset = Node::UNSPECIFIED_OFFSET
    @onNode = true
    return incomingEdgeStart, incomingEdgeEnd
  end

  def traverseSuffixLink
    self.jumpToNode(@node.suffixLink)
  end

  #
  # From the current Node with a given child value, traverse past that value
  #
  def traverseDownChildValue(value)
    @node = @node.children[value]
    if (@node.incomingEdgeLength == 1) then
      @onNode = true
      @incomingEdgeOffset = Node::UNSPECIFIED_OFFSET
    else
      @onNode = false
      @incomingEdgeOffset = @node.incomingEdgeStartOffset + 1
    end
  end

  #
  # From the current location that does NOT have a suffix link, either because it
  # is on an edge or because it is on a newly created internal node, traverse
  # to the next suffix
  #
  # Returns true if it actually traversed, otherwise false
  #
  def traverseToNextSuffix(dataSource)
    if (@node.isRoot) then
      return false
    end
    upStart, upEnd = self.traverseUp
    if (@node.isRoot) then
      if (upStart < upEnd) then
        self.traverseSkipCountDown(dataSource, upStart + 1, upEnd)
      else
        @onNode = true
      end
    else
      @node = @node.suffixLink
      self.traverseSkipCountDown(dataSource, upStart, upEnd)
    end
    return true
  end

  #
  # From the current location on a Node, traverse down assuming the characters
  # on the path exist, which allows skip/count method to be used to move down.
  #
  def traverseSkipCountDown(dataSource, startOffset, endOffset)
    done = false
    while (!done) do
      @node = @node.children[dataSource.valueAt(startOffset)]
      if (@node.isLeaf) then
        @onNode = false
        @incomingEdgeOffset = @node.incomingEdgeStartOffset + (endOffset - startOffset + 1)
      else
        incomingEdgeLength = @node.incomingEdgeLength
        startOffset += incomingEdgeLength
        remainingLength = endOffset - startOffset + 1
        @onNode = (remainingLength == 0)
        # if remaining length is negative, it means we have past where we need to be
        # by that amount, incoming edge offset is set to end reduced by that amount
        if (remainingLength < 0) then
          @incomingEdgeOffset = @node.incomingEdgeEndOffset + remainingLength + 1
        else
          @incomingEdgeOffset = @node.incomingEdgeStartOffset
        end
      end

      done = (@node.isLeaf || (remainingLength <= 0))
    end
  end

  def traverseDownEdgeValue()
    @incomingEdgeOffset += 1
    if (!@node.isLeaf && (@incomingEdgeOffset > @node.incomingEdgeEndOffset)) then
      @onNode = true
    end
  end

  def matchDataSource(dataSource, matchThis)
    matchThis.each_with_index do |value, index|
      if (!self.matchValue(dataSource, value)) then
        break
      end
    end
    self
  end

  def matchValue(dataSource, value)
    if (@onNode) then
      if (@node.children.has_key?(value)) then
        self.traverseDownChildValue(value)
        return true
      end
    else
      if (dataSource.valueAt(@incomingEdgeOffset) == value) then
        self.traverseDownEdgeValue()
        return true
      end
    end
    return false
  end

  #
  # get the depth of the location
  #
  # Requires nodes with "valueDepth" property (nodeFactory with :valueDepth=>true, followed by traversal with ValueDepthVisitor)
  #
  def depth
    if (@onNode) then
      return @node.valueDepth
    else
      return @node.parent.valueDepth + @incomingEdgeOffset - @node.incomingEdgeStartOffset
    end
  end

  def jumpToNode(node)
    @node = node
    @onNode = true
    @incomingEdgeOffset = Node::UNSPECIFIED_OFFSET
  end

end
data/lib/node.rb
ADDED
@@ -0,0 +1,63 @@
class Node
  # Leaf nodes use this due to Rule 1: once a leaf, always a leaf
  CURRENT_ENDING_OFFSET = -1

  # Root uses this, it has no incoming edge, yet as a Node has incoming edge offset properties
  UNSPECIFIED_OFFSET = -2

  # Leaf nodes get special depth, since they vary as characters get added
  LEAF_DEPTH = -3

  attr_accessor :incomingEdgeStartOffset, :incomingEdgeEndOffset, :suffixOffset
  attr_accessor :parent, :suffixLink, :children
  attr_reader :nodeId

  def initialize(nodeId, suffixOffset = UNSPECIFIED_OFFSET)
    @nodeId = nodeId
    @incomingEdgeStartOffset = UNSPECIFIED_OFFSET
    @incomingEdgeEndOffset = UNSPECIFIED_OFFSET
    @suffixOffset = suffixOffset

    @parent = nil
    @suffixLink = nil
    @children = nil
  end

  def isRoot
    return @parent == nil
  end

  def isLeaf
    return @incomingEdgeEndOffset == CURRENT_ENDING_OFFSET
  end

  def isInternal
    return !isLeaf && !isRoot
  end

  def incomingEdgeLength
    return @incomingEdgeEndOffset - @incomingEdgeStartOffset + 1
  end

  #
  # some algorithms require additional accessors, allow these to be created dynamically
  #
  def createAccessor(name)
    self.class.send(:attr_accessor, name)
  end

  #
  # suffix offset enumerator (not sure this belongs here)
  #
  def each_suffix
    if (self.isLeaf) then
      yield suffixOffset
    else
      children.keys.sort.each do |key|
        children[key].each_suffix do |suffixOffset|
          yield suffixOffset
        end
      end
    end
  end
end
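Node carries only structure (parent, children, incoming-edge offsets); the sentinel constants stand in for offsets that are not yet known. A tiny hand-wired example of each_suffix, independent of the NodeFactory in this release:

root = Node.new(0)

leaf = Node.new(1, 7)                                      # suffixOffset 7
leaf.parent = root
leaf.incomingEdgeStartOffset = 7
leaf.incomingEdgeEndOffset = Node::CURRENT_ENDING_OFFSET   # marks it as a leaf

root.children = { "a" => leaf }

root.each_suffix { |offset| puts offset }   # prints 7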