fastri 0.1.1.1 → 0.2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +12 -0
- data/Rakefile +5 -4
- data/bin/fastri-server +77 -35
- data/bin/fri +145 -8
- data/bin/ri-emacs +1 -1
- data/lib/fastri/full_text_index.rb +245 -0
- data/lib/fastri/full_text_indexer.rb +100 -0
- data/lib/fastri/ri_index.rb +30 -0
- data/lib/fastri/ri_service.rb +6 -0
- data/lib/fastri/util.rb +83 -0
- data/lib/fastri/version.rb +6 -1
- data/test/test_full_text_index.rb +182 -0
- data/test/test_full_text_indexer.rb +84 -0
- data/test/test_integration_full_text_index.rb +43 -0
- data/test/test_ri_index.rb +99 -1
- data/test/test_util.rb +38 -0
- metadata +14 -3
| @@ -0,0 +1,100 @@ | |
| 1 | 
            +
            # Copyright (C) 2006  Mauricio Fernandez <mfp@acm.org>
         | 
| 2 | 
            +
            #
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            require 'fastri/version'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            module FastRI
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            # Collects named documents and serializes them into the two streams used
            # for full-text search:
            #
            # * the "full text" stream: MAGIC, then for every document its data
            #   followed by a footer made of a NUL byte, a 4-byte little-endian length
            #   (<tt>pack("V")</tt>), the document name, a NUL byte, the marshalled
            #   metadata and a final NUL byte. The length covers everything that
            #   follows the 4 length bytes.
            # * the "suffix array" stream: the byte offsets (into the full text stream)
            #   of every indexed suffix, packed as little-endian 32-bit integers and
            #   sorted by the first +max_querysize+ bytes found at each offset.
            class FullTextIndexer
              require 'strscan'
              require 'stringio'
              require 'enumerator'

              WORD_RE    = /[A-Za-z0-9_]+/
              NONWORD_RE = /[^A-Za-z0-9_]+/
              MAGIC      = "FastRI full-text index #{FASTRI_FT_INDEX_FORMAT}\0"

              # +max_querysize+ is the number of bytes of each suffix taken into account
              # when sorting the suffix array in #build_index.
              def initialize(max_querysize)
                @documents = []
                @doc_hash  = {}
                @max_wordsize = max_querysize
              end

              # Registers (or replaces) the document +name+ with the given +data+ and
              # +metadata+. The metadata is serialized with Marshal by #build_index.
              def add_document(name, data, metadata = {})
                @doc_hash[name] = [data, metadata]
                @documents << name
              end

              # Returns the data stored for the document +name+, or +nil+ if no such
              # document was added.
              def data(name)
                entry = @doc_hash[name]
                entry && entry[0]
              end

              # Returns the names of the registered documents, without duplicates, in
              # the order they were first added.
              def documents
                @documents = @documents.uniq
              end

              # Returns a copy of +str+ without NUL bytes (NUL is the separator used in
              # the full text stream).
              def preprocess(str)
                str.gsub(/\0/,"")
              end

              # Returns the offsets (relative to +offset+) at which the words of +text+
              # start, using WORD_RE and NONWORD_RE to tell words from separators.
              def find_suffixes(text, offset)
                find_suffixes_simple(text, WORD_RE, NONWORD_RE, offset)
              end

              # Returns an array with <tt>offset + position</tt> for every position in
              # +string+ where a run matching +word_re+ starts; runs matching
              # +nonword_re+ are skipped. Positions are the byte positions reported by
              # StringScanner.
              def find_suffixes_simple(string, word_re, nonword_re, offset)
                suffixes = []
                sc = StringScanner.new(string)
                sc.skip(nonword_re)
                # eos? is used instead of comparing sc.pos against string.size: the
                # former is always expressed in bytes, the latter is not.
                until sc.eos?
                  start = sc.pos
                  suffixes << offset + start
                  sc.skip(word_re)
                  sc.skip(nonword_re)
                  # Neither regexp consumed anything (only possible with custom
                  # regexps): step over one character so the scan always terminates.
                  sc.getch if sc.pos == start
                end
                suffixes
              end

              # Writes the full text stream to +full_text_IO+ and the sorted suffix
              # array to +suffix_array_IO+ (both only need to respond to +write+).
              # Returns +nil+.
              def build_index(full_text_IO, suffix_array_IO)
                # In-memory copy of everything written to full_text_IO; kept as raw
                # bytes because all offsets and lengths below are byte counts.
                fulltext = as_bytes(String.new)
                io = StringIO.new(fulltext)
                io.write MAGIC
                full_text_IO.write MAGIC
                documents.each do |doc|
                  data, metadata = @doc_hash[doc]
                  io.write(data)
                  full_text_IO.write(data)
                  meta_txt = Marshal.dump(metadata)
                  trailer = as_bytes("#{doc}\0") + as_bytes(meta_txt) + "\0"
                  footer = as_bytes("\0") + [trailer.size].pack("V") + trailer
                  io.write(footer)
                  full_text_IO.write(footer)
                end

                scanner = StringScanner.new(fulltext)
                scanner.scan(Regexp.new(Regexp.escape(MAGIC)))

                suffixes = []
                until scanner.eos?
                  start = scanner.pos
                  text = scanner.scan_until(/\0/)
                  # text includes the NUL that opens the footer; drop it before indexing.
                  suffixes.concat find_suffixes(text[0..-2], start)
                  # Read the length by position rather than with /..../: "." does not
                  # match "\n", so a length containing byte 0x0A would not be matched.
                  len = fulltext[scanner.pos, 4].unpack("V")[0]
                  scanner.pos += 4 + len
                end
                sorted = suffixes.sort_by{|x| fulltext[x, @max_wordsize]}
                sorted.each_slice(10000){|x| suffix_array_IO.write x.pack("V*")}
                nil
              end

              private

              # Returns a copy of +str+ tagged as raw bytes on Rubies whose strings carry
              # an encoding; returns +str+ itself otherwise.
              def as_bytes(str)
                str.respond_to?(:force_encoding) ? str.dup.force_encoding("BINARY") : str
              end
            end # class FullTextIndexer
         | 
| 99 | 
            +
             | 
| 100 | 
            +
            end # module FastRI
         | 
    
        data/lib/fastri/ri_index.rb
    CHANGED
    
    | @@ -3,9 +3,37 @@ | |
| 3 3 |  | 
| 4 4 | 
             
            require 'rdoc/ri/ri_cache'
         | 
| 5 5 | 
             
            require 'rdoc/ri/ri_reader'
         | 
| 6 | 
            +
            require 'rdoc/ri/ri_descriptions'
         | 
| 6 7 | 
             
            require 'fastri/version'
         | 
| 7 8 |  | 
| 8 9 |  | 
| 10 | 
            +
            # This is taken straight from 1.8.5's rdoc/ri/ri_descriptions.rb.
         | 
| 11 | 
            +
            # Older releases have a buggy #merge_in that crashes when old.comment is nil.
         | 
| 12 | 
            +
            if RUBY_RELEASE_DATE < "2006-06-15"
         | 
| 13 | 
            +
              module ::RI # :nodoc:
         | 
| 14 | 
            +
                class ModuleDescription # :nodoc:
         | 
| 15 | 
            +
                  remove_method :merge_in
         | 
| 16 | 
            +
                  # merge in another class description into this one
         | 
| 17 | 
            +
                  def merge_in(old)
         | 
| 18 | 
            +
                    merge(@class_methods, old.class_methods)
         | 
| 19 | 
            +
                    merge(@instance_methods, old.instance_methods)
         | 
| 20 | 
            +
                    merge(@attributes, old.attributes)
         | 
| 21 | 
            +
                    merge(@constants, old.constants)
         | 
| 22 | 
            +
                    merge(@includes, old.includes)
         | 
| 23 | 
            +
                    if @comment.nil? || @comment.empty?
         | 
| 24 | 
            +
                      @comment = old.comment
         | 
| 25 | 
            +
                    else
         | 
| 26 | 
            +
                      unless old.comment.nil? or old.comment.empty? then
         | 
| 27 | 
            +
                        @comment << SM::Flow::RULE.new
         | 
| 28 | 
            +
                        @comment.concat old.comment
         | 
| 29 | 
            +
                      end
         | 
| 30 | 
            +
                    end
         | 
| 31 | 
            +
                  end
         | 
| 32 | 
            +
                end
         | 
| 33 | 
            +
              end
         | 
| 34 | 
            +
            end
         | 
| 35 | 
            +
             | 
| 36 | 
            +
             | 
| 9 37 | 
             
            module FastRI
         | 
| 10 38 |  | 
| 11 39 | 
             
            # This class provides the same functionality as RiReader, with some
         | 
| @@ -466,6 +494,8 @@ class RiIndex | |
| 466 494 | 
             
                  when /[#.]\S+/
         | 
| 467 495 | 
             
                    method_entry = get_entry(@method_array, entry_or_name, MethodEntry, nil)
         | 
| 468 496 | 
             
                    source_paths_for(method_entry)
         | 
| 497 | 
            +
                  when ""
         | 
| 498 | 
            +
                    []
         | 
| 469 499 | 
             
                  else
         | 
| 470 500 | 
             
                    class_entry = get_entry(@namespace_array, entry_or_name, ClassEntry, nil)
         | 
| 471 501 | 
             
                    source_paths_for(class_entry)
         | 
    
        data/lib/fastri/ri_service.rb
    CHANGED
    
    | @@ -273,6 +273,12 @@ class RiService | |
| 273 273 | 
             
                  m.add_matcher(:partial_ci) do
         | 
| 274 274 | 
             
                    m.yield @ri_reader.methods_under_matching("", /#{sep_re}#{name}/i, true)
         | 
| 275 275 | 
             
                  end
         | 
| 276 | 
            +
                  m.add_matcher(:anywhere) do
         | 
| 277 | 
            +
                    m.yield @ri_reader.methods_under_matching("", /#{sep_re}.*#{name}/, true)
         | 
| 278 | 
            +
                  end
         | 
| 279 | 
            +
                  m.add_matcher(:anywhere_ci) do
         | 
| 280 | 
            +
                    m.yield @ri_reader.methods_under_matching("", /#{sep_re}.*#{name}/i, true)
         | 
| 281 | 
            +
                  end
         | 
| 276 282 | 
             
                end
         | 
| 277 283 | 
             
                matcher.get_matches(order)
         | 
| 278 284 | 
             
              end
         | 
    
        data/lib/fastri/util.rb
    ADDED
    
    | @@ -0,0 +1,83 @@ | |
| 1 | 
            +
            # Copyright (C) 2006  Mauricio Fernandez <mfp@acm.org>
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            require 'rdoc/ri/ri_paths'
         | 
| 4 | 
            +
            begin
         | 
| 5 | 
            +
              require 'rubygems'
         | 
| 6 | 
            +
            rescue LoadError
         | 
| 7 | 
            +
            end
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            require 'rdoc/ri/ri_writer'
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            module FastRI
            # Helpers to locate the RI documentation installed by RubyGems and to map
            # RI file names back to class/method names.
            module Util
              # Return an array of <tt>[name, version, path]</tt> arrays corresponding to
              # the last version of each installed gem, sorted by name. +path+ is the
              # (expanded) base path of the RI documentation from the gem. Directories
              # are expected to be named <tt>NAME-VERSION</tt>; if a directory does not
              # follow that convention, its whole name is used as +name+ and the
              # version will be +nil+.
              #
              # Returns an empty array if RubyGems is not loaded.
              def gem_directories_unique
                return [] unless defined? Gem
                # Gem.path is a list of directories: glob each one instead of
                # interpolating the array into a single pattern.
                gemdirs = Array(Gem.path).map{|dir| Dir[File.join(dir, "doc", "*", "ri")]}.flatten
                gems = Hash.new{|h,k| h[k] = []}
                gemdirs.each do |path|
                  # The version part must not span "/" or a hyphen in a parent
                  # directory would be mistaken for the name/version separator.
                  md = %r{/([^/]+)-([^/]*)/ri$}.match(path)
                  if md
                    gems[md[1]] << [md[2], path]
                  else # doesn't follow any conventions :(
                    gems[path[%r{/([^/]+)/ri$}, 1]] << [nil, path]
                  end
                end
                # Compare dot-separated segments numerically when they are numeric, so
                # that "1.10" sorts after "1.9"; a nil version sorts first.
                version_key = lambda do |ver|
                  ver.to_s.split(".").map{|part| part =~ /\A\d+\z/ ? [0, part.to_i] : [1, part]}
                end
                gems.sort_by{|name, _| name}.map do |name, versions|
                  version, path = versions.sort_by{|ver, dir| [version_key.call(ver), dir]}.last
                  [name, version, File.expand_path(path)]
                end
              end
              module_function :gem_directories_unique

              # Return the <tt>[name, version, path]</tt> array for the gem owning the RI
              # information stored in +path+, or +nil+. +gem_dir_info+ is an array in the
              # format returned by #gem_directories_unique. When several entries contain
              # +path+, the one with the longest base path wins.
              def gem_info_for_path(path, gem_dir_info = FastRI::Util.gem_directories_unique)
                path = File.expand_path(path)
                matches = gem_dir_info.select do |_name, _version, gem_path|
                  base = gem_path.chomp("/")
                  # Only match on a directory boundary: ".../ri" must not claim
                  # files living under ".../ri-old".
                  path == base || path[0, base.size + 1] == "#{base}/"
                end
                matches.sort_by{|name, version, gem_path| [gem_path.size, version.to_s, name]}.last
              end
              module_function :gem_info_for_path

              # Return the +full_name+ (in ClassEntry or MethodEntry's sense) given a path
              # to a .yaml file relative to a "base RI DB path". Returns +nil+ if
              # +relpath+ looks like neither a class description (<tt>cdesc-*.yaml</tt>)
              # nor a method description (<tt>*-i.yaml</tt> / <tt>*-c.yaml</tt>).
              def gem_relpath_to_full_name(relpath)
                case relpath
                when %r{^(.*)/cdesc-([^/]*)\.yaml$}
                  path, name = $~.captures
                  # The last directory repeats the class name, so it is replaced by it.
                  (path.split(%r{/})[0..-2] << name).join("::")
                when %r{^(.*)/([^/]*)-(i|c)\.yaml$}
                  path, escaped_name, type = $~.captures
                  name = RI::RiWriter.external_to_internal(escaped_name)
                  # "c" files get the "." separator, "i" files get "#".
                  sep = ( type == 'c' ) ? "." : "#"
                  path.gsub("/", "::") + sep + name
                end
              end
              module_function :gem_relpath_to_full_name

              # Returns the home directory (win32-aware): the first of the +HOME+ and
              # +USERPROFILE+ environment variables that is set, else
              # +HOMEDRIVE+ + +HOMEPATH+, else the expansion of "~", falling back to
              # the filesystem root if that fails.
              def find_home
                # stolen from RubyGems
                ['HOME', 'USERPROFILE'].each do |homekey|
                  return ENV[homekey] if ENV[homekey]
                end
                if ENV['HOMEDRIVE'] && ENV['HOMEPATH']
                  drive = ENV['HOMEDRIVE']
                  # Only add the colon if the drive doesn't carry one already
                  # (avoids "C::\...").
                  drive = "#{drive}:" unless drive =~ /:\z/
                  return "#{drive}#{ENV['HOMEPATH']}"
                end
                begin
                  File.expand_path("~")
                rescue StandardError
                  if File::ALT_SEPARATOR
                    "C:/"
                  else
                    "/"
                  end
                end
              end
              module_function :find_home
            end # module Util
            end # module FastRI
         | 
    
        data/lib/fastri/version.rb
    CHANGED
    
    | @@ -2,7 +2,12 @@ | |
| 2 2 | 
             
            #
         | 
| 3 3 |  | 
| 4 4 | 
             
            module FastRI
         | 
| 5 | 
            -
              FASTRI_VERSION      = "0. | 
| 5 | 
            +
              FASTRI_VERSION      = "0.2.0"
         | 
| 6 | 
            +
              FASTRI_RELEASE_DATE = "2006-11-15"
         | 
| 6 7 | 
             
              FASTRI_INDEX_FORMAT = "0.1.0"
         | 
| 8 | 
            +
              FASTRI_FT_INDEX_FORMAT = "0.0.0"
         | 
| 9 | 
            +
              FASTRI_FT_INDEX_FORMAT_MAJOR = "0"
         | 
| 10 | 
            +
              FASTRI_FT_INDEX_FORMAT_MINOR = "0"
         | 
| 11 | 
            +
              FASTRI_FT_INDEX_FORMAT_TEENY = "0"
         | 
| 7 12 | 
             
            end
         | 
| 8 13 | 
             
            # vi: set sw=2 expandtab:
         | 
| @@ -0,0 +1,182 @@ | |
| 1 | 
            +
            require 'test/unit'
         | 
| 2 | 
            +
            $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
         | 
| 3 | 
            +
            $:.unshift "lib"
         | 
| 4 | 
            +
            require 'fastri/full_text_index'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            class TestFullTextIndex < Test::Unit::TestCase
         | 
| 7 | 
            +
              require 'stringio'
         | 
| 8 | 
            +
              include FastRI
         | 
| 9 | 
            +
             | 
| 10 | 
            +
              magic = FullTextIndexer::MAGIC
         | 
| 11 | 
            +
              data = <<EOF
         | 
| 12 | 
            +
            #{magic}this is a test 
         | 
| 13 | 
            +
            \r\000\000\000foo.txt\000\004\b{\000
         | 
| 14 | 
            +
            zzzz
         | 
| 15 | 
            +
            \r\000\000\000bar.txt\000\004\b{\000
         | 
| 16 | 
            +
            EOF
         | 
| 17 | 
            +
              DATA = (data.split(/\n/) << "").join("\0")
         | 
| 18 | 
            +
              SUFFIXES = %w[a\ test is\ a test this zzzz].map{|w| [DATA.index(w)].pack("V")}.join("")
         | 
| 19 | 
            +
             | 
| 20 | 
            +
              data = <<EOF
         | 
| 21 | 
            +
            #{magic}this is a test 
         | 
| 22 | 
            +
            \r\000\000\000foo.txt\000\004\b{\000
         | 
| 23 | 
            +
            zzzz this
         | 
| 24 | 
            +
            \r\000\000\000bar.txt\000\004\b{\000
         | 
| 25 | 
            +
            EOF
         | 
| 26 | 
            +
              DATA2 = (data.split(/\n/) << "").join("\0")
         | 
| 27 | 
            +
              SUFFIXES2 = ["a test", "is a", "test", "this\0", "this", "zzzz"].map{|x| [DATA2.index(x)].pack("V")}.join("")
         | 
| 28 | 
            +
             | 
| 29 | 
            +
              data = <<EOF
         | 
| 30 | 
            +
            #{magic}this is a test 
         | 
| 31 | 
            +
            SIZ1foo.txt\000#{Marshal.dump({:foo => :bar, :bar => 1})}
         | 
| 32 | 
            +
            zzzz this
         | 
| 33 | 
            +
            SIZ2bar.txt\000#{Marshal.dump({:foo => :baz, :bar => 42})}
         | 
| 34 | 
            +
            EOF
         | 
| 35 | 
            +
              lines = data.split(/\n/)
         | 
| 36 | 
            +
              len1 = lines[1].size - 4 + 1
         | 
| 37 | 
            +
              lines[1].sub!(/SIZ1/, [len1].pack("V"))
         | 
| 38 | 
            +
              len2 = lines[3].size - 4 + 1
         | 
| 39 | 
            +
              lines[3].sub!(/SIZ2/, [len2].pack("V"))
         | 
| 40 | 
            +
              DATA3 = (lines << "").join("\0")
         | 
| 41 | 
            +
              SUFFIXES3 = ["a test", "is a", "test", "this\0", "this", "zzzz"].map{|x| [DATA3.index(x)].pack("V")}.join("")
         | 
| 42 | 
            +
             | 
| 43 | 
            +
              def setup
         | 
| 44 | 
            +
                @index = FullTextIndex.new_from_ios(StringIO.new(DATA), StringIO.new(SUFFIXES))
         | 
| 45 | 
            +
                @index2 = FullTextIndex.new_from_ios(StringIO.new(DATA2), StringIO.new(SUFFIXES2))
         | 
| 46 | 
            +
                @index3 = FullTextIndex.new_from_ios(StringIO.new(DATA3), StringIO.new(SUFFIXES3))
         | 
| 47 | 
            +
              end
         | 
| 48 | 
            +
             | 
| 49 | 
            +
              def test_new_from_ios
         | 
| 50 | 
            +
                a = nil
         | 
| 51 | 
            +
                assert_nothing_raised { a = FullTextIndex.new_from_ios(StringIO.new(DATA), StringIO.new(SUFFIXES)) }
         | 
| 52 | 
            +
                assert_equal(FullTextIndex::DEFAULT_OPTIONS[:max_query_size], a.max_query_size)
         | 
| 53 | 
            +
              end
         | 
| 54 | 
            +
              
         | 
| 55 | 
            +
              def test_lookup_basic
         | 
| 56 | 
            +
                %w[this is a test].each do |term|
         | 
| 57 | 
            +
                  result = @index.lookup(term)
         | 
| 58 | 
            +
                  assert_kind_of(FullTextIndex::Result, result)
         | 
| 59 | 
            +
                  assert_equal(term, result.query)
         | 
| 60 | 
            +
                  assert_equal("foo.txt", result.path)
         | 
| 61 | 
            +
                end
         | 
| 62 | 
            +
                assert_equal(0, @index.lookup("a").index)
         | 
| 63 | 
            +
                assert_equal(2, @index.lookup("t").index)
         | 
| 64 | 
            +
                assert_equal(3, @index.lookup("th").index)
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                assert_equal(4, @index.lookup("z").index)
         | 
| 67 | 
            +
                assert_equal("bar.txt", @index.lookup("z").path)
         | 
| 68 | 
            +
              end
         | 
| 69 | 
            +
             | 
| 70 | 
            +
              def test_lookup_metadata
         | 
| 71 | 
            +
                assert_equal({}, @index.lookup("test").metadata)
         | 
| 72 | 
            +
                assert_equal({}, @index.lookup("zzzz").metadata)
         | 
| 73 | 
            +
                assert_equal({:foo => :bar, :bar => 1}, @index3.lookup("test").metadata)
         | 
| 74 | 
            +
                assert_equal({:foo => :baz, :bar => 42}, @index3.lookup("zzz").metadata)
         | 
| 75 | 
            +
              end
         | 
| 76 | 
            +
             | 
| 77 | 
            +
              def test_Result_text
         | 
| 78 | 
            +
                assert_equal("t", @index.lookup("this").text(1))
         | 
| 79 | 
            +
                assert_equal("this", @index.lookup("this").text(4))
         | 
| 80 | 
            +
                assert_equal("this is a ", @index.lookup("this").text(10))
         | 
| 81 | 
            +
                assert_equal("this is a test ", @index.lookup("th").text(100))
         | 
| 82 | 
            +
             | 
| 83 | 
            +
                assert_equal("test ", @index.lookup("t").text(10))
         | 
| 84 | 
            +
                assert_equal("test ", @index.lookup("t").text(20))
         | 
| 85 | 
            +
             | 
| 86 | 
            +
                assert_equal("z", @index.lookup("z").text(1))
         | 
| 87 | 
            +
                assert_equal("zzzz", @index.lookup("z").text(10))
         | 
| 88 | 
            +
              end
         | 
| 89 | 
            +
             | 
| 90 | 
            +
              def test_Result_context
         | 
| 91 | 
            +
                assert_equal(" a ", @index.lookup("a").context(1))
         | 
| 92 | 
            +
                assert_equal("s a t", @index.lookup("a").context(2))
         | 
| 93 | 
            +
                assert_equal("is a te", @index.lookup("a").context(3))
         | 
| 94 | 
            +
                assert_equal("s is a test", @index.lookup("a").context(5))
         | 
| 95 | 
            +
                assert_equal("this is a test ", @index.lookup("a").context(10))
         | 
| 96 | 
            +
              end
         | 
| 97 | 
            +
             | 
| 98 | 
            +
              def test_Result_context_non_initial_entry
         | 
| 99 | 
            +
                assert_equal("zz", @index.lookup("z").context(1))
         | 
| 100 | 
            +
                assert_equal("zzz", @index.lookup("z").context(2))
         | 
| 101 | 
            +
                assert_equal("zzzz", @index.lookup("z").context(3))
         | 
| 102 | 
            +
                assert_equal("zzzz", @index.lookup("z").context(4))
         | 
| 103 | 
            +
                assert_equal("zzzz", @index.lookup("z").context(10))
         | 
| 104 | 
            +
              end
         | 
| 105 | 
            +
             | 
| 106 | 
            +
              def test_lookup_nonexistent
         | 
| 107 | 
            +
                assert_nil(@index.lookup("bogus"))
         | 
| 108 | 
            +
              end
         | 
| 109 | 
            +
             | 
| 110 | 
            +
              def test_next_match_basic
         | 
| 111 | 
            +
                first = @index2.lookup("t")
         | 
| 112 | 
            +
                assert_equal("foo.txt", first.path)
         | 
| 113 | 
            +
                assert_equal(2, first.index)
         | 
| 114 | 
            +
                assert_equal("test ", first.text(10))
         | 
| 115 | 
            +
             | 
| 116 | 
            +
                second = @index2.next_match(first)
         | 
| 117 | 
            +
                assert_equal("bar.txt", second.path)
         | 
| 118 | 
            +
                assert_equal(3, second.index)
         | 
| 119 | 
            +
                assert_equal("this", second.text(10))
         | 
| 120 | 
            +
             | 
| 121 | 
            +
                third = @index2.next_match(second)
         | 
| 122 | 
            +
                assert_kind_of(FullTextIndex::Result, third)
         | 
| 123 | 
            +
                assert_equal(4, third.index)
         | 
| 124 | 
            +
                assert_equal("this is a ", third.text(10))
         | 
| 125 | 
            +
             | 
| 126 | 
            +
                assert_nil(@index2.next_match(third))
         | 
| 127 | 
            +
              end
         | 
| 128 | 
            +
             | 
| 129 | 
            +
              def test_next_match_restricted
         | 
| 130 | 
            +
                first = @index2.lookup("t")
         | 
| 131 | 
            +
                assert_equal("foo.txt", first.path)
         | 
| 132 | 
            +
                assert_equal(2, first.index)
         | 
| 133 | 
            +
                assert_equal("test ", first.text(10))
         | 
| 134 | 
            +
             | 
| 135 | 
            +
                second = @index2.next_match(first, "this is")
         | 
| 136 | 
            +
                assert_equal("foo.txt", second.path)
         | 
| 137 | 
            +
                assert_equal(4, second.index)
         | 
| 138 | 
            +
                assert_equal("this is a ", second.text(10))
         | 
| 139 | 
            +
             | 
| 140 | 
            +
                assert_nil(@index2.next_match(first, "foo"))
         | 
| 141 | 
            +
              end
         | 
| 142 | 
            +
             | 
| 143 | 
            +
              def test_next_match_regexp
         | 
| 144 | 
            +
                first = @index2.lookup("t")
         | 
| 145 | 
            +
                assert_equal("foo.txt", first.path)
         | 
| 146 | 
            +
                assert_equal(2, first.index)
         | 
| 147 | 
            +
                assert_equal("test ", first.text(10))
         | 
| 148 | 
            +
             | 
| 149 | 
            +
                second = @index2.next_match(first, /.*test/)
         | 
| 150 | 
            +
                assert_equal("foo.txt", second.path)
         | 
| 151 | 
            +
                assert_equal(4, second.index)
         | 
| 152 | 
            +
                assert_equal("this is a test ", second.text(20))
         | 
| 153 | 
            +
              end
         | 
| 154 | 
            +
             | 
| 155 | 
            +
             | 
| 156 | 
            +
              # Collects the hit returned by lookup("t") together with everything
              # next_matches returns after it (no restriction given), then checks
              # the index, path and text of each hit in order.
              def test_next_matches
                first = @index2.lookup("t")
                all = [first] + @index2.next_matches(first)
                assert_equal([2, 3, 4], all.map{|x| x.index})
                assert_equal(["foo.txt", "bar.txt", "foo.txt"], all.map{|x| x.path})
                # text(20) is compared against strings shorter than 20 characters,
                # i.e. the returned text is expected to stop early for each entry.
                assert_equal(["test ", "this", "this is a test "], all.map{|x| x.text(20)})
              end
         | 
| 164 | 
            +
             | 
| 165 | 
            +
              # Checks next_matches with a String restriction: a restriction that
              # nothing satisfies gives an empty array, while "this is" gives
              # exactly one hit (entry 4 of foo.txt).
              def test_next_matches_restricted
                start = @index2.lookup("t")
                assert_equal([], @index2.next_matches(start, "this is not"))

                hits = @index2.next_matches(start, "this is")
                assert_equal(["foo.txt"], hits.map{|hit| hit.path})
                assert_equal([4], hits.map{|hit| hit.index})
                assert_equal(["this is a test "], hits.map{|hit| hit.text(20)})
              end
         | 
| 173 | 
            +
             | 
| 174 | 
            +
              # Checks next_matches with a Regexp restriction (/.*test/): exactly
              # one hit is expected, entry 4 of foo.txt.
              def test_next_matches_regexp
                start = @index2.lookup("t")
                hits  = @index2.next_matches(start, /.*test/)
                assert_equal(["foo.txt"], hits.map{|hit| hit.path})
                assert_equal([4], hits.map{|hit| hit.index})
                assert_equal(["this is a test "], hits.map{|hit| hit.text(20)})
              end
         | 
| 181 | 
            +
             | 
| 182 | 
            +
            end
         | 
| @@ -0,0 +1,84 @@ | |
| 1 | 
            +
            require 'test/unit'
         | 
| 2 | 
            +
            $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
         | 
| 3 | 
            +
            $:.unshift "lib"
         | 
| 4 | 
            +
            require 'fastri/full_text_indexer'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            # Unit tests for FastRI::FullTextIndexer: document registration,
            # preprocessing, suffix discovery and index construction into
            # in-memory StringIO targets.
            class TestFullTextIndexer < Test::Unit::TestCase
              require 'stringio'
              include FastRI

              # Two bulk documents of 1000 repetitions each.
              DATA1 =  "this is a test " * 1000
              DATA2 =  "this is another test " * 1000

              # Every test gets a fresh indexer built with an argument of 20.
              def setup
                @indexer = FullTextIndexer.new(20)
              end

              # Adding a document under an already-used name is expected to
              # replace its data without duplicating the name; a new name is
              # expected to be appended after the existing ones.
              def test_add_document
                @indexer.add_document("foo.txt", DATA1)
                assert_equal(["foo.txt"], @indexer.documents)
                assert_equal(DATA1, @indexer.data("foo.txt"))

                @indexer.add_document("foo.txt", DATA2)
                assert_equal(["foo.txt"], @indexer.documents)
                assert_equal(DATA2, @indexer.data("foo.txt"))

                @indexer.add_document("bar.txt", DATA2)
                assert_equal(["foo.txt", "bar.txt"], @indexer.documents)
                assert_equal(DATA2, @indexer.data("bar.txt"))
              end

              # preprocess is expected to drop NUL bytes from the text.
              def test_preprocess
                raw = "this is a \0foo bar\0 bla"
                assert_equal("this is a foo bar bla", @indexer.preprocess(raw))
              end

              # find_suffixes_simple(text, word_re, nonword_re, offset) is checked
              # against the expected start positions for several word/non-word
              # regexp pairs; the last argument is expected to shift every position.
              def test_find_suffixes_simple
                text = "this is a simple test with these words: Aaaa01 0.1 _asdA1\n"
                alpha_starts = [0, 5, 8, 10, 17, 22, 27, 33, 40, 52]

                assert_equal([0, 5, 8, 10, 17, 22, 27, 33, 40, 47, 49, 51],
                             @indexer.find_suffixes_simple(text, /[A-Za-z0-9_]+/, /[^A-Za-z0-9_]+/, 0))
                assert_equal(alpha_starts,
                             @indexer.find_suffixes_simple(text, /[A-Za-z]+/, /[^A-Za-z]+/, 0))
                assert_equal(alpha_starts.map{|pos| pos + 10},
                             @indexer.find_suffixes_simple(text, /[A-Za-z]+/, /[^A-Za-z]+/, 10))

                # A single-character word regexp: every letter position is expected.
                assert_equal([0, 1, 2, 3, 5, 6, 8, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20,
                              22, 23, 24, 25, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37,
                              40, 41, 42, 43, 52, 53, 54, 55],
                             @indexer.find_suffixes_simple(text, /[A-Za-z]/, /[^A-Za-z]+/, 0))

                assert_equal([0, 5], @indexer.find_suffixes_simple("abcd\ndefg", /\S+/, /\s+/, 0))
                assert_equal([1, 6], @indexer.find_suffixes_simple("abcd\ndefg", /\S+/, /\s+/, 1))
              end

              # Builds an index for one bulk document and inspects the tail of the
              # full-text output and the size of the suffix-array output.
              def test_build_index_trivial
                @indexer.add_document("foo.txt", DATA1)
                text_io = StringIO.new("")
                sa_io   = StringIO.new("")
                @indexer.build_index(text_io, sa_io)

                tail = text_io.string[-200..-1]
                assert_equal(["\000\r\000\000\000foo.txt\000\004\b{\000\000"],
                             tail.scan(/\0.*$/))
                # DATA1 holds 4000 words; 4 bytes are expected per suffix entry.
                assert_equal(4000 * 4, sa_io.string.size)
              end

              # Indexes +data+ as "foo.txt" and asserts that the suffix array that
              # is written equals +suffixes+ (offsets into +data+), shifted by the
              # size of FullTextIndexer::MAGIC and ordered by the character found
              # at each offset.
              def build_index_test_helper(data, suffixes)
                @indexer.add_document("foo.txt", data)
                base     = FullTextIndexer::MAGIC.size
                shifted  = suffixes.map{|pos| pos + base}
                expected = shifted.sort_by{|pos| data[pos - base]}

                text_io = StringIO.new("")
                sa_io   = StringIO.new("")
                @indexer.build_index(text_io, sa_io)

                # The suffix array is read back as 4-byte little-endian integers.
                written = sa_io.string.scan(/..../m).map{|chunk| chunk.unpack("V")[0]}
                assert_equal(expected, written)
              end

              # Runs the helper on two inputs with the same word boundaries: one
              # whose words are already in order and one whose words are not.
              def test_build_index_harder
                ordered = "a bcd efghi jklmn opqrst\n"
                build_index_test_helper(ordered, [0, 2, 6, 12, 18])

                shuffled = "e xcd afghi zklmn bpqrst\n"
                build_index_test_helper(shuffled, [0, 2, 6, 12, 18])
              end
            end
         |