RubyGems - rsi - Versions diffs - 0.4 - Mend

rsi 0.4

Files changed (31) hide show

data/LICENSE +25 -0
data/Makefile +24 -0
data/Manifest +30 -0
data/README +49 -0
data/TODO +30 -0
data/bin/rsi_search.rb +50 -0
data/bin/search_bench.rb +47 -0
data/docs/ATTRIB +14 -0
data/docs/Changes +25 -0
data/docs/Roadmap +41 -0
data/lib/rsi.rb +40 -0
data/lib/rsi/analysis.rb +79 -0
data/lib/rsi/compressed_serializers.rb +60 -0
data/lib/rsi/dictionary.rb +232 -0
data/lib/rsi/index.rb +245 -0
data/lib/rsi/logmanager.rb +105 -0
data/lib/rsi/porter.rb +213 -0
data/lib/rsi/query.rb +98 -0
data/lib/rsi/rsi_intro.rb +91 -0
data/lib/rsi/serializers.rb +31 -0
data/lib/rsi/stoplist.rb +72 -0
data/lib/rsi/stoplist.txt +59 -0
data/rsi.gemspec +59 -0
data/setup.rb +1360 -0
data/tests/suite_all.rb +14 -0
data/tests/t_analysis.rb +43 -0
data/tests/t_dictionary.rb +76 -0
data/tests/t_index.rb +78 -0
data/tests/t_index_multi.rb +71 -0
data/version.release +1 -0
metadata +72 -0

@@ -0,0 +1,105 @@
+require 'tmpdir'
+require 'logger'
+require 'singleton'
+module RSI
+  # Mixin providing a RSI::LogManager-managed #logger() method.
+  # #logger() returns a Logger object.
+  #
+  #     class StuffThing
+  #       include RSI::Loggable
+  #       def do_dealie()
+  #         logger.info( "Doing some dealie" )
+  #       end
+  #     end
+  #
+  # The settings of the logger returned by #logger() can be modified:
+  #
+  #     def initialize()
+  #       logger.level = Logger.DEBUG  # log all messages
+  #       logger.debug( "This will show up in the log, now" )
+  #     end
+  #
+  module Loggable
+    def logger
+      return RSI::LogManager.instance().logger_for( self )
+    end
+  end
+  # Trivial extension of Logger, providing it a #write() method.
+  # This allows instances of this logger to be used as the
+  # argument to Logger#new().
+  #
+  #     root = XLogger.new( "foo.log" )
+  #     other = Logger.new( root )
+  #
+  class XLogger < Logger
+    def write( msg )
+      @logdev.write( msg )
+    end
+  end
+  # Manages logger creation for classes which mixin RSI::Loggable.
+  # LogManager has default settings for the log directory (Dir::tmpdir)
+  # and for the log file name ("app.log").
+  #
+  # If you'd like to override the defaults, call #root=()
+  # and/or #log_filename=() before LogManager is first used (ie,
+  # before RSI::Loggable#logger() is called the first time).
+  # You can also supply an arbitrary IO to #root_fh=() .
+  #
+  # By default, the LogManager will create logs with level set
+  # to Logger::INFO.  Individual classes mixing in RSI::Loggable may
+  # choose to override this by calling #logger.level=() .
+  #
+  class LogManager
+    include Singleton
+    attr_reader :root_logger
+    attr_accessor :root, :log_filename, :root_fh
+    def initialize()
+      @root = Dir::tmpdir
+      @log_filename = "app.log"
+      @root_fh = nil
+      @logger_cache = {}
+      @root_logger = nil
+    end
+    # Gets the logger for a class.
+    # Can be passed an object, a Class, or a String.
+    public
+    def logger_for( obj="root" )
+      if obj.kind_of?( String )
+        n = obj
+      elsif obj.kind_of?( Module )
+        n = obj.name
+      else
+        n = obj.class.name
+      end
+      unless @logger_cache.has_key?( n )
+        configure() if @root_logger.nil?
+        @logger_cache[n] = Logger.new( @root_logger )
+        @logger_cache[n].progname = n
+        @logger_cache[n].level = Logger::INFO
+      end
+      return @logger_cache[n]
+    end
+    private
+    def configure()
+      if @root_fh.nil?
+        @root_fh = File.open( File.join(@root, @log_filename),
+                              File::WRONLY | File::APPEND | File::CREAT )
+        @root_fh.sync = true
+      end
+      @root_logger = XLogger.new( @root_fh )
+      @root_logger.progname = "root"
+       #setting @root_logger.level seems to screw things up
+    end
+  end
+end

data/lib/rsi/porter.rb ADDED

@@ -0,0 +1,213 @@
+#! /local/ruby/bin/ruby
+#
+# $Id: porter.rb 37 2005-01-13 04:23:07Z gdf $
+#
+# See example usage at the end of this file.
+#
+module Stemmable  # mixin adding Porter stemming (#stem_porter, aliased as #stem) to objects that respond to #to_str
+  STEP_2_LIST = {  # Step 2 lookup: matched suffix => replacement suffix
+    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
+    'izer'=>'ize', 'bli'=>'ble',
+    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
+    'ization'=>'ize', 'ation'=>'ate',
+    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
+    'ousness'=>'ous', 'aliti'=>'al',
+    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
+  }
+  STEP_3_LIST = {  # Step 3 lookup: matched suffix => replacement ('' drops the suffix entirely)
+    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
+    'ical'=>'ic', 'ful'=>'', 'ness'=>''
+  }
+  SUFFIX_1_REGEXP = /(
+                    ational  |
+                    tional   |
+                    enci     |
+                    anci     |
+                    izer     |
+                    bli      |
+                    alli     |
+                    entli    |
+                    eli      |
+                    ousli    |
+                    ization  |
+                    ation    |
+                    ator     |
+                    alism    |
+                    iveness  |
+                    fulness  |
+                    ousness  |
+                    aliti    |
+                    iviti    |
+                    biliti   |
+                    logi)$/x  # any STEP_2_LIST key at the end of the word, captured as $1
+  SUFFIX_2_REGEXP = /(
+                      al       |
+                      ance     |
+                      ence     |
+                      er       |
+                      ic       |
+                      able     |
+                      ible     |
+                      ant      |
+                      ement    |
+                      ment     |
+                      ent      |
+                      ou       |
+                      ism      |
+                      ate      |
+                      iti      |
+                      ous      |
+                      ive      |
+                      ize)$/x  # suffixes that Step 4 strips outright
+  C = "[^aeiou]"         # consonant (this class also matches 'y')
+  V = "[aeiouy]"         # vowel (this class includes 'y')
+  CC = "#{C}(?>[^aeiouy]*)"  # consonant sequence: one C, then an atomic run of non-vowel, non-y characters
+  VV = "#{V}(?>[aeiou]*)"    # vowel sequence: one V, then an atomic run of a/e/i/o/u
+  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
+  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
+  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
+  VOWEL_IN_STEM   = /^(#{CC})?#{V}/o                      # vowel in stem
+  # Returns the stem of the receiver as a String, working on a copy.
+  # Porter stemmer in Ruby.
+  #
+  # This is the Porter stemming algorithm, ported to Ruby from the
+  # version coded up in Perl.  It's easy to follow against the rules
+  # in the original paper in:
+  #
+  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+  #   no. 3, pp 130-137,
+  #
+  # See also http://www.tartarus.org/~martin/PorterStemmer
+  #
+  # Send comments to raypereda@hotmail.com
+  # Words shorter than 3 characters are returned without any stemming.
+  def stem_porter
+    # make a copy of the given object and convert it to a string.
+    w = self.dup.to_str
+    return w if w.length < 3
+    # now map initial y to Y so that the patterns never treat it as vowel
+    w[0] = 'Y' if w[0] == ?y  # ?y is the character literal for "y"
+    # Step 1a
+    if w =~ /(ss|i)es$/
+      w = $` + $1  # text before the match plus the kept "ss" or "i" (drops "es")
+    elsif w =~ /([^s])s$/
+      w = $` + $1  # drop a final "s" unless it is preceded by another "s"
+    end
+    # Step 1b
+    if w =~ /eed$/
+      w.chop! if $` =~ MGR0  # "eed" -> "ee" only when the part before it has m>0
+    elsif w =~ /(ed|ing)$/
+      stem = $`
+      if stem =~ VOWEL_IN_STEM
+        w = stem
+        case w
+        when /(at|bl|iz)$/             then w << "e"  # at -> ate, bl -> ble, iz -> ize
+        when /([^aeiouylsz])\1$/       then w.chop!  # undouble a doubled final letter (not l, s or z)
+        when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"  # whole stem is CC + V + one consonant (not w, x, y): append "e"
+        end
+      end
+    end
+    if w =~ /y$/  # Step 1c: final y becomes i when the part before it contains a vowel
+      stem = $`
+      w = stem + "i" if stem =~ VOWEL_IN_STEM
+    end
+    # Step 2
+    if w =~ SUFFIX_1_REGEXP
+      stem = $`
+      suffix = $1
+      # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
+      if stem =~ MGR0
+        w = stem + STEP_2_LIST[suffix]  # swap the suffix for its Step 2 replacement
+      end
+    end
+    # Step 3
+    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
+      stem = $`
+      suffix = $1
+      if stem =~ MGR0
+        w = stem + STEP_3_LIST[suffix]  # swap the suffix for its Step 3 replacement
+      end
+    end
+    # Step 4
+    if w =~ SUFFIX_2_REGEXP
+      stem = $`
+      if stem =~ MGR1
+        w = stem
+      end
+    elsif w =~ /(s|t)(ion)$/  # "ion" is only considered after s or t
+      stem = $` + $1  # the s/t stays part of the stem
+      if stem =~ MGR1
+        w = stem
+      end
+    end
+    #  Step 5
+    if w =~ /e$/
+      stem = $`
+      if (stem =~ MGR1) ||
+          (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)  # m=1 stems keep the e when they are exactly CC + V + consonant
+        w = stem
+      end
+    end
+    if w =~ /ll$/ && w =~ MGR1  # trailing "ll" loses one l when the word matches MGR1
+      w.chop!
+    end
+    # and turn initial Y back to y
+    w[0] = 'y' if w[0] == ?Y
+    w
+  end
+  #
+  # make the stem_porter the default stem method, just in case we
+  # feel like having multiple stemmers available later.
+  #
+  alias stem stem_porter
+end
+#
+# Make this script executable, and send it words on stdin, one per
+# line, and it will output the stemmed versions to stdout.
+#
+if $0 == __FILE__ then
+  class String
+    include Stemmable
+  end
+  # the String class, and any subclasses of it you might have, now know
+  # how to stem things.
+  $stdin.each do |word|
+    puts word.stem
+  end
+end

data/lib/rsi/query.rb ADDED

@@ -0,0 +1,98 @@
+require 'rsi/logmanager'
+module RSI
+  class Query
+    include Loggable
+    def initialize()
+      @subqueries = []
+    end
+    def add_subquery( query )
+      @subqueries << query
+    end
+    def evaluate( locator ); end
+  end
+  class ANDQuery < Query
+    def evaluate( locator )
+      ret_set = nil
+      @subqueries.each do |q|
+        set = q.evaluate( locator )
+        if ret_set.nil?
+          ret_set = set
+        else
+          ret_set = ret_set & set
+        end
+        # short-circuit bottoming out
+        if ret_set.size()==0
+          return ret_set
+        end
+      end
+      return ret_set
+    end
+    def to_s
+      return "( " + @subqueries.join(" AND ") + " )";
+    end
+  end
+  class ORQuery < Query
+    def evaluate()
+      ret_set = []
+      @subqueries.each do |q|
+        ret_set = ret_set | q.evaluate( locator )
+      end
+      return ret_set
+    end
+    def to_s
+      return "( " + @subqueries.join(" OR ") + " )";
+    end
+  end
+  class TermQuery < Query
+    attr_accessor :field, :term
+    def initialize( field, term )
+      @field = field
+      @term = term
+    end
+    def evaluate( locator )
+      logger.debug( "Getting dict for #@field" )
+      dict = locator.get_dict_for_field( @field )
+      # get all docids containing @field:@term -> []
+      # return set
+      unless dict.has_term?( term )
+        logger.debug( "Dict has no such term #{term}" )
+        return []
+      else
+        ret = []
+        termid = dict.get_termid_for( term )
+        logger.debug( "Getting entries for #{term}(#{termid})" )
+        dict.get_entry_list( termid ).each do |termentry|
+          logger.debug( termentry.to_s )
+          ret << termentry.docid
+        end
+        return ret.uniq
+      end
+    end
+    def to_s
+      return "#@field='#@term'"
+    end
+  end
+##;   def analyze_query( q_str )
+##;     # (a OR b) AND (c OR d)
+##;     # -> AND[ OR[a,b], OR[c,d] ]
+##;     # split on whitespace
+##;     # split x:foo
+##;     # tokenize foo
+##;     # add another AND termquery
+##;
+##;   end
+end

data/lib/rsi/rsi_intro.rb ADDED

@@ -0,0 +1,91 @@
+#
+# = RSI (Ruby Simple Indexer)
+#
+# RSI is a simple full text search engine implementation in Ruby.  It
+# aims to be easily useful within other programs: simple to set up,
+# simple to use.
+#
+# An emphasis has been placed on getting functionality out the door,
+# rather than heavy optimization (that can come later).  It still
+# appears to be reasonably fast and efficient (while admitting to have
+# not been heavily profiled...).
+#
+# == Getting RSI
+#
+# RSI can be downloaded from Rubyforge (http://rubyforge.org/projects/rsi/).
+#
+# == Using RSI
+#
+# Creating an index:
+#
+#   require 'rsi'
+#   indexer = RSI::Index.new( "/path/to/index" )
+#   Dir.foreach( "~/words" ) do |textfile|
+#     indexer.add_document( textfile, File.read("~/words/#{textfile}") )
+#   end
+#   indexer.flush()
+#
+# By default, the RSI indexer assumes that documents fed to it are plain
+# text docs (more complex analyzers should appear in future releases).
+#
+# Searching an index:
+#
+#   require 'rsi'
+#   indexer = RSI::Index.new( "/path/to/index" )
+#   puts indexer.find_all( "some three terms" )
+#
+# == Advanced Usage
+#
+# (Tweakability will be enhanced in future releases.)
+#
+#   require 'rsi'
+#
+#   indexer = RSI::Index.new( "/data/search" )
+#   indexer.serializer = RSI::NativeSerializer.new()
+#   indexer.analyzer = RSI::DefaultTextAnalyzer.new()
+#   indexer.query_analyzer = RSI::DefaultTextAnalyzer.new()
+#
+# === Changing the dictionary serializer
+#
+# The dictionary's serializer controls how the index database is
+# stored. By default, RSI uses Ruby's Marshal to store the database
+# objects. These serializers are also available:
+#
+# * RSI::NativeSerializer - default, uses Ruby's built-in Marshal lib.
+#
+# * RSI::YAMLSerializer - serializes DB objects as YAML. Excellent for
+#   debugging purposes. Very slow compared to NativeSerializer.
+#
+# * RSI::CompressedSerializer - uses Marshal (by default), plus
+#   compresses the output with bzip. The speed penalty is probably not
+#   worth the space savings (at least the way the db is currently
+#   implemented).  Also requires the `bz2` library.
+#
+# Naturally, if you create an index with a given serializer, you will
+# need to re-open the index with that same serializer. (This should be
+# auto-detected in future releases.)
+#
+# === Changing the analyzer
+#
+# The analyzer is used to tokenize documents into indexable
+# terms.  The default analyzer splits on whitespace and performs some
+# normalization (stemming, stopword removal, etc).
+#
+# The query analyzer is used to tokenize query terms.
+#
+# Currently there are no other analyzers available (see Roadmap).
+#
+# === Changing the stoplist
+#
+# The default stoplist is pretty minimal (see stoplist.rb).
+#
+# (should be easier: see Development Roadmap)
+#
+#   class MyAnalyzer < RSI::Analyzer
+#     def initialize_stoplist()
+#        return unless @stoplist.nil?
+#        @stoplist = { 'THE' => 1, ... }
+#     end
+#   end
+#
+module RSI; end