RubyGems - SimpleSearch - Versions diffs - 0.5.0 - Mend

SimpleSearch 0.5.0

Files changed (9) hide show

data/README +68 -0
data/bin/simplesearch +42 -0
data/lib/search/simple.rb +1 -0
data/lib/search/simple/dictionary.rb +126 -0
data/lib/search/simple/porter_stemmer.rb +220 -0
data/lib/search/simple/searcher.rb +191 -0
data/lib/search/simple/vector.rb +97 -0
data/setup.rb +1360 -0
metadata +46 -0

data/README ADDED

@@ -0,0 +1,68 @@
+===SimpleSearch - Simple vector space search library
+==What is SimpleSearch?
+-----
+SimpleSearch is a simple vector space text search engine.
+==Installation
+-----
+Prerequisites
+  * Ruby 1.8 (http://www.ruby-lang.org/)
+Optional
+  * RubyGems (http://rubygems.rubyforge.org)
+==Installing SimpleSearch
+-----
+RubyGems (http://rubygems.rubyforge.org):
+	gem install SimpleSearch
+...or...
+.tar.gz installation:
+	ruby setup.rb  #not yet available
+==Using SimpleSearch
+-----
+SimpleSearch comes with a command line program that was primarily written as an example of how to use the API but might actually be useful.
+To run the command line program, simply type:
+$ simplesearch --help
+An example:
+$ simplesearch --cache=/tmp/mycache --dir=/usr/local/lib/ruby/gems/1.8/doc --extensions=html --terms=markup
+This will cause simplesearch to (re)index all of the files with a .html extension in your RubyGems rdoc directory and then search them for the word "markup".  Several words can be searched for at once by separating them with commas (for example --terms=markup,html).  The search indices will be stored in /tmp/mycache.
+At the heart of SimpleSearch is, of course, an API that can be embedded in other programs.  The code of SimpleSearch was originally created by Dave Thomas as a search mechanism for his RubLog (http://rubyforge.org/projects/rublog) weblogging package.  The API can be used as follows:
+	require 'search/simple'
+	contents = Search::Simple::Contents.new
+	# silly example
+	Dir['**/*'].each do |file_name|
+		File.open(file_name) do |file|
+			contents << Search::Simple::Content.new(file.read, File.expand_path(file_name), file.mtime)
+		end
+	end
+	s = Search::Simple::Searcher.load(contents, "/tmp/search_cache")
+	sr = s.find_words(['some', 'keywords', 'to', 'search', 'for'])
+	if sr.contains_matches
+		sr.results.sort.each do |res|
+		puts "#{res.score}:#{res.name}"
+		end
+	else
+		puts "No matches"
+	end
+==Credits
+------
+Almost all of this code was written by Dave Thomas (http://pragprog.com/pragdave).  Dave's code was a complete rewrite of an earlier attempt by Chad Fowler (http://www.chadfowler.com) to do a vector space search for RubLog.  Chad Fowler then adapted Dave's working RubLog code to be RubLog-independent and created what is now SimpleSearch out of it.

data/bin/simplesearch ADDED

@@ -0,0 +1,42 @@
+#!/usr/bin/env ruby
+require 'optparse'
+require 'search/simple'
+options = {}
+ARGV.options do |opts|
+  opts.on_tail("--help", "show this message") {puts opts}
+  opts.on('-cCACHEFILE','--cache=CACHEFILE', "Location of the search cache (defaults to /tmp/search_cache") { |options[:cachefile]| }
+  opts.on('-eEXTENSIONS','--extensions=EXTENSIONS', "Comma separated list of file name extensions to include in the search/index") { |options[:extensions]| }
+  opts.on('-dCONTENTDIR', '--dir=CONTENTDIR', "Directory from which to get the content to index")    {|options[:directory]|}
+  opts.on('-tTERMS', '--terms=TERMS', "Comma separated list of words to search for")    {|options[:terms]|}
+  opts.parse!
+end
+def content_for_indexing(options)
+  contents = Search::Simple::Contents.new
+  extensions = options[:extensions] || ""
+  globpattern = (options[:directory] || ".") + "/**/*" + "{#{extensions}}"
+  Dir[globpattern].each do |file_name|
+    next if File.directory?(file_name)
+    File.open(file_name) do |file|
+      contents <<  Search::Simple::Content.new(file.read, File.expand_path(file_name), file.mtime)
+    end
+  end
+  contents
+end
+unless options[:terms]
+  puts "Usage: simplesearch --help"
+  exit 1
+end
+s = Search::Simple::Searcher.load(content_for_indexing(options), options[:cachefile] || "/tmp/search_cache")
+sr = s.find_words(options[:terms].split(/,/))
+if sr.contains_matches
+   require 'pp'
+     puts "Score\t#File"
+   sr.results.sort.each do |res|
+     puts "#{res.score}\t#{res.name}"
+   end
+else
+  puts "No matches"
+end

data/lib/search/simple.rb ADDED

@@ -0,0 +1 @@
+require 'search/simple/searcher'

data/lib/search/simple/dictionary.rb ADDED

@@ -0,0 +1,126 @@
+# Maintain a dictionary mapping words to consecutive integers (the
+# first unique word is 0, the second is 1 and so on)
+require 'search/simple/porter_stemmer'
+module Search
+  module Simple
+  class Dictionary
+    STOP_WORDS = {
+      "a" => 1,
+      "again" => 1,
+      "all" => 1,
+      "along" => 1,
+      "also" => 1,
+      "an" => 1,
+      "and" => 1,
+      "arialhelvetica" => 1,
+      "as" => 1,
+      "at" => 1,
+      "but" => 1,
+      "by" => 1,
+      "came" => 1,
+      "can" => 1,
+      "cant" => 1,
+      "couldnt" => 1,
+      "did" => 1,
+      "didn" => 1,
+      "didnt" => 1,
+      "do" => 1,
+      "doesnt" => 1,
+      "dont" => 1,
+      "entrytitledetail" => 1,
+      "ever" => 1,
+      "first" => 1,
+      "fontvariant" => 1,
+      "from" => 1,
+      "have" => 1,
+      "her" => 1,
+      "here" => 1,
+      "him" => 1,
+      "how" => 1,
+      "i" => 1,
+      "if" => 1,
+      "in" => 1,
+      "into" => 1,
+      "is" => 1,
+      "isnt" => 1,
+      "it" => 1,
+      "itll" => 1,
+      "just" => 1,
+      "last" => 1,
+      "least" => 1,
+      "like" => 1,
+      "most" => 1,
+      "my" => 1,
+      "new" => 1,
+      "no" => 1,
+      "not" => 1,
+      "now" => 1,
+      "of" => 1,
+      "on" => 1,
+      "or" => 1,
+      "should" => 1,
+      "sidebartitl" => 1,
+      "sinc" => 1,
+      "so" => 1,
+      "some" => 1,
+      "textdecoration" => 1,
+      "th" => 1,
+      "than" => 1,
+      "that" => 1,
+      "the" => 1,
+      "their" => 1,
+      "then" => 1,
+      "those" => 1,
+      "to" => 1,
+      "told" => 1,
+      "too" => 1,
+      "true" => 1,
+      "try" => 1,
+      "until" => 1,
+      "url" => 1,
+      "us" => 1,
+      "were" => 1,
+      "when" => 1,
+      "whether" => 1,
+      "while" => 1,
+      "with" => 1,
+      "within" => 1,
+      "yes" => 1,
+      "you" => 1,
+      "youll" => 1,
+      }
+      def initialize
+        @words = {}
+      end
+      def add_word(word)
+        word = Stemmable::stem_porter(word)
+        if STOP_WORDS[word]
+          nil
+        else
+          @words[word] ||= @words.size
+        end
+      end
+      def find(word)
+        word = Stemmable::stem_porter(word)
+        if STOP_WORDS[word]
+          nil
+        else
+          @words[word]
+        end
+      end
+      def size
+        @words.size
+      end
+      def dump
+        puts @words.keys.sort
+      end
+    end
+  end
+end

data/lib/search/simple/porter_stemmer.rb ADDED

@@ -0,0 +1,220 @@
+#! /local/ruby/bin/ruby
+#
+# $Id: PorterStemmer.rb,v 1.1.1.1 2004/04/17 13:55:20 pragdave Exp $
+#
+# See example usage at the end of this file.
+#
+module Stemmable
+  STEMMED = {}
+  STEP_2_LIST = {
+    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
+    'izer'=>'ize', 'bli'=>'ble',
+    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
+    'ization'=>'ize', 'ation'=>'ate',
+    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
+    'ousness'=>'ous', 'aliti'=>'al',
+    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
+  }
+  STEP_3_LIST = {
+    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
+    'ical'=>'ic', 'ful'=>'', 'ness'=>''
+  }
+  SUFFIX_1_REGEXP = /(
+                    ational  |
+                    tional   |
+                    enci     |
+                    anci     |
+                    izer     |
+                    bli      |
+                    alli     |
+                    entli    |
+                    eli      |
+                    ousli    |
+                    ization  |
+                    ation    |
+                    ator     |
+                    alism    |
+                    iveness  |
+                    fulness  |
+                    ousness  |
+                    aliti    |
+                    iviti    |
+                    biliti   |
+                    logi)$/x
+  SUFFIX_2_REGEXP = /(
+                      al       |
+                      ance     |
+                      ence     |
+                      er       |
+                      ic       |
+                      able     |
+                      ible     |
+                      ant      |
+                      ement    |
+                      ment     |
+                      ent      |
+                      ou       |
+                      ism      |
+                      ate      |
+                      iti      |
+                      ous      |
+                      ive      |
+                      ize)$/x
+  C = "[^aeiou]"         # consonant
+  V = "[aeiouy]"         # vowel
+  CC = "#{C}(?>[^aeiouy]*)"  # consonant sequence
+  VV = "#{V}(?>[aeiou]*)"    # vowel sequence
+  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
+  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
+  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
+  VOWEL_IN_STEM   = /^(#{CC})?#{V}/o                      # vowel in stem
+  #
+  # Porter stemmer in Ruby.
+  #
+  # This is the Porter stemming algorithm, ported to Ruby from the
+  # version coded up in Perl.  It's easy to follow against the rules
+  # in the original paper in:
+  #
+  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+  #   no. 3, pp 130-137,
+  #
+  # See also http://www.tartarus.org/~martin/PorterStemmer
+  #
+  # Send comments to raypereda@hotmail.com
+  #
+  def stem_porter(w = self.to_str.dup)
+    # make a copy of the given object and convert it to a string.
+    original_word = w
+    return w if w.length < 3
+    result = STEMMED[w]
+    return result if result
+    # now map initial y to Y so that the patterns never treat it as vowel
+    w[0] = 'Y' if w[0] == ?y
+    # Step 1a
+    if w =~ /(ss|i)es$/
+      w = $` + $1
+    elsif w =~ /([^s])s$/
+      w = $` + $1
+    end
+    # Step 1b
+    if w =~ /eed$/
+      w.chop! if $` =~ MGR0
+    elsif w =~ /(ed|ing)$/
+      stem = $`
+      if stem =~ VOWEL_IN_STEM
+        w = stem
+	case w
+        when /(at|bl|iz)$/             then w << "e"
+        when /([^aeiouylsz])\1$/       then w.chop!
+        when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
+        end
+      end
+    end
+    if w =~ /y$/
+      stem = $`
+      w = stem + "i" if stem =~ VOWEL_IN_STEM
+    end
+    # Step 2
+    if w =~ SUFFIX_1_REGEXP
+      stem = $`
+      suffix = $1
+      # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
+      if stem =~ MGR0
+        w = stem + STEP_2_LIST[suffix]
+      end
+    end
+    # Step 3
+    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
+      stem = $`
+      suffix = $1
+      if stem =~ MGR0
+        w = stem + STEP_3_LIST[suffix]
+      end
+    end
+    # Step 4
+    if w =~ SUFFIX_2_REGEXP
+      stem = $`
+      if stem =~ MGR1
+        w = stem
+      end
+    elsif w =~ /(s|t)(ion)$/
+      stem = $` + $1
+      if stem =~ MGR1
+        w = stem
+      end
+    end
+    #  Step 5
+    if w =~ /e$/
+      stem = $`
+      if (stem =~ MGR1) ||
+          (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
+        w = stem
+      end
+    end
+    if w =~ /ll$/ && w =~ MGR1
+      w.chop!
+    end
+    # and turn initial Y back to y
+    w[0] = 'y' if w[0] == ?Y
+    STEMMED[original_word] = w
+    w
+  end
+  module_function :stem_porter
+  #
+  # make the stem_porter the default stem method, just in case we
+  # feel like having multiple stemmers available later.
+  #
+  alias stem stem_porter
+end
+#
+# Make this script executable, and send it words on stdin, one per
+# line, and it will output the stemmed versions to stdout.
+#
+if $0 == __FILE__ then
+  class String
+    include Stemmable
+  end
+  # the String class, and any subclasses of it you might have, now know
+  # how to stem things.
+  $stdin.each do |word|
+    puts word.stem
+  end
+end