luisparravicini-classifier 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile ADDED
@@ -0,0 +1,21 @@
+ require 'rubygems'
+ require 'rake'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |s|
+     s.name = "luisparravicini-classifier"
+     s.summary = "A general classifier module to allow Bayesian and other types of classifications."
+     s.description = "Bayesian classifier and others."
+     s.homepage = "http://github.com/yury/classifier"
+     s.author = "Yury Korolev"
+     s.email = "yury.korolev@gmail.com"
+
+     s.add_dependency "activesupport", ">= 2.2.2"
+     s.add_dependency "ruby-stemmer", ">= 0.5.1"
+   end
+ rescue LoadError
+   puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
+ end
+
+ Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each { |ext| load ext }
data/VERSION.yml ADDED
@@ -0,0 +1,5 @@
+ ---
+ :major: 1
+ :minor: 3
+ :patch: 7
+ :build:
data/lib/classifier.rb ADDED
@@ -0,0 +1,32 @@
+ #--
+ # Copyright (c) 2005 Lucas Carlson
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining
+ # a copy of this software and associated documentation files (the
+ # "Software"), to deal in the Software without restriction, including
+ # without limitation the rights to use, copy, modify, merge, publish,
+ # distribute, sublicense, and/or sell copies of the Software, and to
+ # permit persons to whom the Software is furnished to do so, subject to
+ # the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be
+ # included in all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #++
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
+ # License:: LGPL
+
+ require 'rubygems'
+ require 'activesupport'
+ require 'lingua/stemmer'
+ require 'classifier/base'
+ require 'classifier/bayes'
+ require 'classifier/lsi'
data/lib/classifier/base.rb ADDED
@@ -0,0 +1,55 @@
+
+ module Classifier
+   class Base
+
+     def initialize(options = {})
+       options.reverse_merge!(:language => 'en')
+       options.reverse_merge!(:encoding => 'UTF_8')
+
+       @options = options
+     end
+
+     def prepare_category_name val
+       val.to_s.gsub("_", " ").capitalize.intern
+     end
+
+     # Removes common punctuation symbols, returning a new string.
+     # E.g.,
+     #   without_punctuation("Hello (greeting's), with {braces} < >...?")
+     #   => "Hello greetings with braces "
+     def without_punctuation str
+       str.tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " ").tr("'\-", "")
+     end
+
+     # Returns a Hash of strings => ints. Each word in the string is stemmed,
+     # interned, and mapped to its frequency in the document.
+     def word_hash str
+       word_hash_for_words(str.gsub(/[^\w\s]/, "").split + str.gsub(/[\w]/, " ").split)
+     end
+
+     # Returns a word hash without extra punctuation or short symbols, just stemmed words.
+     def clean_word_hash str
+       word_hash_for_words str.gsub(/[^\w\s]/, "").split
+     end
+
+     private
+
+     def stemmer
+       @stemmer ||= Lingua::Stemmer.new(@options)
+     end
+
+     def word_hash_for_words(words)
+       d = Hash.new
+       skip_words = SkipWords.for(@options[:language])
+       words.each do |word|
+         word = word.mb_chars.downcase.to_s if word =~ /[\w]+/
+         key = stemmer.stem(word).intern
+         if word =~ /[^\w]/ || !skip_words.include?(word) && word.length > 2
+           d[key] ||= 0
+           d[key] += 1
+         end
+       end
+       return d
+     end
+   end
+ end
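
A quick illustration of the tokenizer above (a sketch, not part of the package): clean_word_hash lowercases, stems, and counts words, dropping stopwords and tokens shorter than three characters. The exact stems depend on the installed ruby-stemmer and the SkipWords list from classifier/stopwords, so the output shown is illustrative.

    require 'classifier'

    b = Classifier::Bayes.new :categories => ['Anything']
    b.clean_word_hash("The dogs are running and the dog runs")
    # illustrative => { :dog => 2, :run => 2 }
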
data/lib/classifier/bayes.rb ADDED
@@ -0,0 +1,136 @@
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
+ # License:: LGPL
+
+ require 'classifier/stopwords'
+
+ module Classifier
+
+   class Bayes < Classifier::Base
+
+     # The class can be created with one or more categories, each of which will be
+     # initialized and given a training method. E.g.,
+     #   b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam']
+     # You can also specify language and encoding parameters for the stemmer
+     # (default values: :language => 'en', :encoding => 'UTF_8'):
+     #   b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam'], :language => 'ru'
+     def initialize(options = {})
+       @categories = Hash.new
+       options.reverse_merge!(:categories => [])
+       options[:categories].each { |category| @categories[prepare_category_name(category)] = Hash.new }
+       @total_words = 0
+       super
+     end
+
+     #
+     # Provides a general training method for all categories specified in Bayes#new
+     # For example:
+     #   b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
+     #   b.train :this, "This text"
+     #   b.train "that", "That text"
+     #   b.train "The other", "The other text"
+     def train(category, text)
+       category = prepare_category_name(category)
+       word_hash(text).each do |word, count|
+         @categories[category][word] ||= 0
+         @categories[category][word] += count
+         @total_words += count
+       end
+     end
+
+     #
+     # Provides an untraining method for all categories specified in Bayes#new.
+     # Be very careful with this method.
+     #
+     # For example:
+     #   b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
+     #   b.train :this, "This text"
+     #   b.untrain :this, "This text"
+     def untrain(category, text)
+       category = prepare_category_name(category)
+       word_hash(text).each do |word, count|
+         if @total_words >= 0
+           orig = @categories[category][word] || 0
+           @categories[category][word] ||= 0
+           @categories[category][word] -= count
+           if @categories[category][word] <= 0
+             @categories[category].delete(word)
+             count = orig
+           end
+           @total_words -= count
+         end
+       end
+     end
+
+     #
+     # Returns the scores for the provided +text+ in each category. E.g.,
+     #   b.classifications "I hate bad words and you"
+     #   => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
+     # The largest of these scores (the one closest to 0) is the one picked out by #classify
+     def classifications(text)
+       score = Hash.new
+       @categories.each do |category, category_words|
+         score[category.to_s] = 0
+         total = category_words.values.sum
+         word_hash(text).each do |word, count|
+           s = category_words.has_key?(word) ? category_words[word] : 0.1
+           score[category.to_s] += Math.log(s / total.to_f)
+         end
+       end
+       return score
+     end
+
+     #
+     # Returns the classification of the provided +text+, which is one of the
+     # categories given in the initializer. E.g.,
+     #   b.classify "I hate bad words and you"
+     #   => 'Uninteresting'
+     def classify(text)
+       (classifications(text).sort_by { |a| -a[1] })[0][0]
+     end
+
+     #
+     # Provides training and untraining methods for the categories specified in Bayes#new
+     # For example:
+     #   b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
+     #   b.train_this "This text"
+     #   b.train_that "That text"
+     #   b.untrain_that "That text"
+     #   b.train_the_other "The other text"
+     def method_missing(name, *args)
+       category = prepare_category_name(name.to_s.gsub(/(un)?train_([\w]+)/, '\2'))
+       if @categories.has_key? category
+         args.each { |text| eval("#{$1}train(category, text)") }
+       elsif name.to_s =~ /(un)?train_([\w]+)/
+         raise StandardError, "No such category: #{category}"
+       else
+         super # raise StandardError, "No such method: #{name}"
+       end
+     end
+
+     #
+     # Provides a list of category names
+     # For example:
+     #   b.categories
+     #   => ["This", "That", "The other"]
+     def categories # :nodoc:
+       @categories.keys.collect { |c| c.to_s }
+     end
+
+     #
+     # Allows you to add categories to the classifier.
+     # For example:
+     #   b.add_category "Not spam"
+     #
+     # WARNING: Adding categories to a trained classifier will
+     # result in an undertrained category that will tend to match
+     # more criteria than the trained selective categories. In short,
+     # try to initialize your categories at initialization.
+     def add_category(category)
+       @categories[prepare_category_name(category)] = Hash.new
+     end
+
+     alias append_category add_category
+   end
+
+ end
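
A minimal end-to-end sketch of the Bayes API defined above (the scores and the exact classification depend on the stemmer and stopword list, so the outputs are illustrative):

    require 'classifier'

    b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting']
    b.train :interesting, "Here are some good words. I hope you love them."
    b.train :uninteresting, "Here are some bad words, I hate you."

    b.classifications "I hate bad words and you"
    # => a Hash of category => log score, e.g. {"Uninteresting"=>-12.69..., "Interesting"=>-18.42...}
    b.classify "I hate bad words and you"   # => 'Uninteresting'
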
data/lib/classifier/extensions/vector.rb ADDED
@@ -0,0 +1,107 @@
+ # Author:: Ernest Ellingson
+ # Copyright:: Copyright (c) 2005
+
+ # These are extensions to the std-lib 'matrix' to allow an all-Ruby SVD
+
+ require 'matrix'
+ require 'mathn'
+
+ # Conflicts with ActiveSupport
+ unless Array.new.respond_to?(:sum)
+   class Array
+     def sum
+       inject(0) { |sum, term| sum + term }.to_f
+     end
+   end
+ end
+
+ class Vector
+   def magnitude
+     sumsqs = 0.0
+     self.size.times do |i|
+       sumsqs += self[i] ** 2.0
+     end
+     Math.sqrt(sumsqs)
+   end
+   def normalize
+     nv = []
+     mag = self.magnitude
+     self.size.times do |i|
+       nv << (self[i] / mag)
+     end
+     Vector[*nv]
+   end
+ end
+
+ class Matrix
+   def Matrix.diag(s)
+     Matrix.diagonal(*s)
+   end
+
+   alias :trans :transpose
+
+   def SV_decomp(maxSweeps = 20)
+     if self.row_size >= self.column_size
+       q = self.trans * self
+     else
+       q = self * self.trans
+     end
+
+     qrot = q.dup
+     v = Matrix.identity(q.row_size)
+     azrot = nil
+     mzrot = nil
+     cnt = 0
+     s_old = nil
+     mu = nil
+
+     while true do
+       cnt += 1
+       for row in (0...qrot.row_size-1) do
+         for col in (1..qrot.row_size-1) do
+           next if row == col
+           h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
+           hcos = Math.cos(h)
+           hsin = Math.sin(h)
+           mzrot = Matrix.identity(qrot.row_size)
+           mzrot[row,row] = hcos
+           mzrot[row,col] = -hsin
+           mzrot[col,row] = hsin
+           mzrot[col,col] = hcos
+           qrot = mzrot.trans * qrot * mzrot
+           v = v * mzrot
+         end
+       end
+       s_old = qrot.dup if cnt == 1
+       sum_qrot = 0.0
+       if cnt > 1
+         qrot.row_size.times do |r|
+           sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
+         end
+         s_old = qrot.dup
+       end
+       break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
+     end # of do while true
+     s = []
+     qrot.row_size.times do |r|
+       s << Math.sqrt(qrot[r,r])
+     end
+     #puts "cnt = #{cnt}"
+     if self.row_size >= self.column_size
+       mu = self * v * Matrix.diagonal(*s).inverse
+       return [mu, v, s]
+     else
+       puts v.row_size
+       puts v.column_size
+       puts self.row_size
+       puts self.column_size
+       puts s.size
+
+       mu = (self.trans * v * Matrix.diagonal(*s).inverse)
+       return [mu, v, s]
+     end
+   end
+   def []=(i,j,val)
+     @rows[i][j] = val
+   end
+ end
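
A small sanity check for the pure-Ruby SV_decomp above (a sketch; it assumes this file is on the load path and GSL is not loaded). The method Jacobi-diagonalizes m.trans * m (or m * m.trans) and returns [u, v, s], so u * diag(s) * v.trans should reconstruct the input to within the 0.001 sweep tolerance:

    require 'classifier/extensions/vector'

    m = Matrix[[1, 0], [0, 2], [1, 1]]       # row_size >= column_size
    u, v, s = m.SV_decomp
    approx = u * Matrix.diag(s) * v.trans    # approximately equal to m
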
data/lib/classifier/extensions/vector_serialize.rb ADDED
@@ -0,0 +1,20 @@
+ module GSL
+
+   class Vector
+     def _dump(v)
+       Marshal.dump( self.to_a )
+     end
+
+     def self._load(arr)
+       arry = Marshal.load(arr)
+       return GSL::Vector.alloc(arry)
+     end
+
+   end
+
+   class Matrix
+     class <<self
+       alias :diag :diagonal
+     end
+   end
+ end
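
These hooks exist so GSL vectors survive Marshal round-trips, which the LSI class relies on when GSL is available. A sketch, assuming rb-gsl is installed:

    require 'gsl'
    require 'classifier/extensions/vector_serialize'

    v = GSL::Vector.alloc([1.0, 2.0, 3.0])
    restored = Marshal.load(Marshal.dump(v))  # round-trips via to_a
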
data/lib/classifier/lsi.rb ADDED
@@ -0,0 +1,330 @@
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
+ # Copyright:: Copyright (c) 2005 David Fayram II
+ # License:: LGPL
+
+ begin
+   raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
+
+   require 'gsl' # requires http://rb-gsl.rubyforge.org/
+   require 'classifier/extensions/vector_serialize'
+   $GSL = true
+
+ rescue LoadError
+   warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
+   require 'classifier/extensions/vector'
+ end
+
+ require 'classifier/lsi/word_list'
+ require 'classifier/lsi/content_node'
+ require 'classifier/lsi/summary'
+
+ module Classifier
+
+   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
+   # data based on underlying semantic relations. For more information on the algorithms used,
+   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
+   class LSI < Classifier::Base
+
+     attr_reader :word_list
+     attr_accessor :auto_rebuild
+
+     # Create a fresh index.
+     # If you want to call #build_index manually, use
+     #   Classifier::LSI.new :auto_rebuild => false
+     #
+     def initialize(options = {})
+       @auto_rebuild = true unless options[:auto_rebuild] == false
+       @word_list, @items = WordList.new, {}
+       @version, @built_at_version = 0, -1
+       super
+     end
+
+     # Returns true if the index needs to be rebuilt. The index needs
+     # to be built after all information is added, but before you start
+     # using it for search, classification and cluster detection.
+     def needs_rebuild?
+       (@items.keys.size > 1) && (@version != @built_at_version)
+     end
+
+     # Adds an item to the index. item is assumed to be a string, but
+     # any item may be indexed so long as it responds to #to_s or if
+     # you provide an optional block explaining how the indexer can
+     # fetch fresh string data. This optional block is passed the item,
+     # so the item may only be a reference to a URL or file name.
+     #
+     # For example:
+     #   lsi = Classifier::LSI.new
+     #   lsi.add_item "This is just plain text"
+     #   lsi.add_item("/home/me/filename.txt") { |x| File.read x }
+     #   ar = ActiveRecordObject.find( :all )
+     #   lsi.add_item ar, *ar.categories { |x| ar.content }
+     #
+     def add_item( item, *categories, &block )
+       clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
+       @items[item] = ContentNode.new(clean_word_hash, *categories)
+       @version += 1
+       build_index if @auto_rebuild
+     end
+
+     # A less flexible shorthand for add_item that assumes
+     # you are passing in a string with no categories. The item
+     # will be duck typed via to_s.
+     #
+     def <<( item )
+       add_item item
+     end
+
+     # Returns the categories for a given indexed item. You are free to add and remove
+     # items from this as you see fit. It does not invalidate an index to change its categories.
+     def categories_for(item)
+       return [] unless @items[item]
+       return @items[item].categories
+     end
+
+     # Removes an item from the database, if it is indexed.
+     #
+     def remove_item( item )
+       if @items.key? item
+         @items.delete item
+         @version += 1
+       end
+     end
+
+     # Returns an array of items that are indexed.
+     def items
+       @items.keys
+     end
+
+     # This function rebuilds the index if needs_rebuild? returns true.
+     # For very large document spaces, this indexing operation may take some
+     # time to complete, so it may be wise to place the operation in another
+     # thread.
+     #
+     # As a rule, indexing will be fairly swift on modern machines until
+     # you have well over 500 documents indexed, or have an incredibly diverse
+     # vocabulary for your documents.
+     #
+     # The optional parameter "cutoff" is a tuning parameter. When the index is
+     # built, a certain number of s-values are discarded from the system. The
+     # cutoff parameter tells the indexer how many of these values to keep.
+     # A value of 1 for cutoff means that no semantic analysis will take place,
+     # turning the LSI class into a simple vector search engine.
+     def build_index( cutoff=0.75 )
+       return unless needs_rebuild?
+       make_word_list
+
+       doc_list = @items.values
+       tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
+
+       if $GSL
+         tdm = GSL::Matrix.alloc(*tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+
+         ntdm.size[1].times do |col|
+           vec = GSL::Vector.alloc( ntdm.column(col) ).row
+           doc_list[col].lsi_vector = vec
+           doc_list[col].lsi_norm = vec.normalize
+         end
+       else
+         tdm = Matrix.rows(tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+
+         ntdm.row_size.times do |col|
+           doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
+           doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
+         end
+       end
+
+       @built_at_version = @version
+     end
+
+     # This method returns max_chunks entries, ordered by their average semantic rating.
+     # Essentially, the average distance of each entry from all other entries is calculated,
+     # and the highest are returned.
+     #
+     # This can be used to build a summary service, or to provide more information about
+     # your dataset's general content. For example, if you were to use categorize on the
+     # results of this data, you could gather information on what your dataset is generally
+     # about.
+     def highest_relative_content( max_chunks=10 )
+       return [] if needs_rebuild?
+
+       avg_density = Hash.new
+       @items.each_key { |item| avg_density[item] = proximity_array_for_content(item).inject(0.0) { |sum, pair| sum + pair[1] } }
+
+       avg_density.keys.sort_by { |item| avg_density[item] }.reverse[0..max_chunks-1]
+     end
+
+     # This function is the primitive that find_related and classify
+     # build upon. It returns an array of 2-element arrays. The first element
+     # of this array is a document, and the second is its "score", defining
+     # how "close" it is to other indexed items.
+     #
+     # These values are somewhat arbitrary, having to do with the vector space
+     # created by your content, so the magnitude is interpretable but not always
+     # meaningful between indexes.
+     #
+     # The parameter doc is the content to compare. If that content is not
+     # indexed, you can pass an optional block to define how to create the
+     # text data. See add_item for examples of how this works.
+     def proximity_array_for_content( doc, &block )
+       return [] if needs_rebuild?
+
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           next if @items[item].search_vector.blank? # not enough data
+           if $GSL
+             val = content_node.search_vector * @items[item].search_vector.col
+           else
+             val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
+           end
+           [item, val]
+         end
+       result.compact.sort_by { |x| x[1] }.reverse
+     end
+
+     # Similar to proximity_array_for_content, this function takes similar
+     # arguments and returns a similar array. However, it uses the normalized
+     # calculated vectors instead of their full versions. This is useful when
+     # you're trying to perform operations on content that is much smaller than
+     # the text you're working with. search uses this primitive.
+     def proximity_norms_for_content( doc, &block )
+       return [] if needs_rebuild?
+
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           next if @items[item].search_norm.blank? # not enough data
+           if $GSL
+             val = content_node.search_norm * @items[item].search_norm.col
+           else
+             val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
+           end
+           [item, val]
+         end
+       result.compact.sort_by { |x| x[1] }.reverse
+     end
+
+     # This function allows for text-based search of your index. Unlike other functions
+     # like find_related and classify, search only takes short strings. It will also ignore
+     # factors like repeated words. It is best for short, google-like search terms.
+     # A search will first prioritize lexical relationships, then semantic ones.
+     #
+     # While this may seem backwards compared to the other functions that LSI supports,
+     # it is actually the same algorithm, just applied on a smaller document.
+     def search( string, max_nearest=3 )
+       return [] if needs_rebuild?
+       carry = proximity_norms_for_content( string )
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
+
+     # This function takes content and finds other documents
+     # that are semantically "close", returning an array of documents sorted
+     # from most to least relevant.
+     # max_nearest specifies the number of documents to return. A value of
+     # 0 means that it returns all the indexed documents, sorted by relevance.
+     #
+     # This is particularly useful for identifying clusters in your document space.
+     # For example you may want to identify several "What's Related" items for weblog
+     # articles, or find paragraphs that relate to each other in an essay.
+     def find_related( doc, max_nearest=3, &block )
+       carry =
+         proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
+
+     # This function uses a voting system to categorize documents, based on
+     # the categories of other documents. It uses the same logic as the
+     # find_related function to find related documents, then returns the
+     # most obvious category from this list.
+     #
+     # cutoff signifies the fraction of documents to consider when classifying
+     # text. A cutoff of 1 means that every document in the index votes on
+     # what category the document is in. This may not always make sense.
+     #
+     def classify( doc, cutoff=0.30, &block )
+       icutoff = (@items.size * cutoff).round
+       carry = proximity_array_for_content( doc, &block )
+       carry = carry[0..icutoff-1]
+       votes = {}
+       carry.each do |pair|
+         categories = @items[pair[0]].categories
+         categories.each do |category|
+           votes[category] ||= 0.0
+           votes[category] += pair[1]
+         end
+       end
+
+       ranking = votes.keys.sort_by { |x| votes[x] }
+       return ranking[-1]
+     end
+
+     # Same as #classify but returns all matching categories; it is also more permissive in its default cutoff.
+     def classify_multiple( doc, cutoff=0.50, &block )
+       icutoff = (@items.size * cutoff).round
+       carry = proximity_array_for_content( doc, &block )
+       carry = carry[0..icutoff-1]
+       votes = {}
+       carry.each do |pair|
+         categories = @items[pair[0]].categories
+         categories.each do |category|
+           votes[category] ||= 0.0
+           votes[category] += pair[1]
+         end
+       end
+       votes.delete_if { |key, value| value < 1 }.keys.sort_by { |x| -votes[x] }
+     end
+
+     # Prototype, only works on indexed documents.
+     # I have no clue if this is going to work, but in theory
+     # it's supposed to.
+     def highest_ranked_stems( doc, count=3 )
+       raise "Requested stem ranking on non-indexed content!" unless @items[doc]
+       arr = node_for_content(doc).lsi_vector.to_a
+       top_n = arr.sort.reverse[0..count-1]
+       return top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
+     end
+
+     private
+     def build_reduced_matrix( matrix, cutoff=0.75 )
+       # TODO: Check that M>=N on these dimensions! Transpose helps assure this
+       u, v, s = matrix.SV_decomp
+
+       # TODO: Better than 75% term, please. :\
+       s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
+       s.size.times do |ord|
+         s[ord] = 0.0 if s[ord] < s_cutoff
+       end
+       # Reconstruct the term document matrix, only with reduced rank
+       u * ($GSL ? GSL::Matrix : Matrix).diag( s ) * v.trans
+     end
+
+     def node_for_content(item, &block)
+       if @items[item]
+         return @items[item]
+       else
+         clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
+
+         cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
+
+         unless needs_rebuild?
+           cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
+         end
+       end
+
+       return cn
+     end
+
+     def make_word_list
+       @word_list = WordList.new
+       @items.each_value do |node|
+         node.word_hash.each_key { |key| @word_list.add_word key }
+       end
+     end
+
+   end
+ end
+
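
Putting the LSI API together, a typical indexing session might look like the following sketch (results are illustrative; with rb-gsl installed the same code runs on GSL vectors):

    require 'classifier'

    lsi = Classifier::LSI.new
    lsi.add_item "This text deals with dogs. Dogs.", :dog
    lsi.add_item "This text involves dogs too. Dogs!", :dog
    lsi.add_item "This text revolves around cats. Cats.", :cat
    lsi.add_item "This text also involves cats. Cats!", :cat

    lsi.search("dog", 3)                          # the three items nearest to "dog"
    lsi.find_related("This text deals with dogs. Dogs.", 2)
    lsi.classify "This text is also about dogs!"  # illustrative => :dog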