RubyGems - classifier - Versions diffs - 1.3.5 → 1.4.0 - Mend

classifier 1.3.5 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

checksums.yaml +4 -4
data/lib/classifier/bayes.rb +128 -120
data/lib/classifier/extensions/string.rb +1 -1
data/lib/classifier/extensions/vector.rb +66 -72
data/lib/classifier/extensions/vector_serialize.rb +6 -8
data/lib/classifier/extensions/word_hash.rb +108 -114
data/lib/classifier/lsi/content_node.rb +25 -23
data/lib/classifier/lsi/summary.rb +20 -20
data/lib/classifier/lsi/word_list.rb +1 -2
data/lib/classifier/lsi.rb +112 -89
data/lib/classifier.rb +1 -0
data/test/test_helper.rb +5 -0
metadata +7 -21

data/lib/classifier/extensions/word_hash.rb CHANGED

@@ -2,135 +2,129 @@
 # Copyright:: Copyright (c) 2005 Lucas Carlson
 # License::   LGPL
-require "set"
 # These are extensions to the String class to provide convenience
 # methods for the Classifier package.
 class String
   # Removes common punctuation symbols, returning a new string.
   # E.g.,
   #   "Hello (greeting's), with {braces} < >...?".without_punctuation
   #   => "Hello  greetings   with  braces         "
   def without_punctuation
-    tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
+    tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', ' ').tr("'\-", '')
   end
   # Return a Hash of strings => ints. Each word in the string is stemmed,
   # interned, and indexes to its frequency in the document.
-	def word_hash
-		word_hash = clean_word_hash()
-		symbol_hash = word_hash_for_symbols(gsub(/[\w]/," ").split)
-		return word_hash.merge(symbol_hash)
-	end
-	# Return a word hash without extra punctuation or short symbols, just stemmed words
-	def clean_word_hash
-		word_hash_for_words gsub(/[^\w\s]/,"").split
-	end
+  def word_hash
+    word_hash = clean_word_hash
+    symbol_hash = word_hash_for_symbols(gsub(/\w/, ' ').split)
+    word_hash.merge(symbol_hash)
+  end
-	private
+  # Return a word hash without extra punctuation or short symbols, just stemmed words
+  def clean_word_hash
+    word_hash_for_words gsub(/[^\w\s]/, '').split
+  end
-	def word_hash_for_words(words)
-		d = Hash.new(0)
-		words.each do |word|
-			word.downcase!
-			if ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
-				d[word.stem.intern] += 1
-			end
-		end
-		return d
-	end
+  private
+  def word_hash_for_words(words)
+    d = Hash.new(0)
+    words.each do |word|
+      word.downcase!
+      d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
+    end
+    d
+  end
-	def word_hash_for_symbols(words)
-		d = Hash.new(0)
-		words.each do |word|
-			d[word.intern] += 1
-		end
-		return d
-	end
+  def word_hash_for_symbols(words)
+    d = Hash.new(0)
+    words.each do |word|
+      d[word.intern] += 1
+    end
+    d
+  end
-	CORPUS_SKIP_WORDS = Set.new([
-      "a",
-      "again",
-      "all",
-      "along",
-      "are",
-      "also",
-      "an",
-      "and",
-      "as",
-      "at",
-      "but",
-      "by",
-      "came",
-      "can",
-      "cant",
-      "couldnt",
-      "did",
-      "didn",
-      "didnt",
-      "do",
-      "doesnt",
-      "dont",
-      "ever",
-      "first",
-      "from",
-      "have",
-      "her",
-      "here",
-      "him",
-      "how",
-      "i",
-      "if",
-      "in",
-      "into",
-      "is",
-      "isnt",
-      "it",
-      "itll",
-      "just",
-      "last",
-      "least",
-      "like",
-      "most",
-      "my",
-      "new",
-      "no",
-      "not",
-      "now",
-      "of",
-      "on",
-      "or",
-      "should",
-      "sinc",
-      "so",
-      "some",
-      "th",
-      "than",
-      "this",
-      "that",
-      "the",
-      "their",
-      "then",
-      "those",
-      "to",
-      "told",
-      "too",
-      "true",
-      "try",
-      "until",
-      "url",
-      "us",
-      "were",
-      "when",
-      "whether",
-      "while",
-      "with",
-      "within",
-      "yes",
-      "you",
-      "youll",
-      ])
+  CORPUS_SKIP_WORDS = Set.new(%w[
+                                a
+                                again
+                                all
+                                along
+                                are
+                                also
+                                an
+                                and
+                                as
+                                at
+                                but
+                                by
+                                came
+                                can
+                                cant
+                                couldnt
+                                did
+                                didn
+                                didnt
+                                do
+                                doesnt
+                                dont
+                                ever
+                                first
+                                from
+                                have
+                                her
+                                here
+                                him
+                                how
+                                i
+                                if
+                                in
+                                into
+                                is
+                                isnt
+                                it
+                                itll
+                                just
+                                last
+                                least
+                                like
+                                most
+                                my
+                                new
+                                no
+                                not
+                                now
+                                of
+                                on
+                                or
+                                should
+                                sinc
+                                so
+                                some
+                                th
+                                than
+                                this
+                                that
+                                the
+                                their
+                                then
+                                those
+                                to
+                                told
+                                too
+                                true
+                                try
+                                until
+                                url
+                                us
+                                were
+                                when
+                                whether
+                                while
+                                with
+                                within
+                                yes
+                                you
+                                youll
+                              ])
 end

data/lib/classifier/lsi/content_node.rb CHANGED

@@ -3,21 +3,21 @@
 # License::   LGPL
 module Classifier
-# This is an internal data structure class for the LSI node. Save for
-# raw_vector_with, it should be fairly straightforward to understand.
-# You should never have to use it directly.
+  # This is an internal data structure class for the LSI node. Save for
+  # raw_vector_with, it should be fairly straightforward to understand.
+  # You should never have to use it directly.
   class ContentNode
     attr_accessor :raw_vector, :raw_norm,
                   :lsi_vector, :lsi_norm,
                   :categories
     attr_reader :word_hash
     # If text_proc is not specified, the source will be duck-typed
     # via source.to_s
-    def initialize( word_hash, *categories )
+    def initialize(word_frequencies, *categories)
       @categories = categories || []
-      @word_hash = word_hash
+      @word_hash = word_frequencies
     end
     # Use this to fetch the appropriate search vector.
@@ -32,41 +32,43 @@ module Classifier
     # Creates the raw vector out of word_hash using word_list as the
     # key for mapping the vector space.
-    def raw_vector_with( word_list )
-      if $GSL
-         vec = GSL::Vector.alloc(word_list.size)
-      else
-         vec = Array.new(word_list.size, 0)
-      end
+    def raw_vector_with(word_list)
+      vec = if $GSL
+              GSL::Vector.alloc(word_list.size)
+            else
+              Array.new(word_list.size, 0)
+            end
       @word_hash.each_key do |word|
         vec[word_list[word]] = @word_hash[word] if word_list[word]
       end
       # Perform the scaling transform
-      total_words = vec.sum
+      total_words = $GSL ? vec.sum : vec.sum_with_identity
       # Perform first-order association transform if this vector has more
       # than one word in it.
       if total_words > 1.0
         weighted_total = 0.0
         vec.each do |term|
-          if ( term > 0 )
-            weighted_total += (( term / total_words ) * Math.log( term / total_words ))
-          end
+          next unless term.positive?
+          next if total_words.zero?
+          term_over_total = term / total_words
+          val = term_over_total * Math.log(term_over_total)
+          weighted_total += val unless val.nan?
         end
-        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
+        vec = vec.collect { |val| Math.log(val + 1) / -weighted_total }
       end
       if $GSL
-         @raw_norm   = vec.normalize
-         @raw_vector = vec
+        @raw_norm   = vec.normalize
+        @raw_vector = vec
       else
-         @raw_norm   = Vector[*vec].normalize
-         @raw_vector = Vector[*vec]
+        @raw_norm   = Vector[*vec].normalize
+        @raw_vector = Vector[*vec]
       end
     end
   end
 end

data/lib/classifier/lsi/summary.rb CHANGED

@@ -3,29 +3,29 @@
 # License::   LGPL
 class String
-   def summary( count=10, separator=" [...] " )
-      perform_lsi split_sentences, count, separator
-   end
+  def summary(count = 10, separator = ' [...] ')
+    perform_lsi split_sentences, count, separator
+  end
-   def paragraph_summary( count=1, separator=" [...] " )
-      perform_lsi split_paragraphs, count, separator
-   end
+  def paragraph_summary(count = 1, separator = ' [...] ')
+    perform_lsi split_paragraphs, count, separator
+  end
-   def split_sentences
-      split /(\.|\!|\?)/ # TODO: make this less primitive
-   end
+  def split_sentences
+    split(/(\.|!|\?)/) # TODO: make this less primitive
+  end
-   def split_paragraphs
-      split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
-   end
+  def split_paragraphs
+    split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
+  end
-   private
+  private
-   def perform_lsi(chunks, count, separator)
-      lsi = Classifier::LSI.new :auto_rebuild => false
-      chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
-      lsi.build_index
-      summaries = lsi.highest_relative_content count
-      return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
-   end
+  def perform_lsi(chunks, count, separator)
+    lsi = Classifier::LSI.new auto_rebuild: false
+    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
+    lsi.build_index
+    summaries = lsi.highest_relative_content count
+    summaries.select { |chunk| summaries.include?(chunk) }.map(&:strip).join(separator)
+  end
 end

data/lib/classifier/lsi/word_list.rb CHANGED

@@ -8,7 +8,7 @@ module Classifier
   class WordList
     def initialize
-      @location_table = Hash.new
+      @location_table = {}
     end
     # Adds a word (if it is new) and assigns it a unique dimension.
@@ -31,6 +31,5 @@ module Classifier
     def size
       @location_table.size
     end
   end
 end