RubyGems - classifier-reborn - Versions diffs - 2.0.3 → 2.0.4 - Mend

classifier-reborn 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

checksums.yaml +4 -4
data/README.markdown +130 -14
data/data/stopwords/ca +126 -0
data/data/stopwords/cs +138 -0
data/data/stopwords/da +101 -0
data/data/stopwords/de +604 -0
data/data/stopwords/en +80 -0
data/data/stopwords/es +351 -0
data/data/stopwords/fi +747 -0
data/data/stopwords/fr +463 -0
data/data/stopwords/hu +35 -0
data/data/stopwords/it +430 -0
data/data/stopwords/nl +48 -0
data/data/stopwords/no +119 -0
data/data/stopwords/pl +93 -0
data/data/stopwords/pt +356 -0
data/data/stopwords/se +386 -0
data/data/stopwords/tr +114 -0
data/lib/classifier-reborn/bayes.rb +86 -16
data/lib/classifier-reborn/category_namer.rb +3 -1
data/lib/classifier-reborn/extensions/hasher.rb +25 -100
data/lib/classifier-reborn/extensions/vector.rb +0 -1
data/lib/classifier-reborn/lsi.rb +36 -25
data/lib/classifier-reborn/lsi/cached_content_node.rb +48 -0
data/lib/classifier-reborn/lsi/content_node.rb +27 -10
data/lib/classifier-reborn/lsi/summarizer.rb +2 -2
data/lib/classifier-reborn/version.rb +1 -1
metadata +37 -3

data/lib/classifier-reborn/lsi/cached_content_node.rb ADDED

@@ -0,0 +1,48 @@
+# Author::    Kelley Reynolds  (mailto:kelley@insidesystems.net)
+# Copyright:: Copyright (c) 2015 Kelley Reynolds
+# License::   LGPL
+module ClassifierReborn
+  # Subclass of ContentNode which caches the search_vector transpositions.
+  # Its great because its much faster for large indexes, but at the cost of more ram. Additionally,
+  # if you Marshal your classifier and want to keep the size down, you'll need to manually
+  # clear the cache before you dump
+  class CachedContentNode < ContentNode
+    module InstanceMethods
+      # Go through each item in this index and clear the cache
+      def clear_cache!
+        @items.each_value(&:clear_cache!)
+      end
+    end
+    def initialize( word_hash, *categories )
+      clear_cache!
+      super
+    end
+    def clear_cache!
+      @transposed_search_vector = nil
+    end
+    # Cache the transposed vector, it gets used a lot
+    def transposed_search_vector
+      @transposed_search_vector ||= super
+    end
+    # Clear the cache before we continue on
+    def raw_vector_with( word_list )
+      clear_cache!
+      super
+    end
+    # We don't want the cached_data here
+    def marshal_dump
+      [@lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash]
+    end
+    def marshal_load(array)
+      @lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash = array
+    end
+  end
+end

data/lib/classifier-reborn/lsi/content_node.rb CHANGED

@@ -18,6 +18,7 @@ module ClassifierReborn
     def initialize( word_hash, *categories )
       @categories = categories || []
       @word_hash = word_hash
+      @lsi_norm, @lsi_vector = nil
     end
     # Use this to fetch the appropriate search vector.
@@ -25,6 +26,11 @@ module ClassifierReborn
       @lsi_vector || @raw_vector
     end
+    # Method to access the transposed search vector
+    def transposed_search_vector
+      search_vector.col
+    end
     # Use this to fetch the appropriate search vector in normalized form.
     def search_norm
       @lsi_norm || @raw_norm
@@ -46,7 +52,7 @@ module ClassifierReborn
       # Perform the scaling transform and force floating point arithmetic
       if $GSL
         sum = 0.0
-        vec.collect{|v| sum += v}
+        vec.each {|v| sum += v }
         total_words = sum
       else
         total_words = vec.reduce(0, :+).to_f
@@ -55,7 +61,7 @@ module ClassifierReborn
       total_unique_words = 0
       if $GSL
-        vec.each { |word| total_unique_words += 1 if word != 0 }
+        vec.each { |word| total_unique_words += 1 if word != 0.0 }
       else
         total_unique_words = vec.count{ |word| word != 0 }
       end
@@ -64,20 +70,31 @@ module ClassifierReborn
       # then one word in it.
       if total_words > 1.0 && total_unique_words > 1
         weighted_total = 0.0
+        # Cache calculations, this takes too long on large indexes
+        cached_calcs = Hash.new { |hash, term|
+          hash[term] = (( term / total_words ) * Math.log( term / total_words ))
+        }
         vec.each do |term|
-          if ( term > 0 )
-            weighted_total += (( term / total_words ) * Math.log( term / total_words ))
-          end
+          weighted_total += cached_calcs[term] if term > 0.0
         end
-        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
+        # Cache calculations, this takes too long on large indexes
+        cached_calcs = Hash.new do |hash, val|
+          hash[val] = Math.log( val + 1 ) / -weighted_total
+        end
+        vec.collect! { |val|
+          cached_calcs[val]
+        }
       end
       if $GSL
-         @raw_norm   = vec.normalize
-         @raw_vector = vec
+        @raw_norm   = vec.normalize
+        @raw_vector = vec
       else
-         @raw_norm   = Vector[*vec].normalize
-         @raw_vector = Vector[*vec]
+        @raw_norm   = Vector[*vec].normalize
+        @raw_vector = Vector[*vec]
       end
     end

data/lib/classifier-reborn/lsi/summarizer.rb CHANGED

@@ -15,11 +15,11 @@ module ClassifierReborn
     end
     def split_sentences(str)
-      str.split /(\.|\!|\?)/ # TODO: make this less primitive
+      str.split(/(\.|\!|\?)/) # TODO: make this less primitive
     end
     def split_paragraphs(str)
-      str.split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
+      str.split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
     end
     def perform_lsi(chunks, count, separator)

data/lib/classifier-reborn/version.rb CHANGED

@@ -1,3 +1,3 @@
 module ClassifierReborn
-  VERSION = '2.0.3'
+  VERSION = '2.0.4'
 end

metadata CHANGED

@@ -1,15 +1,16 @@
 --- !ruby/object:Gem::Specification
 name: classifier-reborn
 version: !ruby/object:Gem::Version
-  version: 2.0.3
+  version: 2.0.4
 platform: ruby
 authors:
 - Lucas Carlson
 - Parker Moore
+- Chase Gilliam
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-12-23 00:00:00.000000000 Z
+date: 2015-10-31 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: fast-stemmer
@@ -53,10 +54,25 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: test-unit
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description:
 email:
 - lucas@rufy.com
 - parkrmoore@gmail.com
+- chase.gilliam@gmail.com
 executables:
 - bayes.rb
 - summarize.rb
@@ -69,6 +85,22 @@ files:
 - README.markdown
 - bin/bayes.rb
 - bin/summarize.rb
+- data/stopwords/ca
+- data/stopwords/cs
+- data/stopwords/da
+- data/stopwords/de
+- data/stopwords/en
+- data/stopwords/es
+- data/stopwords/fi
+- data/stopwords/fr
+- data/stopwords/hu
+- data/stopwords/it
+- data/stopwords/nl
+- data/stopwords/no
+- data/stopwords/pl
+- data/stopwords/pt
+- data/stopwords/se
+- data/stopwords/tr
 - lib/classifier-reborn.rb
 - lib/classifier-reborn/bayes.rb
 - lib/classifier-reborn/category_namer.rb
@@ -76,6 +108,7 @@ files:
 - lib/classifier-reborn/extensions/vector.rb
 - lib/classifier-reborn/extensions/vector_serialize.rb
 - lib/classifier-reborn/lsi.rb
+- lib/classifier-reborn/lsi/cached_content_node.rb
 - lib/classifier-reborn/lsi/content_node.rb
 - lib/classifier-reborn/lsi/summarizer.rb
 - lib/classifier-reborn/lsi/word_list.rb
@@ -101,8 +134,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.2.2
+rubygems_version: 2.4.8
 signing_key:
 specification_version: 2
 summary: A general classifier module to allow Bayesian and other types of classifications.
 test_files: []
+has_rdoc: true