RubyGems - sclust - Versions diffs - 1.0.0 - Mend

sclust 1.0.0

Files changed (7) hide show

data/lib/sclust/cluster.rb ADDED Viewed

@@ -0,0 +1,197 @@
+require 'rubygems'
+require 'log4r'
+module SClust
+class CosineDistance
+    # Given two vectors, compute the distance
+    def self.distance(a,b)
+        acc1 = 0.0
+        acc2 = 0.0
+        acc3 = 0.0
+        0.upto(a.length-1) do |i|
+            acc1 += a[i]*b[i]
+            acc2 *= a[i]*a[i]
+            acc3 *= b[i]*b[i]
+        end
+        1 - ( acc1 / (Math.sqrt(acc2) * Math.sqrt(acc3)) )
+    end
+end
+class ClusterPoint
+    attr_reader :terms, :values, :cluster, :source_object
+    attr_writer :cluster, :source_object
+    # Initialize the ClusterPoint with a list of terms (labels, objects, whatever) and numeric values.
+    def initialize(terms, values, source_object = nil)
+      @terms   = terms
+      @values  = values
+      @cluster = nil
+      @source_object = source_object
+    end
+    def distance(clusterPoint)
+        CosineDistance.distance(@values, clusterPoint.values)
+    end
+    # Add each item in the cluster point to this cluster point adjusting the values per the given weight.
+    # Weght is a value from 0.0 - 1.0, inclusive. A value of 1 means that this clusterPoint is 100% assigned to
+    # this cluster point while a weight value of 0 will have no effect.
+    def add(clusterPoint, weight)
+        0.upto(@values.length-1) { |i| @values[i] = ( @values[i] * (1-weight)) + (clusterPoint.values[i] * weight) }
+        # Validation code
+        #0.upto(@values.length-1) do |i|
+        #    if ( @values[i].nan? || ! @values[i].finite? )
+        #        throw Exception.new("Cluster has invalid number #{@values[i]}")
+        #    end
+        #end
+    end
+    # Similar to add, but subtract.
+    def sub(clusterPoint, weight)
+        0.upto(@values.length-1) { |i| @values[i] = ( @values[i] - (clusterPoint.values[i] * weight) ) / (1 - weight) }
+        # Validation code
+        #0.upto(@values.length-1) do |i|
+        #    if ( @values[i].nan? || ! @values[i].finite? )
+        #        throw Exception.new("Cluster has invalid number #{@values[i]} w:#{weight} and #{clusterPoint.values[i]}")
+        #    end
+        #end
+    end
+    def get_max_terms(n=3)
+        values = {}
+        @terms.length.times do |i|
+            t = @terms[i]
+            v = @values[i]
+            values[v] = [] unless values.has_key?(v)
+            values[v] << t
+        end
+        vlist = values.keys.sort { |x,y|  ( x > y ) ? -1 : 1 }
+        result = []
+        n = vlist.length if ( n > vlist.length )
+        n.times { |i| result += values[vlist[i]] }
+        result.slice(0,n)
+    end
+    def get_term_value(term)
+        i=0
+        catch(:found) do
+            @terms.each do |t|
+                throw :found if ( t == term )
+                i+=1
+            end
+        end
+        @values[i]
+    end
+end
+class Cluster
+    attr_reader :center, :size
+    def initialize(centerPoint)
+        @fixed      = false
+        @center     = centerPoint.clone
+        @size       = 1
+    end
+    def +(point)
+        point.cluster = self
+        @size+=1
+        @center.add(point, 1.0/@size.to_f)
+    end
+    def -(point)
+        point.cluster = nil
+        @center.sub(point, 1.0/@size.to_f)
+        @size-=1
+    end
+    def get_max_terms(n=3)
+        @center.get_max_terms(n)
+    end
+end
+class Clusterer
+    attr_reader :clusters, :points, :cluster_count, :iterations, :logger
+    attr_writer :clusters, :points, :cluster_count, :iterations
+    # Optionally takes a notifier.
+    def initialize(points)
+        @iterations    = 2
+        @cluster_count = 10
+        @points        = points
+        @clusters      = []
+        @logger        = Log4r::Logger.new('Clusterer')
+        # Randomly select a few starting documents.
+        @cluster_count.times { @clusters << Cluster.new(@points[rand(points.length)]) }
+    end
+    def +(point)
+        @points << point
+    end
+    def each_cluster(&c)
+        @clusters.each { |cluster| yield cluster }
+    end
+    def assign_all_points
+        @points.each do |pt|
+            @logger.debug("Assigning point #{pt}.")
+            min_cluster = @clusters[0]
+            min_dst = min_cluster.center.distance(pt)
+            @clusters.each do |cluster|
+                tmp_distance = cluster.center.distance(pt)
+                if ( tmp_distance < min_dst )
+                    min_cluster = cluster
+                    min_dst = tmp_distance
+                end
+            end
+            pt.cluster - pt if pt.cluster
+            min_cluster + pt
+        end
+    end
+  def cluster
+      iterations.times do |i|
+          @logger.info("Starting iteration #{i+1} of #{iterations}.")
+          assign_all_points
+      end
+  end
+end
+end

data/lib/sclust/doc.rb ADDED Viewed

@@ -0,0 +1,92 @@
+module SClust
+# Filters a document term
+class DocumentTermFilter
+    # Return nil if the term should be excluded. Otherwise the version of the term
+    # that should be included is returned.
+    def filter(term)
+        if ( term.nil? )
+            nil
+        elsif (term.size < 2)
+            nil
+        elsif ( term =~ /^[\d\.]+$/ )
+            nil
+        else
+            term.downcase!
+        end
+    end
+end
+class NullFilter
+    def filter(term)
+        term
+    end
+end
+class Document
+    attr_reader :terms, :userDate, :filter
+    # Takes { :userData, :ngrams => [1,2,3], :filter }
+    def initialize(text, opts={})
+        @text = text
+        @userData = opts[:userData]
+        opts[:ngramrange] ||= [ 1, 2, 3 ]
+        opts[:filter] ||= DocumentTermFilter.new()
+        word_arr = text.split(/[ ,\.\t!\?\(\)\{\}\[\]\t\r\n]+/m)
+        @terms = Hash.new(0)
+        # Array of counts of grams built.
+        builtGramCounts = []
+        # Build a set of n-grams from our requested ngram range.
+        opts[:ngrams].each do |n|
+            builtGramCounts[n] = 0
+            # For each word in our list...
+            0.upto(word_arr.length-1) do |j|
+                if ( n + j < word_arr.length )
+                    term = word_arr[j]
+                    (n-1).times { |ngram| term += " #{word_arr[j+ngram+1]}" }
+                end
+                term = opts[:filter].filter(term)
+                @terms[term] += 1.0 if term
+                builtGramCounts[n] += 1
+            end
+        end
+        @terms.each { |k,v| @terms[k] /= @terms.length }
+    end
+    def term_frequency(term)
+        @terms[term]
+    end
+    alias tf term_frequency
+    def each_term(&call)
+        terms.each_key { |k| yield k }
+    end
+    def has_term?(term)
+        @terms.has_key?(term)
+    end
+end
+end

data/lib/sclust/doccluster.rb ADDED Viewed

@@ -0,0 +1,39 @@
+require 'sclust/doc'
+require 'sclust/doccol'
+require 'sclust/cluster'
+module SClust
+# A document clusterer that overrides the + operator
+# to allow for adding Document objects.
+class DocumentClusterer < Clusterer
+    def initialize(documentCollection)
+        # List of all terms
+        term_list = documentCollection.terms.keys.sort
+        point_list = []
+        documentCollection.doclist.each do |doc|
+            doc_terms       = [] # Sorted list of terms.
+            doc_term_values = [] # Corosponding values.
+            # Buid a BIG term vector list for this document.
+            term_list.each do |term|
+                doc_terms << term
+                doc_term_values << doc.tf(term) - documentCollection.idf(term)
+            end
+            # def initialize(terms, values, source_object = nil)
+            point_list << ClusterPoint.new(doc_terms, doc_term_values, doc)
+        end
+        super(point_list)
+    end
+end
+end

data/lib/sclust/doccol.rb ADDED Viewed

@@ -0,0 +1,75 @@
+require 'rubygems'
+require 'log4r'
+module SClust
+class DocumentCollection
+    # terms - a hash were they keys are the terms in the documents and the values stored are the number of documents contiaining the term.
+    attr_reader :terms
+    # A list of documents
+    attr_reader :doclist
+    # Log4r::Logger for this document collection.
+    attr_reader :logger
+    def initialize()
+        @logger = Log4r::Logger.new("SClust::DocumentCollection")
+        @terms   = Hash.new(0)
+        @doclist = []
+    end
+    # Add a document to the collection and adjust the @terms attribute to store any new terms in the document.
+    # The document is also added to the @doclist attribute.
+    def +(d)
+        d.each_term do |term|
+          @terms[term] += 1.0
+        end
+        @doclist<<d
+        @logger.info("There are #{@doclist.size} documents and #{@terms.size} terms.")
+        self
+    end
+    def drop_terms(min_frequency=0.10, max_frequency=0.80)
+        min_docs = @doclist.length * min_frequency
+        max_docs = @doclist.length * max_frequency
+        @logger.info("Analyzing #{@terms.length} terms for removal.")
+        @logger.info("Upper/lower boundary are #{max_frequency}/#{min_frequency}% document frequency or #{max_docs}/#{min_docs} documents.")
+        remove_list = []
+        @terms.each do |term, frequency|
+            if ( frequency < min_docs or frequency > max_docs )
+                @logger.info("Removing term #{term} occuring in #{frequency} documents out of #{@doclist.length}")
+                @terms.delete(term)
+                remove_list << term
+            end
+        end
+        @logger.info("Removed #{remove_list.length} of #{@terms.length + remove_list.length} terms. Updating #{doclist.length} documents.")
+        @doclist.each do |doc|
+            remove_list.each do |term|
+                doc.terms.delete(term)
+            end
+        end
+    end
+    def inverse_document_frequency(term)
+        Math.log( @terms.length / @terms[term] )
+    end
+    alias idf inverse_document_frequency
+    def each_term(&c)
+        @terms.each_key { |k| yield k }
+    end
+end
+end

data/tests/clustertest.rb ADDED Viewed

@@ -0,0 +1,51 @@
+require 'test/unit'
+require 'sclust/doccluster'
+class ClusterTest < Test::Unit::TestCase
+    def setup()
+        @dc = SClust::DocumentCollection.new()
+        filter = SClust::NullFilter.new()
+        d1 = SClust::Document.new("a b c d d e a q a b", :filter=>filter, :ngrams=>[1])
+        d2 = SClust::Document.new("a b d e a", :filter=>filter, :ngrams=>[1])
+        d3 = SClust::Document.new("bob", :filter=>filter, :ngrams=>[1])
+        d4 = SClust::Document.new("frank a", :filter=>filter, :ngrams=>[1])
+        @dc + d1
+        @dc + d2
+        @dc + d3
+        @dc + d4
+    end
+    def teardown()
+    end
+    def test_makecluster()
+        c = SClust::DocumentClusterer.new(@dc)
+        c.cluster
+        c.each_cluster do |cl|
+            max = 0
+            0.upto(cl.center.terms.length - 1) do |i|
+                term  = cl.center.terms[i]
+                value = cl.center.values[i]
+                max = i if ( cl.center.values[i] > cl.center.values[max] )
+            end
+            puts("Cluster: #{cl.center.terms[max]} #{cl.center.values[max]}")
+            cl.center.get_max_terms(3).each do |t|
+                puts("Got Term: #{t} with value #{cl.center.get_term_value(t)}")
+            end
+            assert(cl.center.values[max] == cl.center.get_term_value(cl.center.get_max_terms(1)[0]), "Max value was not found.")
+        end
+    end
+end

data/tests/test001.rb ADDED Viewed

@@ -0,0 +1,49 @@
+require 'sclust/doc'
+require 'sclust/doccol'
+require 'test/unit'
+class DocTests < Test::Unit::TestCase
+  #def setup
+  #end
+  #def teardown
+  #end
+  def test_builddoc
+    d = SClust::Document.new("hi, this is a nice doc! Yup. Oh? A very nice doc, indeed.")
+    d.terms.each do |k,v|
+      assert(k != ".", "Period found")
+      assert(k != "", "Empty term found")
+      #puts("#{k}=#{v}")
+    end
+  end
+end
+class DocCollectionTests < Test::Unit::TestCase
+  def test_collectionadd()
+    dc = SClust::DocumentCollection.new()
+    d1 = SClust::Document.new("a b c d d e a q a b")
+    d2 = SClust::Document.new("a b d e a")
+    d3 = SClust::Document.new("bob")
+    d4 = SClust::Document.new("frank a")
+    dc + d1
+    dc + d2
+    dc + d3
+    dc + d4
+    dc.terms.each do |k,v|
+      if k == "a"
+        assert(v == 3, "A appers in 3 documents out of 4.")
+        assert(dc.idf("a") > 2.2, "Known value for a")
+        assert(dc.idf("a") < 2.3, "Known value for a")
+      end
+    end
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,80 @@
+--- !ruby/object:Gem::Specification
+name: sclust
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+platform: ruby
+authors:
+  - Sam Baskinger
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-12-01 00:00:00 -06:00
+default_executable:
+dependencies:
+  - !ruby/object:Gem::Dependency
+    name: log4r
+    type: :runtime
+    version_requirement:
+    version_requirements: !ruby/object:Gem::Requirement
+      requirements:
+        - - ">="
+          - !ruby/object:Gem::Version
+            version: 1.0.5
+      version:
+  - !ruby/object:Gem::Dependency
+    name: mechanize
+    type: :runtime
+    version_requirement:
+    version_requirements: !ruby/object:Gem::Requirement
+      requirements:
+        - - ">="
+          - !ruby/object:Gem::Version
+            version: 0.9.3
+      version:
+description: A k-mean text clustering library for ruby.
+email: basking2@rubyforge.org.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+  - lib/sclust/cluster.rb
+  - lib/sclust/doc.rb
+  - lib/sclust/doccluster.rb
+  - lib/sclust/doccol.rb
+has_rdoc: true
+homepage: http://sclust.rubyforge.org
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+  - lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.6.8
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  version:
+requirements: []
+rubyforge_project: http://sclust.rubyforge.org/
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: k-mean clustering.
+test_files:
+  - tests/clustertest.rb
+  - tests/test001.rb
+  - tests/clustertest.rb
+  - tests/test001.rb