sclust 1.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,294 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'rubygems'
26
+ require 'log4r'
27
+ require 'sclust/util/word'
28
+
29
+ module SClust
30
+ module KMean
31
# Utility class providing a cosine-based distance metric for sparse
# term vectors (hash-like objects whose missing keys default to 0).
class CosineDistance

    # Compute the cosine distance (1 - cosine similarity) between the
    # two vectors +a+ and +b+.
    #
    # Returns nil when the distance is exactly 1 (no shared weight),
    # signalling "no detectable relationship" to callers.
    def self.distance(a, b)

        dot_product = 0.0
        norm_a_sq   = 0.0
        norm_b_sq   = 0.0

        # merge(b) yields the union of dimensions present in either vector.
        a.merge(b).each_key do |term|
            dot_product += a[term] * b[term]
            norm_a_sq   += a[term] * a[term]
            norm_b_sq   += b[term] * b[term]
        end

        d = 1 - (dot_product / (Math.sqrt(norm_a_sq) * Math.sqrt(norm_b_sq)))

        # Return nil if we detect no distance between documents.
        d == 1 ? nil : d
    end
end
52
+
53
# A point in the clustering space: wraps a sparse term vector plus an
# optional opaque back-reference to the object (e.g. document) that
# produced it.
class ClusterPoint

    attr_reader :values, :cluster, :source_object
    attr_writer :cluster, :source_object

    # Initialize the ClusterPoint with a SparseVector or SparseLabeledVector.
    # source_object is stored but never interpreted here.
    def initialize(sparse_vector, source_object = nil)
        @values        = sparse_vector
        @cluster       = nil
        @source_object = source_object
    end

    # Cosine distance to another ClusterPoint; may be nil when
    # CosineDistance detects no relationship between the vectors.
    def distance(clusterPoint)
        CosineDistance.distance(@values, clusterPoint.values)
    end

    # Add each item in the cluster point to this cluster point adjusting the values per the given weight.
    # Weight is a value from 0.0 - 1.0, inclusive. A value of 1 means that this clusterPoint is 100% assigned to
    # this cluster point while a weight value of 0 will have no effect.
    def add(clusterPoint, weight)
        @values.merge(clusterPoint.values).keys.each { |i| @values[i] = ( @values[i] * (1-weight) ) + (clusterPoint.values[i] * weight)}
    end

    # Similar to add, but subtract.
    # NOTE(review): weight == 1.0 divides by zero here — callers must not pass 1.0.
    def sub(clusterPoint, weight)
        @values.merge(clusterPoint.values).keys.each { |i| @values[i] = ( @values[i] - (clusterPoint.values[i] * weight) ) / ( 1 - weight ) }
    end

    # Return the top n words (as SClust::Util::Word), highest weight first.
    # Return all the terms sorted if n is 0.
    def get_max_terms(n=3)

        # Bucket terms by weight so equal-weight terms stay together.
        values_to_terms = {}

        @values.each do |t, v|
            values_to_terms[v] ||= []
            values_to_terms[v] << SClust::Util::Word.new(t, v, {:stemmed_word => t})
        end

        sorted_values = values_to_terms.keys.sort { |x,y| y <=> x }

        # FIX: honour the documented "n == 0 means all terms" contract. The
        # early-exit below checks result.length >= n, which for n == 0 fired
        # on the first bucket and the final trim returned [] instead of all.
        n = @values.length if n == 0

        result = []

        catch(:haveEnough) do

            sorted_values.each do |value|

                result += values_to_terms[value]

                throw :haveEnough if result.length >= n

            end

        end

        # Trim our results to exactly the requested size.
        result[0...n]

    end

    # Raw stored weight for +term+ (sparse default applies to unknown terms).
    def get_term_value(term)
        @values[term]
    end

end
120
+
121
# A single cluster: a moving center point plus a member count.
class Cluster

    attr_reader :center, :size

    # Seed the cluster from a starting point. The center is cloned so
    # later center adjustments never mutate the caller's point.
    def initialize(centerPoint)
        @fixed  = false
        @center = centerPoint.clone
        @size   = 1
    end

    # Claim +point+ for this cluster and pull the center toward it,
    # weighted by the new membership count.
    def +(point)
        point.cluster = self

        @size += 1

        @center.add(point, 1.0 / @size.to_f)
    end

    # Release +point+ from this cluster and push the center away from
    # its contribution, then shrink the membership count.
    def -(point)
        point.cluster = nil

        @center.sub(point, 1.0 / @size.to_f)

        @size -= 1
    end

    # Top n terms describing this cluster, delegated to the center point.
    def get_max_terms(n = 3)
        @center.get_max_terms(n)
    end

end
152
+
153
# K-mean clusterer: holds a list of ClusterPoints and a set of Clusters,
# and repeatedly reassigns points to their nearest (cosine) center.
class Clusterer

    attr_reader :clusters, :points, :cluster_count, :iterations, :logger
    attr_writer :clusters, :points, :cluster_count, :iterations, :logger

    # Optionally takes the initial list of ClusterPoint objects.
    def initialize(points=[])
        @iterations    = 3
        @cluster_count = 0
        @points        = points
        @clusters      = []
        @logger        = Log4r::Logger.new('Clusterer')
        @logger.add('default')
    end

    # Drop all existing clusters and recreate them using the given method.
    # If the given method is an integer, then that many clusters are created
    # and the centers are randomly chosen from the documents contained in the @points attribute.
    # If it is "crp", then the Chinese Restaurant Process is used, considering each document
    # and creating a cluster with that document as the center stochastically and proportionally
    # the number of documents already considered.
    def topics=(process)

        @clusters = []

        if ( process.is_a?(Integer))
            @logger.info("Building cluster of constant cluster count #{process}.")
            @cluster_count = process
            @cluster_count.times { @clusters << Cluster.new(@points[rand(points.length)]) }

        elsif(process.is_a?(String))
            if ( process == "crp" )

                @logger.info("Building clusters using CRP.")

                # FIX: the counter was previously reset to 0 *inside* the loop,
                # so @cluster_count (and the log line below) always reported 0 or 1.
                @cluster_count = 0

                1.upto(@points.length) do |i|

                    # Seed a new cluster with probability 1/i — proportional to
                    # the number of documents already considered.
                    if ( rand(i) == 0 )
                        @clusters << Cluster.new(@points[i-1])
                        @cluster_count += 1
                    end

                end

                @logger.info("Built #{@cluster_count} clusters.")
            end
        end
    end

    # Append a point to the collection (no cluster assignment yet).
    def +(point)
        @points << point
    end

    # Yield each cluster to the given block.
    def each_cluster(&c)
        @clusters.each { |cluster| yield cluster }
    end

    # One k-mean sweep: (re)assign every point to its nearest cluster
    # center, moving centers as membership changes.
    def assign_all_points

        @points.each do |pt|

            # Randomize the first selection to ensure that in the case where there are
            # many centers that are close, each has a (statistically) equal chance of
            # getting the document, thus moving the center, changing the center,
            # and perhaps matching other documents better because of more terms.
            min_cluster = @clusters[rand(@clusters.length)]
            min_dst = min_cluster.center.distance(pt)

            @clusters.each do |cluster|

                tmp_distance = cluster.center.distance(pt)

                # distance may be nil ("no relationship"); skip such centers.
                if tmp_distance.nil?
                    next

                elsif min_dst.nil?
                    min_dst = tmp_distance
                    min_cluster = cluster

                elsif tmp_distance < min_dst
                    min_cluster = cluster
                    min_dst = tmp_distance

                end
            end

            # If a point already has a cluster, only move it when the winner differs.
            if pt.cluster

                unless pt.cluster.equal? min_cluster
                    pt.cluster - pt
                    min_cluster + pt
                end
            else
                min_cluster + pt
            end

        end
    end

    # Run the clustering: seed clusters if none exist, then iterate
    # point assignment @iterations times.
    def cluster

        # FIX: the original called self.build_empty_clusters('crp'), a method
        # that no longer exists (seeding moved to the topics= accessor), which
        # raised NoMethodError whenever clusters had not been seeded manually.
        self.topics = 'crp' unless @clusters && @clusters.size > 0

        iterations.times do |i|
            @logger.info("Starting iteration #{i+1} of #{iterations}.")
            assign_all_points
        end
    end

    # Collect the top n terms from every cluster; returns a list of lists.
    def get_max_terms(n=3)
        r = []

        each_cluster do |cluster|
            r << cluster.get_max_terms(n)
        end

        r
    end

    # If you edit the document collection behind the scenes in an LDA clusterer, you need to run
    # this to avoid terms with 0 showing up. However, K-Mean has so little document-related
    # state that this method does nothing and is only here for API compatibility.
    # We would like LDA and KMean implementations that are drop-in replacements.
    def rebuild_document_collection()
    end

end
293
+ end
294
+ end
@@ -0,0 +1,83 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'sclust/kmean/cluster'
26
+ require 'sclust/util/sparse_vector'
27
+
28
+ module SClust
29
+
30
+ module KMean
31
+
32
+ # A document clusterer that overrides the + operator
33
+ # to allow for adding Document objects.
34
# A document clusterer that overrides the << operator to accept
# Document objects directly; anything else is coerced via to_s.
class DocumentClusterer < Clusterer

    attr_reader :document_collection

    def initialize()
        @document_collection = SClust::Util::DocumentCollection.new()
        super()
    end

    # Add a document to the collection. Non-Document arguments are
    # stringified and wrapped in a fresh Document.
    def <<(d)
        if d.is_a?(SClust::Util::Document)
            @document_collection << d
        else
            @document_collection << SClust::Util::Document.new(d.to_s)
        end
    end

    # This must be run to convert the document collection into
    # the points in a cluster.
    def initialize_points()

        point_list = []

        @document_collection.doclist.each do |doc|

            doc_terms = SClust::Util::SparseVector.new(0)

            # Build a term-weight vector for this document.
            # NOTE(review): terms are scored as tf(term) - idf(term)
            # (subtraction), not the conventional tf * idf product —
            # confirm this is intentional before changing it.
            doc.terms.each_key do |term|
                doc_terms[term] = doc.tf(term) - @document_collection.idf(term)
            end

            point_list << ClusterPoint.new(doc_terms, doc)
        end

        self.points = point_list

    end

    # Lazily build the cluster points from the document collection,
    # then delegate topic/cluster seeding to Clusterer#topics=.
    def topics=(n)

        initialize_points unless self.points && self.points.size > 0

        super(n)

    end

end
82
+ end
83
+ end
@@ -0,0 +1,243 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+ require 'rubygems'
25
+ require 'sclust/util/word'
26
+ require 'log4r'
27
+
28
+ module SClust
29
+ module LDA
30
+
31
# Accumulator for a single LDA topic: per-word counts, a running total
# of all word assignments, and per-document counts.
class Topic

    attr_reader :words, :wordcount, :docs
    attr_writer :words, :wordcount, :docs

    # Start empty: no words, no documents, zero total assignments.
    def initialize()
        @words     = {}
        @wordcount = 0
        @docs      = {}
    end
end
42
+
43
# Collapsed Gibbs-sampling LDA over a bag-of-words document collection.
class LDA

    attr_reader :logger, :iterations, :doclist, :topics
    attr_writer :logger, :iterations, :doclist

    # Documents may be added after LDA is created, unlike k-mean clustering.
    def initialize(docCol=nil)
        @iterations = 3
        @wordlist   = []
        @doclist    = []
        @logger     = Log4r::Logger.new('Clusterer')

        # Array the same size as @wordlist but stores the document object at index i
        # that produced @wordlist[i].
        @word2doc = []

        self.topics = 10

        if ( docCol )
            docCol.each {|d| self << d}
        end
    end

    # Append a document: record it, its words, and the word-to-document mapping.
    def <<(document)
        @doclist << document
        @wordlist += document.words
        document.words.length.times {@word2doc << document}
    end

    # Reset the model to +count+ fresh, empty topics.
    # (A stray no-op reference to an undefined @topic2doc was removed.)
    def topics=(count)
        @topics = []
        count.times { @topics << Topic.new() }
    end

    # Build a wordlist index array. This is an array that contains indexes into @wordlist.
    # However, instead of being simply {0,1,2,3...} this array is randomized so that
    # we index into @wordlist in a random order.
    def build_randomized_index_into_words()
        @randomized_word_index = []

        @wordlist.each_index { |i| @randomized_word_index << i }

        # Shuffle by random pairwise swaps across the index array.
        @wordlist.each_index do |i|
            new_home = (@wordlist.length * rand).to_i
            tmp = @randomized_word_index[i]
            @randomized_word_index[i] = @randomized_word_index[new_home]
            @randomized_word_index[new_home] = tmp
        end

    end

    # Compute P(z=j | z..._i, w). Or, the probability that
    # a topic z is the topic j represented by the given word given that word.
    def p_of_z(topic, word)

        return 0 unless topic.words[word]

        ((topic.words[word] - 1 + @beta) / (topic.wordcount - topic.words[word] - 1 + @beta * @wordlist.length)) *
        ((topic.docs.size - 1 + @alpha) / (@doclist.size - 1 + @alpha * @topics.size))

    end

    # Yield each randomized word index in turn.
    # (Method name keeps the original "radomized" spelling for API compatibility.)
    def each_radomized_word_index(&call)
        @randomized_word_index.each &call
    end

    # Initialize hyperparameters and randomly assign every word occurrence
    # to a topic, seeding the count tables.
    def lda_setup()
        @beta  = 0.01
        @alpha = 50.0 / @topics.length

        build_randomized_index_into_words()

        @word2topic = []
        @doc2topic  = []

        each_radomized_word_index do |i|
            topic = (@topics.size * rand).to_i

            @word2topic[i] = topic                  # Record that this word goes to this topic.
            @topics[topic].words[@wordlist[i]] ||= 0
            @topics[topic].docs[@word2doc[i]]  ||= 0

            @topics[topic].words[@wordlist[i]] += 1 # Record a new word in this topic
            @topics[topic].wordcount += 1           # Total sum of words
            @topics[topic].docs[@word2doc[i]] += 1  # Record this doc index in this topic
        end

    end

    # Perform 1 phase of lda: a full Gibbs sweep over every word occurrence.
    def lda_once()
        each_radomized_word_index do |random_word_index|

            random_word = @wordlist[random_word_index]

            zdist  = []
            ztotal = 0.0 # Track actual total in case the sum of zdist isn't quite 1.0.

            # Compute distribution over z for word i.
            @topics.each do |topic|
                z = p_of_z(topic, random_word)
                ztotal += z
                zdist << z
            end

            r    = rand * ztotal # Random value to pick topic with.
            zacc = 0.0           # Accumulator of seen values of zdist[topici].

            # Fallback topic if the sampling loop below never fires.
            topici = (rand() * @topics.size).to_i

            # Pick a topic proportionally to zdist.
            #
            # FIX: the original iterated with block parameter |topici|, which in
            # Ruby 1.9+ is block-local and shadows the outer topici — the sampled
            # topic was silently discarded and the random fallback always used.
            # Assign through a distinctly named block variable instead.
            catch(:picked_topic) do
                @topics.each_index do |candidate|
                    zacc += zdist[candidate]
                    if r < zacc
                        topici = candidate
                        throw :picked_topic
                    end
                end
            end

            topic = @topics[topici]

            previous_topic = @topics[@word2topic[random_word_index]]

            # Skip if src and dst topic are the same
            next if @word2topic[random_word_index] == topici

            # Remove word from previous topic.
            if ( previous_topic.words[@wordlist[random_word_index]] > 0 )
                previous_topic.words[@wordlist[random_word_index]] -= 1 # Remove a word from this topic
                previous_topic.wordcount -= 1                           # Reduce sum of words
                previous_topic.docs[@word2doc[random_word_index]] -= 1  # Remove this doc index from this topic

                previous_topic.docs.delete(@word2doc[random_word_index]) if previous_topic.docs[@word2doc[random_word_index]] <= 0
            end

            topic.words[@wordlist[random_word_index]] ||= 0 # If word was not in previous topic, add to this one.
            topic.docs[@word2doc[random_word_index]]  ||= 0 # If doc was not previously here.

            # Add word to chosen topic.
            @word2topic[random_word_index] = topici        # Record that this word goes to this topic.
            topic.words[@wordlist[random_word_index]] += 1 # Record a new word in this topic
            topic.wordcount += 1                           # Total sum of words
            topic.docs[@word2doc[random_word_index]] += 1  # Record this doc index in this topic
        end
    end

    # Run LDA. Options:
    #   :iterations — number of sweeps (defaults to @iterations)
    #   :continue   — when truthy, keep existing assignments instead of reinitializing
    def lda(opts={})
        opts[:iterations] ||= @iterations

        unless (opts[:continue])
            lda_setup()
        end

        opts[:iterations].times do |i|
            lda_once()
        end
    end

    # Takes {|topic| ... }
    def each_topic(&topicproc)
        @topics.each &topicproc
    end

    # Return the n highest-probability words in the given topic as a
    # list of SClust::Util::Word.
    def get_top_words_for_topic(topic, n = 3)

        # List of (z, topic, word)
        tupleList = []

        topic.words.each_key do |word|
            tupleList << SClust::Util::Word.new(word, p_of_z(topic, word), { :topic=>topic } )
        end

        # Yes, rev the comparison so the list sorts backwards.
        tupleList.sort! { |x, y| y.weight <=> x.weight }

        tupleList[0...n]

    end

    # Returns list list list.
    # Each list is a topic list.
    # Each topic list contains a word list.
    # [ [ z, word, topic ], ... ]
    def get_max_terms(n=3)
        topics = []

        each_topic { |t| topics << get_top_words_for_topic(t, n) }

        topics
    end

    alias cluster lda

end
242
+ end
243
+ end