RubyGems - clusterer - Versions diffs - 0.1.0 - Mend

clusterer 0.1.0

Files changed (8) hide show

data/README +21 -0
data/examples/google_search_cluster.rb +49 -0
data/examples/yahoo_search_cluster.rb +59 -0
data/lib/clusterer.rb +100 -0
data/lib/similarity.rb +27 -0
data/lib/word_hash.rb +93 -0
data/tests/clusterer_test.rb +20 -0
metadata +59 -0

data/README ADDED Viewed

@@ -0,0 +1,21 @@
+A Ruby library that implements clustering algorithms for text
+mining.
+Currently implemented algorithms are K-Means and Hierarchical
+clustering.
+Hierarchical clustering gives better results, but its complexity is roughly O(n*n).
+K-means is very fast: O(k*n*i), where i is the number of iterations.
+The examples need Google/Yahoo API keys, and the Yahoo example requires
+ysearch-rb from
+http://developer.yahoo.com/download/download.html
+Hybrid clustering algorithms, more similarity metrics, and semi-supervised
+clustering are coming soon (or submit patches / show keen interest if
+you want faster results).
+Happy hacking......

data/examples/google_search_cluster.rb ADDED Viewed

@@ -0,0 +1,49 @@
+#The MIT License
+#Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+require 'soap/wsdlDriver'
+require 'clusterer'
+## try using HTML stripping to get better results
+WSDL_URL = "http://api.google.com/GoogleSearch.wsdl"
+driver = SOAP::WSDLDriverFactory.new(WSDL_URL).create_rpc_driver
+query = 'kreeti'
+key = ""
+results = driver.doGoogleSearch(key, query, 0, 10, true, "", 1, "lang_en", "", "")
+count= results.resultElements.size
+max_count = results.estimatedTotalResultsCount.to_i
+results = results.resultElements
+while (count < 100 && count <= max_count)
+  more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
+  results.concat(more_results.resultElements)
+  count += more_results.resultElements.size
+end
+clusters = Clusterer::Clustering.kmeans_clustering(results.collect {|r| r.title.to_s.gsub(/<\/?[^>]*>/, "") +
+                                                     " " + r.snippet.to_s.gsub(/<\/?[^>]*>/, "")})
+#writing the output
+File.open("temp.html","w") do |f|
+  f.write("<ul>")
+  clusters.each do |clus|
+    f.write("<li>")
+    f.write("<ul>")
+    clus.each do |d|
+      f.write("<li>")
+      f.write("<span class='title'>")
+      f.write(results[d].title)
+      f.write("</span>")
+      f.write("<span class='snippet'>")
+      f.write(results[d].snippet)
+      f.write("</span>")
+      f.write("</li>")
+    end
+    f.write("</ul>")
+  end
+  f.write("</ul>")
+  f.write("</li>")
+end

data/examples/yahoo_search_cluster.rb ADDED Viewed

@@ -0,0 +1,59 @@
+#The MIT License
+#Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+require 'clusterer'
+ require 'ysearch-rb/lib/ysearch'
+## try using HTML stripping to get better results
+# get the query parameter
+query = "kreeti"
+##
+# create a web search object:
+# Arguments:
+# 1. App ID (You can get one at http://developer.yahoo.net)
+# 2. The query
+# 3. type can be one of: 'all', 'any' or 'phrase'
+# 4. The no. of results
+##
+obj = WebSearch.new('YahooDemo', query, 'all', 100)
+results = obj.parse_results
+# count= results.resultElements.size
+# max_count = results.estimatedTotalResultsCount.to_i
+# results = results.resultElements
+# while (count < 100 && count <= max_count)
+#   more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
+#   results.concat(more_results.resultElements)
+#   count += more_results.resultElements.size
+# end
+#kmeans_clustering
+clusters = Clusterer::Clustering.hierarchical_clustering(results.collect {|r| r['Title'].to_s.gsub(/<\/?[^>]*>/, "") +
+                                                     " " + r['Summary'].to_s.gsub(/<\/?[^>]*>/, "")})
+#writing the output
+File.open("temp.html","w") do |f|
+  f.write("<ul>")
+  clusters.each do |clus|
+    f.write("<li>")
+    f.write("<ul>")
+    clus.each do |d|
+      f.write("<li>")
+      f.write("<span class='title'>")
+      f.write(results[d]['Title'])
+      f.write("</span>")
+      f.write("<span class='snippet'>")
+      f.write(results[d]['Summary'])
+      f.write("</span>")
+      f.write("</li>")
+    end
+    f.write("</ul>")
+  end
+  f.write("</ul>")
+  f.write("</li>")
+end

data/lib/clusterer.rb ADDED Viewed

@@ -0,0 +1,100 @@
+#The MIT License
+###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+require 'word_hash'
+require 'similarity'
+module Clusterer
+  class Clustering
+    class << self
+      #returns clusters containing index of the elements in doc
+      def kmeans_clustering (docs, k = nil, max_iter = 10, &similarity_function)
+        similarity_function = Proc.new {|*args| Similarity.vector_similarity(*args)} unless similarity_function
+        k = Math.sqrt(docs.size) unless k
+        docs_hash = Array.new(docs.size)
+        clusters = Array.new(k)
+        cluster_centers = Array.new(k)
+        old_cluster_centers = Array.new(k)
+        docs.each_with_index {|d,i| docs_hash[i] = d.clean_word_hash}
+        0.upto(k - 1) {|i| x = rand(docs.size); clusters[i], cluster_centers[i] = [x],docs_hash[x].clone }
+        iter = 0
+        while (!max_iter || iter < max_iter) && !convergence(cluster_centers,old_cluster_centers)
+          puts "Iteration ....#{iter}...#{clusters.inspect}"
+          0.upto(k - 1) {|i| clusters[i] = []; old_cluster_centers[i] = cluster_centers[i]}
+          docs_hash.each_with_index do |doc, i|
+            max_value, max_index = 0, 0
+            cluster_centers.each_with_index do |cen, j|
+              sim = similarity_function.call(doc,cen)
+              max_value, max_index = sim,j if sim >= max_value
+            end
+            clusters[max_index] << i
+          end
+          recalculate_centers(cluster_centers,clusters,docs_hash)
+          iter += 1
+        end
+        clusters
+      end
+      def hierarchical_clustering (docs, k = nil, &similarity_function)
+        similarity_function = Proc.new {|*args| Similarity.vector_similarity(*args)} unless similarity_function
+        k = Math.sqrt(docs.size) unless k
+        docs_hash = Array.new(docs.size)
+        clusters = Array.new(docs.size)
+        cluster_centers = Array.new(docs.size)
+        docs.each_with_index do |d,i|
+          cluster_centers[i] = d.clean_word_hash
+          clusters[i] = [i]
+        end
+        iter = 0
+        while (clusters.size > k)
+          puts "Iteration ....#{iter}...#{clusters.inspect}"
+          min_value, min_index = clusters.size[0], 0
+          clusters.each_with_index {|a, i| (min_value, min_index = a.size, i) if a.size <= min_value}
+          p = cluster_centers.delete_at(min_index)
+          c = clusters.delete_at(min_index)
+          max_value, max_index = 0, 0
+          cluster_centers.each_with_index do |cen, j|
+            sim = similarity_function.call(p,cen)
+            max_value, max_index = sim,j if sim >= max_value
+          end
+          merge_clusters(clusters[max_index],cluster_centers[max_index],c,p)
+          iter += 1
+        end
+        clusters
+      end
+      private
+      #merge cluster 2 into cluster 1
+      def merge_clusters(cluster1, cluster_center1, cluster2, cluster_center2)
+        cluster_center1.each_key {|k| cluster_center1[k] *= cluster1.size}
+        cluster_center2.each_key {|k| cluster_center2[k] *= cluster2.size}
+        cluster_center1.merge!(cluster_center2) {|k,o,n| cluster_center1[k] = o + n}
+        cluster1.concat(cluster2)
+        cluster_center1.each_key {|k| cluster_center1[k] /= cluster1.size.to_f}
+        cluster_center1.delete(:total)
+      end
+      def recalculate_centers(cluster_centers,clusters,docs_hash)
+        clusters.each_with_index do |cluster,i|
+          center = { }
+          cluster.each {|d| docs_hash[d].each {|w,v| center[w] = (center[w] || 0) + v} }
+          total = 0
+          count = cluster.size
+          count = 1 if count ==0
+          center.each_key {|w| next unless w.class == String;
+            center[w] /=count.to_f; total += center[w]**2}
+          total = 1 if total == 0
+          center[:total] = total
+          cluster_centers[i] = center
+        end
+      end
+      def convergence(new_centers,old_centers)
+        new_centers.each_with_index {|c,i|     return false unless c == old_centers[i]}
+        return true
+      end
+    end
+  end
+end

data/lib/similarity.rb ADDED Viewed

@@ -0,0 +1,27 @@
+#The MIT License
+###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+module Clusterer
+  module Similarity
+    #find similarity between two clusters, or two points
+    def Similarity.vector_similarity(cluster1, cluster2)
+      similarity = 0
+      total = 0
+      cluster1.each do |w,value|
+        next unless w.class == String
+        total += (value*value) unless cluster1[:total]
+        similarity += (value * (cluster2[w] || 0))
+      end
+      cluster1[:total] = total unless cluster1[:total]
+      unless cluster2[:total]
+        total = 0
+        cluster2.each_value {|v| total += (v*v) }
+        total = 1 if total == 0
+        cluster2[:total] = total
+      end
+      cluster1[:total] = 1 if cluster1[:total] == 0
+      similarity /= Math.sqrt(cluster1[:total] * cluster2[:total]).to_f
+    end
+  end
+end

data/lib/word_hash.rb ADDED Viewed

@@ -0,0 +1,93 @@
+#The MIT License
+###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+begin
+  require 'stemmer'
+rescue LoadError
+  puts "Please install stemmer from http://rubyforge.org/projects/stemmer or 'gem install stemmer'"
+  exit(-1)
+end
+class String
+  def clean_word_hash
+    word_hash gsub(/[^\w\s]/,"").split
+  end
+  private
+  def word_hash(words)
+    h = Hash.new
+    words.each do |w|
+      w = w.downcase.stem
+      h[w] = (h[w] || 0) + 1 if w.size > 2 and !STOP_WORDS.include?(w)
+    end
+    h
+  end
+  STOP_WORDS = ["and",
+                "but",
+                "came",
+                "can",
+                "cant",
+                "com",
+                "couldnt",
+                "did",
+                "didn",
+                "didnt",
+                "doesnt",
+                "dont",
+                "ever",
+                "first",
+                "for",
+                "from",
+                "have",
+                "her",
+                "here",
+                "him",
+                "how",
+                "into",
+                "isnt",
+                "itll",
+                "just",
+                "last",
+                "least",
+                "like",
+                "most",
+                "new",
+                "not",
+                "now",
+                "sai",
+                "said",
+                "she",
+                "should",
+                "since",
+                "some",
+                "than",
+                "thi",
+                "that",
+                "the",
+                "thei",
+                "their",
+                "then",
+                "those",
+                "told",
+                "too",
+                "true",
+                "try",
+                "until",
+                "url",
+                "wasnt",
+                "were",
+                "when",
+                "who",
+                "whether",
+                "while",
+                "will",
+                "with",
+                "within",
+                "would",
+                "www",
+                "yes",
+                "you",
+                "youll",
+               ]
+end

data/tests/clusterer_test.rb ADDED Viewed

@@ -0,0 +1,20 @@
+#The MIT License
+###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+require 'test/unit'
+require 'clusterer'
+class TestClusterer < Test::Unit::TestCase
+  def test_simple_kmeans
+    assert_not_equal [], Clusterer::Clustering.kmeans_clustering(["hello world","mea culpa","goodbye world"])
+    assert_equal 2, Clusterer::Clustering.kmeans_clustering(["hello world","mea culpa","goodbye world"],2).size
+  end
+  def test_simple_hierarchical_clustering
+    assert_not_equal [], Clusterer::Clustering.hierarchical_clustering(["hello world","mea culpa","goodbye world"])
+    assert_equal 2, Clusterer::Clustering.hierarchical_clustering(["hello world","mea culpa","goodbye world"],2).size
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,59 @@
+--- !ruby/object:Gem::Specification
+rubygems_version: 0.8.11
+specification_version: 1
+name: clusterer
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+date: 2006-08-22 00:00:00 +05:30
+summary: A library of clustering algorithms for text data.
+require_paths:
+- lib
+email: ssinghi@kreeti.com
+homepage: http://rubyforge.org/projects/clusterer/
+rubyforge_project:
+description:
+autorequire: clusterer
+default_executable:
+bindir: bin
+has_rdoc: true
+required_ruby_version: !ruby/object:Gem::Version::Requirement
+  requirements:
+  - - ">"
+    - !ruby/object:Gem::Version
+      version: 0.0.0
+  version:
+platform: ruby
+signing_key:
+cert_chain:
+authors:
+- Surendra K Singhi
+files:
+- tests/clusterer_test.rb
+- lib/clusterer.rb
+- lib/similarity.rb
+- lib/word_hash.rb
+- examples/google_search_cluster.rb
+- examples/yahoo_search_cluster.rb
+- README
+test_files:
+- tests/clusterer_test.rb
+rdoc_options: []
+extra_rdoc_files:
+- README
+executables: []
+extensions: []
+requirements: []
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: stemmer
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Version::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.0.0
+    version: