RubyGems - clusterer - Versions diffs - 0.1.0 - Mend

clusterer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

data/README +21 -0
data/examples/google_search_cluster.rb +49 -0
data/examples/yahoo_search_cluster.rb +59 -0
data/lib/clusterer.rb +100 -0
data/lib/similarity.rb +27 -0
data/lib/word_hash.rb +93 -0
data/tests/clusterer_test.rb +20 -0
metadata +59 -0

data/README ADDED Viewed

@@ -0,0 +1,21 @@
+A Ruby library which implements clustering algorithms for text
+mining.
+Currently implemented algorithms are K-Means, and Hierarchical
+clustering.
+Hierarchical gives better results, but its complexity is roughly O(n*n).
+K-means is very fast, O(k*n*i), where i is the number of iterations.
+The examples need Google/Yahoo API keys, and the Yahoo example requires
+ysearch-rb from
+http://developer.yahoo.com/download/download.html
+Hybrid clustering algorithms, more similarity metrics, and semi-supervised
+clustering are coming soon (or submit patches/show keen interest if
+you want faster results).
+Happy hacking......

data/examples/google_search_cluster.rb ADDED Viewed

@@ -0,0 +1,49 @@
+#The MIT License
+#Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+require 'soap/wsdlDriver'
+require 'clusterer'
+## try using HTML stripping to get better results
+# Example script: fetch up to 100 Google results for a query, cluster them
+# with k-means and write the clusters to temp.html as a nested HTML list.
+WSDL_URL = "http://api.google.com/GoogleSearch.wsdl"
+driver = SOAP::WSDLDriverFactory.new(WSDL_URL).create_rpc_driver
+query = 'kreeti'
+# API key passed as the first argument of every doGoogleSearch call.
+key = ""
+results = driver.doGoogleSearch(key, query, 0, 10, true, "", 1, "lang_en", "", "")
+count = results.resultElements.size
+max_count = results.estimatedTotalResultsCount.to_i
+results = results.resultElements
+# Page through the remaining results ten at a time, using the number of
+# results collected so far as the start offset of the next request.
+while count < 100 && count <= max_count
+  more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
+  page = more_results.resultElements
+  # An empty page would leave count unchanged and the loop would never end.
+  break if page.size == 0
+  results.concat(page)
+  count += page.size
+end
+# One document per result: title and snippet with the HTML tags removed.
+documents = results.collect do |r|
+  r.title.to_s.gsub(/<\/?[^>]*>/, "") + " " + r.snippet.to_s.gsub(/<\/?[^>]*>/, "")
+end
+clusters = Clusterer::Clustering.kmeans_clustering(documents)
+#writing the output
+File.open("temp.html","w") do |f|
+  f.write("<ul>")
+  clusters.each do |clus|
+    # Each cluster is one <li> holding a nested <ul> of its results.
+    f.write("<li>")
+    f.write("<ul>")
+    clus.each do |d|
+      f.write("<li>")
+      f.write("<span class='title'>")
+      f.write(results[d].title)
+      f.write("</span>")
+      f.write("<span class='snippet'>")
+      f.write(results[d].snippet)
+      f.write("</span>")
+      f.write("</li>")
+    end
+    f.write("</ul>")
+    # Close the cluster's <li> here, inside the loop, so every cluster item
+    # is balanced and nothing is emitted after the outer </ul>.
+    f.write("</li>")
+  end
+  f.write("</ul>")
+end

data/examples/yahoo_search_cluster.rb ADDED Viewed

@@ -0,0 +1,59 @@
+#The MIT License
+#Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+require 'clusterer'
+ require 'ysearch-rb/lib/ysearch'
+## try using HTML stripping to get better results
+# Example script: run a Yahoo web search, cluster the results hierarchically
+# and write the clusters to temp.html as a nested HTML list.
+# get the query parameter
+query = "kreeti"
+##
+# create a web search object:
+# Arguments:
+# 1. App ID (You can get one at http://developer.yahoo.net)
+# 2. The query
+# 3. type can be one of: 'all', 'any' or 'phrase'
+# 4. The no. of results
+##
+obj = WebSearch.new('YahooDemo', query, 'all', 100)
+results = obj.parse_results
+# Paging code left over from the Google example, kept commented out:
+# count= results.resultElements.size
+# max_count = results.estimatedTotalResultsCount.to_i
+# results = results.resultElements
+# while (count < 100 && count <= max_count)
+#   more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
+#   results.concat(more_results.resultElements)
+#   count += more_results.resultElements.size
+# end
+#kmeans_clustering
+# One document per result: title and summary with the HTML tags removed.
+documents = results.collect do |r|
+  r['Title'].to_s.gsub(/<\/?[^>]*>/, "") + " " + r['Summary'].to_s.gsub(/<\/?[^>]*>/, "")
+end
+clusters = Clusterer::Clustering.hierarchical_clustering(documents)
+#writing the output
+File.open("temp.html","w") do |f|
+  f.write("<ul>")
+  clusters.each do |clus|
+    # Each cluster is one <li> holding a nested <ul> of its results.
+    f.write("<li>")
+    f.write("<ul>")
+    clus.each do |d|
+      f.write("<li>")
+      f.write("<span class='title'>")
+      f.write(results[d]['Title'])
+      f.write("</span>")
+      f.write("<span class='snippet'>")
+      f.write(results[d]['Summary'])
+      f.write("</span>")
+      f.write("</li>")
+    end
+    f.write("</ul>")
+    # Close the cluster's <li> here, inside the loop, so every cluster item
+    # is balanced and nothing is emitted after the outer </ul>.
+    f.write("</li>")
+  end
+  f.write("</ul>")
+end

data/lib/clusterer.rb ADDED Viewed

@@ -0,0 +1,100 @@
+#The MIT License
+###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+require 'word_hash'
+require 'similarity'
+module Clusterer
+  # Clustering algorithms over text documents. Every document is turned into
+  # a word-count hash with String#clean_word_hash; clusters are returned as
+  # arrays of indices into the docs array that was passed in.
+  class Clustering
+    class << self
+      # K-means clustering.
+      #
+      # docs     - array of document strings.
+      # k        - number of clusters; defaults to the square root of
+      #            docs.size. Fractional values are truncated.
+      # max_iter - maximum number of iterations; nil/false iterates until the
+      #            cluster centers stop changing.
+      # similarity_function - optional block taking two word hashes and
+      #            returning a similarity score (higher means more similar);
+      #            defaults to Similarity.vector_similarity.
+      #
+      # Returns an array of k clusters, each an array of indices into docs.
+      def kmeans_clustering(docs, k = nil, max_iter = 10, &similarity_function)
+        similarity_function ||= Proc.new {|*args| Similarity.vector_similarity(*args)}
+        k = (k || Math.sqrt(docs.size)).to_i
+        # Without documents there is nothing to seed the centers with.
+        return Array.new(k) { [] } if docs.empty?
+        docs_hash = docs.collect {|d| d.clean_word_hash}
+        clusters = Array.new(k)
+        cluster_centers = Array.new(k)
+        old_cluster_centers = Array.new(k)
+        # Seed every cluster with a randomly chosen document.
+        k.times do |i|
+          x = rand(docs.size)
+          clusters[i] = [x]
+          cluster_centers[i] = docs_hash[x].clone
+        end
+        iter = 0
+        while (!max_iter || iter < max_iter) && !convergence(cluster_centers, old_cluster_centers)
+          puts "Iteration ....#{iter}...#{clusters.inspect}"
+          k.times {|i| clusters[i] = []; old_cluster_centers[i] = cluster_centers[i]}
+          # Assign every document to its most similar center.
+          docs_hash.each_with_index do |doc, i|
+            clusters[most_similar_index(doc, cluster_centers, similarity_function)] << i
+          end
+          recalculate_centers(cluster_centers, clusters, docs_hash)
+          iter += 1
+        end
+        clusters
+      end
+
+      # Agglomerative clustering: starting with one cluster per document,
+      # repeatedly removes the smallest cluster and merges it into the
+      # cluster whose center is most similar, until k clusters remain.
+      #
+      # docs - array of document strings.
+      # k    - target number of clusters; defaults to the square root of
+      #        docs.size. Fractional values are truncated.
+      # similarity_function - optional block taking two word hashes and
+      #        returning a similarity score; defaults to
+      #        Similarity.vector_similarity.
+      #
+      # Returns an array of clusters, each an array of indices into docs.
+      def hierarchical_clustering(docs, k = nil, &similarity_function)
+        similarity_function ||= Proc.new {|*args| Similarity.vector_similarity(*args)}
+        k = (k || Math.sqrt(docs.size)).to_i
+        clusters = Array.new(docs.size) {|i| [i]}
+        cluster_centers = docs.collect {|d| d.clean_word_hash}
+        iter = 0
+        # A merge needs a second cluster to merge into, so never go below one
+        # cluster even when k < 1.
+        while clusters.size > k && clusters.size > 1
+          puts "Iteration ....#{iter}...#{clusters.inspect}"
+          # Index of the smallest cluster; on ties the last one wins.
+          min_index = 0
+          clusters.each_with_index do |members, i|
+            min_index = i if members.size <= clusters[min_index].size
+          end
+          center = cluster_centers.delete_at(min_index)
+          members = clusters.delete_at(min_index)
+          target = most_similar_index(center, cluster_centers, similarity_function)
+          merge_clusters(clusters[target], cluster_centers[target], members, center)
+          iter += 1
+        end
+        clusters
+      end
+
+      private
+
+      # Index of the center most similar to vector. Scores below zero never
+      # win, and on ties the last center wins.
+      def most_similar_index(vector, centers, similarity_function)
+        max_value, max_index = 0, 0
+        centers.each_with_index do |cen, j|
+          sim = similarity_function.call(vector, cen)
+          max_value, max_index = sim, j if sim >= max_value
+        end
+        max_index
+      end
+
+      #merge cluster 2 into cluster 1
+      # cluster_center1 becomes the size-weighted mean of both centers and
+      # cluster1 receives the members of cluster2. Any :total entry cached on
+      # the centers is left out of the arithmetic and removed from
+      # cluster_center1, because it no longer matches the merged center.
+      def merge_clusters(cluster1, cluster_center1, cluster2, cluster_center2)
+        size1, size2 = cluster1.size, cluster2.size
+        merged_size = (size1 + size2).to_f
+        cluster_center1.delete(:total)
+        cluster_center1.each_key {|w| cluster_center1[w] *= size1}
+        cluster_center2.each do |w, v|
+          next if w == :total
+          cluster_center1[w] = (cluster_center1[w] || 0) + v * size2
+        end
+        cluster1.concat(cluster2)
+        cluster_center1.each_key {|w| cluster_center1[w] /= merged_size}
+      end
+
+      # Replaces every entry of cluster_centers with the mean word vector of
+      # the documents assigned to that cluster, storing the sum of squared
+      # weights under :total (1 for an all-zero or empty center).
+      def recalculate_centers(cluster_centers, clusters, docs_hash)
+        clusters.each_with_index do |cluster, i|
+          center = {}
+          cluster.each do |d|
+            docs_hash[d].each do |w, v|
+              # Only words count; skips a :total entry cached on the document.
+              next unless w.is_a?(String)
+              center[w] = (center[w] || 0) + v
+            end
+          end
+          count = cluster.empty? ? 1 : cluster.size
+          total = 0
+          center.each_key do |w|
+            center[w] /= count.to_f
+            total += center[w] ** 2
+          end
+          center[:total] = total == 0 ? 1 : total
+          cluster_centers[i] = center
+        end
+      end
+
+      # True when every new center equals the corresponding old center.
+      def convergence(new_centers, old_centers)
+        new_centers == old_centers
+      end
+    end
+  end
+end

data/lib/similarity.rb ADDED Viewed

@@ -0,0 +1,27 @@
+#The MIT License
+###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+module Clusterer
+  module Similarity
+    # Similarity between two word vectors (two clusters, or two points):
+    # the dot product over the String keys of cluster1, divided by the
+    # square root of the product of both vectors' :total entries.
+    #
+    # A vector without a :total entry gets one stored on it as a side
+    # effect: the sum of its squared weights. A zero sum is stored as 1 so
+    # the division is always defined.
+    def self.vector_similarity(cluster1, cluster2)
+      needs_total = !cluster1[:total]
+      dot_product = 0
+      squares = 0
+      cluster1.each_pair do |word, weight|
+        # Non-word entries such as a cached :total are not part of the vector.
+        if word.instance_of?(String)
+          squares += weight * weight if needs_total
+          dot_product += weight * (cluster2[word] || 0)
+        end
+      end
+      cluster1[:total] = squares if needs_total
+
+      unless cluster2[:total]
+        other_squares = cluster2.values.inject(0) {|sum, v| sum + v * v}
+        cluster2[:total] = other_squares == 0 ? 1 : other_squares
+      end
+
+      cluster1[:total] = 1 if cluster1[:total] == 0
+      dot_product / Math.sqrt(cluster1[:total] * cluster2[:total]).to_f
+    end
+  end
+end

data/lib/word_hash.rb ADDED Viewed

@@ -0,0 +1,93 @@
+#The MIT License
+###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+begin
+  require 'stemmer'
+rescue LoadError
+  puts "Please install stemmer from http://rubyforge.org/projects/stemmer or 'gem install stemmer'"
+  exit(-1)
+end
+class String
+  # Stemmed words that are left out of every word hash.
+  STOP_WORDS = %w[
+    and but came can cant com couldnt did didn didnt doesnt dont ever first
+    for from have her here him how into isnt itll just last least like most
+    new not now sai said she should since some than thi that the thei their
+    then those told too true try until url wasnt were when who whether while
+    will with within would www yes you youll
+  ]
+
+  # Word-count hash of this string: every character that is neither a word
+  # character nor whitespace is removed, the rest is split on whitespace and
+  # counted by word_hash.
+  def clean_word_hash
+    stripped = gsub(/[^\w\s]/,"")
+    word_hash(stripped.split)
+  end
+
+  private
+
+  # Counts the downcased, stemmed form of each word, ignoring stems of two
+  # characters or fewer and stems listed in STOP_WORDS.
+  def word_hash(words)
+    counts = {}
+    words.each do |word|
+      stemmed = word.downcase.stem
+      next if stemmed.size <= 2
+      next if STOP_WORDS.include?(stemmed)
+      counts[stemmed] = (counts[stemmed] || 0) + 1
+    end
+    counts
+  end
+end

data/tests/clusterer_test.rb ADDED Viewed

@@ -0,0 +1,20 @@
+#The MIT License
+###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+require 'test/unit'
+require 'clusterer'
+class TestClusterer < Test::Unit::TestCase
+  # K-means returns a non-empty result with the default k, and exactly as
+  # many clusters as requested when k is given.
+  def test_simple_kmeans
+    clustering = Clusterer::Clustering
+    assert_not_equal [], clustering.kmeans_clustering(sample_docs)
+    assert_equal 2, clustering.kmeans_clustering(sample_docs, 2).size
+  end
+
+  # Same two checks for hierarchical clustering.
+  def test_simple_hierarchical_clustering
+    clustering = Clusterer::Clustering
+    assert_not_equal [], clustering.hierarchical_clustering(sample_docs)
+    assert_equal 2, clustering.hierarchical_clustering(sample_docs, 2).size
+  end
+
+  private
+
+  # Fresh copy of the three-document corpus used by every test.
+  def sample_docs
+    ["hello world","mea culpa","goodbye world"]
+  end
+end
+end

metadata ADDED Viewed

@@ -0,0 +1,59 @@
+--- !ruby/object:Gem::Specification
+rubygems_version: 0.8.11
+specification_version: 1
+name: clusterer
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+date: 2006-08-22 00:00:00 +05:30
+summary: A library of clustering algorithms for text data.
+require_paths:
+- lib
+email: ssinghi@kreeti.com
+homepage: http://rubyforge.org/projects/clusterer/
+rubyforge_project:
+description:
+autorequire: clusterer
+default_executable:
+bindir: bin
+has_rdoc: true
+required_ruby_version: !ruby/object:Gem::Version::Requirement
+  requirements:
+  - - ">"
+    - !ruby/object:Gem::Version
+      version: 0.0.0
+  version:
+platform: ruby
+signing_key:
+cert_chain:
+authors:
+- Surendra K Singhi
+files:
+- tests/clusterer_test.rb
+- lib/clusterer.rb
+- lib/similarity.rb
+- lib/word_hash.rb
+- examples/google_search_cluster.rb
+- examples/yahoo_search_cluster.rb
+- README
+test_files:
+- tests/clusterer_test.rb
+rdoc_options: []
+extra_rdoc_files:
+- README
+executables: []
+extensions: []
+requirements: []
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: stemmer
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Version::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.0.0
+    version: