RubyGems - clusterer - Versions diffs - 0.1.0 → 0.1.9 - Mend

clusterer 0.1.0 → 0.1.9

Files changed (33) hide show

data/README +29 -7
data/examples/google_search_cluster.rb +13 -7
data/examples/yahoo_search_cluster.rb +18 -31
data/lib/clusterer.rb +36 -95
data/lib/clusterer/algorithms.rb +95 -0
data/lib/clusterer/bayes.rb +255 -0
data/lib/clusterer/cluster.rb +56 -0
data/lib/clusterer/clustering.rb +35 -0
data/lib/clusterer/document.rb +71 -0
data/lib/clusterer/document_array.rb +79 -0
data/lib/clusterer/document_base.rb +32 -0
data/lib/clusterer/documents_centroid.rb +44 -0
data/lib/clusterer/inverse_document_frequency.rb +83 -0
data/lib/clusterer/lsi/dmatrix.rb +132 -0
data/lib/clusterer/lsi/document_vector.rb +54 -0
data/lib/clusterer/lsi/documents_centroid_vector.rb +51 -0
data/lib/clusterer/lsi/lsi.rb +95 -0
data/lib/clusterer/similarity.rb +34 -0
data/lib/{word_hash.rb → clusterer/stop_words.rb} +21 -23
data/lib/clusterer/tokenizer.rb +70 -0
data/tests/algorithms_test.rb +48 -0
data/tests/bayes_test.rb +68 -0
data/tests/cluster_test.rb +54 -0
data/tests/document_array_test.rb +64 -0
data/tests/document_centroid_test.rb +64 -0
data/tests/document_test.rb +71 -0
data/tests/inverse_document_frequency_test.rb +76 -0
data/tests/lsi_test.rb +77 -0
data/tests/similarity_test.rb +62 -0
data/tests/tokenizer_test.rb +72 -0
metadata +35 -9
data/lib/similarity.rb +0 -27
data/tests/clusterer_test.rb +0 -20

@@ -0,0 +1,72 @@
+#--
+###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+require 'test/unit'
+require 'clusterer'
+class TestSimilarity < Test::Unit::TestCase # NOTE(review): exercises Clusterer::Tokenizer yet is named TestSimilarity -- presumably a copy-paste leftover; confirm before renaming
+  include Clusterer::Tokenizer # mixed in so the tokenizer methods can be called as bare methods below
+  def test_simple_tokenizer # default options: tokens are expected to come back stemmed
+    x = []
+    simple_tokenizer("good! morrow!! the AB called") {|w| x << w} # collect every yielded token in order
+    assert_equal 3, x.size # five input words, three tokens; presumably "the" and "AB" are filtered out -- confirm in the tokenizer
+    assert_equal "morrow".stem, x[1]
+    assert_equal "call", x[2] # "called" is expected in its stemmed form
+  end
+  def test_simple_tokenizer_with_no_stemming # same input, but :no_stem => true must leave tokens unstemmed
+    x = []
+    simple_tokenizer("good! morrow!! the AB called", :no_stem => true) {|w| x << w}
+    assert_equal 3, x.size # token count is the same as in the stemming case
+    assert_equal "morrow", x[1]
+    assert_equal "called", x[2] # not reduced to "call" here
+  end
+  def test_simple_ngram_tokenizer_1 # n-gram tokenizer where only two tokens are expected
+    x = []
+    simple_ngram_tokenizer("Good! morrow!! the AB",1) {|w| x << w} # NOTE(review): passes a bare 1, the other calls pass :ngram => n -- confirm the accepted argument form
+    assert_equal 2, x.size
+    assert_equal "morrow".stem, x[1]
+  end
+  def test_simple_ngram_tokenizer # token counts and contents for :ngram 1, 2, 3 and for the default
+    x = []
+    simple_ngram_tokenizer("The cow is a cool holy animal.",:ngram => 1) {|w| x << w}
+    assert_equal 4, x.size # single words only
+    x = [] # start from an empty collector for each call
+    simple_ngram_tokenizer("The cow is a cool holy animal.",:ngram => 2) {|w| x << w}
+    assert_equal 6, x.size # two more tokens than with :ngram => 1
+    assert x.include?(["holy".stem, "animal".stem].join(" ")) # multi-word tokens are stemmed words joined by one space
+    x = []
+    simple_ngram_tokenizer("The cow is a cool holy animal.",:ngram => 3) {|w| x << w}
+    assert_equal 7, x.size # one more token than with :ngram => 2
+    assert x.include?(["holy".stem, "animal".stem].join(" ")) # the two-word token is still produced
+    assert x.include?(["cool".stem, "holy".stem, "animal".stem].join(" "))
+    x = []
+    simple_ngram_tokenizer("Ruby on Rails is cool.") {|w| x << w} # no options given: exercises the default n-gram size
+    assert_equal 5, x.size
+    assert x.include?(["ruby".stem, "on".stem, "rails".stem].join(" ")) # three-word tokens appear, so the default presumably allows n-grams of 3 -- confirm
+    assert x.include?(["rails".stem, "is".stem, "cool".stem].join(" "))
+  end
+end

metadata CHANGED

@@ -1,11 +1,11 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.8.11
+rubygems_version: 0.9.0
 specification_version: 1
 name: clusterer
 version: !ruby/object:Gem::Version
-  version: 0.1.0
-date: 2006-08-22 00:00:00 +05:30
-summary: A library of clustering algorithms for text data.
+  version: 0.1.9
+date: 2007-03-22 00:00:00 +05:30
+summary: A library of clustering and classification algorithms for text data.
 require_paths:
 - lib
 email: ssinghi@kreeti.com
@@ -25,18 +25,44 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
 platform: ruby
 signing_key:
 cert_chain:
+post_install_message:
 authors:
 - Surendra K Singhi
 files:
-- tests/clusterer_test.rb
+- tests/similarity_test.rb
+- tests/document_test.rb
+- tests/cluster_test.rb
+- tests/bayes_test.rb
+- tests/document_centroid_test.rb
+- tests/lsi_test.rb
+- tests/algorithms_test.rb
+- tests/tokenizer_test.rb
+- tests/inverse_document_frequency_test.rb
+- tests/document_array_test.rb
+- lib/clusterer
 - lib/clusterer.rb
-- lib/similarity.rb
-- lib/word_hash.rb
+- lib/clusterer/lsi
+- lib/clusterer/clustering.rb
+- lib/clusterer/document.rb
+- lib/clusterer/stop_words.rb
+- lib/clusterer/cluster.rb
+- lib/clusterer/bayes.rb
+- lib/clusterer/document_array.rb
+- lib/clusterer/similarity.rb
+- lib/clusterer/document_base.rb
+- lib/clusterer/algorithms.rb
+- lib/clusterer/documents_centroid.rb
+- lib/clusterer/tokenizer.rb
+- lib/clusterer/inverse_document_frequency.rb
+- lib/clusterer/lsi/document_vector.rb
+- lib/clusterer/lsi/lsi.rb
+- lib/clusterer/lsi/dmatrix.rb
+- lib/clusterer/lsi/documents_centroid_vector.rb
 - examples/google_search_cluster.rb
 - examples/yahoo_search_cluster.rb
 - README
-test_files:
-- tests/clusterer_test.rb
+test_files: []
 rdoc_options: []
 extra_rdoc_files:

data/lib/similarity.rb DELETED

@@ -1,27 +0,0 @@
-#The MIT License
-###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
-module Clusterer
-  module Similarity
-    # Cosine similarity of two hashes: dot product over cluster1's String keys divided by the product of both norms; returns a Float.
-    def Similarity.vector_similarity(cluster1, cluster2) # side effect: stores the sum of squares under :total in both argument hashes
-      similarity = 0
-      total = 0
-      cluster1.each do |w,value|
-        next unless w.class == String # skip non-String keys such as the cached :total entry
-        total += (value*value) unless cluster1[:total] # accumulate the squared norm only when it is not cached yet
-        similarity += (value * (cluster2[w] || 0)) # keys missing from cluster2 contribute 0 to the dot product
-      end
-      cluster1[:total] = total unless cluster1[:total] # cache the squared norm for later calls
-      unless cluster2[:total]
-        total = 0
-        cluster2.each_value {|v| total += (v*v) } # NOTE(review): sums every value, whereas the cluster1 loop skips non-String keys
-        total = 1 if total == 0 # guard against dividing by zero
-        cluster2[:total] = total
-      end
-      cluster1[:total] = 1 if cluster1[:total] == 0 # same zero guard for cluster1; overwrites a cached 0
-      similarity /= Math.sqrt(cluster1[:total] * cluster2[:total]).to_f # last expression, so this is the return value
-    end
-  end
-end

data/tests/clusterer_test.rb DELETED

@@ -1,20 +0,0 @@
-#The MIT License
-###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-require 'test/unit'
-require 'clusterer'
-class TestClusterer < Test::Unit::TestCase # smoke tests for the two Clusterer::Clustering entry points
-  def test_simple_kmeans # a non-empty result by default, and a result of size 2 when 2 is passed
-    assert_not_equal [], Clusterer::Clustering.kmeans_clustering(["hello world","mea culpa","goodbye world"])
-    assert_equal 2, Clusterer::Clustering.kmeans_clustering(["hello world","mea culpa","goodbye world"],2).size # second argument is presumably the requested cluster count -- confirm
-  end
-  def test_simple_hierarchical_clustering # same two checks against hierarchical_clustering
-    assert_not_equal [], Clusterer::Clustering.hierarchical_clustering(["hello world","mea culpa","goodbye world"])
-    assert_equal 2, Clusterer::Clustering.hierarchical_clustering(["hello world","mea culpa","goodbye world"],2).size # second argument is presumably the requested cluster count -- confirm
-  end
-end