clusterer 0.1.0 → 0.1.9
Sign up to get free protection for your applications and to get access to all the features.
- data/README +29 -7
- data/examples/google_search_cluster.rb +13 -7
- data/examples/yahoo_search_cluster.rb +18 -31
- data/lib/clusterer.rb +36 -95
- data/lib/clusterer/algorithms.rb +95 -0
- data/lib/clusterer/bayes.rb +255 -0
- data/lib/clusterer/cluster.rb +56 -0
- data/lib/clusterer/clustering.rb +35 -0
- data/lib/clusterer/document.rb +71 -0
- data/lib/clusterer/document_array.rb +79 -0
- data/lib/clusterer/document_base.rb +32 -0
- data/lib/clusterer/documents_centroid.rb +44 -0
- data/lib/clusterer/inverse_document_frequency.rb +83 -0
- data/lib/clusterer/lsi/dmatrix.rb +132 -0
- data/lib/clusterer/lsi/document_vector.rb +54 -0
- data/lib/clusterer/lsi/documents_centroid_vector.rb +51 -0
- data/lib/clusterer/lsi/lsi.rb +95 -0
- data/lib/clusterer/similarity.rb +34 -0
- data/lib/{word_hash.rb → clusterer/stop_words.rb} +21 -23
- data/lib/clusterer/tokenizer.rb +70 -0
- data/tests/algorithms_test.rb +48 -0
- data/tests/bayes_test.rb +68 -0
- data/tests/cluster_test.rb +54 -0
- data/tests/document_array_test.rb +64 -0
- data/tests/document_centroid_test.rb +64 -0
- data/tests/document_test.rb +71 -0
- data/tests/inverse_document_frequency_test.rb +76 -0
- data/tests/lsi_test.rb +77 -0
- data/tests/similarity_test.rb +62 -0
- data/tests/tokenizer_test.rb +72 -0
- metadata +35 -9
- data/lib/similarity.rb +0 -27
- data/tests/clusterer_test.rb +0 -20
@@ -0,0 +1,72 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
24
|
+
|
25
|
+
require 'test/unit'
|
26
|
+
require 'clusterer'
|
27
|
+
|
28
|
+
class TestSimilarity < Test::Unit::TestCase
|
29
|
+
include Clusterer::Tokenizer
|
30
|
+
def test_simple_tokenizer
|
31
|
+
x = []
|
32
|
+
simple_tokenizer("good! morrow!! the AB called") {|w| x << w}
|
33
|
+
assert_equal 3, x.size
|
34
|
+
assert_equal "morrow".stem, x[1]
|
35
|
+
assert_equal "call", x[2]
|
36
|
+
end
|
37
|
+
|
38
|
+
def test_simple_tokenizer_with_no_stemming
|
39
|
+
x = []
|
40
|
+
simple_tokenizer("good! morrow!! the AB called", :no_stem => true) {|w| x << w}
|
41
|
+
assert_equal 3, x.size
|
42
|
+
assert_equal "morrow", x[1]
|
43
|
+
assert_equal "called", x[2]
|
44
|
+
end
|
45
|
+
|
46
|
+
def test_simple_ngram_tokenizer_1
|
47
|
+
x = []
|
48
|
+
simple_ngram_tokenizer("Good! morrow!! the AB",1) {|w| x << w}
|
49
|
+
assert_equal 2, x.size
|
50
|
+
assert_equal "morrow".stem, x[1]
|
51
|
+
end
|
52
|
+
|
53
|
+
def test_simple_ngram_tokenizer
|
54
|
+
x = []
|
55
|
+
simple_ngram_tokenizer("The cow is a cool holy animal.",:ngram => 1) {|w| x << w}
|
56
|
+
assert_equal 4, x.size
|
57
|
+
x = []
|
58
|
+
simple_ngram_tokenizer("The cow is a cool holy animal.",:ngram => 2) {|w| x << w}
|
59
|
+
assert_equal 6, x.size
|
60
|
+
assert x.include?(["holy".stem, "animal".stem].join(" "))
|
61
|
+
x = []
|
62
|
+
simple_ngram_tokenizer("The cow is a cool holy animal.",:ngram => 3) {|w| x << w}
|
63
|
+
assert_equal 7, x.size
|
64
|
+
assert x.include?(["holy".stem, "animal".stem].join(" "))
|
65
|
+
assert x.include?(["cool".stem, "holy".stem, "animal".stem].join(" "))
|
66
|
+
x = []
|
67
|
+
simple_ngram_tokenizer("Ruby on Rails is cool.") {|w| x << w}
|
68
|
+
assert_equal 5, x.size
|
69
|
+
assert x.include?(["ruby".stem, "on".stem, "rails".stem].join(" "))
|
70
|
+
assert x.include?(["rails".stem, "is".stem, "cool".stem].join(" "))
|
71
|
+
end
|
72
|
+
end
|
metadata
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.
|
2
|
+
rubygems_version: 0.9.0
|
3
3
|
specification_version: 1
|
4
4
|
name: clusterer
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.1.0
|
7
|
-
date:
|
8
|
-
summary: A library of clustering algorithms for text data.
|
6
|
+
version: 0.1.9
|
7
|
+
date: 2007-03-22 00:00:00 +05:30
|
8
|
+
summary: A library of clustering and classification algorithms for text data.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
11
11
|
email: ssinghi@kreeti.com
|
@@ -25,18 +25,44 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
|
|
25
25
|
platform: ruby
|
26
26
|
signing_key:
|
27
27
|
cert_chain:
|
28
|
+
post_install_message:
|
28
29
|
authors:
|
29
30
|
- Surendra K Singhi
|
30
31
|
files:
|
31
|
-
- tests/clusterer_test.rb
|
32
|
+
- tests/similarity_test.rb
|
33
|
+
- tests/document_test.rb
|
34
|
+
- tests/cluster_test.rb
|
35
|
+
- tests/bayes_test.rb
|
36
|
+
- tests/document_centroid_test.rb
|
37
|
+
- tests/lsi_test.rb
|
38
|
+
- tests/algorithms_test.rb
|
39
|
+
- tests/tokenizer_test.rb
|
40
|
+
- tests/inverse_document_frequency_test.rb
|
41
|
+
- tests/document_array_test.rb
|
42
|
+
- lib/clusterer
|
32
43
|
- lib/clusterer.rb
|
33
|
-
- lib/
|
34
|
-
- lib/
|
44
|
+
- lib/clusterer/lsi
|
45
|
+
- lib/clusterer/clustering.rb
|
46
|
+
- lib/clusterer/document.rb
|
47
|
+
- lib/clusterer/stop_words.rb
|
48
|
+
- lib/clusterer/cluster.rb
|
49
|
+
- lib/clusterer/bayes.rb
|
50
|
+
- lib/clusterer/document_array.rb
|
51
|
+
- lib/clusterer/similarity.rb
|
52
|
+
- lib/clusterer/document_base.rb
|
53
|
+
- lib/clusterer/algorithms.rb
|
54
|
+
- lib/clusterer/documents_centroid.rb
|
55
|
+
- lib/clusterer/tokenizer.rb
|
56
|
+
- lib/clusterer/inverse_document_frequency.rb
|
57
|
+
- lib/clusterer/lsi/document_vector.rb
|
58
|
+
- lib/clusterer/lsi/lsi.rb
|
59
|
+
- lib/clusterer/lsi/dmatrix.rb
|
60
|
+
- lib/clusterer/lsi/documents_centroid_vector.rb
|
35
61
|
- examples/google_search_cluster.rb
|
36
62
|
- examples/yahoo_search_cluster.rb
|
37
63
|
- README
|
38
|
-
test_files:
|
39
|
-
|
64
|
+
test_files: []
|
65
|
+
|
40
66
|
rdoc_options: []
|
41
67
|
|
42
68
|
extra_rdoc_files:
|
data/lib/similarity.rb
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
#The MIT License
|
2
|
-
|
3
|
-
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
|
-
|
5
|
-
module Clusterer
|
6
|
-
module Similarity
|
7
|
-
#find similarity between two clusters, or two points
|
8
|
-
def Similarity.vector_similarity(cluster1, cluster2)
|
9
|
-
similarity = 0
|
10
|
-
total = 0
|
11
|
-
cluster1.each do |w,value|
|
12
|
-
next unless w.class == String
|
13
|
-
total += (value*value) unless cluster1[:total]
|
14
|
-
similarity += (value * (cluster2[w] || 0))
|
15
|
-
end
|
16
|
-
cluster1[:total] = total unless cluster1[:total]
|
17
|
-
unless cluster2[:total]
|
18
|
-
total = 0
|
19
|
-
cluster2.each_value {|v| total += (v*v) }
|
20
|
-
total = 1 if total == 0
|
21
|
-
cluster2[:total] = total
|
22
|
-
end
|
23
|
-
cluster1[:total] = 1 if cluster1[:total] == 0
|
24
|
-
similarity /= Math.sqrt(cluster1[:total] * cluster2[:total]).to_f
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
data/tests/clusterer_test.rb
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
#The MIT License
|
2
|
-
|
3
|
-
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
|
-
|
5
|
-
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
6
|
-
|
7
|
-
require 'test/unit'
|
8
|
-
require 'clusterer'
|
9
|
-
|
10
|
-
class TestClusterer < Test::Unit::TestCase
|
11
|
-
def test_simple_kmeans
|
12
|
-
assert_not_equal [], Clusterer::Clustering.kmeans_clustering(["hello world","mea culpa","goodbye world"])
|
13
|
-
assert_equal 2, Clusterer::Clustering.kmeans_clustering(["hello world","mea culpa","goodbye world"],2).size
|
14
|
-
end
|
15
|
-
|
16
|
-
def test_simple_hierarchical_clustering
|
17
|
-
assert_not_equal [], Clusterer::Clustering.hierarchical_clustering(["hello world","mea culpa","goodbye world"])
|
18
|
-
assert_equal 2, Clusterer::Clustering.hierarchical_clustering(["hello world","mea culpa","goodbye world"],2).size
|
19
|
-
end
|
20
|
-
end
|