clusterer 0.1.0 → 0.1.9
Sign up to get free protection for your applications and to get access to all the features.
- data/README +29 -7
- data/examples/google_search_cluster.rb +13 -7
- data/examples/yahoo_search_cluster.rb +18 -31
- data/lib/clusterer.rb +36 -95
- data/lib/clusterer/algorithms.rb +95 -0
- data/lib/clusterer/bayes.rb +255 -0
- data/lib/clusterer/cluster.rb +56 -0
- data/lib/clusterer/clustering.rb +35 -0
- data/lib/clusterer/document.rb +71 -0
- data/lib/clusterer/document_array.rb +79 -0
- data/lib/clusterer/document_base.rb +32 -0
- data/lib/clusterer/documents_centroid.rb +44 -0
- data/lib/clusterer/inverse_document_frequency.rb +83 -0
- data/lib/clusterer/lsi/dmatrix.rb +132 -0
- data/lib/clusterer/lsi/document_vector.rb +54 -0
- data/lib/clusterer/lsi/documents_centroid_vector.rb +51 -0
- data/lib/clusterer/lsi/lsi.rb +95 -0
- data/lib/clusterer/similarity.rb +34 -0
- data/lib/{word_hash.rb → clusterer/stop_words.rb} +21 -23
- data/lib/clusterer/tokenizer.rb +70 -0
- data/tests/algorithms_test.rb +48 -0
- data/tests/bayes_test.rb +68 -0
- data/tests/cluster_test.rb +54 -0
- data/tests/document_array_test.rb +64 -0
- data/tests/document_centroid_test.rb +64 -0
- data/tests/document_test.rb +71 -0
- data/tests/inverse_document_frequency_test.rb +76 -0
- data/tests/lsi_test.rb +77 -0
- data/tests/similarity_test.rb +62 -0
- data/tests/tokenizer_test.rb +72 -0
- metadata +35 -9
- data/lib/similarity.rb +0 -27
- data/tests/clusterer_test.rb +0 -20
@@ -0,0 +1,54 @@
|
|
1
|
+
#The MIT License
|
2
|
+
|
3
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
|
+
|
5
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
6
|
+
|
7
|
+
require 'test/unit'
|
8
|
+
require 'clusterer'
|
9
|
+
|
10
|
+
# Unit tests for Cluster: centroid access, merge!, +, equality and
# intra-cluster cosine similarity.
class TestCluster < Test::Unit::TestCase
  include Clusterer

  # Creates three documents that share one InverseDocumentFrequency instance
  # (each normalized against it), one document built without an IDF, and two
  # clusters over those documents.
  def setup
    @idf = InverseDocumentFrequency.new
    build_normalized = lambda do |text|
      Document.new(text, :idf => @idf).normalize!(@idf)
    end

    @d = build_normalized.call("hello world, mea culpa, goodbye world.")
    @e = build_normalized.call("the world is not a bad place to live.")
    @f = build_normalized.call("the world is a crazy place to live.")
    @g = Document.new("unique document.")

    @c1 = Cluster.new([@d, @e, @f])
    @c2 = Cluster.new([@d, @g])
  end

  # A populated cluster exposes a centroid; an empty cluster has none.
  def test_centroid
    assert(@c1.centroid)
    assert_nil(Cluster.new.centroid)
  end

  # After merge! the cached @intra_cluster_similarity is expected to be nil,
  # and merging into an empty cluster yields the other cluster's centroid.
  def test_merge!
    @c1.merge!(@c2)
    assert_nil(@c1.instance_variable_get("@intra_cluster_similarity"))

    empty_cluster = Cluster.new
    empty_cluster.merge!(@c2)
    assert_equal(empty_cluster.centroid, @c2.centroid)
  end

  # The sum of two clusters differs from both operands; adding a cluster to
  # an empty cluster compares equal to that cluster.
  def test_add
    combined = @c1 + @c2
    assert_not_equal(combined, @c1)
    assert_not_equal(combined, @c2)

    sum_with_empty = Cluster.new + @c1
    assert_equal(sum_with_empty, @c1)
  end

  # Equality is checked against a different cluster, nil, and a freshly built
  # cluster over the same documents.
  def test_equal
    assert_not_equal(@c1, @c2)
    assert_not_equal(@c1, nil)

    same_documents = Cluster.new([@d, @e, @f])
    assert_equal(@c1, same_documents)
  end

  # Two empty clusters report the same value, a populated cluster reports a
  # positive value, and the two fixture clusters report different values.
  def test_intra_cluster_cosine_similarity
    first_empty = Cluster.new.intra_cluster_cosine_similarity
    second_empty = Cluster.new.intra_cluster_cosine_similarity
    assert_equal(first_empty, second_empty)

    assert(@c1.intra_cluster_cosine_similarity > 0)
    assert_not_equal(@c1.intra_cluster_cosine_similarity,
                     @c2.intra_cluster_cosine_similarity)
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
24
|
+
|
25
|
+
require 'test/unit'
|
26
|
+
require 'clusterer'
|
27
|
+
|
28
|
+
# Unit tests for DocumentArray: term insertion, position mapping, vector
# length, the wrapped object and normalization.
class DocumentArrayTest < Test::Unit::TestCase
  include Clusterer

  # @d is left un-normalized, @e and @f are normalized against the shared
  # IDF, and @g is created without any IDF.
  def setup
    @idf = InverseDocumentFrequency.new
    @d = DocumentArray.new("hello world, mea culpa, goodbye world.", :idf => @idf)

    not_bad = DocumentArray.new("the world is not a bad place to live.", :idf => @idf)
    @e = not_bad.normalize!(@idf)

    crazy = DocumentArray.new("the world is a crazy place to live.", :idf => @idf)
    @f = crazy.normalize!(@idf)

    @g = DocumentArray.new("unique document.")
  end

  # A term that was never inserted has no entry; each insertion afterwards
  # raises the value stored at its mapped position by one.
  def test_insert
    position = @d.term_array_position_mapper('weird')
    assert_nil(@d[position])

    @d << "weird"
    assert(@d[position])

    expected = @d[position] + 1
    @d << "weird"
    assert_equal(expected, @d[position])
  end

  # Smoke test: only checks that the mapper can be called for a known term.
  def test_term_array_position_mapper
    @d.term_array_position_mapper("world")
  end

  # A normalized document reports a vector length of roughly 1.0.
  def test_vector_length
    assert_not_nil(@f.vector_length)
    assert_in_delta(1.0, @f.vector_length, 0.01)
  end

  # The document keeps the object it was built from.
  def test_object
    assert_equal("unique document.", @g.object)
  end

  # normalize! without arguments brings the vector length to roughly 1.0.
  def test_normalize!
    @d.normalize!
    assert_in_delta(1.0, @d.vector_length, 0.01)
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
|
24
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
25
|
+
|
26
|
+
require 'test/unit'
|
27
|
+
require 'clusterer'
|
28
|
+
|
29
|
+
# Unit tests for DocumentsCentroid: construction from documents and merge!.
class DocumentCentroidTest < Test::Unit::TestCase
  include Clusterer

  # Creates three documents normalized against a shared IDF, one document
  # without an IDF, and two clusters built from them.
  def setup
    @idf = InverseDocumentFrequency.new()
    @d = Document.new("hello world, mea culpa, goodbye world.", :idf => @idf).normalize!(@idf)
    @e = Document.new("the world is not a bad place to live.", :idf => @idf).normalize!(@idf)
    @f = Document.new("the world is a crazy place to live.", :idf => @idf).normalize!(@idf)
    @g = Document.new("unique document.")
    @c1 = Clusterer::Cluster.new([@d, @e, @f])
    @c2 = Clusterer::Cluster.new([@d, @g])
  end

  # A centroid built from three documents counts three documents and has a
  # vector length.
  def test_initialization
    c = DocumentsCentroid.new([@d, @e, @f])
    # assert_equal, not assert: `assert 3, x` treats 3 as the (always truthy)
    # value under test and x as the failure message, so it verified nothing.
    assert_equal 3, c.no_of_documents
    assert c.vector_length
  end

  # Merging adds the other centroid's document count, leaves the merged-in
  # centroid unchanged, and changes the vector length; merging an empty
  # centroid changes neither the count nor the vector length.
  def test_merge!
    c1 = DocumentsCentroid.new([@d, @e, @f])
    c2 = DocumentsCentroid.new([@d, @g])
    c3 = c2.clone
    t = c1.vector_length
    c1.merge!(c2)
    assert_equal 5, c1.no_of_documents
    assert_equal c2, c3
    assert_not_equal t, c1.vector_length

    c4 = DocumentsCentroid.new()
    t = c1.vector_length
    c1.merge!(c4)
    assert_equal 5, c1.no_of_documents
    assert_equal t, c1.vector_length
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
24
|
+
|
25
|
+
require 'test/unit'
|
26
|
+
require 'clusterer'
|
27
|
+
|
28
|
+
# Unit tests for Document: cosine similarity, centroid class lookup, term
# insertion, vector length, the wrapped object and normalization.
class DocumentTest < Test::Unit::TestCase
  include Clusterer

  # @d is built with an IDF and a block and left un-normalized; @e is built
  # with the IDF and normalized against it; @f is built without an IDF and
  # normalized with (@idf, true); @g is built from an array of strings.
  def setup
    @idf = InverseDocumentFrequency.new
    @d = Document.new("hello world, mea culpa, goodbye world.", :idf => @idf) { |x| x.to_s }

    with_idf = Document.new("the world is not a bad place to live.", :idf => @idf)
    @e = with_idf.normalize!(@idf)

    without_idf = Document.new("the world is a crazy place to live.")
    @f = without_idf.normalize!(@idf, true)

    @g = Document.new(["unique ","document."])
  end

  # A document is almost perfectly similar to itself and almost orthogonal to
  # an unrelated one.
  def test_cosine_similarity
    self_similarity = @d.cosine_similarity(@d)
    assert(self_similarity > 0.99) # almost 1

    unrelated_similarity = @f.cosine_similarity(@g)
    assert(unrelated_similarity < 0.0001) # almost 0
  end

  # The document's class names DocumentsCentroid as its centroid class.
  def test_centroid_class
    assert_equal(DocumentsCentroid, @d.class.centroid_class)
  end

  # Inserting a new term grows the document by one entry; inserting it again
  # raises the stored value by one.
  def test_add
    assert_nil(@d['weird'])

    expected_length = @d.length + 1
    @d << "weird"
    assert_equal(expected_length, @d.length)
    assert(@d['weird'])

    expected_value = @d['weird'] + 1
    @d << "weird"
    assert_equal(expected_value, @d['weird'])
  end

  # A normalized document reports a vector length of roughly 1.0.
  def test_vector_length
    assert_not_nil(@f.vector_length)
    assert_in_delta(1.0, @f.vector_length, 0.01)
  end

  # The document keeps the object (array or string) it was built from.
  def test_object
    assert_equal(["unique ","document."], @g.object)
    assert_equal("the world is a crazy place to live.", @f.object)
  end

  # normalize! without arguments brings the vector length to roughly 1.0.
  def test_normalize!
    @d.normalize!
    assert_in_delta(1.0, @d.vector_length, 0.01)
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
|
24
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
25
|
+
|
26
|
+
require 'test/unit'
|
27
|
+
require 'clusterer'
|
28
|
+
|
29
|
+
# Unit tests for InverseDocumentFrequency: term insertion, document counting,
# clearing the cached normalizing factor and term lookup via [].
class InverseDocumentFrequencyTest < Test::Unit::TestCase
  include Clusterer

  # Inserting a term creates one entry in @terms_count; repeating it bumps
  # the entry's count rather than adding another entry.
  def test_insertion
    idf = InverseDocumentFrequency.new
    terms_count = lambda { idf.instance_variable_get("@terms_count") }

    idf << "hello"
    assert_equal(1, terms_count.call.size)
    assert_in_delta(1.0, terms_count.call["hello"], 0.01)

    idf << "hello"
    assert_equal(2.0, terms_count.call["hello"])
    assert_equal(1, terms_count.call.size)

    idf << "world"
    assert_equal(2, terms_count.call.size)
  end

  # Each document constructed with the IDF is counted once.
  def test_documents_count
    idf = InverseDocumentFrequency.new
    [
      "the world is not a bad place to live.",
      "the world is a crazy place to live."
    ].each { |text| Document.new(text, :idf => idf) }

    assert_equal(2, idf.documents_count)
  end

  # A looked-up value stays the same after another document is added, until
  # clean_cached_normalizing_factor is called.
  def test_clean_cached_normalizing_factor
    idf = InverseDocumentFrequency.new
    Document.new("the world is not a bad place to live.", :idf => idf)
    Document.new("hello, the world is a crazy place to live.", :idf => idf)

    term = "crazy".stem
    cached = idf[term]
    assert_in_delta(Math.log(2/1), cached, 0.1)

    Document.new("the world is a weird place to live.", :idf => idf)
    assert_equal(cached, idf[term])

    idf.clean_cached_normalizing_factor
    assert_not_equal(cached, idf[term])
  end

  # With a single document both a present and an absent term look up as
  # roughly 1.0; after a second document, a cache reset and one more
  # insertion of "hello", its value falls below 0.99.
  def test_array_index
    idf = InverseDocumentFrequency.new
    Document.new("the world is not a bad place to live.", :idf => idf)
    assert_in_delta(1.0, idf["world"], 0.001)
    assert_in_delta(1.0, idf["hello"], 0.001)

    Document.new("hello, the world is a crazy place to live.", :idf => idf)
    idf.clean_cached_normalizing_factor
    idf << "hello"
    assert(idf["hello"] < 0.99)
  end
end
|
data/tests/lsi_test.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
24
|
+
|
25
|
+
require 'test/unit'
|
26
|
+
require 'clusterer'
|
27
|
+
|
28
|
+
# Unit tests for Lsi: construction, SVD, adding documents, clustering and
# search.
class LsiTest < Test::Unit::TestCase
  include Clusterer

  # Builds three DocumentArrays sharing one IDF and one without an IDF, then
  # normalizes all four with the no-argument normalize!.
  def setup
    @idf = InverseDocumentFrequency.new
    @d = DocumentArray.new("hello world, mea culpa, goodbye world.", :idf => @idf)
    @e = DocumentArray.new("the world is not a bad place to live.", :idf => @idf)
    @f = DocumentArray.new("the world is a crazy place to live.", :idf => @idf)
    @g = DocumentArray.new("unique document.")

    [@d, @e, @f, @g].each do |document|
      document.normalize!
    end
  end

  # Smoke test: construction from a list of documents must not raise.
  def test_initialize
    l = Lsi.new([@d, @e, @f, @g])
  end

  # After perform_svd the @t, @d and @s instance variables are set and their
  # product t * s * d is truthy, for two document sets and cut-off values.
  def test_perform_svd
    cases = [
      [[@d, @e, @f, @g], 1.0],
      [[@d, @e, @f], 0.1]
    ]

    cases.each do |documents, cutoff|
      lsi = Lsi.new(documents)
      lsi.perform_svd(cutoff)

      t = lsi.instance_variable_get("@t")
      assert(t)
      d = lsi.instance_variable_get("@d")
      assert(d)
      s = lsi.instance_variable_get("@s")
      assert(s)
      assert(t * s * d)
    end
  end

  # Smoke test: appending a document and then running the SVD must not raise.
  def test_add_document
    lsi = Lsi.new([@d, @e, @f, @g])
    lsi << @d
    lsi.perform_svd(0.75)
  end

  # Requesting two clusters yields two clusters for the default algorithm as
  # well as for :hierarchical and :bisecting_kmeans.
  def test_cluster_documents
    lsi = Lsi.new([@d, @e, @f, @g])

    # Prints the clustered source objects (debug output kept from the original).
    clustered_objects = lsi.cluster_documents(2).collect do |cluster|
      cluster.collect { |document| document.object }
    end
    puts clustered_objects.inspect

    assert_equal(2, lsi.cluster_documents(2).size)
    assert_equal(2, lsi.cluster_documents(2, :algorithm => :hierarchical).size)
    assert_equal(2, lsi.cluster_documents(2, :algorithm => :bisecting_kmeans).size)
  end

  # Searching with an indexed document returns at least one result.
  def test_search
    lsi = Lsi.new([@d, @e, @f, @g])
    results = lsi.search(@f)
    assert(results.size >= 1)
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
24
|
+
|
25
|
+
require 'test/unit'
|
26
|
+
require 'clusterer'
|
27
|
+
|
28
|
+
# Unit tests for the similarity measures: document cosine similarity and the
# cluster-level intra_cluster_similarity, centroid_similarity and upgma.
class TestSimilarity < Test::Unit::TestCase
  include Clusterer

  # Documents are built without an IDF; @e and @f are normalized, @d and @g
  # are not. @c2 holds the same document twice.
  def setup
    @d = Document.new("hello world, mea culpa, goodbye world.")

    not_bad = Document.new("the world is not a bad place to live.")
    @e = not_bad.normalize!

    crazy = Document.new("the world is a crazy place to live.")
    @f = crazy.normalize!

    @g = Document.new("unique document.")

    @c1 = Clusterer::Cluster.new([@d, @e, @f])
    @c2 = Clusterer::Cluster.new([@g, @g])
  end

  # Self-similarity is roughly 1, unrelated documents are roughly 0, and the
  # two near-identical sentences score above 0.5.
  def test_cosine_similarity
    assert_in_delta(1.0, @d.cosine_similarity(@d), 0.01)
    assert_in_delta(1.0, @e.cosine_similarity(@e), 0.01)
    assert_in_delta(0.0, @f.cosine_similarity(@g), 0.01)

    near_duplicates = @e.cosine_similarity(@f)
    assert(near_duplicates > 0.5) # very similar
  end

  # The measure is negative against the other cluster and roughly zero
  # against the cluster itself.
  def test_intra_cluster_similarity
    against_other = @c1.intra_cluster_similarity(@c2)
    assert(against_other < 0)
    assert_in_delta(0.0, @c1.intra_cluster_similarity(@c1), 0.01)
  end

  # Centroid similarity is roughly 0 against the other cluster and roughly 1
  # against the cluster itself.
  def test_centroid_similarity
    assert_in_delta(0.0, @c1.centroid_similarity(@c2), 0.01)
    assert_in_delta(1.0, @c1.centroid_similarity(@c1), 0.01)
  end

  # upgma is roughly 0 against the other cluster and above 0.5 against the
  # cluster itself.
  def test_upgma
    assert_in_delta(0, @c1.upgma(@c2), 0.01)

    against_self = @c1.upgma(@c1)
    assert(against_self > 0.5)
  end
end
|