clusterer 0.1.0 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,54 @@
1
+ #The MIT License
2
+
3
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
+
5
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
6
+
7
+ require 'test/unit'
8
+ require 'clusterer'
9
+
10
# Unit tests for Clusterer::Cluster: centroid access, in-place merging,
# addition, equality and intra-cluster cosine similarity.
class TestCluster < Test::Unit::TestCase
  include Clusterer

  # Creates three documents normalized against a shared IDF, one plain
  # document, and two clusters that both contain @d.
  def setup
    @idf = InverseDocumentFrequency.new
    @d = normalized_document("hello world, mea culpa, goodbye world.")
    @e = normalized_document("the world is not a bad place to live.")
    @f = normalized_document("the world is a crazy place to live.")
    @g = Document.new("unique document.")
    @c1 = Cluster.new([@d, @e, @f])
    @c2 = Cluster.new([@d, @g])
  end

  # A populated cluster exposes a centroid; an empty one reports nil.
  def test_centroid
    assert @c1.centroid

    empty_cluster = Cluster.new
    assert_nil empty_cluster.centroid
  end

  # After merge! the @intra_cluster_similarity ivar is nil, and merging into
  # an empty cluster yields a centroid equal to the merged cluster's.
  def test_merge!
    @c1.merge!(@c2)
    similarity_ivar = @c1.instance_variable_get("@intra_cluster_similarity")
    assert_nil similarity_ivar

    fresh = Cluster.new
    fresh.merge!(@c2)
    assert_equal fresh.centroid, @c2.centroid
  end

  # The sum of two clusters differs from both operands; an empty cluster
  # plus @c1 equals @c1.
  def test_add
    combined = @c1 + @c2
    assert_not_equal combined, @c1
    assert_not_equal combined, @c2

    sum_with_empty = Cluster.new + @c1
    assert_equal sum_with_empty, @c1
  end

  # Clusters built from the same documents compare equal; a cluster is not
  # equal to a different cluster or to nil.
  def test_equal
    assert_not_equal @c1, @c2
    assert_not_equal @c1, nil

    rebuilt = Cluster.new([@d, @e, @f])
    assert_equal @c1, rebuilt
  end

  # Two empty clusters report the same similarity, @c1's similarity is
  # positive, and @c1 and @c2 differ.
  def test_intra_cluster_cosine_similarity
    first_empty = Cluster.new.intra_cluster_cosine_similarity
    second_empty = Cluster.new.intra_cluster_cosine_similarity
    assert_equal first_empty, second_empty

    assert @c1.intra_cluster_cosine_similarity > 0
    assert_not_equal @c1.intra_cluster_cosine_similarity,
                     @c2.intra_cluster_cosine_similarity
  end

  private

  # Builds a Document registered with @idf and normalizes it against @idf.
  def normalized_document(text)
    Document.new(text, :idf => @idf).normalize!(@idf)
  end
end
@@ -0,0 +1,64 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
24
+
25
+ require 'test/unit'
26
+ require 'clusterer'
27
+
28
# Unit tests for Clusterer::DocumentArray: term insertion, position mapping,
# vector length, the wrapped object and normalization.
class DocumentArrayTest < Test::Unit::TestCase
  include Clusterer

  # @d is left un-normalized so the tests can mutate and normalize it;
  # @e and @f are normalized against the shared IDF; @g uses no IDF.
  def setup
    @idf = InverseDocumentFrequency.new
    @d = DocumentArray.new("hello world, mea culpa, goodbye world.", :idf => @idf)
    @e = DocumentArray.new("the world is not a bad place to live.", :idf => @idf).normalize!(@idf)
    @f = DocumentArray.new("the world is a crazy place to live.", :idf => @idf).normalize!(@idf)
    @g = DocumentArray.new("unique document.")
  end

  # An unseen term has no entry; each insertion of it raises its entry by one.
  def test_insert
    position = @d.term_array_position_mapper('weird')
    assert_nil @d[position]

    @d << "weird"
    assert @d[position]

    # Capture the expected value before the second insertion.
    expected = @d[position] + 1
    @d << "weird"
    assert_equal expected, @d[position]
  end

  # Smoke test: the lookup must run without raising.
  def test_term_array_position_mapper
    @d.term_array_position_mapper("world")
  end

  # A normalized document has a vector length of about 1.
  def test_vector_length
    length = @f.vector_length
    assert_not_nil length
    assert_in_delta 1.0, @f.vector_length, 0.01
  end

  # The document keeps the object it was built from.
  def test_object
    assert_equal "unique document.", @g.object
  end

  # normalize! without arguments brings the vector length to about 1.
  def test_normalize!
    @d.normalize!
    assert_in_delta 1.0, @d.vector_length, 0.01
  end
end
@@ -0,0 +1,64 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+
24
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
25
+
26
+ require 'test/unit'
27
+ require 'clusterer'
28
+
29
# Unit tests for Clusterer::DocumentsCentroid: construction from a list of
# documents and in-place merging of centroids.
class DocumentCentroidTest < Test::Unit::TestCase
  include Clusterer

  # Builds three IDF-normalized documents, one plain document and two
  # clusters over them.
  def setup
    @idf = InverseDocumentFrequency.new()
    @d = Document.new("hello world, mea culpa, goodbye world.", :idf => @idf).normalize!(@idf)
    @e = Document.new("the world is not a bad place to live.", :idf => @idf).normalize!(@idf)
    @f = Document.new("the world is a crazy place to live.", :idf => @idf).normalize!(@idf)
    @g = Document.new("unique document.")
    @c1 = Clusterer::Cluster.new([@d, @e, @f])
    @c2 = Clusterer::Cluster.new([@d, @g])
  end

  # A centroid built from three documents reports three documents and has a
  # vector length.
  def test_initialization
    c = DocumentsCentroid.new([@d, @e, @f])
    # assert_equal, not assert: `assert 3, x` treats 3 as the (always truthy)
    # value under test and x as the failure message, so it verified nothing.
    assert_equal 3, c.no_of_documents
    assert c.vector_length
  end

  # Merging adds the other centroid's documents and changes the receiver's
  # vector length, leaves the argument untouched, and merging an empty
  # centroid changes nothing.
  def test_merge!
    c1 = DocumentsCentroid.new([@d, @e, @f])
    c2 = DocumentsCentroid.new([@d, @g])
    c3 = c2.clone # snapshot to prove merge! does not mutate its argument
    t = c1.vector_length
    c1.merge!(c2)
    assert_equal 5, c1.no_of_documents
    assert_equal c2, c3
    assert_not_equal t, c1.vector_length

    c4 = DocumentsCentroid.new()
    t = c1.vector_length
    c1.merge!(c4)
    assert_equal 5, c1.no_of_documents
    assert_equal t, c1.vector_length
  end
end
@@ -0,0 +1,71 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
24
+
25
+ require 'test/unit'
26
+ require 'clusterer'
27
+
28
# Unit tests for Clusterer::Document: cosine similarity, centroid class
# lookup, term insertion, vector length, the wrapped object and
# normalization.
class DocumentTest < Test::Unit::TestCase
  include Clusterer

  # @d is built with a block and left un-normalized; @e is normalized
  # against the shared IDF; @f is built without an IDF and normalized with
  # (@idf, true); @g wraps an array of strings.
  def setup
    @idf = InverseDocumentFrequency.new
    @d = Document.new("hello world, mea culpa, goodbye world.", :idf => @idf) { |term| term.to_s }
    @e = Document.new("the world is not a bad place to live.", :idf => @idf).normalize!(@idf)
    @f = Document.new("the world is a crazy place to live.").normalize!(@idf, true)
    @g = Document.new(["unique ", "document."])
  end

  # A document is almost identical to itself; @f and @g are almost orthogonal.
  def test_cosine_similarity
    self_similarity = @d.cosine_similarity(@d)
    assert self_similarity > 0.99 # almost 1

    cross_similarity = @f.cosine_similarity(@g)
    assert cross_similarity < 0.0001 # almost 0
  end

  # Document's class reports DocumentsCentroid as its centroid class.
  def test_centroid_class
    assert_equal DocumentsCentroid, @d.class.centroid_class
  end

  # A new term grows the document by one entry; inserting it again raises
  # that entry by one.
  def test_add
    assert_nil @d['weird']

    expected_length = @d.length + 1
    @d << "weird"
    assert_equal expected_length, @d.length
    assert @d['weird']

    # Capture the expected value before the second insertion.
    expected_value = @d['weird'] + 1
    @d << "weird"
    assert_equal expected_value, @d['weird']
  end

  # A normalized document has a vector length of about 1.
  def test_vector_length
    length = @f.vector_length
    assert_not_nil length
    assert_in_delta 1.0, @f.vector_length, 0.01
  end

  # The document keeps the object it was built from, array or string.
  def test_object
    assert_equal ["unique ", "document."], @g.object
    assert_equal "the world is a crazy place to live.", @f.object
  end

  # normalize! without arguments brings the vector length to about 1.
  def test_normalize!
    @d.normalize!
    assert_in_delta 1.0, @d.vector_length, 0.01
  end
end
@@ -0,0 +1,76 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+
24
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
25
+
26
+ require 'test/unit'
27
+ require 'clusterer'
28
+
29
# Unit tests for Clusterer::InverseDocumentFrequency: term insertion,
# document counting, cached factor invalidation and [] lookup.
class InverseDocumentFrequencyTest < Test::Unit::TestCase
  include Clusterer

  # Each inserted term is counted; re-inserting bumps its count without
  # adding a new entry.
  def test_insertion
    idf = InverseDocumentFrequency.new

    idf << "hello"
    assert_equal 1, terms_count_of(idf).size
    assert_in_delta 1.0, terms_count_of(idf)["hello"], 0.01

    idf << "hello"
    assert_equal 2.0, terms_count_of(idf)["hello"]
    assert_equal 1, terms_count_of(idf).size

    idf << "world"
    assert_equal 2, terms_count_of(idf).size
  end

  # Every Document created with the :idf option is counted.
  def test_documents_count
    idf = InverseDocumentFrequency.new
    [
      "the world is not a bad place to live.",
      "the world is a crazy place to live."
    ].each { |text| Document.new(text, :idf => idf) }

    assert_equal 2, idf.documents_count
  end

  # A looked-up factor stays the same after another document is added, and
  # changes only once clean_cached_normalizing_factor is called.
  def test_clean_cached_normalizing_factor
    idf = InverseDocumentFrequency.new
    Document.new("the world is not a bad place to live.", :idf => idf)
    Document.new("hello, the world is a crazy place to live.", :idf => idf)

    stemmed = "crazy".stem
    factor = idf[stemmed]
    assert_in_delta Math.log(2 / 1), factor, 0.1

    Document.new("the world is a weird place to live.", :idf => idf)
    assert_equal factor, idf[stemmed]

    idf.clean_cached_normalizing_factor
    assert_not_equal factor, idf[stemmed]
  end

  # With one document, both a term in it and a term absent from it look up
  # as about 1.0; "hello" drops below 0.99 after it appears in a second
  # document, the cache is cleaned, and it is inserted directly.
  def test_array_index
    idf = InverseDocumentFrequency.new
    Document.new("the world is not a bad place to live.", :idf => idf)
    assert_in_delta 1.0, idf["world"], 0.001
    assert_in_delta 1.0, idf["hello"], 0.001

    Document.new("hello, the world is a crazy place to live.", :idf => idf)
    idf.clean_cached_normalizing_factor
    idf << "hello"
    assert idf["hello"] < 0.99
  end

  private

  # Reads the @terms_count ivar from +idf+ (re-read on every call).
  def terms_count_of(idf)
    idf.instance_variable_get("@terms_count")
  end
end
@@ -0,0 +1,77 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
24
+
25
+ require 'test/unit'
26
+ require 'clusterer'
27
+
28
# Unit tests for Clusterer::Lsi: construction, SVD, adding documents,
# clustering and search.
class LsiTest < Test::Unit::TestCase
  include Clusterer

  # Builds four DocumentArray instances (three sharing one IDF, one without)
  # and normalizes each of them.
  def setup
    @idf = InverseDocumentFrequency.new()
    @d = DocumentArray.new("hello world, mea culpa, goodbye world.",:idf => @idf)
    @e = DocumentArray.new("the world is not a bad place to live.",:idf => @idf)
    @f = DocumentArray.new("the world is a crazy place to live.",:idf => @idf)
    @g = DocumentArray.new("unique document.")
    [@d, @e, @f, @g].each {|d| d.normalize! }
  end

  # Construction from a list of documents succeeds and yields an Lsi.
  # (Previously the result was stored in an unused local and never checked.)
  def test_initialize
    assert_kind_of Lsi, Lsi.new([@d, @e, @f, @g])
  end

  # perform_svd populates the @t, @s and @d factors for different arguments
  # and document sets.
  def test_perform_svd
    l = Lsi.new([@d, @e, @f, @g])
    l.perform_svd(1.0)
    assert_svd_factors l

    l = Lsi.new([@d, @e, @f])
    l.perform_svd(0.1)
    assert_svd_factors l
  end

  # Smoke test: a document can be appended and the SVD still runs.
  def test_add_document
    l = Lsi.new([@d, @e, @f, @g])
    l << @d
    l.perform_svd(0.75)
  end

  # Requesting two clusters yields two clusters for each call below.
  # The debug `puts` of the cluster contents was removed: it wrote to stdout
  # during the test run and triggered a redundant clustering pass.
  def test_cluster_documents
    l = Lsi.new([@d, @e, @f, @g])
    assert_equal 2, l.cluster_documents(2).size
    assert_equal 2, l.cluster_documents(2, :algorithm => :hierarchical).size
    assert_equal 2, l.cluster_documents(2, :algorithm => :bisecting_kmeans).size
  end

  # Searching with an indexed document returns at least one hit.
  def test_search
    l = Lsi.new([@d, @e, @f, @g])
    assert l.search(@f).size >= 1
  end

  private

  # Asserts that the @t, @d and @s ivars of +lsi+ are set and that the
  # product @t * @s * @d can be computed.
  def assert_svd_factors(lsi)
    assert lsi.instance_variable_get("@t")
    assert lsi.instance_variable_get("@d")
    assert lsi.instance_variable_get("@s")
    assert lsi.instance_variable_get("@t") * lsi.instance_variable_get("@s") * lsi.instance_variable_get("@d")
  end
end
@@ -0,0 +1,62 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
24
+
25
+ require 'test/unit'
26
+ require 'clusterer'
27
+
28
# Unit tests for the similarity measures on documents and clusters: cosine
# similarity, intra-cluster similarity, centroid similarity and UPGMA.
class TestSimilarity < Test::Unit::TestCase
  include Clusterer

  # Four documents built without a shared IDF (@e and @f normalized), and
  # two clusters: @c1 over @d, @e, @f and @c2 holding @g twice.
  def setup
    @d = Document.new("hello world, mea culpa, goodbye world.")
    @e = Document.new("the world is not a bad place to live.").normalize!
    @f = Document.new("the world is a crazy place to live.").normalize!
    @g = Document.new("unique document.")

    first_members = [@d, @e, @f]
    second_members = [@g, @g]
    @c1 = Clusterer::Cluster.new(first_members)
    @c2 = Clusterer::Cluster.new(second_members)
  end

  # A document compared with itself scores about 1, @f against @g about 0,
  # and @e against @f above 0.5.
  def test_cosine_similarity
    [@d, @e].each do |doc|
      assert_in_delta 1.0, doc.cosine_similarity(doc), 0.01
    end
    assert_in_delta 0.0, @f.cosine_similarity(@g), 0.01

    near_duplicate_score = @e.cosine_similarity(@f)
    assert near_duplicate_score > 0.5 # very similar
  end

  # @c1 against @c2 scores below zero; @c1 against itself scores about zero.
  def test_intra_cluster_similarity
    cross_score = @c1.intra_cluster_similarity(@c2)
    assert cross_score < 0
    assert_in_delta 0.0, @c1.intra_cluster_similarity(@c1), 0.01
  end

  # Centroid similarity is about 0 between @c1 and @c2 and about 1 for @c1
  # against itself.
  def test_centroid_similarity
    assert_in_delta 0.0, @c1.centroid_similarity(@c2), 0.01
    assert_in_delta 1.0, @c1.centroid_similarity(@c1), 0.01
  end

  # UPGMA is about 0 between @c1 and @c2 and above 0.5 for @c1 with itself.
  def test_upgma
    assert_in_delta 0, @c1.upgma(@c2), 0.01

    self_score = @c1.upgma(@c1)
    assert self_score > 0.5
  end
end