clusterer 0.1.0 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,54 @@
1
+ #The MIT License
2
+
3
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
+
5
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
6
+
7
+ require 'test/unit'
8
+ require 'clusterer'
9
+
10
# Exercises Clusterer::Cluster: centroid access, merge!, +, equality and
# intra_cluster_cosine_similarity.
class TestCluster < Test::Unit::TestCase
  include Clusterer

  # Fixture: three documents built against one shared IDF object and
  # normalized with it, one document built without an IDF, and two clusters
  # that both contain @d.
  def setup
    @idf = InverseDocumentFrequency.new
    @d, @e, @f = [
      "hello world, mea culpa, goodbye world.",
      "the world is not a bad place to live.",
      "the world is a crazy place to live."
    ].map { |text| Document.new(text, :idf => @idf).normalize!(@idf) }
    @g = Document.new("unique document.")
    @c1 = Cluster.new([@d, @e, @f])
    @c2 = Cluster.new([@d, @g])
  end

  # A populated cluster exposes a centroid; a cluster built with no
  # arguments reports nil.
  def test_centroid
    assert(@c1.centroid)
    empty_cluster = Cluster.new
    assert_nil(empty_cluster.centroid)
  end

  # After merge! the @intra_cluster_similarity instance variable is nil, and
  # merging into an argument-less cluster yields the other cluster's centroid.
  def test_merge!
    @c1.merge!(@c2)
    assert_nil(@c1.instance_variable_get("@intra_cluster_similarity"))

    target = Cluster.new
    target.merge!(@c2)
    assert_equal(target.centroid, @c2.centroid)
  end

  # The sum of two clusters differs from either operand, while adding a
  # cluster to an argument-less one compares equal to that cluster.
  def test_add
    combined = @c1 + @c2
    [@c1, @c2].each { |operand| assert_not_equal(combined, operand) }
    assert_equal((Cluster.new + @c1), @c1)
  end

  # Equality: different members and nil are unequal; a second cluster built
  # from the same documents is equal.
  def test_equal
    assert_not_equal(@c1, @c2)
    assert_not_equal(@c1, nil)
    rebuilt = Cluster.new([@d, @e, @f])
    assert_equal(@c1, rebuilt)
  end

  # Two argument-less clusters report the same similarity; @c1 reports a
  # positive value that differs from @c2's.
  def test_intra_cluster_cosine_similarity
    first_empty = Cluster.new.intra_cluster_cosine_similarity
    second_empty = Cluster.new.intra_cluster_cosine_similarity
    assert_equal(first_empty, second_empty)
    assert(@c1.intra_cluster_cosine_similarity > 0)
    assert_not_equal(@c1.intra_cluster_cosine_similarity,
                     @c2.intra_cluster_cosine_similarity)
  end
end
@@ -0,0 +1,64 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
24
+
25
+ require 'test/unit'
26
+ require 'clusterer'
27
+
28
# Exercises Clusterer::DocumentArray: term insertion, position mapping,
# vector length, the wrapped object and normalize!.
class DocumentArrayTest < Test::Unit::TestCase
  include Clusterer

  # Fixture: @d shares the IDF but is left un-normalized, @e and @f share the
  # IDF and are normalized with it, @g is built without an IDF.
  def setup
    @idf = InverseDocumentFrequency.new
    @d = DocumentArray.new("hello world, mea culpa, goodbye world.", :idf => @idf)
    @e, @f = [
      "the world is not a bad place to live.",
      "the world is a crazy place to live."
    ].map { |text| DocumentArray.new(text, :idf => @idf).normalize!(@idf) }
    @g = DocumentArray.new("unique document.")
  end

  # The slot for an unseen term starts out nil, becomes truthy after the
  # first insertion and grows by one on the next insertion.
  def test_insert
    position = @d.term_array_position_mapper('weird')
    assert_nil(@d[position])

    @d << "weird"
    assert(@d[position])

    expected = @d[position] + 1
    @d << "weird"
    assert_equal(expected, @d[position])
  end

  # Smoke test: only checks that the lookup can be called; the returned
  # value is not asserted.
  def test_term_array_position_mapper
    @d.term_array_position_mapper("world")
  end

  # A document normalized in setup has a vector length of about 1.
  def test_vector_length
    assert_not_nil(@f.vector_length)
    assert_in_delta(1.0, @f.vector_length, 0.01)
  end

  # #object returns the string the document was constructed from.
  def test_object
    assert_equal("unique document.", @g.object)
  end

  # normalize! called without arguments brings the vector length to about 1.
  def test_normalize!
    @d.normalize!
    assert_in_delta(1.0, @d.vector_length, 0.01)
  end
end
@@ -0,0 +1,64 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+
24
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
25
+
26
+ require 'test/unit'
27
+ require 'clusterer'
28
+
29
# Exercises Clusterer::DocumentsCentroid: construction from a list of
# documents and merge! with another centroid.
class DocumentCentroidTest < Test::Unit::TestCase
  include Clusterer

  # Fixture: three documents built against one shared IDF object and
  # normalized with it, one document built without an IDF, and two clusters
  # built from them (the clusters are constructed but not used by the tests
  # below).
  def setup
    @idf = InverseDocumentFrequency.new
    @d = Document.new("hello world, mea culpa, goodbye world.", :idf => @idf).normalize!(@idf)
    @e = Document.new("the world is not a bad place to live.", :idf => @idf).normalize!(@idf)
    @f = Document.new("the world is a crazy place to live.", :idf => @idf).normalize!(@idf)
    @g = Document.new("unique document.")
    @c1 = Clusterer::Cluster.new([@d, @e, @f])
    @c2 = Clusterer::Cluster.new([@d, @g])
  end

  # A centroid built from three documents counts three documents and has a
  # vector length.
  def test_initialization
    c = DocumentsCentroid.new([@d, @e, @f])
    # assert_equal, not assert: `assert 3, x` only tests that 3 is truthy and
    # treats x as the failure message, so the count was never verified.
    assert_equal 3, c.no_of_documents
    assert c.vector_length
  end

  # merge! adds the other centroid's documents to the receiver, leaves the
  # argument equal to a clone taken beforehand, and changes the receiver's
  # vector length; merging an argument-less centroid changes neither the
  # count nor the vector length.
  def test_merge!
    c1 = DocumentsCentroid.new([@d, @e, @f])
    c2 = DocumentsCentroid.new([@d, @g])
    c3 = c2.clone # snapshot used to check that merge! leaves c2 equal to it
    t = c1.vector_length
    c1.merge!(c2)
    assert_equal 5, c1.no_of_documents
    assert_equal c2, c3
    assert_not_equal t, c1.vector_length

    c4 = DocumentsCentroid.new
    t = c1.vector_length
    c1.merge!(c4)
    assert_equal 5, c1.no_of_documents
    assert_equal t, c1.vector_length
  end
end
@@ -0,0 +1,71 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
24
+
25
+ require 'test/unit'
26
+ require 'clusterer'
27
+
28
# Exercises Clusterer::Document: cosine similarity, centroid class lookup,
# term insertion, vector length, the wrapped object and normalize!.
class DocumentTest < Test::Unit::TestCase
  include Clusterer

  # Fixture: @d is built with the shared IDF and a block that calls to_s,
  # @e is built with the IDF and normalized with it, @f is built without an
  # IDF and normalized with (@idf, true), @g is built from an array of
  # strings.
  def setup
    @idf = InverseDocumentFrequency.new
    @d = Document.new("hello world, mea culpa, goodbye world.", :idf => @idf) { |item| item.to_s }
    @e = Document.new("the world is not a bad place to live.", :idf => @idf).normalize!(@idf)
    @f = Document.new("the world is a crazy place to live.").normalize!(@idf, true)
    @g = Document.new(["unique ", "document."])
  end

  # A document compared with itself scores almost 1; @f against @g scores
  # almost 0.
  def test_cosine_similarity
    self_similarity = @d.cosine_similarity(@d)
    assert(self_similarity > 0.99) # almost 1
    cross_similarity = @f.cosine_similarity(@g)
    assert(cross_similarity < 0.0001) # almost 0
  end

  # The document class reports DocumentsCentroid as its centroid class.
  def test_centroid_class
    assert_equal(DocumentsCentroid, @d.class.centroid_class)
  end

  # Inserting an unseen term grows the document by one entry; inserting it
  # again raises that entry's value by one.
  def test_add
    assert_nil(@d['weird'])

    expected_length = @d.length + 1
    @d << "weird"
    assert_equal(expected_length, @d.length)
    assert(@d['weird'])

    expected_value = @d['weird'] + 1
    @d << "weird"
    assert_equal(expected_value, @d['weird'])
  end

  # A document normalized in setup has a vector length of about 1.
  def test_vector_length
    assert_not_nil(@f.vector_length)
    assert_in_delta(1.0, @f.vector_length, 0.01)
  end

  # #object returns whatever the document was constructed from, array or
  # string.
  def test_object
    assert_equal(["unique ", "document."], @g.object)
    assert_equal("the world is a crazy place to live.", @f.object)
  end

  # normalize! called without arguments brings the vector length to about 1.
  def test_normalize!
    @d.normalize!
    assert_in_delta(1.0, @d.vector_length, 0.01)
  end
end
@@ -0,0 +1,76 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+
24
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
25
+
26
+ require 'test/unit'
27
+ require 'clusterer'
28
+
29
# Exercises Clusterer::InverseDocumentFrequency: term insertion, document
# counting, the cached value returned by [] and its invalidation.
class InverseDocumentFrequencyTest < Test::Unit::TestCase
  include Clusterer

  # Each << bumps the term's entry in @terms_count; a new term adds a new
  # entry.
  def test_insertion
    idf = InverseDocumentFrequency.new
    # Re-read the instance variable on every check, as each assertion should
    # see its current value.
    terms_count = lambda { idf.instance_variable_get("@terms_count") }

    idf << "hello"
    assert_equal(1, terms_count.call.size)
    assert_in_delta(1.0, terms_count.call["hello"], 0.01)

    idf << "hello"
    assert_equal(2.0, terms_count.call["hello"])
    assert_equal(1, terms_count.call.size)

    idf << "world"
    assert_equal(2, terms_count.call.size)
  end

  # Building two documents against the same IDF yields a documents_count
  # of 2.
  def test_documents_count
    idf = InverseDocumentFrequency.new
    [
      "the world is not a bad place to live.",
      "the world is a crazy place to live."
    ].each { |text| Document.new(text, :idf => idf) }
    assert_equal(2, idf.documents_count)
  end

  # The value read through [] stays the same when another document is added
  # and only changes after clean_cached_normalizing_factor is called.
  def test_clean_cached_normalizing_factor
    idf = InverseDocumentFrequency.new
    Document.new("the world is not a bad place to live.", :idf => idf)
    Document.new("hello, the world is a crazy place to live.", :idf => idf)

    stemmed = "crazy".stem
    cached = idf[stemmed]
    assert_in_delta(Math.log(2/1), cached, 0.1)

    Document.new("the world is a weird place to live.", :idf => idf)
    assert_equal(cached, idf[stemmed])

    idf.clean_cached_normalizing_factor
    assert_not_equal(cached, idf[stemmed])
  end

  # With a single document both "world" and "hello" read as about 1.0; after
  # a second document, a cache clean and an extra "hello" insertion, "hello"
  # reads below 0.99.
  def test_array_index
    idf = InverseDocumentFrequency.new
    Document.new("the world is not a bad place to live.", :idf => idf)
    %w[world hello].each { |term| assert_in_delta(1.0, idf[term], 0.001) }

    Document.new("hello, the world is a crazy place to live.", :idf => idf)
    idf.clean_cached_normalizing_factor
    idf << "hello"
    assert(idf["hello"] < 0.99)
  end
end
@@ -0,0 +1,77 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
24
+
25
+ require 'test/unit'
26
+ require 'clusterer'
27
+
28
# Exercises Clusterer::Lsi: construction, perform_svd, adding a document,
# clustering and search.
class LsiTest < Test::Unit::TestCase
  include Clusterer

  # Fixture: three DocumentArrays sharing one IDF object and one built
  # without an IDF; all four are then normalized with no arguments.
  def setup
    @idf = InverseDocumentFrequency.new
    @d, @e, @f = [
      "hello world, mea culpa, goodbye world.",
      "the world is not a bad place to live.",
      "the world is a crazy place to live."
    ].map { |text| DocumentArray.new(text, :idf => @idf) }
    @g = DocumentArray.new("unique document.")
    [@d, @e, @f, @g].each { |doc| doc.normalize! }
  end

  # Smoke test: construction from four documents completes.
  def test_initialize
    Lsi.new([@d, @e, @f, @g])
  end

  # After perform_svd the @t, @d and @s instance variables are set and their
  # product t * s * d is truthy, for both document sets / arguments tried.
  def test_perform_svd
    [
      [[@d, @e, @f, @g], 1.0],
      [[@d, @e, @f], 0.1]
    ].each do |documents, svd_argument|
      lsi = Lsi.new(documents)
      lsi.perform_svd(svd_argument)
      %w[@t @d @s].each { |ivar| assert(lsi.instance_variable_get(ivar)) }
      product = lsi.instance_variable_get("@t") *
                lsi.instance_variable_get("@s") *
                lsi.instance_variable_get("@d")
      assert(product)
    end
  end

  # Smoke test: a document can be appended and perform_svd still runs.
  def test_add_document
    lsi = Lsi.new([@d, @e, @f, @g])
    lsi << @d
    lsi.perform_svd(0.75)
  end

  # Asking for 2 clusters returns 2 with the default call and with the
  # :hierarchical and :bisecting_kmeans algorithms. The first clustering is
  # also printed, as the wrapped objects of each cluster's members.
  def test_cluster_documents
    lsi = Lsi.new([@d, @e, @f, @g])
    grouped_objects = lsi.cluster_documents(2).collect do |cluster|
      cluster.collect { |doc| doc.object }
    end
    puts grouped_objects.inspect
    assert_equal(2, lsi.cluster_documents(2).size)
    assert_equal(2, lsi.cluster_documents(2, :algorithm => :hierarchical).size)
    assert_equal(2, lsi.cluster_documents(2, :algorithm => :bisecting_kmeans).size)
  end

  # Searching with one of the indexed documents returns at least one result.
  def test_search
    lsi = Lsi.new([@d, @e, @f, @g])
    results = lsi.search(@f)
    assert(results.size >= 1)
  end
end
@@ -0,0 +1,62 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
24
+
25
+ require 'test/unit'
26
+ require 'clusterer'
27
+
28
# Exercises the similarity measures: Document#cosine_similarity and the
# cluster-level intra_cluster_similarity, centroid_similarity and upgma.
class TestSimilarity < Test::Unit::TestCase
  include Clusterer

  # Fixture: four documents built without an IDF (@e and @f are normalized
  # with no arguments, @d and @g are not), one cluster of @d/@e/@f and one
  # cluster holding @g twice.
  def setup
    @d = Document.new("hello world, mea culpa, goodbye world.")
    @e, @f = [
      "the world is not a bad place to live.",
      "the world is a crazy place to live."
    ].map { |text| Document.new(text).normalize! }
    @g = Document.new("unique document.")

    @c1 = Clusterer::Cluster.new([@d, @e, @f])
    @c2 = Clusterer::Cluster.new([@g, @g])
  end

  # A document against itself scores about 1, @f against @g about 0, and
  # @e against @f above 0.5.
  def test_cosine_similarity
    [@d, @e].each do |doc|
      assert_in_delta(1.0, doc.cosine_similarity(doc), 0.01)
    end
    assert_in_delta(0.0, @f.cosine_similarity(@g), 0.01)
    assert(@e.cosine_similarity(@f) > 0.5) # very similar
  end

  # @c1 against @c2 is negative; @c1 against itself is about 0.
  def test_intra_cluster_similarity
    against_other = @c1.intra_cluster_similarity(@c2)
    assert(against_other < 0)
    against_self = @c1.intra_cluster_similarity(@c1)
    assert_in_delta(0.0, against_self, 0.01)
  end

  # Centroid similarity is about 0 between @c1 and @c2 and about 1 for @c1
  # with itself.
  def test_centroid_similarity
    assert_in_delta(0.0, @c1.centroid_similarity(@c2), 0.01)
    assert_in_delta(1.0, @c1.centroid_similarity(@c1), 0.01)
  end

  # upgma is about 0 between @c1 and @c2 and above 0.5 for @c1 with itself.
  def test_upgma
    between_clusters = @c1.upgma(@c2)
    assert_in_delta(0, between_clusters, 0.01)
    with_self = @c1.upgma(@c1)
    assert(with_self > 0.5)
  end
end