clusterer 0.1.0 → 0.1.9
Sign up to get free protection for your applications and to get access to all the features.
- data/README +29 -7
- data/examples/google_search_cluster.rb +13 -7
- data/examples/yahoo_search_cluster.rb +18 -31
- data/lib/clusterer.rb +36 -95
- data/lib/clusterer/algorithms.rb +95 -0
- data/lib/clusterer/bayes.rb +255 -0
- data/lib/clusterer/cluster.rb +56 -0
- data/lib/clusterer/clustering.rb +35 -0
- data/lib/clusterer/document.rb +71 -0
- data/lib/clusterer/document_array.rb +79 -0
- data/lib/clusterer/document_base.rb +32 -0
- data/lib/clusterer/documents_centroid.rb +44 -0
- data/lib/clusterer/inverse_document_frequency.rb +83 -0
- data/lib/clusterer/lsi/dmatrix.rb +132 -0
- data/lib/clusterer/lsi/document_vector.rb +54 -0
- data/lib/clusterer/lsi/documents_centroid_vector.rb +51 -0
- data/lib/clusterer/lsi/lsi.rb +95 -0
- data/lib/clusterer/similarity.rb +34 -0
- data/lib/{word_hash.rb → clusterer/stop_words.rb} +21 -23
- data/lib/clusterer/tokenizer.rb +70 -0
- data/tests/algorithms_test.rb +48 -0
- data/tests/bayes_test.rb +68 -0
- data/tests/cluster_test.rb +54 -0
- data/tests/document_array_test.rb +64 -0
- data/tests/document_centroid_test.rb +64 -0
- data/tests/document_test.rb +71 -0
- data/tests/inverse_document_frequency_test.rb +76 -0
- data/tests/lsi_test.rb +77 -0
- data/tests/similarity_test.rb +62 -0
- data/tests/tokenizer_test.rb +72 -0
- metadata +35 -9
- data/lib/similarity.rb +0 -27
- data/tests/clusterer_test.rb +0 -20
@@ -0,0 +1,56 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # A Cluster groups a list of documents and lazily maintains their centroid.
  #
  # The centroid is built on first access by asking the class of the first
  # document for its +centroid_class+ and instantiating it with all documents.
  class Cluster
    include ClusterSimilarity

    # The documents held by this cluster (the array passed to +new+, not a copy).
    attr_reader :documents

    # docs:: array of documents that initially belong to the cluster.
    def initialize(docs = [])
      @documents = docs
    end

    # Returns the (memoized) centroid of the cluster, or nil when the cluster
    # holds no documents.
    def centroid
      @centroid ||= (@documents.empty? ? nil : @documents[0].class.centroid_class.new(documents))
    end

    # Appends the documents of +cluster+ to this cluster and brings the
    # centroid up to date. Returns nil.
    #
    # When this cluster's centroid has already been computed it is merged
    # incrementally with the other centroid. When it has not been computed yet
    # it is left unset, so that the next call to +centroid+ builds it from the
    # combined document list. (Previously the other cluster's centroid object
    # was adopted as-is, which ignored this cluster's own documents and shared
    # one centroid object between two clusters.)
    def merge!(cluster)
      other_centroid = cluster.centroid
      documents.concat(cluster.documents)
      # An empty cluster has a nil centroid; there is nothing to merge then.
      @centroid.merge!(other_centroid) if @centroid && other_centroid
      # Cached similarity is stale once the membership changes.
      @intra_cluster_similarity = nil
    end

    # Returns a new Cluster holding the documents of both clusters; neither
    # operand is modified.
    def +(cluster)
      combined = Cluster.new(documents.clone)
      combined.merge!(cluster)
      combined
    end

    # Two clusters are equal when their document lists are equal. Anything
    # that does not expose +documents+ (including nil) is not equal.
    def ==(cluster)
      cluster.respond_to?(:documents) && documents == cluster.documents
    end

    # Sum of the cosine similarity between every document and the centroid.
    # The value is cached until the next +merge!+.
    def intra_cluster_cosine_similarity
      @intra_cluster_similarity ||= documents.inject(0) { |sum, doc| sum + doc.cosine_similarity(centroid) }
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # Entry point that turns raw objects into documents and hands them to a
  # clustering algorithm.
  class Clustering
    class << self
      # Builds a Document for every object, normalizes the documents against a
      # shared InverseDocumentFrequency and dispatches to
      # <tt>Algorithms.<algorithm></tt>.
      #
      # algorithm:: name of the method to invoke on Algorithms.
      # objects::   collection of objects to cluster.
      # options::   passed on to Document.new; <tt>:no_of_clusters</tt>
      #             defaults to the integer square root of the number of
      #             objects. The caller's hash is never modified.
      #
      # An optional block is forwarded to Document.new for each object.
      # Returns whatever the selected algorithm returns.
      def cluster(algorithm, objects, options = { })
        idf = InverseDocumentFrequency.new
        # Work on a merged copy so the defaults do not leak into the caller's hash.
        settings = options.merge(:idf => idf)
        settings[:no_of_clusters] ||= Math.sqrt(objects.size).to_i

        docs = objects.collect do |object|
          if block_given?
            Document.new(object, settings) { |o| yield(o) }
          else
            Document.new(object, settings)
          end
        end

        Algorithms.send(algorithm, docs.collect { |d| d.normalize!(idf) }, settings[:no_of_clusters])
      end
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # A Document tokenizes a piece of text and keeps, per token, how many times
  # it occurred. The counts are stored in the hash the class inherits from.
  class Document < DocumentBase
    # The original text, or the object the text was derived from.
    attr_reader :object
    include(Tokenizer)

    # Centroid class that goes with this document type (used by Cluster).
    def self.centroid_class
      DocumentsCentroid
    end

    # object::  the thing to index. Its text is the result of the optional
    #           block called with the object, or <tt>object.to_s</tt>.
    # options:: <tt>:tokenizer</tt> names the tokenizer method to call
    #           (default <tt>:simple_tokenizer</tt>), <tt>:tokenizer_options</tt>
    #           is handed to it, and <tt>:idf</tt>, when present, is told about
    #           this document and each of its distinct terms.
    def initialize(object, options = { })
      @object = object
      tokenizer = options[:tokenizer] || :simple_tokenizer
      text = block_given? ? yield(object) : object.to_s
      send(tokenizer, text, options[:tokenizer_options] || {}) { |term| self << term }

      idf = options[:idf]
      return unless idf

      idf.increment_documents_count
      each_key { |term| idf << term }
    end

    # Counts one more occurrence of +term+ and returns the new count.
    def <<(term)
      occurrences = self[term] || 0
      self[term] = occurrences + 1
    end

    # Replaces every raw count by log(1 + count) times the idf factor of the
    # term (1.0 without an idf), scales the vector to unit length and freezes
    # the document. Returns self.
    #
    # idf::      object answering <tt>[]</tt> with a per-term factor.
    # add_term:: when true, the document and its terms are first registered
    #            with +idf+.
    def normalize!(idf = nil, add_term = false)
      idf.increment_documents_count if add_term
      sum_of_squares = 0.0

      each do |term, frequency|
        idf << term if add_term
        factor = idf ? idf[term] : 1.0
        weight = Math.log(1 + frequency) * factor
        self[term] = weight
        sum_of_squares += weight ** 2
      end

      norm = Math.sqrt(sum_of_squares)
      # An all-zero vector is left untouched instead of dividing by zero.
      norm = 1 if norm.zero?
      each { |term, weight| self[term] = weight / norm }
      @vector_length = 1
      freeze
    end
  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # Array-backed document used with LSI/SVD: element i holds the weight of the
  # term that the class-wide mapper assigned to position i.
  class DocumentArray < Array
    # The original text, or the object the text was derived from.
    attr_reader :object

    # term => array position, shared by every DocumentArray.
    @@term_array_position_mapper = {}
    include(Tokenizer)

    # object::  the thing to index. Its text is the result of the optional
    #           block called with the object, or <tt>object.to_s</tt>.
    # options:: <tt>:tokenizer</tt> (default <tt>:simple_tokenizer</tt>),
    #           <tt>:tokenizer_options</tt> and <tt>:idf</tt>.
    def initialize(object = "", options = { })
      @object = object
      # &nil keeps the caller's block away from Array#initialize, which would
      # otherwise use it to fill the array instead of the 0.0 default.
      super(@@term_array_position_mapper.size, 0.0, &nil)
      send(options[:tokenizer] || :simple_tokenizer,
           (block_given? ? yield(object) : object.to_s),
           options[:tokenizer_options] || {}) { |term| self << term }

      if (idf = options[:idf])
        idf.increment_documents_count
        terms = @@term_array_position_mapper.invert
        # each_with_index yields (element, index): register every term whose
        # count in this document is positive.
        each_with_index do |frequency, position|
          idf << terms[position] if frequency && frequency > 0.0
        end
      end
    end

    # Counts one more occurrence of +term+ and returns the new count.
    # Unlike Array#<< this does not append and does not return self.
    def <<(term)
      position = term_array_position_mapper(term)
      self[position] = (self[position] || 0) + 1
    end

    # Multiplies every count by the idf factor of its term (1.0 without an
    # idf), scales the vector to unit length and freezes it. Returns self.
    #
    # idf::      object answering <tt>[]</tt> with a per-term factor.
    # add_term:: when true, the document and its terms are first registered
    #            with +idf+.
    def normalize!(idf = nil, add_term = false)
      idf.increment_documents_count if add_term

      # Pad the vector up to the current vocabulary size; nothing to pad when
      # no term has been seen yet.
      last_position = @@term_array_position_mapper.size - 1
      self[last_position] ||= 0.0 if last_position >= 0

      terms = @@term_array_position_mapper.invert
      sum_of_squares = 0.0
      each_with_index do |frequency, position|
        term = terms[position]
        idf << term if add_term
        factor = idf ? idf[term] : 1.0
        # Padding can leave nil slots; treat them as a zero count.
        self[position] = (frequency || 0) * factor
        sum_of_squares += self[position] ** 2
      end

      norm = Math.sqrt(sum_of_squares)
      # An all-zero vector is left untouched instead of dividing by zero.
      norm = 1 if norm.zero?
      each_with_index { |weight, position| self[position] = weight / norm }
      @vector_length = 1.0
      freeze
    end

    # Euclidean length of the vector (memoized).
    def vector_length
      @vector_length ||= Math.sqrt(inject(0) { |sum, weight| sum + weight * weight })
    end

    # Returns the array position of +term+, assigning the next free position
    # the first time a term is seen.
    def term_array_position_mapper(term)
      @@term_array_position_mapper[term] ||= @@term_array_position_mapper.size
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # Hash-backed base class (term => weight) shared by documents and
  # centroids; not meant to be used directly.
  class DocumentBase < Hash
    include DocumentSimilarity

    # Euclidean length of the weight vector, computed once and then cached.
    def vector_length
      @vector_length ||= begin
        sum_of_squares = 0
        each_value { |weight| sum_of_squares += weight * weight }
        Math.sqrt(sum_of_squares)
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # The centroid (per-term average weight) of a set of documents.
  class DocumentsCentroid < DocumentBase
    # Number of documents the centroid currently represents.
    attr_reader :no_of_documents

    # docs:: the documents to average; each must yield (term, weight) pairs
    #        from +each+.
    def initialize(docs = [])
      @no_of_documents = docs.size
      # Divide as a float so integer weights are not truncated.
      count = @no_of_documents.to_f
      docs.each do |d|
        d.each { |w, f| self[w] = (self[w] || 0.0) + f / count }
      end
    end

    # Folds another centroid into this one, weighting both by the number of
    # documents they represent. A nil or empty centroid is ignored.
    # Returns self.
    def merge!(centroid)
      return self if centroid.nil? || centroid.no_of_documents.zero?

      @vector_length = nil # cached length is stale after the merge
      total = @no_of_documents + centroid.no_of_documents

      # Float division: with integers both shares would truncate to 0 and
      # wipe out every weight.
      own_share = @no_of_documents.to_f / total
      each { |w, v| self[w] = v * own_share }

      other_share = centroid.no_of_documents.to_f / total
      centroid.each { |w, v| self[w] = (self[w] || 0) + v * other_share }

      @no_of_documents = total
      self
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
|
24
|
+
# DocumentsCount holds the number of documents seen so far.
# A plain variable inside InverseDocumentFrequency would have been enough,
# but keeping the counter in its own class lets InverseDocumentFrequency be
# handed a different implementation, e.g. one backed by a DB or file store.
class DocumentsCount
  # Current number of documents.
  attr_reader :value

  # Starts counting at zero.
  def initialize
    @value = 0
  end

  # Adds one document and returns the new total.
  def increment
    @value = @value + 1
  end
end
|
39
|
+
|
40
|
+
# TermsCount maps a term to the number of documents in which it has been
# seen. A plain hash inside InverseDocumentFrequency would have been enough,
# but keeping it in its own class lets InverseDocumentFrequency be handed a
# different implementation, e.g. one backed by a DB or file store.
class TermsCount < Hash
  # Adds one to the count of +term+ (starting from 0 for an unseen term)
  # and returns the new count.
  def increment_count(term)
    seen_so_far = self[term] || 0
    self[term] = seen_so_far + 1
  end
end
|
50
|
+
|
51
|
+
# InverseDocumentFrequency tracks, through two helper objects, the total
# number of documents and the number of documents in which each term has been
# seen. From those it derives the per-term normalizing factor, which is
# Math.log(total number of documents / number of documents containing the term).
class InverseDocumentFrequency < Hash
  # options:: <tt>:terms_count</tt> and <tt>:documents_count</tt> replace the
  #           default TermsCount / DocumentsCount helpers.
  def initialize(options = { })
    @terms_count = options[:terms_count] || TermsCount.new
    @nf = Hash.new
    @documents_count = options[:documents_count] || DocumentsCount.new
  end

  # Total number of documents counted so far.
  def documents_count
    @documents_count.value
  end

  # Counts one more document.
  def increment_documents_count
    @documents_count.increment
  end

  # Records that +term+ occurred in one more document; nil and empty terms
  # are ignored.
  def <<(term)
    return if term.nil? || term.empty?

    @terms_count.increment_count(term)
  end

  # Normalizing factor for +term+. It is 1.0 when the term is unknown or when
  # no more than one document has been counted. The result is cached per term
  # until clean_cached_normalizing_factor is called.
  def [](term)
    cached = @nf[term]
    return cached if cached

    containing = @terms_count[term]
    @nf[term] = if containing && @documents_count.value > 1
                  Math.log(@documents_count.value / containing.to_f)
                else
                  1.0
                end
  end

  # Drops every cached normalizing factor so later lookups are recomputed
  # from the current counts.
  def clean_cached_normalizing_factor
    @nf.clear
  end
end
|
82
|
+
end
|
83
|
+
|
@@ -0,0 +1,132 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # Matrix subclass with in-place element assignment and a partial singular
  # value decomposition.
  class DMatrix < Matrix
    # Singular value decomposition by plane rotations.
    #
    # Algorithm description from "Simple Algorithms for the partial singular
    # value decomposition" by J. C. Nash and S. Shlien (plane rotation method).
    # There were some typos in the original algorithm in the paper; also see
    # the Pascal code in NashSVD, file alg01.pas, for an idea. The partial
    # algorithm is an adaptation of that algorithm.
    #
    # The receiver is overwritten in place while its columns are rotated.
    # Returns three DMatrix objects: u, the diagonal matrix z and the
    # transpose of the accumulated rotations v.
    #
    # NOTE(review): entries of z at positions >= nt (columns dropped as
    # negligible) are returned without the square root applied - confirm this
    # is intended.
    def svd
      m, n = self.row_size, self.column_size
      tol = 0.001
      slimit = [n/4.to_i, 6].max # upper bound on the number of sweeps
      u, z, v = DMatrix[*(1..m).to_a.collect {|i| Array.new(n,0) }], Array.new(n), DMatrix.diagonal(*Array.new(n,1))

      nt = n # number of columns still taking part in the rotations
      slimit.times do
        rcount = nt *(nt-1)/2 # column pairs that may still need a rotation
        (nt-1).times do |j|
          (j+1).upto(nt - 1) do |k|
            # p: inner product of columns j and k; q, r: their squared norms
            p=q=r=0
            m.times do |i|
              p += self[i,j]*self[i,k]
              q += self[i,j]*self[i,j]
              r += self[i,k]*self[i,k]
            end
            z[j], z[k] = q, r
            if q < r
              p, q = p/r, q/r - 1
              vt = Math.sqrt(4*p*p + q*q)
              s = Math.sqrt(0.5*(1 - q/vt))
              s = -s if p < 0
              c = p / (vt*s)
            elsif (q * r <= tol * tol) || (p/q)*(p/r) <= tol
              # columns are negligible or already close to orthogonal: no rotation
              rcount -= 1
              next
            else
              p, r = p/q, 1 - r/q
              vt = Math.sqrt(4*p*p + r*r)
              c = Math.sqrt(0.5*(1 + r/vt))
              s = p/(vt * c)
            end
            # apply the rotation (c, s) to columns j and k of the receiver ...
            m.times do |i|
              r = self[i,j]
              self[i,j] = c * r + s * self[i,k]
              self[i,k] = -s*r + c * self[i,k]
            end
            # ... and accumulate it in v
            n.times do |i|
              r = v[i,j]
              v[i,j] = c * r + s * v[i,k] #typo in paper replace r by s
              v[i,k] = -s*r + c * v[i,k] #typo in paper replace A(i,k) by v(i,k)
            end
          end
        end
        # drop trailing columns whose squared norm is negligible next to the first
        until nt < 3 || z[nt-1]/(z[0] + tol) > tol
          nt -= 1
        end
        break unless rcount > 0
      end
      nt.times do |j|
        # A single-column matrix never enters the pair loop above, so its
        # squared norm has not been stored yet.
        z[j] ||= (0...m).inject(0) {|sum, i| sum + self[i,j]*self[i,j] }
        z[j] = Math.sqrt(z[j])
        m.times {|i| u[i,j] = self[i,j]/z[j] }
      end
      z = DMatrix.diagonal(*z)
      return u, z, v.transpose
    end

    # Stores +val+ at row +i+, column +j+, modifying the matrix in place.
    def []=(i,j,val)
      @rows[i][j] = val
    end

    # Builds a DMatrix whose rows are the given vectors/arrays.
    def self.join_rows(rows)
      DMatrix[*rows.collect {|r| [*r] }]
    end

    # Returns the transpose as a DMatrix.
    #
    # The result is rebuilt through the regular constructor so that every
    # piece of internal state (not only the row storage) describes the
    # transposed shape.
    def transpose
      self.class.rows(super.to_a, false)
    end

    # Builds a DMatrix whose columns are the given vectors/arrays.
    def self.join_columns(columns)
      DMatrix[*columns.collect {|c| [*c] }].transpose
    end
  end
end
|
110
|
+
|
111
|
+
# Short, R-style names for some Matrix accessors.
# NOTE(review): +rows+ and +columns+ are defined on every Matrix in the
# process; confirm they do not clash with methods the matrix library itself
# relies on.
class Matrix
  alias_method :nrow, :row_size
  alias_method :ncol, :column_size
  alias_method :rows, :row_vectors
  alias_method :columns, :column_vectors
end
|
117
|
+
|
118
|
+
# Convenience additions to Vector.
class Vector
  alias :dot :inner_product

  # A vector is its own transpose here: rows and columns are not told apart.
  def transpose
    self
  end

  # Dimensions as a one-element array holding the vector's size.
  def dimensions
    [size]
  end

  # Divides every element by the scalar +x+ by multiplying with its
  # reciprocal.
  #
  # For an Integer +x+ the reciprocal is taken as a float; <tt>1/x</tt> would
  # be integer division and turn the result into a zero vector for any
  # x > 1. Dividing by the Integer 0 still raises ZeroDivisionError. Other
  # numeric types keep using <tt>1/x</tt> unchanged.
  def /(x)
    return self * (1 / x) unless x.is_a?(Integer)
    raise ZeroDivisionError, "divided by 0" if x.zero?

    self * (1.0 / x)
  end
end
|