clusterer 0.1.0 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +29 -7
- data/examples/google_search_cluster.rb +13 -7
- data/examples/yahoo_search_cluster.rb +18 -31
- data/lib/clusterer.rb +36 -95
- data/lib/clusterer/algorithms.rb +95 -0
- data/lib/clusterer/bayes.rb +255 -0
- data/lib/clusterer/cluster.rb +56 -0
- data/lib/clusterer/clustering.rb +35 -0
- data/lib/clusterer/document.rb +71 -0
- data/lib/clusterer/document_array.rb +79 -0
- data/lib/clusterer/document_base.rb +32 -0
- data/lib/clusterer/documents_centroid.rb +44 -0
- data/lib/clusterer/inverse_document_frequency.rb +83 -0
- data/lib/clusterer/lsi/dmatrix.rb +132 -0
- data/lib/clusterer/lsi/document_vector.rb +54 -0
- data/lib/clusterer/lsi/documents_centroid_vector.rb +51 -0
- data/lib/clusterer/lsi/lsi.rb +95 -0
- data/lib/clusterer/similarity.rb +34 -0
- data/lib/{word_hash.rb → clusterer/stop_words.rb} +21 -23
- data/lib/clusterer/tokenizer.rb +70 -0
- data/tests/algorithms_test.rb +48 -0
- data/tests/bayes_test.rb +68 -0
- data/tests/cluster_test.rb +54 -0
- data/tests/document_array_test.rb +64 -0
- data/tests/document_centroid_test.rb +64 -0
- data/tests/document_test.rb +71 -0
- data/tests/inverse_document_frequency_test.rb +76 -0
- data/tests/lsi_test.rb +77 -0
- data/tests/similarity_test.rb +62 -0
- data/tests/tokenizer_test.rb +72 -0
- metadata +35 -9
- data/lib/similarity.rb +0 -27
- data/tests/clusterer_test.rb +0 -20
@@ -0,0 +1,56 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # A group of documents together with a lazily computed centroid.
  class Cluster
    # The array of documents currently held by this cluster.
    attr_reader :documents
    include ClusterSimilarity

    # docs:: the initial documents of the cluster (the array is used as is,
    #        not copied).
    def initialize(docs = [])
      @documents = docs
    end

    # Returns the centroid of the cluster, or nil when the cluster has no
    # documents. The centroid is built on first access with the centroid
    # class advertised by the class of the first document, then cached.
    def centroid
      @centroid ||= (@documents.empty? ? nil : @documents[0].class.centroid_class.new(documents))
    end

    # Appends the documents of +cluster+ to this cluster and brings the
    # cached state up to date.
    #
    # When this cluster has not computed its centroid yet, the centroid is
    # left unset so that the next call to #centroid builds it from the
    # complete document list. (Adopting the other cluster's centroid here
    # would describe only the other cluster's documents and would share one
    # centroid object between both clusters.)
    def merge!(cluster)
      documents.concat(cluster.documents)
      if @centroid
        other_centroid = cluster.centroid
        # an empty cluster has no centroid and contributes nothing
        @centroid.merge!(other_centroid) if other_centroid
      end
      # the cached similarity no longer matches the document list
      @intra_cluster_similarity = nil
    end

    # Returns a new cluster holding the documents of both clusters; neither
    # operand's document array is modified.
    def +(cluster)
      merged = Cluster.new(documents.clone)
      merged.merge!(cluster)
      merged
    end

    # Two clusters are equal when their document arrays are equal.
    def ==(cluster)
      cluster && documents == cluster.documents
    end

    # Sum of the cosine similarity between every document and the centroid.
    # The value is cached until the next merge!.
    def intra_cluster_cosine_similarity
      @intra_cluster_similarity ||= documents.inject(0) { |sum, doc| sum + doc.cosine_similarity(centroid) }
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # Entry point that turns arbitrary objects into documents and hands them
  # to one of the clustering algorithms.
  class Clustering
    class << self
      # Clusters +objects+ with the given algorithm.
      #
      # algorithm:: name of the method to invoke on Algorithms.
      # objects::   the collection to cluster; each element becomes a Document.
      # options::   passed on to every Document (together with a shared
      #             InverseDocumentFrequency under :idf). :no_of_clusters
      #             defaults to the integer square root of objects.size.
      #
      # An optional block is forwarded to Document.new for every object.
      # The caller's +options+ hash is never modified.
      #
      # Returns whatever the selected algorithm returns.
      def cluster(algorithm, objects, options = {})
        no_of_clusters = options[:no_of_clusters] || Math.sqrt(objects.size).to_i
        idf = InverseDocumentFrequency.new
        # work on a copy so the default is not written into the caller's hash
        document_options = options.merge(:no_of_clusters => no_of_clusters, :idf => idf)

        docs = objects.collect do |object|
          if block_given?
            Document.new(object, document_options) { |item| yield(item) }
          else
            Document.new(object, document_options)
          end
        end

        # documents are normalized only after all of them have been counted
        # by the shared idf
        Algorithms.send(algorithm, docs.collect { |doc| doc.normalize!(idf) }, no_of_clusters)
      end
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # A Document tokenizes a text and keeps, per token, the number of times
  # the token occurred. The counts live in the hash the class inherits.
  class Document < DocumentBase
    # The original text, or the object the text was derived from.
    attr_reader :object
    include(Tokenizer)

    # The centroid class to be used for collections of documents of this kind.
    def self.centroid_class
      DocumentsCentroid
    end

    # object::  the thing to index. Its text is the value of the block when
    #           one is given, otherwise object.to_s.
    # options:: :tokenizer (method name, defaults to :simple_tokenizer),
    #           :tokenizer_options (hash handed to the tokenizer) and :idf
    #           (when present it is told about this document and its terms).
    def initialize(object, options = {})
      @object = object

      tokenizer = options[:tokenizer] || :simple_tokenizer
      text = block_given? ? yield(object) : object.to_s
      tokenizer_options = options[:tokenizer_options] || {}
      send(tokenizer, text, tokenizer_options) { |term| self << term }

      idf = options[:idf]
      return unless idf

      idf.increment_documents_count
      each_key { |term| idf << term }
    end

    # Counts one more occurrence of +term+ and returns the new count.
    def <<(term)
      occurrences = self[term] || 0
      self[term] = occurrences + 1
    end

    # Replaces every count by log(1 + count) multiplied by the idf value of
    # the term (1.0 without an idf), scales the document to unit length and
    # freezes it.
    #
    # idf::      optional object answering [] (and << /
    #            increment_documents_count when add_term is true).
    # add_term:: when true the document and its terms are first registered
    #            with the idf.
    #
    # Returns the frozen document.
    def normalize!(idf = nil, add_term = false)
      sum_of_squares = 0.0
      idf.increment_documents_count if add_term

      each do |term, count|
        idf << term if add_term
        factor = idf ? idf[term] : 1.0
        weight = Math.log(1 + count) * factor
        self[term] = weight
        sum_of_squares += weight ** 2
      end

      length = Math.sqrt(sum_of_squares)
      # an all-zero document is left unscaled
      length = 1 if length.zero?
      each { |term, weight| self[term] = weight / length }

      @vector_length = 1
      freeze
    end
  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # Stores the term counts of a text in array form (used with LSI / SVD).
  # The element at position i is the count of the term that the class-wide
  # mapper assigned to position i.
  class DocumentArray < Array
    # The original text, or the object the text was derived from.
    attr_reader :object

    # term => array position, shared by every DocumentArray
    @@term_array_position_mapper = {}
    include(Tokenizer)

    # object::  the thing to index. Its text is the value of the block when
    #           one is given, otherwise object.to_s.
    # options:: :tokenizer (method name, defaults to :simple_tokenizer),
    #           :tokenizer_options (hash handed to the tokenizer) and :idf
    #           (when present it is told about this document and its terms).
    def initialize(object = "", options = {})
      @object = object
      # &nil keeps the caller's block away from Array#initialize, which
      # would otherwise use it to fill the array instead of 0.0
      super(@@term_array_position_mapper.size, 0.0, &nil)
      send(options[:tokenizer] || :simple_tokenizer,
           (block_given? ? yield(object) : object.to_s),
           options[:tokenizer_options] || {}) { |term| self << term }

      if (idf = options[:idf])
        idf.increment_documents_count
        # each_with_index yields the element first and the position second
        each_with_index do |frequency, position|
          idf << term_at(position) if frequency && frequency > 0.0
        end
      end
    end

    # Counts one more occurrence of +term+ and returns the new count.
    def <<(term)
      position = term_array_position_mapper(term)
      self[position] = (self[position] || 0) + 1
    end

    # Multiplies every count by the idf value of its term (1.0 without an
    # idf), scales the array to unit length and freezes it.
    #
    # idf::      optional object answering [] (and << /
    #            increment_documents_count when add_term is true).
    # add_term:: when true the document and the terms it contains are first
    #            registered with the idf.
    #
    # Returns the frozen array.
    def normalize!(idf = nil, add_term = false)
      normalizing_factor = 0.0
      idf.increment_documents_count if add_term

      # grow the array to the current vocabulary size; nothing to pad when
      # no term has been seen yet
      vocabulary_size = @@term_array_position_mapper.size
      self[vocabulary_size - 1] ||= 0.0 if vocabulary_size > 0

      each_with_index do |frequency, position|
        term = term_at(position)
        idf << term if add_term && frequency && frequency > 0.0
        factor = idf ? idf[term] : 1.0
        # positions created by the padding above hold nil
        self[position] = (frequency || 0) * factor
        normalizing_factor += self[position] ** 2
      end

      normalizing_factor = Math.sqrt(normalizing_factor)
      normalizing_factor = 1 if normalizing_factor.zero?
      each_with_index { |weight, position| self[position] = weight / normalizing_factor }
      @vector_length = 1.0
      freeze
    end

    # Euclidean length of the array (cached); nil gaps count as zero.
    def vector_length
      @vector_length ||= Math.sqrt(inject(0) { |sum, value| sum + (value || 0) * (value || 0) })
    end

    # Returns the array position of +term+, assigning the next free
    # position when the term has not been seen before.
    def term_array_position_mapper(term)
      @@term_array_position_mapper[term] ||= @@term_array_position_mapper.size
    end

    private

    # Reverse lookup: the term stored at +position+, or nil.
    def term_at(position)
      @@term_array_position_mapper.key(position)
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # Common hash-backed base of the document classes; not meant to be
  # instantiated directly.
  class DocumentBase < Hash
    include DocumentSimilarity

    # Square root of the sum of the squared hash values. The result is
    # cached in @vector_length.
    def vector_length
      @vector_length ||= begin
        sum_of_squares = 0
        each_value { |weight| sum_of_squares = sum_of_squares + weight * weight }
        Math.sqrt(sum_of_squares)
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # The term-wise mean of a set of documents.
  class DocumentsCentroid < DocumentBase
    # Number of documents the centroid currently represents.
    attr_reader :no_of_documents

    # docs:: documents (anything yielding term/weight pairs from #each)
    #        whose weights are averaged into this centroid.
    def initialize(docs = [])
      @no_of_documents = docs.size
      # float divisor: Integer weights must not be truncated
      divisor = @no_of_documents.to_f
      docs.each do |doc|
        doc.each { |term, weight| self[term] = (self[term] || 0.0) + weight / divisor }
      end
    end

    # Folds +centroid+ into this one as a weighted average, each side
    # weighted by the number of documents it represents, and adds the
    # other side's document count to no_of_documents.
    #
    # The weights are computed in floating point; with Integer division
    # both of them would round down to zero and wipe out the centroid.
    def merge!(centroid)
      # the cached length is stale once the weights change
      @vector_length = nil

      incoming = centroid.no_of_documents
      total = (@no_of_documents + incoming).to_f

      own_share = @no_of_documents / total
      each { |term, weight| self[term] = weight * own_share }
      @no_of_documents += incoming

      incoming_share = incoming / total
      centroid.each { |term, weight| self[term] = (self[term] || 0) + weight * incoming_share }
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
|
24
|
+
# DocumentsCount keeps the number of documents seen so far.
# A plain variable inside InverseDocumentFrequency would do the same job;
# the separate class exists so that the count can be replaced by an object
# backed by a DB/file store.
class DocumentsCount
  # The current count.
  attr_reader :value

  # Starts counting at zero.
  def initialize
    @value = 0
  end

  # Adds one to the count and returns the new value.
  def increment
    @value = @value + 1
  end
end
|
39
|
+
|
40
|
+
# TermsCount keeps, per term, the number of documents in which the term
# has been seen. A plain hash inside InverseDocumentFrequency would do the
# same job; the separate class exists so that the counts can be replaced
# by an object backed by a DB/file store.
class TermsCount < Hash
  # Adds one to the count of +term+ (starting from zero for an unknown
  # term) and returns the new count.
  def increment_count(term)
    seen = self[term] || 0
    self[term] = seen + 1
  end
end
|
50
|
+
|
51
|
+
# InverseDocumentFrequency tracks, through two helper objects, the total
# number of documents and the number of documents in which each term has
# been seen. From these it derives the normalizing factor of a term, the
# formula for which is
# Math.log(total number of documents / number of documents containing the term)
class InverseDocumentFrequency < Hash
  # options:: :terms_count and :documents_count may supply the helper
  #           objects; a fresh TermsCount / DocumentsCount is used otherwise.
  def initialize(options = {})
    @terms_count = options[:terms_count] || TermsCount.new
    @nf = {}
    @documents_count = options[:documents_count] || DocumentsCount.new
  end

  # Total number of documents registered so far.
  def documents_count
    @documents_count.value
  end

  # Registers one more document.
  def increment_documents_count
    @documents_count.increment
  end

  # Forgets every cached normalizing factor. Factors are cached on first
  # lookup and are not refreshed when the counts change afterwards.
  def clean_cached_normalizing_factor
    @nf.clear
  end

  # Records that +term+ was seen in one more document; nil and empty
  # terms are ignored.
  def <<(term)
    return if term.nil? || term.empty?

    @terms_count.increment_count(term)
  end

  # Returns the (cached) normalizing factor of +term+: the logarithm of
  # documents / documents-with-term, or 1.0 when the term is unknown or at
  # most one document has been registered.
  def [](term)
    cached = @nf[term]
    return cached if cached

    documents_with_term = @terms_count[term]
    @nf[term] =
      if documents_with_term && @documents_count.value > 1
        Math.log(@documents_count.value / documents_with_term.to_f)
      else
        1.0
      end
  end
end
|
82
|
+
end
|
83
|
+
|
@@ -0,0 +1,132 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # A Matrix with element assignment and a (partial) singular value
  # decomposition.
  class DMatrix < Matrix
    # Singular value decomposition by the plane rotation method.
    #
    # Algorithm description from "Simple Algorithms for the partial singular
    # value decomposition" by J. C. Nash and S. Shlien. There were some
    # typos in the original algorithm in the paper; also see the Pascal code
    # in NashSVD, file alg01.pas, for an idea. The partial algorithm is an
    # adaptation of that algorithm.
    #
    # NOTE: the receiver is overwritten in place while its columns are
    # rotated.
    #
    # Returns the three values u, z, vt where z is a diagonal matrix built
    # from the per-column values and vt is the transpose of the accumulated
    # rotations.
    def svd
      m, n = self.row_size, self.column_size
      tol = 0.001
      slimit = [n / 4, 6].max # maximum number of sweeps
      u = DMatrix[*Array.new(m) { Array.new(n, 0) }]
      z = Array.new(n)
      v = DMatrix.diagonal(*Array.new(n, 1))

      nt = n # number of columns still being rotated
      slimit.times do
        rcount = nt * (nt - 1) / 2 # rotations still pending in this sweep
        (nt - 1).times do |j|
          (j + 1).upto(nt - 1) do |k|
            # float accumulators so that Integer matrices are not subject
            # to truncating division below
            p = q = r = 0.0
            m.times do |i|
              p += self[i, j] * self[i, k]
              q += self[i, j] * self[i, j]
              r += self[i, k] * self[i, k]
            end
            z[j], z[k] = q, r
            if q < r
              p, q = p / r, q / r - 1
              vt = Math.sqrt(4 * p * p + q * q)
              s = Math.sqrt(0.5 * (1 - q / vt))
              s = -s if p < 0
              c = p / (vt * s)
            elsif (q * r <= tol * tol) || (p / q) * (p / r) <= tol
              # columns are already small or orthogonal: no rotation
              rcount -= 1
              next
            else
              p, r = p / q, 1 - r / q
              vt = Math.sqrt(4 * p * p + r * r)
              c = Math.sqrt(0.5 * (1 + r / vt))
              s = p / (vt * c)
            end
            m.times do |i|
              r = self[i, j]
              self[i, j] = c * r + s * self[i, k]
              self[i, k] = -s * r + c * self[i, k]
            end
            n.times do |i|
              r = v[i, j]
              v[i, j] = c * r + s * v[i, k] # typo in paper replace r by s
              v[i, k] = -s * r + c * v[i, k] # typo in paper replace A(i,k) by v(i,k)
            end
          end
        end
        # drop trailing columns whose value is negligible next to the first
        until nt < 3 || z[nt - 1] / (z[0] + tol) > tol
          nt -= 1
        end
        break unless rcount > 0
      end

      nt.times do |j|
        # a single-column matrix never enters the rotation loop, so its
        # squared column norm has not been recorded yet
        z[j] ||= (0...m).inject(0.0) { |sum, i| sum + self[i, j] * self[i, j] }
        z[j] = Math.sqrt(z[j])
        m.times { |i| u[i, j] = self[i, j] / z[j] }
      end
      z = DMatrix.diagonal(*z)
      return u, z, v.transpose
    end

    # Stores +val+ at row +i+, column +j+.
    def []=(i, j, val)
      @rows[i][j] = val
    end

    # Builds a DMatrix whose rows are the given row collections.
    def self.join_rows(rows)
      DMatrix[*rows.collect { |r| [*r] }]
    end

    # Returns the transpose as a DMatrix.
    #
    # When the inherited transpose already answers with a DMatrix it is
    # returned unchanged; otherwise a DMatrix is rebuilt from its rows
    # through the public constructor, so that the row and column counts
    # are consistent.
    def transpose
      transposed = super
      transposed.is_a?(DMatrix) ? transposed : DMatrix.rows(transposed.to_a, false)
    end

    # Builds a DMatrix whose columns are the given column collections.
    def self.join_columns(columns)
      DMatrix[*columns.collect { |c| [*c] }].transpose
    end
  end
end
|
110
|
+
|
111
|
+
# Short names for a few Matrix readers.
class Matrix
  # number of rows / number of columns
  alias_method :nrow, :row_size
  alias_method :ncol, :column_size
  # the rows / the columns as arrays of vectors
  alias_method :rows, :row_vectors
  alias_method :columns, :column_vectors
end
|
117
|
+
|
118
|
+
# Convenience additions to Vector.
class Vector
  alias :dot :inner_product

  # A vector is returned unchanged when transposed.
  def transpose
    self
  end

  # The dimensions of the vector as a one-element array holding its size.
  def dimensions
    [size]
  end

  # Divides the vector by the scalar +x+ by multiplying with its
  # reciprocal. The reciprocal is taken in floating point: with an Integer
  # divisor 1/x would be truncated to 0 and the result would be a zero
  # vector.
  def /(x)
    self * (1.0 / x)
  end
end
|