kmeans 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +16 -0
- data/README.md +55 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/doc/AUTHORS +1 -0
- data/doc/COPYING +674 -0
- data/doc/COPYING.LESSER +165 -0
- data/doc/ChangeLog +5 -0
- data/doc/LICENSE +8 -0
- data/doc/README +19 -0
- data/kmeans.gemspec +66 -0
- data/lib/kmeans.rb +12 -0
- data/lib/kmeans/cluster.rb +99 -0
- data/lib/kmeans/pair.rb +25 -0
- data/lib/kmeans/pearson.rb +38 -0
- data/script/.gitkeep +0 -0
- data/spec/lib/kmeans/cluster_spec.rb +64 -0
- data/spec/lib/kmeans/pair_spec.rb +46 -0
- data/spec/lib/kmeans/pearson_spec.rb +257 -0
- data/spec/lib/kmeans_spec.rb +13 -0
- data/spec/spec_helper.rb +26 -0
- data/vendor/.gitkeep +0 -0
- metadata +116 -0
data/doc/COPYING.LESSER
ADDED
@@ -0,0 +1,165 @@
|
|
1
|
+
GNU LESSER GENERAL PUBLIC LICENSE
|
2
|
+
Version 3, 29 June 2007
|
3
|
+
|
4
|
+
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
|
5
|
+
Everyone is permitted to copy and distribute verbatim copies
|
6
|
+
of this license document, but changing it is not allowed.
|
7
|
+
|
8
|
+
|
9
|
+
This version of the GNU Lesser General Public License incorporates
|
10
|
+
the terms and conditions of version 3 of the GNU General Public
|
11
|
+
License, supplemented by the additional permissions listed below.
|
12
|
+
|
13
|
+
0. Additional Definitions.
|
14
|
+
|
15
|
+
As used herein, "this License" refers to version 3 of the GNU Lesser
|
16
|
+
General Public License, and the "GNU GPL" refers to version 3 of the GNU
|
17
|
+
General Public License.
|
18
|
+
|
19
|
+
"The Library" refers to a covered work governed by this License,
|
20
|
+
other than an Application or a Combined Work as defined below.
|
21
|
+
|
22
|
+
An "Application" is any work that makes use of an interface provided
|
23
|
+
by the Library, but which is not otherwise based on the Library.
|
24
|
+
Defining a subclass of a class defined by the Library is deemed a mode
|
25
|
+
of using an interface provided by the Library.
|
26
|
+
|
27
|
+
A "Combined Work" is a work produced by combining or linking an
|
28
|
+
Application with the Library. The particular version of the Library
|
29
|
+
with which the Combined Work was made is also called the "Linked
|
30
|
+
Version".
|
31
|
+
|
32
|
+
The "Minimal Corresponding Source" for a Combined Work means the
|
33
|
+
Corresponding Source for the Combined Work, excluding any source code
|
34
|
+
for portions of the Combined Work that, considered in isolation, are
|
35
|
+
based on the Application, and not on the Linked Version.
|
36
|
+
|
37
|
+
The "Corresponding Application Code" for a Combined Work means the
|
38
|
+
object code and/or source code for the Application, including any data
|
39
|
+
and utility programs needed for reproducing the Combined Work from the
|
40
|
+
Application, but excluding the System Libraries of the Combined Work.
|
41
|
+
|
42
|
+
1. Exception to Section 3 of the GNU GPL.
|
43
|
+
|
44
|
+
You may convey a covered work under sections 3 and 4 of this License
|
45
|
+
without being bound by section 3 of the GNU GPL.
|
46
|
+
|
47
|
+
2. Conveying Modified Versions.
|
48
|
+
|
49
|
+
If you modify a copy of the Library, and, in your modifications, a
|
50
|
+
facility refers to a function or data to be supplied by an Application
|
51
|
+
that uses the facility (other than as an argument passed when the
|
52
|
+
facility is invoked), then you may convey a copy of the modified
|
53
|
+
version:
|
54
|
+
|
55
|
+
a) under this License, provided that you make a good faith effort to
|
56
|
+
ensure that, in the event an Application does not supply the
|
57
|
+
function or data, the facility still operates, and performs
|
58
|
+
whatever part of its purpose remains meaningful, or
|
59
|
+
|
60
|
+
b) under the GNU GPL, with none of the additional permissions of
|
61
|
+
this License applicable to that copy.
|
62
|
+
|
63
|
+
3. Object Code Incorporating Material from Library Header Files.
|
64
|
+
|
65
|
+
The object code form of an Application may incorporate material from
|
66
|
+
a header file that is part of the Library. You may convey such object
|
67
|
+
code under terms of your choice, provided that, if the incorporated
|
68
|
+
material is not limited to numerical parameters, data structure
|
69
|
+
layouts and accessors, or small macros, inline functions and templates
|
70
|
+
(ten or fewer lines in length), you do both of the following:
|
71
|
+
|
72
|
+
a) Give prominent notice with each copy of the object code that the
|
73
|
+
Library is used in it and that the Library and its use are
|
74
|
+
covered by this License.
|
75
|
+
|
76
|
+
b) Accompany the object code with a copy of the GNU GPL and this license
|
77
|
+
document.
|
78
|
+
|
79
|
+
4. Combined Works.
|
80
|
+
|
81
|
+
You may convey a Combined Work under terms of your choice that,
|
82
|
+
taken together, effectively do not restrict modification of the
|
83
|
+
portions of the Library contained in the Combined Work and reverse
|
84
|
+
engineering for debugging such modifications, if you also do each of
|
85
|
+
the following:
|
86
|
+
|
87
|
+
a) Give prominent notice with each copy of the Combined Work that
|
88
|
+
the Library is used in it and that the Library and its use are
|
89
|
+
covered by this License.
|
90
|
+
|
91
|
+
b) Accompany the Combined Work with a copy of the GNU GPL and this license
|
92
|
+
document.
|
93
|
+
|
94
|
+
c) For a Combined Work that displays copyright notices during
|
95
|
+
execution, include the copyright notice for the Library among
|
96
|
+
these notices, as well as a reference directing the user to the
|
97
|
+
copies of the GNU GPL and this license document.
|
98
|
+
|
99
|
+
d) Do one of the following:
|
100
|
+
|
101
|
+
0) Convey the Minimal Corresponding Source under the terms of this
|
102
|
+
License, and the Corresponding Application Code in a form
|
103
|
+
suitable for, and under terms that permit, the user to
|
104
|
+
recombine or relink the Application with a modified version of
|
105
|
+
the Linked Version to produce a modified Combined Work, in the
|
106
|
+
manner specified by section 6 of the GNU GPL for conveying
|
107
|
+
Corresponding Source.
|
108
|
+
|
109
|
+
1) Use a suitable shared library mechanism for linking with the
|
110
|
+
Library. A suitable mechanism is one that (a) uses at run time
|
111
|
+
a copy of the Library already present on the user's computer
|
112
|
+
system, and (b) will operate properly with a modified version
|
113
|
+
of the Library that is interface-compatible with the Linked
|
114
|
+
Version.
|
115
|
+
|
116
|
+
e) Provide Installation Information, but only if you would otherwise
|
117
|
+
be required to provide such information under section 6 of the
|
118
|
+
GNU GPL, and only to the extent that such information is
|
119
|
+
necessary to install and execute a modified version of the
|
120
|
+
Combined Work produced by recombining or relinking the
|
121
|
+
Application with a modified version of the Linked Version. (If
|
122
|
+
you use option 4d0, the Installation Information must accompany
|
123
|
+
the Minimal Corresponding Source and Corresponding Application
|
124
|
+
Code. If you use option 4d1, you must provide the Installation
|
125
|
+
Information in the manner specified by section 6 of the GNU GPL
|
126
|
+
for conveying Corresponding Source.)
|
127
|
+
|
128
|
+
5. Combined Libraries.
|
129
|
+
|
130
|
+
You may place library facilities that are a work based on the
|
131
|
+
Library side by side in a single library together with other library
|
132
|
+
facilities that are not Applications and are not covered by this
|
133
|
+
License, and convey such a combined library under terms of your
|
134
|
+
choice, if you do both of the following:
|
135
|
+
|
136
|
+
a) Accompany the combined library with a copy of the same work based
|
137
|
+
on the Library, uncombined with any other library facilities,
|
138
|
+
conveyed under the terms of this License.
|
139
|
+
|
140
|
+
b) Give prominent notice with the combined library that part of it
|
141
|
+
is a work based on the Library, and explaining where to find the
|
142
|
+
accompanying uncombined form of the same work.
|
143
|
+
|
144
|
+
6. Revised Versions of the GNU Lesser General Public License.
|
145
|
+
|
146
|
+
The Free Software Foundation may publish revised and/or new versions
|
147
|
+
of the GNU Lesser General Public License from time to time. Such new
|
148
|
+
versions will be similar in spirit to the present version, but may
|
149
|
+
differ in detail to address new problems or concerns.
|
150
|
+
|
151
|
+
Each version is given a distinguishing version number. If the
|
152
|
+
Library as you received it specifies that a certain numbered version
|
153
|
+
of the GNU Lesser General Public License "or any later version"
|
154
|
+
applies to it, you have the option of following the terms and
|
155
|
+
conditions either of that published version or of any later version
|
156
|
+
published by the Free Software Foundation. If the Library as you
|
157
|
+
received it does not specify a version number of the GNU Lesser
|
158
|
+
General Public License, you may choose any version of the GNU Lesser
|
159
|
+
General Public License ever published by the Free Software Foundation.
|
160
|
+
|
161
|
+
If the Library as you received it specifies that a proxy can decide
|
162
|
+
whether future versions of the GNU Lesser General Public License shall
|
163
|
+
apply, that proxy's public statement of acceptance of any version is
|
164
|
+
permanent authorization for you to choose that version for the
|
165
|
+
Library.
|
data/doc/ChangeLog
ADDED
data/doc/LICENSE
ADDED
data/doc/README
ADDED
data/kmeans.gemspec
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
  # Identity and release metadata.
  s.name = "kmeans"
  s.version = "0.0.1"

  if s.respond_to?(:required_rubygems_version=)
    s.required_rubygems_version = Gem::Requirement.new(">= 0")
  end
  s.authors = ["id774"]
  s.date = "2012-10-07"
  s.description = "K-means clustering"
  s.email = "idnanashi@gmail.com"
  s.extra_rdoc_files = ["README.md"]

  # Every file shipped in the packaged gem.
  s.files = %w[
    Gemfile
    README.md
    Rakefile
    VERSION
    doc/AUTHORS
    doc/COPYING
    doc/COPYING.LESSER
    doc/ChangeLog
    doc/LICENSE
    doc/README
    kmeans.gemspec
    lib/kmeans.rb
    lib/kmeans/cluster.rb
    lib/kmeans/pair.rb
    lib/kmeans/pearson.rb
    script/.gitkeep
    spec/lib/kmeans/cluster_spec.rb
    spec/lib/kmeans/pair_spec.rb
    spec/lib/kmeans/pearson_spec.rb
    spec/lib/kmeans_spec.rb
    spec/spec_helper.rb
    vendor/.gitkeep
  ]
  s.homepage = "http://github.com/id774/kmeans"
  s.licenses = ["GPL"]
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.24"
  s.summary = "kmeans"

  # Tooling used to build and test the gem. These are registered as
  # development dependencies when the running RubyGems supports them
  # (specification_version available and RubyGems >= 1.2.0); otherwise they
  # are registered through add_dependency.
  tooling = %w[cucumber bundler jeweler]
  any_version = [">= 0"]

  if s.respond_to?(:specification_version)
    s.specification_version = 3

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
      tooling.each { |gem_name| s.add_development_dependency(gem_name, any_version) }
    else
      tooling.each { |gem_name| s.add_dependency(gem_name, any_version) }
    end
  else
    tooling.each { |gem_name| s.add_dependency(gem_name, any_version) }
  end
end
|
66
|
+
|
data/lib/kmeans.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
# Entry point of the gem: defines the namespace constants, puts the gem's
# directories on the load path and loads the clustering implementation.
module Kmeans
  VERSION = "0.0.1"

  # Directory one level above the directory containing this file.
  ROOT_DIR = File.expand_path("..", File.dirname(__FILE__))

  # Prepend the root, lib/ and lib/kmeans/ (in that order, so lib/kmeans ends
  # up first) to the load path; 'cluster' is then resolved by bare name.
  [ROOT_DIR, ROOT_DIR + '/lib', ROOT_DIR + '/lib/kmeans'].each do |directory|
    $LOAD_PATH.unshift(directory)
  end

  require 'cluster'
end
|
data/lib/kmeans/cluster.rb
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
require 'pair'
|
4
|
+
require 'pearson'
|
5
|
+
|
6
|
+
module Kmeans
  # K-means style clustering of documents described by word counts.
  #
  # +word_counts+ is iterated as { id => { word => count } }; the code calls
  # the ids "urls". Similarity between a document and a centroid is measured
  # with Pearson.calc, and the distance used is 1 - correlation.
  class Cluster
    # word_counts  - Hash of id => Hash of word => numeric count.
    # user_options - Hash overriding the defaults:
    #                :centroids - number of clusters to build (default 4)
    #                :loop_max  - maximum number of refinement passes
    #                             (default 100)
    def initialize(word_counts, user_options = {})
      @word_counts = word_counts
      @min_and_max = {}
      @centroids = {}
      @cluster = Hash.new { |hash, key| hash[key] = [] }
      @options = { :centroids => 4, :loop_max => 100 }.merge(user_options)
    end

    # Hash of centroid index => Array of ids attached to that centroid.
    # Populated by #make_cluster.
    attr_reader :cluster

    # Runs the clustering: picks random starting centroids, then repeatedly
    # attaches every id to its nearest centroid and moves each non-empty
    # centroid to the average of its members. Stops when a pass leaves the
    # centroids unchanged or after :loop_max passes. Results are read from
    # #cluster.
    def make_cluster
      @min_and_max = min_and_max_in_word_counts
      @centroids = random_centroids

      passes = 0
      previous_centroids = nil
      # ">=" so that at most :loop_max passes run (the former "<" comparison
      # allowed one extra pass).
      until @centroids == previous_centroids || passes >= @options[:loop_max]
        passes += 1
        attach_urls_to_nearest_centroid
        # A shallow copy is enough: centroid hashes are replaced below, never
        # modified in place.
        previous_centroids = @centroids.dup

        @centroids.keys.each do |centroid|
          @centroids[centroid] = average_attached(centroid) if @cluster[centroid].any?
        end
      end
    end

    private

    # Returns a Hash of word => Pair[min, max] holding the smallest and
    # largest count seen for each word across all documents.
    def min_and_max_in_word_counts
      counts_by_word = Hash.new { |hash, key| hash[key] = [] }

      @word_counts.each do |_url, counts|
        counts.each { |word, count| counts_by_word[word] << count }
      end

      bounds = {}
      counts_by_word.each do |word, counts|
        bounds[word] = Pair.new [counts.min, counts.max]
      end
      bounds
    end

    # Builds :centroids starting centroids, keyed 0, 1, 2, ... Each centroid
    # is a Hash of word => random value within that word's observed
    # [min, max] range.
    def random_centroids
      centroids = {}

      @options[:centroids].times do |centroid|
        random_counts = {}
        @min_and_max.each do |word, bounds|
          # An inclusive Range keeps the value inside the observed bounds even
          # when min == max; rand(0) would return a Float in [0, 1) instead.
          random_counts[word] = rand(bounds.min..bounds.max)
        end
        centroids[centroid] = random_counts
      end
      centroids
    end

    # Rebuilds @cluster from scratch by attaching every id to the centroid
    # chosen by #nearest_centroid.
    def attach_urls_to_nearest_centroid
      @cluster.clear

      @word_counts.each_key do |url|
        @cluster[nearest_centroid(url)] << url
      end
    end

    # Returns the index of the centroid with the smallest distance
    # (1 - Pearson correlation) to +url+, comparing only the words present in
    # that document. On ties the last such centroid wins. The position in
    # @centroids doubles as the centroid key because keys are 0, 1, 2, ...
    # in insertion order.
    def nearest_centroid(url)
      distances = @centroids.map do |_centroid, centroid_word_count|
        web_counts = []
        centroid_counts = []

        @word_counts[url].each do |word, count|
          web_counts << count
          centroid_counts << centroid_word_count[word]
        end
        1 - Pearson.calc(web_counts, centroid_counts)
      end
      distances.rindex(distances.min { |x, y| x.abs <=> y.abs })
    end

    # Returns a new centroid Hash (word => average count) computed over the
    # ids currently attached to +centroid+. A word missing from a document
    # counts as 0. With no attached ids the current centroid is returned
    # unchanged.
    def average_attached(centroid)
      words = @centroids[centroid].keys
      members = @cluster[centroid]
      return @centroids[centroid] if members.empty?

      averages = words.map do |word|
        total = members.inject(0) { |sum, url| sum + (@word_counts[url][word] || 0) }
        total.quo(members.size)
      end
      Hash[words.zip(averages)]
    end
  end
end
|
data/lib/kmeans/pair.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
module Kmeans
  # An Array holding at most two elements whose equality and hash ignore
  # element order, so Pair[1, 2] and Pair[2, 1] are the same Hash key.
  class Pair < Array
    # Accepts the same arguments as Array.new, then drops everything after
    # the second element.
    def initialize(*args)
      super
      slice!(2..-1)
    end

    # Array's own implementations, kept under their historical names.
    alias_method :original_eql?, :eql?
    alias_method :original_hash, :hash

    # True when +other+ is an Array (or Pair) holding the same elements in
    # any order; false for anything that is not an Array. Both sides are
    # converted to plain Arrays before sorting so the comparison uses
    # Array#eql? and never re-enters this method.
    def eql?(other)
      return false unless other.is_a?(Array)

      to_a.sort.eql?(other.to_a.sort)
    end

    # Order-insensitive hash, consistent with #eql?.
    def hash
      to_a.sort.hash
    end

    alias == eql?
    alias left first
    alias right last
  end
end
|
data/lib/kmeans/pearson.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
module Kmeans
  # Pearson product-moment correlation between two numeric vectors.
  class Pearson
    # Returns the Pearson correlation coefficient of +v1+ and +v2+.
    #
    # v1, v2 - Arrays of numbers paired by index; a non-Array argument is
    #          treated as a one-element vector. The length of v1 is used as
    #          the sample size.
    #
    # Returns a Float, or the Integer 0 when the coefficient is undefined
    # (empty input, or either vector has no variance).
    def self.calc(v1, v2)
      v1 = [v1] unless v1.is_a?(Array)
      v2 = [v2] unless v2.is_a?(Array)

      # Float sample size: with Integer inputs, Integer division would
      # silently truncate the means and corrupt the result.
      n = v1.length.to_f
      return 0 if n.zero?

      sum1 = v1.inject(0) { |acc, x| acc + x }
      sum2 = v2.inject(0) { |acc, x| acc + x }
      sum1_sq = v1.inject(0) { |acc, x| acc + x * x }
      sum2_sq = v2.inject(0) { |acc, x| acc + x * x }
      p_sum = v1.zip(v2).inject(0) { |acc, (a, b)| acc + a * b }

      num = p_sum - sum1 * sum2 / n
      spread = (sum1_sq - sum1 * sum1 / n) * (sum2_sq - sum2 * sum2 / n)
      # Zero variance makes the coefficient undefined; a non-positive value
      # can also come from rounding and would make Math.sqrt raise.
      return 0 unless spread > 0

      num / Math.sqrt(spread)
    end
  end
end
|