k_means 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +23 -9
- data/VERSION +1 -1
- data/benchmark/benchmark_ai4r.rb +2 -2
- data/k_means.gemspec +3 -3
- data/lib/k_means/centroid.rb +2 -2
- data/lib/k_means/k_means.rb +21 -21
- data/lib/k_means/node.rb +31 -13
- data/test/k_means/test_centroid.rb +1 -1
- data/test/k_means/test_k_means.rb +33 -7
- data/test/k_means/test_node.rb +24 -10
- metadata +20 -9
data/README.rdoc
CHANGED
|
@@ -4,18 +4,32 @@ Attempting to build a fast, memory efficient K-Means program.
|
|
|
4
4
|
|
|
5
5
|
== Install
|
|
6
6
|
|
|
7
|
-
gem sources -a http://
|
|
8
|
-
sudo gem install
|
|
7
|
+
gem sources -a http://rubygems.org
|
|
8
|
+
sudo gem install k_means
|
|
9
9
|
|
|
10
10
|
== How To Use
|
|
11
11
|
require 'rubygems'
|
|
12
12
|
require 'k_means'
|
|
13
|
-
|
|
13
|
+
|
|
14
14
|
data = [[1,1], [1,2], [1,1], [1000, 1000], [500, 500]]
|
|
15
|
-
kmeans = KMeans.new(
|
|
15
|
+
kmeans = KMeans.new(data, :centroids => 2)
|
|
16
16
|
kmeans.inspect # Use kmeans.view to get hold of the un-inspected array
|
|
17
17
|
=> [[3, 4], [0, 1, 2]]
|
|
18
|
-
|
|
18
|
+
|
|
19
|
+
== Custom Centroids
|
|
20
|
+
require 'rubygems'
|
|
21
|
+
require 'k_means'
|
|
22
|
+
|
|
23
|
+
# Your custom centroid needs to have #position and #reposition methods
|
|
24
|
+
class CustomCentroid
|
|
25
|
+
attr_accessor :position
|
|
26
|
+
def initialize(position); @position = position; end
|
|
27
|
+
def reposition(nodes, centroid_positions); end
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
data = [[1,1], [1,2], [1,1], [1000, 1000], [500, 500]]
|
|
31
|
+
kmeans = KMeans.new(data, :custom_centroids => @specified_centroids)
|
|
32
|
+
|
|
19
33
|
== Distance Measurements
|
|
20
34
|
|
|
21
35
|
KMeans uses the Distance Measures Gem (http://github.com/reddavis/Distance-Measures) so we get quite a range of distance measurements.
|
|
@@ -35,19 +49,19 @@ The measurements currently available are:
|
|
|
35
49
|
binary_jaccard_distance
|
|
36
50
|
|
|
37
51
|
tanimoto_coefficient
|
|
38
|
-
|
|
52
|
+
|
|
39
53
|
To specify a particular one to use in the KMeans algorithm, just provide it as an option:
|
|
40
54
|
|
|
41
55
|
KMeans.new(@data, :distance_measure => :jaccard_index)
|
|
42
56
|
KMeans.new(@data, :distance_measure => :cosine_similarity)
|
|
43
57
|
KMeans.new(@data, :distance_measure => :tanimoto_coefficient)
|
|
44
|
-
|
|
58
|
+
|
|
45
59
|
You get the idea...
|
|
46
|
-
|
|
60
|
+
|
|
47
61
|
== Benchmarks
|
|
48
62
|
|
|
49
63
|
# 1000 records with 50 dimensions
|
|
50
|
-
data = Array.new(1000) {Array.new(50) {rand(10)}}
|
|
64
|
+
data = Array.new(1000) {Array.new(50) {rand(10)}}
|
|
51
65
|
ai4r_data = Ai4r::Data::DataSet.new(:data_items=> data)
|
|
52
66
|
|
|
53
67
|
# Clustering can happen in magical ways
|
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.0.
|
|
1
|
+
0.0.7
|
data/benchmark/benchmark_ai4r.rb
CHANGED
|
@@ -3,7 +3,7 @@ require 'rubygems'
|
|
|
3
3
|
require 'ai4r'
|
|
4
4
|
require File.dirname(__FILE__) + '/../lib/k_means'
|
|
5
5
|
|
|
6
|
-
data = Array.new(
|
|
6
|
+
data = Array.new(100) {Array.new(50) {rand(10)}}
|
|
7
7
|
|
|
8
8
|
ai4r_data = Ai4r::Data::DataSet.new(:data_items=> data)
|
|
9
9
|
|
|
@@ -16,7 +16,7 @@ Benchmark.bm do |x|
|
|
|
16
16
|
x.report('Mine') do
|
|
17
17
|
n.times { KMeans.new(data) }
|
|
18
18
|
end
|
|
19
|
-
|
|
19
|
+
|
|
20
20
|
x.report("Ai4R") do
|
|
21
21
|
n.times do
|
|
22
22
|
b = Ai4r::Clusterers::KMeans.new
|
data/k_means.gemspec
CHANGED
|
@@ -5,11 +5,11 @@
|
|
|
5
5
|
|
|
6
6
|
Gem::Specification.new do |s|
|
|
7
7
|
s.name = %q{k_means}
|
|
8
|
-
s.version = "0.0.
|
|
8
|
+
s.version = "0.0.7"
|
|
9
9
|
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
|
11
11
|
s.authors = ["reddavis"]
|
|
12
|
-
s.date = %q{2010-
|
|
12
|
+
s.date = %q{2010-07-11}
|
|
13
13
|
s.description = %q{Attempting to create a fast, memory efficient KMeans}
|
|
14
14
|
s.email = %q{reddavis@gmail.com}
|
|
15
15
|
s.extra_rdoc_files = [
|
|
@@ -40,7 +40,7 @@ Gem::Specification.new do |s|
|
|
|
40
40
|
s.homepage = %q{http://github.com/reddavis/k_means}
|
|
41
41
|
s.rdoc_options = ["--charset=UTF-8"]
|
|
42
42
|
s.require_paths = ["lib"]
|
|
43
|
-
s.rubygems_version = %q{1.3.
|
|
43
|
+
s.rubygems_version = %q{1.3.6}
|
|
44
44
|
s.summary = %q{K Means algorithm}
|
|
45
45
|
s.test_files = [
|
|
46
46
|
"test/ext/test_object.rb",
|
data/lib/k_means/centroid.rb
CHANGED
|
@@ -35,7 +35,7 @@ class Centroid
|
|
|
35
35
|
|
|
36
36
|
# Finds the average position of all the nodes assigned to
|
|
37
37
|
# the centroid and then moves the centroid to that position
|
|
38
|
-
def reposition(nodes)
|
|
38
|
+
def reposition(nodes, centroids)
|
|
39
39
|
return if nodes.empty?
|
|
40
40
|
averages = [0.0] * nodes[0].position.size
|
|
41
41
|
nodes.each do |node|
|
|
@@ -46,4 +46,4 @@ class Centroid
|
|
|
46
46
|
@position = averages.map {|x| x / nodes.size}
|
|
47
47
|
end
|
|
48
48
|
|
|
49
|
-
end
|
|
49
|
+
end
|
data/lib/k_means/k_means.rb
CHANGED
|
@@ -1,42 +1,41 @@
|
|
|
1
1
|
require 'ext/object'
|
|
2
2
|
|
|
3
3
|
class KMeans
|
|
4
|
-
|
|
4
|
+
|
|
5
5
|
attr_reader :centroids, :nodes
|
|
6
|
-
|
|
6
|
+
|
|
7
7
|
def initialize(data, options={})
|
|
8
|
-
k = options[:centroids] || 4
|
|
9
|
-
@verbose = options[:verbose]
|
|
10
|
-
|
|
11
8
|
distance_measure = options[:distance_measure] || :euclidean_distance
|
|
12
9
|
@nodes = Node.create_nodes(data, distance_measure)
|
|
13
|
-
@centroids =
|
|
14
|
-
|
|
10
|
+
@centroids = options[:custom_centroids] ||
|
|
11
|
+
Centroid.create_centroids(options[:centroids] || 4, @nodes)
|
|
12
|
+
@verbose = options[:verbose]
|
|
13
|
+
|
|
15
14
|
perform_cluster_process
|
|
16
15
|
end
|
|
17
|
-
|
|
16
|
+
|
|
18
17
|
def inspect
|
|
19
18
|
@centroid_pockets.inspect
|
|
20
19
|
end
|
|
21
|
-
|
|
20
|
+
|
|
22
21
|
def view
|
|
23
22
|
@centroid_pockets
|
|
24
23
|
end
|
|
25
|
-
|
|
24
|
+
|
|
26
25
|
private
|
|
27
|
-
|
|
26
|
+
|
|
28
27
|
def perform_cluster_process
|
|
29
28
|
iterations, updates = 0, 1
|
|
30
|
-
while updates > 0 && iterations < 100
|
|
29
|
+
while updates > 0 && iterations < 100
|
|
31
30
|
iterations += 1
|
|
32
31
|
verbose_message("Iteration #{iterations}")
|
|
33
|
-
updates = 0
|
|
32
|
+
updates = 0
|
|
34
33
|
updates += update_nodes
|
|
35
34
|
reposition_centroids
|
|
36
35
|
end
|
|
37
36
|
place_nodes_into_pockets
|
|
38
37
|
end
|
|
39
|
-
|
|
38
|
+
|
|
40
39
|
# This creates an array of arrays
|
|
41
40
|
# Each internal array represents a centroid
|
|
42
41
|
# and each element in that array is the index of a node
|
|
@@ -51,7 +50,7 @@ class KMeans
|
|
|
51
50
|
end
|
|
52
51
|
@centroid_pockets = centroid_pockets
|
|
53
52
|
end
|
|
54
|
-
|
|
53
|
+
|
|
55
54
|
def update_nodes
|
|
56
55
|
sum = 0
|
|
57
56
|
@nodes.each do |node|
|
|
@@ -59,17 +58,18 @@ class KMeans
|
|
|
59
58
|
end
|
|
60
59
|
sum
|
|
61
60
|
end
|
|
62
|
-
|
|
61
|
+
|
|
63
62
|
def reposition_centroids
|
|
63
|
+
centroid_positions = @centroids.map(&:position)
|
|
64
64
|
@centroids.each do |centroid|
|
|
65
|
-
nodes = []
|
|
65
|
+
nodes = []
|
|
66
66
|
@nodes.each {|n| nodes << n if n.closest_centroid == centroid}
|
|
67
|
-
centroid.reposition(nodes)
|
|
67
|
+
centroid.reposition(nodes, centroid_positions)
|
|
68
68
|
end
|
|
69
69
|
end
|
|
70
|
-
|
|
70
|
+
|
|
71
71
|
def verbose_message(message)
|
|
72
72
|
puts message if @verbose
|
|
73
73
|
end
|
|
74
|
-
|
|
75
|
-
end
|
|
74
|
+
|
|
75
|
+
end
|
data/lib/k_means/node.rb
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
class Node
|
|
2
|
-
|
|
2
|
+
|
|
3
3
|
class << self
|
|
4
4
|
def create_nodes(data, similarity_measure)
|
|
5
5
|
nodes = []
|
|
@@ -9,41 +9,59 @@ class Node
|
|
|
9
9
|
nodes
|
|
10
10
|
end
|
|
11
11
|
end
|
|
12
|
-
|
|
12
|
+
|
|
13
13
|
attr_accessor :position, :best_distance, :closest_centroid
|
|
14
|
-
|
|
14
|
+
|
|
15
15
|
def initialize(position, similarity_measure)
|
|
16
16
|
@position = position
|
|
17
17
|
@similarity_measure = similarity_measure
|
|
18
18
|
end
|
|
19
|
-
|
|
19
|
+
|
|
20
20
|
def update_closest_centroid(centroids)
|
|
21
|
+
# If we haven't processed this node we need to give it an initial centroid
|
|
22
|
+
# so that we have something to compare distances against
|
|
21
23
|
calculate_initial_centroid(centroids.first) unless @closest_centroid
|
|
24
|
+
|
|
22
25
|
updated = false
|
|
23
26
|
centroids.each do |centroid|
|
|
27
|
+
# Check if they are in the same position
|
|
28
|
+
if centroid.position == @position
|
|
29
|
+
updated = update_attributes(centroid, 0.0)
|
|
30
|
+
break
|
|
31
|
+
end
|
|
32
|
+
|
|
24
33
|
distance = calculate_distance(centroid)
|
|
25
34
|
if distance < best_distance
|
|
26
|
-
updated =
|
|
27
|
-
@closest_centroid = centroid
|
|
28
|
-
@best_distance = distance
|
|
35
|
+
updated = update_attributes(centroid, distance)
|
|
29
36
|
end
|
|
30
37
|
end
|
|
38
|
+
|
|
31
39
|
updated == true ? 1 : 0
|
|
32
40
|
end
|
|
33
|
-
|
|
41
|
+
|
|
42
|
+
def reset!
|
|
43
|
+
@closest_centroid = nil
|
|
44
|
+
@best_distance = nil
|
|
45
|
+
end
|
|
46
|
+
|
|
34
47
|
private
|
|
35
|
-
|
|
48
|
+
|
|
49
|
+
def update_attributes(closest_centroid, best_distance)
|
|
50
|
+
@closest_centroid, @best_distance = closest_centroid, best_distance
|
|
51
|
+
true
|
|
52
|
+
end
|
|
53
|
+
|
|
36
54
|
def calculate_initial_centroid(centroid)
|
|
37
55
|
@closest_centroid = centroid
|
|
38
56
|
@best_distance = calculate_distance(centroid)
|
|
39
57
|
end
|
|
40
|
-
|
|
58
|
+
|
|
41
59
|
def calculate_distance(centroid)
|
|
42
60
|
begin
|
|
43
61
|
@position.send(@similarity_measure, centroid.position)
|
|
44
62
|
rescue NoMethodError
|
|
45
|
-
raise "Hey,
|
|
63
|
+
raise "Hey, '#{@similarity_measure}' is not a measurement. Read the REAdME for available measurements"
|
|
46
64
|
end
|
|
47
65
|
end
|
|
48
|
-
|
|
49
|
-
end
|
|
66
|
+
|
|
67
|
+
end
|
|
@@ -2,23 +2,49 @@ require 'helper'
|
|
|
2
2
|
|
|
3
3
|
class TestKMeans < Test::Unit::TestCase
|
|
4
4
|
context "A KMeans Instance" do
|
|
5
|
-
|
|
5
|
+
|
|
6
6
|
setup do
|
|
7
|
-
@data = Array.new(
|
|
7
|
+
@data = Array.new(3) {Array.new(2) {rand}}
|
|
8
8
|
@kmeans = KMeans.new(@data, :centroids => 2, :distance_measure => :cosine_similarity)
|
|
9
9
|
end
|
|
10
|
-
|
|
10
|
+
|
|
11
11
|
should "return an array" do
|
|
12
12
|
assert_kind_of String, @kmeans.inspect
|
|
13
13
|
end
|
|
14
|
-
|
|
14
|
+
|
|
15
15
|
should "have 2 centroids" do
|
|
16
16
|
assert_equal 2, @kmeans.centroids.size
|
|
17
17
|
end
|
|
18
|
-
|
|
18
|
+
|
|
19
19
|
should "have 200 nodes" do
|
|
20
|
-
assert_equal
|
|
20
|
+
assert_equal 3, @kmeans.nodes.size
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
context "A KMeans Instance with specified initial centroids" do
|
|
26
|
+
setup do
|
|
27
|
+
@data = Array.new(3) {Array.new(2) {rand}}
|
|
28
|
+
class CustomCentroid
|
|
29
|
+
attr_accessor :position
|
|
30
|
+
def initialize(position); @position = position; end
|
|
31
|
+
def reposition(nodes, centroid_positions); end
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
@specified_centroids = @data[0..2].map { |d| CustomCentroid.new(d) }
|
|
35
|
+
@kmeans = KMeans.new(@data, :custom_centroids => @specified_centroids, :distance_measure => :cosine_similarity)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
should "return an inspected array" do
|
|
39
|
+
assert_kind_of String, @kmeans.inspect
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
should "have 3 centroids" do
|
|
43
|
+
assert_equal 3, @kmeans.centroids.size
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
should "have 3 nodes" do
|
|
47
|
+
assert_equal 3, @kmeans.nodes.size
|
|
21
48
|
end
|
|
22
|
-
|
|
23
49
|
end
|
|
24
50
|
end
|
data/test/k_means/test_node.rb
CHANGED
|
@@ -2,49 +2,63 @@ require 'helper'
|
|
|
2
2
|
|
|
3
3
|
class TestNode < Test::Unit::TestCase
|
|
4
4
|
context "A Data Instance" do
|
|
5
|
-
|
|
5
|
+
|
|
6
6
|
setup do
|
|
7
7
|
@node = Node.new([4, 4], :euclidean_distance)
|
|
8
8
|
end
|
|
9
|
-
|
|
9
|
+
|
|
10
10
|
should "return an array" do
|
|
11
11
|
assert_kind_of Array, @node.position
|
|
12
12
|
end
|
|
13
|
-
|
|
13
|
+
|
|
14
14
|
should "create an array of nodes" do
|
|
15
15
|
data = Array.new(10) {Array.new(2) {rand}}
|
|
16
16
|
nodes = Node.create_nodes(data, :euclidean_distance)
|
|
17
17
|
assert_kind_of Array, nodes
|
|
18
18
|
end
|
|
19
|
-
|
|
19
|
+
|
|
20
20
|
should "create 10 nodes" do
|
|
21
21
|
data = Array.new(10) {Array.new(2) {rand}}
|
|
22
22
|
nodes = Node.create_nodes(data, :euclidean_distance)
|
|
23
23
|
assert_equal 10, nodes.size
|
|
24
24
|
end
|
|
25
|
-
|
|
25
|
+
|
|
26
26
|
should "initialize closest centroid" do
|
|
27
27
|
a = @node.closest_centroid
|
|
28
28
|
centroids = [Centroid.new([4, 4]), Centroid.new([5, 4])]
|
|
29
29
|
@node.update_closest_centroid(centroids)
|
|
30
30
|
assert_not_equal nil, @node.closest_centroid
|
|
31
31
|
end
|
|
32
|
-
|
|
32
|
+
|
|
33
33
|
should "update closest centroid" do
|
|
34
34
|
centroids = [Centroid.new([5, 4])]
|
|
35
35
|
@node.update_closest_centroid(centroids)
|
|
36
36
|
a = @node.closest_centroid
|
|
37
37
|
@node.update_closest_centroid([Centroid.new([4,4])])
|
|
38
|
-
|
|
38
|
+
|
|
39
39
|
assert_not_equal a, @node.closest_centroid
|
|
40
40
|
assert_equal 0, @node.best_distance
|
|
41
41
|
end
|
|
42
|
-
|
|
42
|
+
|
|
43
|
+
context "Reset the node" do
|
|
44
|
+
should "update closest centroid" do
|
|
45
|
+
centroids = [Centroid.new([4, 4])]
|
|
46
|
+
@node.update_closest_centroid(centroids)
|
|
47
|
+
a = @node.closest_centroid
|
|
48
|
+
@node.reset!
|
|
49
|
+
|
|
50
|
+
@node.update_closest_centroid([Centroid.new([5,4])])
|
|
51
|
+
|
|
52
|
+
assert_not_equal a, @node.closest_centroid
|
|
53
|
+
assert_equal 1, @node.best_distance
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
|
|
43
57
|
should "raise error if a false measure is specified" do
|
|
44
58
|
assert_raise NoMethodError do
|
|
45
59
|
Node.new([9,9], :fakey).calculate_distance([1,1])
|
|
46
60
|
end
|
|
47
61
|
end
|
|
48
|
-
|
|
62
|
+
|
|
49
63
|
end
|
|
50
|
-
end
|
|
64
|
+
end
|
metadata
CHANGED
|
@@ -1,7 +1,12 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: k_means
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
|
|
4
|
+
prerelease: false
|
|
5
|
+
segments:
|
|
6
|
+
- 0
|
|
7
|
+
- 0
|
|
8
|
+
- 7
|
|
9
|
+
version: 0.0.7
|
|
5
10
|
platform: ruby
|
|
6
11
|
authors:
|
|
7
12
|
- reddavis
|
|
@@ -9,19 +14,23 @@ autorequire:
|
|
|
9
14
|
bindir: bin
|
|
10
15
|
cert_chain: []
|
|
11
16
|
|
|
12
|
-
date: 2010-
|
|
17
|
+
date: 2010-07-11 00:00:00 +01:00
|
|
13
18
|
default_executable:
|
|
14
19
|
dependencies:
|
|
15
20
|
- !ruby/object:Gem::Dependency
|
|
16
21
|
name: distance_measures
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
prerelease: false
|
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
|
20
24
|
requirements:
|
|
21
25
|
- - ">="
|
|
22
26
|
- !ruby/object:Gem::Version
|
|
27
|
+
segments:
|
|
28
|
+
- 0
|
|
29
|
+
- 0
|
|
30
|
+
- 0
|
|
23
31
|
version: 0.0.0
|
|
24
|
-
|
|
32
|
+
type: :runtime
|
|
33
|
+
version_requirements: *id001
|
|
25
34
|
description: Attempting to create a fast, memory efficient KMeans
|
|
26
35
|
email: reddavis@gmail.com
|
|
27
36
|
executables: []
|
|
@@ -64,18 +73,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
64
73
|
requirements:
|
|
65
74
|
- - ">="
|
|
66
75
|
- !ruby/object:Gem::Version
|
|
76
|
+
segments:
|
|
77
|
+
- 0
|
|
67
78
|
version: "0"
|
|
68
|
-
version:
|
|
69
79
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
70
80
|
requirements:
|
|
71
81
|
- - ">="
|
|
72
82
|
- !ruby/object:Gem::Version
|
|
83
|
+
segments:
|
|
84
|
+
- 0
|
|
73
85
|
version: "0"
|
|
74
|
-
version:
|
|
75
86
|
requirements: []
|
|
76
87
|
|
|
77
88
|
rubyforge_project:
|
|
78
|
-
rubygems_version: 1.3.
|
|
89
|
+
rubygems_version: 1.3.6
|
|
79
90
|
signing_key:
|
|
80
91
|
specification_version: 3
|
|
81
92
|
summary: K Means algorithm
|