k_means 0.0.5 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +23 -9
- data/VERSION +1 -1
- data/benchmark/benchmark_ai4r.rb +2 -2
- data/k_means.gemspec +3 -3
- data/lib/k_means/centroid.rb +2 -2
- data/lib/k_means/k_means.rb +21 -21
- data/lib/k_means/node.rb +31 -13
- data/test/k_means/test_centroid.rb +1 -1
- data/test/k_means/test_k_means.rb +33 -7
- data/test/k_means/test_node.rb +24 -10
- metadata +20 -9
data/README.rdoc
CHANGED
@@ -4,18 +4,32 @@ Attempting to build a fast, memory efficient K-Means program.
|
|
4
4
|
|
5
5
|
== Install
|
6
6
|
|
7
|
-
gem sources -a http://
|
8
|
-
sudo gem install
|
7
|
+
gem sources -a http://rubygems.org
|
8
|
+
sudo gem install k_means
|
9
9
|
|
10
10
|
== How To Use
|
11
11
|
require 'rubygems'
|
12
12
|
require 'k_means'
|
13
|
-
|
13
|
+
|
14
14
|
data = [[1,1], [1,2], [1,1], [1000, 1000], [500, 500]]
|
15
|
-
kmeans = KMeans.new(
|
15
|
+
kmeans = KMeans.new(data, :centroids => 2)
|
16
16
|
kmeans.inspect # Use kmeans.view to get hold of the un-inspected array
|
17
17
|
=> [[3, 4], [0, 1, 2]]
|
18
|
-
|
18
|
+
|
19
|
+
== Custom Centroids
|
20
|
+
require 'rubygems'
|
21
|
+
require 'k_means'
|
22
|
+
|
23
|
+
# Your custom centroid needs to have #position and #reposition methods
|
24
|
+
class CustomCentroid
|
25
|
+
attr_accessor :position
|
26
|
+
def initialize(position); @position = position; end
|
27
|
+
def reposition(nodes, centroid_positions); end
|
28
|
+
end
|
29
|
+
|
30
|
+
data = [[1,1], [1,2], [1,1], [1000, 1000], [500, 500]]
|
31
|
+
kmeans = KMeans.new(data, :custom_centroids => data[0..1].map { |d| CustomCentroid.new(d) })
|
32
|
+
|
19
33
|
== Distance Measurements
|
20
34
|
|
21
35
|
KMeans uses the Distance Measures Gem (http://github.com/reddavis/Distance-Measures) so we get quite a range of distance measurements.
|
@@ -35,19 +49,19 @@ The measurements currently available are:
|
|
35
49
|
binary_jaccard_distance
|
36
50
|
|
37
51
|
tanimoto_coefficient
|
38
|
-
|
52
|
+
|
39
53
|
To specify a particular one to use in the KMeans algorithm, just provide it as an option:
|
40
54
|
|
41
55
|
KMeans.new(@data, :distance_measure => :jaccard_index)
|
42
56
|
KMeans.new(@data, :distance_measure => :cosine_similarity)
|
43
57
|
KMeans.new(@data, :distance_measure => :tanimoto_coefficient)
|
44
|
-
|
58
|
+
|
45
59
|
You get the idea...
|
46
|
-
|
60
|
+
|
47
61
|
== Benchmarks
|
48
62
|
|
49
63
|
# 1000 records with 50 dimensions
|
50
|
-
data = Array.new(1000) {Array.new(50) {rand(10)}}
|
64
|
+
data = Array.new(1000) {Array.new(50) {rand(10)}}
|
51
65
|
ai4r_data = Ai4r::Data::DataSet.new(:data_items=> data)
|
52
66
|
|
53
67
|
# Clustering can happen in magical ways
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.7
|
data/benchmark/benchmark_ai4r.rb
CHANGED
@@ -3,7 +3,7 @@ require 'rubygems'
|
|
3
3
|
require 'ai4r'
|
4
4
|
require File.dirname(__FILE__) + '/../lib/k_means'
|
5
5
|
|
6
|
-
data = Array.new(
|
6
|
+
data = Array.new(100) {Array.new(50) {rand(10)}}
|
7
7
|
|
8
8
|
ai4r_data = Ai4r::Data::DataSet.new(:data_items=> data)
|
9
9
|
|
@@ -16,7 +16,7 @@ Benchmark.bm do |x|
|
|
16
16
|
x.report('Mine') do
|
17
17
|
n.times { KMeans.new(data) }
|
18
18
|
end
|
19
|
-
|
19
|
+
|
20
20
|
x.report("Ai4R") do
|
21
21
|
n.times do
|
22
22
|
b = Ai4r::Clusterers::KMeans.new
|
data/k_means.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{k_means}
|
8
|
-
s.version = "0.0.
|
8
|
+
s.version = "0.0.7"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["reddavis"]
|
12
|
-
s.date = %q{2010-
|
12
|
+
s.date = %q{2010-07-11}
|
13
13
|
s.description = %q{Attempting to create a fast, memory efficient KMeans}
|
14
14
|
s.email = %q{reddavis@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -40,7 +40,7 @@ Gem::Specification.new do |s|
|
|
40
40
|
s.homepage = %q{http://github.com/reddavis/k_means}
|
41
41
|
s.rdoc_options = ["--charset=UTF-8"]
|
42
42
|
s.require_paths = ["lib"]
|
43
|
-
s.rubygems_version = %q{1.3.
|
43
|
+
s.rubygems_version = %q{1.3.6}
|
44
44
|
s.summary = %q{K Means algorithm}
|
45
45
|
s.test_files = [
|
46
46
|
"test/ext/test_object.rb",
|
data/lib/k_means/centroid.rb
CHANGED
@@ -35,7 +35,7 @@ class Centroid
|
|
35
35
|
|
36
36
|
# Finds the average position of all the nodes assigned to
|
37
37
|
# the centroid and then moves the centroid to that position
|
38
|
-
def reposition(nodes)
|
38
|
+
def reposition(nodes, centroids)
|
39
39
|
return if nodes.empty?
|
40
40
|
averages = [0.0] * nodes[0].position.size
|
41
41
|
nodes.each do |node|
|
@@ -46,4 +46,4 @@ class Centroid
|
|
46
46
|
@position = averages.map {|x| x / nodes.size}
|
47
47
|
end
|
48
48
|
|
49
|
-
end
|
49
|
+
end
|
data/lib/k_means/k_means.rb
CHANGED
@@ -1,42 +1,41 @@
|
|
1
1
|
require 'ext/object'
|
2
2
|
|
3
3
|
class KMeans
|
4
|
-
|
4
|
+
|
5
5
|
attr_reader :centroids, :nodes
|
6
|
-
|
6
|
+
|
7
7
|
def initialize(data, options={})
|
8
|
-
k = options[:centroids] || 4
|
9
|
-
@verbose = options[:verbose]
|
10
|
-
|
11
8
|
distance_measure = options[:distance_measure] || :euclidean_distance
|
12
9
|
@nodes = Node.create_nodes(data, distance_measure)
|
13
|
-
@centroids =
|
14
|
-
|
10
|
+
@centroids = options[:custom_centroids] ||
|
11
|
+
Centroid.create_centroids(options[:centroids] || 4, @nodes)
|
12
|
+
@verbose = options[:verbose]
|
13
|
+
|
15
14
|
perform_cluster_process
|
16
15
|
end
|
17
|
-
|
16
|
+
|
18
17
|
def inspect
|
19
18
|
@centroid_pockets.inspect
|
20
19
|
end
|
21
|
-
|
20
|
+
|
22
21
|
def view
|
23
22
|
@centroid_pockets
|
24
23
|
end
|
25
|
-
|
24
|
+
|
26
25
|
private
|
27
|
-
|
26
|
+
|
28
27
|
def perform_cluster_process
|
29
28
|
iterations, updates = 0, 1
|
30
|
-
while updates > 0 && iterations < 100
|
29
|
+
while updates > 0 && iterations < 100
|
31
30
|
iterations += 1
|
32
31
|
verbose_message("Iteration #{iterations}")
|
33
|
-
updates = 0
|
32
|
+
updates = 0
|
34
33
|
updates += update_nodes
|
35
34
|
reposition_centroids
|
36
35
|
end
|
37
36
|
place_nodes_into_pockets
|
38
37
|
end
|
39
|
-
|
38
|
+
|
40
39
|
# This creates an array of arrays
|
41
40
|
# Each internal array represents a centroid
|
42
41
|
# and each element in that array is the index of a node
|
@@ -51,7 +50,7 @@ class KMeans
|
|
51
50
|
end
|
52
51
|
@centroid_pockets = centroid_pockets
|
53
52
|
end
|
54
|
-
|
53
|
+
|
55
54
|
def update_nodes
|
56
55
|
sum = 0
|
57
56
|
@nodes.each do |node|
|
@@ -59,17 +58,18 @@ class KMeans
|
|
59
58
|
end
|
60
59
|
sum
|
61
60
|
end
|
62
|
-
|
61
|
+
|
63
62
|
def reposition_centroids
|
63
|
+
centroid_positions = @centroids.map(&:position)
|
64
64
|
@centroids.each do |centroid|
|
65
|
-
nodes = []
|
65
|
+
nodes = []
|
66
66
|
@nodes.each {|n| nodes << n if n.closest_centroid == centroid}
|
67
|
-
centroid.reposition(nodes)
|
67
|
+
centroid.reposition(nodes, centroid_positions)
|
68
68
|
end
|
69
69
|
end
|
70
|
-
|
70
|
+
|
71
71
|
def verbose_message(message)
|
72
72
|
puts message if @verbose
|
73
73
|
end
|
74
|
-
|
75
|
-
end
|
74
|
+
|
75
|
+
end
|
data/lib/k_means/node.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
class Node
|
2
|
-
|
2
|
+
|
3
3
|
class << self
|
4
4
|
def create_nodes(data, similarity_measure)
|
5
5
|
nodes = []
|
@@ -9,41 +9,59 @@ class Node
|
|
9
9
|
nodes
|
10
10
|
end
|
11
11
|
end
|
12
|
-
|
12
|
+
|
13
13
|
attr_accessor :position, :best_distance, :closest_centroid
|
14
|
-
|
14
|
+
|
15
15
|
def initialize(position, similarity_measure)
|
16
16
|
@position = position
|
17
17
|
@similarity_measure = similarity_measure
|
18
18
|
end
|
19
|
-
|
19
|
+
|
20
20
|
def update_closest_centroid(centroids)
|
21
|
+
# If we haven't processed this node we need to give it an initial centroid
|
22
|
+
# so that we have something to compare distances against
|
21
23
|
calculate_initial_centroid(centroids.first) unless @closest_centroid
|
24
|
+
|
22
25
|
updated = false
|
23
26
|
centroids.each do |centroid|
|
27
|
+
# Check if they are in the same position
|
28
|
+
if centroid.position == @position
|
29
|
+
updated = update_attributes(centroid, 0.0)
|
30
|
+
break
|
31
|
+
end
|
32
|
+
|
24
33
|
distance = calculate_distance(centroid)
|
25
34
|
if distance < best_distance
|
26
|
-
updated =
|
27
|
-
@closest_centroid = centroid
|
28
|
-
@best_distance = distance
|
35
|
+
updated = update_attributes(centroid, distance)
|
29
36
|
end
|
30
37
|
end
|
38
|
+
|
31
39
|
updated == true ? 1 : 0
|
32
40
|
end
|
33
|
-
|
41
|
+
|
42
|
+
def reset!
|
43
|
+
@closest_centroid = nil
|
44
|
+
@best_distance = nil
|
45
|
+
end
|
46
|
+
|
34
47
|
private
|
35
|
-
|
48
|
+
|
49
|
+
def update_attributes(closest_centroid, best_distance)
|
50
|
+
@closest_centroid, @best_distance = closest_centroid, best_distance
|
51
|
+
true
|
52
|
+
end
|
53
|
+
|
36
54
|
def calculate_initial_centroid(centroid)
|
37
55
|
@closest_centroid = centroid
|
38
56
|
@best_distance = calculate_distance(centroid)
|
39
57
|
end
|
40
|
-
|
58
|
+
|
41
59
|
def calculate_distance(centroid)
|
42
60
|
begin
|
43
61
|
@position.send(@similarity_measure, centroid.position)
|
44
62
|
rescue NoMethodError
|
45
|
-
raise "Hey,
|
63
|
+
raise "Hey, '#{@similarity_measure}' is not a measurement. Read the REAdME for available measurements"
|
46
64
|
end
|
47
65
|
end
|
48
|
-
|
49
|
-
end
|
66
|
+
|
67
|
+
end
|
@@ -2,23 +2,49 @@ require 'helper'
|
|
2
2
|
|
3
3
|
class TestKMeans < Test::Unit::TestCase
|
4
4
|
context "A KMeans Instance" do
|
5
|
-
|
5
|
+
|
6
6
|
setup do
|
7
|
-
@data = Array.new(
|
7
|
+
@data = Array.new(3) {Array.new(2) {rand}}
|
8
8
|
@kmeans = KMeans.new(@data, :centroids => 2, :distance_measure => :cosine_similarity)
|
9
9
|
end
|
10
|
-
|
10
|
+
|
11
11
|
should "return an array" do
|
12
12
|
assert_kind_of String, @kmeans.inspect
|
13
13
|
end
|
14
|
-
|
14
|
+
|
15
15
|
should "have 2 centroids" do
|
16
16
|
assert_equal 2, @kmeans.centroids.size
|
17
17
|
end
|
18
|
-
|
18
|
+
|
19
19
|
should "have 200 nodes" do
|
20
|
-
assert_equal
|
20
|
+
assert_equal 3, @kmeans.nodes.size
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
24
|
+
|
25
|
+
context "A KMeans Instance with specified initial centroids" do
|
26
|
+
setup do
|
27
|
+
@data = Array.new(3) {Array.new(2) {rand}}
|
28
|
+
class CustomCentroid
|
29
|
+
attr_accessor :position
|
30
|
+
def initialize(position); @position = position; end
|
31
|
+
def reposition(nodes, centroid_positions); end
|
32
|
+
end
|
33
|
+
|
34
|
+
@specified_centroids = @data[0..2].map { |d| CustomCentroid.new(d) }
|
35
|
+
@kmeans = KMeans.new(@data, :custom_centroids => @specified_centroids, :distance_measure => :cosine_similarity)
|
36
|
+
end
|
37
|
+
|
38
|
+
should "return an inspected array" do
|
39
|
+
assert_kind_of String, @kmeans.inspect
|
40
|
+
end
|
41
|
+
|
42
|
+
should "have 3 centroids" do
|
43
|
+
assert_equal 3, @kmeans.centroids.size
|
44
|
+
end
|
45
|
+
|
46
|
+
should "have 3 nodes" do
|
47
|
+
assert_equal 3, @kmeans.nodes.size
|
21
48
|
end
|
22
|
-
|
23
49
|
end
|
24
50
|
end
|
data/test/k_means/test_node.rb
CHANGED
@@ -2,49 +2,63 @@ require 'helper'
|
|
2
2
|
|
3
3
|
class TestNode < Test::Unit::TestCase
|
4
4
|
context "A Data Instance" do
|
5
|
-
|
5
|
+
|
6
6
|
setup do
|
7
7
|
@node = Node.new([4, 4], :euclidean_distance)
|
8
8
|
end
|
9
|
-
|
9
|
+
|
10
10
|
should "return an array" do
|
11
11
|
assert_kind_of Array, @node.position
|
12
12
|
end
|
13
|
-
|
13
|
+
|
14
14
|
should "create an array of nodes" do
|
15
15
|
data = Array.new(10) {Array.new(2) {rand}}
|
16
16
|
nodes = Node.create_nodes(data, :euclidean_distance)
|
17
17
|
assert_kind_of Array, nodes
|
18
18
|
end
|
19
|
-
|
19
|
+
|
20
20
|
should "create 10 nodes" do
|
21
21
|
data = Array.new(10) {Array.new(2) {rand}}
|
22
22
|
nodes = Node.create_nodes(data, :euclidean_distance)
|
23
23
|
assert_equal 10, nodes.size
|
24
24
|
end
|
25
|
-
|
25
|
+
|
26
26
|
should "initialize closest centroid" do
|
27
27
|
a = @node.closest_centroid
|
28
28
|
centroids = [Centroid.new([4, 4]), Centroid.new([5, 4])]
|
29
29
|
@node.update_closest_centroid(centroids)
|
30
30
|
assert_not_equal nil, @node.closest_centroid
|
31
31
|
end
|
32
|
-
|
32
|
+
|
33
33
|
should "update closest centroid" do
|
34
34
|
centroids = [Centroid.new([5, 4])]
|
35
35
|
@node.update_closest_centroid(centroids)
|
36
36
|
a = @node.closest_centroid
|
37
37
|
@node.update_closest_centroid([Centroid.new([4,4])])
|
38
|
-
|
38
|
+
|
39
39
|
assert_not_equal a, @node.closest_centroid
|
40
40
|
assert_equal 0, @node.best_distance
|
41
41
|
end
|
42
|
-
|
42
|
+
|
43
|
+
context "Reset the node" do
|
44
|
+
should "update closest centroid" do
|
45
|
+
centroids = [Centroid.new([4, 4])]
|
46
|
+
@node.update_closest_centroid(centroids)
|
47
|
+
a = @node.closest_centroid
|
48
|
+
@node.reset!
|
49
|
+
|
50
|
+
@node.update_closest_centroid([Centroid.new([5,4])])
|
51
|
+
|
52
|
+
assert_not_equal a, @node.closest_centroid
|
53
|
+
assert_equal 1, @node.best_distance
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
43
57
|
should "raise error if a false measure is specified" do
|
44
58
|
assert_raise NoMethodError do
|
45
59
|
Node.new([9,9], :fakey).calculate_distance([1,1])
|
46
60
|
end
|
47
61
|
end
|
48
|
-
|
62
|
+
|
49
63
|
end
|
50
|
-
end
|
64
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: k_means
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 7
|
9
|
+
version: 0.0.7
|
5
10
|
platform: ruby
|
6
11
|
authors:
|
7
12
|
- reddavis
|
@@ -9,19 +14,23 @@ autorequire:
|
|
9
14
|
bindir: bin
|
10
15
|
cert_chain: []
|
11
16
|
|
12
|
-
date: 2010-
|
17
|
+
date: 2010-07-11 00:00:00 +01:00
|
13
18
|
default_executable:
|
14
19
|
dependencies:
|
15
20
|
- !ruby/object:Gem::Dependency
|
16
21
|
name: distance_measures
|
17
|
-
|
18
|
-
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
24
|
requirements:
|
21
25
|
- - ">="
|
22
26
|
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
- 0
|
30
|
+
- 0
|
23
31
|
version: 0.0.0
|
24
|
-
|
32
|
+
type: :runtime
|
33
|
+
version_requirements: *id001
|
25
34
|
description: Attempting to create a fast, memory efficient KMeans
|
26
35
|
email: reddavis@gmail.com
|
27
36
|
executables: []
|
@@ -64,18 +73,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
64
73
|
requirements:
|
65
74
|
- - ">="
|
66
75
|
- !ruby/object:Gem::Version
|
76
|
+
segments:
|
77
|
+
- 0
|
67
78
|
version: "0"
|
68
|
-
version:
|
69
79
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
70
80
|
requirements:
|
71
81
|
- - ">="
|
72
82
|
- !ruby/object:Gem::Version
|
83
|
+
segments:
|
84
|
+
- 0
|
73
85
|
version: "0"
|
74
|
-
version:
|
75
86
|
requirements: []
|
76
87
|
|
77
88
|
rubyforge_project:
|
78
|
-
rubygems_version: 1.3.
|
89
|
+
rubygems_version: 1.3.6
|
79
90
|
signing_key:
|
80
91
|
specification_version: 3
|
81
92
|
summary: K Means algorithm
|