k_means 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +28 -0
- data/Rakefile +1 -0
- data/VERSION +1 -1
- data/benchmark/benchmark_ai4r.rb +2 -2
- data/k_means.gemspec +15 -14
- data/lib/{centroid.rb → k_means/centroid.rb} +0 -0
- data/lib/k_means/k_means.rb +75 -0
- data/lib/{node.rb → k_means/node.rb} +9 -4
- data/lib/k_means.rb +7 -77
- data/test/{test_centroid.rb → k_means/test_centroid.rb} +1 -1
- data/test/{test_k_means.rb → k_means/test_k_means.rb} +1 -1
- data/test/{test_node.rb → k_means/test_node.rb} +10 -3
- metadata +22 -15
- data/lib/ext/enumerable.rb +0 -10
- data/test/ext/test_enumerable.rb +0 -11
data/README.rdoc
CHANGED
@@ -16,6 +16,34 @@ Attempting to build a fast, memory efficient K-Means program.
|
|
16
16
|
kmeans.inspect # Use kmeans.view to get hold of the un-inspected array
|
17
17
|
=> [[3, 4], [0, 1, 2]]
|
18
18
|
|
19
|
+
== Distance Measurements
|
20
|
+
|
21
|
+
KMeans uses the Distance Measures Gem (http://github.com/reddavis/Distance-Measures) so we get quite a range of distance measurements.
|
22
|
+
|
23
|
+
The measurements currently available are:
|
24
|
+
|
25
|
+
euclidean_distance
|
26
|
+
|
27
|
+
cosine_similarity
|
28
|
+
|
29
|
+
jaccard_index
|
30
|
+
|
31
|
+
jaccard_distance
|
32
|
+
|
33
|
+
binary_jaccard_index
|
34
|
+
|
35
|
+
binary_jaccard_distance
|
36
|
+
|
37
|
+
tanimoto_coefficient
|
38
|
+
|
39
|
+
To specify a particular one to use in the KMeans algorithm, just provide it as an option:
|
40
|
+
|
41
|
+
KMeans.new(@data, :similarity_measure => :jaccard_index)
|
42
|
+
KMeans.new(@data, :similarity_measure => :cosine_similarity)
|
43
|
+
KMeans.new(@data, :similarity_measure => :tanimoto_coefficient)
|
44
|
+
|
45
|
+
You get the idea...
|
46
|
+
|
19
47
|
== Benchmarks
|
20
48
|
|
21
49
|
# 1000 records with 50 dimensions
|
data/Rakefile
CHANGED
@@ -10,6 +10,7 @@ begin
|
|
10
10
|
gem.email = "reddavis@gmail.com"
|
11
11
|
gem.homepage = "http://github.com/reddavis/k_means"
|
12
12
|
gem.authors = ["reddavis"]
|
13
|
+
gem.add_dependency('distance_measures', '>= 0.0.0')
|
13
14
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
14
15
|
end
|
15
16
|
Jeweler::GemcutterTasks.new
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.3
|
1
|
+
0.0.4
|
data/benchmark/benchmark_ai4r.rb
CHANGED
@@ -3,13 +3,13 @@ require 'rubygems'
|
|
3
3
|
require 'ai4r'
|
4
4
|
require File.dirname(__FILE__) + '/../lib/k_means'
|
5
5
|
|
6
|
-
data = Array.new(
|
6
|
+
data = Array.new(1000) {Array.new(50) {rand(10)}}
|
7
7
|
|
8
8
|
ai4r_data = Ai4r::Data::DataSet.new(:data_items=> data)
|
9
9
|
|
10
10
|
# Clustering can happen in magical ways
|
11
11
|
# so lets do it over multiple times
|
12
|
-
n =
|
12
|
+
n = 5
|
13
13
|
|
14
14
|
|
15
15
|
Benchmark.bm do |x|
|
data/k_means.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{k_means}
|
8
|
-
s.version = "0.0.3"
|
8
|
+
s.version = "0.0.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["reddavis"]
|
12
|
-
s.date = %q{
|
12
|
+
s.date = %q{2010-01-25}
|
13
13
|
s.description = %q{Attempting to create a fast, memory efficient KMeans}
|
14
14
|
s.email = %q{reddavis@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -25,18 +25,17 @@ Gem::Specification.new do |s|
|
|
25
25
|
"VERSION",
|
26
26
|
"benchmark/benchmark_ai4r.rb",
|
27
27
|
"k_means.gemspec",
|
28
|
-
"lib/centroid.rb",
|
29
|
-
"lib/ext/enumerable.rb",
|
30
28
|
"lib/ext/object.rb",
|
31
29
|
"lib/k_means.rb",
|
32
|
-
"lib/node.rb",
|
30
|
+
"lib/k_means/centroid.rb",
|
31
|
+
"lib/k_means/k_means.rb",
|
32
|
+
"lib/k_means/node.rb",
|
33
33
|
"profiling/profile.rb",
|
34
|
-
"test/ext/test_enumerable.rb",
|
35
34
|
"test/ext/test_object.rb",
|
36
35
|
"test/helper.rb",
|
37
|
-
"test/test_centroid.rb",
|
38
|
-
"test/test_k_means.rb",
|
39
|
-
"test/test_node.rb"
|
36
|
+
"test/k_means/test_centroid.rb",
|
37
|
+
"test/k_means/test_k_means.rb",
|
38
|
+
"test/k_means/test_node.rb"
|
40
39
|
]
|
41
40
|
s.homepage = %q{http://github.com/reddavis/k_means}
|
42
41
|
s.rdoc_options = ["--charset=UTF-8"]
|
@@ -44,12 +43,11 @@ Gem::Specification.new do |s|
|
|
44
43
|
s.rubygems_version = %q{1.3.5}
|
45
44
|
s.summary = %q{K Means algorithm}
|
46
45
|
s.test_files = [
|
47
|
-
"test/ext/test_enumerable.rb",
|
48
|
-
"test/ext/test_object.rb",
|
46
|
+
"test/ext/test_object.rb",
|
49
47
|
"test/helper.rb",
|
50
|
-
"test/test_centroid.rb",
|
51
|
-
"test/test_k_means.rb",
|
52
|
-
"test/test_node.rb"
|
48
|
+
"test/k_means/test_centroid.rb",
|
49
|
+
"test/k_means/test_k_means.rb",
|
50
|
+
"test/k_means/test_node.rb"
|
53
51
|
]
|
54
52
|
|
55
53
|
if s.respond_to? :specification_version then
|
@@ -57,9 +55,12 @@ Gem::Specification.new do |s|
|
|
57
55
|
s.specification_version = 3
|
58
56
|
|
59
57
|
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
58
|
+
s.add_runtime_dependency(%q<distance_measures>, [">= 0.0.0"])
|
60
59
|
else
|
60
|
+
s.add_dependency(%q<distance_measures>, [">= 0.0.0"])
|
61
61
|
end
|
62
62
|
else
|
63
|
+
s.add_dependency(%q<distance_measures>, [">= 0.0.0"])
|
63
64
|
end
|
64
65
|
end
|
65
66
|
|
File without changes
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'ext/object'
|
2
|
+
|
3
|
+
class KMeans
|
4
|
+
|
5
|
+
attr_reader :centroids, :nodes
|
6
|
+
|
7
|
+
def initialize(data, options={})
|
8
|
+
k = options[:centroids] || 4
|
9
|
+
@verbose = options[:verbose]
|
10
|
+
|
11
|
+
similarity_measure = options[:similarity_measure] || :euclidean_distance
|
12
|
+
@nodes = Node.create_nodes(data, similarity_measure)
|
13
|
+
@centroids = Centroid.create_centroids(k, @nodes)
|
14
|
+
|
15
|
+
perform_cluster_process
|
16
|
+
end
|
17
|
+
|
18
|
+
def inspect
|
19
|
+
@centroid_pockets.inspect
|
20
|
+
end
|
21
|
+
|
22
|
+
def view
|
23
|
+
@centroid_pockets
|
24
|
+
end
|
25
|
+
|
26
|
+
private
|
27
|
+
|
28
|
+
def perform_cluster_process
|
29
|
+
iterations, updates = 0, 1
|
30
|
+
while updates > 0 && iterations < 100
|
31
|
+
iterations += 1
|
32
|
+
verbose_message("Iteration #{iterations}")
|
33
|
+
updates = 0
|
34
|
+
updates += update_nodes
|
35
|
+
reposition_centroids
|
36
|
+
end
|
37
|
+
place_nodes_into_pockets
|
38
|
+
end
|
39
|
+
|
40
|
+
# This creates an array of arrays
|
41
|
+
# Each internal array represents a centroid
|
42
|
+
# and each in the array represents the nodes index
|
43
|
+
def place_nodes_into_pockets
|
44
|
+
centroid_pockets = Array.new(@centroids.size) {[]}
|
45
|
+
@centroids.each_with_index do |centroid, centroid_index|
|
46
|
+
@nodes.each_with_index do |node, node_index|
|
47
|
+
if node.closest_centroid == centroid
|
48
|
+
centroid_pockets[centroid_index] << node_index
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
@centroid_pockets = centroid_pockets
|
53
|
+
end
|
54
|
+
|
55
|
+
def update_nodes
|
56
|
+
sum = 0
|
57
|
+
@nodes.each do |node|
|
58
|
+
sum += node.update_closest_centroid(@centroids)
|
59
|
+
end
|
60
|
+
sum
|
61
|
+
end
|
62
|
+
|
63
|
+
def reposition_centroids
|
64
|
+
@centroids.each do |centroid|
|
65
|
+
nodes = []
|
66
|
+
@nodes.each {|n| nodes << n if n.closest_centroid == centroid}
|
67
|
+
centroid.reposition(nodes)
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
def verbose_message(message)
|
72
|
+
puts message if @verbose
|
73
|
+
end
|
74
|
+
|
75
|
+
end
|
@@ -1,10 +1,10 @@
|
|
1
1
|
class Node
|
2
2
|
|
3
3
|
class << self
|
4
|
-
def create_nodes(data)
|
4
|
+
def create_nodes(data, similarity_measure)
|
5
5
|
nodes = []
|
6
6
|
data.each do |position|
|
7
|
-
nodes << new(position)
|
7
|
+
nodes << new(position, similarity_measure)
|
8
8
|
end
|
9
9
|
nodes
|
10
10
|
end
|
@@ -12,8 +12,9 @@ class Node
|
|
12
12
|
|
13
13
|
attr_accessor :position, :best_distance, :closest_centroid
|
14
14
|
|
15
|
-
def initialize(position)
|
15
|
+
def initialize(position, similarity_measure)
|
16
16
|
@position = position
|
17
|
+
@similarity_measure = similarity_measure
|
17
18
|
end
|
18
19
|
|
19
20
|
def update_closest_centroid(centroids)
|
@@ -38,7 +39,11 @@ class Node
|
|
38
39
|
end
|
39
40
|
|
40
41
|
def calculate_distance(centroid)
|
41
|
-
|
42
|
+
begin
|
43
|
+
@position.send(@similarity_measure, centroid.position)
|
44
|
+
rescue NoMethodError
|
45
|
+
raise "Hey, that's not a measurement. Read the REAdME for available measurements"
|
46
|
+
end
|
42
47
|
end
|
43
48
|
|
44
49
|
end
|
data/lib/k_means.rb
CHANGED
@@ -1,78 +1,8 @@
|
|
1
|
-
|
2
|
-
require 'centroid'
|
3
|
-
require 'node'
|
4
|
-
require 'ext/enumerable'
|
5
|
-
require 'ext/object'
|
1
|
+
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
6
2
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
@verbose = options[:verbose] == true ? true : nil
|
14
|
-
|
15
|
-
@nodes = Node.create_nodes(data)
|
16
|
-
@centroids = Centroid.create_centroids(k, @nodes)
|
17
|
-
|
18
|
-
perform_cluster_process
|
19
|
-
end
|
20
|
-
|
21
|
-
def inspect
|
22
|
-
@centroid_pockets.inspect
|
23
|
-
end
|
24
|
-
|
25
|
-
def view
|
26
|
-
@centroid_pockets
|
27
|
-
end
|
28
|
-
|
29
|
-
private
|
30
|
-
|
31
|
-
def perform_cluster_process
|
32
|
-
iterations, updates = 0, 1
|
33
|
-
while updates > 0 && iterations < 100
|
34
|
-
iterations += 1
|
35
|
-
verbose_message("Iteration #{iterations}")
|
36
|
-
updates = 0
|
37
|
-
updates += update_nodes
|
38
|
-
reposition_centroids
|
39
|
-
end
|
40
|
-
place_nodes_into_pockets
|
41
|
-
end
|
42
|
-
|
43
|
-
# This creates an array of arrays
|
44
|
-
# Each internal array represents a centroid
|
45
|
-
# and each in the array represents the nodes index
|
46
|
-
def place_nodes_into_pockets
|
47
|
-
centroid_pockets = Array.new(@centroids.size) {[]}
|
48
|
-
@centroids.each_with_index do |centroid, centroid_index|
|
49
|
-
@nodes.each_with_index do |node, node_index|
|
50
|
-
if node.closest_centroid == centroid
|
51
|
-
centroid_pockets[centroid_index] << node_index
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|
55
|
-
@centroid_pockets = centroid_pockets
|
56
|
-
end
|
57
|
-
|
58
|
-
def update_nodes
|
59
|
-
sum = 0
|
60
|
-
@nodes.each do |node|
|
61
|
-
sum += node.update_closest_centroid(@centroids)
|
62
|
-
end
|
63
|
-
sum
|
64
|
-
end
|
65
|
-
|
66
|
-
def reposition_centroids
|
67
|
-
@centroids.each do |centroid|
|
68
|
-
nodes = []
|
69
|
-
@nodes.each {|n| nodes << n if n.closest_centroid == centroid}
|
70
|
-
centroid.reposition(nodes)
|
71
|
-
end
|
72
|
-
end
|
73
|
-
|
74
|
-
def verbose_message(message)
|
75
|
-
puts message if @verbose
|
76
|
-
end
|
77
|
-
|
78
|
-
end
|
3
|
+
require 'k_means/k_means'
|
4
|
+
require 'k_means/centroid'
|
5
|
+
require 'k_means/node'
|
6
|
+
|
7
|
+
# Gems
|
8
|
+
require 'distance_measures'
|
@@ -5,7 +5,7 @@ class TestKMeans < Test::Unit::TestCase
|
|
5
5
|
|
6
6
|
setup do
|
7
7
|
@data = Array.new(200) {Array.new(2) {rand}}
|
8
|
-
@kmeans = KMeans.new(@data, :centroids => 2)
|
8
|
+
@kmeans = KMeans.new(@data, :centroids => 2, :similarity_measure => :cosine_similarity)
|
9
9
|
end
|
10
10
|
|
11
11
|
should "return an array" do
|
@@ -4,7 +4,7 @@ class TestNode < Test::Unit::TestCase
|
|
4
4
|
context "A Data Instance" do
|
5
5
|
|
6
6
|
setup do
|
7
|
-
@node = Node.new([4, 4])
|
7
|
+
@node = Node.new([4, 4], :euclidean_distance)
|
8
8
|
end
|
9
9
|
|
10
10
|
should "return an array" do
|
@@ -13,13 +13,13 @@ class TestNode < Test::Unit::TestCase
|
|
13
13
|
|
14
14
|
should "create an array of nodes" do
|
15
15
|
data = Array.new(10) {Array.new(2) {rand}}
|
16
|
-
nodes = Node.create_nodes(data)
|
16
|
+
nodes = Node.create_nodes(data, :euclidean_distance)
|
17
17
|
assert_kind_of Array, nodes
|
18
18
|
end
|
19
19
|
|
20
20
|
should "create 10 nodes" do
|
21
21
|
data = Array.new(10) {Array.new(2) {rand}}
|
22
|
-
nodes = Node.create_nodes(data)
|
22
|
+
nodes = Node.create_nodes(data, :euclidean_distance)
|
23
23
|
assert_equal 10, nodes.size
|
24
24
|
end
|
25
25
|
|
@@ -35,9 +35,16 @@ class TestNode < Test::Unit::TestCase
|
|
35
35
|
@node.update_closest_centroid(centroids)
|
36
36
|
a = @node.closest_centroid
|
37
37
|
@node.update_closest_centroid([Centroid.new([4,4])])
|
38
|
+
|
38
39
|
assert_not_equal a, @node.closest_centroid
|
39
40
|
assert_equal 0, @node.best_distance
|
40
41
|
end
|
42
|
+
|
43
|
+
should "raise error if a false measure is specified" do
|
44
|
+
assert_raise NoMethodError do
|
45
|
+
Node.new([9,9], :fakey).calculate_distance([1,1])
|
46
|
+
end
|
47
|
+
end
|
41
48
|
|
42
49
|
end
|
43
50
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: k_means
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- reddavis
|
@@ -9,10 +9,19 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-01-25 00:00:00 +00:00
|
13
13
|
default_executable:
|
14
|
-
dependencies:
|
15
|
-
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: distance_measures
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
16
25
|
description: Attempting to create a fast, memory efficient KMeans
|
17
26
|
email: reddavis@gmail.com
|
18
27
|
executables: []
|
@@ -31,18 +40,17 @@ files:
|
|
31
40
|
- VERSION
|
32
41
|
- benchmark/benchmark_ai4r.rb
|
33
42
|
- k_means.gemspec
|
34
|
-
- lib/centroid.rb
|
35
|
-
- lib/ext/enumerable.rb
|
36
43
|
- lib/ext/object.rb
|
37
44
|
- lib/k_means.rb
|
38
|
-
- lib/node.rb
|
45
|
+
- lib/k_means/centroid.rb
|
46
|
+
- lib/k_means/k_means.rb
|
47
|
+
- lib/k_means/node.rb
|
39
48
|
- profiling/profile.rb
|
40
|
-
- test/ext/test_enumerable.rb
|
41
49
|
- test/ext/test_object.rb
|
42
50
|
- test/helper.rb
|
43
|
-
- test/test_centroid.rb
|
44
|
-
- test/test_k_means.rb
|
45
|
-
- test/test_node.rb
|
51
|
+
- test/k_means/test_centroid.rb
|
52
|
+
- test/k_means/test_k_means.rb
|
53
|
+
- test/k_means/test_node.rb
|
46
54
|
has_rdoc: true
|
47
55
|
homepage: http://github.com/reddavis/k_means
|
48
56
|
licenses: []
|
@@ -72,9 +80,8 @@ signing_key:
|
|
72
80
|
specification_version: 3
|
73
81
|
summary: K Means algorithm
|
74
82
|
test_files:
|
75
|
-
- test/ext/test_enumerable.rb
|
76
83
|
- test/ext/test_object.rb
|
77
84
|
- test/helper.rb
|
78
|
-
- test/test_centroid.rb
|
79
|
-
- test/test_k_means.rb
|
80
|
-
- test/test_node.rb
|
85
|
+
- test/k_means/test_centroid.rb
|
86
|
+
- test/k_means/test_k_means.rb
|
87
|
+
- test/k_means/test_node.rb
|
data/lib/ext/enumerable.rb
DELETED