k_means 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -16,6 +16,34 @@ Attempting to build a fast, memory efficient K-Means program.
16
16
  kmeans.inspect # Use kmeans.view to get hold of the un-inspected array
17
17
  => [[3, 4], [0, 1, 2]]
18
18
 
19
+ == Distance Measurements
20
+
21
+ KMeans uses the Distance Measures gem (http://github.com/reddavis/Distance-Measures), which gives us a range of distance and similarity measurements to choose from.
22
+
23
+ The measurements currently available are:
24
+
25
+ euclidean_distance
26
+
27
+ cosine_similarity
28
+
29
+ jaccard_index
30
+
31
+ jaccard_distance
32
+
33
+ binary_jaccard_index
34
+
35
+ binary_jaccard_distance
36
+
37
+ tanimoto_coefficient
38
+
39
+ To choose which measurement the KMeans algorithm uses, pass it as the :similarity_measure option:
40
+
41
+ KMeans.new(@data, :similarity_measure => :jaccard_index)
42
+ KMeans.new(@data, :similarity_measure => :cosine_similarity)
43
+ KMeans.new(@data, :similarity_measure => :tanimoto_coefficient)
44
+
45
+ You get the idea...
46
+
19
47
  == Benchmarks
20
48
 
21
49
  # 1000 records with 50 dimensions
data/Rakefile CHANGED
@@ -10,6 +10,7 @@ begin
10
10
  gem.email = "reddavis@gmail.com"
11
11
  gem.homepage = "http://github.com/reddavis/k_means"
12
12
  gem.authors = ["reddavis"]
13
+ gem.add_dependency('distance_measures', '>= 0.0.0')
13
14
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
14
15
  end
15
16
  Jeweler::GemcutterTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.3
1
+ 0.0.4
@@ -3,13 +3,13 @@ require 'rubygems'
3
3
  require 'ai4r'
4
4
  require File.dirname(__FILE__) + '/../lib/k_means'
5
5
 
6
- data = Array.new(500) {Array.new(50) {rand(10)}}
6
+ data = Array.new(1000) {Array.new(50) {rand(10)}}
7
7
 
8
8
  ai4r_data = Ai4r::Data::DataSet.new(:data_items=> data)
9
9
 
10
10
  # Clustering can happen in magical ways
11
11
  # so let's run it multiple times
12
- n = 20
12
+ n = 5
13
13
 
14
14
 
15
15
  Benchmark.bm do |x|
data/k_means.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{k_means}
8
- s.version = "0.0.3"
8
+ s.version = "0.0.4"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["reddavis"]
12
- s.date = %q{2009-11-25}
12
+ s.date = %q{2010-01-25}
13
13
  s.description = %q{Attempting to create a fast, memory efficient KMeans}
14
14
  s.email = %q{reddavis@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -25,18 +25,17 @@ Gem::Specification.new do |s|
25
25
  "VERSION",
26
26
  "benchmark/benchmark_ai4r.rb",
27
27
  "k_means.gemspec",
28
- "lib/centroid.rb",
29
- "lib/ext/enumerable.rb",
30
28
  "lib/ext/object.rb",
31
29
  "lib/k_means.rb",
32
- "lib/node.rb",
30
+ "lib/k_means/centroid.rb",
31
+ "lib/k_means/k_means.rb",
32
+ "lib/k_means/node.rb",
33
33
  "profiling/profile.rb",
34
- "test/ext/test_enumerable.rb",
35
34
  "test/ext/test_object.rb",
36
35
  "test/helper.rb",
37
- "test/test_centroid.rb",
38
- "test/test_k_means.rb",
39
- "test/test_node.rb"
36
+ "test/k_means/test_centroid.rb",
37
+ "test/k_means/test_k_means.rb",
38
+ "test/k_means/test_node.rb"
40
39
  ]
41
40
  s.homepage = %q{http://github.com/reddavis/k_means}
42
41
  s.rdoc_options = ["--charset=UTF-8"]
@@ -44,12 +43,11 @@ Gem::Specification.new do |s|
44
43
  s.rubygems_version = %q{1.3.5}
45
44
  s.summary = %q{K Means algorithm}
46
45
  s.test_files = [
47
- "test/ext/test_enumerable.rb",
48
- "test/ext/test_object.rb",
46
+ "test/ext/test_object.rb",
49
47
  "test/helper.rb",
50
- "test/test_centroid.rb",
51
- "test/test_k_means.rb",
52
- "test/test_node.rb"
48
+ "test/k_means/test_centroid.rb",
49
+ "test/k_means/test_k_means.rb",
50
+ "test/k_means/test_node.rb"
53
51
  ]
54
52
 
55
53
  if s.respond_to? :specification_version then
@@ -57,9 +55,12 @@ Gem::Specification.new do |s|
57
55
  s.specification_version = 3
58
56
 
59
57
  if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
58
+ s.add_runtime_dependency(%q<distance_measures>, [">= 0.0.0"])
60
59
  else
60
+ s.add_dependency(%q<distance_measures>, [">= 0.0.0"])
61
61
  end
62
62
  else
63
+ s.add_dependency(%q<distance_measures>, [">= 0.0.0"])
63
64
  end
64
65
  end
65
66
 
File without changes
@@ -0,0 +1,75 @@
1
+ require 'ext/object'
2
+
3
+ class KMeans
4
+
5
+ attr_reader :centroids, :nodes
6
+
7
+ def initialize(data, options={})
8
+ k = options[:centroids] || 4
9
+ @verbose = options[:verbose]
10
+
11
+ similarity_measure = options[:similarity_measure] || :euclidean_distance
12
+ @nodes = Node.create_nodes(data, similarity_measure)
13
+ @centroids = Centroid.create_centroids(k, @nodes)
14
+
15
+ perform_cluster_process
16
+ end
17
+
18
+ def inspect
19
+ @centroid_pockets.inspect
20
+ end
21
+
22
+ def view
23
+ @centroid_pockets
24
+ end
25
+
26
+ private
27
+
28
+ def perform_cluster_process
29
+ iterations, updates = 0, 1
30
+ while updates > 0 && iterations < 100
31
+ iterations += 1
32
+ verbose_message("Iteration #{iterations}")
33
+ updates = 0
34
+ updates += update_nodes
35
+ reposition_centroids
36
+ end
37
+ place_nodes_into_pockets
38
+ end
39
+
40
+ # This creates an array of arrays
41
+ # Each internal array represents a centroid
42
+ # and each element of an internal array is the index of a node
43
+ def place_nodes_into_pockets
44
+ centroid_pockets = Array.new(@centroids.size) {[]}
45
+ @centroids.each_with_index do |centroid, centroid_index|
46
+ @nodes.each_with_index do |node, node_index|
47
+ if node.closest_centroid == centroid
48
+ centroid_pockets[centroid_index] << node_index
49
+ end
50
+ end
51
+ end
52
+ @centroid_pockets = centroid_pockets
53
+ end
54
+
55
+ def update_nodes
56
+ sum = 0
57
+ @nodes.each do |node|
58
+ sum += node.update_closest_centroid(@centroids)
59
+ end
60
+ sum
61
+ end
62
+
63
+ def reposition_centroids
64
+ @centroids.each do |centroid|
65
+ nodes = []
66
+ @nodes.each {|n| nodes << n if n.closest_centroid == centroid}
67
+ centroid.reposition(nodes)
68
+ end
69
+ end
70
+
71
+ def verbose_message(message)
72
+ puts message if @verbose
73
+ end
74
+
75
+ end
@@ -1,10 +1,10 @@
1
1
  class Node
2
2
 
3
3
  class << self
4
- def create_nodes(data)
4
+ def create_nodes(data, similarity_measure)
5
5
  nodes = []
6
6
  data.each do |position|
7
- nodes << new(position)
7
+ nodes << new(position, similarity_measure)
8
8
  end
9
9
  nodes
10
10
  end
@@ -12,8 +12,9 @@ class Node
12
12
 
13
13
  attr_accessor :position, :best_distance, :closest_centroid
14
14
 
15
- def initialize(position)
15
+ def initialize(position, similarity_measure)
16
16
  @position = position
17
+ @similarity_measure = similarity_measure
17
18
  end
18
19
 
19
20
  def update_closest_centroid(centroids)
@@ -38,7 +39,11 @@ class Node
38
39
  end
39
40
 
40
41
  def calculate_distance(centroid)
41
- @position.euclidean_distance(centroid.position)
42
+ begin
43
+ @position.send(@similarity_measure, centroid.position)
44
+ rescue NoMethodError
45
+ raise "Hey, that's not a measurement. Read the REAdME for available measurements"
46
+ end
42
47
  end
43
48
 
44
49
  end
data/lib/k_means.rb CHANGED
@@ -1,78 +1,8 @@
1
- $: << File.dirname(__FILE__)
2
- require 'centroid'
3
- require 'node'
4
- require 'ext/enumerable'
5
- require 'ext/object'
1
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
6
2
 
7
- class KMeans
8
-
9
- attr_reader :centroids, :nodes
10
-
11
- def initialize(data, options={})
12
- k = options[:centroids] || 4
13
- @verbose = options[:verbose] == true ? true : nil
14
-
15
- @nodes = Node.create_nodes(data)
16
- @centroids = Centroid.create_centroids(k, @nodes)
17
-
18
- perform_cluster_process
19
- end
20
-
21
- def inspect
22
- @centroid_pockets.inspect
23
- end
24
-
25
- def view
26
- @centroid_pockets
27
- end
28
-
29
- private
30
-
31
- def perform_cluster_process
32
- iterations, updates = 0, 1
33
- while updates > 0 && iterations < 100
34
- iterations += 1
35
- verbose_message("Iteration #{iterations}")
36
- updates = 0
37
- updates += update_nodes
38
- reposition_centroids
39
- end
40
- place_nodes_into_pockets
41
- end
42
-
43
- # This creates an array of arrays
44
- # Each internal array represents a centroid
45
- # and each in the array represents the nodes index
46
- def place_nodes_into_pockets
47
- centroid_pockets = Array.new(@centroids.size) {[]}
48
- @centroids.each_with_index do |centroid, centroid_index|
49
- @nodes.each_with_index do |node, node_index|
50
- if node.closest_centroid == centroid
51
- centroid_pockets[centroid_index] << node_index
52
- end
53
- end
54
- end
55
- @centroid_pockets = centroid_pockets
56
- end
57
-
58
- def update_nodes
59
- sum = 0
60
- @nodes.each do |node|
61
- sum += node.update_closest_centroid(@centroids)
62
- end
63
- sum
64
- end
65
-
66
- def reposition_centroids
67
- @centroids.each do |centroid|
68
- nodes = []
69
- @nodes.each {|n| nodes << n if n.closest_centroid == centroid}
70
- centroid.reposition(nodes)
71
- end
72
- end
73
-
74
- def verbose_message(message)
75
- puts message if @verbose
76
- end
77
-
78
- end
3
+ require 'k_means/k_means'
4
+ require 'k_means/centroid'
5
+ require 'k_means/node'
6
+
7
+ # Gems
8
+ require 'distance_measures'
@@ -40,6 +40,6 @@ class TestCentroid < Test::Unit::TestCase
40
40
  private
41
41
 
42
42
  def create_nodes
43
- Node.create_nodes([[1,2,3], [4,5,6]])
43
+ Node.create_nodes([[1,2,3], [4,5,6]], :euclidean)
44
44
  end
45
45
  end
@@ -5,7 +5,7 @@ class TestKMeans < Test::Unit::TestCase
5
5
 
6
6
  setup do
7
7
  @data = Array.new(200) {Array.new(2) {rand}}
8
- @kmeans = KMeans.new(@data, :centroids => 2)
8
+ @kmeans = KMeans.new(@data, :centroids => 2, :similarity_measure => :cosine_similarity)
9
9
  end
10
10
 
11
11
  should "return an array" do
@@ -4,7 +4,7 @@ class TestNode < Test::Unit::TestCase
4
4
  context "A Data Instance" do
5
5
 
6
6
  setup do
7
- @node = Node.new([4, 4])
7
+ @node = Node.new([4, 4], :euclidean_distance)
8
8
  end
9
9
 
10
10
  should "return an array" do
@@ -13,13 +13,13 @@ class TestNode < Test::Unit::TestCase
13
13
 
14
14
  should "create an array of nodes" do
15
15
  data = Array.new(10) {Array.new(2) {rand}}
16
- nodes = Node.create_nodes(data)
16
+ nodes = Node.create_nodes(data, :euclidean_distance)
17
17
  assert_kind_of Array, nodes
18
18
  end
19
19
 
20
20
  should "create 10 nodes" do
21
21
  data = Array.new(10) {Array.new(2) {rand}}
22
- nodes = Node.create_nodes(data)
22
+ nodes = Node.create_nodes(data, :euclidean_distance)
23
23
  assert_equal 10, nodes.size
24
24
  end
25
25
 
@@ -35,9 +35,16 @@ class TestNode < Test::Unit::TestCase
35
35
  @node.update_closest_centroid(centroids)
36
36
  a = @node.closest_centroid
37
37
  @node.update_closest_centroid([Centroid.new([4,4])])
38
+
38
39
  assert_not_equal a, @node.closest_centroid
39
40
  assert_equal 0, @node.best_distance
40
41
  end
42
+
43
+ should "raise error if a false measure is specified" do
44
+ assert_raise NoMethodError do
45
+ Node.new([9,9], :fakey).calculate_distance([1,1])
46
+ end
47
+ end
41
48
 
42
49
  end
43
50
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: k_means
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - reddavis
@@ -9,10 +9,19 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-11-25 00:00:00 +00:00
12
+ date: 2010-01-25 00:00:00 +00:00
13
13
  default_executable:
14
- dependencies: []
15
-
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: distance_measures
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
16
25
  description: Attempting to create a fast, memory efficient KMeans
17
26
  email: reddavis@gmail.com
18
27
  executables: []
@@ -31,18 +40,17 @@ files:
31
40
  - VERSION
32
41
  - benchmark/benchmark_ai4r.rb
33
42
  - k_means.gemspec
34
- - lib/centroid.rb
35
- - lib/ext/enumerable.rb
36
43
  - lib/ext/object.rb
37
44
  - lib/k_means.rb
38
- - lib/node.rb
45
+ - lib/k_means/centroid.rb
46
+ - lib/k_means/k_means.rb
47
+ - lib/k_means/node.rb
39
48
  - profiling/profile.rb
40
- - test/ext/test_enumerable.rb
41
49
  - test/ext/test_object.rb
42
50
  - test/helper.rb
43
- - test/test_centroid.rb
44
- - test/test_k_means.rb
45
- - test/test_node.rb
51
+ - test/k_means/test_centroid.rb
52
+ - test/k_means/test_k_means.rb
53
+ - test/k_means/test_node.rb
46
54
  has_rdoc: true
47
55
  homepage: http://github.com/reddavis/k_means
48
56
  licenses: []
@@ -72,9 +80,8 @@ signing_key:
72
80
  specification_version: 3
73
81
  summary: K Means algorithm
74
82
  test_files:
75
- - test/ext/test_enumerable.rb
76
83
  - test/ext/test_object.rb
77
84
  - test/helper.rb
78
- - test/test_centroid.rb
79
- - test/test_k_means.rb
80
- - test/test_node.rb
85
+ - test/k_means/test_centroid.rb
86
+ - test/k_means/test_k_means.rb
87
+ - test/k_means/test_node.rb
@@ -1,10 +0,0 @@
1
- module Enumerable
2
- def euclidean_distance(other)
3
- sum = 0.0
4
- self.each_index do |i|
5
- sum += (self[i] - other[i])**2
6
- end
7
- Math.sqrt(sum)
8
- end
9
- end
10
-
@@ -1,11 +0,0 @@
1
- require 'helper'
2
-
3
- class TestEnumerable < Test::Unit::TestCase
4
- context "Euclidean Distance" do
5
-
6
- should "return 5" do
7
- assert_equal 5, [10].euclidean_distance([5])
8
- end
9
-
10
- end
11
- end