k_means 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -16,6 +16,34 @@ Attempting to build a fast, memory efficient K-Means program.
16
16
  kmeans.inspect # Use kmeans.view to get hold of the un-inspected array
17
17
  => [[3, 4], [0, 1, 2]]
18
18
 
19
+ == Distance Measurements
20
+
21
+ KMeans uses the Distance Measures gem (http://github.com/reddavis/Distance-Measures), which provides a range of distance and similarity measurements.
22
+
23
+ The measurements currently available are:
24
+
25
+ euclidean_distance
26
+
27
+ cosine_similarity
28
+
29
+ jaccard_index
30
+
31
+ jaccard_distance
32
+
33
+ binary_jaccard_index
34
+
35
+ binary_jaccard_distance
36
+
37
+ tanimoto_coefficient
38
+
39
+ To specify a particular one to use in the KMeans algorithm, just provide it as an option:
40
+
41
+ KMeans.new(@data, :similarity_measure => :jaccard_index)
42
+ KMeans.new(@data, :similarity_measure => :cosine_similarity)
43
+ KMeans.new(@data, :similarity_measure => :tanimoto_coefficient)
44
+
45
+ You get the idea...
46
+
19
47
  == Benchmarks
20
48
 
21
49
  # 1000 records with 50 dimensions
data/Rakefile CHANGED
@@ -10,6 +10,7 @@ begin
10
10
  gem.email = "reddavis@gmail.com"
11
11
  gem.homepage = "http://github.com/reddavis/k_means"
12
12
  gem.authors = ["reddavis"]
13
+ gem.add_dependency('distance_measures', '>= 0.0.0')
13
14
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
14
15
  end
15
16
  Jeweler::GemcutterTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.3
1
+ 0.0.4
@@ -3,13 +3,13 @@ require 'rubygems'
3
3
  require 'ai4r'
4
4
  require File.dirname(__FILE__) + '/../lib/k_means'
5
5
 
6
- data = Array.new(500) {Array.new(50) {rand(10)}}
6
+ data = Array.new(1000) {Array.new(50) {rand(10)}}
7
7
 
8
8
  ai4r_data = Ai4r::Data::DataSet.new(:data_items=> data)
9
9
 
10
10
  # Clustering can happen in magical ways
11
11
  # so let's run it multiple times
12
- n = 20
12
+ n = 5
13
13
 
14
14
 
15
15
  Benchmark.bm do |x|
data/k_means.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{k_means}
8
- s.version = "0.0.3"
8
+ s.version = "0.0.4"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["reddavis"]
12
- s.date = %q{2009-11-25}
12
+ s.date = %q{2010-01-25}
13
13
  s.description = %q{Attempting to create a fast, memory efficient KMeans}
14
14
  s.email = %q{reddavis@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -25,18 +25,17 @@ Gem::Specification.new do |s|
25
25
  "VERSION",
26
26
  "benchmark/benchmark_ai4r.rb",
27
27
  "k_means.gemspec",
28
- "lib/centroid.rb",
29
- "lib/ext/enumerable.rb",
30
28
  "lib/ext/object.rb",
31
29
  "lib/k_means.rb",
32
- "lib/node.rb",
30
+ "lib/k_means/centroid.rb",
31
+ "lib/k_means/k_means.rb",
32
+ "lib/k_means/node.rb",
33
33
  "profiling/profile.rb",
34
- "test/ext/test_enumerable.rb",
35
34
  "test/ext/test_object.rb",
36
35
  "test/helper.rb",
37
- "test/test_centroid.rb",
38
- "test/test_k_means.rb",
39
- "test/test_node.rb"
36
+ "test/k_means/test_centroid.rb",
37
+ "test/k_means/test_k_means.rb",
38
+ "test/k_means/test_node.rb"
40
39
  ]
41
40
  s.homepage = %q{http://github.com/reddavis/k_means}
42
41
  s.rdoc_options = ["--charset=UTF-8"]
@@ -44,12 +43,11 @@ Gem::Specification.new do |s|
44
43
  s.rubygems_version = %q{1.3.5}
45
44
  s.summary = %q{K Means algorithm}
46
45
  s.test_files = [
47
- "test/ext/test_enumerable.rb",
48
- "test/ext/test_object.rb",
46
+ "test/ext/test_object.rb",
49
47
  "test/helper.rb",
50
- "test/test_centroid.rb",
51
- "test/test_k_means.rb",
52
- "test/test_node.rb"
48
+ "test/k_means/test_centroid.rb",
49
+ "test/k_means/test_k_means.rb",
50
+ "test/k_means/test_node.rb"
53
51
  ]
54
52
 
55
53
  if s.respond_to? :specification_version then
@@ -57,9 +55,12 @@ Gem::Specification.new do |s|
57
55
  s.specification_version = 3
58
56
 
59
57
  if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
58
+ s.add_runtime_dependency(%q<distance_measures>, [">= 0.0.0"])
60
59
  else
60
+ s.add_dependency(%q<distance_measures>, [">= 0.0.0"])
61
61
  end
62
62
  else
63
+ s.add_dependency(%q<distance_measures>, [">= 0.0.0"])
63
64
  end
64
65
  end
65
66
 
File without changes
@@ -0,0 +1,75 @@
1
+ require 'ext/object'
2
+
3
+ class KMeans
4
+
5
+ attr_reader :centroids, :nodes
6
+
7
+ def initialize(data, options={})
8
+ k = options[:centroids] || 4
9
+ @verbose = options[:verbose]
10
+
11
+ similarity_measure = options[:similarity_measure] || :euclidean_distance
12
+ @nodes = Node.create_nodes(data, similarity_measure)
13
+ @centroids = Centroid.create_centroids(k, @nodes)
14
+
15
+ perform_cluster_process
16
+ end
17
+
18
+ def inspect
19
+ @centroid_pockets.inspect
20
+ end
21
+
22
+ def view
23
+ @centroid_pockets
24
+ end
25
+
26
+ private
27
+
28
+ def perform_cluster_process
29
+ iterations, updates = 0, 1
30
+ while updates > 0 && iterations < 100
31
+ iterations += 1
32
+ verbose_message("Iteration #{iterations}")
33
+ updates = 0
34
+ updates += update_nodes
35
+ reposition_centroids
36
+ end
37
+ place_nodes_into_pockets
38
+ end
39
+
40
+ # This creates an array of arrays
41
+ # Each internal array represents a centroid
42
+ # and each element in that array is the index of a node
43
+ def place_nodes_into_pockets
44
+ centroid_pockets = Array.new(@centroids.size) {[]}
45
+ @centroids.each_with_index do |centroid, centroid_index|
46
+ @nodes.each_with_index do |node, node_index|
47
+ if node.closest_centroid == centroid
48
+ centroid_pockets[centroid_index] << node_index
49
+ end
50
+ end
51
+ end
52
+ @centroid_pockets = centroid_pockets
53
+ end
54
+
55
+ def update_nodes
56
+ sum = 0
57
+ @nodes.each do |node|
58
+ sum += node.update_closest_centroid(@centroids)
59
+ end
60
+ sum
61
+ end
62
+
63
+ def reposition_centroids
64
+ @centroids.each do |centroid|
65
+ nodes = []
66
+ @nodes.each {|n| nodes << n if n.closest_centroid == centroid}
67
+ centroid.reposition(nodes)
68
+ end
69
+ end
70
+
71
+ def verbose_message(message)
72
+ puts message if @verbose
73
+ end
74
+
75
+ end
@@ -1,10 +1,10 @@
1
1
  class Node
2
2
 
3
3
  class << self
4
- def create_nodes(data)
4
+ def create_nodes(data, similarity_measure)
5
5
  nodes = []
6
6
  data.each do |position|
7
- nodes << new(position)
7
+ nodes << new(position, similarity_measure)
8
8
  end
9
9
  nodes
10
10
  end
@@ -12,8 +12,9 @@ class Node
12
12
 
13
13
  attr_accessor :position, :best_distance, :closest_centroid
14
14
 
15
- def initialize(position)
15
+ def initialize(position, similarity_measure)
16
16
  @position = position
17
+ @similarity_measure = similarity_measure
17
18
  end
18
19
 
19
20
  def update_closest_centroid(centroids)
@@ -38,7 +39,11 @@ class Node
38
39
  end
39
40
 
40
41
  def calculate_distance(centroid)
41
- @position.euclidean_distance(centroid.position)
42
+ begin
43
+ @position.send(@similarity_measure, centroid.position)
44
+ rescue NoMethodError
45
+ raise "Hey, that's not a measurement. Read the REAdME for available measurements"
46
+ end
42
47
  end
43
48
 
44
49
  end
data/lib/k_means.rb CHANGED
@@ -1,78 +1,8 @@
1
- $: << File.dirname(__FILE__)
2
- require 'centroid'
3
- require 'node'
4
- require 'ext/enumerable'
5
- require 'ext/object'
1
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
6
2
 
7
- class KMeans
8
-
9
- attr_reader :centroids, :nodes
10
-
11
- def initialize(data, options={})
12
- k = options[:centroids] || 4
13
- @verbose = options[:verbose] == true ? true : nil
14
-
15
- @nodes = Node.create_nodes(data)
16
- @centroids = Centroid.create_centroids(k, @nodes)
17
-
18
- perform_cluster_process
19
- end
20
-
21
- def inspect
22
- @centroid_pockets.inspect
23
- end
24
-
25
- def view
26
- @centroid_pockets
27
- end
28
-
29
- private
30
-
31
- def perform_cluster_process
32
- iterations, updates = 0, 1
33
- while updates > 0 && iterations < 100
34
- iterations += 1
35
- verbose_message("Iteration #{iterations}")
36
- updates = 0
37
- updates += update_nodes
38
- reposition_centroids
39
- end
40
- place_nodes_into_pockets
41
- end
42
-
43
- # This creates an array of arrays
44
- # Each internal array represents a centroid
45
- # and each in the array represents the nodes index
46
- def place_nodes_into_pockets
47
- centroid_pockets = Array.new(@centroids.size) {[]}
48
- @centroids.each_with_index do |centroid, centroid_index|
49
- @nodes.each_with_index do |node, node_index|
50
- if node.closest_centroid == centroid
51
- centroid_pockets[centroid_index] << node_index
52
- end
53
- end
54
- end
55
- @centroid_pockets = centroid_pockets
56
- end
57
-
58
- def update_nodes
59
- sum = 0
60
- @nodes.each do |node|
61
- sum += node.update_closest_centroid(@centroids)
62
- end
63
- sum
64
- end
65
-
66
- def reposition_centroids
67
- @centroids.each do |centroid|
68
- nodes = []
69
- @nodes.each {|n| nodes << n if n.closest_centroid == centroid}
70
- centroid.reposition(nodes)
71
- end
72
- end
73
-
74
- def verbose_message(message)
75
- puts message if @verbose
76
- end
77
-
78
- end
3
+ require 'k_means/k_means'
4
+ require 'k_means/centroid'
5
+ require 'k_means/node'
6
+
7
+ # Gems
8
+ require 'distance_measures'
@@ -40,6 +40,6 @@ class TestCentroid < Test::Unit::TestCase
40
40
  private
41
41
 
42
42
  def create_nodes
43
- Node.create_nodes([[1,2,3], [4,5,6]])
43
+ Node.create_nodes([[1,2,3], [4,5,6]], :euclidean)
44
44
  end
45
45
  end
@@ -5,7 +5,7 @@ class TestKMeans < Test::Unit::TestCase
5
5
 
6
6
  setup do
7
7
  @data = Array.new(200) {Array.new(2) {rand}}
8
- @kmeans = KMeans.new(@data, :centroids => 2)
8
+ @kmeans = KMeans.new(@data, :centroids => 2, :similarity_measure => :cosine_similarity)
9
9
  end
10
10
 
11
11
  should "return an array" do
@@ -4,7 +4,7 @@ class TestNode < Test::Unit::TestCase
4
4
  context "A Data Instance" do
5
5
 
6
6
  setup do
7
- @node = Node.new([4, 4])
7
+ @node = Node.new([4, 4], :euclidean_distance)
8
8
  end
9
9
 
10
10
  should "return an array" do
@@ -13,13 +13,13 @@ class TestNode < Test::Unit::TestCase
13
13
 
14
14
  should "create an array of nodes" do
15
15
  data = Array.new(10) {Array.new(2) {rand}}
16
- nodes = Node.create_nodes(data)
16
+ nodes = Node.create_nodes(data, :euclidean_distance)
17
17
  assert_kind_of Array, nodes
18
18
  end
19
19
 
20
20
  should "create 10 nodes" do
21
21
  data = Array.new(10) {Array.new(2) {rand}}
22
- nodes = Node.create_nodes(data)
22
+ nodes = Node.create_nodes(data, :euclidean_distance)
23
23
  assert_equal 10, nodes.size
24
24
  end
25
25
 
@@ -35,9 +35,16 @@ class TestNode < Test::Unit::TestCase
35
35
  @node.update_closest_centroid(centroids)
36
36
  a = @node.closest_centroid
37
37
  @node.update_closest_centroid([Centroid.new([4,4])])
38
+
38
39
  assert_not_equal a, @node.closest_centroid
39
40
  assert_equal 0, @node.best_distance
40
41
  end
42
+
43
+ should "raise error if a false measure is specified" do
44
+ assert_raise NoMethodError do
45
+ Node.new([9,9], :fakey).calculate_distance([1,1])
46
+ end
47
+ end
41
48
 
42
49
  end
43
50
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: k_means
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - reddavis
@@ -9,10 +9,19 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-11-25 00:00:00 +00:00
12
+ date: 2010-01-25 00:00:00 +00:00
13
13
  default_executable:
14
- dependencies: []
15
-
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: distance_measures
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
16
25
  description: Attempting to create a fast, memory efficient KMeans
17
26
  email: reddavis@gmail.com
18
27
  executables: []
@@ -31,18 +40,17 @@ files:
31
40
  - VERSION
32
41
  - benchmark/benchmark_ai4r.rb
33
42
  - k_means.gemspec
34
- - lib/centroid.rb
35
- - lib/ext/enumerable.rb
36
43
  - lib/ext/object.rb
37
44
  - lib/k_means.rb
38
- - lib/node.rb
45
+ - lib/k_means/centroid.rb
46
+ - lib/k_means/k_means.rb
47
+ - lib/k_means/node.rb
39
48
  - profiling/profile.rb
40
- - test/ext/test_enumerable.rb
41
49
  - test/ext/test_object.rb
42
50
  - test/helper.rb
43
- - test/test_centroid.rb
44
- - test/test_k_means.rb
45
- - test/test_node.rb
51
+ - test/k_means/test_centroid.rb
52
+ - test/k_means/test_k_means.rb
53
+ - test/k_means/test_node.rb
46
54
  has_rdoc: true
47
55
  homepage: http://github.com/reddavis/k_means
48
56
  licenses: []
@@ -72,9 +80,8 @@ signing_key:
72
80
  specification_version: 3
73
81
  summary: K Means algorithm
74
82
  test_files:
75
- - test/ext/test_enumerable.rb
76
83
  - test/ext/test_object.rb
77
84
  - test/helper.rb
78
- - test/test_centroid.rb
79
- - test/test_k_means.rb
80
- - test/test_node.rb
85
+ - test/k_means/test_centroid.rb
86
+ - test/k_means/test_k_means.rb
87
+ - test/k_means/test_node.rb
@@ -1,10 +0,0 @@
1
- module Enumerable
2
- def euclidean_distance(other)
3
- sum = 0.0
4
- self.each_index do |i|
5
- sum += (self[i] - other[i])**2
6
- end
7
- Math.sqrt(sum)
8
- end
9
- end
10
-
@@ -1,11 +0,0 @@
1
- require 'helper'
2
-
3
- class TestEnumerable < Test::Unit::TestCase
4
- context "Euclidean Distance" do
5
-
6
- should "return 5" do
7
- assert_equal 5, [10].euclidean_distance([5])
8
- end
9
-
10
- end
11
- end