k_means 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,18 +4,32 @@ Attempting to build a fast, memory efficient K-Means program.
4
4
 
5
5
  == Install
6
6
 
7
- gem sources -a http://gems.github.com
8
- sudo gem install reddavis-k_means
7
+ gem sources -a http://rubygems.org
8
+ sudo gem install k_means
9
9
 
10
10
  == How To Use
11
11
  require 'rubygems'
12
12
  require 'k_means'
13
-
13
+
14
14
  data = [[1,1], [1,2], [1,1], [1000, 1000], [500, 500]]
15
- kmeans = KMeans.new(@data, :centroids => 2)
15
+ kmeans = KMeans.new(data, :centroids => 2)
16
16
  kmeans.inspect # Use kmeans.view to get hold of the un-inspected array
17
17
  => [[3, 4], [0, 1, 2]]
18
-
18
+
19
+ == Custom Centroids
20
+ require 'rubygems'
21
+ require 'k_means'
22
+
23
+ # Your custom centroid needs to have #position and #reposition methods
24
+ class CustomCentroid
25
+ attr_accessor :position
26
+ def initialize(position); @position = position; end
27
+ def reposition(nodes, centroid_positions); end
28
+ end
29
+
30
+ data = [[1,1], [1,2], [1,1], [1000, 1000], [500, 500]]
31
+ kmeans = KMeans.new(data, :custom_centroids => data[0..1].map { |d| CustomCentroid.new(d) })
32
+
19
33
  == Distance Measurements
20
34
 
21
35
  KMeans uses the Distance Measures Gem (http://github.com/reddavis/Distance-Measures) so we get quite a range of distance measurements.
@@ -35,19 +49,19 @@ The measurements currently available are:
35
49
  binary_jaccard_distance
36
50
 
37
51
  tanimoto_coefficient
38
-
52
+
39
53
  To specify a particular one to use in the KMeans algorithm, just provide it as an option:
40
54
 
41
55
  KMeans.new(@data, :distance_measure => :jaccard_index)
42
56
  KMeans.new(@data, :distance_measure => :cosine_similarity)
43
57
  KMeans.new(@data, :distance_measure => :tanimoto_coefficient)
44
-
58
+
45
59
  You get the idea...
46
-
60
+
47
61
  == Benchmarks
48
62
 
49
63
  # 1000 records with 50 dimensions
50
- data = Array.new(1000) {Array.new(50) {rand(10)}}
64
+ data = Array.new(1000) {Array.new(50) {rand(10)}}
51
65
  ai4r_data = Ai4r::Data::DataSet.new(:data_items=> data)
52
66
 
53
67
  # Clustering can happen in magical ways
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.5
1
+ 0.0.7
@@ -3,7 +3,7 @@ require 'rubygems'
3
3
  require 'ai4r'
4
4
  require File.dirname(__FILE__) + '/../lib/k_means'
5
5
 
6
- data = Array.new(1000) {Array.new(50) {rand(10)}}
6
+ data = Array.new(100) {Array.new(50) {rand(10)}}
7
7
 
8
8
  ai4r_data = Ai4r::Data::DataSet.new(:data_items=> data)
9
9
 
@@ -16,7 +16,7 @@ Benchmark.bm do |x|
16
16
  x.report('Mine') do
17
17
  n.times { KMeans.new(data) }
18
18
  end
19
-
19
+
20
20
  x.report("Ai4R") do
21
21
  n.times do
22
22
  b = Ai4r::Clusterers::KMeans.new
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{k_means}
8
- s.version = "0.0.5"
8
+ s.version = "0.0.7"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["reddavis"]
12
- s.date = %q{2010-01-25}
12
+ s.date = %q{2010-07-11}
13
13
  s.description = %q{Attempting to create a fast, memory efficient KMeans}
14
14
  s.email = %q{reddavis@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -40,7 +40,7 @@ Gem::Specification.new do |s|
40
40
  s.homepage = %q{http://github.com/reddavis/k_means}
41
41
  s.rdoc_options = ["--charset=UTF-8"]
42
42
  s.require_paths = ["lib"]
43
- s.rubygems_version = %q{1.3.5}
43
+ s.rubygems_version = %q{1.3.6}
44
44
  s.summary = %q{K Means algorithm}
45
45
  s.test_files = [
46
46
  "test/ext/test_object.rb",
@@ -35,7 +35,7 @@ class Centroid
35
35
 
36
36
  # Finds the average distance of all the nodes assigned to
37
37
  # the centroid and then moves the centroid to that position
38
- def reposition(nodes)
38
+ def reposition(nodes, centroids)
39
39
  return if nodes.empty?
40
40
  averages = [0.0] * nodes[0].position.size
41
41
  nodes.each do |node|
@@ -46,4 +46,4 @@ class Centroid
46
46
  @position = averages.map {|x| x / nodes.size}
47
47
  end
48
48
 
49
- end
49
+ end
@@ -1,42 +1,41 @@
1
1
  require 'ext/object'
2
2
 
3
3
  class KMeans
4
-
4
+
5
5
  attr_reader :centroids, :nodes
6
-
6
+
7
7
  def initialize(data, options={})
8
- k = options[:centroids] || 4
9
- @verbose = options[:verbose]
10
-
11
8
  distance_measure = options[:distance_measure] || :euclidean_distance
12
9
  @nodes = Node.create_nodes(data, distance_measure)
13
- @centroids = Centroid.create_centroids(k, @nodes)
14
-
10
+ @centroids = options[:custom_centroids] ||
11
+ Centroid.create_centroids(options[:centroids] || 4, @nodes)
12
+ @verbose = options[:verbose]
13
+
15
14
  perform_cluster_process
16
15
  end
17
-
16
+
18
17
  def inspect
19
18
  @centroid_pockets.inspect
20
19
  end
21
-
20
+
22
21
  def view
23
22
  @centroid_pockets
24
23
  end
25
-
24
+
26
25
  private
27
-
26
+
28
27
  def perform_cluster_process
29
28
  iterations, updates = 0, 1
30
- while updates > 0 && iterations < 100
29
+ while updates > 0 && iterations < 100
31
30
  iterations += 1
32
31
  verbose_message("Iteration #{iterations}")
33
- updates = 0
32
+ updates = 0
34
33
  updates += update_nodes
35
34
  reposition_centroids
36
35
  end
37
36
  place_nodes_into_pockets
38
37
  end
39
-
38
+
40
39
  # This creates an array of arrays
41
40
  # Each internal array represents a centroid
42
41
  # and each element of that array is the index of a node
@@ -51,7 +50,7 @@ class KMeans
51
50
  end
52
51
  @centroid_pockets = centroid_pockets
53
52
  end
54
-
53
+
55
54
  def update_nodes
56
55
  sum = 0
57
56
  @nodes.each do |node|
@@ -59,17 +58,18 @@ class KMeans
59
58
  end
60
59
  sum
61
60
  end
62
-
61
+
63
62
  def reposition_centroids
63
+ centroid_positions = @centroids.map(&:position)
64
64
  @centroids.each do |centroid|
65
- nodes = []
65
+ nodes = []
66
66
  @nodes.each {|n| nodes << n if n.closest_centroid == centroid}
67
- centroid.reposition(nodes)
67
+ centroid.reposition(nodes, centroid_positions)
68
68
  end
69
69
  end
70
-
70
+
71
71
  def verbose_message(message)
72
72
  puts message if @verbose
73
73
  end
74
-
75
- end
74
+
75
+ end
@@ -1,5 +1,5 @@
1
1
  class Node
2
-
2
+
3
3
  class << self
4
4
  def create_nodes(data, similarity_measure)
5
5
  nodes = []
@@ -9,41 +9,59 @@ class Node
9
9
  nodes
10
10
  end
11
11
  end
12
-
12
+
13
13
  attr_accessor :position, :best_distance, :closest_centroid
14
-
14
+
15
15
  def initialize(position, similarity_measure)
16
16
  @position = position
17
17
  @similarity_measure = similarity_measure
18
18
  end
19
-
19
+
20
20
  def update_closest_centroid(centroids)
21
+ # If we haven't processed this node we need to give it an initial centroid
22
+ # so that we have something to compare distances against
21
23
  calculate_initial_centroid(centroids.first) unless @closest_centroid
24
+
22
25
  updated = false
23
26
  centroids.each do |centroid|
27
+ # Check if they are in the same position
28
+ if centroid.position == @position
29
+ updated = update_attributes(centroid, 0.0)
30
+ break
31
+ end
32
+
24
33
  distance = calculate_distance(centroid)
25
34
  if distance < best_distance
26
- updated = true
27
- @closest_centroid = centroid
28
- @best_distance = distance
35
+ updated = update_attributes(centroid, distance)
29
36
  end
30
37
  end
38
+
31
39
  updated == true ? 1 : 0
32
40
  end
33
-
41
+
42
+ def reset!
43
+ @closest_centroid = nil
44
+ @best_distance = nil
45
+ end
46
+
34
47
  private
35
-
48
+
49
+ def update_attributes(closest_centroid, best_distance)
50
+ @closest_centroid, @best_distance = closest_centroid, best_distance
51
+ true
52
+ end
53
+
36
54
  def calculate_initial_centroid(centroid)
37
55
  @closest_centroid = centroid
38
56
  @best_distance = calculate_distance(centroid)
39
57
  end
40
-
58
+
41
59
  def calculate_distance(centroid)
42
60
  begin
43
61
  @position.send(@similarity_measure, centroid.position)
44
62
  rescue NoMethodError
45
- raise "Hey, that's not a measurement. Read the REAdME for available measurements"
63
+ raise "Hey, '#{@similarity_measure}' is not a measurement. Read the REAdME for available measurements"
46
64
  end
47
65
  end
48
-
49
- end
66
+
67
+ end
@@ -31,7 +31,7 @@ class TestCentroid < Test::Unit::TestCase
31
31
  end
32
32
  end
33
33
  average_position.map! {|x| x / 2}
34
- @centroid.reposition(create_nodes)
34
+ @centroid.reposition(create_nodes, [@centroid.position])
35
35
  assert_equal average_position, @centroid.position
36
36
  end
37
37
 
@@ -2,23 +2,49 @@ require 'helper'
2
2
 
3
3
  class TestKMeans < Test::Unit::TestCase
4
4
  context "A KMeans Instance" do
5
-
5
+
6
6
  setup do
7
- @data = Array.new(200) {Array.new(2) {rand}}
7
+ @data = Array.new(3) {Array.new(2) {rand}}
8
8
  @kmeans = KMeans.new(@data, :centroids => 2, :distance_measure => :cosine_similarity)
9
9
  end
10
-
10
+
11
11
  should "return an array" do
12
12
  assert_kind_of String, @kmeans.inspect
13
13
  end
14
-
14
+
15
15
  should "have 2 centroids" do
16
16
  assert_equal 2, @kmeans.centroids.size
17
17
  end
18
-
18
+
19
19
  should "have 200 nodes" do
20
- assert_equal 200, @kmeans.nodes.size
20
+ assert_equal 3, @kmeans.nodes.size
21
+ end
22
+
23
+ end
24
+
25
+ context "A KMeans Instance with specified initial centroids" do
26
+ setup do
27
+ @data = Array.new(3) {Array.new(2) {rand}}
28
+ class CustomCentroid
29
+ attr_accessor :position
30
+ def initialize(position); @position = position; end
31
+ def reposition(nodes, centroid_positions); end
32
+ end
33
+
34
+ @specified_centroids = @data[0..2].map { |d| CustomCentroid.new(d) }
35
+ @kmeans = KMeans.new(@data, :custom_centroids => @specified_centroids, :distance_measure => :cosine_similarity)
36
+ end
37
+
38
+ should "return an inspected array" do
39
+ assert_kind_of String, @kmeans.inspect
40
+ end
41
+
42
+ should "have 3 centroids" do
43
+ assert_equal 3, @kmeans.centroids.size
44
+ end
45
+
46
+ should "have 3 nodes" do
47
+ assert_equal 3, @kmeans.nodes.size
21
48
  end
22
-
23
49
  end
24
50
  end
@@ -2,49 +2,63 @@ require 'helper'
2
2
 
3
3
  class TestNode < Test::Unit::TestCase
4
4
  context "A Data Instance" do
5
-
5
+
6
6
  setup do
7
7
  @node = Node.new([4, 4], :euclidean_distance)
8
8
  end
9
-
9
+
10
10
  should "return an array" do
11
11
  assert_kind_of Array, @node.position
12
12
  end
13
-
13
+
14
14
  should "create an array of nodes" do
15
15
  data = Array.new(10) {Array.new(2) {rand}}
16
16
  nodes = Node.create_nodes(data, :euclidean_distance)
17
17
  assert_kind_of Array, nodes
18
18
  end
19
-
19
+
20
20
  should "create 10 nodes" do
21
21
  data = Array.new(10) {Array.new(2) {rand}}
22
22
  nodes = Node.create_nodes(data, :euclidean_distance)
23
23
  assert_equal 10, nodes.size
24
24
  end
25
-
25
+
26
26
  should "initialize closest centroid" do
27
27
  a = @node.closest_centroid
28
28
  centroids = [Centroid.new([4, 4]), Centroid.new([5, 4])]
29
29
  @node.update_closest_centroid(centroids)
30
30
  assert_not_equal nil, @node.closest_centroid
31
31
  end
32
-
32
+
33
33
  should "update closest centroid" do
34
34
  centroids = [Centroid.new([5, 4])]
35
35
  @node.update_closest_centroid(centroids)
36
36
  a = @node.closest_centroid
37
37
  @node.update_closest_centroid([Centroid.new([4,4])])
38
-
38
+
39
39
  assert_not_equal a, @node.closest_centroid
40
40
  assert_equal 0, @node.best_distance
41
41
  end
42
-
42
+
43
+ context "Reset the node" do
44
+ should "update closest centroid" do
45
+ centroids = [Centroid.new([4, 4])]
46
+ @node.update_closest_centroid(centroids)
47
+ a = @node.closest_centroid
48
+ @node.reset!
49
+
50
+ @node.update_closest_centroid([Centroid.new([5,4])])
51
+
52
+ assert_not_equal a, @node.closest_centroid
53
+ assert_equal 1, @node.best_distance
54
+ end
55
+ end
56
+
43
57
  should "raise error if a false measure is specified" do
44
58
  assert_raise NoMethodError do
45
59
  Node.new([9,9], :fakey).calculate_distance([1,1])
46
60
  end
47
61
  end
48
-
62
+
49
63
  end
50
- end
64
+ end
metadata CHANGED
@@ -1,7 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: k_means
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 7
9
+ version: 0.0.7
5
10
  platform: ruby
6
11
  authors:
7
12
  - reddavis
@@ -9,19 +14,23 @@ autorequire:
9
14
  bindir: bin
10
15
  cert_chain: []
11
16
 
12
- date: 2010-01-25 00:00:00 +00:00
17
+ date: 2010-07-11 00:00:00 +01:00
13
18
  default_executable:
14
19
  dependencies:
15
20
  - !ruby/object:Gem::Dependency
16
21
  name: distance_measures
17
- type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
20
24
  requirements:
21
25
  - - ">="
22
26
  - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ - 0
30
+ - 0
23
31
  version: 0.0.0
24
- version:
32
+ type: :runtime
33
+ version_requirements: *id001
25
34
  description: Attempting to create a fast, memory efficient KMeans
26
35
  email: reddavis@gmail.com
27
36
  executables: []
@@ -64,18 +73,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
64
73
  requirements:
65
74
  - - ">="
66
75
  - !ruby/object:Gem::Version
76
+ segments:
77
+ - 0
67
78
  version: "0"
68
- version:
69
79
  required_rubygems_version: !ruby/object:Gem::Requirement
70
80
  requirements:
71
81
  - - ">="
72
82
  - !ruby/object:Gem::Version
83
+ segments:
84
+ - 0
73
85
  version: "0"
74
- version:
75
86
  requirements: []
76
87
 
77
88
  rubyforge_project:
78
- rubygems_version: 1.3.5
89
+ rubygems_version: 1.3.6
79
90
  signing_key:
80
91
  specification_version: 3
81
92
  summary: K Means algorithm