k_means 0.0.5 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -4,18 +4,32 @@ Attempting to build a fast, memory efficient K-Means program.
4
4
 
5
5
  == Install
6
6
 
7
- gem sources -a http://gems.github.com
8
- sudo gem install reddavis-k_means
7
+ gem sources -a http://rubygems.org
8
+ sudo gem install k_means
9
9
 
10
10
  == How To Use
11
11
  require 'rubygems'
12
12
  require 'k_means'
13
-
13
+
14
14
  data = [[1,1], [1,2], [1,1], [1000, 1000], [500, 500]]
15
- kmeans = KMeans.new(@data, :centroids => 2)
15
+ kmeans = KMeans.new(data, :centroids => 2)
16
16
  kmeans.inspect # Use kmeans.view to get hold of the un-inspected array
17
17
  => [[3, 4], [0, 1, 2]]
18
-
18
+
19
+ == Custom Centroids
20
+ require 'rubygems'
21
+ require 'k_means'
22
+
23
+ # Your custom centroid needs to have #position and #reposition methods
24
+ class CustomCentroid
25
+ attr_accessor :position
26
+ def initialize(position); @position = position; end
27
+ def reposition(nodes, centroid_positions); end
28
+ end
29
+
30
+ data = [[1,1], [1,2], [1,1], [1000, 1000], [500, 500]]
31
+ kmeans = KMeans.new(data, :custom_centroids => data[0..1].map { |d| CustomCentroid.new(d) })
32
+
19
33
  == Distance Measurements
20
34
 
21
35
  KMeans uses the Distance Measures Gem (http://github.com/reddavis/Distance-Measures) so we get quite a range of distance measurements.
@@ -35,19 +49,19 @@ The measurements currently available are:
35
49
  binary_jaccard_distance
36
50
 
37
51
  tanimoto_coefficient
38
-
52
+
39
53
  To specify a particular one to use in the KMeans algorithm, just provide it as an option:
40
54
 
41
55
  KMeans.new(data, :distance_measure => :jaccard_index)
42
56
  KMeans.new(data, :distance_measure => :cosine_similarity)
43
57
  KMeans.new(data, :distance_measure => :tanimoto_coefficient)
44
-
58
+
45
59
  You get the idea...
46
-
60
+
47
61
  == Benchmarks
48
62
 
49
63
  # 1000 records with 50 dimensions
50
- data = Array.new(1000) {Array.new(50) {rand(10)}}
64
+ data = Array.new(1000) {Array.new(50) {rand(10)}}
51
65
  ai4r_data = Ai4r::Data::DataSet.new(:data_items=> data)
52
66
 
53
67
  # Clustering can happen in magical ways
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.5
1
+ 0.0.7
@@ -3,7 +3,7 @@ require 'rubygems'
3
3
  require 'ai4r'
4
4
  require File.dirname(__FILE__) + '/../lib/k_means'
5
5
 
6
- data = Array.new(1000) {Array.new(50) {rand(10)}}
6
+ data = Array.new(100) {Array.new(50) {rand(10)}}
7
7
 
8
8
  ai4r_data = Ai4r::Data::DataSet.new(:data_items=> data)
9
9
 
@@ -16,7 +16,7 @@ Benchmark.bm do |x|
16
16
  x.report('Mine') do
17
17
  n.times { KMeans.new(data) }
18
18
  end
19
-
19
+
20
20
  x.report("Ai4R") do
21
21
  n.times do
22
22
  b = Ai4r::Clusterers::KMeans.new
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{k_means}
8
- s.version = "0.0.5"
8
+ s.version = "0.0.7"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["reddavis"]
12
- s.date = %q{2010-01-25}
12
+ s.date = %q{2010-07-11}
13
13
  s.description = %q{Attempting to create a fast, memory efficient KMeans}
14
14
  s.email = %q{reddavis@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -40,7 +40,7 @@ Gem::Specification.new do |s|
40
40
  s.homepage = %q{http://github.com/reddavis/k_means}
41
41
  s.rdoc_options = ["--charset=UTF-8"]
42
42
  s.require_paths = ["lib"]
43
- s.rubygems_version = %q{1.3.5}
43
+ s.rubygems_version = %q{1.3.6}
44
44
  s.summary = %q{K Means algorithm}
45
45
  s.test_files = [
46
46
  "test/ext/test_object.rb",
@@ -35,7 +35,7 @@ class Centroid
35
35
 
36
36
  # Finds the average position of all the nodes assigned to
37
37
  # the centroid and then moves the centroid to that position
38
- def reposition(nodes)
38
+ def reposition(nodes, centroids)
39
39
  return if nodes.empty?
40
40
  averages = [0.0] * nodes[0].position.size
41
41
  nodes.each do |node|
@@ -46,4 +46,4 @@ class Centroid
46
46
  @position = averages.map {|x| x / nodes.size}
47
47
  end
48
48
 
49
- end
49
+ end
@@ -1,42 +1,41 @@
1
1
  require 'ext/object'
2
2
 
3
3
  class KMeans
4
-
4
+
5
5
  attr_reader :centroids, :nodes
6
-
6
+
7
7
  def initialize(data, options={})
8
- k = options[:centroids] || 4
9
- @verbose = options[:verbose]
10
-
11
8
  distance_measure = options[:distance_measure] || :euclidean_distance
12
9
  @nodes = Node.create_nodes(data, distance_measure)
13
- @centroids = Centroid.create_centroids(k, @nodes)
14
-
10
+ @centroids = options[:custom_centroids] ||
11
+ Centroid.create_centroids(options[:centroids] || 4, @nodes)
12
+ @verbose = options[:verbose]
13
+
15
14
  perform_cluster_process
16
15
  end
17
-
16
+
18
17
  def inspect
19
18
  @centroid_pockets.inspect
20
19
  end
21
-
20
+
22
21
  def view
23
22
  @centroid_pockets
24
23
  end
25
-
24
+
26
25
  private
27
-
26
+
28
27
  def perform_cluster_process
29
28
  iterations, updates = 0, 1
30
- while updates > 0 && iterations < 100
29
+ while updates > 0 && iterations < 100
31
30
  iterations += 1
32
31
  verbose_message("Iteration #{iterations}")
33
- updates = 0
32
+ updates = 0
34
33
  updates += update_nodes
35
34
  reposition_centroids
36
35
  end
37
36
  place_nodes_into_pockets
38
37
  end
39
-
38
+
40
39
  # This creates an array of arrays
41
40
  # Each internal array represents a centroid
42
41
  # and each element of that array is the index of a node
@@ -51,7 +50,7 @@ class KMeans
51
50
  end
52
51
  @centroid_pockets = centroid_pockets
53
52
  end
54
-
53
+
55
54
  def update_nodes
56
55
  sum = 0
57
56
  @nodes.each do |node|
@@ -59,17 +58,18 @@ class KMeans
59
58
  end
60
59
  sum
61
60
  end
62
-
61
+
63
62
  def reposition_centroids
63
+ centroid_positions = @centroids.map(&:position)
64
64
  @centroids.each do |centroid|
65
- nodes = []
65
+ nodes = []
66
66
  @nodes.each {|n| nodes << n if n.closest_centroid == centroid}
67
- centroid.reposition(nodes)
67
+ centroid.reposition(nodes, centroid_positions)
68
68
  end
69
69
  end
70
-
70
+
71
71
  def verbose_message(message)
72
72
  puts message if @verbose
73
73
  end
74
-
75
- end
74
+
75
+ end
@@ -1,5 +1,5 @@
1
1
  class Node
2
-
2
+
3
3
  class << self
4
4
  def create_nodes(data, similarity_measure)
5
5
  nodes = []
@@ -9,41 +9,59 @@ class Node
9
9
  nodes
10
10
  end
11
11
  end
12
-
12
+
13
13
  attr_accessor :position, :best_distance, :closest_centroid
14
-
14
+
15
15
  def initialize(position, similarity_measure)
16
16
  @position = position
17
17
  @similarity_measure = similarity_measure
18
18
  end
19
-
19
+
20
20
  def update_closest_centroid(centroids)
21
+ # If we haven't processed this node we need to give it an initial centroid
22
+ # so that we have something to compare distances against
21
23
  calculate_initial_centroid(centroids.first) unless @closest_centroid
24
+
22
25
  updated = false
23
26
  centroids.each do |centroid|
27
+ # Check if they are in the same position
28
+ if centroid.position == @position
29
+ updated = update_attributes(centroid, 0.0)
30
+ break
31
+ end
32
+
24
33
  distance = calculate_distance(centroid)
25
34
  if distance < best_distance
26
- updated = true
27
- @closest_centroid = centroid
28
- @best_distance = distance
35
+ updated = update_attributes(centroid, distance)
29
36
  end
30
37
  end
38
+
31
39
  updated == true ? 1 : 0
32
40
  end
33
-
41
+
42
+ def reset!
43
+ @closest_centroid = nil
44
+ @best_distance = nil
45
+ end
46
+
34
47
  private
35
-
48
+
49
+ def update_attributes(closest_centroid, best_distance)
50
+ @closest_centroid, @best_distance = closest_centroid, best_distance
51
+ true
52
+ end
53
+
36
54
  def calculate_initial_centroid(centroid)
37
55
  @closest_centroid = centroid
38
56
  @best_distance = calculate_distance(centroid)
39
57
  end
40
-
58
+
41
59
  def calculate_distance(centroid)
42
60
  begin
43
61
  @position.send(@similarity_measure, centroid.position)
44
62
  rescue NoMethodError
45
- raise "Hey, that's not a measurement. Read the REAdME for available measurements"
63
+ raise "Hey, '#{@similarity_measure}' is not a measurement. Read the REAdME for available measurements"
46
64
  end
47
65
  end
48
-
49
- end
66
+
67
+ end
@@ -31,7 +31,7 @@ class TestCentroid < Test::Unit::TestCase
31
31
  end
32
32
  end
33
33
  average_position.map! {|x| x / 2}
34
- @centroid.reposition(create_nodes)
34
+ @centroid.reposition(create_nodes, [@centroid.position])
35
35
  assert_equal average_position, @centroid.position
36
36
  end
37
37
 
@@ -2,23 +2,49 @@ require 'helper'
2
2
 
3
3
  class TestKMeans < Test::Unit::TestCase
4
4
  context "A KMeans Instance" do
5
-
5
+
6
6
  setup do
7
- @data = Array.new(200) {Array.new(2) {rand}}
7
+ @data = Array.new(3) {Array.new(2) {rand}}
8
8
  @kmeans = KMeans.new(@data, :centroids => 2, :distance_measure => :cosine_similarity)
9
9
  end
10
-
10
+
11
11
  should "return an array" do
12
12
  assert_kind_of String, @kmeans.inspect
13
13
  end
14
-
14
+
15
15
  should "have 2 centroids" do
16
16
  assert_equal 2, @kmeans.centroids.size
17
17
  end
18
-
18
+
19
19
  should "have 200 nodes" do
20
- assert_equal 200, @kmeans.nodes.size
20
+ assert_equal 3, @kmeans.nodes.size
21
+ end
22
+
23
+ end
24
+
25
+ context "A KMeans Instance with specified initial centroids" do
26
+ setup do
27
+ @data = Array.new(3) {Array.new(2) {rand}}
28
+ class CustomCentroid
29
+ attr_accessor :position
30
+ def initialize(position); @position = position; end
31
+ def reposition(nodes, centroid_positions); end
32
+ end
33
+
34
+ @specified_centroids = @data[0..2].map { |d| CustomCentroid.new(d) }
35
+ @kmeans = KMeans.new(@data, :custom_centroids => @specified_centroids, :distance_measure => :cosine_similarity)
36
+ end
37
+
38
+ should "return an inspected array" do
39
+ assert_kind_of String, @kmeans.inspect
40
+ end
41
+
42
+ should "have 3 centroids" do
43
+ assert_equal 3, @kmeans.centroids.size
44
+ end
45
+
46
+ should "have 3 nodes" do
47
+ assert_equal 3, @kmeans.nodes.size
21
48
  end
22
-
23
49
  end
24
50
  end
@@ -2,49 +2,63 @@ require 'helper'
2
2
 
3
3
  class TestNode < Test::Unit::TestCase
4
4
  context "A Data Instance" do
5
-
5
+
6
6
  setup do
7
7
  @node = Node.new([4, 4], :euclidean_distance)
8
8
  end
9
-
9
+
10
10
  should "return an array" do
11
11
  assert_kind_of Array, @node.position
12
12
  end
13
-
13
+
14
14
  should "create an array of nodes" do
15
15
  data = Array.new(10) {Array.new(2) {rand}}
16
16
  nodes = Node.create_nodes(data, :euclidean_distance)
17
17
  assert_kind_of Array, nodes
18
18
  end
19
-
19
+
20
20
  should "create 10 nodes" do
21
21
  data = Array.new(10) {Array.new(2) {rand}}
22
22
  nodes = Node.create_nodes(data, :euclidean_distance)
23
23
  assert_equal 10, nodes.size
24
24
  end
25
-
25
+
26
26
  should "initialize closest centroid" do
27
27
  a = @node.closest_centroid
28
28
  centroids = [Centroid.new([4, 4]), Centroid.new([5, 4])]
29
29
  @node.update_closest_centroid(centroids)
30
30
  assert_not_equal nil, @node.closest_centroid
31
31
  end
32
-
32
+
33
33
  should "update closest centroid" do
34
34
  centroids = [Centroid.new([5, 4])]
35
35
  @node.update_closest_centroid(centroids)
36
36
  a = @node.closest_centroid
37
37
  @node.update_closest_centroid([Centroid.new([4,4])])
38
-
38
+
39
39
  assert_not_equal a, @node.closest_centroid
40
40
  assert_equal 0, @node.best_distance
41
41
  end
42
-
42
+
43
+ context "Reset the node" do
44
+ should "update closest centroid" do
45
+ centroids = [Centroid.new([4, 4])]
46
+ @node.update_closest_centroid(centroids)
47
+ a = @node.closest_centroid
48
+ @node.reset!
49
+
50
+ @node.update_closest_centroid([Centroid.new([5,4])])
51
+
52
+ assert_not_equal a, @node.closest_centroid
53
+ assert_equal 1, @node.best_distance
54
+ end
55
+ end
56
+
43
57
  should "raise error if a false measure is specified" do
44
58
  assert_raise NoMethodError do
45
59
  Node.new([9,9], :fakey).calculate_distance([1,1])
46
60
  end
47
61
  end
48
-
62
+
49
63
  end
50
- end
64
+ end
metadata CHANGED
@@ -1,7 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: k_means
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 7
9
+ version: 0.0.7
5
10
  platform: ruby
6
11
  authors:
7
12
  - reddavis
@@ -9,19 +14,23 @@ autorequire:
9
14
  bindir: bin
10
15
  cert_chain: []
11
16
 
12
- date: 2010-01-25 00:00:00 +00:00
17
+ date: 2010-07-11 00:00:00 +01:00
13
18
  default_executable:
14
19
  dependencies:
15
20
  - !ruby/object:Gem::Dependency
16
21
  name: distance_measures
17
- type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
20
24
  requirements:
21
25
  - - ">="
22
26
  - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ - 0
30
+ - 0
23
31
  version: 0.0.0
24
- version:
32
+ type: :runtime
33
+ version_requirements: *id001
25
34
  description: Attempting to create a fast, memory efficient KMeans
26
35
  email: reddavis@gmail.com
27
36
  executables: []
@@ -64,18 +73,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
64
73
  requirements:
65
74
  - - ">="
66
75
  - !ruby/object:Gem::Version
76
+ segments:
77
+ - 0
67
78
  version: "0"
68
- version:
69
79
  required_rubygems_version: !ruby/object:Gem::Requirement
70
80
  requirements:
71
81
  - - ">="
72
82
  - !ruby/object:Gem::Version
83
+ segments:
84
+ - 0
73
85
  version: "0"
74
- version:
75
86
  requirements: []
76
87
 
77
88
  rubyforge_project:
78
- rubygems_version: 1.3.5
89
+ rubygems_version: 1.3.6
79
90
  signing_key:
80
91
  specification_version: 3
81
92
  summary: K Means algorithm