k_means 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 reddavis
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,50 @@
1
+ = KMeans
2
+
3
+ An attempt to build a fast, memory-efficient K-Means implementation.
4
+
5
+ == Install
6
+
7
+ gem sources -a http://gems.github.com
8
+ sudo gem install reddavis-k_means
9
+
10
+ == How To Use
11
+ require 'rubygems'
12
+ require 'k_means'
13
+
14
+ data = [[1,1], [1,2], [1,1], [1000, 1000], [500, 500]]
15
+ kmeans = KMeans.new(data, :centroids => 2)
16
+ kmeans.inspect # Use kmeans.view to get hold of the un-inspected array
17
+ => [[3, 4], [0, 1, 2]]
18
+
19
+ == Benchmarks
20
+
21
+ # 1000 records with 50 dimensions
22
+ data = Array.new(1000) {Array.new(50) {rand(10)}}
23
+ ai4r_data = Ai4r::Data::DataSet.new(:data_items=> data)
24
+
25
+ # Clustering can happen in magical ways
26
+ # so let's run it several times
27
+ n = 5
28
+
29
+ Benchmark.bm do |x|
30
+ x.report('KMeans') do
31
+ n.times { KMeans.new(data) }
32
+ end
33
+ x.report("Ai4R") do
34
+ n.times do
35
+ b = Ai4r::Clusterers::KMeans.new
36
+ b.build(ai4r_data, 4)
37
+ end
38
+ end
39
+ end
40
+        user     system      total        real
41
+ KMeans 15.960000   0.030000  15.990000 ( 16.062639)
42
+ Ai4R 70.230000   0.180000  70.410000 ( 70.704843)
43
+
44
+ == Thanks
45
+
46
+ * David Richards - For his code reviews and all round helpfulness. - http://github.com/davidrichards
47
+
48
+ == Copyright
49
+
50
+ Copyright (c) 2009 Red Davis. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,57 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "k_means"
8
+ gem.summary = %Q{K Means algorithm}
9
+ gem.description = %Q{Attempting to create a fast, memory efficient KMeans}
10
+ gem.email = "reddavis@gmail.com"
11
+ gem.homepage = "http://github.com/reddavis/k_means"
12
+ gem.authors = ["reddavis"]
13
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
14
+ end
15
+ Jeweler::GemcutterTasks.new
16
+ rescue LoadError
17
+ puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
18
+ end
19
+
20
+ require 'rake/testtask'
21
+ Rake::TestTask.new(:test) do |test|
22
+ test.libs << 'lib' << 'test'
23
+ test.pattern = 'test/**/test_*.rb'
24
+ test.verbose = true
25
+ end
26
+
27
+ begin
28
+ require 'rcov/rcovtask'
29
+ Rcov::RcovTask.new do |test|
30
+ test.libs << 'test'
31
+ test.pattern = 'test/**/test_*.rb'
32
+ test.verbose = true
33
+ end
34
+ rescue LoadError
35
+ task :rcov do
36
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
37
+ end
38
+ end
39
+
40
+
41
+
42
+
43
+ task :default => :test
44
+
45
+ require 'rake/rdoctask'
46
+ Rake::RDocTask.new do |rdoc|
47
+ if File.exist?('VERSION')
48
+ version = File.read('VERSION')
49
+ else
50
+ version = ""
51
+ end
52
+
53
+ rdoc.rdoc_dir = 'rdoc'
54
+ rdoc.title = "k_means #{version}"
55
+ rdoc.rdoc_files.include('README*')
56
+ rdoc.rdoc_files.include('lib/**/*.rb')
57
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.2
@@ -0,0 +1,27 @@
1
+ require 'benchmark'
2
+ require 'rubygems'
3
+ require 'benchmarker'
4
+ require 'ai4r'
5
+ require File.dirname(__FILE__) + '/../lib/k_means'
6
+
7
+ data = Array.new(500) {Array.new(50) {rand(10)}}
8
+
9
+ ai4r_data = Ai4r::Data::DataSet.new(:data_items=> data)
10
+
11
+ # Clustering can happen in magical ways
12
+ # so let's run it several times
13
+ n = 2
14
+
15
+ Benchmarker.go('lib') do
16
+ Benchmark.bm do |x|
17
+ x.report('Mine') do
18
+ n.times { KMeans.new(data) }
19
+ end
20
+ # x.report("Ai4R") do
21
+ # n.times do
22
+ #b = Ai4r::Clusterers::KMeans.new
23
+ #b.build(ai4r_data, 4)
24
+ # end
25
+ # end
26
+ end
27
+ end
data/k_means.gemspec ADDED
@@ -0,0 +1,65 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{k_means}
8
+ s.version = "0.0.2"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["reddavis"]
12
+ s.date = %q{2009-11-25}
13
+ s.description = %q{Attempting to create a fast, memory efficient KMeans}
14
+ s.email = %q{reddavis@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "benchmark/benchmark_ai4r.rb",
27
+ "k_means.gemspec",
28
+ "lib/centroid.rb",
29
+ "lib/ext/enumerable.rb",
30
+ "lib/ext/object.rb",
31
+ "lib/k_means.rb",
32
+ "lib/node.rb",
33
+ "profiling/profile.rb",
34
+ "test/ext/test_enumerable.rb",
35
+ "test/ext/test_object.rb",
36
+ "test/helper.rb",
37
+ "test/test_centroid.rb",
38
+ "test/test_k_means.rb",
39
+ "test/test_node.rb"
40
+ ]
41
+ s.homepage = %q{http://github.com/reddavis/k_means}
42
+ s.rdoc_options = ["--charset=UTF-8"]
43
+ s.require_paths = ["lib"]
44
+ s.rubygems_version = %q{1.3.5}
45
+ s.summary = %q{K Means algorithm}
46
+ s.test_files = [
47
+ "test/ext/test_enumerable.rb",
48
+ "test/ext/test_object.rb",
49
+ "test/helper.rb",
50
+ "test/test_centroid.rb",
51
+ "test/test_k_means.rb",
52
+ "test/test_node.rb"
53
+ ]
54
+
55
+ if s.respond_to? :specification_version then
56
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
57
+ s.specification_version = 3
58
+
59
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
60
+ else
61
+ end
62
+ else
63
+ end
64
+ end
65
+
data/lib/centroid.rb ADDED
@@ -0,0 +1,49 @@
1
# A cluster centre: a point in n-dimensional space that nodes get assigned to.
class Centroid

  class << self
    # Builds +amount+ centroids, each placed at a random position inside the
    # bounding box of +nodes+.
    #
    # amount - number of centroids to create.
    # nodes  - non-empty Array of objects responding to #position (an Array of
    #          numbers). The first node decides how many dimensions are used.
    #
    # Returns an Array of Centroid instances.
    def create_centroids(amount, nodes)
      ranges = create_ranges(nodes, nodes[0].position.size)
      (1..amount).map do
        new(ranges.map { |low, high| rand_between(low, high) })
      end
    end

    private

    # Computes the [min, max] extent of +nodes+ along each dimension.
    #
    # The extents are seeded from the data itself (not from 0.0), so the box
    # does not get stretched to include the origin when every value in a
    # dimension is positive or every value is negative. Bounds are stored as
    # Floats so that centroids are always sampled continuously, whatever
    # numeric type the input data uses.
    #
    # Returns an Array with one [min, max] pair per dimension.
    def create_ranges(nodes, dimensions)
      ranges = Array.new(dimensions) { [nil, nil] }
      nodes.each do |node|
        node.position.each_with_index do |coordinate, index|
          value = coordinate.to_f
          range = ranges[index]
          # Bottom range
          range[0] = value if range[0].nil? || value < range[0]
          # Top range
          range[1] = value if range[1].nil? || value > range[1]
        end
      end
      ranges
    end
  end

  attr_accessor :position

  # position - Array of numbers, one per dimension.
  def initialize(position)
    @position = position
  end

  # Moves the centroid to the mean position of +nodes+.
  #
  # nodes - Array of objects responding to #position. When it is empty the
  #         centroid is left where it is.
  def reposition(nodes)
    return if nodes.empty?
    totals = Array.new(nodes[0].position.size, 0.0)
    nodes.each do |node|
      node.position.each_with_index do |coordinate, index|
        totals[index] += coordinate
      end
    end
    @position = totals.map { |total| total / nodes.size }
  end

end
@@ -0,0 +1,10 @@
1
module Enumerable
  # Euclidean (straight-line) distance between two points.
  #
  # other - the second point: any collection of numbers that can be zipped
  #         with the receiver, with at least as many elements as the receiver.
  #
  # Elements are paired up with #zip instead of index access, so the method
  # works for every Enumerable (ranges, sets, ...), not only for Arrays.
  #
  # Returns the distance as a Float.
  def euclidean_distance(other)
    sum = 0.0
    zip(other) { |a, b| sum += (a - b)**2 }
    Math.sqrt(sum)
  end
end
10
+
data/lib/ext/object.rb ADDED
@@ -0,0 +1,16 @@
1
class Object

  # Simpler way to get a random number between two values.
  #
  # a, b - the bounds, in either order.
  #
  # When both bounds are Integers the result is an Integer and both bounds
  # are inclusive. Any other numeric bound (Float, Rational, ...) is handed to
  # #rand_in_floats, because the integer form of Kernel#rand would truncate it.
  def rand_between(a, b)
    return rand_in_floats(a, b) unless a.is_a?(Integer) && b.is_a?(Integer)
    span = (a - b).abs + 1
    rand(span) + [a, b].min
  end

  # Handles non-integers: scales Kernel#rand (a Float in [0, 1)) to the span
  # between +a+ and +b+ and offsets it by the smaller bound.
  def rand_in_floats(a, b)
    span = (a - b).abs
    (rand * span) + [a, b].min
  end

end
data/lib/k_means.rb ADDED
@@ -0,0 +1,78 @@
1
+ $: << File.dirname(__FILE__)
2
+ require 'centroid'
3
+ require 'node'
4
+ require 'ext/enumerable'
5
+ require 'ext/object'
6
+
7
# Clusters a data set with the k-means algorithm.
#
#   kmeans = KMeans.new(data, :centroids => 2)
#   kmeans.view # => one Array of node indexes per centroid
class KMeans

  DEFAULT_CENTROIDS = 4
  DEFAULT_MAX_ITERATIONS = 100

  attr_reader :centroids, :nodes

  # Builds the nodes and centroids and runs the clustering straight away.
  #
  # data    - Array of positions (each an Array of numbers).
  # options - Hash:
  #           :centroids      - number of clusters (default 4).
  #           :max_iterations - upper bound on assign/reposition passes
  #                             (default 100).
  #           :verbose        - pass +true+ to print the iteration count.
  def initialize(data, options={})
    k = options[:centroids] || DEFAULT_CENTROIDS
    @max_iterations = options[:max_iterations] || DEFAULT_MAX_ITERATIONS
    # Only a literal +true+ switches the messages on.
    @verbose = options[:verbose] == true

    @nodes = Node.create_nodes(data)
    @centroids = Centroid.create_centroids(k, @nodes)

    perform_cluster_process
  end

  # String form of the clusters (see #view).
  def inspect
    @centroid_pockets.inspect
  end

  # The clusters: one Array per centroid, holding the indexes (into the
  # original data) of the nodes assigned to it.
  def view
    @centroid_pockets
  end

  private

  # Alternates between assigning nodes to centroids and repositioning the
  # centroids until a pass reports no update or the iteration cap is reached.
  def perform_cluster_process
    iterations = 0
    while iterations < @max_iterations
      iterations += 1
      verbose_message("Iteration #{iterations}")
      updates = update_nodes
      reposition_centroids
      break if updates.zero?
    end
    place_nodes_into_pockets
  end

  # This creates an array of arrays.
  # Each internal array represents a centroid
  # and each entry in it is the index of a node assigned to that centroid.
  def place_nodes_into_pockets
    pockets = Array.new(@centroids.size) { [] }
    # Centroids are matched by object identity, in a single pass over the nodes.
    pocket_index = {}.compare_by_identity
    @centroids.each_with_index { |centroid, index| pocket_index[centroid] = index }
    @nodes.each_with_index do |node, node_index|
      index = pocket_index[node.closest_centroid]
      pockets[index] << node_index if index
    end
    @centroid_pockets = pockets
  end

  # Lets every node pick its closest centroid.
  # Returns the sum of the values the nodes report (1 per updated node).
  def update_nodes
    @nodes.inject(0) do |sum, node|
      sum + node.update_closest_centroid(@centroids)
    end
  end

  # Hands every centroid the nodes currently assigned to it (possibly none).
  def reposition_centroids
    members = {}.compare_by_identity
    @nodes.each { |node| (members[node.closest_centroid] ||= []) << node }
    @centroids.each do |centroid|
      centroid.reposition(members[centroid] || [])
    end
  end

  def verbose_message(message)
    puts message if @verbose
  end

end
data/lib/node.rb ADDED
@@ -0,0 +1,44 @@
1
# A single data point that tracks which centroid it currently belongs to.
class Node

  class << self
    # Wraps every position in +data+ (an Array of Arrays of numbers) in a Node.
    def create_nodes(data)
      data.map { |position| new(position) }
    end
  end

  attr_accessor :position, :best_distance, :closest_centroid

  # position - Array of numbers, one per dimension.
  def initialize(position)
    @position = position
  end

  # Assigns the node to the nearest of +centroids+.
  #
  # Centroids move between passes, so the distance to the currently assigned
  # centroid is measured again before comparing. Comparing against the
  # remembered distance would keep a node attached to a centroid that has
  # since moved away from it.
  #
  # Returns 1 when a centroid strictly closer than the current one was found
  # (the first centroid counts as "current" on the very first call), else 0.
  def update_closest_centroid(centroids)
    assign_centroid(@closest_centroid || centroids.first)
    updated = false
    centroids.each do |centroid|
      distance = calculate_distance(centroid)
      next unless distance < @best_distance
      updated = true
      @closest_centroid = centroid
      @best_distance = distance
    end
    updated ? 1 : 0
  end

  private

  # Records +centroid+ as the closest one together with its current distance.
  def assign_centroid(centroid)
    @closest_centroid = centroid
    @best_distance = calculate_distance(centroid)
  end

  def calculate_distance(centroid)
    @position.euclidean_distance(centroid.position)
  end

end
@@ -0,0 +1,12 @@
1
+ require File.dirname(__FILE__) + '/../lib/k_means'
2
+ require 'rubygems'
3
+ require 'ruby-prof'
4
+
5
+ data = Array.new(100) {Array.new(2) {rand}}
6
+
7
+ result = RubyProf.profile do
8
+ a = KMeans.new(data)
9
+ end
10
+
11
+ printer = RubyProf::FlatPrinter.new(result)
12
+ printer.print(STDOUT, 0)
@@ -0,0 +1,11 @@
1
+ require 'helper'
2
+
3
# Tests for the Enumerable#euclidean_distance extension.
class TestEnumerable < Test::Unit::TestCase
  context "Euclidean Distance" do

    should "return 5" do
      assert_equal 5, [10].euclidean_distance([5])
    end

    # Covers more than one dimension (3-4-5 right triangle).
    should "return 5 for points 3 and 4 apart in two dimensions" do
      assert_equal 5, [0, 0].euclidean_distance([3, 4])
    end

    should "return 0 for identical points" do
      assert_equal 0, [1, 2, 3].euclidean_distance([1, 2, 3])
    end

  end
end
@@ -0,0 +1,18 @@
1
+ require 'helper'
2
+
3
# Tests for the Object#rand_between extension.
class TestObject < Test::Unit::TestCase
  context "Random Number Between" do

    should "return a number between 10 and 20" do
      n = rand_between(10, 20)
      assert_between(10..20, n)
      assert_kind_of Integer, n
    end

    # Checks the bounds that were actually requested rather than 10..11.
    should "return a float between 10.0 and 10.9" do
      n = rand_between(10.0, 10.9)
      assert_between(10.0..10.9, n)
      assert_kind_of Float, n
    end

    should "accept the bounds in either order" do
      assert_between(10..20, rand_between(20, 10))
    end

  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,13 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'k_means'
8
+
9
class Test::Unit::TestCase
  # Asserts that +n+ lies within +range+, reporting both on failure.
  def assert_between(range, n)
    assert range.include?(n), "Expected #{n.inspect} to be within #{range.inspect}"
  end
end
@@ -0,0 +1,45 @@
1
+ require 'helper'
2
+
3
# Tests for Centroid creation and repositioning.
class TestCentroid < Test::Unit::TestCase
  context "A Centroid" do

    setup do
      @centroid = Centroid.new([1, 2, 3])
    end

    should "return an array" do
      assert_kind_of Array, @centroid.position
    end

    should "return an array of centroids" do
      centroids = Centroid.create_centroids(4, create_nodes)
      assert_kind_of Array, centroids
      assert_kind_of Centroid, centroids.first
    end

    should "create 4 centroids" do
      centroids = Centroid.create_centroids(4, create_nodes)
      assert_equal 4, centroids.size
    end

    should "reposition to the average position of its nodes" do
      nodes = create_nodes
      average_position = [0.0] * nodes[0].position.size
      nodes.each do |node|
        node.position.each_with_index do |position, index|
          average_position[index] += position
        end
      end
      # Divide by the real node count so the expectation follows the fixture.
      average_position.map! {|x| x / nodes.size}
      @centroid.reposition(nodes)
      assert_equal average_position, @centroid.position
    end

    should "stay put when it has no nodes" do
      @centroid.reposition([])
      assert_equal [1, 2, 3], @centroid.position
    end

  end

  private

  def create_nodes
    Node.create_nodes([[1,2,3], [4,5,6]])
  end
end
@@ -0,0 +1,24 @@
1
+ require 'helper'
2
+
3
# End-to-end tests for KMeans on random two-dimensional data.
class TestKMeans < Test::Unit::TestCase
  context "A KMeans Instance" do

    setup do
      @data = Array.new(200) {Array.new(2) {rand}}
      @kmeans = KMeans.new(@data, :centroids => 2)
    end

    # Previously named "return an array" although it asserts a String.
    should "return a string from inspect" do
      assert_kind_of String, @kmeans.inspect
    end

    should "return an array of pockets from view" do
      assert_kind_of Array, @kmeans.view
      assert_equal 2, @kmeans.view.size
    end

    should "place every node in exactly one pocket" do
      assert_equal((0...200).to_a, @kmeans.view.flatten.sort)
    end

    should "have 2 centroids" do
      assert_equal 2, @kmeans.centroids.size
    end

    should "have 200 nodes" do
      assert_equal 200, @kmeans.nodes.size
    end

  end
end
data/test/test_node.rb ADDED
@@ -0,0 +1,43 @@
1
+ require 'helper'
2
+
3
# Tests for Node creation and centroid assignment.
class TestNode < Test::Unit::TestCase
  context "A Data Instance" do

    setup do
      @node = Node.new([4, 4])
    end

    should "return an array" do
      assert_kind_of Array, @node.position
    end

    should "create an array of nodes" do
      data = Array.new(10) {Array.new(2) {rand}}
      nodes = Node.create_nodes(data)
      assert_kind_of Array, nodes
    end

    should "create 10 nodes" do
      data = Array.new(10) {Array.new(2) {rand}}
      nodes = Node.create_nodes(data)
      assert_equal 10, nodes.size
    end

    should "initialize closest centroid" do
      assert_nil @node.closest_centroid
      centroids = [Centroid.new([4, 4]), Centroid.new([5, 4])]
      @node.update_closest_centroid(centroids)
      # The first centroid sits exactly on the node, so it must win.
      assert_equal centroids.first, @node.closest_centroid
      assert_equal 0, @node.best_distance
    end

    should "update closest centroid" do
      centroids = [Centroid.new([5, 4])]
      @node.update_closest_centroid(centroids)
      a = @node.closest_centroid
      @node.update_closest_centroid([Centroid.new([4,4])])
      assert_not_equal a, @node.closest_centroid
      assert_equal 0, @node.best_distance
    end

  end
end
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: k_means
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - reddavis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-11-25 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Attempting to create a fast, memory efficient KMeans
17
+ email: reddavis@gmail.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ - README.rdoc
25
+ files:
26
+ - .document
27
+ - .gitignore
28
+ - LICENSE
29
+ - README.rdoc
30
+ - Rakefile
31
+ - VERSION
32
+ - benchmark/benchmark_ai4r.rb
33
+ - k_means.gemspec
34
+ - lib/centroid.rb
35
+ - lib/ext/enumerable.rb
36
+ - lib/ext/object.rb
37
+ - lib/k_means.rb
38
+ - lib/node.rb
39
+ - profiling/profile.rb
40
+ - test/ext/test_enumerable.rb
41
+ - test/ext/test_object.rb
42
+ - test/helper.rb
43
+ - test/test_centroid.rb
44
+ - test/test_k_means.rb
45
+ - test/test_node.rb
46
+ has_rdoc: true
47
+ homepage: http://github.com/reddavis/k_means
48
+ licenses: []
49
+
50
+ post_install_message:
51
+ rdoc_options:
52
+ - --charset=UTF-8
53
+ require_paths:
54
+ - lib
55
+ required_ruby_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: "0"
60
+ version:
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: "0"
66
+ version:
67
+ requirements: []
68
+
69
+ rubyforge_project:
70
+ rubygems_version: 1.3.5
71
+ signing_key:
72
+ specification_version: 3
73
+ summary: K Means algorithm
74
+ test_files:
75
+ - test/ext/test_enumerable.rb
76
+ - test/ext/test_object.rb
77
+ - test/helper.rb
78
+ - test/test_centroid.rb
79
+ - test/test_k_means.rb
80
+ - test/test_node.rb