k_means 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 reddavis
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,50 @@
1
+ = KMeans
2
+
3
+ Attempting to build a fast, memory-efficient K-Means program.
4
+
5
+ == Install
6
+
7
+ gem sources -a http://gems.github.com
8
+ sudo gem install reddavis-k_means
9
+
10
+ == How To Use
11
+ require 'rubygems'
12
+ require 'k_means'
13
+
14
+ data = [[1,1], [1,2], [1,1], [1000, 1000], [500, 500]]
15
+ kmeans = KMeans.new(data, :centroids => 2)
16
+ kmeans.inspect # Use kmeans.view to get hold of the un-inspected array
17
+ => [[3, 4], [0, 1, 2]]
18
+
19
+ == Benchmarks
20
+
21
+ # 1000 records with 50 dimensions
22
+ data = Array.new(1000) {Array.new(50) {rand(10)}}
23
+ ai4r_data = Ai4r::Data::DataSet.new(:data_items=> data)
24
+
25
+ # Clustering results can vary from run to run (centroids start at random positions),
26
+ # so let's run it multiple times
27
+ n = 5
28
+
29
+ Benchmark.bm do |x|
30
+ x.report('KMeans') do
31
+ n.times { KMeans.new(data) }
32
+ end
33
+ x.report("Ai4R") do
34
+ n.times do
35
+ b = Ai4r::Clusterers::KMeans.new
36
+ b.build(ai4r_data, 4)
37
+ end
38
+ end
39
+ end
40
+        user     system      total        real
41
+ KMeans 15.960000   0.030000  15.990000 ( 16.062639)
42
+ Ai4R 70.230000   0.180000  70.410000 ( 70.704843)
43
+
44
+ == Thanks
45
+
46
+ * David Richards - For his code reviews and all-round helpfulness. - http://github.com/davidrichards
47
+
48
+ == Copyright
49
+
50
+ Copyright (c) 2009 Red Davis. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,57 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "k_means"
8
+ gem.summary = %Q{K Means algorithm}
9
+ gem.description = %Q{Attempting to create a fast, memory efficient KMeans}
10
+ gem.email = "reddavis@gmail.com"
11
+ gem.homepage = "http://github.com/reddavis/k_means"
12
+ gem.authors = ["reddavis"]
13
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
14
+ end
15
+ Jeweler::GemcutterTasks.new
16
+ rescue LoadError
17
+ puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
18
+ end
19
+
20
+ require 'rake/testtask'
21
+ Rake::TestTask.new(:test) do |test|
22
+ test.libs << 'lib' << 'test'
23
+ test.pattern = 'test/**/test_*.rb'
24
+ test.verbose = true
25
+ end
26
+
27
+ begin
28
+ require 'rcov/rcovtask'
29
+ Rcov::RcovTask.new do |test|
30
+ test.libs << 'test'
31
+ test.pattern = 'test/**/test_*.rb'
32
+ test.verbose = true
33
+ end
34
+ rescue LoadError
35
+ task :rcov do
36
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
37
+ end
38
+ end
39
+
40
+
41
+
42
+
43
+ task :default => :test
44
+
45
+ require 'rake/rdoctask'
46
+ Rake::RDocTask.new do |rdoc|
47
+ if File.exist?('VERSION')
48
+ version = File.read('VERSION')
49
+ else
50
+ version = ""
51
+ end
52
+
53
+ rdoc.rdoc_dir = 'rdoc'
54
+ rdoc.title = "k_means #{version}"
55
+ rdoc.rdoc_files.include('README*')
56
+ rdoc.rdoc_files.include('lib/**/*.rb')
57
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.2
@@ -0,0 +1,27 @@
1
+ require 'benchmark'
2
+ require 'rubygems'
3
+ require 'benchmarker'
4
+ require 'ai4r'
5
+ require File.dirname(__FILE__) + '/../lib/k_means'
6
+
7
+ data = Array.new(500) {Array.new(50) {rand(10)}}
8
+
9
+ ai4r_data = Ai4r::Data::DataSet.new(:data_items=> data)
10
+
11
+ # Clustering results can vary from run to run (centroids start at random positions),
12
+ # so let's run it multiple times
13
+ n = 2
14
+
15
+ Benchmarker.go('lib') do
16
+ Benchmark.bm do |x|
17
+ x.report('Mine') do
18
+ n.times { KMeans.new(data) }
19
+ end
20
+ # x.report("Ai4R") do
21
+ # n.times do
22
+ #b = Ai4r::Clusterers::KMeans.new
23
+ #b.build(ai4r_data, 4)
24
+ # end
25
+ # end
26
+ end
27
+ end
data/k_means.gemspec ADDED
@@ -0,0 +1,65 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{k_means}
8
+ s.version = "0.0.2"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["reddavis"]
12
+ s.date = %q{2009-11-25}
13
+ s.description = %q{Attempting to create a fast, memory efficient KMeans}
14
+ s.email = %q{reddavis@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "benchmark/benchmark_ai4r.rb",
27
+ "k_means.gemspec",
28
+ "lib/centroid.rb",
29
+ "lib/ext/enumerable.rb",
30
+ "lib/ext/object.rb",
31
+ "lib/k_means.rb",
32
+ "lib/node.rb",
33
+ "profiling/profile.rb",
34
+ "test/ext/test_enumerable.rb",
35
+ "test/ext/test_object.rb",
36
+ "test/helper.rb",
37
+ "test/test_centroid.rb",
38
+ "test/test_k_means.rb",
39
+ "test/test_node.rb"
40
+ ]
41
+ s.homepage = %q{http://github.com/reddavis/k_means}
42
+ s.rdoc_options = ["--charset=UTF-8"]
43
+ s.require_paths = ["lib"]
44
+ s.rubygems_version = %q{1.3.5}
45
+ s.summary = %q{K Means algorithm}
46
+ s.test_files = [
47
+ "test/ext/test_enumerable.rb",
48
+ "test/ext/test_object.rb",
49
+ "test/helper.rb",
50
+ "test/test_centroid.rb",
51
+ "test/test_k_means.rb",
52
+ "test/test_node.rb"
53
+ ]
54
+
55
+ if s.respond_to? :specification_version then
56
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
57
+ s.specification_version = 3
58
+
59
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
60
+ else
61
+ end
62
+ else
63
+ end
64
+ end
65
+
data/lib/centroid.rb ADDED
@@ -0,0 +1,49 @@
1
+ class Centroid
2
+
3
+ class << self
4
+ def create_centroids(amount, nodes)
5
+ ranges = create_ranges(nodes, nodes[0].position.size)
6
+ (1..amount).map do
7
+ position = ranges.inject([]) do |array, range|
8
+ array << rand_between(range[0], range[1])
9
+ end
10
+ new(position)
11
+ end
12
+ end
13
+
14
+ private
15
+
16
+ def create_ranges(nodes, dimensions)
17
+ ranges = Array.new(dimensions) {[0.0, 0.0]}
18
+ nodes.each do |node|
19
+ node.position.each_with_index do |position, index|
20
+ # Bottom range
21
+ ranges[index][0] = position if position < ranges[index][0]
22
+ # Top range
23
+ ranges[index][1] = position if position > ranges[index][1]
24
+ end
25
+ end
26
+ ranges
27
+ end
28
+ end
29
+
30
+ attr_accessor :position
31
+
32
+ def initialize(position)
33
+ @position = position
34
+ end
35
+
36
+ # Computes the average position of all the nodes assigned to
37
+ # the centroid and then moves the centroid to that position
38
+ def reposition(nodes)
39
+ return if nodes.empty?
40
+ averages = [0.0] * nodes[0].position.size
41
+ nodes.each do |node|
42
+ node.position.each_with_index do |position, index|
43
+ averages[index] += position
44
+ end
45
+ end
46
+ @position = averages.map {|x| x / nodes.size}
47
+ end
48
+
49
+ end
@@ -0,0 +1,10 @@
1
+ module Enumerable
2
+ def euclidean_distance(other)
3
+ sum = 0.0
4
+ self.each_index do |i|
5
+ sum += (self[i] - other[i])**2
6
+ end
7
+ Math.sqrt(sum)
8
+ end
9
+ end
10
+
data/lib/ext/object.rb ADDED
@@ -0,0 +1,16 @@
1
+ class Object
2
+
3
+ # Simpler way to get a random number between two values
4
+ def rand_between(a, b)
5
+ return rand_in_floats(a, b) if a.is_a?(Float) or b.is_a?(Float)
6
+ range = (a - b).abs + 1
7
+ rand(range) + [a,b].min
8
+ end
9
+
10
+ # Handles non-integers
11
+ def rand_in_floats(a, b)
12
+ range = (a - b).abs
13
+ (rand * range) + [a,b].min
14
+ end
15
+
16
+ end
data/lib/k_means.rb ADDED
@@ -0,0 +1,78 @@
1
+ $: << File.dirname(__FILE__)
2
+ require 'centroid'
3
+ require 'node'
4
+ require 'ext/enumerable'
5
+ require 'ext/object'
6
+
7
+ class KMeans
8
+
9
+ attr_reader :centroids, :nodes
10
+
11
+ def initialize(data, options={})
12
+ k = options[:centroids] || 4
13
+ @verbose = options[:verbose] == true ? true : nil
14
+
15
+ @nodes = Node.create_nodes(data)
16
+ @centroids = Centroid.create_centroids(k, @nodes)
17
+
18
+ perform_cluster_process
19
+ end
20
+
21
+ def inspect
22
+ @centroid_pockets.inspect
23
+ end
24
+
25
+ def view
26
+ @centroid_pockets
27
+ end
28
+
29
+ private
30
+
31
+ def perform_cluster_process
32
+ iterations, updates = 0, 1
33
+ while updates > 0 && iterations < 100
34
+ iterations += 1
35
+ verbose_message("Iteration #{iterations}")
36
+ updates = 0
37
+ updates += update_nodes
38
+ reposition_centroids
39
+ end
40
+ place_nodes_into_pockets
41
+ end
42
+
43
+ # This creates an array of arrays.
44
+ # Each inner array represents a centroid,
45
+ # and each element in it is the index of a node assigned to that centroid
46
+ def place_nodes_into_pockets
47
+ centroid_pockets = Array.new(@centroids.size) {[]}
48
+ @centroids.each_with_index do |centroid, centroid_index|
49
+ @nodes.each_with_index do |node, node_index|
50
+ if node.closest_centroid == centroid
51
+ centroid_pockets[centroid_index] << node_index
52
+ end
53
+ end
54
+ end
55
+ @centroid_pockets = centroid_pockets
56
+ end
57
+
58
+ def update_nodes
59
+ sum = 0
60
+ @nodes.each do |node|
61
+ sum += node.update_closest_centroid(@centroids)
62
+ end
63
+ sum
64
+ end
65
+
66
+ def reposition_centroids
67
+ @centroids.each do |centroid|
68
+ nodes = []
69
+ @nodes.each {|n| nodes << n if n.closest_centroid == centroid}
70
+ centroid.reposition(nodes)
71
+ end
72
+ end
73
+
74
+ def verbose_message(message)
75
+ puts message if @verbose
76
+ end
77
+
78
+ end
data/lib/node.rb ADDED
@@ -0,0 +1,44 @@
1
+ class Node
2
+
3
+ class << self
4
+ def create_nodes(data)
5
+ nodes = []
6
+ data.each do |position|
7
+ nodes << new(position)
8
+ end
9
+ nodes
10
+ end
11
+ end
12
+
13
+ attr_accessor :position, :best_distance, :closest_centroid
14
+
15
+ def initialize(position)
16
+ @position = position
17
+ end
18
+
19
+ def update_closest_centroid(centroids)
20
+ calculate_initial_centroid(centroids.first) unless @closest_centroid
21
+ updated = false
22
+ centroids.each do |centroid|
23
+ distance = calculate_distance(centroid)
24
+ if distance < best_distance
25
+ updated = true
26
+ @closest_centroid = centroid
27
+ @best_distance = distance
28
+ end
29
+ end
30
+ updated == true ? 1 : 0
31
+ end
32
+
33
+ private
34
+
35
+ def calculate_initial_centroid(centroid)
36
+ @closest_centroid = centroid
37
+ @best_distance = calculate_distance(centroid)
38
+ end
39
+
40
+ def calculate_distance(centroid)
41
+ @position.euclidean_distance(centroid.position)
42
+ end
43
+
44
+ end
@@ -0,0 +1,12 @@
1
+ require File.dirname(__FILE__) + '/../lib/k_means'
2
+ require 'rubygems'
3
+ require 'ruby-prof'
4
+
5
+ data = Array.new(100) {Array.new(2) {rand}}
6
+
7
+ result = RubyProf.profile do
8
+ a = KMeans.new(data)
9
+ end
10
+
11
+ printer = RubyProf::FlatPrinter.new(result)
12
+ printer.print(STDOUT, 0)
@@ -0,0 +1,11 @@
1
+ require 'helper'
2
+
3
+ class TestEnumerable < Test::Unit::TestCase
4
+ context "Euclidean Distance" do
5
+
6
+ should "return 5" do
7
+ assert_equal 5, [10].euclidean_distance([5])
8
+ end
9
+
10
+ end
11
+ end
@@ -0,0 +1,18 @@
1
+ require 'helper'
2
+
3
+ class TestObject < Test::Unit::TestCase
4
+ context "Random Number Between" do
5
+
6
+ should "return a number between 10 and 20" do
7
+ n = rand_between(10, 20)
8
+ assert_between(10..20, n)
9
+ end
10
+
11
+ should "return a float between 10.0 and 10.9" do
12
+ n = rand_between(10.0, 10.9)
13
+ assert_between(10..11, n)
14
+ assert_kind_of Float, n
15
+ end
16
+
17
+ end
18
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,13 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'k_means'
8
+
9
+ class Test::Unit::TestCase
10
+ def assert_between(range, n)
11
+ assert range === n
12
+ end
13
+ end
@@ -0,0 +1,45 @@
1
+ require 'helper'
2
+
3
+ class TestCentroid < Test::Unit::TestCase
4
+ context "A Centroid" do
5
+
6
+ setup do
7
+ @centroid = Centroid.new([1, 2, 3])
8
+ end
9
+
10
+ should "return an array" do
11
+ assert_kind_of Array, @centroid.position
12
+ end
13
+
14
+ should "return an array of centroids" do
15
+ centroids = Centroid.create_centroids(4, create_nodes)
16
+ assert_kind_of Array, centroids
17
+ assert_kind_of Centroid, centroids.first
18
+ end
19
+
20
+ should "create 4 centroids" do
21
+ centroids = Centroid.create_centroids(4, create_nodes)
22
+ assert_equal 4, centroids.size
23
+ end
24
+
25
+ should "reposition nodes" do
26
+ nodes = create_nodes
27
+ average_position = [0.0] * nodes[0].position.size
28
+ nodes.each do |node|
29
+ node.position.each_with_index do |position, index|
30
+ average_position[index] += position
31
+ end
32
+ end
33
+ average_position.map! {|x| x / 2}
34
+ @centroid.reposition(create_nodes)
35
+ assert_equal average_position, @centroid.position
36
+ end
37
+
38
+ end
39
+
40
+ private
41
+
42
+ def create_nodes
43
+ Node.create_nodes([[1,2,3], [4,5,6]])
44
+ end
45
+ end
@@ -0,0 +1,24 @@
1
+ require 'helper'
2
+
3
+ class TestKMeans < Test::Unit::TestCase
4
+ context "A KMeans Instance" do
5
+
6
+ setup do
7
+ @data = Array.new(200) {Array.new(2) {rand}}
8
+ @kmeans = KMeans.new(@data, :centroids => 2)
9
+ end
10
+
11
+ should "return an array" do
12
+ assert_kind_of String, @kmeans.inspect
13
+ end
14
+
15
+ should "have 2 centroids" do
16
+ assert_equal 2, @kmeans.centroids.size
17
+ end
18
+
19
+ should "have 200 nodes" do
20
+ assert_equal 200, @kmeans.nodes.size
21
+ end
22
+
23
+ end
24
+ end
data/test/test_node.rb ADDED
@@ -0,0 +1,43 @@
1
+ require 'helper'
2
+
3
+ class TestNode < Test::Unit::TestCase
4
+ context "A Data Instance" do
5
+
6
+ setup do
7
+ @node = Node.new([4, 4])
8
+ end
9
+
10
+ should "return an array" do
11
+ assert_kind_of Array, @node.position
12
+ end
13
+
14
+ should "create an array of nodes" do
15
+ data = Array.new(10) {Array.new(2) {rand}}
16
+ nodes = Node.create_nodes(data)
17
+ assert_kind_of Array, nodes
18
+ end
19
+
20
+ should "create 10 nodes" do
21
+ data = Array.new(10) {Array.new(2) {rand}}
22
+ nodes = Node.create_nodes(data)
23
+ assert_equal 10, nodes.size
24
+ end
25
+
26
+ should "initialize closest centroid" do
27
+ a = @node.closest_centroid
28
+ centroids = [Centroid.new([4, 4]), Centroid.new([5, 4])]
29
+ @node.update_closest_centroid(centroids)
30
+ assert_not_equal nil, @node.closest_centroid
31
+ end
32
+
33
+ should "update closest centroid" do
34
+ centroids = [Centroid.new([5, 4])]
35
+ @node.update_closest_centroid(centroids)
36
+ a = @node.closest_centroid
37
+ @node.update_closest_centroid([Centroid.new([4,4])])
38
+ assert_not_equal a, @node.closest_centroid
39
+ assert_equal 0, @node.best_distance
40
+ end
41
+
42
+ end
43
+ end
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: k_means
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - reddavis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-11-25 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Attempting to create a fast, memory efficient KMeans
17
+ email: reddavis@gmail.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ - README.rdoc
25
+ files:
26
+ - .document
27
+ - .gitignore
28
+ - LICENSE
29
+ - README.rdoc
30
+ - Rakefile
31
+ - VERSION
32
+ - benchmark/benchmark_ai4r.rb
33
+ - k_means.gemspec
34
+ - lib/centroid.rb
35
+ - lib/ext/enumerable.rb
36
+ - lib/ext/object.rb
37
+ - lib/k_means.rb
38
+ - lib/node.rb
39
+ - profiling/profile.rb
40
+ - test/ext/test_enumerable.rb
41
+ - test/ext/test_object.rb
42
+ - test/helper.rb
43
+ - test/test_centroid.rb
44
+ - test/test_k_means.rb
45
+ - test/test_node.rb
46
+ has_rdoc: true
47
+ homepage: http://github.com/reddavis/k_means
48
+ licenses: []
49
+
50
+ post_install_message:
51
+ rdoc_options:
52
+ - --charset=UTF-8
53
+ require_paths:
54
+ - lib
55
+ required_ruby_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: "0"
60
+ version:
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: "0"
66
+ version:
67
+ requirements: []
68
+
69
+ rubyforge_project:
70
+ rubygems_version: 1.3.5
71
+ signing_key:
72
+ specification_version: 3
73
+ summary: K Means algorithm
74
+ test_files:
75
+ - test/ext/test_enumerable.rb
76
+ - test/ext/test_object.rb
77
+ - test/helper.rb
78
+ - test/test_centroid.rb
79
+ - test/test_k_means.rb
80
+ - test/test_node.rb