reddavis-k_means 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 reddavis
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,18 @@
1
+ = k_means
2
+
3
+ An attempt to build a fast, memory-efficient k-means clustering implementation.
4
+
5
+ == Note on Patches/Pull Requests
6
+
7
+ * Fork the project.
8
+ * Make your feature addition or bug fix.
9
+ * Add tests for it. This is important so I don't break it in a
10
+ future version unintentionally.
11
+ * Commit, do not mess with rakefile, version, or history.
12
+ (if you want to have your own version, that is fine but
13
+ bump version in a commit by itself I can ignore when I pull)
14
+ * Send me a pull request. Bonus points for topic branches.
15
+
16
+ == Copyright
17
+
18
+ Copyright (c) 2009 reddavis. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,57 @@
1
require 'rubygems'
require 'rake'

# Gem packaging tasks. Jeweler is optional: when it is missing we only print a
# hint so that the remaining tasks (test, rcov, rdoc) still load.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "k_means"
    gem.summary = %Q{K Means algorithm}
    gem.description = %Q{Attempting to create a fast memory efficient KMeans}
    gem.email = "reddavis@gmail.com"
    gem.homepage = "http://github.com/reddavis/k_means"
    gem.authors = ["reddavis"]
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end

rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# The test files shipped with this gem are named test_*.rb
# (test/test_k_means.rb, test/ext/test_enumerable.rb), so the pattern has to
# match that prefix; a '*_test.rb' pattern would select no files at all.
test_file_pattern = 'test/**/test_*.rb'

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = test_file_pattern
  test.verbose = true
end

# Coverage task. RCov is optional: without it, `rake rcov` aborts with a hint.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = test_file_pattern
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :default => :test

# Prefer the task class shipped with RDoc; fall back to the legacy Rake one
# for old installations where 'rdoc/task' is not available.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # Strip the trailing newline so it does not end up in the page title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "k_means #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,25 @@
1
require 'benchmark'
require 'rubygems'
require 'ai4r'
require File.join(File.dirname(__FILE__), '..', 'lib', 'k_means')

# 200 rows of 50 random integers in 0..9, shared by both implementations.
data = Array.new(200) { Array.new(50) { rand(10) } }

puts data.inspect

ai4r_data = Ai4r::Data::DataSet.new(:data_items => data)

# Clustering starts from random centroids, so a single run is not
# representative; repeat each implementation this many times.
runs = 2

Benchmark.bm do |bench|
  bench.report('Mine') do
    clusterer = KMeans.new(4)
    runs.times { clusterer.clustify(data) }
  end

  bench.report("Ai4R") do
    clusterer = Ai4r::Clusterers::KMeans.new
    runs.times { clusterer.build(ai4r_data, 4) }
  end
end
data/k_means.gemspec ADDED
@@ -0,0 +1,56 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
4
+ # -*- encoding: utf-8 -*-
5
+
6
Gem::Specification.new do |s|
  s.name = %q{k_means}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["reddavis"]
  s.date = %q{2009-08-15}
  s.description = %q{Attempting to create a fast memory efficient KMeans}
  s.email = %q{reddavis@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "benchmark/benchmark.rb",
    "k_means.gemspec",
    "lib/basic_cache_store.rb",
    "lib/ext/enumerable.rb",
    "lib/k_means.rb",
    "profiling/profile.rb",
    "test/ext/test_enumerable.rb",
    "test/helper.rb",
    "test/test_k_means.rb"
  ]
  s.homepage = %q{http://github.com/reddavis/k_means}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{K Means algorithm}
  s.test_files = [
    "test/ext/test_enumerable.rb",
    "test/helper.rb",
    "test/test_k_means.rb"
  ]

  # Only declare the specification format where RubyGems supports the setter.
  # The gem has no dependencies, so the former RubyGems-version branch (both
  # arms empty, and reading a constant newer RubyGems no longer defines) is
  # not needed.
  s.specification_version = 3 if s.respond_to? :specification_version
end
@@ -0,0 +1,15 @@
1
# Minimal in-memory key/value store backed by a plain Hash.
class BasicCacheStore
  # Starts with an empty store.
  def initialize
    @store = Hash.new
  end

  # Saves +data+ under +key+, replacing any previous entry, and returns +data+.
  def set(key, data)
    @store.store(key, data)
  end

  # Returns the entry saved under +key+, or nil when there is none.
  def get(key)
    @store.fetch(key, nil)
  end
end
@@ -0,0 +1,10 @@
1
module Enumerable
  # Euclidean (straight-line) distance between this collection and +other+.
  #
  # Each element of the receiver is paired with the element of +other+ at the
  # same position, so +other+ must respond to +[]+ with an integer index and
  # hold at least as many numeric elements as the receiver. Extra elements in
  # +other+ are ignored.
  #
  # Iterates with +each_with_index+ (available on every Enumerable) rather
  # than the Array-only +each_index+ / +[]+, so the method works on any
  # receiver that mixes in this module, e.g. a Range.
  #
  # Returns a Float; an empty receiver yields 0.0.
  def euclidean_distance(other)
    sum = 0.0
    each_with_index do |value, i|
      sum += (value - other[i])**2
    end
    Math.sqrt(sum)
  end
end
10
+
data/lib/k_means.rb ADDED
@@ -0,0 +1,128 @@
1
+ $: << File.dirname(__FILE__)
2
+ require 'rubygems'
3
+ require 'basic_cache_store'
4
+ require 'ext/enumerable'
5
+
6
# K-means clustering over a list of equal-length numeric rows.
#
# Rows are compared with Enumerable#euclidean_distance (defined in
# ext/enumerable). Centroids start at random positions inside the bounding
# box of the data, and the assign/update loop runs until the assignments stop
# changing or the iteration limit is reached.
class KMeans

  # Iteration limit used when the :max_iterations option is not given.
  DEFAULT_MAX_ITERATIONS = 100

  # k       - number of clusters to produce (default 4).
  # options - Hash of optional settings:
  #           :verbose        - pass exactly +true+ to print the iteration
  #                             number on every pass.
  #           :max_iterations - upper bound on assign/update passes
  #                             (default DEFAULT_MAX_ITERATIONS).
  def initialize(k=4, options={})
    @k = k
    @verbose = options[:verbose] == true
    @max_iterations = options.fetch(:max_iterations, DEFAULT_MAX_ITERATIONS)
    @last_matches = nil
  end

  # Clusters +data+, an Array of rows that all hold the same number of
  # numeric values.
  #
  # Returns an Array of k Arrays; the i-th inner Array lists the indexes into
  # +data+ of the rows assigned to cluster i. A cluster may be empty. Empty
  # +data+ yields k empty clusters.
  def clustify(data)
    @data = data
    # Forget the previous run so its result cannot influence this one.
    @last_matches = nil
    @best_matches = create_best_matches_array
    return @best_matches if @data.empty?

    place_centroids
    perform_cluster_process
    @best_matches
  end

  private

  # Repeats "assign every row to its nearest centroid, then move the
  # centroids" until the assignment is stable or the limit is hit.
  def perform_cluster_process
    @max_iterations.times do |t|
      verbose_message("Iteration #{t}")

      @best_matches = create_best_matches_array
      @data.each_with_index do |row, index|
        @best_matches[closest_centroid_index(row)] << index
      end

      # Same assignment as last pass means the centroids would not move again.
      break if @last_matches == @best_matches
      @last_matches = @best_matches

      reposition_centroids
    end
  end

  # Index of the centroid nearest to +row+; ties go to the lowest index.
  # The best distance so far is kept in a local so that each centroid's
  # distance is computed exactly once per row.
  def closest_centroid_index(row)
    best_match = 0
    best_distance = row.euclidean_distance(@centroids[0])

    (1...@k).each do |i|
      distance = row.euclidean_distance(@centroids[i])
      if distance < best_distance
        best_match = i
        best_distance = distance
      end
    end

    best_match
  end

  # Moves every centroid to the mean of the rows currently assigned to it.
  # A centroid with no rows keeps its position.
  def reposition_centroids
    dimensions = @data.first.size

    @best_matches.each_with_index do |members, i|
      next if members.empty?

      sums = Array.new(dimensions, 0.0)
      members.each do |data_index|
        @data[data_index].each_with_index do |value, d|
          sums[d] += value
        end
      end

      # map builds the divided values; dividing a block variable inside
      # #each would leave the sums untouched.
      @centroids[i] = sums.map { |sum| sum / members.size }
    end
  end

  # Creates exactly k centroids, each coordinate drawn uniformly from the
  # range the data covers in that dimension.
  def place_centroids
    ranges = create_ranges
    @centroids = Array.new(@k) do
      ranges.map { |min, max| min + rand * (max - min) }
    end
  end

  # Returns one [min, max] pair per dimension (column) of the data.
  def create_ranges
    dimensions = @data.first.size
    Array.new(dimensions) do |d|
      column = @data.map { |row| row[d] }
      [column.min, column.max]
    end
  end

  # Prints +message+ when verbose mode is on.
  def verbose_message(message)
    puts message if @verbose
  end

  # Returns k distinct empty Arrays, one per cluster.
  def create_best_matches_array
    Array.new(@k) { [] }
  end

end
@@ -0,0 +1,13 @@
1
require File.join(File.dirname(__FILE__), '..', 'lib', 'k_means')
require 'rubygems'
require 'ruby-prof'

# 100 random two-dimensional points.
points = Array.new(100) { Array.new(2) { rand } }

# Profile one complete clustering run with four clusters.
profile = RubyProf.profile { KMeans.new(4).clustify(points) }

# Write the call graph to standard output.
RubyProf::GraphPrinter.new(profile).print(STDOUT, 0)
@@ -0,0 +1,11 @@
1
+ require 'helper'
2
+
3
# Tests for the Enumerable#euclidean_distance extension.
class TestEnumerable < Test::Unit::TestCase
  context "Euclidean Distance" do

    should "return 5" do
      assert_equal 5, [10].euclidean_distance([5])
    end

    # 3-4-5 right triangle: exercises the sum over more than one dimension.
    should "combine every dimension" do
      assert_in_delta 5.0, [0, 0].euclidean_distance([3, 4]), 1e-9
    end

    should "return 0 for identical points" do
      assert_in_delta 0.0, [1.5, 2.5].euclidean_distance([1.5, 2.5]), 1e-9
    end

    # The method lives on Enumerable, so it must not rely on Array-only calls.
    should "work for a non-Array receiver" do
      assert_in_delta 2.0, (1..3).euclidean_distance([1, 2, 5]), 1e-9
    end

  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'k_means'
8
+
9
+ class Test::Unit::TestCase
10
+ end
@@ -0,0 +1,28 @@
1
+ require 'helper'
2
+
3
# Tests for the public KMeans#clustify contract.
class TestKMeans < Test::Unit::TestCase
  context "A KMeans Instance" do

    setup do
      @kmeans = KMeans.new(4)
      @data = Array.new(10) {Array.new(2) {rand}}
    end

    should "return an array" do
      assert_kind_of Array, @kmeans.clustify(@data)
    end

    should "have 4 centroids" do
      centroids = @kmeans.clustify(@data).size
      assert_equal(4, centroids)
    end

    should "return same amount of data that went in" do
      # The block's value is the next accumulator, so plain + is enough.
      output_data_count = @kmeans.clustify(@data).inject(0) do |sum, cluster|
        sum + cluster.size
      end
      assert_equal(@data.size, output_data_count)
    end

    # Stronger than the count check: no index may be dropped or duplicated.
    should "assign every data index to exactly one cluster" do
      assigned = @kmeans.clustify(@data).flatten.sort
      assert_equal((0...@data.size).to_a, assigned)
    end

    # The same instance must be reusable for a second run.
    should "cluster again on a second call" do
      @kmeans.clustify(@data)
      second = @kmeans.clustify(@data)
      assert_equal(4, second.size)
      assert_equal((0...@data.size).to_a, second.flatten.sort)
    end

  end
end
+ end
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: reddavis-k_means
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - reddavis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-08-15 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Attempting to create a fast memory efficient KMeans
17
+ email: reddavis@gmail.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ - README.rdoc
25
+ files:
26
+ - .document
27
+ - .gitignore
28
+ - LICENSE
29
+ - README.rdoc
30
+ - Rakefile
31
+ - VERSION
32
+ - benchmark/benchmark.rb
33
+ - k_means.gemspec
34
+ - lib/basic_cache_store.rb
35
+ - lib/ext/enumerable.rb
36
+ - lib/k_means.rb
37
+ - profiling/profile.rb
38
+ - test/ext/test_enumerable.rb
39
+ - test/helper.rb
40
+ - test/test_k_means.rb
41
+ has_rdoc: false
42
+ homepage: http://github.com/reddavis/k_means
43
+ licenses:
44
+ post_install_message:
45
+ rdoc_options:
46
+ - --charset=UTF-8
47
+ require_paths:
48
+ - lib
49
+ required_ruby_version: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: "0"
54
+ version:
55
+ required_rubygems_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: "0"
60
+ version:
61
+ requirements: []
62
+
63
+ rubyforge_project:
64
+ rubygems_version: 1.3.5
65
+ signing_key:
66
+ specification_version: 3
67
+ summary: K Means algorithm
68
+ test_files:
69
+ - test/ext/test_enumerable.rb
70
+ - test/helper.rb
71
+ - test/test_k_means.rb