reddavis-k_means 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 reddavis
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,18 @@
1
+ = k_means
2
+
3
+ An attempt at a fast, memory-efficient KMeans implementation.
4
+
5
+ == Note on Patches/Pull Requests
6
+
7
+ * Fork the project.
8
+ * Make your feature addition or bug fix.
9
+ * Add tests for it. This is important so I don't break it in a
10
+ future version unintentionally.
11
+ * Commit, do not mess with rakefile, version, or history.
12
+ (if you want to have your own version, that is fine but
13
+ bump version in a commit by itself I can ignore when I pull)
14
+ * Send me a pull request. Bonus points for topic branches.
15
+
16
+ == Copyright
17
+
18
+ Copyright (c) 2009 reddavis. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,57 @@
1
# Build, test and documentation tasks for the k_means gem.
require 'rubygems'
require 'rake'

# Packaging tasks are provided by Jeweler. When it is not installed the
# LoadError is reported and the remaining tasks are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "k_means"
    gem.summary = %Q{K Means algorithm}
    gem.description = %Q{Attempting to create a fast memory efficient KMeans}
    gem.email = "reddavis@gmail.com"
    gem.homepage = "http://github.com/reddavis/k_means"
    gem.authors = ["reddavis"]
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# The project's test files are named test_*.rb (test/test_k_means.rb,
# test/ext/test_enumerable.rb), so the glob has to match that prefix;
# a "*_test.rb" glob would select no files at all.
test_file_pattern = 'test/**/test_*.rb'

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = test_file_pattern
  test.verbose = true
end

# Coverage is optional: without rcov the :rcov task only explains how to
# install it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = test_file_pattern
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :default => :test

# Documentation task. 'rdoc/task' is tried first and the older
# 'rake/rdoctask' location is used as a fallback; when neither can be
# loaded the :rdoc task aborts with an installation hint instead of
# preventing the whole Rakefile from loading.
begin
  begin
    require 'rdoc/task'
    rdoc_task_class = RDoc::Task
  rescue LoadError
    require 'rake/rdoctask'
    rdoc_task_class = Rake::RDocTask
  end

  rdoc_task_class.new do |rdoc|
    # strip drops the trailing newline of the VERSION file so it does not
    # end up inside the generated title.
    version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

    rdoc.rdoc_dir = 'rdoc'
    rdoc.title = "k_means #{version}"
    rdoc.rdoc_files.include('README*')
    rdoc.rdoc_files.include('lib/**/*.rb')
  end
rescue LoadError
  task :rdoc do
    abort "RDoc is not available. In order to run rdoc, you must: sudo gem install rdoc"
  end
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,25 @@
1
# Times this gem's KMeans against the ai4r KMeans clusterer on the same
# randomly generated data set and prints a Benchmark report.
require 'benchmark'
require 'rubygems'
require 'ai4r'
require File.dirname(__FILE__) + '/../lib/k_means'

# 200 rows of 50 integers, each drawn with rand(10).
data = Array.new(200) { Array.new(50) { rand(10) } }

# The same rows wrapped in the data set type the ai4r clusterer is given.
ai4r_data = Ai4r::Data::DataSet.new(:data_items => data)

# Clustering can happen in magical ways
# so lets do it over multiple times
n = 2

# A label width is passed so the timing columns line up under the header.
# The raw data is intentionally not printed: 10,000 values on stdout would
# bury the report.
Benchmark.bm(5) do |x|
  x.report('Mine') do
    a = KMeans.new(4)
    n.times { a.clustify(data) }
  end
  x.report("Ai4R") do
    b = Ai4r::Clusterers::KMeans.new
    n.times { b.build(ai4r_data, 4) }
  end
end
data/k_means.gemspec ADDED
@@ -0,0 +1,56 @@
1
# Generated by jeweler
# DO NOT EDIT THIS FILE
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
# -*- encoding: utf-8 -*-

# Gem specification for k_means 0.0.1. The value of the block below (the
# last expression of this file) is the Gem::Specification itself.
Gem::Specification.new do |s|
  s.name = %q{k_means}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["reddavis"]
  s.date = %q{2009-08-15}
  s.description = %q{Attempting to create a fast memory efficient KMeans}
  s.email = %q{reddavis@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "benchmark/benchmark.rb",
    "k_means.gemspec",
    "lib/basic_cache_store.rb",
    "lib/ext/enumerable.rb",
    "lib/k_means.rb",
    "profiling/profile.rb",
    "test/ext/test_enumerable.rb",
    "test/helper.rb",
    "test/test_k_means.rb"
  ]
  s.homepage = %q{http://github.com/reddavis/k_means}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{K Means algorithm}
  s.test_files = [
    "test/ext/test_enumerable.rb",
    "test/helper.rb",
    "test/test_k_means.rb"
  ]

  # The gem declares no dependencies, so no RubyGems-version-specific
  # dependency branches are needed; only the specification format version
  # is set, guarded for RubyGems releases that lack the attribute.
  s.specification_version = 3 if s.respond_to? :specification_version=
end
@@ -0,0 +1,15 @@
1
# A minimal in-memory key/value cache backed by a single Hash.
class BasicCacheStore
  def initialize
    @store = Hash.new
  end

  # Stores +data+ under +key+, replacing any previous entry.
  # Returns +data+.
  def set(key, data)
    @store.store(key, data)
  end

  # Returns the value stored under +key+, or nil when there is none.
  def get(key)
    @store.fetch(key, nil)
  end
end
@@ -0,0 +1,10 @@
1
module Enumerable
  # Euclidean distance between this collection and +other+, both read as
  # coordinate vectors.
  #
  # The receiver is walked with each_with_index, so any Enumerable works
  # (Array, Range, ...), not only classes that implement each_index and [].
  #
  # other - object answering [] with an Integer position; the element at
  #         each position is subtracted from the receiver's element at the
  #         same position. Positions of +other+ beyond the receiver's last
  #         element are never read.
  #
  # Returns the distance as a Float.
  def euclidean_distance(other)
    sum = 0.0
    each_with_index do |value, i|
      sum += (value - other[i])**2
    end
    Math.sqrt(sum)
  end
end
10
+
data/lib/k_means.rb ADDED
@@ -0,0 +1,128 @@
1
+ $: << File.dirname(__FILE__)
2
+ require 'rubygems'
3
+ require 'basic_cache_store'
4
+ require 'ext/enumerable'
5
+
6
# K-means clustering of equal-length numeric rows.
#
#   clusters = KMeans.new(2).clustify([[1, 1], [1, 2], [9, 9]])
#   # clusters holds 2 arrays; each contains the indices (into the input)
#   # of the rows assigned to that centroid.
#
# Rows must answer +euclidean_distance+ (provided by ext/enumerable),
# +each_with_index+ and +[]+.
class KMeans

  # Iteration cap used when the :max_iterations option is not given.
  DEFAULT_MAX_ITERATIONS = 100

  # Centroid coordinates after the most recent #clustify call: an array of
  # k arrays of Floats (empty when the data was empty).
  attr_reader :centroids

  # k       - number of clusters; must be a positive Integer.
  # options - Hash:
  #           :verbose        - pass true to print the iteration number
  #                             on every pass.
  #           :max_iterations - upper bound on assignment passes
  #                             (default 100).
  #
  # Raises ArgumentError when k is not a positive Integer.
  def initialize(k=4, options={})
    unless k.is_a?(Integer) && k > 0
      raise ArgumentError, "k must be a positive Integer, got #{k.inspect}"
    end

    @k = k
    @verbose = options[:verbose] == true ? true : nil
    @max_iterations = options.fetch(:max_iterations, DEFAULT_MAX_ITERATIONS)
    @last_matches = nil
  end

  # Clusters +data+ (an Array of rows).
  #
  # Returns an Array of k Arrays; the i-th one lists the indices of the
  # rows whose closest centroid is centroid i. Every row index appears
  # exactly once. Empty +data+ yields k empty arrays.
  def clustify(data)
    @data = data
    # Forget the assignment of any earlier run so it cannot end this run
    # prematurely.
    @last_matches = nil
    @best_matches = create_best_matches_array

    if @data.empty?
      @centroids = []
      return @best_matches
    end

    place_centroids
    perform_cluster_process
    @best_matches
  end

  private

  # Alternates between assigning rows to their closest centroid and moving
  # the centroids, until the assignment repeats or the iteration cap is hit.
  def perform_cluster_process
    @max_iterations.times do |t|
      verbose_message("Iteration #{t}")

      @best_matches = create_best_matches_array
      @data.each_with_index do |row, index|
        @best_matches[closest_centroid_index(row)] << index
      end

      # Stop the loop if the assignment (and so the centroids) stopped changing
      break if @last_matches == @best_matches
      @last_matches = @best_matches

      reposition_centroids
    end
  end

  # Index of the centroid nearest to +row+. On a tie the lowest index wins
  # because only a strictly smaller distance replaces the current best.
  def closest_centroid_index(row)
    best_match = 0
    best_distance = row.euclidean_distance(@centroids[0])

    (1...@k).each do |i|
      distance = row.euclidean_distance(@centroids[i])
      if distance < best_distance
        best_match = i
        best_distance = distance
      end
    end

    best_match
  end

  # Move each centroid to the mean of the rows assigned to it. A centroid
  # without any assigned rows is left where it is.
  def reposition_centroids
    dimensions = @data.first.size

    @best_matches.each_with_index do |members, i|
      next if members.empty?

      sums = Array.new(dimensions, 0.0)
      members.each do |data_index|
        @data[data_index].each_with_index do |value, d|
          sums[d] += value
        end
      end

      # map builds the divided values; dividing the block variable of
      # each would only change a local copy and leave the sums in place.
      @centroids[i] = sums.map { |sum| sum / members.size }
    end
  end

  # Create exactly k centroids, each coordinate drawn at random between the
  # minimum and maximum of that dimension in the data.
  def place_centroids
    ranges = create_ranges
    @centroids = Array.new(@k) do
      ranges.map { |min, max| rand * (max - min) + min }
    end
  end

  # [min, max] of every dimension (column) of the data.
  def create_ranges
    dimensions = @data.first.size
    Array.new(dimensions) do |d|
      @data.map { |row| row[d] }.minmax
    end
  end

  def verbose_message(message)
    puts message if @verbose
  end

  # k empty arrays, one per centroid.
  def create_best_matches_array
    Array.new(@k) { [] }
  end

end
@@ -0,0 +1,13 @@
1
# Profiles one KMeans#clustify run on random data with ruby-prof and writes
# the graph report to STDOUT.
require File.dirname(__FILE__) + '/../lib/k_means'
require 'rubygems'
require 'ruby-prof'

# 100 rows of 2 random floats.
data = Array.new(100) do
  Array.new(2) { rand }
end

result = RubyProf.profile { KMeans.new(4).clustify(data) }

RubyProf::GraphPrinter.new(result).print(STDOUT, 0)
@@ -0,0 +1,11 @@
1
require 'helper'

# Tests for the euclidean_distance extension on Enumerable.
class TestEnumerable < Test::Unit::TestCase
  context "Euclidean Distance" do
    should "return 5" do
      distance = [10].euclidean_distance([5])
      assert_equal 5, distance
    end
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,10 @@
1
# Shared setup for the test suite: loads the test libraries, puts lib/ and
# test/ at the front of the load path, then loads the gem itself.
require 'rubygems'
require 'test/unit'
require 'shoulda'

$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))

require 'k_means'

# Reopened without additions; a place for helpers shared by all test cases.
class Test::Unit::TestCase; end
@@ -0,0 +1,28 @@
1
require 'helper'

# Tests of the result returned by KMeans#clustify.
class TestKMeans < Test::Unit::TestCase
  context "A KMeans Instance" do
    # A 4-cluster instance and 10 rows of 2 random floats for every test.
    setup do
      @kmeans = KMeans.new(4)
      @data = Array.new(10) { Array.new(2) { rand } }
    end

    should "return an array" do
      result = @kmeans.clustify(@data)
      assert_kind_of Array, result
    end

    should "have 4 centroids" do
      result = @kmeans.clustify(@data)
      assert_equal(4, result.size)
    end

    should "return same amount of data that went in" do
      clusters = @kmeans.clustify(@data)
      output_data_count = clusters.inject(0) { |sum, cluster| sum + cluster.size }
      assert_equal(@data.size, output_data_count)
    end
  end
end
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: reddavis-k_means
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - reddavis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-08-15 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Attempting to create a fast memory efficient KMeans
17
+ email: reddavis@gmail.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ - README.rdoc
25
+ files:
26
+ - .document
27
+ - .gitignore
28
+ - LICENSE
29
+ - README.rdoc
30
+ - Rakefile
31
+ - VERSION
32
+ - benchmark/benchmark.rb
33
+ - k_means.gemspec
34
+ - lib/basic_cache_store.rb
35
+ - lib/ext/enumerable.rb
36
+ - lib/k_means.rb
37
+ - profiling/profile.rb
38
+ - test/ext/test_enumerable.rb
39
+ - test/helper.rb
40
+ - test/test_k_means.rb
41
+ has_rdoc: false
42
+ homepage: http://github.com/reddavis/k_means
43
+ licenses:
44
+ post_install_message:
45
+ rdoc_options:
46
+ - --charset=UTF-8
47
+ require_paths:
48
+ - lib
49
+ required_ruby_version: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: "0"
54
+ version:
55
+ required_rubygems_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: "0"
60
+ version:
61
+ requirements: []
62
+
63
+ rubyforge_project:
64
+ rubygems_version: 1.3.5
65
+ signing_key:
66
+ specification_version: 3
67
+ summary: K Means algorithm
68
+ test_files:
69
+ - test/ext/test_enumerable.rb
70
+ - test/helper.rb
71
+ - test/test_k_means.rb