reddavis-k_means 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +5 -0
- data/LICENSE +20 -0
- data/README.rdoc +18 -0
- data/Rakefile +57 -0
- data/VERSION +1 -0
- data/benchmark/benchmark.rb +25 -0
- data/k_means.gemspec +56 -0
- data/lib/basic_cache_store.rb +15 -0
- data/lib/ext/enumerable.rb +10 -0
- data/lib/k_means.rb +128 -0
- data/profiling/profile.rb +13 -0
- data/test/ext/test_enumerable.rb +11 -0
- data/test/helper.rb +10 -0
- data/test/test_k_means.rb +28 -0
- metadata +71 -0
data/.document
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 reddavis
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
= k_means
|
2
|
+
|
3
|
+
Attempting to build a memory efficient fast KMeans.
|
4
|
+
|
5
|
+
== Note on Patches/Pull Requests
|
6
|
+
|
7
|
+
* Fork the project.
|
8
|
+
* Make your feature addition or bug fix.
|
9
|
+
* Add tests for it. This is important so I don't break it in a
|
10
|
+
future version unintentionally.
|
11
|
+
* Commit, do not mess with rakefile, version, or history.
|
12
|
+
(if you want to have your own version, that is fine but
|
13
|
+
bump version in a commit by itself I can ignore when I pull)
|
14
|
+
* Send me a pull request. Bonus points for topic branches.
|
15
|
+
|
16
|
+
== Copyright
|
17
|
+
|
18
|
+
Copyright (c) 2009 reddavis. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
# Build, test, coverage and documentation tasks for the k_means gem.
require 'rubygems'
require 'rake'

# Gem packaging tasks. Jeweler is optional: when it cannot be loaded a hint is
# printed and the remaining tasks are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "k_means"
    gem.summary = %Q{K Means algorithm}
    gem.description = %Q{Attempting to create a fast memory efficient KMeans}
    gem.email = "reddavis@gmail.com"
    gem.homepage = "http://github.com/reddavis/k_means"
    gem.authors = ["reddavis"]
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end

rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  # The test files shipped with this gem are named test_*.rb
  # (test/test_k_means.rb, test/ext/test_enumerable.rb), so the pattern has to
  # match that prefix; '*_test.rb' matched none of them.
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task. When rcov is missing, a stub task explains how to install it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    # Same naming convention as the :test task above.
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end




task :default => :test

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Use the contents of the VERSION file in the documentation title when present.
  if File.exist?('VERSION')
    version = File.read('VERSION')
  else
    version = ""
  end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "k_means #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Benchmark script: times this library's KMeans against Ai4r's KMeans
# clusterer on the same randomly generated data set.
require 'benchmark'
require 'rubygems'
require 'ai4r'
require File.join(File.dirname(__FILE__), '..', 'lib', 'k_means')

# 200 rows of 50 random integers in 0..9.
points = Array.new(200) do
  Array.new(50) { rand(10) }
end

puts points.inspect

ai4r_points = Ai4r::Data::DataSet.new(:data_items => points)

# Clustering can happen in magical ways
# so lets do it over multiple times
runs = 2

Benchmark.bm do |bench|
  bench.report('Mine') do
    mine = KMeans.new(4)
    runs.times do
      mine.clustify(points)
    end
  end

  bench.report("Ai4R") do
    theirs = Ai4r::Clusterers::KMeans.new
    runs.times do
      theirs.build(ai4r_points, 4)
    end
  end
end
|
data/k_means.gemspec
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
# Gem specification for k_means 0.0.1 (no runtime or development dependencies).
Gem::Specification.new do |s|
  s.name = %q{k_means}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["reddavis"]
  s.date = %q{2009-08-15}
  s.description = %q{Attempting to create a fast memory efficient KMeans}
  s.email = %q{reddavis@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "benchmark/benchmark.rb",
    "k_means.gemspec",
    "lib/basic_cache_store.rb",
    "lib/ext/enumerable.rb",
    "lib/k_means.rb",
    "profiling/profile.rb",
    "test/ext/test_enumerable.rb",
    "test/helper.rb",
    "test/test_k_means.rb"
  ]
  s.homepage = %q{http://github.com/reddavis/k_means}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{K Means algorithm}
  s.test_files = [
    "test/ext/test_enumerable.rb",
    "test/helper.rb",
    "test/test_k_means.rb"
  ]

  # Declare the specification format when the running RubyGems supports it.
  # The generated template also carried an empty dependency conditional that
  # compared Gem::RubyGemsVersion against 1.2.0; it declared nothing and only
  # added a reference to a constant that later RubyGems releases do not
  # define, so it is omitted here.
  s.specification_version = 3 if s.respond_to? :specification_version
end
|
data/lib/k_means.rb
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
$: << File.dirname(__FILE__)
|
2
|
+
require 'rubygems'
|
3
|
+
require 'basic_cache_store'
|
4
|
+
require 'ext/enumerable'
|
5
|
+
|
6
|
+
# K-means clustering over an array of equally sized numeric rows.
#
# Each row of the data is one point; clusters are reported as arrays of row
# indices. Distances are computed with +euclidean_distance+, which this class
# expects the rows to respond to (the file requires 'ext/enumerable' for it).
class KMeans

  # k       - number of clusters (centroids) to produce. Defaults to 4.
  # options - Hash of options:
  #           :verbose        - when exactly +true+, prints the iteration
  #                             number on every pass.
  #           :max_iterations - upper bound on assignment/update passes per
  #                             clustify call. Defaults to 100.
  def initialize(k=4, options={})
    @k = k
    @verbose = options[:verbose] == true
    @max_iterations = options.fetch(:max_iterations, 100)
    @last_matches = nil
  end

  # Clusters +data+ and returns an Array of @k Arrays; the i-th inner Array
  # holds the indices (into +data+) of the rows assigned to centroid i.
  #
  # data - Array of rows, every row being an Array of numbers of equal length.
  #
  # Empty data yields @k empty clusters.
  def clustify(data)
    @data = data
    # Forget the assignment of any earlier run; otherwise the convergence
    # check could stop a fresh run on its very first pass.
    @last_matches = nil
    @best_matches = create_best_matches_array
    return @best_matches if @data.empty?

    place_centroids
    perform_cluster_process
    @best_matches
  end

  private

  # Index of the centroid nearest to +point+. Ties keep the lower index.
  def closest_centroid_index(point)
    best_match = 0
    best_distance = point.euclidean_distance(@centroids[0])

    1.upto(@k - 1) do |i|
      distance = point.euclidean_distance(@centroids[i])
      if distance < best_distance
        best_match = i
        best_distance = distance
      end
    end

    best_match
  end

  # Alternates between assigning every row to its nearest centroid and moving
  # the centroids, until the assignment stops changing or the iteration limit
  # is reached.
  def perform_cluster_process
    @max_iterations.times do |t|
      verbose_message("Iteration #{t}")

      # Prepare best matches array
      @best_matches = create_best_matches_array

      # See which centroid is closest to which data
      @data.each_with_index do |point, index|
        @best_matches[closest_centroid_index(point)] << index
      end

      # Stop the loop if centroids have stopped moving
      break if @last_matches == @best_matches
      @last_matches = @best_matches

      reposition_centroids
    end
  end

  # Move the centroids to the average of their surrounding data.
  # A centroid with no assigned rows is left where it is.
  def reposition_centroids
    dimensions = @data.first.size

    @k.times do |i|
      members = @best_matches[i]
      next if members.empty?

      # Sum every dimension over the rows in this cluster...
      sums = [0.0] * dimensions
      members.each do |data_index|
        @data[data_index].each_with_index do |value, dimension|
          sums[dimension] += value
        end
      end

      # ...then divide by the cluster size. The division has to build a new
      # array: reassigning a block parameter would leave the sums untouched.
      @centroids[i] = sums.map { |sum| sum / members.size }
    end
  end

  # Creates exactly @k centroids, each coordinate drawn uniformly from the
  # range the data covers in that dimension.
  def place_centroids
    ranges = create_ranges

    @centroids = Array.new(@k) do
      ranges.map { |min, max| rand * (max - min) + min }
    end
  end

  # Calculate the [min, max] range of the data for each dimension (column).
  def create_ranges
    dimensions = @data.first.size

    Array.new(dimensions) do |dimension|
      column = @data.map { |line| line[dimension] }
      [column.min, column.max]
    end
  end

  def verbose_message(message)
    puts message if @verbose
  end

  # One empty Array per centroid; each must be a distinct object because the
  # clusters are filled in place.
  def create_best_matches_array
    Array.new(@k) { [] }
  end

end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Profiling script: runs one KMeans clustering under ruby-prof and prints the
# graph report to standard output.
require File.join(File.dirname(__FILE__), '..', 'lib', 'k_means')
require 'rubygems'
require 'ruby-prof'

# 100 random two-dimensional points.
points = Array.new(100) do
  Array.new(2) { rand }
end

profile = RubyProf.profile do
  clusterer = KMeans.new(4)
  clusterer.clustify(points)
end

report = RubyProf::GraphPrinter.new(profile)
report.print(STDOUT, 0)
|
data/test/test_k_means.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Tests for KMeans#clustify, written with the context/should DSL
# (presumably shoulda, loaded through 'helper' -- confirm).
class TestKMeans < Test::Unit::TestCase
  context "A KMeans Instance" do

    # Fresh clusterer (k = 4) and ten random two-dimensional points per test.
    setup do
      @kmeans = KMeans.new(4)
      @data = Array.new(10) do
        Array.new(2) { rand }
      end
    end

    should "return an array" do
      clusters = @kmeans.clustify(@data)
      assert_kind_of Array, clusters
    end

    should "have 4 centroids" do
      clusters = @kmeans.clustify(@data)
      assert_equal(4, clusters.size)
    end

    should "return same amount of data that went in" do
      clusters = @kmeans.clustify(@data)
      # Total number of indices across all clusters.
      output_data_count = clusters.inject(0) { |total, cluster| total + cluster.size }
      assert_equal(@data.size, output_data_count)
    end

  end
end
|
metadata
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: reddavis-k_means
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- reddavis
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-08-15 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Attempting to create a fast memory efficient KMeans
|
17
|
+
email: reddavis@gmail.com
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- LICENSE
|
24
|
+
- README.rdoc
|
25
|
+
files:
|
26
|
+
- .document
|
27
|
+
- .gitignore
|
28
|
+
- LICENSE
|
29
|
+
- README.rdoc
|
30
|
+
- Rakefile
|
31
|
+
- VERSION
|
32
|
+
- benchmark/benchmark.rb
|
33
|
+
- k_means.gemspec
|
34
|
+
- lib/basic_cache_store.rb
|
35
|
+
- lib/ext/enumerable.rb
|
36
|
+
- lib/k_means.rb
|
37
|
+
- profiling/profile.rb
|
38
|
+
- test/ext/test_enumerable.rb
|
39
|
+
- test/helper.rb
|
40
|
+
- test/test_k_means.rb
|
41
|
+
has_rdoc: false
|
42
|
+
homepage: http://github.com/reddavis/k_means
|
43
|
+
licenses:
|
44
|
+
post_install_message:
|
45
|
+
rdoc_options:
|
46
|
+
- --charset=UTF-8
|
47
|
+
require_paths:
|
48
|
+
- lib
|
49
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: "0"
|
54
|
+
version:
|
55
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: "0"
|
60
|
+
version:
|
61
|
+
requirements: []
|
62
|
+
|
63
|
+
rubyforge_project:
|
64
|
+
rubygems_version: 1.3.5
|
65
|
+
signing_key:
|
66
|
+
specification_version: 3
|
67
|
+
summary: K Means algorithm
|
68
|
+
test_files:
|
69
|
+
- test/ext/test_enumerable.rb
|
70
|
+
- test/helper.rb
|
71
|
+
- test/test_k_means.rb
|