clustertool 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/clustertool.rb +48 -0
- data/lib/clustertool/DBSCAN.rb +89 -0
- data/lib/clustertool/KMeans.rb +83 -0
- data/lib/clustertool/Util.rb +46 -0
- metadata +50 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: dd947448c507995e271fcbb359f16a2c4d594b63
|
4
|
+
data.tar.gz: a472cae649f54082828137b49705a55ac6bf26ba
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 41e2fcc2d1c590028cf03c9a4032003bb5425f6398cf19789378bf7d530cf61e3a381b1b83467b8d562a347f188a21d821fc92244288405245d45371e9f3ce41
|
7
|
+
data.tar.gz: 01cf8185ed6e7ef39d61cebfe20b37f5c37ac76fe5521bece623c240afbf7ee1180a41a67759746ce5c350400886b9f447f0baee6e3628d9da37ffcd601a67c4
|
data/lib/clustertool.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require "sqlite3"
|
2
|
+
require_relative "clustertool/KMeans"
|
3
|
+
require_relative "clustertool/DBSCAN"
|
4
|
+
|
5
|
+
# The main ClusterTool driver.
class ClusterTool
  # Reads the requested columns of a table from an SQLite database and passes
  # the resulting rows to the selected clustering algorithm.
  #
  # Example (k-Means):
  #   >> args = {
  #   >>   :k => 3,
  #   >>   :cycles => 10,
  #   >> }
  #   >> ClusterTool.cluster("test.db", "TestSimple2D", ["Field1", "Field2"],
  #   >>                     :kmeans, args)
  #   => {0=>[[0, 0], [-1, 0], [1, 0], [0, -1], [0, 1]],
  #       1=>[[0, 10], [-1, 10], [1, 10], [0, 9], [0, 11]],
  #       2=>[[-10, 0], [-11, 0], [-9, 0], [-10, -1], [-10, 1]]}
  #
  # Example (DBSCAN):
  #   >> args = {
  #   >>   :epsilon => 1.5,
  #   >>   :min_points => 2,
  #   >> }
  #   >> ClusterTool.cluster("test.db", "TestSimple2D", ["Field1", "Field2"],
  #   >>                     :dbscan, args)
  #   => {0=>[[0, 0], [-1, 0], [1, 0], [0, -1], [0, 1]],
  #       1=>[[0, 10], [-1, 10], [1, 10], [0, 9], [0, 11]],
  #       2=>[[-10, 0], [-11, 0], [-9, 0], [-10, -1], [-10, 1]]}
  #
  # Arguments:
  #   database: (String) path of the SQLite database file
  #   table: (String) table to read from
  #   columns: [(String), ...] columns to select, one per dimension
  #   algorithm: :kmeans or :dbscan
  #   args: { (Symbol) => (Numeric), ... } forwarded unchanged to the algorithm
  #
  # Returns whatever the selected algorithm's +cluster+ method returns.
  #
  # Raises ArgumentError when +algorithm+ is neither :kmeans nor :dbscan.
  def self.cluster(database, table, columns, algorithm, args)
    # Resolve the algorithm first so that an unknown name fails fast, before
    # the database file is opened (opening may create a missing file).
    clusterer =
      case algorithm
      when :kmeans then KMeans
      when :dbscan then DBSCAN
      else raise ArgumentError, 'Unknown clustering algorithm'
      end

    clusterer.cluster(fetch_rows(database, table, columns), args)
  end

  # Runs the select statement and returns the rows, always closing the
  # database handle afterwards (even when the query raises).
  #
  # NOTE(review): +table+ and +columns+ are interpolated into the SQL text
  # because identifiers cannot be bound as parameters. Callers must not pass
  # untrusted values here.
  def self.fetch_rows(database, table, columns)
    db = SQLite3::Database.new(database)
    begin
      db.execute("select #{columns.join(", ")} from #{table};")
    ensure
      db.close
    end
  end
  private_class_method :fetch_rows
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require_relative "Util"
|
2
|
+
|
3
|
+
# Implements the DBSCAN clustering algorithm
class DBSCAN
  # Cluster data using the DBSCAN algorithm with arguments epsilon and
  # min_points provided in args.
  #
  # A point's neighbourhood is every point whose Euclidean distance to it is
  # strictly less than epsilon; for a positive epsilon this includes the point
  # itself, so min_points counts the point too. Points are tracked by their
  # position in +data+, so repeated rows are all kept in the result.
  #
  # Example:
  #   >> args = {
  #   >>   :epsilon => 1.5,
  #   >>   :min_points => 2,
  #   >> }
  #   >> DBSCAN.cluster([[1],[2],[3],[11],[12],[13],[21],[22],[23],[100]], args)
  #   => {0 => [[1],[2],[3]], 1 => [[11],[12],[13]], 2 => [[21],[22],[23]],
  #       :noise => [[100]]}
  #
  # Arguments:
  #   data: [[(Numeric), ...], ...]
  #   args: { (Symbol) => (Numeric), ... } must contain :epsilon and :min_points
  #
  # Returns a Hash mapping cluster numbers (0, 1, ...) and, when a point was
  # ever classified as noise, :noise to arrays of points. Looking up a key
  # that is not present yields an empty array.
  #
  # Raises ArgumentError when :epsilon or :min_points is missing.
  def self.cluster(data, args)
    unless args.include?(:epsilon)
      raise ArgumentError, "DBSCAN algorithm requires :epsilon argument"
    end
    unless args.include?(:min_points)
      raise ArgumentError, "DBSCAN algorithm requires :min_points argument"
    end

    @epsilon = args[:epsilon]
    @min_pts = args[:min_points]
    @data = data.to_a
    @visited = Array.new(@data.length, false)
    # Whether the point at each index already belongs to a numbered cluster.
    @assigned = Array.new(@data.length, false)
    # label => indices into @data, kept in insertion order.
    @members = {}
    @cnum = 0

    @data.each_index do |index|
      next if @visited[index]
      @visited[index] = true

      neighbors = region_query(index)
      if neighbors.length < @min_pts
        (@members[:noise] ||= []) << index
      else
        expand_cluster(index, neighbors)
        @cnum += 1
      end
    end

    # Translate indices back into points; missing keys read as [].
    @clusters = Hash.new { [] }
    @members.each do |label, indices|
      @clusters[label] = indices.map { |i| @data[i] }
    end
    @clusters
  end

  # Internal working state of the most recent run; not part of the public API.
  class << self
    private

    attr_accessor :data, :epsilon, :min_pts, :visited, :clusters, :cnum
  end

  # Grows the current cluster (@cnum) from the core point at index +seed+,
  # absorbing every point reachable through further core points.
  def self.expand_cluster(seed, neighbors)
    assign(seed)

    # An index-driven loop is required because +neighbors+ grows while it is
    # being walked.
    position = 0
    while position < neighbors.length
      candidate = neighbors[position]

      unless @visited[candidate]
        @visited[candidate] = true
        reachable = region_query(candidate)
        # Only core points extend the frontier; border points do not.
        neighbors |= reachable if reachable.length >= @min_pts
      end

      # A point first marked as noise may turn out to be a border point.
      noise = @members[:noise]
      noise.delete(candidate) if noise
      assign(candidate) unless @assigned[candidate]

      position += 1
    end
  end

  # Returns the indices of all points closer than epsilon to the point at
  # +index+.
  def self.region_query(index)
    center = @data[index]
    @data.each_index.select do |other|
      Util.dist(center, @data[other]) < @epsilon
    end
  end

  # Records the point at +index+ as a member of the current cluster.
  def self.assign(index)
    (@members[@cnum] ||= []) << index
    @assigned[index] = true
  end

  private_class_method :expand_cluster, :region_query, :assign
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require_relative "Util"
|
2
|
+
|
3
|
+
# Implementation of the k-Means clustering algorithm
class KMeans
  # Cluster data using the k-Means algorithm with k specified in args. The
  # algorithm is run multiple times (as defined in the :cycles argument), each
  # time from randomly chosen starting means, and the most common clustering
  # is returned.
  #
  # Example:
  #   >> args = {
  #   >>   :k => 3,
  #   >>   :cycles => 10,
  #   >> }
  #   >> KMeans.cluster([[1],[2],[3],[11],[12],[13],[21],[22],[23]], args)
  #   => {0 => [[1],[2],[3]], 1 => [[11],[12],[13]], 2 => [[21],[22],[23]]}
  #
  # Because the starting means are random, the clustering found can vary
  # between calls.
  #
  # Arguments:
  #   data: [[(Numeric), ...], ...]
  #   args: { (Symbol) => (Numeric), ... } must contain :k and :cycles
  #
  # Returns a Hash mapping 0, 1, ... to arrays of points; it is empty when no
  # cycle was run.
  #
  # Raises ArgumentError when :k or :cycles is missing.
  def self.cluster(data, args)
    unless args.include?(:k)
      raise ArgumentError, "KMeans algorithm requires :k argument"
    end
    @k = args[:k]

    unless args.include?(:cycles)
      raise ArgumentError, "KMeans algorithm requires :cycles argument"
    end
    cycles = args[:cycles]

    @data = data

    # Count how often each distinct partitioning is produced.
    tally = Hash.new(0)
    cycles.times { tally[kmeans] += 1 }

    # max_by yields nil for an empty tally (zero cycles).
    winner, = tally.max_by { |_partitioning, count| count }

    clusters = {}
    unless winner.nil?
      winner.values.each_with_index { |points, i| clusters[i] = points }
    end
    clusters
  end

  # Internal working state of the most recent run; not part of the public API.
  class << self
    private

    attr_accessor :data, :k
  end

  # Performs one complete k-Means run and returns a Hash that maps each final
  # mean to the points assigned to it.
  def self.kmeans
    # initialize partitions with Forgy method: select random items.
    means = @data.shuffle[0...@k]

    partitions = {}
    previous = ""
    # Iterate until an assignment step reproduces the previous partitions
    # exactly (means and members), compared through their printed form.
    until partitions.inspect.eql?(previous)
      previous = partitions.inspect
      partitions = Hash.new { |hash, mean| hash[mean] = [] }

      # assignment step: each point joins its nearest mean (first one wins on
      # a tie); the distance to each mean is computed once.
      @data.each do |point|
        nearest = means.min_by { |mean| Util.dist(point, mean) }
        partitions[nearest] << point
      end

      # update step
      means = partitions.each_value.map { |points| Util.centroid(points) }
    end

    partitions
  end
  private_class_method :kmeans
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# A few utility functions used by the clustering algorithms
class Util
  # Calculates the Euclidean Distance between two points of the same
  # dimension.
  #
  # Example:
  #   >> Util.dist([0,0],[3,4])
  #   => 5.0
  #
  # Arguments:
  #   point1: [(Numeric), ...]
  #   point2: [(Numeric), ...]
  #
  # Returns the distance as a Float.
  #
  # Raises ArgumentError when the points differ in length.
  def self.dist(point1, point2)
    # Start by making sure points have the same dimension
    unless point1.length == point2.length
      raise ArgumentError,
            "Can't calculate distance of points with different dimensions"
    end

    squared = point1.zip(point2).reduce(0) do |total, (a, b)|
      total + (a - b)**2
    end
    Math.sqrt(squared)
  end

  # Calculates the centroid of a set of points
  #
  # Example:
  #   >> Util.centroid([[0,0,0],[1,2,3],[2,4,0]])
  #   => [1.0, 2.0, 1.0]
  #
  # Arguments:
  #   points: [[(Numeric), ...], ...]
  #
  # Returns the per-dimension mean as an array of Floats, or an empty array
  # when +points+ is empty.
  #
  # Raises ArgumentError when the points do not all have the same length.
  def self.centroid(points)
    # Check boundary conditions and make sure each point has the same
    # dimension.
    return [] if points.empty?

    dim = points[0].length
    if points.any? { |p| p.length != dim }
      raise ArgumentError,
            "Can't calculate centroid of points with different dimensions"
    end

    # Transposing groups the values of each dimension together.
    points.transpose.map do |coordinates|
      coordinates.reduce(0) { |total, val| total + val }.to_f / points.length
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: clustertool
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- JD Nir
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-02-23 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: |-
|
14
|
+
Import a database and specify up to two columns of numerical
|
15
|
+
data as well as your choice of the K-Means or DBSCAN algorithms and their
|
16
|
+
respective parameters.
|
17
|
+
email: jnir@fastorientation.com
|
18
|
+
executables: []
|
19
|
+
extensions: []
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- lib/clustertool.rb
|
23
|
+
- lib/clustertool/DBSCAN.rb
|
24
|
+
- lib/clustertool/KMeans.rb
|
25
|
+
- lib/clustertool/Util.rb
|
26
|
+
homepage:
|
27
|
+
licenses:
|
28
|
+
- MIT
|
29
|
+
metadata: {}
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
require_paths:
|
33
|
+
- lib
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 2.4.5
|
47
|
+
signing_key:
|
48
|
+
specification_version: 4
|
49
|
+
summary: A tool for clustering database entries
|
50
|
+
test_files: []
|