clustertool 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/clustertool.rb +48 -0
- data/lib/clustertool/DBSCAN.rb +89 -0
- data/lib/clustertool/KMeans.rb +83 -0
- data/lib/clustertool/Util.rb +46 -0
- metadata +50 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: dd947448c507995e271fcbb359f16a2c4d594b63
|
4
|
+
data.tar.gz: a472cae649f54082828137b49705a55ac6bf26ba
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 41e2fcc2d1c590028cf03c9a4032003bb5425f6398cf19789378bf7d530cf61e3a381b1b83467b8d562a347f188a21d821fc92244288405245d45371e9f3ce41
|
7
|
+
data.tar.gz: 01cf8185ed6e7ef39d61cebfe20b37f5c37ac76fe5521bece623c240afbf7ee1180a41a67759746ce5c350400886b9f447f0baee6e3628d9da37ffcd601a67c4
|
data/lib/clustertool.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require "sqlite3"
|
2
|
+
require_relative "clustertool/KMeans"
|
3
|
+
require_relative "clustertool/DBSCAN"
|
4
|
+
|
5
|
+
# The main ClusterTool driver.
|
6
|
+
class ClusterTool
  # Reads the requested columns of a table from a SQLite database and passes
  # the resulting rows to the selected clustering algorithm.
  #
  # Example:
  #   >> args = {
  #   >>   :k => 3,
  #   >>   :cycles => 10,
  #   >> }
  #   >> ClusterTool.cluster("test.db", "TestSimple2D", ["Field1", "Field2"],
  #   >>                     :kmeans, args)
  #   => {0=>[[0, 0], [-1, 0], [1, 0], [0, -1], [0, 1]], 1=>[[0, 10], [-1,
  #      10], [1, 10], [0, 9], [0, 11]], 2=>[[-10, 0], [-11, 0], [-9, 0], [-10,
  #      -1], [-10, 1]]}
  #
  # Example:
  #   >> args = {
  #   >>   :epsilon => 1.5,
  #   >>   :min_points => 2,
  #   >> }
  #   >> ClusterTool.cluster("test.db", "TestSimple2D", ["Field1", "Field2"],
  #   >>                     :dbscan, args)
  #   => {0=>[[0, 0], [-1, 0], [1, 0], [0, -1], [0, 1]], 1=>[[0, 10], [-1,
  #      10], [1, 10], [0, 9], [0, 11]], 2=>[[-10, 0], [-11, 0], [-9, 0], [-10,
  #      -1], [-10, 1]]}
  #
  # Arguments:
  #   database: (String) path of the SQLite database file
  #   table: (String) name of the table to read
  #   columns: [(String), ...] names of the columns to read
  #   algorithm: :kmeans or :dbscan
  #   args: { (Symbol) => (Numeric), ... } algorithm-specific parameters
  #
  # Returns:
  #   Whatever the selected algorithm returns (a Hash of clusters).
  #
  # Raises:
  #   ArgumentError if algorithm is neither :kmeans nor :dbscan.
  #
  # NOTE(review): table and columns are interpolated directly into the SQL
  # text (identifiers cannot be bound as parameters). Only pass trusted
  # identifiers; never forward user input here unchecked.
  def self.cluster(database, table, columns, algorithm, args)
    db = SQLite3::Database.new(database)
    begin
      data = db.execute("select #{columns.join(", ")} from #{table};")
    ensure
      # Release the database handle even when the query raises.
      db.close
    end

    case algorithm
    when :kmeans then KMeans.cluster(data, args)
    when :dbscan then DBSCAN.cluster(data, args)
    else raise ArgumentError, 'Unknown clustering algorithm'
    end
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require_relative "Util"
|
2
|
+
|
3
|
+
# Implements the DBSCAN clustering algorithm
|
4
|
+
class DBSCAN
  # Cluster data using the DBSCAN algorithm with arguments epsilon and
  # min_points provided in args.
  #
  # A point's neighborhood is every point (itself included) whose distance
  # to it is strictly less than epsilon. A point with at least min_points
  # neighbors seeds or extends a cluster; points that belong to no cluster
  # are collected under the :noise key. Points are tracked by their position
  # in data, so duplicate rows are all kept in the result.
  #
  # Example:
  #   >> args = {
  #   >>   :epsilon => 1.5,
  #   >>   :min_points => 2,
  #   >> }
  #   >> DBSCAN.cluster([[1],[2],[3],[11],[12],[13],[21],[22],[23],[100]], args)
  #   => {0 => [[1],[2],[3]], 1 => [[11],[12],[13]], 2 => [[21],[22],[23]],
  #       :noise => [[100]]}
  #
  # Arguments:
  #   data: [[(Numeric), ...], ...]
  #   args: { (Symbol) => (Numeric), ... }
  #
  # Returns:
  #   Hash mapping cluster numbers (and :noise) to arrays of points. Looking
  #   up a key that is not present yields an empty array.
  #
  # Raises:
  #   ArgumentError if :epsilon or :min_points is missing from args.
  def self.cluster(data, args)
    unless args.include?(:epsilon)
      raise ArgumentError, "DBSCAN algorithm requires :epsilon argument"
    end
    @epsilon = args[:epsilon]

    unless args.include?(:min_points)
      raise ArgumentError, "DBSCAN algorithm requires :min_points argument"
    end
    @min_pts = args[:min_points]

    @data = data
    # All bookkeeping is keyed by index into @data rather than by point
    # value, so that equal points are treated as distinct observations.
    @visited = Array.new(data.length, false)
    @labels = Array.new(data.length)
    @clusters = {}
    @cnum = 0

    @data.each_index do |index|
      next if @visited[index]
      @visited[index] = true
      neighbors = region_query(index)
      if neighbors.length < @min_pts
        # Provisionally noise; a later cluster may still absorb this point.
        (@clusters[:noise] ||= []) << index
        @labels[index] = :noise
      else
        expand_cluster(index, neighbors)
        @cnum += 1
      end
    end

    # Translate index lists back into points. The non-storing default block
    # keeps lookups of absent keys returning an empty array.
    by_index = @clusters
    @clusters = Hash.new { Array.new }
    by_index.each do |key, indices|
      @clusters[key] = indices.map { |i| @data[i] }
    end
    @clusters
  end

  # Internal state of the most recent run. A bare `private` in the class body
  # does not apply to singleton methods, so visibility is set inside the
  # singleton class itself.
  class << self
    private

    attr_accessor :data, :epsilon, :min_pts, :visited, :clusters, :cnum
  end

  # Grows cluster @cnum from the core point at index seed. neighbors is the
  # seed's neighborhood (as indices); it is used as a work queue that is
  # extended with the neighborhood of every further core point encountered.
  def self.expand_cluster(seed, neighbors)
    members = [seed]
    @clusters[@cnum] = members
    @labels[seed] = @cnum

    queue = neighbors.dup
    queued = {}
    queue.each { |i| queued[i] = true }

    position = 0
    while position < queue.length
      index = queue[position]
      position += 1

      unless @visited[index]
        @visited[index] = true
        reachable = region_query(index)
        if reachable.length >= @min_pts
          # Append only indices not yet queued; the queue never shrinks, so
          # no pending neighbor can be skipped.
          reachable.each do |i|
            next if queued[i]
            queued[i] = true
            queue << i
          end
        end
      end

      # A point previously marked as noise becomes a border point.
      if @labels[index] == :noise
        @clusters[:noise].delete(index)
        @labels[index] = nil
      end

      # Leave points alone that already belong to a cluster.
      next unless @labels[index].nil?
      @labels[index] = @cnum
      members << index
    end
  end

  # Returns the indices of all points whose distance to the point at index
  # is strictly less than epsilon.
  def self.region_query(index)
    center = @data[index]
    @data.each_index.select do |other|
      Util.dist(center, @data[other]) < @epsilon
    end
  end

  private_class_method :expand_cluster, :region_query
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require_relative "Util"
|
2
|
+
|
3
|
+
# Implementation of the k-Means clustering algorithm
|
4
|
+
class KMeans
  # Cluster data using the k-Means algorithm with k specified in args. The
  # algorithm is run multiple times (as defined in the :cycles argument) and
  # the most common clustering is returned. When several clusterings are
  # equally common, the one that was produced first wins.
  #
  # Example:
  #   >> args = {
  #   >>   :k => 3,
  #   >>   :cycles => 10,
  #   >> }
  #   >> KMeans.cluster([[1],[2],[3],[11],[12],[13],[21],[22],[23]], args)
  #   => {0 => [[1],[2],[3]], 1 => [[11],[12],[13]], 2 => [[21],[22],[23]]}
  #
  # Arguments:
  #   data: [[(Numeric), ...], ...]
  #   args: { (Symbol) => (Numeric), ... }
  #
  # Returns:
  #   Hash mapping cluster numbers (0, 1, ...) to arrays of points; empty
  #   when :cycles is 0.
  #
  # Raises:
  #   ArgumentError if :k or :cycles is missing, or if :k is not a number
  #   greater than or equal to 1.
  def self.cluster(data, args)
    unless args.include?(:k)
      raise ArgumentError, "KMeans algorithm requires :k argument"
    end
    @k = args[:k]

    unless args.include?(:cycles)
      raise ArgumentError, "KMeans algorithm requires :cycles argument"
    end
    cycles = args[:cycles]

    # A negative k would slice the shuffled data from the end and a zero k
    # would silently behave like k = 1, so reject both up front.
    unless @k.is_a?(Numeric) && @k >= 1
      raise ArgumentError, "KMeans algorithm requires :k to be at least 1"
    end

    @data = data

    # Count how often each distinct partitioning is produced.
    tally = Hash.new(0)
    cycles.times { tally[kmeans] += 1 }
    return {} if tally.empty?

    winner, _count = tally.max_by { |_partitions, count| count }

    clusters = {}
    winner.values.each_with_index { |points, i| clusters[i] = points }
    clusters
  end

  # Internal state of the most recent run. A bare `private` in the class body
  # does not apply to singleton methods, so visibility is set inside the
  # singleton class itself.
  class << self
    private

    attr_accessor :data, :k
  end

  # Runs k-Means once from a random start and returns a Hash mapping each
  # final mean to the points assigned to it.
  def self.kmeans
    # initialize partitions with Forgy method: select random items.
    means = @data.shuffle[0...@k]

    previous = {}
    loop do
      # assignment step: each point joins its nearest mean (first one wins
      # on ties).
      partitions = @data.group_by do |point|
        means.min_by { |mean| Util.dist(point, mean) }
      end

      # Converged once an iteration reproduces the previous partitioning.
      return partitions if partitions == previous

      # update step
      means = partitions.each_value.map { |points| Util.centroid(points) }
      previous = partitions
    end
  end

  private_class_method :kmeans
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# A few utility functions used by the clustering algorithms
|
2
|
+
class Util
  # Calculates the Euclidean distance between two points of the same
  # dimension.
  #
  # Example:
  #   >> Util.dist([0,0],[3,4])
  #   => 5.0
  #
  # Arguments:
  #   point1: [(Numeric), ...]
  #   point2: [(Numeric), ...]
  #
  # Returns:
  #   (Float) the distance; 0.0 for two empty points.
  #
  # Raises:
  #   ArgumentError if the points have different lengths.
  def self.dist(point1, point2)
    # Start by making sure points have the same dimension
    unless point1.length == point2.length
      raise ArgumentError,
            "Can't calculate distance of points with different dimensions"
    end

    squared = point1.zip(point2).inject(0) { |sum, (a, b)| sum + (a - b)**2 }
    Math.sqrt(squared)
  end

  # Calculates the centroid of a set of points
  #
  # Example:
  #   >> Util.centroid([[0,0,0],[1,2,3],[2,4,0]])
  #   => [1.0, 2.0, 1.0]
  #
  # Arguments:
  #   points: [[(Numeric), ...], ...]
  #
  # Returns:
  #   [(Float), ...] the per-coordinate mean, or an empty array when points
  #   is empty.
  #
  # Raises:
  #   ArgumentError if the points do not all have the same length.
  def self.centroid(points)
    # Check boundary conditions and make sure each point has the same
    # dimension.
    return [] if points.empty?

    dim = points.first.length
    if points.any? { |p| p.length != dim }
      raise ArgumentError,
            "Can't calculate centroid of points with different dimensions"
    end

    count = points.length
    # Each transposed row holds one coordinate of every point.
    points.transpose.map do |coordinate|
      coordinate.inject(0) { |sum, val| sum + val }.to_f / count
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: clustertool
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- JD Nir
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-02-23 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: |-
|
14
|
+
Import a database and specify up to two columns of numerical
|
15
|
+
data as well as your choice of the K-Means or DBSCAN algorithms and their
|
16
|
+
respective parameters.
|
17
|
+
email: jnir@fastorientation.com
|
18
|
+
executables: []
|
19
|
+
extensions: []
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- lib/clustertool.rb
|
23
|
+
- lib/clustertool/DBSCAN.rb
|
24
|
+
- lib/clustertool/KMeans.rb
|
25
|
+
- lib/clustertool/Util.rb
|
26
|
+
homepage:
|
27
|
+
licenses:
|
28
|
+
- MIT
|
29
|
+
metadata: {}
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
require_paths:
|
33
|
+
- lib
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 2.4.5
|
47
|
+
signing_key:
|
48
|
+
specification_version: 4
|
49
|
+
summary: A tool for clustering database entries
|
50
|
+
test_files: []
|