clustertool 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: dd947448c507995e271fcbb359f16a2c4d594b63
4
+ data.tar.gz: a472cae649f54082828137b49705a55ac6bf26ba
5
+ SHA512:
6
+ metadata.gz: 41e2fcc2d1c590028cf03c9a4032003bb5425f6398cf19789378bf7d530cf61e3a381b1b83467b8d562a347f188a21d821fc92244288405245d45371e9f3ce41
7
+ data.tar.gz: 01cf8185ed6e7ef39d61cebfe20b37f5c37ac76fe5521bece623c240afbf7ee1180a41a67759746ce5c350400886b9f447f0baee6e3628d9da37ffcd601a67c4
@@ -0,0 +1,48 @@
1
+ require "sqlite3"
2
+ require_relative "clustertool/KMeans"
3
+ require_relative "clustertool/DBSCAN"
4
+
5
# The main ClusterTool driver.
class ClusterTool
  # Reads the requested columns of a table from an SQLite database and passes
  # the resulting rows to the selected clustering algorithm.
  #
  # Example (output is illustrative):
  #   >> args = {
  #   >>   :k => 3,
  #   >>   :cycles => 10,
  #   >> }
  #   >> ClusterTool.cluster("test.db", "TestSimple2D", ["Field1", "Field2"],
  #   >>                     :kmeans, args)
  #   => {0=>[[0, 0], [-1, 0], [1, 0], [0, -1], [0, 1]], 1=>[[0, 10], [-1,
  #      10], [1, 10], [0, 9], [0, 11]], 2=>[[-10, 0], [-11, 0], [-9, 0],
  #      [-10, -1], [-10, 1]]}
  #
  # Example (output is illustrative):
  #   >> args = {
  #   >>   :epsilon => 1.5,
  #   >>   :min_points => 2,
  #   >> }
  #   >> ClusterTool.cluster("test.db", "TestSimple2D", ["Field1", "Field2"],
  #   >>                     :dbscan, args)
  #   => {0=>[[0, 0], [-1, 0], [1, 0], [0, -1], [0, 1]], 1=>[[0, 10], [-1,
  #      10], [1, 10], [0, 9], [0, 11]], 2=>[[-10, 0], [-11, 0], [-9, 0],
  #      [-10, -1], [-10, 1]]}
  #
  # Arguments:
  #   database: (String) path of the SQLite database file
  #   table: (String) table to read from
  #   columns: [(String), ...] columns to select, in order
  #   algorithm: :kmeans or :dbscan
  #   args: { (Symbol) => (Numeric), ... } forwarded to the algorithm
  #
  # Returns whatever the selected algorithm's cluster method returns.
  # Raises ArgumentError when algorithm is neither :kmeans nor :dbscan.
  #
  # NOTE: table and columns are interpolated into the SQL text unescaped
  # (identifiers cannot be bound as query parameters), so they must come
  # from a trusted source.
  def self.cluster(database, table, columns, algorithm, args)
    db = SQLite3::Database.new(database)
    begin
      data = db.execute("select #{columns.join(", ")} from #{table};")
    ensure
      # Release the database handle even when the query raises.
      db.close
    end

    case algorithm
    when :kmeans then KMeans.cluster(data, args)
    when :dbscan then DBSCAN.cluster(data, args)
    else raise ArgumentError, 'Unknown clustering algorithm'
    end
  end
end
@@ -0,0 +1,89 @@
1
+ require_relative "Util"
2
+
3
# Implements the DBSCAN clustering algorithm
class DBSCAN
  # Cluster data using the DBSCAN algorithm with arguments epsilon and
  # min_points provided in args.
  #
  # A point's neighborhood is every point of data (itself included) whose
  # distance to it is strictly less than epsilon. A point whose neighborhood
  # holds at least min_points points starts or extends a cluster; any point
  # that ends up in no cluster is reported under the :noise key.
  #
  # Points are tracked by their position in data, so duplicate rows are all
  # kept in the output.
  #
  # Example:
  #   >> args = {
  #   >>   :epsilon => 1.5,
  #   >>   :min_points => 2,
  #   >> }
  #   >> DBSCAN.cluster([[1],[2],[3],[11],[12],[13],[21],[22],[23],[100]], args)
  #   => {0 => [[1],[2],[3]], 1 => [[11],[12],[13]], 2 => [[21],[22],[23]],
  #       :noise => [[100]]}
  #
  # Arguments:
  #   data: [[(Numeric), ...], ...]
  #   args: { (Symbol) => (Numeric), ... }
  #
  # Returns a Hash mapping cluster numbers (0, 1, ...) to arrays of points,
  # plus :noise when there are noise points. Looking up a missing key yields
  # an empty Array.
  # Raises ArgumentError when :epsilon or :min_points is missing from args.
  def self.cluster(data, args)
    unless args.include?(:epsilon)
      raise ArgumentError, "DBSCAN algorithm requires :epsilon argument"
    end
    epsilon = args[:epsilon]

    unless args.include?(:min_points)
      raise ArgumentError, "DBSCAN algorithm requires :min_points argument"
    end
    min_pts = args[:min_points]

    # labels[i] is nil until point i is processed, then :noise or the number
    # of the cluster it belongs to.
    labels = Array.new(data.length)
    # groups[c] lists the indices of cluster c in discovery order.
    groups = []

    data.each_index do |idx|
      next unless labels[idx].nil?

      neighbors = region_query(data, idx, epsilon)
      if neighbors.length < min_pts
        labels[idx] = :noise
      else
        groups << expand_cluster(data, labels, idx, neighbors, groups.length,
                                 epsilon, min_pts)
      end
    end

    # Keep the non-storing default so result[missing_key] returns [].
    clusters = Hash.new { [] }
    groups.each_with_index do |members, cnum|
      clusters[cnum] = members.map { |i| data[i] }
    end
    noise = data.each_index.select { |i| labels[i] == :noise }
    clusters[:noise] = noise.map { |i| data[i] } unless noise.empty?
    clusters
  end

  # Grows cluster cnum outwards from the core point at index seed and returns
  # the indices of its members in discovery order. labels is updated in place.
  def self.expand_cluster(data, labels, seed, neighbors, cnum, epsilon, min_pts)
    labels[seed] = cnum
    members = [seed]

    # The neighbor list grows while it is being walked, hence the manual
    # cursor instead of an iterator.
    cursor = 0
    while cursor < neighbors.length
      candidate = neighbors[cursor]
      cursor += 1

      if labels[candidate].nil?
        reachable = region_query(data, candidate, epsilon)
        # Only core points pull their own neighborhood into the cluster.
        neighbors |= reachable if reachable.length >= min_pts
      elsif labels[candidate] != :noise
        next # already a member of some cluster
      end

      # Unprocessed points and former noise points join this cluster.
      labels[candidate] = cnum
      members << candidate
    end

    members
  end
  private_class_method :expand_cluster

  # Returns the indices of all points closer than epsilon to data[center].
  def self.region_query(data, center, epsilon)
    origin = data[center]
    data.each_index.select { |i| Util.dist(origin, data[i]) < epsilon }
  end
  private_class_method :region_query
end
@@ -0,0 +1,83 @@
1
+ require_relative "Util"
2
+
3
# Implementation of the k-Means clustering algorithm
class KMeans
  # Cluster data using the k-Means algorithm with k specified in args. The
  # algorithm is run multiple times (as defined in the :cycles argument) and
  # the most common clustering is returned; on a tie the clustering that was
  # seen first wins.
  #
  # Example:
  #   >> args = {
  #   >>   :k => 3,
  #   >>   :cycles => 10,
  #   >> }
  #   >> KMeans.cluster([[1],[2],[3],[11],[12],[13],[21],[22],[23]], args)
  #   => {0 => [[1],[2],[3]], 1 => [[11],[12],[13]], 2 => [[21],[22],[23]]}
  #
  # Arguments:
  #   data: [[(Numeric), ...], ...]
  #   args: { (Symbol) => (Numeric), ... }
  #
  # Returns a Hash mapping cluster numbers (0, 1, ...) to arrays of points;
  # it is empty when :cycles is 0.
  # Raises ArgumentError when :k or :cycles is missing from args.
  def self.cluster(data, args)
    unless args.include?(:k)
      raise ArgumentError, "KMeans algorithm requires :k argument"
    end
    k = args[:k]

    unless args.include?(:cycles)
      raise ArgumentError, "KMeans algorithm requires :cycles argument"
    end
    cycles = args[:cycles]

    # Count how often each distinct clustering comes up.
    votes = Hash.new(0)
    cycles.times { votes[kmeans(data, k)] += 1 }

    # max_by keeps the first maximum, and yields nil when nothing was run.
    winner, _count = votes.max_by { |_partitions, count| count }

    clusters = {}
    winner.each_value.with_index { |points, i| clusters[i] = points } if winner
    clusters
  end

  # Runs k-Means once from a random start and returns the final partitions
  # as a Hash mapping each mean to the points assigned to it.
  def self.kmeans(data, k)
    # initialize partitions with Forgy method: select random items.
    means = data.shuffle[0...k]

    partitions = {}
    loop do
      # Converged once an iteration reproduces the previous partitions
      # exactly (same means, same members); compared via a printed snapshot.
      snapshot = partitions.inspect

      # assignment step
      partitions = assign_points(data, means)

      # update step
      means = partitions.each_value.map { |points| Util.centroid(points) }

      break if partitions.inspect == snapshot
    end

    partitions
  end
  private_class_method :kmeans

  # Groups every point of data under the mean nearest to it (the first such
  # mean on a tie).
  def self.assign_points(data, means)
    data.each_with_object(Hash.new { |h, mean| h[mean] = [] }) do |point, parts|
      nearest = means.min_by { |mean| Util.dist(point, mean) }
      parts[nearest] << point
    end
  end
  private_class_method :assign_points
end
@@ -0,0 +1,46 @@
1
# A few utility functions used by the clustering algorithms
class Util
  # Calculates the Euclidean distance between two points of the same
  # dimension.
  #
  # Example:
  #   >> Util.dist([0,0],[3,4])
  #   => 5.0
  #
  # Arguments:
  #   point1: [(Numeric), ...]
  #   point2: [(Numeric), ...]
  #
  # Returns a Float.
  # Raises ArgumentError when the points have different lengths.
  def self.dist(point1, point2)
    # Start by making sure points have the same dimension. The message is a
    # single-line literal so no newline or indentation leaks into it.
    unless point1.length == point2.length
      raise ArgumentError,
            "Can't calculate distance of points with different dimensions"
    end

    squared = point1.zip(point2).inject(0) { |acc, (a, b)| acc + (a - b)**2 }
    Math.sqrt(squared)
  end

  # Calculates the centroid of a set of points
  #
  # Example:
  #   >> Util.centroid([[0,0,0],[1,2,3],[2,4,0]])
  #   => [1.0, 2.0, 1.0]
  #
  # Arguments:
  #   points: [[(Numeric), ...], ...]
  #
  # Returns an Array of Floats, or an empty Array when points is empty.
  # Raises ArgumentError when the points have different lengths.
  def self.centroid(points)
    # Check boundary conditions and make sure each point has the same
    # dimension.
    return [] if points.empty?

    dim = points.first.length
    if points.any? { |p| p.length != dim }
      raise ArgumentError,
            "Can't calculate centroid of points with different dimensions"
    end

    # Average each coordinate: transpose turns rows of points into columns
    # of per-dimension values.
    count = points.length
    points.transpose.map do |coords|
      coords.inject(0) { |total, val| total + val }.to_f / count
    end
  end
end
metadata ADDED
@@ -0,0 +1,50 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: clustertool
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - JD Nir
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-02-23 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: |-
14
+ Import a database and specify up to two columns of numerical
15
+ data as well as your choice of the K-Means or DBSCAN algorithms and their
16
+ respective parameters.
17
+ email: jnir@fastorientation.com
18
+ executables: []
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - lib/clustertool.rb
23
+ - lib/clustertool/DBSCAN.rb
24
+ - lib/clustertool/KMeans.rb
25
+ - lib/clustertool/Util.rb
26
+ homepage:
27
+ licenses:
28
+ - MIT
29
+ metadata: {}
30
+ post_install_message:
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubyforge_project:
46
+ rubygems_version: 2.4.5
47
+ signing_key:
48
+ specification_version: 4
49
+ summary: A tool for clustering database entries
50
+ test_files: []