clustertool 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: dd947448c507995e271fcbb359f16a2c4d594b63
4
+ data.tar.gz: a472cae649f54082828137b49705a55ac6bf26ba
5
+ SHA512:
6
+ metadata.gz: 41e2fcc2d1c590028cf03c9a4032003bb5425f6398cf19789378bf7d530cf61e3a381b1b83467b8d562a347f188a21d821fc92244288405245d45371e9f3ce41
7
+ data.tar.gz: 01cf8185ed6e7ef39d61cebfe20b37f5c37ac76fe5521bece623c240afbf7ee1180a41a67759746ce5c350400886b9f447f0baee6e3628d9da37ffcd601a67c4
@@ -0,0 +1,48 @@
1
+ require "sqlite3"
2
+ require_relative "clustertool/KMeans"
3
+ require_relative "clustertool/DBSCAN"
4
+
5
# The main ClusterTool driver.
class ClusterTool
  # Reads the requested columns of a table from a SQLite database and passes
  # the resulting rows to the selected clustering algorithm. The value
  # returned by that algorithm is returned unchanged.
  #
  # The database handle is always closed before this method returns, even
  # when the query or the algorithm selection fails.
  #
  # NOTE: +table+ and +columns+ are interpolated verbatim into the SQL
  # statement (identifiers cannot be bound as query parameters). Only pass
  # trusted, developer-chosen names here, never raw user input.
  #
  # Example:
  #   >> args = {
  #   >>   :k => 3,
  #   >>   :cycles => 10,
  #   >> }
  #   >> ClusterTool.cluster("test.db", "TestSimple2D", ["Field1", "Field2"],
  #   >>                     :kmeans, args)
  #   => {0=>[[0, 0], [-1, 0], [1, 0], [0, -1], [0, 1]], 1=>[[0, 10], [-1,
  #      10], [1, 10], [0, 9], [0, 11]], 2=>[[-10, 0], [-11, 0], [-9, 0],
  #      [-10, -1], [-10, 1]]}
  #
  # Example:
  #   >> args = {
  #   >>   :epsilon => 1.5,
  #   >>   :min_points => 2,
  #   >> }
  #   >> ClusterTool.cluster("test.db", "TestSimple2D", ["Field1", "Field2"],
  #   >>                     :dbscan, args)
  #   => {0=>[[0, 0], [-1, 0], [1, 0], [0, -1], [0, 1]], 1=>[[0, 10], [-1,
  #      10], [1, 10], [0, 9], [0, 11]], 2=>[[-10, 0], [-11, 0], [-9, 0],
  #      [-10, -1], [-10, 1]]}
  #
  # Arguments:
  #   database: (String) path of the SQLite database file
  #   table: (String) name of the table to read
  #   columns: [(String), ...] names of the columns to read
  #   algorithm: :kmeans or :dbscan
  #   args: { (Symbol) => (Numeric), ... } parameters of the algorithm
  #
  # Raises:
  #   ArgumentError if +algorithm+ is neither :kmeans nor :dbscan
  def self.cluster(database, table, columns, algorithm, args)
    db = SQLite3::Database.new(database)
    begin
      # Without a block, execute returns every row as an array, so the handle
      # can be released as soon as the query has finished.
      data = db.execute("select #{columns.join(", ")} from #{table};")
    ensure
      db.close
    end

    case algorithm
    when :kmeans then KMeans.cluster(data, args)
    when :dbscan then DBSCAN.cluster(data, args)
    else raise ArgumentError, 'Unknown clustering algorithm'
    end
  end
end
@@ -0,0 +1,89 @@
1
+ require_relative "Util"
2
+
3
# Implements the DBSCAN clustering algorithm
class DBSCAN
  class << self
    # State of the most recent DBSCAN.cluster call. +visited+ is keyed by row
    # index, +clusters+ is the returned result and +cnum+ the number of
    # clusters that were found.
    attr_accessor :data, :epsilon, :min_pts, :visited, :clusters, :cnum
  end

  # Cluster data using the DBSCAN algorithm with arguments epsilon and
  # min_points provided in args.
  #
  # Two points are neighbors when their Euclidean distance (Util.dist) is
  # strictly less than epsilon; every point is a neighbor of itself. A point
  # with at least min_points neighbors starts or extends a cluster. Points that
  # end up in no cluster are collected under the :noise key, which is only
  # present when there is at least one such point.
  #
  # Rows are tracked by their position in +data+, so rows holding equal values
  # are all kept in the result.
  #
  # The working state is stored on the class itself, so concurrent calls are
  # not supported.
  #
  # Example:
  #   >> args = {
  #   >>   :epsilon => 1.5,
  #   >>   :min_points => 2,
  #   >> }
  #   >> DBSCAN.cluster([[1],[2],[3],[11],[12],[13],[21],[22],[23],[100]], args)
  #   => {0 => [[1],[2],[3]], 1 => [[11],[12],[13]], 2 => [[21],[22],[23]],
  #      :noise => [[100]]}
  #
  # Arguments:
  #   data: [[(Numeric), ...], ...]
  #   args: { (Symbol) => (Numeric), ... }
  #
  # Returns:
  #   { (Integer or :noise) => [[(Numeric), ...], ...] }; looking up a key that
  #   is not present yields an empty array.
  #
  # Raises:
  #   ArgumentError if :epsilon or :min_points is missing from args
  def self.cluster(data, args)
    unless args.include?(:epsilon)
      raise ArgumentError, "DBSCAN algorithm requires :epsilon argument"
    end
    @epsilon = args[:epsilon]

    unless args.include?(:min_points)
      raise ArgumentError, "DBSCAN algorithm requires :min_points argument"
    end
    @min_pts = args[:min_points]

    @data = data
    @visited = Hash.new(false)
    @labels = {}   # row index => :noise or cluster number
    @members = {}  # :noise or cluster number => row indices, in insertion order
    @cnum = 0

    @data.each_index do |index|
      next if @visited[index]
      @visited[index] = true
      neighbors = region_query(index)
      if neighbors.length < @min_pts
        # Provisional: the point may still be absorbed by a later cluster.
        @labels[index] = :noise
        (@members[:noise] ||= []) << index
      else
        expand_cluster(index, neighbors)
        @cnum += 1
      end
    end

    # Every provisional noise point may have been absorbed by a cluster; do not
    # report an empty noise group in that case.
    @members.delete(:noise) if @members.key?(:noise) && @members[:noise].empty?

    # The default block does not store, so callers probing for an unknown
    # cluster id get an empty array without adding an entry.
    @clusters = Hash.new { [] }
    @members.each do |key, indices|
      @clusters[key] = indices.map { |i| @data[i] }
    end
    @clusters
  end

  # Grows cluster number @cnum from the core point at +index+. +seeds+ are the
  # row indices of its neighbors; neighbors of further core points found on the
  # way are appended to the work queue exactly once.
  def self.expand_cluster(index, seeds)
    claim(index)
    queue = seeds.dup
    queued = {}
    queue.each { |i| queued[i] = true }

    until queue.empty?
      current = queue.shift
      unless @visited[current]
        @visited[current] = true
        reachable = region_query(current)
        if reachable.length >= @min_pts
          reachable.each do |i|
            next if queued[i]
            queued[i] = true
            queue << i
          end
        end
      end
      claim(current)
    end
  end

  # Adds the row at +index+ to cluster @cnum unless it already belongs to a
  # cluster. A row previously filed as noise is moved out of the noise group.
  def self.claim(index)
    label = @labels[index]
    return unless label.nil? || label == :noise
    @members[:noise].delete(index) if label == :noise
    @labels[index] = @cnum
    (@members[@cnum] ||= []) << index
  end

  # Returns the indices of all rows whose distance to the row at +index+ is
  # strictly less than @epsilon, in data order.
  def self.region_query(index)
    origin = @data[index]
    @data.each_index.select { |i| Util.dist(origin, @data[i]) < @epsilon }
  end

  # A bare `private` does not apply to `def self.` methods, so the helpers are
  # hidden explicitly.
  private_class_method :expand_cluster, :claim, :region_query
end
@@ -0,0 +1,83 @@
1
+ require_relative "Util"
2
+
3
# Implementation of the k-Means clustering algorithm
class KMeans
  class << self
    # Input rows and cluster count of the most recent KMeans.cluster call.
    attr_accessor :data, :k
  end

  # Cluster data using the k-Means algorithm with k specified in args. The
  # algorithm is run multiple times (as defined in the :cycles argument), each
  # time from a random choice of initial means, and the most common clustering
  # is returned. When several clusterings are equally common, the one that was
  # produced first wins.
  #
  # The clusters are numbered from 0 in the order in which their first point
  # appears in data. Fewer than k clusters are returned when a mean ends up
  # with no points assigned to it.
  #
  # The working state is stored on the class itself, so concurrent calls are
  # not supported.
  #
  # Example:
  #   >> args = {
  #   >>   :k => 3,
  #   >>   :cycles => 10,
  #   >> }
  #   >> KMeans.cluster([[1],[2],[3],[11],[12],[13],[21],[22],[23]], args)
  #   => {0 => [[1],[2],[3]], 1 => [[11],[12],[13]], 2 => [[21],[22],[23]]}
  #
  # Arguments:
  #   data: [[(Numeric), ...], ...]
  #   args: { (Symbol) => (Numeric), ... }
  #
  # Returns:
  #   { (Integer) => [[(Numeric), ...], ...] }; empty when :cycles is 0 or data
  #   is empty.
  #
  # Raises:
  #   ArgumentError if :k or :cycles is missing from args
  def self.cluster(data, args)
    unless args.include?(:k)
      raise ArgumentError, "KMeans algorithm requires :k argument"
    end
    @k = args[:k]

    unless args.include?(:cycles)
      raise ArgumentError, "KMeans algorithm requires :cycles argument"
    end
    cycles = args[:cycles]

    @data = data

    # Count how often each distinct partitioning is produced.
    votes = Hash.new(0)
    cycles.times { votes[kmeans] += 1 }

    winner = nil
    winner_votes = 0
    votes.each do |partitions, count|
      # Strict comparison keeps the earliest clustering on ties.
      next unless count > winner_votes
      winner = partitions
      winner_votes = count
    end

    clusters = {}
    winner.values.each_with_index { |points, i| clusters[i] = points } unless winner.nil?
    clusters
  end

  # Runs k-Means once and returns the final partitioning as a Hash that maps
  # each mean to the points assigned to it.
  def self.kmeans
    # initialize partitions with Forgy method: select random items.
    means = @data.shuffle[0...@k]

    previous = {}
    loop do
      # assignment step
      partitions = {}
      @data.each do |point|
        (partitions[nearest_mean(point, means)] ||= []) << point
      end

      # Converged once an assignment step reproduces the previous one.
      return partitions if partitions == previous
      previous = partitions

      # update step
      means = partitions.each_value.map { |points| Util.centroid(points) }
    end
  end

  # Returns the element of +means+ closest to +point+ (the first one on ties),
  # or nil when +means+ is empty. Each distance is computed only once.
  def self.nearest_mean(point, means)
    nearest = nil
    shortest = nil
    means.each do |mean|
      distance = Util.dist(point, mean)
      if shortest.nil? || distance < shortest
        nearest = mean
        shortest = distance
      end
    end
    nearest
  end

  # A bare `private` does not apply to `def self.` methods, so the helpers are
  # hidden explicitly.
  private_class_method :kmeans, :nearest_mean
end
@@ -0,0 +1,46 @@
1
# A few utility functions used by the clustering algorithms
class Util
  # Calculates the Euclidean Distance between two points of the same
  # dimension.
  #
  # Example:
  #   >> Util.dist([0,0],[3,4])
  #   => 5.0
  #
  # Arguments:
  #   point1: [(Numeric), ...]
  #   point2: [(Numeric), ...]
  #
  # Returns:
  #   (Float) the distance; 0.0 for two empty points
  #
  # Raises:
  #   ArgumentError if the points do not have the same number of coordinates
  def self.dist(point1, point2)
    # Start by making sure points have the same dimension
    unless point1.length == point2.length
      raise ArgumentError,
            "Can't calculate distance of points with different dimensions"
    end

    squared = point1.zip(point2).inject(0) { |sum, (a, b)| sum + (a - b)**2 }
    Math.sqrt(squared)
  end

  # Calculates the centroid (coordinate-wise mean) of a set of points
  #
  # Example:
  #   >> Util.centroid([[0,0,0],[1,2,3],[2,4,0]])
  #   => [1.0, 2.0, 1.0]
  #
  # Arguments:
  #   points: [[(Numeric), ...], ...]
  #
  # Returns:
  #   [(Float), ...] one mean per coordinate; an empty array when no points
  #   are given
  #
  # Raises:
  #   ArgumentError if the points do not all have the same number of
  #   coordinates
  def self.centroid(points)
    # Check boundary conditions and make sure each point has the same
    # dimension.
    return [] if points.empty?
    dim = points[0].length
    if points.any? { |p| p.length != dim }
      raise ArgumentError,
            "Can't calculate centroid of points with different dimensions"
    end

    # transpose turns the list of points into one list of values per
    # coordinate; the dimension check above guarantees it cannot fail.
    points.transpose.map do |values|
      values.inject(0) { |sum, val| sum + val }.to_f / points.length
    end
  end
end
metadata ADDED
@@ -0,0 +1,50 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: clustertool
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - JD Nir
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-02-23 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: |-
14
+ Import a database and specify up to two columns of numerical
15
+ data as well as your choice of the K-Means or DBSCAN algorithms and their
16
+ respective parameters.
17
+ email: jnir@fastorientation.com
18
+ executables: []
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - lib/clustertool.rb
23
+ - lib/clustertool/DBSCAN.rb
24
+ - lib/clustertool/KMeans.rb
25
+ - lib/clustertool/Util.rb
26
+ homepage:
27
+ licenses:
28
+ - MIT
29
+ metadata: {}
30
+ post_install_message:
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubyforge_project:
46
+ rubygems_version: 2.4.5
47
+ signing_key:
48
+ specification_version: 4
49
+ summary: A tool for clustering database entries
50
+ test_files: []