dnks_clustering_indexes 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 2184949a035f273d5fc42f6bc36896a2845982a9d2d3734a8d649455c1d29304
4
+ data.tar.gz: 2c9926ecedaf81280c9f9cf2c6eca1ec083a791ea1012820655a3938fd620aab
5
+ SHA512:
6
+ metadata.gz: 8be08145c482785e3dca7a53a58156dc18736ca65834474f35326c8c5d871aaeee0e048f063584efeeb0e5c53b0223a1d612dac6a281cbb3492a4c69e85bd89a
7
+ data.tar.gz: c9e8f30d24c5f867a5bb8e04f0aa97f3428299257b503076e79d3d453339f30eba2792445ffe5779dcd3eba425018ce5fd4d4207b6bd2a6b0ab42037a26a71fe
@@ -0,0 +1,109 @@
1
+ require "bigdecimal"
2
+
3
# Numeric helpers shared by the clustering-index code: BigDecimal conversion,
# Euclidean distances, cluster centers and a pairwise distance matrix.
module Calculations
  # Converts +n+ to a BigDecimal, passing a precision of 10 to BigDecimal().
  def self.number(n)
    BigDecimal(n, 10)
  end

  # Returns the most frequent element of +arr+ (nil for an empty array).
  # On ties the winner is the element that reached the highest count first
  # while scanning left to right.
  def self.mode_of_array(arr)
    mode = nil
    frequency_table = Hash.new(0)
    max_frequency = 0

    arr.each do |elem|
      frequency_table[elem] += 1
      max_frequency, mode = frequency_table[elem], elem if frequency_table[elem] > max_frequency
    end
    mode
  end

  # Returns a copy of +array+ without the element at +index+;
  # the receiver itself is not modified.
  def self.array_except(array, index)
    arr_dup = array.dup
    arr_dup.delete_at(index)
    arr_dup
  end

  module Distance
    # Euclidean distance between two "points" (arrays of coordinates).
    # The coordinates are accumulated as BigDecimal; the result of Math.sqrt
    # is a Float. +p2+ is read at every index of +p1+.
    def self.euclidean(p1, p2)
      sum = Calculations.number(0.0)
      p1.each_index { |i| sum += (Calculations.number(p2[i]) - Calculations.number(p1[i]))**2 }
      Math.sqrt(sum)
    end
  end

  module Cluster
    # Component-wise mean of the rows of +multi_array+, returned as an array
    # of BigDecimal values (one per column of the first row).
    def self.center(multi_array)
      (0...multi_array.first.length).map do |j|
        sum = Calculations.number(0.0)
        multi_array.each { |row| sum += Calculations.number(row[j]) }
        Calculations.number(sum / multi_array.length)
      end
    end

    # Euclidean distance from every row of +multi_array+ to +point+.
    def self.distances_to_point(multi_array, point)
      multi_array.map { |x| Calculations::Distance.euclidean(x, point) }
    end

    # Finds the centroid in +centroids+ nearest to +point+, ignoring the
    # centroid at +index_of_centroid_to_skip+, and returns its index.
    # When several centroids are equally near, the last one wins.
    #
    # Raises ArgumentError when no centroid is left to compare against.
    def self.nearest_to_point(centroids, index_of_centroid_to_skip, point)
      min = nil
      (0...centroids.length).each do |i|
        next if i == index_of_centroid_to_skip

        distance = Calculations::Distance.euclidean(centroids[i], point)
        min = [distance, i] if min.nil? || distance <= min[0]
      end
      raise ArgumentError, "no centroid available besides the skipped one" if min.nil?

      min[1]
    end
  end

  # Upper-triangular matrix of pairwise Euclidean distances.
  # Row i stores the distances from point i to points i+1, i+2, ...
  class DistanceMatrix
    def initialize(data_array)
      @distances_matrix = euclidean_distances_matrix(data_array)
    end

    # Prints one line per matrix row to stdout.
    def show
      @distances_matrix.each_with_index do |row, i|
        puts "X#{i}: #{row}"
      end
    end

    # All stored pairwise distances as one flat array (row by row).
    def data
      @distances_matrix.flatten(1)
    end

    # Builds the triangular matrix for the points in +arr+.
    def euclidean_distances_matrix(arr)
      (0...arr.length - 1).map do |i|
        ((i + 1)...arr.length).map { |j| Calculations::Distance.euclidean(arr[i], arr[j]) }
      end
    end

    # Sum of all pairwise distances.
    def total_sum
      data.sum(Calculations.number(0.0))
    end

    # Largest pairwise distance (nil when fewer than two points were given).
    def max_distance
      data.max
    end

    # Distance between the points with the given indexes, in either order.
    # Returns 0 when both indexes are the same.
    def get(point_1_index, point_2_index)
      return 0 if point_1_index == point_2_index

      point_1_index, point_2_index = point_2_index, point_1_index if point_1_index > point_2_index
      # Row i starts at column i + 1, hence the column offset.
      @distances_matrix[point_1_index][point_2_index - point_1_index - 1]
    end
  end
end
@@ -0,0 +1,61 @@
1
+ require_relative "calculations_utility_methods"
2
+ require_relative "clustering_indexes"
3
+
4
# A dataset that has already been split into clusters, together with the
# values derived from it (centroids, number of objects/features, k).
class ClusteredDataSet
  attr_reader :clusters, :centroids, :num_objects, :num_features, :k

  # +param+ is either an array of clusters, each cluster being an array of
  # points (arrays), or an array of KMeansClusterer::Cluster objects.
  # The first element decides which builder is used.
  #
  # Raises ArgumentError for any other input.
  def initialize(param)
    first = param[0]
    if first.instance_of?(Array)
      build_by_multi_array(param)
    elsif defined?(KMeansClusterer::Cluster) && first.instance_of?(KMeansClusterer::Cluster)
      # The constant is only touched when the kmeans library is loaded, so
      # array-based datasets work without it.
      build_by_kmeans_clusters(param)
    else
      raise ArgumentError, "expected an array of clusters (arrays of points) or of KMeansClusterer::Cluster objects"
    end
  end

  # All points of all clusters as one flat list, in cluster order.
  def data
    clusters.flatten(1)
  end

  # Center of the whole dataset, as computed by Calculations::Cluster.center.
  def dataset_barycenter
    Calculations::Cluster.center(data)
  end

  # build by already clustered dataset in multi array representation
  def build_by_multi_array(marr)
    @clusters = marr
    assign_derived_attributes(clusters.length)
    puts "Built by multi array!"
    self
  end

  # build by already clustered KmeansClusterer::Cluster
  def build_by_kmeans_clusters(clusters)
    @clusters = []
    # Index assignment keeps every cluster at the slot named by its id even
    # when the clusters are not listed in id order.
    clusters.each { |x| @clusters[x.id] = kmeans_points_to_array(x.points) }
    assign_derived_attributes(clusters.length)
    puts "Built by kmeans clusterer! For k = #{@k}"
    self
  end

  # Converts kmeans points to plain arrays via point.data.to_a.
  def kmeans_points_to_array(points)
    points.map { |point| point.data.to_a }
  end

  # Prints every cluster to stdout.
  def show
    @clusters.each_with_index do |cl, i|
      puts "Cluster #{i} :"
      puts cl.to_s
    end
  end

  # Calculates the clustering index named +index_name+ for this dataset.
  def calculate_index_by_name(index_name)
    ClusteringIndexes.new(self).calculate_index_by_name(index_name)
  end

  private

  # Fills in the attributes computed from @clusters; +cluster_count+ is
  # stored as k.
  def assign_derived_attributes(cluster_count)
    @num_objects = data.length
    @num_features = @clusters[0][0].length
    @k = cluster_count
    @centroids = @clusters.map { |cluster| Calculations::Cluster.center(cluster) }
  end
end
@@ -0,0 +1,111 @@
1
+ require "bigdecimal"
2
# Calculates cluster-validity indexes for a clustered dataset.
# The dataset object must respond to k, clusters, centroids, data,
# num_objects and dataset_barycenter.
class ClusteringIndexes
  def initialize(cds)
    @cds = cds
  end

  # Dispatches to the index named by one of the Constants::Indexes values.
  # Returns nil when +index_name+ matches none of them.
  def calculate_index_by_name(index_name)
    case index_name
    when Constants::Indexes::SILHOUETTE then silhouette_index
    when Constants::Indexes::CALINSKI_HARABASZ then calinski_harabasz_index
    when Constants::Indexes::C then c_index
    when Constants::Indexes::DUNN then dunn_index
    when Constants::Indexes::DAVIES_BOULDIN then davies_bouldin_index
    when Constants::Indexes::PBM then pbm_index
    end
  end

  private

  # Mean silhouette score over all points. For every point, "a" is the mean
  # distance to the other points of its own cluster and "b" the mean distance
  # to the points of the cluster whose centroid is nearest. Returns 1.0 when
  # there are fewer than two clusters.
  def silhouette_index
    return 1.0 if @cds.k < 2

    score_sum = 0
    point_count = 0
    (0...@cds.k).each do |i|
      own_cluster = @cds.clusters[i]
      cluster_scores = own_cluster.map do |point|
        nearest_index = Calculations::Cluster.nearest_to_point(@cds.centroids, i, point)
        neighbour = @cds.clusters[nearest_index]
        # The point's distance to itself is 0, so dividing by (size - 1)
        # averages over the other members only.
        a = Calculations::Cluster.distances_to_point(own_cluster, point).sum(0.0) / (own_cluster.length - 1)
        b = Calculations::Cluster.distances_to_point(neighbour, point).sum(0.0) / neighbour.length
        point_count += 1
        silhouette_score(a, b)
      end
      score_sum += cluster_scores.sum(0.0)
    end
    score_sum / point_count
  end

  # Silhouette score of a single point. Scores 0 when either mean is NaN
  # (e.g. a single-point cluster) or when both means are zero, which would
  # otherwise yield 0.0 / 0.0 = NaN and poison the average.
  def silhouette_score(a, b)
    return 0 if a.nan? || b.nan?

    widest = [a, b].max
    widest.zero? ? 0 : (b - a) / widest
  end

  # Davies-Bouldin index: for every cluster take the largest ratio of
  # (own scatter + other scatter) to the distance between the two centroids,
  # then average those maxima over the clusters.
  def davies_bouldin_index
    cluster_ids = (0...@cds.k).to_a
    scatter = cluster_ids.map do |i|
      Calculations::Cluster.distances_to_point(@cds.clusters[i], @cds.centroids[i]).sum(0.0) / @cds.clusters[i].length
    end
    worst_ratios = cluster_ids.map do |i|
      Calculations.array_except(cluster_ids, i).map do |j|
        (0.0 + scatter[i] + scatter[j]) / Calculations::Distance.euclidean(@cds.centroids[i], @cds.centroids[j])
      end.max
    end
    worst_ratios.sum(0.0) / @cds.k
  end

  # Dunn index: smallest distance between points of different clusters
  # divided by the largest distance between points of the same cluster.
  def dunn_index
    inter_min = (0...@cds.k).map do |i|
      # Built once per cluster rather than once per point.
      other_points = Calculations.array_except(@cds.clusters, i).flatten(1)
      @cds.clusters[i].map do |point|
        Calculations::Cluster.distances_to_point(other_points, point).min
      end.min
    end.min
    intra_max = (0...@cds.k).map do |i|
      @cds.clusters[i].map do |point|
        Calculations::Cluster.distances_to_point(@cds.clusters[i], point).max
      end.max
    end.max
    inter_min / intra_max
  end

  # Calinski-Harabasz index: between-cluster dispersion per (k - 1) divided
  # by within-cluster dispersion per (num_objects - k).
  def calinski_harabasz_index
    barycenter = @cds.dataset_barycenter # computed once for all clusters
    top_part = (0...@cds.k).map do |i|
      @cds.clusters[i].length * (Calculations::Distance.euclidean(@cds.centroids[i], barycenter)**2)
    end.sum(0.0) / (@cds.k - 1)

    bot_part = (0...@cds.k).map do |i|
      @cds.clusters[i].map do |point|
        Calculations::Distance.euclidean(point, @cds.centroids[i])**2
      end.sum(0.0)
    end.sum(0.0) / (@cds.num_objects - @cds.k)
    top_part / bot_part
  end

  # C index: (d - d_min) / (d_max - d_min), where d is the sum of all
  # within-cluster pair distances and d_min / d_max are the sums of the
  # pair_count smallest / largest distances of the whole dataset.
  def c_index
    d = (0...@cds.k).map do |i|
      Calculations::DistanceMatrix.new(@cds.clusters[i]).total_sum
    end.sum(0.0)
    # n * (n - 1) is always even, so the integer division is exact.
    pair_count = (0...@cds.k).sum { |i| @cds.clusters[i].length * (@cds.clusters[i].length - 1) } / 2
    sorted_distances = Calculations::DistanceMatrix.new(@cds.data).data.sort
    d_min = sorted_distances.first(pair_count).sum(0.0)
    d_max = sorted_distances.last(pair_count).sum(0.0)
    (d - d_min) / (d_max - d_min)
  end

  # PBM index: (d_b * (e_t / e_w) / k) squared, where d_b is the largest
  # distance between centroids, e_w the summed distances of the points to
  # their own centroid and e_t the summed distances to the dataset barycenter.
  def pbm_index
    d_b = Calculations::DistanceMatrix.new(@cds.centroids).max_distance
    sum_dist_to_centers = (0...@cds.k).map do |i|
      Calculations::Cluster.distances_to_point(@cds.clusters[i], @cds.centroids[i]).sum(Calculations.number(0.0))
    end
    e_w = sum_dist_to_centers.sum(Calculations.number(0.0))
    e_t = Calculations::Cluster.distances_to_point(@cds.data, @cds.dataset_barycenter).sum(0.0)
    (d_b * (e_t / e_w) / @cds.k)**2
  end
end
data/lib/constants.rb ADDED
@@ -0,0 +1,19 @@
1
# Names of the supported clustering indexes and the rule used to pick the
# best number of clusters for each of them.
module Constants
  # Index names accepted by the index calculation methods.
  module Indexes
    SILHOUETTE = "silhouette".freeze
    C = "c".freeze
    DAVIES_BOULDIN = "davies_bouldin".freeze
    DUNN = "dunn".freeze
    CALINSKI_HARABASZ = "calinski_harabasz".freeze
    PBM = "pbm".freeze
  end

  # Maps every index name to "max" or "min": whether the largest or the
  # smallest score marks the best k. Keys reuse the Indexes constants so the
  # two tables cannot drift apart.
  CALCULATION_RULES = {
    Indexes::SILHOUETTE => "max".freeze,
    Indexes::C => "min".freeze,
    Indexes::DAVIES_BOULDIN => "min".freeze,
    Indexes::DUNN => "max".freeze,
    Indexes::CALINSKI_HARABASZ => "max".freeze,
    Indexes::PBM => "max".freeze
  }.freeze
end
@@ -0,0 +1,155 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "kmeans-clusterer"
4
+ require_relative "constants"
5
+ require_relative "clustered_data_set"
6
# Clusters a dataset for every k in 2..max_k, scores each clustering with the
# indexes listed in Constants::Indexes and reports which k the indexes prefer.
#
# Score rows and @clustered_data_sets are addressed by k directly; slots 0
# and 1 hold the placeholder 0 because clustering starts at k = 2.
class IndexCalculator
  attr_reader :data, :labels, :max_k, :k_matrix, :clustered_data_sets

  def initialize(data, labels, max_k = 2)
    @data = data
    @labels = labels
    @max_k = max_k
    @k_matrix = []          # one row of scores per calculated index
    @k_matrix_header = {}   # index name => row number in @k_matrix
    @clustered_data_sets = [0, 0]
    @optimal_k_indexes = {} # index name => array of best k values
  end

  # Drops previously built clustered datasets, keeping the two placeholders.
  def clear_clustered_datasets
    @clustered_data_sets = [0, 0] if @clustered_data_sets.length > 2
  end

  # Replaces @data with the rows read from +filename+ (whitespace-separated
  # numbers, one point per line). Prints a message either way.
  def load_data_from_file(filename)
    if File.exist?(filename)
      @data = []
      clear_clustered_datasets
      read_from_file_to_data(filename)
      puts "Data was successfully loaded from file!"
    else
      puts "Build failed!\nFile with filename '#{filename}' doesnt exist!"
      puts "Current working directory is '#{Dir.pwd}'"
    end
  end

  # Uses the given, already clustered +multi_array+ for every k in 2..max_k.
  def build_by_custom_clusters(multi_array)
    clear_clustered_datasets
    (2..@max_k).each do |k|
      @clustered_data_sets[k] = ClusteredDataSet.new(multi_array)
    end
  end

  # Runs KMeansClusterer on @data for every k in 2..max_k.
  def build_by_kmeans_clustering
    clear_clustered_datasets
    (2..@max_k).each do |k|
      kmeans = KMeansClusterer.run k, @data, labels: @labels, runs: 5
      @clustered_data_sets[k] = ClusteredDataSet.new(kmeans.clusters)
    end
  end

  # Score row of the index named +str+, or nil if it was not calculated yet.
  def get_indexes_by_name(str)
    @k_matrix[@k_matrix_header[str]] if @k_matrix_header.include?(str)
  end

  # (Re)calculates the scores of +index_name+ for every k in 2..max_k.
  # A previously calculated row is reset, a new index gets a row appended
  # at the end of the matrix.
  def calculate_index_k(index_name)
    # The cached optimum was derived from the old scores.
    @optimal_k_indexes.delete(index_name)
    @k_matrix_header[index_name] ||= @k_matrix_header.size
    row = @k_matrix[@k_matrix_header[index_name]] = [0, 0]
    (2..@max_k).each do |k|
      row.push(@clustered_data_sets[k].calculate_index_by_name(index_name))
    end
  end

  def calculate_all_indexes
    Constants::Indexes.constants.each { |ce| calculate_index_k(Constants::Indexes.const_get(ce)) }
  end

  # Array of the k values with the best score for +index_name+ ("best" per
  # Constants::CALCULATION_RULES). Calculates the scores first if needed.
  #
  # Raises ArgumentError when +index_name+ has no calculation rule.
  def optimal_k_indexes(index_name)
    rule = Constants::CALCULATION_RULES.fetch(index_name) do
      raise ArgumentError, "unknown clustering index: #{index_name.inspect}"
    end
    calculate_index_k(index_name) unless @k_matrix_header.include?(index_name)
    @optimal_k_indexes[index_name] ||= send("#{rule}_k_index", index_name)
  end

  def find_all_optimal_k_indexes
    Constants::Indexes.constants.each { |ce| optimal_k_indexes(Constants::Indexes.const_get(ce)) }
  end

  # Recalculates every index and returns the best-k array on which most
  # indexes agree.
  def optimal_k_index
    calculate_all_indexes
    find_all_optimal_k_indexes
    # Only the values take part in the vote; the index names must not.
    Calculations.mode_of_array(@optimal_k_indexes.values)
  end

  # Prints the best k per index, the overall best k and the indexes that
  # voted for it.
  def show_optimal_k_indexes_table
    best_k = optimal_k_index
    puts "Optimal k(number of cluster) by different indexes"
    @optimal_k_indexes.each { |k, v| puts "Best k for #{k} index: #{v}" }
    single = best_k.one?
    s_for_plural = single ? "" : "S"
    puts "THE BEST VALUE#{s_for_plural} OF CLUSTER NUMBER#{s_for_plural} #{single ? "IS" : "ARE"}: #{single ? best_k[0] : best_k}"
    agreeing = @optimal_k_indexes.select { |_name, ks| ks == best_k }.keys
    puts "Indexes which calculated best k: #{agreeing.join(", ")}"
  end

  # All k values sharing the smallest score of +index_name+.
  def min_k_index(index_name)
    best_ks_for(index_name) { |candidate, best| candidate < best }
  end

  # All k values sharing the largest score of +index_name+.
  def max_k_index(index_name)
    best_ks_for(index_name) { |candidate, best| candidate > best }
  end

  def show_data_table
    puts "Dataset :"
    puts @data
  end

  # Prints the scores of +index_name+, calculating them first if needed.
  def show_index_table(index_name)
    calculate_index_k(index_name) unless @k_matrix_header.include?(index_name)
    puts "#{index_name} index scores:"
    puts_index_table index_name
  end

  def puts_index_table(index_name)
    scores = @k_matrix[@k_matrix_header[index_name]]
    (2...scores.length).each do |k|
      puts "k = #{k}\t score = #{scores[k]}"
    end
  end

  private

  # Shared scan behind min_k_index / max_k_index. The block receives
  # (candidate, best so far) and returns true when the candidate is better.
  def best_ks_for(index_name)
    scores = @k_matrix[@k_matrix_header[index_name]]
    best_value = scores[2]
    (2...scores.length).each { |k| best_value = scores[k] if yield(scores[k], best_value) }
    (2...scores.length).select { |k| scores[k] == best_value }
  end

  # Appends one row of Floats per non-blank line of +filename+ to @data.
  def read_from_file_to_data(filename)
    File.foreach(filename) do |line|
      values = line.split
      # A blank line would otherwise become an empty point.
      @data.push(values.map(&:to_f)) unless values.empty?
    end
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: dnks_clustering_indexes
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Danil Kosenko
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2022-06-13 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Gem to calculate clustering indexes
14
+ email: danilkos2013@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/calculations_utility_methods.rb
20
+ - lib/clustered_data_set.rb
21
+ - lib/clustering_indexes.rb
22
+ - lib/constants.rb
23
+ - lib/index_calculator.rb
24
+ homepage: https://rubygems.org/gems/dnks_clustering_indexes
25
+ licenses:
26
+ - MIT
27
+ metadata: {}
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: '0'
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubygems_version: 3.2.3
44
+ signing_key:
45
+ specification_version: 4
46
+ summary: Clustering indexes
47
+ test_files: []