db_clustering 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec +1 -0
- data/.travis.yml +5 -0
- data/Gemfile +15 -11
- data/Gemfile.lock +149 -0
- data/LICENSE.txt +1 -1
- data/README.md +90 -0
- data/Rakefile +6 -6
- data/VERSION +1 -0
- data/lib/algorithms/density_based/dbscan.rb +48 -0
- data/lib/datasource_adapters/active_record.rb +32 -0
- data/lib/datasource_adapters/in_memory.rb +29 -0
- data/lib/db_clustering.rb +34 -0
- data/lib/distance_metrics/average_difference.rb +28 -0
- data/lib/distance_metrics/cosine_similarity.rb +43 -0
- data/lib/distance_metrics/euclidean_distance.rb +32 -0
- data/lib/distance_metrics/pearson_correlation.rb +44 -0
- data/lib/generators/datasource/active_record.rb +0 -0
- data/lib/models/cluster.rb +18 -0
- data/lib/models/point.rb +41 -0
- data/lib/models/vector.rb +30 -0
- data/spec/algorithms/density_based/dbscan_spec.rb +57 -0
- data/spec/datasource_adapters/active_record_spec.rb +0 -0
- data/spec/datasource_adapters/in_memory_spec.rb +82 -0
- data/spec/distance_metrics/average_difference_spec.rb +44 -0
- data/spec/distance_metrics/cosine_similarity_spec.rb +172 -0
- data/spec/distance_metrics/euclidean_distance_spec.rb +137 -0
- data/spec/distance_metrics/pearson_correlation_spec.rb +174 -0
- data/spec/generators/datasource/active_record_spec.rb +0 -0
- data/spec/models/cluster_spec.rb +0 -0
- data/spec/models/point_spec.rb +0 -0
- data/spec/models/vector_spec.rb +0 -0
- data/spec/spec_helper.rb +7 -2
- data/spec/support/dataset_helper.rb +19 -0
- data/spec/support/test_model.rb +9 -0
- metadata +31 -1
@@ -0,0 +1,34 @@
|
|
1
|
+
#
|
2
|
+
# Algorithms
|
3
|
+
#
|
4
|
+
require 'algorithms/density_based/dbscan'
|
5
|
+
|
6
|
+
|
7
|
+
#
|
8
|
+
# Datasource Adapters
|
9
|
+
#
|
10
|
+
require 'datasource_adapters/active_record'
|
11
|
+
require 'datasource_adapters/in_memory'
|
12
|
+
|
13
|
+
|
14
|
+
#
|
15
|
+
# Distance Metrics
|
16
|
+
#
|
17
|
+
require 'distance_metrics/average_difference'
|
18
|
+
require 'distance_metrics/cosine_similarity'
|
19
|
+
require 'distance_metrics/euclidean_distance'
|
20
|
+
require 'distance_metrics/pearson_correlation'
|
21
|
+
|
22
|
+
|
23
|
+
#
|
24
|
+
# Generators
|
25
|
+
#
|
26
|
+
require 'generators/datasource/active_record'
|
27
|
+
|
28
|
+
|
29
|
+
#
|
30
|
+
# Models
|
31
|
+
#
|
32
|
+
require 'models/cluster'
|
33
|
+
require 'models/point'
|
34
|
+
require 'models/vector'
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module DbClustering
  module DistanceMetrics
    # Distance metric defined as the mean absolute difference between the
    # paired components of two vectors.
    class AverageDifference
      include Math

      # min_dimensions - minimum number of comparable components two vectors
      #                  must have; with fewer, #distance reports
      #                  Float::INFINITY instead of a computed value.
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Computes the average of |a_i - b_i| over the comparable components.
      #
      # vector1, vector2 - objects responding to #array_for_comparison(other)
      #                    and returning an Array of numbers.
      #
      # Returns a Float; Float::INFINITY when there is nothing to compare or
      # fewer than min_dimensions components are comparable.
      # Raises RuntimeError when the two comparison arrays differ in size.
      def distance(vector1, vector2)
        vector1_array = vector1.array_for_comparison(vector2)
        vector2_array = vector2.array_for_comparison(vector1)

        if vector1_array.count != vector2_array.count
          raise "Vectors with different sizes cannot be compared"
        end

        # An empty comparison can only get here with min_dimensions <= 0.
        # Summing nothing used to produce nil and crash on the division, so
        # it is reported as "not comparable" like the min_dimensions case.
        if vector1_array.empty? || vector1_array.count < @min_dimensions
          return Float::INFINITY
        end

        total = vector1_array.zip(vector2_array).reduce(0) do |sum, (left, right)|
          sum + (left - right).abs
        end
        total / vector1_array.count.to_f
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module DbClustering
  module DistanceMetrics
    # Distance metric derived from the cosine similarity of two vectors
    # (distance = 1 - similarity).
    class CosineSimilarity
      include Math

      # min_dimensions - minimum number of comparable components two vectors
      #                  must have before a similarity is computed.
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Returns 1.0 - cosine similarity as a Float.
      #
      # When the vectors do not have enough comparable components,
      # #correlation answers with its Float::INFINITY sentinel. Subtracting
      # that from 1.0 would give -INFINITY, which makes non-comparable vectors
      # look like the closest possible neighbours. The sentinel is therefore
      # mapped to an infinite distance, matching the other distance metrics.
      def distance(vector1, vector2)
        similarity = correlation(vector1, vector2)
        return Float::INFINITY if similarity == Float::INFINITY

        1.0 - similarity
      end

      # Computes the cosine similarity of the comparable components.
      #
      # vector1, vector2 - objects responding to #array_for_comparison(other)
      #                    and returning an Array of numbers.
      #
      # Returns a Float. Returns Float::INFINITY (a sentinel, not a real
      # similarity) when fewer than min_dimensions components are comparable.
      # The result is NaN when either vector has zero magnitude (0.0 / 0.0).
      # Raises RuntimeError when the two comparison arrays differ in size.
      def correlation(vector1, vector2)
        vector1_array = vector1.array_for_comparison(vector2)
        vector2_array = vector2.array_for_comparison(vector1)

        if vector1_array.count != vector2_array.count
          raise "Vectors with different sizes cannot be compared"
        end

        if vector1_array.count < @min_dimensions
          return Float::INFINITY
        end

        # see here for calculation formula: https://en.wikipedia.org/wiki/Cosine_similarity
        dot_product = vector1_array.zip(vector2_array).reduce(0) do |sum, (left, right)|
          sum + left * right
        end
        denominator = magnitude(vector1_array) * magnitude(vector2_array)

        dot_product.to_f / denominator
      end

      private

      # Euclidean norm (square root of the sum of squares) of a numeric Array.
      def magnitude(components)
        sqrt(components.reduce(0) { |sum, value| sum + value ** 2 })
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module DbClustering
  module DistanceMetrics
    # Distance metric: straight-line (Euclidean) distance between two vectors.
    class EuclideanDistance
      include Math

      # min_dimensions - minimum number of comparable components two vectors
      #                  must have; with fewer, #distance reports
      #                  Float::INFINITY instead of a computed value.
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Computes sqrt(sum((a_i - b_i)^2)) over the comparable components.
      #
      # vector1, vector2 - objects responding to #array_for_comparison(other)
      #                    and returning an Array of numbers.
      #
      # Returns a Float; Float::INFINITY when fewer than min_dimensions
      # components are comparable.
      # Raises RuntimeError when the two comparison arrays differ in size.
      def distance(vector1, vector2)
        vector1_array = vector1.array_for_comparison(vector2)
        vector2_array = vector2.array_for_comparison(vector1)

        if vector1_array.count != vector2_array.count
          raise "Vectors with different sizes cannot be compared"
        end

        if vector1_array.count < @min_dimensions
          return Float::INFINITY
        end

        # see here for calculation formula: http://en.wikipedia.org/wiki/Euclidean_distance
        squared_total = vector1_array.zip(vector2_array).reduce(0) do |sum, (left, right)|
          sum + (left - right) ** 2
        end
        sqrt(squared_total)
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module DbClustering
  module DistanceMetrics
    # Distance metric derived from the Pearson product-moment correlation
    # coefficient of two vectors (distance = 1 - correlation).
    class PearsonCorrelation
      include Math

      # min_dimensions - minimum number of comparable components two vectors
      #                  must have before a correlation is computed.
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Returns 1.0 - Pearson correlation as a Float.
      #
      # When the vectors do not have enough comparable components,
      # #correlation answers with its Float::INFINITY sentinel. Subtracting
      # that from 1.0 would give -INFINITY, which makes non-comparable vectors
      # look like the closest possible neighbours. The sentinel is therefore
      # mapped to an infinite distance, matching the other distance metrics.
      def distance(vector1, vector2)
        coefficient = correlation(vector1, vector2)
        return Float::INFINITY if coefficient == Float::INFINITY

        1.0 - coefficient
      end

      # Computes the Pearson correlation of the comparable components.
      #
      # vector1, vector2 - objects responding to #array_for_comparison(other)
      #                    and returning an Array of numbers.
      #
      # Returns a Float. Returns Float::INFINITY (a sentinel, not a real
      # coefficient) when fewer than min_dimensions components are comparable.
      # The result is NaN when either vector has no variance (0.0 / 0.0).
      # Raises RuntimeError when the two comparison arrays differ in size.
      def correlation(vector1, vector2)
        vector1_array = vector1.array_for_comparison(vector2)
        vector2_array = vector2.array_for_comparison(vector1)

        if vector1_array.count != vector2_array.count
          raise "Vectors with different sizes cannot be compared"
        end

        if vector1_array.count < @min_dimensions
          return Float::INFINITY
        end

        # see here for calculation formula: http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
        v1_mean = vector1_array.reduce(:+) / vector1_array.count.to_f
        v2_mean = vector2_array.reduce(:+) / vector2_array.count.to_f

        covariance_sum = vector1_array.zip(vector2_array).reduce(0) do |sum, (left, right)|
          sum + (left - v1_mean) * (right - v2_mean)
        end
        denominator = deviation_norm(vector1_array, v1_mean) * deviation_norm(vector2_array, v2_mean)

        covariance_sum.to_f / denominator
      end

      private

      # Square root of the summed squared deviations of +components+ from +mean+.
      def deviation_norm(components, mean)
        sqrt(components.reduce(0) { |sum, value| sum + (value - mean) ** 2 })
      end
    end
  end
end
|
File without changes
|
data/lib/models/point.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
module DbClustering
  module Models
    # Wraps a datasource record together with two pieces of clustering state:
    # the cluster it has been assigned to (if any) and a noise flag.
    class Point
      attr_accessor :cluster, :is_noise, :datasource_point

      # datasource_point - the wrapped record; #vector expects it to respond
      #                    to #clustering_vector.
      def initialize(datasource_point)
        @datasource_point = datasource_point
        @cluster = nil
        @is_noise = false
      end

      # Builds a Vector from the record's clustering_vector.
      #
      # Raises RuntimeError unless clustering_vector is a Hash or an Array.
      def vector
        raw_vector = @datasource_point.clustering_vector

        unless raw_vector.is_a?(Hash) || raw_vector.is_a?(Array)
          raise "clustering_vector method needs to result to a Hash or an Array object"
        end

        DbClustering::Models::Vector.new(object: raw_vector)
      end

      # Truthy once the point has been flagged as noise or placed in a cluster.
      def visited?
        is_noise || !cluster.nil?
      end

      # Truthy when the point carries the noise flag and also has a cluster.
      def is_edge_point?
        is_noise && !cluster.nil?
      end

      # True when the point has a cluster and does not carry the noise flag.
      def is_core_point?
        !(is_noise || cluster.nil?)
      end

      # Truthy when the point carries the noise flag and has no cluster.
      def is_noise_point?
        is_noise && cluster.nil?
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module DbClustering
  module Models
    # A clustering vector backed either by a Hash (named dimensions) or by an
    # Array (positional dimensions).
    class Vector
      # The backing Hash, or nil for array-backed vectors.
      #
      # NOTE(review): this reader shadows Object#hash, so Vector instances do
      # not return an Integer hash code and should not be used as Hash keys or
      # Set members. It is kept because it is part of the public interface.
      attr_reader :hash

      # object - a Hash of dimension => value, or an Array of values.
      def initialize(object:)
        if object.is_a?(Hash)
          @hash = object
        else
          @array = object
        end
      end

      # Returns the values of this vector that can be compared with
      # +other_vector+.
      #
      # Array-backed vectors return their Array unchanged. Hash-backed vectors
      # return the values of the dimensions shared with +other_vector+,
      # ordered by sorted dimension key so both sides line up; when
      # +other_vector+ is nil all values are returned in insertion order.
      #
      # Raises RuntimeError when a hash-backed vector is compared with a
      # vector that is not hash-backed (there are no keys to intersect).
      def array_for_comparison(other_vector)
        return @array unless @hash
        return @hash.values unless other_vector

        other_hash = other_vector.hash
        unless other_hash.is_a?(Hash)
          raise "Hash vectors can only be compared with other hash vectors"
        end

        # Map over the sorted shared keys instead of scanning the shared-key
        # list once per entry.
        (@hash.keys & other_hash.keys).sort.map { |key| @hash[key] }
      end
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'support/test_model'
|
3
|
+
|
4
|
+
describe DbClustering::Algorithms::Dbscan do
  # Shared wiring: every example group supplies its own `dataset`; the
  # datasource, metric and algorithm instance are built from it on demand.
  let(:in_memory_datasource) { DbClustering::DatasourceAdapters::InMemory.new(array: dataset) }
  let(:average_difference_metric) { DbClustering::DistanceMetrics::AverageDifference.new }
  let(:dbscan) do
    DbClustering::Algorithms::Dbscan.new(
      datasource: in_memory_datasource,
      distance_metric: average_difference_metric
    )
  end

  describe "initialization" do
    let(:dataset) { DatasetHelper.normal_distribution }

    it "should initialize successfully" do
      expect(dbscan).to be_a(DbClustering::Algorithms::Dbscan)
    end
  end

  describe "#cluster" do
    let(:clusters_count) { 10 }
    let(:dataset) do
      DatasetHelper.normal_distribution(vector_size: 10, clusters: clusters_count, datapoints: 100)
    end

    # Run the clustering once per example so each expectation sees the result.
    before(:each) do
      dbscan.cluster(max_distance: 10, min_neighbors: 5)
    end

    it "changes all points to clustered or noise – not both" do
      dbscan.datasource.iterate_all_points do |point|
        expect(point.is_core_point? || point.is_edge_point? || point.is_noise).to eq(true)
        expect(point.is_core_point? && point.is_edge_point?).to eq(false)
        expect(point.is_core_point? && point.is_noise_point?).to eq(false)
        expect(point.is_edge_point? && point.is_noise_point?).to eq(false)
      end
    end

    it "visits all points" do
      in_memory_datasource.iterate_all_points do |point|
        expect(point.visited?).to eq(true)
      end
    end

    it "finds all clusters" do
      expect(dbscan.clusters.count).to eq(clusters_count)
    end
  end

  describe "#expand_cluster" do
    pending "should expand cluster with one point and missing points in cluster"
    pending "should expand cluster with several points and missing points in cluster"
    pending "should expand cluster with several points and without missing points in cluster"
  end
end
|
File without changes
|
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'simple-random'
|
3
|
+
|
4
|
+
describe DbClustering::DatasourceAdapters::InMemory, type: :model do

  # The following examples are currently disabled (kept verbatim):
  #
  # describe "#initialize" do
  #   it "initializes with an array" do
  #     expect(DbClustering::DatasourceAdapters::InMemory.new(array: [])).to be_a(DbClustering::DatasourceAdapters::InMemory)
  #   end
  # end
  #
  # describe "#iterate_all_points" do
  #   before(:each) do
  #     @in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: (1..100).to_a)
  #   end
  #
  #   it "iterates through all points" do
  #     x = 0
  #     @in_memory.iterate_all_points { |p| x += 1 }
  #     expect(x).to eq(100)
  #   end
  # end

  describe "#neighbors" do
    # One generated dataset per example; the probe point wraps its first record.
    let(:dataset) { DatasetHelper.normal_distribution(vector_size: 16, clusters: 8, datapoints: 80) }
    let(:in_memory) { DbClustering::DatasourceAdapters::InMemory.new(array: dataset) }
    let(:first_point) { DbClustering::Models::Point.new(dataset.first) }

    context "average difference" do
      let(:average_difference) { DbClustering::DistanceMetrics::AverageDifference.new }

      it "finds all neighbors" do
        neighbors = in_memory.neighbors(point: first_point, distance_metric: average_difference, max_distance: 10)

        expect(neighbors.count).to eq(10)
        expect(neighbors.first).to be_a(DbClustering::Models::Point)
      end
    end

    context "cosine similarity" do
      let(:cosine_similarity) { DbClustering::DistanceMetrics::CosineSimilarity.new }

      it "finds all neighbors" do
        neighbors = in_memory.neighbors(point: first_point, distance_metric: cosine_similarity, max_distance: 0.25)

        expect(neighbors.count).to eq(40)
        expect(neighbors.first).to be_a(DbClustering::Models::Point)
      end
    end

    context "euclidean distance" do
      let(:euclidean_distance) { DbClustering::DistanceMetrics::EuclideanDistance.new }

      it "finds all neighbors" do
        neighbors = in_memory.neighbors(point: first_point, distance_metric: euclidean_distance, max_distance: 50)

        expect(neighbors.count).to eq(10)
        expect(neighbors.first).to be_a(DbClustering::Models::Point)
      end
    end

    context "pearson correlation" do
      let(:pearson_correlation) { DbClustering::DistanceMetrics::PearsonCorrelation.new }

      it "finds all neighbors" do
        neighbors = in_memory.neighbors(point: first_point, distance_metric: pearson_correlation, max_distance: 0.705)

        expect(neighbors.count).to eq(10)
        expect(neighbors.first).to be_a(DbClustering::Models::Point)
      end
    end

  end

end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe DbClustering::DistanceMetrics::AverageDifference, type: :model do

  let(:average_difference) { DbClustering::DistanceMetrics::AverageDifference.new }

  # Wraps both raw objects in Vectors and checks the metric's distance
  # against +distance+ with a tolerance of 0.001.
  def expect_distance(object1, object2, distance)
    vector1 = DbClustering::Models::Vector.new(object: object1)
    vector2 = DbClustering::Models::Vector.new(object: object2)
    actual = average_difference.distance(vector1, vector2)

    expect(actual).to be_within(0.001).of(distance)
  end

  describe "#distance" do

    context "using array object" do

      it "works with 6 dimensional examples" do
        a1 = [-100, -50, 0, 10, 20, 30]
        a2 = [-100, -50, 0, 20, 30, 40]

        expect_distance(a1, a2, 5.0)

        # Each step overwrites one component of a1 in place (changes
        # accumulate) and states the distance expected afterwards.
        steps = [
          [0, 100, 38.333333333333336],
          [1, 50, 55],
          [3, 20, 53.333333333333333],
          [4, 30, 51.666666666666664],
          [5, 40, 50]
        ]

        steps.each do |index, value, expected|
          a1[index] = value
          expect_distance(a1, a2, expected)
        end
      end

    end
  end

end
|