db_clustering 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rspec +1 -0
- data/.travis.yml +5 -0
- data/Gemfile +15 -11
- data/Gemfile.lock +149 -0
- data/LICENSE.txt +1 -1
- data/README.md +90 -0
- data/Rakefile +6 -6
- data/VERSION +1 -0
- data/lib/algorithms/density_based/dbscan.rb +48 -0
- data/lib/datasource_adapters/active_record.rb +32 -0
- data/lib/datasource_adapters/in_memory.rb +29 -0
- data/lib/db_clustering.rb +34 -0
- data/lib/distance_metrics/average_difference.rb +28 -0
- data/lib/distance_metrics/cosine_similarity.rb +43 -0
- data/lib/distance_metrics/euclidean_distance.rb +32 -0
- data/lib/distance_metrics/pearson_correlation.rb +44 -0
- data/lib/generators/datasource/active_record.rb +0 -0
- data/lib/models/cluster.rb +18 -0
- data/lib/models/point.rb +41 -0
- data/lib/models/vector.rb +30 -0
- data/spec/algorithms/density_based/dbscan_spec.rb +57 -0
- data/spec/datasource_adapters/active_record_spec.rb +0 -0
- data/spec/datasource_adapters/in_memory_spec.rb +82 -0
- data/spec/distance_metrics/average_difference_spec.rb +44 -0
- data/spec/distance_metrics/cosine_similarity_spec.rb +172 -0
- data/spec/distance_metrics/euclidean_distance_spec.rb +137 -0
- data/spec/distance_metrics/pearson_correlation_spec.rb +174 -0
- data/spec/generators/datasource/active_record_spec.rb +0 -0
- data/spec/models/cluster_spec.rb +0 -0
- data/spec/models/point_spec.rb +0 -0
- data/spec/models/vector_spec.rb +0 -0
- data/spec/spec_helper.rb +7 -2
- data/spec/support/dataset_helper.rb +19 -0
- data/spec/support/test_model.rb +9 -0
- metadata +31 -1
@@ -0,0 +1,34 @@
|
|
1
|
+
#
|
2
|
+
# Algorithms
|
3
|
+
#
|
4
|
+
require 'algorithms/density_based/dbscan'
|
5
|
+
|
6
|
+
|
7
|
+
#
|
8
|
+
# Datasource Adapters
|
9
|
+
#
|
10
|
+
require 'datasource_adapters/active_record'
|
11
|
+
require 'datasource_adapters/in_memory'
|
12
|
+
|
13
|
+
|
14
|
+
#
|
15
|
+
# Distance Metrics
|
16
|
+
#
|
17
|
+
require 'distance_metrics/average_difference'
|
18
|
+
require 'distance_metrics/cosine_similarity'
|
19
|
+
require 'distance_metrics/euclidean_distance'
|
20
|
+
require 'distance_metrics/pearson_correlation'
|
21
|
+
|
22
|
+
|
23
|
+
#
|
24
|
+
# Generators
|
25
|
+
#
|
26
|
+
require 'generators/datasource/active_record'
|
27
|
+
|
28
|
+
|
29
|
+
#
|
30
|
+
# Models
|
31
|
+
#
|
32
|
+
require 'models/cluster'
|
33
|
+
require 'models/point'
|
34
|
+
require 'models/vector'
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module DbClustering
  module DistanceMetrics
    # Distance metric defined as the mean of the absolute per-dimension
    # differences between two vectors.
    class AverageDifference
      include Math

      # @param min_dimensions [Integer] minimum number of comparable
      #   dimensions two vectors must share; below that they are treated as
      #   infinitely far apart.
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Computes the average absolute difference between two vectors.
      #
      # Both arguments must respond to `array_for_comparison(other)` and
      # return the values to compare, position by position.
      #
      # @return [Float] the mean absolute difference, or Float::INFINITY when
      #   the vectors share fewer than `min_dimensions` dimensions or no
      #   dimensions at all.
      # @raise [RuntimeError] when the comparison arrays differ in size.
      def distance(vector1, vector2)
        vector1_array = vector1.array_for_comparison(vector2)
        vector2_array = vector2.array_for_comparison(vector1)

        if vector1_array.count != vector2_array.count
          raise "Vectors with different sizes cannot be compared"
        end

        dimensions = vector1_array.count

        # With nothing to compare there is no evidence of closeness. This also
        # avoids dividing by zero when min_dimensions is configured as 0.
        if dimensions.zero? || dimensions < @min_dimensions
          return Float::INFINITY
        end

        sum = vector1_array.zip(vector2_array).reduce(0) { |acc, (x, y)| acc + (x - y).abs }
        sum / dimensions.to_f
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module DbClustering
  module DistanceMetrics
    # Distance metric derived from cosine similarity:
    # distance = 1 - cos(angle between the two vectors).
    class CosineSimilarity
      include Math

      # @param min_dimensions [Integer] minimum number of comparable
      #   dimensions two vectors must share to be considered comparable.
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Converts the similarity into a distance.
      #
      # @return [Float] `1.0 - correlation`, or Float::INFINITY when the
      #   vectors share fewer than `min_dimensions` dimensions.
      # @raise [RuntimeError] when the comparison arrays differ in size.
      def distance(vector1, vector2)
        similarity = correlation(vector1, vector2)

        # `correlation` signals "not comparable" with Float::INFINITY.
        # Subtracting that from 1.0 would give -Infinity, i.e. the pair would
        # look closer than any real pair. Report it as infinitely far instead.
        return Float::INFINITY if similarity == Float::INFINITY

        1.0 - similarity
      end

      # Cosine similarity of the two vectors.
      #
      # Both arguments must respond to `array_for_comparison(other)`.
      #
      # @return [Float] the cosine similarity, or Float::INFINITY as a
      #   sentinel when fewer than `min_dimensions` dimensions are shared.
      #   For numeric inputs, a zero-magnitude vector makes the denominator
      #   zero and the result NaN.
      # @raise [RuntimeError] when the comparison arrays differ in size.
      def correlation(vector1, vector2)
        vector1_array = vector1.array_for_comparison(vector2)
        vector2_array = vector2.array_for_comparison(vector1)

        if vector1_array.count != vector2_array.count
          raise "Vectors with different sizes cannot be compared"
        end

        if vector1_array.count < @min_dimensions
          return Float::INFINITY
        end

        # see here for calculation formula: https://en.wikipedia.org/wiki/Cosine_similarity
        numerator = vector1_array.zip(vector2_array).reduce(0) { |acc, (a, b)| acc + a * b }

        left_sqrt = sqrt(vector1_array.reduce(0) { |sum, v1i| sum + v1i ** 2 })
        right_sqrt = sqrt(vector2_array.reduce(0) { |sum, v2i| sum + v2i ** 2 })
        denominator = left_sqrt * right_sqrt

        numerator.to_f / denominator
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module DbClustering
  module DistanceMetrics
    # Straight-line (L2) distance between two vectors.
    class EuclideanDistance
      include Math

      # @param min_dimensions [Integer] minimum number of comparable
      #   dimensions; vectors sharing fewer are reported as infinitely apart.
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Both arguments must respond to `array_for_comparison(other)`.
      #
      # @return [Float] square root of the summed squared differences, or
      #   Float::INFINITY when fewer than `min_dimensions` dimensions are shared.
      # @raise [RuntimeError] when the comparison arrays differ in size.
      def distance(vector1, vector2)
        left = vector1.array_for_comparison(vector2)
        right = vector2.array_for_comparison(vector1)

        raise "Vectors with different sizes cannot be compared" unless left.count == right.count
        return Float::INFINITY if left.count < @min_dimensions

        # see here for calculation formula: http://en.wikipedia.org/wiki/Euclidean_distance
        squared_total = left.count.times.reduce(0) do |acc, i|
          acc + (left[i] - right[i]) ** 2
        end

        sqrt(squared_total)
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module DbClustering
  module DistanceMetrics
    # Distance metric derived from the Pearson product-moment correlation
    # coefficient: distance = 1 - r.
    class PearsonCorrelation
      include Math

      # @param min_dimensions [Integer] minimum number of comparable
      #   dimensions two vectors must share to be considered comparable.
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Converts the correlation coefficient into a distance.
      #
      # @return [Float] `1.0 - correlation`, or Float::INFINITY when the
      #   vectors share fewer than `min_dimensions` dimensions.
      # @raise [RuntimeError] when the comparison arrays differ in size.
      def distance(vector1, vector2)
        coefficient = correlation(vector1, vector2)

        # `correlation` signals "not comparable" with Float::INFINITY.
        # Subtracting that from 1.0 would give -Infinity, i.e. the pair would
        # look closer than any real pair. Report it as infinitely far instead.
        return Float::INFINITY if coefficient == Float::INFINITY

        1.0 - coefficient
      end

      # Pearson correlation coefficient of the two vectors.
      #
      # Both arguments must respond to `array_for_comparison(other)`.
      #
      # @return [Float] the coefficient, or Float::INFINITY as a sentinel when
      #   fewer than `min_dimensions` dimensions are shared. For numeric
      #   inputs, a vector with no variance makes the denominator zero and the
      #   result NaN.
      # @raise [RuntimeError] when the comparison arrays differ in size.
      def correlation(vector1, vector2)
        vector1_array = vector1.array_for_comparison(vector2)
        vector2_array = vector2.array_for_comparison(vector1)

        if vector1_array.count != vector2_array.count
          raise "Vectors with different sizes cannot be compared"
        end

        if vector1_array.count < @min_dimensions
          return Float::INFINITY
        end

        # see here for calculation formula: http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
        v1_mean = vector1_array.reduce(:+) / vector1_array.count.to_f
        v2_mean = vector2_array.reduce(:+) / vector2_array.count.to_f

        numerator = vector1_array.zip(vector2_array).reduce(0) do |acc, (a, b)|
          acc + (a - v1_mean) * (b - v2_mean)
        end

        left_sqrt = sqrt(vector1_array.reduce(0) { |sum, v1i| sum + (v1i - v1_mean) ** 2 })
        right_sqrt = sqrt(vector2_array.reduce(0) { |sum, v2i| sum + (v2i - v2_mean) ** 2 })
        denominator = left_sqrt * right_sqrt

        numerator.to_f / denominator
      end
    end
  end
end
|
File without changes
|
data/lib/models/point.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
module DbClustering
  module Models
    # Wraps one datasource record together with its clustering state
    # (assigned cluster and noise flag).
    class Point
      attr_accessor :cluster, :is_noise, :datasource_point

      # @param datasource_point [Object] the wrapped record; it must respond
      #   to `clustering_vector` for #vector to work.
      def initialize(datasource_point)
        @datasource_point = datasource_point
        @cluster = nil
        @is_noise = false
      end

      # Builds a Vector from the wrapped record's `clustering_vector`.
      #
      # @return [DbClustering::Models::Vector]
      # @raise [RuntimeError] when `clustering_vector` is neither a Hash nor
      #   an Array.
      def vector
        raw = @datasource_point.clustering_vector

        unless raw.is_a?(Hash) || raw.is_a?(Array)
          raise "clustering_vector method needs to result to a Hash or an Array object"
        end

        DbClustering::Models::Vector.new(object: raw)
      end

      # Truthy once the point has been flagged as noise or given a cluster.
      def visited?
        is_noise || assigned_to_cluster?
      end

      # Truthy when the point is flagged as noise and also has a cluster.
      def is_edge_point?
        is_noise && assigned_to_cluster?
      end

      # True when the point has a cluster and is not flagged as noise.
      def is_core_point?
        !is_noise && assigned_to_cluster?
      end

      # Truthy when the point is flagged as noise and has no cluster.
      def is_noise_point?
        is_noise && cluster.nil?
      end

      private

      # Whether a cluster has been set on this point.
      def assigned_to_cluster?
        !cluster.nil?
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module DbClustering
  module Models
    # A clustering vector backed either by a Hash (sparse, keyed dimensions)
    # or by any other object handed in, typically an Array of values.
    class Vector
      # NOTE: this reader shadows Object#hash. It returns the backing Hash, or
      # nil for a vector that was not built from a Hash, so instances should
      # not be relied on as Hash keys or Set members.
      attr_reader :hash

      # @param object [Hash, Array] the vector data; a Hash is stored as keyed
      #   dimensions, anything else is stored as-is as the positional values.
      def initialize(object:)
        if object.is_a?(Hash)
          @hash = object
        else
          @array = object
        end
      end

      # Returns the values of this vector that can be compared against
      # `other_vector`.
      #
      # * Not Hash-backed: the stored object, unchanged.
      # * Hash-backed, no other vector given: all values in insertion order.
      # * Hash-backed with another Hash-backed vector: the values of the keys
      #   both share, ordered by sorting the key/value pairs.
      #
      # @param other_vector [Vector, nil]
      # @return [Array]
      # @raise [ArgumentError] when this vector is Hash-backed but
      #   `other_vector` is not, because keyed and positional dimensions
      #   cannot be aligned.
      def array_for_comparison(other_vector)
        return @array unless @hash
        return @hash.values unless other_vector

        other_hash = other_vector.hash
        unless other_hash.is_a?(Hash)
          raise ArgumentError, "A Hash-based vector can only be compared with another Hash-based vector"
        end

        # Hash#key? keeps the intersection linear instead of scanning an
        # array of shared keys for every entry.
        @hash.select { |key, _value| other_hash.key?(key) }.sort.map(&:last)
      end
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'support/test_model'
|
3
|
+
|
4
|
+
describe DbClustering::Algorithms::Dbscan do
  # Shared, lazily built collaborators; each group below supplies `dataset`.
  let(:in_memory_datasource) { DbClustering::DatasourceAdapters::InMemory.new(array: dataset) }
  let(:average_difference_metric) { DbClustering::DistanceMetrics::AverageDifference.new }
  let(:dbscan) do
    DbClustering::Algorithms::Dbscan.new(datasource: in_memory_datasource, distance_metric: average_difference_metric)
  end

  describe "initialization" do
    let(:dataset) { DatasetHelper.normal_distribution }

    it "should initialize successfully" do
      expect(dbscan).to be_a(DbClustering::Algorithms::Dbscan)
    end
  end

  describe "#cluster" do
    let(:clusters_count) { 10 }
    let(:dataset) do
      DatasetHelper.normal_distribution(vector_size: 10, clusters: clusters_count, datapoints: 100)
    end

    # Run the clustering once per example before any expectation is made.
    before(:each) do
      dbscan.cluster(max_distance: 10, min_neighbors: 5)
    end

    it "changes all points to clustered or noise – not both" do
      dbscan.datasource.iterate_all_points do |point|
        expect(point.is_core_point? || point.is_edge_point? || point.is_noise).to eq(true)
        expect(point.is_core_point? && point.is_edge_point?).to eq(false)
        expect(point.is_core_point? && point.is_noise_point?).to eq(false)
        expect(point.is_edge_point? && point.is_noise_point?).to eq(false)
      end
    end

    it "visits all points" do
      in_memory_datasource.iterate_all_points do |point|
        expect(point.visited?).to eq(true)
      end
    end

    it "finds all clusters" do
      expect(dbscan.clusters.count).to eq(clusters_count)
    end
  end

  describe "#expand_cluster" do
    pending "should expand cluster with one point and missing points in cluster"
    pending "should expand cluster with several points and missing points in cluster"
    pending "should expand cluster with several points and without missing points in cluster"
  end
end
|
File without changes
|
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'simple-random'
|
3
|
+
|
4
|
+
describe DbClustering::DatasourceAdapters::InMemory, type: :model do

  # The specs below are commented out in the original file and are kept
  # here unchanged for reference.
  #
  # describe "#initialize" do
  #   it "initializes with an array" do
  #     expect(DbClustering::DatasourceAdapters::InMemory.new(array: [])).to be_a(DbClustering::DatasourceAdapters::InMemory)
  #   end
  # end
  #
  # describe "#iterate_all_points" do
  #   before(:each) do
  #     @in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: (1..100).to_a)
  #   end
  #
  #   it "iterates through all points" do
  #     x = 0
  #     @in_memory.iterate_all_points { |p| x += 1 }
  #     expect(x).to eq(100)
  #   end
  # end

  describe "#neighbors" do
    let(:dataset) { DatasetHelper.normal_distribution(vector_size: 16, clusters: 8, datapoints: 80) }
    let(:in_memory) { DbClustering::DatasourceAdapters::InMemory.new(array: dataset) }
    let(:first_point) { DbClustering::Models::Point.new(dataset.first) }

    # context label => [metric class, max_distance, expected neighbor count]
    {
      "average difference" => [DbClustering::DistanceMetrics::AverageDifference, 10, 10],
      "cosine similarity" => [DbClustering::DistanceMetrics::CosineSimilarity, 0.25, 40],
      "euclidean distance" => [DbClustering::DistanceMetrics::EuclideanDistance, 50, 10],
      "pearson correlation" => [DbClustering::DistanceMetrics::PearsonCorrelation, 0.705, 10]
    }.each do |label, (metric_class, max_distance, expected_count)|
      context label do
        let(:distance_metric) { metric_class.new }

        it "finds all neighbors" do
          neighbors = in_memory.neighbors(point: first_point, distance_metric: distance_metric, max_distance: max_distance)
          expect(neighbors.count).to eq(expected_count)
          expect(neighbors.first).to be_a(DbClustering::Models::Point)
        end
      end
    end
  end

end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe DbClustering::DistanceMetrics::AverageDifference, type: :model do

  let(:average_difference) { DbClustering::DistanceMetrics::AverageDifference.new }

  # Wraps both raw objects in Vector instances and checks that the metric
  # reports `distance` for them, within a tolerance of 0.001.
  def expect_distance(object1, object2, distance)
    vector1 = DbClustering::Models::Vector.new(object: object1)
    vector2 = DbClustering::Models::Vector.new(object: object2)
    expect(average_difference.distance(vector1, vector2)).to be_within(0.001).of(distance)
  end

  describe "#distance" do

    context "using array object" do

      it "works with 6 dimensional examples" do
        a1 = [-100, -50, 0, 10, 20, 30]
        a2 = [-100, -50, 0, 20, 30, 40]

        expect_distance(a1, a2, 5.0)

        # Each step overwrites one coordinate of a1 (index, new value) and
        # states the distance expected afterwards.
        [
          [0, 100, 38.333333333333336],
          [1, 50, 55],
          [3, 20, 53.333333333333333],
          [4, 30, 51.666666666666664],
          [5, 40, 50]
        ].each do |index, value, expected|
          a1[index] = value
          expect_distance(a1, a2, expected)
        end
      end

    end
  end

end
|