db_clustering 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/.rspec +1 -0
  3. data/.travis.yml +5 -0
  4. data/Gemfile +15 -11
  5. data/Gemfile.lock +149 -0
  6. data/LICENSE.txt +1 -1
  7. data/README.md +90 -0
  8. data/Rakefile +6 -6
  9. data/VERSION +1 -0
  10. data/lib/algorithms/density_based/dbscan.rb +48 -0
  11. data/lib/datasource_adapters/active_record.rb +32 -0
  12. data/lib/datasource_adapters/in_memory.rb +29 -0
  13. data/lib/db_clustering.rb +34 -0
  14. data/lib/distance_metrics/average_difference.rb +28 -0
  15. data/lib/distance_metrics/cosine_similarity.rb +43 -0
  16. data/lib/distance_metrics/euclidean_distance.rb +32 -0
  17. data/lib/distance_metrics/pearson_correlation.rb +44 -0
  18. data/lib/generators/datasource/active_record.rb +0 -0
  19. data/lib/models/cluster.rb +18 -0
  20. data/lib/models/point.rb +41 -0
  21. data/lib/models/vector.rb +30 -0
  22. data/spec/algorithms/density_based/dbscan_spec.rb +57 -0
  23. data/spec/datasource_adapters/active_record_spec.rb +0 -0
  24. data/spec/datasource_adapters/in_memory_spec.rb +82 -0
  25. data/spec/distance_metrics/average_difference_spec.rb +44 -0
  26. data/spec/distance_metrics/cosine_similarity_spec.rb +172 -0
  27. data/spec/distance_metrics/euclidean_distance_spec.rb +137 -0
  28. data/spec/distance_metrics/pearson_correlation_spec.rb +174 -0
  29. data/spec/generators/datasource/active_record_spec.rb +0 -0
  30. data/spec/models/cluster_spec.rb +0 -0
  31. data/spec/models/point_spec.rb +0 -0
  32. data/spec/models/vector_spec.rb +0 -0
  33. data/spec/spec_helper.rb +7 -2
  34. data/spec/support/dataset_helper.rb +19 -0
  35. data/spec/support/test_model.rb +9 -0
  36. metadata +31 -1
@@ -0,0 +1,34 @@
1
+ #
2
+ # Algorithms
3
+ #
4
+ require 'algorithms/density_based/dbscan'
5
+
6
+
7
+ #
8
+ # Datasource Adapters
9
+ #
10
+ require 'datasource_adapters/active_record'
11
+ require 'datasource_adapters/in_memory'
12
+
13
+
14
+ #
15
+ # Distance Metrics
16
+ #
17
+ require 'distance_metrics/average_difference'
18
+ require 'distance_metrics/cosine_similarity'
19
+ require 'distance_metrics/euclidean_distance'
20
+ require 'distance_metrics/pearson_correlation'
21
+
22
+
23
+ #
24
+ # Generators
25
+ #
26
+ require 'generators/datasource/active_record'
27
+
28
+
29
+ #
30
+ # Models
31
+ #
32
+ require 'models/cluster'
33
+ require 'models/point'
34
+ require 'models/vector'
@@ -0,0 +1,28 @@
1
module DbClustering
  module DistanceMetrics
    # Distance metric defined as the mean of the absolute component-wise
    # differences between two vectors.
    class AverageDifference
      include Math

      # min_dimensions - smallest number of compared components for which a
      #                  finite distance is reported (defaults to 1).
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Returns the average absolute difference between the comparable
      # components of vector1 and vector2.
      #
      # Both arguments must respond to #array_for_comparison(other).
      # Returns Float::INFINITY when fewer than min_dimensions components are
      # compared. Raises RuntimeError when the two arrays differ in size.
      def distance(vector1, vector2)
        left = vector1.array_for_comparison(vector2)
        right = vector2.array_for_comparison(vector1)

        raise "Vectors with different sizes cannot be compared" unless left.count == right.count
        return Float::INFINITY if left.count < @min_dimensions

        total = left.zip(right).map { |a, b| (a - b).abs }.reduce(&:+)
        total / left.count.to_f
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module DbClustering
  module DistanceMetrics
    # Distance metric derived from the cosine similarity of two vectors:
    # distance = 1 - cosine similarity.
    class CosineSimilarity
      include Math

      # min_dimensions - smallest number of compared components for which a
      #                  real similarity is computed (defaults to 1).
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Returns 1.0 minus the cosine similarity of the two vectors.
      #
      # Returns Float::INFINITY when fewer than min_dimensions components are
      # compared, so such pairs are treated as "too far apart" exactly like the
      # other distance metrics do. Raises RuntimeError when the comparable
      # arrays differ in size.
      def distance(vector1, vector2)
        similarity = correlation(vector1, vector2)

        # #correlation flags "too few dimensions" with +Infinity. Subtracting
        # that from 1.0 would give -Infinity, i.e. the closest possible
        # distance, so map the flag to +Infinity explicitly.
        return Float::INFINITY if similarity == Float::INFINITY

        1.0 - similarity
      end

      # Returns the cosine similarity of the comparable components of the two
      # vectors, or Float::INFINITY when fewer than min_dimensions components
      # are compared. A zero-magnitude vector makes the denominator zero, in
      # which case the result is NaN.
      #
      # Raises RuntimeError when the comparable arrays differ in size.
      def correlation(vector1, vector2)
        vector1_array = vector1.array_for_comparison(vector2)
        vector2_array = vector2.array_for_comparison(vector1)

        if vector1_array.count != vector2_array.count
          raise "Vectors with different sizes cannot be compared"
        end

        return Float::INFINITY if vector1_array.count < @min_dimensions

        # see here for calculation formula: https://en.wikipedia.org/wiki/Cosine_similarity
        numerator = vector1_array.zip(vector2_array).reduce(0) { |sum, (v1i, v2i)| sum + v1i * v2i }

        left_sqrt = sqrt(vector1_array.reduce(0) { |sum, v1i| sum + v1i ** 2 })
        right_sqrt = sqrt(vector2_array.reduce(0) { |sum, v2i| sum + v2i ** 2 })
        denominator = left_sqrt * right_sqrt

        numerator.to_f / denominator
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
module DbClustering
  module DistanceMetrics
    # Distance metric: straight-line (Euclidean) distance between two vectors.
    class EuclideanDistance
      include Math

      # min_dimensions - smallest number of compared components for which a
      #                  finite distance is reported (defaults to 1).
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Returns the square root of the summed squared component differences.
      #
      # Returns Float::INFINITY when fewer than min_dimensions components are
      # compared. Raises RuntimeError when the comparable arrays differ in size.
      def distance(vector1, vector2)
        left = vector1.array_for_comparison(vector2)
        right = vector2.array_for_comparison(vector1)

        raise "Vectors with different sizes cannot be compared" unless left.count == right.count
        return Float::INFINITY if left.count < @min_dimensions

        # see here for calculation formula: http://en.wikipedia.org/wiki/Euclidean_distance
        squared_total = left.zip(right).reduce(0) { |acc, (a, b)| acc + (a - b) ** 2 }
        sqrt(squared_total)
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
module DbClustering
  module DistanceMetrics
    # Distance metric derived from the Pearson product-moment correlation of
    # two vectors: distance = 1 - correlation.
    class PearsonCorrelation
      include Math

      # min_dimensions - smallest number of compared components for which a
      #                  real correlation is computed (defaults to 1).
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Returns 1.0 minus the Pearson correlation of the two vectors.
      #
      # Returns Float::INFINITY when fewer than min_dimensions components are
      # compared, so such pairs are treated as "too far apart" exactly like the
      # other distance metrics do. Raises RuntimeError when the comparable
      # arrays differ in size.
      def distance(vector1, vector2)
        coefficient = correlation(vector1, vector2)

        # #correlation flags "too few dimensions" with +Infinity. Subtracting
        # that from 1.0 would give -Infinity, i.e. the closest possible
        # distance, so map the flag to +Infinity explicitly.
        return Float::INFINITY if coefficient == Float::INFINITY

        1.0 - coefficient
      end

      # Returns the Pearson correlation coefficient of the comparable
      # components of the two vectors, or Float::INFINITY when fewer than
      # min_dimensions components are compared. When either array has no
      # variance the denominator is zero and the result is NaN.
      #
      # Raises RuntimeError when the comparable arrays differ in size.
      def correlation(vector1, vector2)
        vector1_array = vector1.array_for_comparison(vector2)
        vector2_array = vector2.array_for_comparison(vector1)

        if vector1_array.count != vector2_array.count
          raise "Vectors with different sizes cannot be compared"
        end

        return Float::INFINITY if vector1_array.count < @min_dimensions

        # see here for calculation formula: http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
        v1_mean = vector1_array.reduce(:+) / vector1_array.count.to_f
        v2_mean = vector2_array.reduce(:+) / vector2_array.count.to_f

        numerator = vector1_array.zip(vector2_array).reduce(0) do |sum, (v1i, v2i)|
          sum + (v1i - v1_mean) * (v2i - v2_mean)
        end

        left_sqrt = sqrt(vector1_array.reduce(0) { |sum, v1i| sum + (v1i - v1_mean) ** 2 })
        right_sqrt = sqrt(vector2_array.reduce(0) { |sum, v2i| sum + (v2i - v2_mean) ** 2 })
        denominator = left_sqrt * right_sqrt

        numerator.to_f / denominator
      end
    end
  end
end
File without changes
@@ -0,0 +1,18 @@
1
module DbClustering
  module Models
    # A cluster: an ordered collection of the points assigned to it.
    class Cluster
      # Points that have been added to this cluster, in insertion order.
      attr_accessor :points

      # Starts out with no points.
      def initialize
        @points = []
      end

      # Appends point to this cluster and records the cluster on the point
      # through its #cluster= writer. Returns the cluster itself.
      def add(point)
        @points.push(point)
        point.cluster = self
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
module DbClustering
  module Models
    # Wraps one datasource record and tracks its clustering state: the
    # cluster it belongs to (if any) and whether it has been flagged as noise.
    class Point
      attr_accessor :cluster, :is_noise, :datasource_point

      # datasource_point - the wrapped record; #vector calls its
      #                    #clustering_vector method.
      def initialize(datasource_point)
        @datasource_point = datasource_point
        @cluster = nil
        @is_noise = false
      end

      # Builds a Vector from the wrapped record's clustering_vector.
      #
      # Raises RuntimeError unless clustering_vector is a Hash or an Array.
      def vector
        raw = @datasource_point.clustering_vector

        case raw
        when Hash, Array
          DbClustering::Models::Vector.new(object: raw)
        else
          raise "clustering_vector method needs to result to a Hash or an Array object"
        end
      end

      # Truthy once the point is flagged as noise or assigned to a cluster.
      def visited?
        is_noise || !cluster.nil?
      end

      # Flagged as noise and also assigned to a cluster.
      def is_edge_point?
        is_noise && !cluster.nil?
      end

      # Assigned to a cluster without being flagged as noise.
      def is_core_point?
        !(is_noise || cluster.nil?)
      end

      # Flagged as noise and not assigned to any cluster.
      def is_noise_point?
        is_noise && cluster.nil?
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
module DbClustering
  module Models
    # A clustering vector backed either by a Hash (sparse, keyed components)
    # or by any other object, normally an Array (dense components).
    class Vector
      # The backing Hash, or nil for array-backed vectors.
      #
      # NOTE: this reader shadows Object#hash, so Vector instances should not
      # be used as Hash keys or Set members. It is kept because it is part of
      # the existing public interface.
      attr_reader :hash

      # object - a Hash for a keyed vector; anything else is stored as the
      #          dense component list.
      def initialize(object:)
        if object.is_a?(Hash)
          @hash = object
          @array = nil
        else
          @hash = nil
          @array = object
        end
      end

      # Returns the components of this vector that can be compared with
      # other_vector.
      #
      # * Array-backed vector: the stored object, unchanged.
      # * Hash-backed vector, other_vector nil/false: all values.
      # * Hash-backed vector, other_vector given: the values of the keys both
      #   vectors share, ordered by sorting the selected key/value pairs.
      #
      # Raises RuntimeError when a hash-backed vector is compared with a
      # vector whose #hash is not a Hash (e.g. an array-backed vector).
      def array_for_comparison(other_vector)
        return @array unless @hash
        return @hash.values unless other_vector

        other_hash = other_vector.hash
        unless other_hash.is_a?(Hash)
          raise "A hash-based vector can only be compared with another hash-based vector"
        end

        # Hash#key? keeps the shared-key filter linear in the number of keys.
        @hash.select { |key, _value| other_hash.key?(key) }.sort.map(&:last)
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
+ require 'spec_helper'
2
+ require 'support/test_model'
3
+
4
describe DbClustering::Algorithms::Dbscan do
  describe "initialization" do
    # Default generated dataset served through the in-memory adapter.
    let(:dataset) { DatasetHelper.normal_distribution }
    let(:datasource) { DbClustering::DatasourceAdapters::InMemory.new(array: dataset) }
    let(:metric) { DbClustering::DistanceMetrics::AverageDifference.new }
    let(:dbscan) { described_class.new(datasource: datasource, distance_metric: metric) }

    it "should initialize successfully" do
      expect(dbscan).to be_a(described_class)
    end
  end

  describe "#cluster" do
    let(:clusters_count) { 10 }
    let(:dataset) do
      DatasetHelper.normal_distribution(vector_size: 10, clusters: clusters_count, datapoints: 100)
    end
    let(:datasource) { DbClustering::DatasourceAdapters::InMemory.new(array: dataset) }
    let(:metric) { DbClustering::DistanceMetrics::AverageDifference.new }
    let(:dbscan) { described_class.new(datasource: datasource, distance_metric: metric) }

    # Every example below inspects the state left behind by one clustering run.
    before do
      dbscan.cluster(max_distance: 10, min_neighbors: 5)
    end

    it "changes all points to clustered or noise – not both" do
      dbscan.datasource.iterate_all_points do |point|
        expect(point.is_core_point? || point.is_edge_point? || point.is_noise).to eq(true)
        expect(point.is_core_point? && point.is_edge_point?).to eq(false)
        expect(point.is_core_point? && point.is_noise_point?).to eq(false)
        expect(point.is_edge_point? && point.is_noise_point?).to eq(false)
      end
    end

    it "visits all points" do
      datasource.iterate_all_points do |point|
        expect(point.visited?).to eq(true)
      end
    end

    it "finds all clusters" do
      expect(dbscan.clusters.count).to eq(clusters_count)
    end
  end

  describe "#expand_cluster" do
    pending "should expand cluster with one point and missing points in cluster"
    pending "should expand cluster with several points and missing points in cluster"
    pending "should expand cluster with several points and without missing points in cluster"
  end
end
File without changes
@@ -0,0 +1,82 @@
1
+ require 'spec_helper'
2
+ require 'simple-random'
3
+
4
describe DbClustering::DatasourceAdapters::InMemory, type: :model do

  # Disabled specs, kept for reference:
  #
  # describe "#initialize" do
  #   it "initializes with an array" do
  #     expect(DbClustering::DatasourceAdapters::InMemory.new(array: [])).to be_a(DbClustering::DatasourceAdapters::InMemory)
  #   end
  # end
  #
  # describe "#iterate_all_points" do
  #   before(:each) do
  #     @in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: (1..100).to_a)
  #   end
  #
  #   it "iterates through all points" do
  #     x = 0
  #     @in_memory.iterate_all_points { |p| x += 1 }
  #     expect(x).to eq(100)
  #   end
  # end

  describe "#neighbors" do
    let(:dataset) { DatasetHelper.normal_distribution(vector_size: 16, clusters: 8, datapoints: 80) }
    let(:in_memory) { described_class.new(array: dataset) }
    let(:first_point) { DbClustering::Models::Point.new(dataset.first) }

    # One context per distance metric:
    # [context name, metric class, max_distance, expected neighbor count]
    [
      ["average difference", DbClustering::DistanceMetrics::AverageDifference, 10, 10],
      ["cosine similarity", DbClustering::DistanceMetrics::CosineSimilarity, 0.25, 40],
      ["euclidean distance", DbClustering::DistanceMetrics::EuclideanDistance, 50, 10],
      ["pearson correlation", DbClustering::DistanceMetrics::PearsonCorrelation, 0.705, 10]
    ].each do |label, metric_class, max_distance, expected_count|
      context label do
        let(:metric) { metric_class.new }

        it "finds all neighbors" do
          neighbors = in_memory.neighbors(point: first_point, distance_metric: metric, max_distance: max_distance)
          expect(neighbors.count).to eq(expected_count)
          expect(neighbors.first).to be_a(DbClustering::Models::Point)
        end
      end
    end

  end

end
@@ -0,0 +1,44 @@
1
+ require 'spec_helper'
2
+
3
describe DbClustering::DistanceMetrics::AverageDifference, type: :model do

  before do
    @average_difference = described_class.new
  end

  describe "#distance" do

    context "using array object" do

      it "works with 6 dimensional examples" do
        a1 = [-100, -50, 0, 10, 20, 30]
        a2 = [-100, -50, 0, 20, 30, 40]

        expect_distance(a1, a2, 5.0)

        # Each step overwrites one component of a1 (cumulatively) and states
        # the distance expected afterwards: [index, new value, distance].
        [
          [0, 100, 38.333333333333336],
          [1, 50, 55],
          [3, 20, 53.333333333333333],
          [4, 30, 51.666666666666664],
          [5, 40, 50]
        ].each do |index, value, expected|
          a1[index] = value
          expect_distance(a1, a2, expected)
        end
      end

    end
  end

  # Wraps both raw objects in Vector models and checks that the metric's
  # distance is within 0.001 of the expected value.
  def expect_distance(object1, object2, distance)
    first, second = [object1, object2].map { |raw| DbClustering::Models::Vector.new(object: raw) }
    expect(@average_difference.distance(first, second)).to be_within(0.001).of(distance)
  end

end