db_clustering 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/.rspec +1 -0
  3. data/.travis.yml +5 -0
  4. data/Gemfile +15 -11
  5. data/Gemfile.lock +149 -0
  6. data/LICENSE.txt +1 -1
  7. data/README.md +90 -0
  8. data/Rakefile +6 -6
  9. data/VERSION +1 -0
  10. data/lib/algorithms/density_based/dbscan.rb +48 -0
  11. data/lib/datasource_adapters/active_record.rb +32 -0
  12. data/lib/datasource_adapters/in_memory.rb +29 -0
  13. data/lib/db_clustering.rb +34 -0
  14. data/lib/distance_metrics/average_difference.rb +28 -0
  15. data/lib/distance_metrics/cosine_similarity.rb +43 -0
  16. data/lib/distance_metrics/euclidean_distance.rb +32 -0
  17. data/lib/distance_metrics/pearson_correlation.rb +44 -0
  18. data/lib/generators/datasource/active_record.rb +0 -0
  19. data/lib/models/cluster.rb +18 -0
  20. data/lib/models/point.rb +41 -0
  21. data/lib/models/vector.rb +30 -0
  22. data/spec/algorithms/density_based/dbscan_spec.rb +57 -0
  23. data/spec/datasource_adapters/active_record_spec.rb +0 -0
  24. data/spec/datasource_adapters/in_memory_spec.rb +82 -0
  25. data/spec/distance_metrics/average_difference_spec.rb +44 -0
  26. data/spec/distance_metrics/cosine_similarity_spec.rb +172 -0
  27. data/spec/distance_metrics/euclidean_distance_spec.rb +137 -0
  28. data/spec/distance_metrics/pearson_correlation_spec.rb +174 -0
  29. data/spec/generators/datasource/active_record_spec.rb +0 -0
  30. data/spec/models/cluster_spec.rb +0 -0
  31. data/spec/models/point_spec.rb +0 -0
  32. data/spec/models/vector_spec.rb +0 -0
  33. data/spec/spec_helper.rb +7 -2
  34. data/spec/support/dataset_helper.rb +19 -0
  35. data/spec/support/test_model.rb +9 -0
  36. metadata +31 -1
@@ -0,0 +1,34 @@
1
+ #
2
+ # Algorithms
3
+ #
4
+ require 'algorithms/density_based/dbscan'
5
+
6
+
7
+ #
8
+ # Datasource Adapters
9
+ #
10
+ require 'datasource_adapters/active_record'
11
+ require 'datasource_adapters/in_memory'
12
+
13
+
14
+ #
15
+ # Distance Metrics
16
+ #
17
+ require 'distance_metrics/average_difference'
18
+ require 'distance_metrics/cosine_similarity'
19
+ require 'distance_metrics/euclidean_distance'
20
+ require 'distance_metrics/pearson_correlation'
21
+
22
+
23
+ #
24
+ # Generators
25
+ #
26
+ require 'generators/datasource/active_record'
27
+
28
+
29
+ #
30
+ # Models
31
+ #
32
+ require 'models/cluster'
33
+ require 'models/point'
34
+ require 'models/vector'
@@ -0,0 +1,28 @@
1
module DbClustering
  module DistanceMetrics
    # Distance metric that averages the absolute per-dimension differences
    # of two vectors (mean absolute difference).
    class AverageDifference
      include Math

      # @param min_dimensions [Integer] minimum number of compared dimensions
      #   required before a real distance is computed (defaults to 1)
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Computes the mean absolute difference between two vectors.
      #
      # Both arguments must respond to +array_for_comparison(other)+ and
      # return the values to compare, in matching order.
      #
      # @param vector1 [#array_for_comparison]
      # @param vector2 [#array_for_comparison]
      # @return [Float] the average difference, or Float::INFINITY when fewer
      #   than +min_dimensions+ dimensions (or no dimensions at all) are compared
      # @raise [RuntimeError] when the two comparison arrays differ in size
      def distance(vector1, vector2)
        values1 = vector1.array_for_comparison(vector2)
        values2 = vector2.array_for_comparison(vector1)

        if values1.count != values2.count
          raise "Vectors with different sizes cannot be compared"
        end

        dimensions = values1.count

        # An average over zero dimensions is undefined. Previously this path
        # (reachable with min_dimensions: 0) crashed on nil; report the vectors
        # as incomparable instead, like the min_dimensions check does.
        return Float::INFINITY if dimensions.zero? || dimensions < @min_dimensions

        total = values1.zip(values2).reduce(0) { |acc, (a, b)| acc + (a - b).abs }
        total / dimensions.to_f
      end

    end
  end
end
@@ -0,0 +1,43 @@
1
module DbClustering
  module DistanceMetrics
    # Cosine-similarity based metric. +correlation+ is the cosine of the angle
    # between two vectors; +distance+ is one minus that value.
    class CosineSimilarity
      include Math

      # @param min_dimensions [Integer] minimum number of compared dimensions
      #   required before a real similarity is computed (defaults to 1)
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Cosine distance: 1.0 - cosine similarity.
      #
      # @param vector1 [#array_for_comparison]
      # @param vector2 [#array_for_comparison]
      # @return [Float] the distance, or Float::INFINITY when the vectors share
      #   fewer than +min_dimensions+ dimensions
      # @raise [RuntimeError] when the two comparison arrays differ in size
      def distance(vector1, vector2)
        similarity = correlation(vector1, vector2)

        # +correlation+ signals "too few dimensions" with Float::INFINITY.
        # Subtracting that from 1.0 would give -INFINITY, which would make
        # incomparable vectors look like the closest possible neighbours.
        return Float::INFINITY if similarity == Float::INFINITY

        1.0 - similarity
      end

      # Cosine similarity of the two vectors.
      # see here for calculation formula: https://en.wikipedia.org/wiki/Cosine_similarity
      #
      # @param vector1 [#array_for_comparison]
      # @param vector2 [#array_for_comparison]
      # @return [Float] the similarity, or Float::INFINITY as a sentinel when
      #   fewer than +min_dimensions+ dimensions are compared. When either
      #   vector has zero magnitude the division is 0.0 / 0.0 and yields NaN.
      # @raise [RuntimeError] when the two comparison arrays differ in size
      def correlation(vector1, vector2)
        values1 = vector1.array_for_comparison(vector2)
        values2 = vector2.array_for_comparison(vector1)

        if values1.count != values2.count
          raise "Vectors with different sizes cannot be compared"
        end

        return Float::INFINITY if values1.count < @min_dimensions

        dot_product = values1.zip(values2).reduce(0) { |acc, (a, b)| acc + a * b }
        magnitude1 = sqrt(values1.reduce(0) { |acc, a| acc + a ** 2 })
        magnitude2 = sqrt(values2.reduce(0) { |acc, b| acc + b ** 2 })

        dot_product.to_f / (magnitude1 * magnitude2)
      end

    end
  end
end
@@ -0,0 +1,32 @@
1
module DbClustering
  module DistanceMetrics
    # Straight-line (Euclidean) distance between two vectors.
    class EuclideanDistance
      include Math

      # @param min_dimensions [Integer] minimum number of compared dimensions
      #   required before a real distance is computed (defaults to 1)
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Square root of the summed squared per-dimension differences.
      # see here for calculation formula: http://en.wikipedia.org/wiki/Euclidean_distance
      #
      # @param vector1 [#array_for_comparison]
      # @param vector2 [#array_for_comparison]
      # @return [Float] the distance, or Float::INFINITY when fewer than
      #   +min_dimensions+ dimensions are compared
      # @raise [RuntimeError] when the two comparison arrays differ in size
      def distance(vector1, vector2)
        left = vector1.array_for_comparison(vector2)
        right = vector2.array_for_comparison(vector1)

        raise "Vectors with different sizes cannot be compared" unless left.count == right.count
        return Float::INFINITY if left.count < @min_dimensions

        squared_total = left.zip(right).reduce(0) do |acc, (a, b)|
          acc + (a - b) ** 2
        end

        sqrt(squared_total)
      end

    end
  end
end
@@ -0,0 +1,44 @@
1
module DbClustering
  module DistanceMetrics
    # Pearson product-moment correlation based metric. +correlation+ is the
    # correlation coefficient; +distance+ is one minus that value.
    class PearsonCorrelation
      include Math

      # @param min_dimensions [Integer] minimum number of compared dimensions
      #   required before a real correlation is computed (defaults to 1)
      def initialize(min_dimensions: 1)
        @min_dimensions = min_dimensions
      end

      # Pearson distance: 1.0 - correlation coefficient.
      #
      # @param vector1 [#array_for_comparison]
      # @param vector2 [#array_for_comparison]
      # @return [Float] the distance, or Float::INFINITY when the vectors
      #   cannot be compared (too few dimensions)
      # @raise [RuntimeError] when the two comparison arrays differ in size
      def distance(vector1, vector2)
        coefficient = correlation(vector1, vector2)

        # +correlation+ signals "too few dimensions" with Float::INFINITY.
        # Subtracting that from 1.0 would give -INFINITY, which would make
        # incomparable vectors look like the closest possible neighbours.
        return Float::INFINITY if coefficient == Float::INFINITY

        1.0 - coefficient
      end

      # Pearson correlation coefficient of the two vectors.
      # see here for calculation formula: http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
      #
      # @param vector1 [#array_for_comparison]
      # @param vector2 [#array_for_comparison]
      # @return [Float] the coefficient, or Float::INFINITY as a sentinel when
      #   fewer than +min_dimensions+ dimensions (or none at all) are compared.
      #   When either vector has no variance the division is 0.0 / 0.0 and
      #   yields NaN.
      # @raise [RuntimeError] when the two comparison arrays differ in size
      def correlation(vector1, vector2)
        values1 = vector1.array_for_comparison(vector2)
        values2 = vector2.array_for_comparison(vector1)

        if values1.count != values2.count
          raise "Vectors with different sizes cannot be compared"
        end

        dimensions = values1.count

        # The mean of zero values is undefined. Previously this path
        # (reachable with min_dimensions: 0) crashed on nil; use the same
        # sentinel as the min_dimensions check instead.
        return Float::INFINITY if dimensions.zero? || dimensions < @min_dimensions

        mean1 = values1.reduce(:+) / dimensions.to_f
        mean2 = values2.reduce(:+) / dimensions.to_f

        numerator = values1.zip(values2).reduce(0) do |acc, (a, b)|
          acc + (a - mean1) * (b - mean2)
        end

        deviation1 = sqrt(values1.reduce(0) { |acc, a| acc + (a - mean1) ** 2 })
        deviation2 = sqrt(values2.reduce(0) { |acc, b| acc + (b - mean2) ** 2 })

        numerator.to_f / (deviation1 * deviation2)
      end

    end
  end
end
File without changes
@@ -0,0 +1,18 @@
1
module DbClustering
  module Models
    # A group of points that were assigned to the same cluster.
    class Cluster

      # Points currently held by this cluster.
      attr_reader :points
      attr_writer :points

      # Starts out with no points.
      def initialize
        @points = Array.new
      end

      # Appends +point+ to the cluster and sets the point's +cluster+
      # back-reference to this cluster.
      #
      # @param point [#cluster=] the point to add
      # @return [Cluster] this cluster (the value of the final assignment)
      def add(point)
        @points << point
        point.cluster = self
      end

    end
  end
end
@@ -0,0 +1,41 @@
1
module DbClustering
  module Models
    # Wraps a datasource record and tracks its clustering state: the cluster
    # it was assigned to (if any) and whether it was flagged as noise.
    class Point

      attr_accessor :cluster, :is_noise, :datasource_point

      # @param datasource_point [#clustering_vector] the wrapped record
      def initialize(datasource_point)
        @datasource_point = datasource_point
        @cluster = nil
        @is_noise = false
      end

      # Builds a Vector from the wrapped record's +clustering_vector+.
      #
      # @return [DbClustering::Models::Vector]
      # @raise [RuntimeError] when +clustering_vector+ is neither a Hash nor an Array
      def vector
        raw_vector = @datasource_point.clustering_vector

        unless raw_vector.is_a?(Hash) || raw_vector.is_a?(Array)
          raise "clustering_vector method needs to result to a Hash or an Array object"
        end

        DbClustering::Models::Vector.new(object: raw_vector)
      end

      # Truthy once the point was flagged as noise or assigned to a cluster.
      def visited?
        is_noise || !cluster.nil?
      end

      # Flagged as noise and also assigned to a cluster.
      def is_edge_point?
        is_noise && !cluster.nil?
      end

      # Assigned to a cluster without being flagged as noise.
      def is_core_point?
        !(is_noise || cluster.nil?)
      end

      # Flagged as noise and not assigned to any cluster.
      def is_noise_point?
        is_noise && cluster.nil?
      end

    end
  end
end
@@ -0,0 +1,30 @@
1
module DbClustering
  module Models
    # Holds the values of a point, backed either by a Hash (sparse, keyed
    # dimensions) or by any other object that is used as-is (normally an Array).
    class Vector

      # The backing Hash, or nil when the vector was built from a non-Hash.
      #
      # NOTE(review): this reader replaces Object#hash, so instances should not
      # be used as Hash keys or in Set/uniq. It is kept because
      # +array_for_comparison+ reads +other_vector.hash+.
      attr_reader :hash

      # @param object [Hash, Array] the raw vector data
      def initialize(object:)
        if object.is_a?(Hash)
          @hash = object
        else
          @array = object
        end
      end

      # Returns the values to use when comparing this vector with +other_vector+.
      #
      # * Non-Hash backed: the backing object unchanged.
      # * Hash backed, no +other_vector+: all values in insertion order.
      # * Hash backed, with +other_vector+: the values for the keys both
      #   hashes share, ordered by sorted key so both sides line up.
      #
      # @param other_vector [Vector, nil]
      # @return [Array]
      # @raise [RuntimeError] when this vector is Hash backed but
      #   +other_vector+ is not (the original failed with NoMethodError on nil)
      def array_for_comparison(other_vector)
        return @array unless @hash
        return @hash.values unless other_vector

        other_hash = other_vector.hash
        unless other_hash.is_a?(Hash)
          raise "Hash vectors can only be compared with other Hash vectors"
        end

        # Intersect the keys once and look the values up directly, rather than
        # scanning the shared-key array for every entry of the hash.
        (@hash.keys & other_hash.keys).sort.map { |key| @hash[key] }
      end

    end
  end
end
@@ -0,0 +1,57 @@
1
+ require 'spec_helper'
2
+ require 'support/test_model'
3
+
4
describe DbClustering::Algorithms::Dbscan do
  # Shared wiring: an in-memory datasource over the example's dataset and an
  # average-difference metric, both memoized per example.
  let(:in_memory_datasource) { DbClustering::DatasourceAdapters::InMemory.new(array: dataset) }
  let(:average_difference_metric) { DbClustering::DistanceMetrics::AverageDifference.new }
  let(:dbscan) do
    DbClustering::Algorithms::Dbscan.new(datasource: in_memory_datasource, distance_metric: average_difference_metric)
  end

  describe "initialization" do
    let(:dataset) { DatasetHelper.normal_distribution }

    it "should initialize successfully" do
      expect(dbscan).to be_a(DbClustering::Algorithms::Dbscan)
    end
  end

  describe "#cluster" do
    let(:clusters_count) { 10 }
    let(:dataset) do
      DatasetHelper.normal_distribution(vector_size: 10, clusters: clusters_count, datapoints: 100)
    end

    # Run the clustering before every example so each one inspects the result.
    before(:each) do
      dbscan.cluster(max_distance: 10, min_neighbors: 5)
    end

    it "changes all points to clustered or noise – not both" do
      dbscan.datasource.iterate_all_points do |point|
        classified = point.is_core_point? || point.is_edge_point? || point.is_noise

        expect(classified).to eq(true)
        expect(point.is_core_point? && point.is_edge_point?).to eq(false)
        expect(point.is_core_point? && point.is_noise_point?).to eq(false)
        expect(point.is_edge_point? && point.is_noise_point?).to eq(false)
      end
    end

    it "visits all points" do
      in_memory_datasource.iterate_all_points do |point|
        expect(point.visited?).to eq(true)
      end
    end

    it "finds all clusters" do
      expect(dbscan.clusters.count).to eq(clusters_count)
    end
  end

  describe "#expand_cluster" do
    pending "should expand cluster with one point and missing points in cluster"
    pending "should expand cluster with several points and missing points in cluster"
    pending "should expand cluster with several points and without missing points in cluster"
  end
end
File without changes
@@ -0,0 +1,82 @@
1
+ require 'spec_helper'
2
+ require 'simple-random'
3
+
4
describe DbClustering::DatasourceAdapters::InMemory, type: :model do

  # The examples below are disabled; they are kept exactly as they were.
  #
  # describe "#initialize" do
  #   it "initializes with an array" do
  #     expect(DbClustering::DatasourceAdapters::InMemory.new(array: [])).to be_a(DbClustering::DatasourceAdapters::InMemory)
  #   end
  # end
  #
  # describe "#iterate_all_points" do
  #   before(:each) do
  #     @in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: (1..100).to_a)
  #   end
  #
  #   it "iterates through all points" do
  #     x = 0
  #     @in_memory.iterate_all_points { |p| x += 1 }
  #     expect(x).to eq(100)
  #   end
  # end

  describe "#neighbors" do
    let(:dataset) { DatasetHelper.normal_distribution(vector_size: 16, clusters: 8, datapoints: 80) }
    let(:in_memory) { DbClustering::DatasourceAdapters::InMemory.new(array: dataset) }
    let(:first_point) { DbClustering::Models::Point.new(dataset.first) }

    # Each context supplies its own +metric+ and +max_distance+.
    let(:neighbors) do
      in_memory.neighbors(point: first_point, distance_metric: metric, max_distance: max_distance)
    end

    context "average difference" do
      let(:metric) { DbClustering::DistanceMetrics::AverageDifference.new }
      let(:max_distance) { 10 }

      it "finds all neighbors" do
        expect(neighbors.count).to eq(10)
        expect(neighbors.first).to be_a(DbClustering::Models::Point)
      end
    end

    context "cosine similarity" do
      let(:metric) { DbClustering::DistanceMetrics::CosineSimilarity.new }
      let(:max_distance) { 0.25 }

      it "finds all neighbors" do
        expect(neighbors.count).to eq(40)
        expect(neighbors.first).to be_a(DbClustering::Models::Point)
      end
    end

    context "euclidean distance" do
      let(:metric) { DbClustering::DistanceMetrics::EuclideanDistance.new }
      let(:max_distance) { 50 }

      it "finds all neighbors" do
        expect(neighbors.count).to eq(10)
        expect(neighbors.first).to be_a(DbClustering::Models::Point)
      end
    end

    context "pearson correlation" do
      let(:metric) { DbClustering::DistanceMetrics::PearsonCorrelation.new }
      let(:max_distance) { 0.705 }

      it "finds all neighbors" do
        expect(neighbors.count).to eq(10)
        expect(neighbors.first).to be_a(DbClustering::Models::Point)
      end
    end

  end

end
@@ -0,0 +1,44 @@
1
+ require 'spec_helper'
2
+
3
describe DbClustering::DistanceMetrics::AverageDifference, type: :model do

  let(:average_difference) { DbClustering::DistanceMetrics::AverageDifference.new }

  describe "#distance" do

    context "using array object" do

      it "works with 6 dimensional examples" do
        a1 = [-100, -50, 0, 10, 20, 30]
        a2 = [-100, -50, 0, 20, 30, 40]

        expect_distance(a1, a2, 5.0)

        # Each step overwrites one element of a1 in place, then re-checks the
        # distance against the unchanged a2.
        steps = [
          [0, 100, 38.333333333333336],
          [1, 50, 55],
          [3, 20, 53.333333333333333],
          [4, 30, 51.666666666666664],
          [5, 40, 50]
        ]

        steps.each do |index, value, expected|
          a1[index] = value
          expect_distance(a1, a2, expected)
        end
      end

    end
  end

  # Wraps both raw objects in Vectors and asserts the metric's distance is
  # within 0.001 of +distance+.
  def expect_distance(object1, object2, distance)
    vectors = [object1, object2].map { |object| DbClustering::Models::Vector.new(object: object) }
    actual = average_difference.distance(*vectors)
    expect(actual).to be_within(0.001).of(distance)
  end

end