db_clustering 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. checksums.yaml +4 -4
  2. data/.rspec +1 -0
  3. data/.travis.yml +5 -0
  4. data/Gemfile +15 -11
  5. data/Gemfile.lock +149 -0
  6. data/LICENSE.txt +1 -1
  7. data/README.md +90 -0
  8. data/Rakefile +6 -6
  9. data/VERSION +1 -0
  10. data/lib/algorithms/density_based/dbscan.rb +48 -0
  11. data/lib/datasource_adapters/active_record.rb +32 -0
  12. data/lib/datasource_adapters/in_memory.rb +29 -0
  13. data/lib/db_clustering.rb +34 -0
  14. data/lib/distance_metrics/average_difference.rb +28 -0
  15. data/lib/distance_metrics/cosine_similarity.rb +43 -0
  16. data/lib/distance_metrics/euclidean_distance.rb +32 -0
  17. data/lib/distance_metrics/pearson_correlation.rb +44 -0
  18. data/lib/generators/datasource/active_record.rb +0 -0
  19. data/lib/models/cluster.rb +18 -0
  20. data/lib/models/point.rb +41 -0
  21. data/lib/models/vector.rb +30 -0
  22. data/spec/algorithms/density_based/dbscan_spec.rb +57 -0
  23. data/spec/datasource_adapters/active_record_spec.rb +0 -0
  24. data/spec/datasource_adapters/in_memory_spec.rb +82 -0
  25. data/spec/distance_metrics/average_difference_spec.rb +44 -0
  26. data/spec/distance_metrics/cosine_similarity_spec.rb +172 -0
  27. data/spec/distance_metrics/euclidean_distance_spec.rb +137 -0
  28. data/spec/distance_metrics/pearson_correlation_spec.rb +174 -0
  29. data/spec/generators/datasource/active_record_spec.rb +0 -0
  30. data/spec/models/cluster_spec.rb +0 -0
  31. data/spec/models/point_spec.rb +0 -0
  32. data/spec/models/vector_spec.rb +0 -0
  33. data/spec/spec_helper.rb +7 -2
  34. data/spec/support/dataset_helper.rb +19 -0
  35. data/spec/support/test_model.rb +9 -0
  36. metadata +31 -1
@@ -0,0 +1,172 @@
1
+ require 'spec_helper'
2
+
3
# Specs for DbClustering::DistanceMetrics::CosineSimilarity.
#
# Every example starts from two nearly identical inputs, then overwrites one
# component of the first input at a time and re-checks the metric after each
# change. The expected values are pre-computed literals.
describe DbClustering::DistanceMetrics::CosineSimilarity, type: :model do

  # Metric under test; built lazily and memoized per example.
  let(:cosine_similarity) { DbClustering::DistanceMetrics::CosineSimilarity.new }

  describe "#distance" do

    context "using array object" do

      # Each expected distance is written as `1.0 - c`, where `c` is the value
      # the matching "#correlation" example below expects for the same inputs.
      it "works with 6 dimensional examples" do
        a1 = [-100, -50, 0, 10, 20, 30]
        a2 = [-100, -50, 0, 20, 30, 40]

        expect_distance(a1, a2, 1.0 - 0.9910606701839321)

        a1[0] = 100
        expect_distance(a1, a2, 1.0 - -0.37591956455252595)

        a1[1] = 50
        expect_distance(a1, a2, 1.0 - -0.7176646232366405)

        a1[3] = 20
        expect_distance(a1, a2, 1.0 - -0.6965185577824071)

        a1[4] = 30
        expect_distance(a1, a2, 1.0 - -0.6646315788560506)

        a1[5] = 40
        expect_distance(a1, a2, 1.0 - -0.6233766233766233)
      end

    end
  end

  describe "#correlation" do

    context "using array object" do
      it "works with 6 dimensional examples" do
        a1 = [-100, -50, 0, 10, 20, 30]
        a2 = [-100, -50, 0, 20, 30, 40]

        expect_correlation(a1, a2, 0.9910606701839321)

        a1[0] = 100
        expect_correlation(a1, a2, -0.37591956455252595)

        a1[1] = 50
        expect_correlation(a1, a2, -0.7176646232366405)

        a1[3] = 20
        expect_correlation(a1, a2, -0.6965185577824071)

        a1[4] = 30
        expect_correlation(a1, a2, -0.6646315788560506)

        a1[5] = 40
        expect_correlation(a1, a2, -0.6233766233766233)
      end

      it "works with 10 dimensional example" do
        a1 = [-100, -75, -50, -25, 0, 10, 30, 50, 70, 90]
        a2 = [-100, -75, -50, -25, 0, 20, 40, 60, 80, 100]

        expect_correlation(a1, a2, 0.9960326819057044)

        a1[0] = 100
        expect_correlation(a1, a2, 0.46833324778347685)

        a1[1] = 75
        expect_correlation(a1, a2, 0.1715023160897239)

        a1[2] = 50
        expect_correlation(a1, a2, 0.039577457559167056)

        a1[3] = 25
        expect_correlation(a1, a2, 0.006596242926527844)

        a1[5] = 20
        expect_correlation(a1, a2, 0.011823033079799202)

        a1[6] = 40
        expect_correlation(a1, a2, 0.02211572157270011)

        a1[7] = 60
        expect_correlation(a1, a2, 0.03716711852501086)

        a1[8] = 80
        expect_correlation(a1, a2, 0.056548774822804536)

        a1[9] = 100
        expect_correlation(a1, a2, 0.07975460122699386)
      end

      # Both arrays hold 201 values: 101 from (-100..0) plus 100 more.
      it "works with 201 dimensional example" do
        a1 = (-100..0).to_a + (-9..90).to_a
        a2 = (-100..0).to_a + (1..100).to_a

        expect_correlation(a1, a2, 0.994666206187772)

        a1[0] = 100
        expect_correlation(a1, a2, 0.962897882770724)

        a1[1] = 99
        expect_correlation(a1, a2, 0.9317617489896753)

        a1[2] = 98
        expect_correlation(a1, a2, 0.9012514511799424)

        a1[3] = 97
        expect_correlation(a1, a2, 0.871360635676842)

        # Indices 0..99 of a1 now hold the negation of a2's values.
        (4..99).each { |i| a1[i] = 100 - i }
        expect_correlation(a1, a2, -0.08021501662804613)

        a1[101] = 1
        expect_correlation(a1, a2, -0.0802046101750023)

        a1[102] = 2
        expect_correlation(a1, a2, -0.08017694707226601)

        a1[103] = 3
        expect_correlation(a1, a2, -0.08013202587413609)

        a1[104] = 4
        expect_correlation(a1, a2, -0.08006984697332015)

        # Indices 101..200 of a1 now equal a2's values; the two halves cancel.
        (5..100).each { |i| a1[100 + i] = i }
        expect_correlation(a1, a2, 0.0)
      end
    end

    context "using hash object" do
      # The hashes share only the keys a, b, c, g, h and i. The expected values
      # are identical to the 6 dimensional array example above, so presumably
      # only the shared keys take part in the computation.
      it "works with 6 dimensional examples" do
        a1 = {a: -100, b: -50, c: 0, d: 100, e: 100, f: 100, g: 10, h: 20, i: 30}
        a2 = {a: -100, b: -50, c: 0, g: 20, h: 30, i: 40, j: -100, k: -100, l: -100}

        expect_correlation(a1, a2, 0.9910606701839321)

        a1[:a] = 100
        expect_correlation(a1, a2, -0.37591956455252595)

        a1[:b] = 50
        expect_correlation(a1, a2, -0.7176646232366405)

        a1[:g] = 20
        expect_correlation(a1, a2, -0.6965185577824071)

        a1[:h] = 30
        expect_correlation(a1, a2, -0.6646315788560506)

        a1[:i] = 40
        expect_correlation(a1, a2, -0.6233766233766233)
      end
    end
  end

  # Wraps a raw Array or Hash in the vector model the metric is called with.
  def build_vector(object)
    DbClustering::Models::Vector.new(object: object)
  end

  # Asserts that the correlation of the two raw objects is within 0.001 of
  # +correlation+.
  #
  # NOTE(review): in the 201 dimensional example some consecutive expected
  # values differ by less than this tolerance, so those steps cannot detect a
  # regression on their own.
  def expect_correlation(object1, object2, correlation)
    actual = cosine_similarity.correlation(build_vector(object1), build_vector(object2))
    expect(actual).to be_within(0.001).of(correlation)
  end

  # Asserts that the distance of the two raw objects is within 0.001 of
  # +distance+.
  def expect_distance(object1, object2, distance)
    actual = cosine_similarity.distance(build_vector(object1), build_vector(object2))
    expect(actual).to be_within(0.001).of(distance)
  end

end
@@ -0,0 +1,137 @@
1
+ require 'spec_helper'
2
+
3
# Specs for DbClustering::DistanceMetrics::EuclideanDistance.
#
# Every example starts from two nearly identical inputs, then overwrites one
# component of the first input at a time and re-checks the distance after each
# change. The expected values are pre-computed literals.
describe DbClustering::DistanceMetrics::EuclideanDistance, type: :model do

  # Metric under test; built lazily and memoized per example.
  let(:euclidean_distance) { DbClustering::DistanceMetrics::EuclideanDistance.new }

  describe "#distance" do

    context "using array object" do
      it "works with 6 dimensional examples" do
        a1 = [-100, -50, 0, 10, 20, 30]
        a2 = [-100, -50, 0, 20, 30, 40]

        expect_distance(a1, a2, 17.320508075688775)

        a1[0] = 100
        expect_distance(a1, a2, 200.7485989988473)

        a1[1] = 50
        expect_distance(a1, a2, 224.27661492005805)

        a1[3] = 20
        expect_distance(a1, a2, 224.0535650240808)

        a1[4] = 30
        expect_distance(a1, a2, 223.83029285599392)

        a1[5] = 40
        expect_distance(a1, a2, 223.60679774997897)
      end

      it "works with 10 dimensional example" do
        a1 = [-100, -75, -50, -25, 0, 10, 30, 50, 70, 90]
        a2 = [-100, -75, -50, -25, 0, 20, 40, 60, 80, 100]

        expect_distance(a1, a2, 22.360679774997898)

        a1[0] = 100
        expect_distance(a1, a2, 201.24611797498108)

        a1[1] = 75
        expect_distance(a1, a2, 250.99800796022265)

        a1[2] = 50
        expect_distance(a1, a2, 270.1851217221259)

        a1[3] = 25
        expect_distance(a1, a2, 274.7726332806817)

        a1[5] = 20
        expect_distance(a1, a2, 274.5906043549196)

        a1[6] = 40
        expect_distance(a1, a2, 274.40845468024486)

        a1[7] = 60
        expect_distance(a1, a2, 274.22618401604177)

        a1[8] = 80
        expect_distance(a1, a2, 274.0437921208944)

        a1[9] = 100
        expect_distance(a1, a2, 273.8612787525831)
      end

      # Both arrays hold 201 values: 101 from (-100..0) plus 100 more.
      it "works with 201 dimensional example" do
        a1 = (-100..0).to_a + (-9..90).to_a
        a2 = (-100..0).to_a + (1..100).to_a

        # The last 100 components each differ by 10, the rest are equal.
        expect_distance(a1, a2, 100)

        a1[0] = 100
        expect_distance(a1, a2, 223.60679774997897)

        a1[1] = 99
        expect_distance(a1, a2, 298.67038688159226)

        a1[2] = 98
        expect_distance(a1, a2, 357.2394155185007)

        a1[3] = 97
        expect_distance(a1, a2, 406.5169123173106)

        # Indices 0..99 of a1 now hold the negation of a2's values.
        (4..99).each { |i| a1[i] = 100 - i }
        expect_distance(a1, a2, 1167.6472069936192)

        a1[101] = 1
        expect_distance(a1, a2, 1167.6043850551437)

        a1[102] = 2
        expect_distance(a1, a2, 1167.561561546114)

        a1[103] = 3
        expect_distance(a1, a2, 1167.5187364663575)

        a1[104] = 4
        expect_distance(a1, a2, 1167.4759098157015)

        # Indices 101..200 of a1 now equal a2's values.
        (5..100).each { |i| a1[100 + i] = i }
        expect_distance(a1, a2, 1163.3572108342305)
      end
    end

    context "using hash object" do
      # The hashes share only the keys a, b, c, g, h and i. The expected values
      # are identical to the 6 dimensional array example above, so presumably
      # only the shared keys take part in the computation.
      it "works with 6 dimensional examples" do
        a1 = {a: -100, b: -50, c: 0, d: 100, e: 100, f: 100, g: 10, h: 20, i: 30}
        a2 = {a: -100, b: -50, c: 0, g: 20, h: 30, i: 40, j: -100, k: -100, l: -100}

        expect_distance(a1, a2, 17.320508075688775)

        a1[:a] = 100
        expect_distance(a1, a2, 200.7485989988473)

        a1[:b] = 50
        expect_distance(a1, a2, 224.27661492005805)

        a1[:g] = 20
        expect_distance(a1, a2, 224.0535650240808)

        a1[:h] = 30
        expect_distance(a1, a2, 223.83029285599392)

        a1[:i] = 40
        expect_distance(a1, a2, 223.60679774997897)
      end
    end
  end

  # Wraps a raw Array or Hash in the vector model the metric is called with.
  def build_vector(object)
    DbClustering::Models::Vector.new(object: object)
  end

  # Asserts that the distance of the two raw objects is within 0.001 of
  # +distance+.
  def expect_distance(object1, object2, distance)
    actual = euclidean_distance.distance(build_vector(object1), build_vector(object2))
    expect(actual).to be_within(0.001).of(distance)
  end

end
@@ -0,0 +1,174 @@
1
+ require 'spec_helper'
2
+
3
# Specs for DbClustering::DistanceMetrics::PearsonCorrelation.
#
# Every example starts from two nearly identical inputs, then overwrites one
# component of the first input at a time and re-checks the metric after each
# change. The expected values are pre-computed literals.
describe DbClustering::DistanceMetrics::PearsonCorrelation, type: :model do

  # Metric under test; built lazily and memoized per example.
  let(:pearson_correlation) { DbClustering::DistanceMetrics::PearsonCorrelation.new }

  describe "#distance" do

    context "using array object" do

      # Each expected distance is written as `1.0 - c`, where `c` is the value
      # the matching "#correlation" example below expects for the same inputs.
      it "works with 6 dimensional examples" do
        a1 = [-100, -50, 0, 10, 20, 30]
        a2 = [-100, -50, 0, 20, 30, 40]

        expect_distance(a1, a2, 1.0 - 0.9978980816987033)

        a1[0] = 100
        expect_distance(a1, a2, 1.0 - -0.33178189173568795)

        a1[1] = 50
        expect_distance(a1, a2, 1.0 - -0.8531546818010307)

        a1[3] = 20
        expect_distance(a1, a2, 1.0 - -0.8501701979323958)

        a1[4] = 30
        expect_distance(a1, a2, 1.0 - -0.8251789485121429)

        a1[5] = 40
        expect_distance(a1, a2, 1.0 - -0.777119205197422)
      end

    end
  end

  describe "#correlation" do

    context "using array object" do
      it "works with 6 dimensional examples" do
        a1 = [-100, -50, 0, 10, 20, 30]
        a2 = [-100, -50, 0, 20, 30, 40]

        expect_correlation(a1, a2, 0.9978980816987033)

        a1[0] = 100
        expect_correlation(a1, a2, -0.33178189173568795)

        a1[1] = 50
        expect_correlation(a1, a2, -0.8531546818010307)

        a1[3] = 20
        expect_correlation(a1, a2, -0.8501701979323958)

        a1[4] = 30
        expect_correlation(a1, a2, -0.8251789485121429)

        a1[5] = 40
        expect_correlation(a1, a2, -0.777119205197422)
      end

      it "works with 10 dimensional example" do
        a1 = [-100, -75, -50, -25, 0, 10, 30, 50, 70, 90]
        a2 = [-100, -75, -50, -25, 0, 20, 40, 60, 80, 100]

        expect_correlation(a1, a2, 0.9991021273387496)

        a1[0] = 100
        expect_correlation(a1, a2, 0.47082800718062534)

        a1[1] = 75
        expect_correlation(a1, a2, 0.1556331759412803)

        a1[2] = 50
        expect_correlation(a1, a2, -0.030429030972509225)

        a1[3] = 25
        expect_correlation(a1, a2, -0.11043152607484653)

        a1[5] = 20
        expect_correlation(a1, a2, -0.10683599418231368)

        a1[6] = 40
        expect_correlation(a1, a2, -0.09061095797872151)

        a1[7] = 60
        expect_correlation(a1, a2, -0.061965254978689745)

        a1[8] = 80
        expect_correlation(a1, a2, -0.022715542521212734)

        a1[9] = 100
        expect_correlation(a1, a2, 0.024246432248443597)
      end

      # Both arrays hold 201 values: 101 from (-100..0) plus 100 more.
      it "works with 201 dimensional example" do
        a1 = (-100..0).to_a + (-9..90).to_a
        a2 = (-100..0).to_a + (1..100).to_a

        expect_correlation(a1, a2, 0.9989178188722178)

        a1[0] = 100
        expect_correlation(a1, a2, 0.9655259356163942)

        a1[1] = 99
        expect_correlation(a1, a2, 0.9331992252857959)

        a1[2] = 98
        expect_correlation(a1, a2, 0.9018830671823298)

        a1[3] = 97
        expect_correlation(a1, a2, 0.871527012471479)

        # Indices 0..99 of a1 now hold the negation of a2's values.
        (4..99).each { |i| a1[i] = 100 - i }
        expect_correlation(a1, a2, -0.14729260459452256)

        a1[101] = 1
        expect_correlation(a1, a2, -0.147683155760824)

        a1[102] = 2
        expect_correlation(a1, a2, -0.14803962444596394)

        a1[103] = 3
        expect_correlation(a1, a2, -0.14836161254154293)

        a1[104] = 4
        expect_correlation(a1, a2, -0.14864872717684907)

        # Indices 101..200 of a1 now equal a2's values.
        (5..100).each { |i| a1[100 + i] = i }
        expect_correlation(a1, a2, 0.0)
      end
    end

    context "using hash object" do
      # The hashes share only the keys a, b, c, g, h and i. The expected values
      # are identical to the 6 dimensional array example above, so presumably
      # only the shared keys take part in the computation.
      it "works with 6 dimensional examples" do
        a1 = {a: -100, b: -50, c: 0, d: 100, e: 100, f: 100, g: 10, h: 20, i: 30}
        a2 = {a: -100, b: -50, c: 0, g: 20, h: 30, i: 40, j: -100, k: -100, l: -100}

        expect_correlation(a1, a2, 0.9978980816987033)

        a1[:a] = 100
        expect_correlation(a1, a2, -0.33178189173568795)

        a1[:b] = 50
        expect_correlation(a1, a2, -0.8531546818010307)

        a1[:g] = 20
        expect_correlation(a1, a2, -0.8501701979323958)

        a1[:h] = 30
        expect_correlation(a1, a2, -0.8251789485121429)

        a1[:i] = 40
        expect_correlation(a1, a2, -0.777119205197422)
      end
    end

  end

  # Wraps a raw Array or Hash in the vector model the metric is called with.
  def build_vector(object)
    DbClustering::Models::Vector.new(object: object)
  end

  # Asserts that the correlation of the two raw objects is within 0.001 of
  # +correlation+.
  #
  # NOTE(review): in the 201 dimensional example some consecutive expected
  # values differ by less than this tolerance, so those steps cannot detect a
  # regression on their own.
  def expect_correlation(object1, object2, correlation)
    actual = pearson_correlation.correlation(build_vector(object1), build_vector(object2))
    expect(actual).to be_within(0.001).of(correlation)
  end

  # Asserts that the distance of the two raw objects is within 0.001 of
  # +distance+.
  def expect_distance(object1, object2, distance)
    actual = pearson_correlation.distance(build_vector(object1), build_vector(object2))
    expect(actual).to be_within(0.001).of(distance)
  end

end