db_clustering 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36)
  1. checksums.yaml +4 -4
  2. data/.rspec +1 -0
  3. data/.travis.yml +5 -0
  4. data/Gemfile +15 -11
  5. data/Gemfile.lock +149 -0
  6. data/LICENSE.txt +1 -1
  7. data/README.md +90 -0
  8. data/Rakefile +6 -6
  9. data/VERSION +1 -0
  10. data/lib/algorithms/density_based/dbscan.rb +48 -0
  11. data/lib/datasource_adapters/active_record.rb +32 -0
  12. data/lib/datasource_adapters/in_memory.rb +29 -0
  13. data/lib/db_clustering.rb +34 -0
  14. data/lib/distance_metrics/average_difference.rb +28 -0
  15. data/lib/distance_metrics/cosine_similarity.rb +43 -0
  16. data/lib/distance_metrics/euclidean_distance.rb +32 -0
  17. data/lib/distance_metrics/pearson_correlation.rb +44 -0
  18. data/lib/generators/datasource/active_record.rb +0 -0
  19. data/lib/models/cluster.rb +18 -0
  20. data/lib/models/point.rb +41 -0
  21. data/lib/models/vector.rb +30 -0
  22. data/spec/algorithms/density_based/dbscan_spec.rb +57 -0
  23. data/spec/datasource_adapters/active_record_spec.rb +0 -0
  24. data/spec/datasource_adapters/in_memory_spec.rb +82 -0
  25. data/spec/distance_metrics/average_difference_spec.rb +44 -0
  26. data/spec/distance_metrics/cosine_similarity_spec.rb +172 -0
  27. data/spec/distance_metrics/euclidean_distance_spec.rb +137 -0
  28. data/spec/distance_metrics/pearson_correlation_spec.rb +174 -0
  29. data/spec/generators/datasource/active_record_spec.rb +0 -0
  30. data/spec/models/cluster_spec.rb +0 -0
  31. data/spec/models/point_spec.rb +0 -0
  32. data/spec/models/vector_spec.rb +0 -0
  33. data/spec/spec_helper.rb +7 -2
  34. data/spec/support/dataset_helper.rb +19 -0
  35. data/spec/support/test_model.rb +9 -0
  36. metadata +31 -1
@@ -0,0 +1,172 @@
1
+ require 'spec_helper'
2
+
3
+ describe DbClustering::DistanceMetrics::CosineSimilarity, type: :model do
4
+
5
+ before(:each) do
6
+ @cosine_similarity = DbClustering::DistanceMetrics::CosineSimilarity.new
7
+ end
8
+
9
+ describe "#distance" do
10
+
11
+ context "using array object" do
12
+
13
+ it "works with 6 dimensional examples" do
14
+ a1 = [-100, -50, 0, 10, 20, 30]
15
+ a2 = [-100, -50, 0, 20, 30, 40]
16
+
17
+ expect_distance(a1, a2, 1.0 - 0.9910606701839321)
18
+
19
+ a1[0] = 100
20
+ expect_distance(a1, a2, 1.0 - -0.37591956455252595)
21
+
22
+ a1[1] = 50
23
+ expect_distance(a1, a2, 1.0 - -0.7176646232366405)
24
+
25
+ a1[3] = 20
26
+ expect_distance(a1, a2, 1.0 - -0.6965185577824071)
27
+
28
+ a1[4] = 30
29
+ expect_distance(a1, a2, 1.0 - -0.6646315788560506)
30
+
31
+ a1[5] = 40
32
+ expect_distance(a1, a2, 1.0 - -0.6233766233766233)
33
+ end
34
+
35
+ end
36
+ end
37
+
38
+ describe "#correlation" do
39
+
40
+ context "using array object" do
41
+ it "works with 6 dimensional examples" do
42
+ a1 = [-100, -50, 0, 10, 20, 30]
43
+ a2 = [-100, -50, 0, 20, 30, 40]
44
+
45
+ expect_correlation(a1, a2, 0.9910606701839321)
46
+
47
+ a1[0] = 100
48
+ expect_correlation(a1, a2, -0.37591956455252595)
49
+
50
+ a1[1] = 50
51
+ expect_correlation(a1, a2, -0.7176646232366405)
52
+
53
+ a1[3] = 20
54
+ expect_correlation(a1, a2, -0.6965185577824071)
55
+
56
+ a1[4] = 30
57
+ expect_correlation(a1, a2, -0.6646315788560506)
58
+
59
+ a1[5] = 40
60
+ expect_correlation(a1, a2, -0.6233766233766233)
61
+ end
62
+
63
+ it "works with 10 dimensional example" do
64
+ a1 = [-100, -75, -50, -25, 0, 10, 30, 50, 70, 90]
65
+ a2 = [-100, -75, -50, -25, 0, 20, 40, 60, 80, 100]
66
+
67
+ expect_correlation(a1, a2, 0.9960326819057044)
68
+
69
+ a1[0] = 100
70
+ expect_correlation(a1, a2, 0.46833324778347685)
71
+
72
+ a1[1] = 75
73
+ expect_correlation(a1, a2, 0.1715023160897239)
74
+
75
+ a1[2] = 50
76
+ expect_correlation(a1, a2, 0.039577457559167056)
77
+
78
+ a1[3] = 25
79
+ expect_correlation(a1, a2, 0.006596242926527844)
80
+
81
+ a1[5] = 20
82
+ expect_correlation(a1, a2, 0.011823033079799202)
83
+
84
+ a1[6] = 40
85
+ expect_correlation(a1, a2, 0.02211572157270011)
86
+
87
+ a1[7] = 60
88
+ expect_correlation(a1, a2, 0.03716711852501086)
89
+
90
+ a1[8] = 80
91
+ expect_correlation(a1, a2, 0.056548774822804536)
92
+
93
+ a1[9] = 100
94
+ expect_correlation(a1, a2, 0.07975460122699386)
95
+ end
96
+
97
+ it "works with 200 dimensional example" do
98
+ a1 = (-100..0).to_a + (-9..90).to_a
99
+ a2 = (-100..0).to_a + (1..100).to_a
100
+
101
+ expect_correlation(a1, a2, 0.994666206187772)
102
+
103
+ a1[0] = 100
104
+ expect_correlation(a1, a2, 0.962897882770724)
105
+
106
+ a1[1] = 99
107
+ expect_correlation(a1, a2, 0.9317617489896753)
108
+
109
+ a1[2] = 98
110
+ expect_correlation(a1, a2, 0.9012514511799424)
111
+
112
+ a1[3] = 97
113
+ expect_correlation(a1, a2, 0.871360635676842)
114
+
115
+ (4..99).each{ |i| a1[i] = 100 - i }
116
+ expect_correlation(a1, a2, -0.08021501662804613)
117
+
118
+ a1[101] = 1
119
+ expect_correlation(a1, a2, -0.0802046101750023)
120
+
121
+ a1[102] = 2
122
+ expect_correlation(a1, a2, -0.08017694707226601)
123
+
124
+ a1[103] = 3
125
+ expect_correlation(a1, a2, -0.08013202587413609)
126
+
127
+ a1[104] = 4
128
+ expect_correlation(a1, a2, -0.08006984697332015)
129
+
130
+ (5..100).each{ |i| a1[100+i] = i }
131
+ expect_correlation(a1, a2, 0.0)
132
+ end
133
+ end
134
+
135
+ context "using hash object" do
136
+ it "works with 6 dimensional examples" do
137
+ a1 = {a: -100, b: -50, c: 0, d: 100, e: 100, f: 100, g: 10, h: 20, i: 30}
138
+ a2 = {a: -100, b: -50, c: 0, g: 20, h: 30, i: 40, j: -100, k: -100, l: -100}
139
+
140
+ expect_correlation(a1, a2, 0.9910606701839321)
141
+
142
+ a1[:a] = 100
143
+ expect_correlation(a1, a2, -0.37591956455252595)
144
+
145
+ a1[:b] = 50
146
+ expect_correlation(a1, a2, -0.7176646232366405)
147
+
148
+ a1[:g] = 20
149
+ expect_correlation(a1, a2, -0.6965185577824071)
150
+
151
+ a1[:h] = 30
152
+ expect_correlation(a1, a2, -0.6646315788560506)
153
+
154
+ a1[:i] = 40
155
+ expect_correlation(a1, a2, -0.6233766233766233)
156
+ end
157
+ end
158
+ end
159
+
160
# Asserts that the cosine-similarity correlation of two raw inputs matches an
# expected value.
#
# Each input is wrapped in a DbClustering::Models::Vector before being handed
# to @cosine_similarity (set up in the `before` hook of this spec). The
# examples in this spec pass Arrays and Hashes as inputs.
#
# @param object1 [Object] first raw input, given to Vector.new as `object:`
# @param object2 [Object] second raw input, given to Vector.new as `object:`
# @param correlation [Numeric] expected correlation, matched within an
#   absolute delta of 0.001
def expect_correlation(object1, object2, correlation)
  first, second = [object1, object2].map do |raw|
    DbClustering::Models::Vector.new(object: raw)
  end
  actual = @cosine_similarity.correlation(first, second)

  expect(actual).to be_within(0.001).of(correlation)
end
165
+
166
# Asserts that the cosine-similarity distance between two raw inputs matches
# an expected value.
#
# Each input is wrapped in a DbClustering::Models::Vector before being handed
# to @cosine_similarity (set up in the `before` hook of this spec).
#
# @param object1 [Object] first raw input, given to Vector.new as `object:`
# @param object2 [Object] second raw input, given to Vector.new as `object:`
# @param distance [Numeric] expected distance, matched within an absolute
#   delta of 0.001
def expect_distance(object1, object2, distance)
  first, second = [object1, object2].map do |raw|
    DbClustering::Models::Vector.new(object: raw)
  end
  actual = @cosine_similarity.distance(first, second)

  expect(actual).to be_within(0.001).of(distance)
end
171
+
172
+ end
@@ -0,0 +1,137 @@
1
+ require 'spec_helper'
2
+
3
+ describe DbClustering::DistanceMetrics::EuclideanDistance, type: :model do
4
+
5
+ before(:each) do
6
+ @euclidean_distance = DbClustering::DistanceMetrics::EuclideanDistance.new
7
+ end
8
+
9
+ describe "#distance" do
10
+
11
+ context "using array object" do
12
+ it "works with 6 dimensional examples" do
13
+ a1 = [-100, -50, 0, 10, 20, 30]
14
+ a2 = [-100, -50, 0, 20, 30, 40]
15
+
16
+ expect_distance(a1, a2, 17.320508075688775)
17
+
18
+ a1[0] = 100
19
+ expect_distance(a1, a2, 200.7485989988473)
20
+
21
+ a1[1] = 50
22
+ expect_distance(a1, a2, 224.27661492005805)
23
+
24
+ a1[3] = 20
25
+ expect_distance(a1, a2, 224.0535650240808)
26
+
27
+ a1[4] = 30
28
+ expect_distance(a1, a2, 223.83029285599392)
29
+
30
+ a1[5] = 40
31
+ expect_distance(a1, a2, 223.60679774997897)
32
+ end
33
+
34
+ it "works with 10 dimensional example" do
35
+ a1 = [-100, -75, -50, -25, 0, 10, 30, 50, 70, 90]
36
+ a2 = [-100, -75, -50, -25, 0, 20, 40, 60, 80, 100]
37
+
38
+ expect_distance(a1, a2, 22.360679774997898)
39
+
40
+ a1[0] = 100
41
+ expect_distance(a1, a2, 201.24611797498108)
42
+
43
+ a1[1] = 75
44
+ expect_distance(a1, a2, 250.99800796022265)
45
+
46
+ a1[2] = 50
47
+ expect_distance(a1, a2, 270.1851217221259)
48
+
49
+ a1[3] = 25
50
+ expect_distance(a1, a2, 274.7726332806817)
51
+
52
+ a1[5] = 20
53
+ expect_distance(a1, a2, 274.5906043549196)
54
+
55
+ a1[6] = 40
56
+ expect_distance(a1, a2, 274.40845468024486)
57
+
58
+ a1[7] = 60
59
+ expect_distance(a1, a2, 274.22618401604177)
60
+
61
+ a1[8] = 80
62
+ expect_distance(a1, a2, 274.0437921208944)
63
+
64
+ a1[9] = 100
65
+ expect_distance(a1, a2, 273.8612787525831)
66
+ end
67
+
68
+ it "works with 200 dimensional example" do
69
+ a1 = (-100..0).to_a + (-9..90).to_a
70
+ a2 = (-100..0).to_a + (1..100).to_a
71
+
72
+ expect_distance(a1, a2, 100)
73
+
74
+ a1[0] = 100
75
+ expect_distance(a1, a2, 223.60679774997897)
76
+
77
+ a1[1] = 99
78
+ expect_distance(a1, a2, 298.67038688159226)
79
+
80
+ a1[2] = 98
81
+ expect_distance(a1, a2, 357.2394155185007)
82
+
83
+ a1[3] = 97
84
+ expect_distance(a1, a2, 406.5169123173106)
85
+
86
+ (4..99).each{ |i| a1[i] = 100 - i }
87
+ expect_distance(a1, a2, 1167.6472069936192)
88
+
89
+ a1[101] = 1
90
+ expect_distance(a1, a2, 1167.6043850551437)
91
+
92
+ a1[102] = 2
93
+ expect_distance(a1, a2, 1167.561561546114)
94
+
95
+ a1[103] = 3
96
+ expect_distance(a1, a2, 1167.5187364663575)
97
+
98
+ a1[104] = 4
99
+ expect_distance(a1, a2, 1167.4759098157015)
100
+
101
+ (5..100).each{ |i| a1[100+i] = i }
102
+ expect_distance(a1, a2, 1163.3572108342305)
103
+ end
104
+ end
105
+
106
+ context "using hash object" do
107
+ it "works with 6 dimensional examples" do
108
+ a1 = {a: -100, b: -50, c: 0, d: 100, e: 100, f: 100, g: 10, h: 20, i: 30}
109
+ a2 = {a: -100, b: -50, c: 0, g: 20, h: 30, i: 40, j: -100, k: -100, l: -100}
110
+
111
+ expect_distance(a1, a2, 17.320508075688775)
112
+
113
+ a1[:a] = 100
114
+ expect_distance(a1, a2, 200.7485989988473)
115
+
116
+ a1[:b] = 50
117
+ expect_distance(a1, a2, 224.27661492005805)
118
+
119
+ a1[:g] = 20
120
+ expect_distance(a1, a2, 224.0535650240808)
121
+
122
+ a1[:h] = 30
123
+ expect_distance(a1, a2, 223.83029285599392)
124
+
125
+ a1[:i] = 40
126
+ expect_distance(a1, a2, 223.60679774997897)
127
+ end
128
+ end
129
+ end
130
+
131
# Asserts that the Euclidean distance between two raw inputs matches an
# expected value.
#
# Each input is wrapped in a DbClustering::Models::Vector before being handed
# to @euclidean_distance (set up in the `before` hook of this spec). The
# examples in this spec pass Arrays and Hashes as inputs.
#
# @param object1 [Object] first raw input, given to Vector.new as `object:`
# @param object2 [Object] second raw input, given to Vector.new as `object:`
# @param distance [Numeric] expected distance, matched within an absolute
#   delta of 0.001
def expect_distance(object1, object2, distance)
  first, second = [object1, object2].map do |raw|
    DbClustering::Models::Vector.new(object: raw)
  end
  actual = @euclidean_distance.distance(first, second)

  expect(actual).to be_within(0.001).of(distance)
end
136
+
137
+ end
@@ -0,0 +1,174 @@
1
+ require 'spec_helper'
2
+
3
+ describe DbClustering::DistanceMetrics::PearsonCorrelation, type: :model do
4
+
5
+ before(:each) do
6
+ @pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new
7
+ end
8
+
9
+
10
+ describe "#distance" do
11
+
12
+ context "using array object" do
13
+
14
+ it "works with 6 dimensional examples" do
15
+ a1 = [-100, -50, 0, 10, 20, 30]
16
+ a2 = [-100, -50, 0, 20, 30, 40]
17
+
18
+ expect_distance(a1, a2, 1.0 - 0.9978980816987033)
19
+
20
+ a1[0] = 100
21
+ expect_distance(a1, a2, 1.0 - -0.33178189173568795)
22
+
23
+ a1[1] = 50
24
+ expect_distance(a1, a2, 1.0 - -0.8531546818010307)
25
+
26
+ a1[3] = 20
27
+ expect_distance(a1, a2, 1.0 - -0.8501701979323958)
28
+
29
+ a1[4] = 30
30
+ expect_distance(a1, a2, 1.0 - -0.8251789485121429)
31
+
32
+ a1[5] = 40
33
+ expect_distance(a1, a2, 1.0 - -0.777119205197422)
34
+ end
35
+
36
+ end
37
+ end
38
+
39
+ describe "#correlation" do
40
+
41
+ context "using array object" do
42
+ it "works with 6 dimensional examples" do
43
+ a1 = [-100, -50, 0, 10, 20, 30]
44
+ a2 = [-100, -50, 0, 20, 30, 40]
45
+
46
+ expect_correlation(a1, a2, 0.9978980816987033)
47
+
48
+ a1[0] = 100
49
+ expect_correlation(a1, a2, -0.33178189173568795)
50
+
51
+ a1[1] = 50
52
+ expect_correlation(a1, a2, -0.8531546818010307)
53
+
54
+ a1[3] = 20
55
+ expect_correlation(a1, a2, -0.8501701979323958)
56
+
57
+ a1[4] = 30
58
+ expect_correlation(a1, a2, -0.8251789485121429)
59
+
60
+ a1[5] = 40
61
+ expect_correlation(a1, a2, -0.777119205197422)
62
+ end
63
+
64
+ it "works with 10 dimensional example" do
65
+ a1 = [-100, -75, -50, -25, 0, 10, 30, 50, 70, 90]
66
+ a2 = [-100, -75, -50, -25, 0, 20, 40, 60, 80, 100]
67
+
68
+ expect_correlation(a1, a2, 0.9991021273387496)
69
+
70
+ a1[0] = 100
71
+ expect_correlation(a1, a2, 0.47082800718062534)
72
+
73
+ a1[1] = 75
74
+ expect_correlation(a1, a2, 0.1556331759412803)
75
+
76
+ a1[2] = 50
77
+ expect_correlation(a1, a2, -0.030429030972509225)
78
+
79
+ a1[3] = 25
80
+ expect_correlation(a1, a2, -0.11043152607484653)
81
+
82
+ a1[5] = 20
83
+ expect_correlation(a1, a2, -0.10683599418231368)
84
+
85
+ a1[6] = 40
86
+ expect_correlation(a1, a2, -0.09061095797872151)
87
+
88
+ a1[7] = 60
89
+ expect_correlation(a1, a2, -0.061965254978689745)
90
+
91
+ a1[8] = 80
92
+ expect_correlation(a1, a2, -0.022715542521212734)
93
+
94
+ a1[9] = 100
95
+ expect_correlation(a1, a2, 0.024246432248443597)
96
+ end
97
+
98
+ it "works with 200 dimensional example" do
99
+ a1 = (-100..0).to_a + (-9..90).to_a
100
+ a2 = (-100..0).to_a + (1..100).to_a
101
+
102
+ expect_correlation(a1, a2, 0.9989178188722178)
103
+
104
+ a1[0] = 100
105
+ expect_correlation(a1, a2, 0.9655259356163942)
106
+
107
+ a1[1] = 99
108
+ expect_correlation(a1, a2, 0.9331992252857959)
109
+
110
+ a1[2] = 98
111
+ expect_correlation(a1, a2, 0.9018830671823298)
112
+
113
+ a1[3] = 97
114
+ expect_correlation(a1, a2, 0.871527012471479)
115
+
116
+ (4..99).each{ |i| a1[i] = 100 - i }
117
+ expect_correlation(a1, a2, -0.14729260459452256)
118
+
119
+ a1[101] = 1
120
+ expect_correlation(a1, a2, -0.147683155760824)
121
+
122
+ a1[102] = 2
123
+ expect_correlation(a1, a2, -0.14803962444596394)
124
+
125
+ a1[103] = 3
126
+ expect_correlation(a1, a2, -0.14836161254154293)
127
+
128
+ a1[104] = 4
129
+ expect_correlation(a1, a2, -0.14864872717684907)
130
+
131
+ (5..100).each{ |i| a1[100+i] = i }
132
+ expect_correlation(a1, a2, 0.0)
133
+ end
134
+ end
135
+
136
+ context "using hash object" do
137
+ it "works with 6 dimensional examples" do
138
+ a1 = {a: -100, b: -50, c: 0, d: 100, e: 100, f: 100, g: 10, h: 20, i: 30}
139
+ a2 = {a: -100, b: -50, c: 0, g: 20, h: 30, i: 40, j: -100, k: -100, l: -100}
140
+
141
+ expect_correlation(a1, a2, 0.9978980816987033)
142
+
143
+ a1[:a] = 100
144
+ expect_correlation(a1, a2, -0.33178189173568795)
145
+
146
+ a1[:b] = 50
147
+ expect_correlation(a1, a2, -0.8531546818010307)
148
+
149
+ a1[:g] = 20
150
+ expect_correlation(a1, a2, -0.8501701979323958)
151
+
152
+ a1[:h] = 30
153
+ expect_correlation(a1, a2, -0.8251789485121429)
154
+
155
+ a1[:i] = 40
156
+ expect_correlation(a1, a2, -0.777119205197422)
157
+ end
158
+ end
159
+
160
+ end
161
+
162
# Asserts that the Pearson correlation of two raw inputs matches an expected
# value.
#
# Each input is wrapped in a DbClustering::Models::Vector before being handed
# to @pearson_correlation (set up in the `before` hook of this spec). The
# examples in this spec pass Arrays and Hashes as inputs.
#
# @param object1 [Object] first raw input, given to Vector.new as `object:`
# @param object2 [Object] second raw input, given to Vector.new as `object:`
# @param correlation [Numeric] expected correlation, matched within an
#   absolute delta of 0.001
def expect_correlation(object1, object2, correlation)
  first, second = [object1, object2].map do |raw|
    DbClustering::Models::Vector.new(object: raw)
  end
  actual = @pearson_correlation.correlation(first, second)

  expect(actual).to be_within(0.001).of(correlation)
end
167
+
168
# Asserts that the Pearson-correlation distance between two raw inputs matches
# an expected value.
#
# Each input is wrapped in a DbClustering::Models::Vector before being handed
# to @pearson_correlation (set up in the `before` hook of this spec).
#
# @param object1 [Object] first raw input, given to Vector.new as `object:`
# @param object2 [Object] second raw input, given to Vector.new as `object:`
# @param distance [Numeric] expected distance, matched within an absolute
#   delta of 0.001
def expect_distance(object1, object2, distance)
  first, second = [object1, object2].map do |raw|
    DbClustering::Models::Vector.new(object: raw)
  end
  actual = @pearson_correlation.distance(first, second)

  expect(actual).to be_within(0.001).of(distance)
end
173
+
174
+ end