db_clustering 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rspec +1 -0
- data/.travis.yml +5 -0
- data/Gemfile +15 -11
- data/Gemfile.lock +149 -0
- data/LICENSE.txt +1 -1
- data/README.md +90 -0
- data/Rakefile +6 -6
- data/VERSION +1 -0
- data/lib/algorithms/density_based/dbscan.rb +48 -0
- data/lib/datasource_adapters/active_record.rb +32 -0
- data/lib/datasource_adapters/in_memory.rb +29 -0
- data/lib/db_clustering.rb +34 -0
- data/lib/distance_metrics/average_difference.rb +28 -0
- data/lib/distance_metrics/cosine_similarity.rb +43 -0
- data/lib/distance_metrics/euclidean_distance.rb +32 -0
- data/lib/distance_metrics/pearson_correlation.rb +44 -0
- data/lib/generators/datasource/active_record.rb +0 -0
- data/lib/models/cluster.rb +18 -0
- data/lib/models/point.rb +41 -0
- data/lib/models/vector.rb +30 -0
- data/spec/algorithms/density_based/dbscan_spec.rb +57 -0
- data/spec/datasource_adapters/active_record_spec.rb +0 -0
- data/spec/datasource_adapters/in_memory_spec.rb +82 -0
- data/spec/distance_metrics/average_difference_spec.rb +44 -0
- data/spec/distance_metrics/cosine_similarity_spec.rb +172 -0
- data/spec/distance_metrics/euclidean_distance_spec.rb +137 -0
- data/spec/distance_metrics/pearson_correlation_spec.rb +174 -0
- data/spec/generators/datasource/active_record_spec.rb +0 -0
- data/spec/models/cluster_spec.rb +0 -0
- data/spec/models/point_spec.rb +0 -0
- data/spec/models/vector_spec.rb +0 -0
- data/spec/spec_helper.rb +7 -2
- data/spec/support/dataset_helper.rb +19 -0
- data/spec/support/test_model.rb +9 -0
- metadata +31 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ddf1c0c19cc0a107f1727d260d1101a2b4bf49f6
|
4
|
+
data.tar.gz: f6ab2b977b9759aaa69d215240c12df3f1fc9426
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 706097cbe232cef96549c0e078f72b60b224d194ad5249ff5a95c6dcb44a6213fb0a5df8742f2e8e65d58aee2385ddd7412a2ef2188e1f1715dfb9c5167d2a92
|
7
|
+
data.tar.gz: 6eef6d390e20f4e9d762060dca6d4f87194ef0cc5f3c4b59180d06558af9f00994a7d622930a82273cef258b6be500412cd760156a6036127ba8f67a0a30b749
|
data/.rspec
CHANGED
data/.travis.yml
ADDED
data/Gemfile
CHANGED
@@ -1,14 +1,18 @@
|
|
1
|
-
source
|
2
|
-
# Add dependencies required to use your gem here.
|
3
|
-
# Example:
|
4
|
-
# gem "activesupport", ">= 2.3.5"
|
1
|
+
source 'http://rubygems.org'
|
5
2
|
|
6
|
-
#
|
7
|
-
|
3
|
+
# Dependencies required to use the gem
|
4
|
+
gem 'activerecord', '~> 4.2', '>= 4.2.1'
|
5
|
+
|
6
|
+
# Dependencies to develop the gem
|
8
7
|
group :development do
|
9
|
-
gem
|
10
|
-
gem
|
11
|
-
gem
|
12
|
-
gem
|
13
|
-
gem
|
8
|
+
gem 'rspec', '~> 3.0', '>= 3.2.0'
|
9
|
+
gem 'rdoc', '~> 4.0', '>= 4.2.0'
|
10
|
+
gem 'bundler', '~> 1.0', '>= 1.10.2'
|
11
|
+
gem 'jeweler', '~> 2.0', '>= 2.0.1'
|
12
|
+
gem 'simplecov', '>= 0'
|
13
|
+
gem 'guard-rspec', '~> 4.0', '>= 4.5.1'
|
14
|
+
gem 'byebug', '~> 5.0'
|
15
|
+
gem 'awesome_print', '~> 1.6', '>= 1.6.1', require: 'awesome_print'
|
16
|
+
gem 'fuubar', '~> 2.0', '>= 2.0.0.rc1'
|
17
|
+
gem 'simple-random', '~> 1.0', '>= 1.0.0'
|
14
18
|
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
activemodel (4.2.3)
|
5
|
+
activesupport (= 4.2.3)
|
6
|
+
builder (~> 3.1)
|
7
|
+
activerecord (4.2.3)
|
8
|
+
activemodel (= 4.2.3)
|
9
|
+
activesupport (= 4.2.3)
|
10
|
+
arel (~> 6.0)
|
11
|
+
activesupport (4.2.3)
|
12
|
+
i18n (~> 0.7)
|
13
|
+
json (~> 1.7, >= 1.7.7)
|
14
|
+
minitest (~> 5.1)
|
15
|
+
thread_safe (~> 0.3, >= 0.3.4)
|
16
|
+
tzinfo (~> 1.1)
|
17
|
+
addressable (2.3.8)
|
18
|
+
arel (6.0.1)
|
19
|
+
awesome_print (1.6.1)
|
20
|
+
builder (3.2.2)
|
21
|
+
byebug (5.0.0)
|
22
|
+
columnize (= 0.9.0)
|
23
|
+
coderay (1.1.0)
|
24
|
+
columnize (0.9.0)
|
25
|
+
descendants_tracker (0.0.4)
|
26
|
+
thread_safe (~> 0.3, >= 0.3.1)
|
27
|
+
diff-lcs (1.2.5)
|
28
|
+
docile (1.1.5)
|
29
|
+
faraday (0.9.1)
|
30
|
+
multipart-post (>= 1.2, < 3)
|
31
|
+
ffi (1.9.10)
|
32
|
+
formatador (0.2.5)
|
33
|
+
fuubar (2.0.0)
|
34
|
+
rspec (~> 3.0)
|
35
|
+
ruby-progressbar (~> 1.4)
|
36
|
+
git (1.2.9.1)
|
37
|
+
github_api (0.12.3)
|
38
|
+
addressable (~> 2.3)
|
39
|
+
descendants_tracker (~> 0.0.4)
|
40
|
+
faraday (~> 0.8, < 0.10)
|
41
|
+
hashie (>= 3.3)
|
42
|
+
multi_json (>= 1.7.5, < 2.0)
|
43
|
+
nokogiri (~> 1.6.3)
|
44
|
+
oauth2
|
45
|
+
guard (2.12.8)
|
46
|
+
formatador (>= 0.2.4)
|
47
|
+
listen (>= 2.7, <= 4.0)
|
48
|
+
lumberjack (~> 1.0)
|
49
|
+
nenv (~> 0.1)
|
50
|
+
notiffany (~> 0.0)
|
51
|
+
pry (>= 0.9.12)
|
52
|
+
shellany (~> 0.0)
|
53
|
+
thor (>= 0.18.1)
|
54
|
+
guard-compat (1.2.1)
|
55
|
+
guard-rspec (4.6.1)
|
56
|
+
guard (~> 2.1)
|
57
|
+
guard-compat (~> 1.1)
|
58
|
+
rspec (>= 2.99.0, < 4.0)
|
59
|
+
hashie (3.4.2)
|
60
|
+
highline (1.7.2)
|
61
|
+
i18n (0.7.0)
|
62
|
+
jeweler (2.0.1)
|
63
|
+
builder
|
64
|
+
bundler (>= 1.0)
|
65
|
+
git (>= 1.2.5)
|
66
|
+
github_api
|
67
|
+
highline (>= 1.6.15)
|
68
|
+
nokogiri (>= 1.5.10)
|
69
|
+
rake
|
70
|
+
rdoc
|
71
|
+
json (1.8.3)
|
72
|
+
jwt (1.5.1)
|
73
|
+
listen (3.0.2)
|
74
|
+
rb-fsevent (>= 0.9.3)
|
75
|
+
rb-inotify (>= 0.9)
|
76
|
+
lumberjack (1.0.9)
|
77
|
+
method_source (0.8.2)
|
78
|
+
mini_portile (0.6.2)
|
79
|
+
minitest (5.7.0)
|
80
|
+
multi_json (1.11.2)
|
81
|
+
multi_xml (0.5.5)
|
82
|
+
multipart-post (2.0.0)
|
83
|
+
nenv (0.2.0)
|
84
|
+
nokogiri (1.6.6.2)
|
85
|
+
mini_portile (~> 0.6.0)
|
86
|
+
notiffany (0.0.6)
|
87
|
+
nenv (~> 0.1)
|
88
|
+
shellany (~> 0.0)
|
89
|
+
oauth2 (1.0.0)
|
90
|
+
faraday (>= 0.8, < 0.10)
|
91
|
+
jwt (~> 1.0)
|
92
|
+
multi_json (~> 1.3)
|
93
|
+
multi_xml (~> 0.5)
|
94
|
+
rack (~> 1.2)
|
95
|
+
pry (0.10.1)
|
96
|
+
coderay (~> 1.1.0)
|
97
|
+
method_source (~> 0.8.1)
|
98
|
+
slop (~> 3.4)
|
99
|
+
rack (1.6.4)
|
100
|
+
rake (10.4.2)
|
101
|
+
rb-fsevent (0.9.5)
|
102
|
+
rb-inotify (0.9.5)
|
103
|
+
ffi (>= 0.5.0)
|
104
|
+
rdoc (4.2.0)
|
105
|
+
rspec (3.3.0)
|
106
|
+
rspec-core (~> 3.3.0)
|
107
|
+
rspec-expectations (~> 3.3.0)
|
108
|
+
rspec-mocks (~> 3.3.0)
|
109
|
+
rspec-core (3.3.1)
|
110
|
+
rspec-support (~> 3.3.0)
|
111
|
+
rspec-expectations (3.3.0)
|
112
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
113
|
+
rspec-support (~> 3.3.0)
|
114
|
+
rspec-mocks (3.3.1)
|
115
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
116
|
+
rspec-support (~> 3.3.0)
|
117
|
+
rspec-support (3.3.0)
|
118
|
+
ruby-progressbar (1.7.5)
|
119
|
+
shellany (0.0.1)
|
120
|
+
simple-random (1.0.0)
|
121
|
+
simplecov (0.10.0)
|
122
|
+
docile (~> 1.1.0)
|
123
|
+
json (~> 1.8)
|
124
|
+
simplecov-html (~> 0.10.0)
|
125
|
+
simplecov-html (0.10.0)
|
126
|
+
slop (3.6.0)
|
127
|
+
thor (0.19.1)
|
128
|
+
thread_safe (0.3.5)
|
129
|
+
tzinfo (1.2.2)
|
130
|
+
thread_safe (~> 0.1)
|
131
|
+
|
132
|
+
PLATFORMS
|
133
|
+
ruby
|
134
|
+
|
135
|
+
DEPENDENCIES
|
136
|
+
activerecord (~> 4.2, >= 4.2.1)
|
137
|
+
awesome_print (~> 1.6, >= 1.6.1)
|
138
|
+
bundler (~> 1.0, >= 1.10.2)
|
139
|
+
byebug (~> 5.0)
|
140
|
+
fuubar (~> 2.0, >= 2.0.0.rc1)
|
141
|
+
guard-rspec (~> 4.0, >= 4.5.1)
|
142
|
+
jeweler (~> 2.0, >= 2.0.1)
|
143
|
+
rdoc (~> 4.0, >= 4.2.0)
|
144
|
+
rspec (~> 3.0, >= 3.2.0)
|
145
|
+
simple-random (~> 1.0, >= 1.0.0)
|
146
|
+
simplecov
|
147
|
+
|
148
|
+
BUNDLED WITH
|
149
|
+
1.10.5
|
data/LICENSE.txt
CHANGED
data/README.md
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
# DBClustering [![Build Status](https://travis-ci.org/Dschee/db_clustering.svg?branch=develop)](https://travis-ci.org/Dschee/db_clustering)
|
2
|
+
|
3
|
+
Please note that this gem is still in its very early stages and should not be considered stable.
|
4
|
+
Also it currently only supports the in-memory datasource adapter. In future versions an ActiveRecord adapter is planned but this is not yet implemented. Stay tuned.
|
5
|
+
|
6
|
+
## Requirements
|
7
|
+
|
8
|
+
Ruby 2.1+ is required, earlier Rubies may work but are not officially supported.
|
9
|
+
|
10
|
+
## Getting Started
|
11
|
+
|
12
|
+
This gem was developed to work best in Ruby on Rails projects.
|
13
|
+
|
14
|
+
1. Add this gem to your Gemfile
|
15
|
+
|
16
|
+
gem 'db_clustering'
|
17
|
+
|
18
|
+
2. Run `bundle install` in your terminal
|
19
|
+
|
20
|
+
3. Implement the `clustering_vector` method in your model class and return either:
|
21
|
+
- an **array** with numeric values for similarity comparison
|
22
|
+
- a **hash** with numeric values for similarity comparison between keys existing in both hashes
|
23
|
+
|
24
|
+
See `TestModel` class within the `spec/support` directory for a very simple example.
|
25
|
+
|
26
|
+
4. Decide on a *distance metric* and initialize it, e.g.:
|
27
|
+
|
28
|
+
``` ruby
|
29
|
+
average_difference = DbClustering::DistanceMetrics::AverageDifference.new
|
30
|
+
|
31
|
+
# Instead you can also use one of the following:
|
32
|
+
cosine_similarity = DbClustering::DistanceMetrics::CosineSimilarity.new
|
33
|
+
euclidean_distance = DbClustering::DistanceMetrics::EuclideanDistance.new
|
34
|
+
pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new
|
35
|
+
```
|
36
|
+
|
37
|
+
5. Decide on a datasource adapter (currently only the in-memory datasource is available), e.g.:
|
38
|
+
|
39
|
+
``` ruby
|
40
|
+
in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array)
|
41
|
+
```
|
42
|
+
|
43
|
+
Please note that `your_array` should be an array filled with objects of the class type that implements the `clustering_vector` method from step 3.
|
44
|
+
|
45
|
+
An **ActiveRecord datasource** type is planned but not yet implemented. Please stay tuned.
|
46
|
+
|
47
|
+
6. Decide on an **algorithm** and initialize it:
|
48
|
+
|
49
|
+
``` ruby
|
50
|
+
dbscan = DbClustering::Algorithms::Dbscan.new(datasource: in_memory_datasource, distance_metric: average_difference)
|
51
|
+
```
|
52
|
+
Please note that currently **only one algorithm is available**. More algorithms aren't currently planned but may be added if needed. Contributions are welcome, of course.
|
53
|
+
|
54
|
+
7. Choose the **algorithm parameters** and start the process of clustering your data:
|
55
|
+
|
56
|
+
``` ruby
|
57
|
+
dbscan.cluster(max_distance: 10, min_neighbors: 5)
|
58
|
+
```
|
59
|
+
The `max_distance` is the epsilon parameter and the `min_neighbors` the minPts parameter from the usual DBSCAN algorithm documentation (e.g. Wikipedia). You might want to try different values here first before you settle on the right values for your purpose.
|
60
|
+
|
61
|
+
Please also take note that the `max_distance` value is **highly dependent on the type of metric** you decided to go for. For the `AverageDifference` and `EuclideanDistance` metrics it can be an **open-ended positive value**. For the `CosineSimilarity` and `PearsonCorrelation` types it needs to be a value between 0 and 2 where a value of `0` means "100% positive correlation/similarity", a value of `1` means "no correlation/similarity at all" and a value of `2` means "100% negative correlation/similarity". You can use any decimal value in between (e.g. 0.25) as a partly positive/negative correlation.
|
62
|
+
|
63
|
+
8. Wait for the calculations to finish and use the results the way you want:
|
64
|
+
|
65
|
+
``` ruby
|
66
|
+
clusters = dbscan.clusters # the resulting Clusters, each cluster contains Points
|
67
|
+
first_cluster = clusters.first
|
68
|
+
point = first_cluster.points.first
|
69
|
+
# a point knows its cluster, and its position in there
|
70
|
+
point.cluster # will return the same object as `first_cluster`
|
71
|
+
point.is_edge_point? # boolean specifying if it's an edge point of its cluster
|
72
|
+
point.is_core_point? # boolean specifying if it's a core point of its cluster
|
73
|
+
point.is_noise_point? # boolean specifying if it's a noise point without a cluster
|
74
|
+
|
75
|
+
# a point also contains the source object specifying the `clustering_vector` method
|
76
|
+
your_model = point.datasource_point
|
77
|
+
```
|
78
|
+
|
79
|
+
For more please don't hesitate to have a look into the underlying models under the `lib/models` directory as well as the corresponding specs.
|
80
|
+
|
81
|
+
That's it, it **looks more complicated than it actually is**, just try it out! You can find complete usage examples within the `spec/algorithms/density_based/dbscan_spec.rb` file.
|
82
|
+
|
83
|
+
## Contributing
|
84
|
+
|
85
|
+
Contributions are welcome. Please fork this project, make your changes and file a pull request. Please also make sure to write tests to ensure your changes persist over time.
|
86
|
+
|
87
|
+
|
88
|
+
## License
|
89
|
+
|
90
|
+
This gem is released under the [MIT License](http://www.opensource.org/licenses/MIT).
|
data/Rakefile
CHANGED
@@ -14,13 +14,13 @@ require 'rake'
|
|
14
14
|
require 'jeweler'
|
15
15
|
Jeweler::Tasks.new do |gem|
|
16
16
|
# gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
|
17
|
-
gem.name = "
|
18
|
-
gem.homepage = "http://github.com/Dschee/
|
17
|
+
gem.name = "db_clustering"
|
18
|
+
gem.homepage = "http://github.com/Dschee/db_clustering"
|
19
19
|
gem.license = "MIT"
|
20
|
-
gem.summary = %Q{
|
21
|
-
gem.description = %Q{
|
20
|
+
gem.summary = %Q{Big-Data clustering algorithms in Ruby (on Rails).}
|
21
|
+
gem.description = %Q{Big-Data clustering algorithms in Ruby (on Rails). Currently only supports DBSCAN algorithm and ActiveRecord database.}
|
22
22
|
gem.email = "CihatGuenduez@posteo.de"
|
23
|
-
gem.authors = ["Cihat
|
23
|
+
gem.authors = ["Cihat Gündüz"]
|
24
24
|
# dependencies defined in Gemfile
|
25
25
|
end
|
26
26
|
Jeweler::RubygemsDotOrgTasks.new
|
@@ -44,7 +44,7 @@ Rake::RDocTask.new do |rdoc|
|
|
44
44
|
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
45
45
|
|
46
46
|
rdoc.rdoc_dir = 'rdoc'
|
47
|
-
rdoc.title = "
|
47
|
+
rdoc.title = "db_clustering #{version}"
|
48
48
|
rdoc.rdoc_files.include('README*')
|
49
49
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
50
50
|
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.1
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# For more information see https://en.wikipedia.org/wiki/DBSCAN
|
2
|
+
|
3
|
+
module DbClustering
|
4
|
+
module Algorithms
|
5
|
+
class Dbscan
|
6
|
+
|
7
|
+
attr_accessor :datasource, :clusters
|
8
|
+
|
9
|
+
def initialize(datasource:, distance_metric:)
|
10
|
+
@datasource = datasource
|
11
|
+
@distance_metric = distance_metric
|
12
|
+
@clusters = []
|
13
|
+
end
|
14
|
+
|
15
|
+
def cluster(max_distance:, min_neighbors:)
|
16
|
+
@clusters = []
|
17
|
+
cluster = nil
|
18
|
+
|
19
|
+
@datasource.iterate_all_points do |point|
|
20
|
+
neighbors = @datasource.neighbors(point: point, distance_metric: @distance_metric, max_distance: max_distance)
|
21
|
+
|
22
|
+
if neighbors.count < min_neighbors
|
23
|
+
point.is_noise = true
|
24
|
+
else
|
25
|
+
if point.cluster.nil?
|
26
|
+
cluster = DbClustering::Models::Cluster.new
|
27
|
+
@clusters << cluster
|
28
|
+
else
|
29
|
+
cluster = point.cluster
|
30
|
+
end
|
31
|
+
expand_cluster(cluster: cluster, neighbors: neighbors)
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def expand_cluster(cluster:, neighbors:)
|
38
|
+
# Important: If neighbors do not include point itself, then point must be added to cluster, too.
|
39
|
+
neighbors.each do |neighbor|
|
40
|
+
if !neighbor.visited?
|
41
|
+
cluster.add(neighbor)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module DbClustering
|
2
|
+
module DatasourceAdapters
|
3
|
+
class ActiveRecord
|
4
|
+
|
5
|
+
def initialize(relation:)
|
6
|
+
@relation = relation
|
7
|
+
end
|
8
|
+
|
9
|
+
def iterate_all_points
|
10
|
+
@relation.find_each do |datasource_point|
|
11
|
+
point = DbClustering::Models::Point.new(datasource_point)
|
12
|
+
yield(point)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def neighbors(point:, distance_metric:, max_distance:)
|
17
|
+
neighbors = []
|
18
|
+
|
19
|
+
@relation.find_each do |neighbor_candidate|
|
20
|
+
candidate_point = DbClustering::Models::Point.new(neighbor_candidate)
|
21
|
+
|
22
|
+
if distance_metric.distance(point.vector, candidate_point.vector) <= max_distance
|
23
|
+
neighbors << candidate_point
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
neighbors
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module DbClustering
|
2
|
+
module DatasourceAdapters
|
3
|
+
class InMemory
|
4
|
+
|
5
|
+
def initialize(array:)
|
6
|
+
@array = array.map{ |datasource_point| DbClustering::Models::Point.new(datasource_point) }
|
7
|
+
end
|
8
|
+
|
9
|
+
def iterate_all_points
|
10
|
+
@array.each do |point|
|
11
|
+
yield(point)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
def neighbors(point:, distance_metric:, max_distance:)
|
16
|
+
neighbors = []
|
17
|
+
|
18
|
+
@array.each do |neighbor_candidate|
|
19
|
+
if distance_metric.distance(point.vector, neighbor_candidate.vector) <= max_distance
|
20
|
+
neighbors << neighbor_candidate
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
neighbors
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|