db_clustering 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec +1 -0
- data/.travis.yml +5 -0
- data/Gemfile +15 -11
- data/Gemfile.lock +149 -0
- data/LICENSE.txt +1 -1
- data/README.md +90 -0
- data/Rakefile +6 -6
- data/VERSION +1 -0
- data/lib/algorithms/density_based/dbscan.rb +48 -0
- data/lib/datasource_adapters/active_record.rb +32 -0
- data/lib/datasource_adapters/in_memory.rb +29 -0
- data/lib/db_clustering.rb +34 -0
- data/lib/distance_metrics/average_difference.rb +28 -0
- data/lib/distance_metrics/cosine_similarity.rb +43 -0
- data/lib/distance_metrics/euclidean_distance.rb +32 -0
- data/lib/distance_metrics/pearson_correlation.rb +44 -0
- data/lib/generators/datasource/active_record.rb +0 -0
- data/lib/models/cluster.rb +18 -0
- data/lib/models/point.rb +41 -0
- data/lib/models/vector.rb +30 -0
- data/spec/algorithms/density_based/dbscan_spec.rb +57 -0
- data/spec/datasource_adapters/active_record_spec.rb +0 -0
- data/spec/datasource_adapters/in_memory_spec.rb +82 -0
- data/spec/distance_metrics/average_difference_spec.rb +44 -0
- data/spec/distance_metrics/cosine_similarity_spec.rb +172 -0
- data/spec/distance_metrics/euclidean_distance_spec.rb +137 -0
- data/spec/distance_metrics/pearson_correlation_spec.rb +174 -0
- data/spec/generators/datasource/active_record_spec.rb +0 -0
- data/spec/models/cluster_spec.rb +0 -0
- data/spec/models/point_spec.rb +0 -0
- data/spec/models/vector_spec.rb +0 -0
- data/spec/spec_helper.rb +7 -2
- data/spec/support/dataset_helper.rb +19 -0
- data/spec/support/test_model.rb +9 -0
- metadata +31 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ddf1c0c19cc0a107f1727d260d1101a2b4bf49f6
|
4
|
+
data.tar.gz: f6ab2b977b9759aaa69d215240c12df3f1fc9426
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 706097cbe232cef96549c0e078f72b60b224d194ad5249ff5a95c6dcb44a6213fb0a5df8742f2e8e65d58aee2385ddd7412a2ef2188e1f1715dfb9c5167d2a92
|
7
|
+
data.tar.gz: 6eef6d390e20f4e9d762060dca6d4f87194ef0cc5f3c4b59180d06558af9f00994a7d622930a82273cef258b6be500412cd760156a6036127ba8f67a0a30b749
|
data/.rspec
CHANGED
data/.travis.yml
ADDED
data/Gemfile
CHANGED
@@ -1,14 +1,18 @@
|
|
1
|
-
source
|
2
|
-
# Add dependencies required to use your gem here.
|
3
|
-
# Example:
|
4
|
-
# gem "activesupport", ">= 2.3.5"
|
1
|
+
source 'http://rubygems.org'
|
5
2
|
|
6
|
-
#
|
7
|
-
|
3
|
+
# Dependencies required to use the gem
|
4
|
+
gem 'activerecord', '~> 4.2', '>= 4.2.1'
|
5
|
+
|
6
|
+
# Dependencies to develop the gem
|
8
7
|
group :development do
|
9
|
-
gem
|
10
|
-
gem
|
11
|
-
gem
|
12
|
-
gem
|
13
|
-
gem
|
8
|
+
gem 'rspec', '~> 3.0', '>= 3.2.0'
|
9
|
+
gem 'rdoc', '~> 4.0', '>= 4.2.0'
|
10
|
+
gem 'bundler', '~> 1.0', '>= 1.10.2'
|
11
|
+
gem 'jeweler', '~> 2.0', '>= 2.0.1'
|
12
|
+
gem 'simplecov', '>= 0'
|
13
|
+
gem 'guard-rspec', '~> 4.0', '>= 4.5.1'
|
14
|
+
gem 'byebug', '~> 5.0'
|
15
|
+
gem 'awesome_print', '~> 1.6', '>= 1.6.1', require: 'awesome_print'
|
16
|
+
gem 'fuubar', '~> 2.0', '>= 2.0.0.rc1'
|
17
|
+
gem 'simple-random', '~> 1.0', '>= 1.0.0'
|
14
18
|
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
activemodel (4.2.3)
|
5
|
+
activesupport (= 4.2.3)
|
6
|
+
builder (~> 3.1)
|
7
|
+
activerecord (4.2.3)
|
8
|
+
activemodel (= 4.2.3)
|
9
|
+
activesupport (= 4.2.3)
|
10
|
+
arel (~> 6.0)
|
11
|
+
activesupport (4.2.3)
|
12
|
+
i18n (~> 0.7)
|
13
|
+
json (~> 1.7, >= 1.7.7)
|
14
|
+
minitest (~> 5.1)
|
15
|
+
thread_safe (~> 0.3, >= 0.3.4)
|
16
|
+
tzinfo (~> 1.1)
|
17
|
+
addressable (2.3.8)
|
18
|
+
arel (6.0.1)
|
19
|
+
awesome_print (1.6.1)
|
20
|
+
builder (3.2.2)
|
21
|
+
byebug (5.0.0)
|
22
|
+
columnize (= 0.9.0)
|
23
|
+
coderay (1.1.0)
|
24
|
+
columnize (0.9.0)
|
25
|
+
descendants_tracker (0.0.4)
|
26
|
+
thread_safe (~> 0.3, >= 0.3.1)
|
27
|
+
diff-lcs (1.2.5)
|
28
|
+
docile (1.1.5)
|
29
|
+
faraday (0.9.1)
|
30
|
+
multipart-post (>= 1.2, < 3)
|
31
|
+
ffi (1.9.10)
|
32
|
+
formatador (0.2.5)
|
33
|
+
fuubar (2.0.0)
|
34
|
+
rspec (~> 3.0)
|
35
|
+
ruby-progressbar (~> 1.4)
|
36
|
+
git (1.2.9.1)
|
37
|
+
github_api (0.12.3)
|
38
|
+
addressable (~> 2.3)
|
39
|
+
descendants_tracker (~> 0.0.4)
|
40
|
+
faraday (~> 0.8, < 0.10)
|
41
|
+
hashie (>= 3.3)
|
42
|
+
multi_json (>= 1.7.5, < 2.0)
|
43
|
+
nokogiri (~> 1.6.3)
|
44
|
+
oauth2
|
45
|
+
guard (2.12.8)
|
46
|
+
formatador (>= 0.2.4)
|
47
|
+
listen (>= 2.7, <= 4.0)
|
48
|
+
lumberjack (~> 1.0)
|
49
|
+
nenv (~> 0.1)
|
50
|
+
notiffany (~> 0.0)
|
51
|
+
pry (>= 0.9.12)
|
52
|
+
shellany (~> 0.0)
|
53
|
+
thor (>= 0.18.1)
|
54
|
+
guard-compat (1.2.1)
|
55
|
+
guard-rspec (4.6.1)
|
56
|
+
guard (~> 2.1)
|
57
|
+
guard-compat (~> 1.1)
|
58
|
+
rspec (>= 2.99.0, < 4.0)
|
59
|
+
hashie (3.4.2)
|
60
|
+
highline (1.7.2)
|
61
|
+
i18n (0.7.0)
|
62
|
+
jeweler (2.0.1)
|
63
|
+
builder
|
64
|
+
bundler (>= 1.0)
|
65
|
+
git (>= 1.2.5)
|
66
|
+
github_api
|
67
|
+
highline (>= 1.6.15)
|
68
|
+
nokogiri (>= 1.5.10)
|
69
|
+
rake
|
70
|
+
rdoc
|
71
|
+
json (1.8.3)
|
72
|
+
jwt (1.5.1)
|
73
|
+
listen (3.0.2)
|
74
|
+
rb-fsevent (>= 0.9.3)
|
75
|
+
rb-inotify (>= 0.9)
|
76
|
+
lumberjack (1.0.9)
|
77
|
+
method_source (0.8.2)
|
78
|
+
mini_portile (0.6.2)
|
79
|
+
minitest (5.7.0)
|
80
|
+
multi_json (1.11.2)
|
81
|
+
multi_xml (0.5.5)
|
82
|
+
multipart-post (2.0.0)
|
83
|
+
nenv (0.2.0)
|
84
|
+
nokogiri (1.6.6.2)
|
85
|
+
mini_portile (~> 0.6.0)
|
86
|
+
notiffany (0.0.6)
|
87
|
+
nenv (~> 0.1)
|
88
|
+
shellany (~> 0.0)
|
89
|
+
oauth2 (1.0.0)
|
90
|
+
faraday (>= 0.8, < 0.10)
|
91
|
+
jwt (~> 1.0)
|
92
|
+
multi_json (~> 1.3)
|
93
|
+
multi_xml (~> 0.5)
|
94
|
+
rack (~> 1.2)
|
95
|
+
pry (0.10.1)
|
96
|
+
coderay (~> 1.1.0)
|
97
|
+
method_source (~> 0.8.1)
|
98
|
+
slop (~> 3.4)
|
99
|
+
rack (1.6.4)
|
100
|
+
rake (10.4.2)
|
101
|
+
rb-fsevent (0.9.5)
|
102
|
+
rb-inotify (0.9.5)
|
103
|
+
ffi (>= 0.5.0)
|
104
|
+
rdoc (4.2.0)
|
105
|
+
rspec (3.3.0)
|
106
|
+
rspec-core (~> 3.3.0)
|
107
|
+
rspec-expectations (~> 3.3.0)
|
108
|
+
rspec-mocks (~> 3.3.0)
|
109
|
+
rspec-core (3.3.1)
|
110
|
+
rspec-support (~> 3.3.0)
|
111
|
+
rspec-expectations (3.3.0)
|
112
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
113
|
+
rspec-support (~> 3.3.0)
|
114
|
+
rspec-mocks (3.3.1)
|
115
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
116
|
+
rspec-support (~> 3.3.0)
|
117
|
+
rspec-support (3.3.0)
|
118
|
+
ruby-progressbar (1.7.5)
|
119
|
+
shellany (0.0.1)
|
120
|
+
simple-random (1.0.0)
|
121
|
+
simplecov (0.10.0)
|
122
|
+
docile (~> 1.1.0)
|
123
|
+
json (~> 1.8)
|
124
|
+
simplecov-html (~> 0.10.0)
|
125
|
+
simplecov-html (0.10.0)
|
126
|
+
slop (3.6.0)
|
127
|
+
thor (0.19.1)
|
128
|
+
thread_safe (0.3.5)
|
129
|
+
tzinfo (1.2.2)
|
130
|
+
thread_safe (~> 0.1)
|
131
|
+
|
132
|
+
PLATFORMS
|
133
|
+
ruby
|
134
|
+
|
135
|
+
DEPENDENCIES
|
136
|
+
activerecord (~> 4.2, >= 4.2.1)
|
137
|
+
awesome_print (~> 1.6, >= 1.6.1)
|
138
|
+
bundler (~> 1.0, >= 1.10.2)
|
139
|
+
byebug (~> 5.0)
|
140
|
+
fuubar (~> 2.0, >= 2.0.0.rc1)
|
141
|
+
guard-rspec (~> 4.0, >= 4.5.1)
|
142
|
+
jeweler (~> 2.0, >= 2.0.1)
|
143
|
+
rdoc (~> 4.0, >= 4.2.0)
|
144
|
+
rspec (~> 3.0, >= 3.2.0)
|
145
|
+
simple-random (~> 1.0, >= 1.0.0)
|
146
|
+
simplecov
|
147
|
+
|
148
|
+
BUNDLED WITH
|
149
|
+
1.10.5
|
data/LICENSE.txt
CHANGED
data/README.md
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
# DBClustering [Build Status](https://travis-ci.org/Dschee/db_clustering)
|
2
|
+
|
3
|
+
Please note that this gem is still in its very early stages and should not be considered stable.
|
4
|
+
Also, it currently supports only the in-memory datasource adapter. An ActiveRecord adapter is planned for a future version but is not yet implemented. Stay tuned.
|
5
|
+
|
6
|
+
## Requirements
|
7
|
+
|
8
|
+
Ruby 2.1+ is required; earlier Rubies may work but are not officially supported.
|
9
|
+
|
10
|
+
## Getting Started
|
11
|
+
|
12
|
+
This gem was developed to work best in Ruby on Rails projects.
|
13
|
+
|
14
|
+
1. Add this gem to your Gemfile
|
15
|
+
|
16
|
+
gem 'db_clustering'
|
17
|
+
|
18
|
+
2. Run `bundle install` in your terminal
|
19
|
+
|
20
|
+
3. Implement the `clustering_vector` method in your model class and return either:
|
21
|
+
- an **array** with numeric values for similarity comparison
|
22
|
+
- a **hash** with numeric values for similarity comparison between keys existing in both hashes
|
23
|
+
|
24
|
+
See `TestModel` class within the `spec/support` directory for a very simple example.
|
25
|
+
|
26
|
+
4. Choose a *distance metric* and initialize it, e.g.:
|
27
|
+
|
28
|
+
``` ruby
|
29
|
+
average_difference = DbClustering::DistanceMetrics::AverageDifference.new
|
30
|
+
|
31
|
+
# Instead you can also use one of the following:
|
32
|
+
cosine_similarity = DbClustering::DistanceMetrics::CosineSimilarity.new
|
33
|
+
euclidean_distance = DbClustering::DistanceMetrics::EuclideanDistance.new
|
34
|
+
pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new
|
35
|
+
```
|
36
|
+
|
37
|
+
5. Choose a datasource adapter (currently only the in-memory datasource is available), e.g.:
|
38
|
+
|
39
|
+
``` ruby
|
40
|
+
in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array)
|
41
|
+
```
|
42
|
+
|
43
|
+
Please note that `your_array` should be an array filled with objects of the class type that implements the `clustering_vector` method from step 3.
|
44
|
+
|
45
|
+
An **ActiveRecord datasource** type is planned but not yet implemented. Please stay tuned.
|
46
|
+
|
47
|
+
6. Choose an **algorithm** and initialize it:
|
48
|
+
|
49
|
+
``` ruby
|
50
|
+
dbscan = DbClustering::Algorithms::Dbscan.new(datasource: in_memory_datasource, distance_metric: average_difference)
|
51
|
+
```
|
52
|
+
Please note that currently **only one algorithm is available**. More algorithms aren't currently planned but may be added if needed. Contributions are welcome, of course.
|
53
|
+
|
54
|
+
7. Choose the **algorithm parameters** and start the process of clustering your data:
|
55
|
+
|
56
|
+
``` ruby
|
57
|
+
dbscan.cluster(max_distance: 10, min_neighbors: 5)
|
58
|
+
```
|
59
|
+
The `max_distance` is the epsilon parameter and the `min_neighbors` the minPts parameter from the usual DBSCAN algorithm documentation (e.g. Wikipedia). You might want to try different values here first before settling on the right values for your purpose.
|
60
|
+
|
61
|
+
Please also note that the `max_distance` value is **highly dependent on the type of metric** you decided to go for. For the `AverageDifference` and `EuclideanDistance` metrics it can be an **open-ended positive value**. For the `CosineSimilarity` and `PearsonCorrelation` types it needs to be a value between 0 and 2 where a value of `0` means "100% positive correlation/similarity", a value of `1` means "no correlation/similarity at all" and a value of `2` means "100% negative correlation/similarity". You can use any decimal value in between (e.g. 0.25) as a partly positive/negative correlation.
|
62
|
+
|
63
|
+
8. Wait for the calculations to finish and use the results the way you want:
|
64
|
+
|
65
|
+
``` ruby
|
66
|
+
clusters = dbscan.clusters # the resulting Clusters, each cluster contains Points
|
67
|
+
first_cluster = clusters.first
|
68
|
+
point = first_cluster.points.first
|
69
|
+
# a point knows its cluster, and its position in there
|
70
|
+
point.cluster # will return the same object as `first_cluster`
|
71
|
+
point.is_edge_point? # boolean specifying if it's an edge point of its cluster
|
72
|
+
point.is_core_point? # boolean specifying if it's a core point of its cluster
|
73
|
+
point.is_noise_point? # boolean specifying if it's a noise point without a cluster
|
74
|
+
|
75
|
+
# a point also contains the source object specifying the `clustering_vector` method
|
76
|
+
your_model = point.datasource_point
|
77
|
+
```
|
78
|
+
|
79
|
+
For more details, please have a look at the underlying models in the `lib/models` directory as well as the corresponding specs.
|
80
|
+
|
81
|
+
That's it. It **looks more complicated than it actually is**, so just try it out! You can find complete usage examples within the `spec/algorithms/density_based/dbscan_spec.rb` file.
|
82
|
+
|
83
|
+
## Contributing
|
84
|
+
|
85
|
+
Contributions are welcome. Please fork this project, make your changes and file a pull request. Please also make sure to write tests to ensure your changes persist over time.
|
86
|
+
|
87
|
+
|
88
|
+
## License
|
89
|
+
|
90
|
+
This gem is released under the [MIT License](http://www.opensource.org/licenses/MIT).
|
data/Rakefile
CHANGED
@@ -14,13 +14,13 @@ require 'rake'
|
|
14
14
|
require 'jeweler'
|
15
15
|
Jeweler::Tasks.new do |gem|
|
16
16
|
# gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
|
17
|
-
gem.name = "
|
18
|
-
gem.homepage = "http://github.com/Dschee/
|
17
|
+
gem.name = "db_clustering"
|
18
|
+
gem.homepage = "http://github.com/Dschee/db_clustering"
|
19
19
|
gem.license = "MIT"
|
20
|
-
gem.summary = %Q{
|
21
|
-
gem.description = %Q{
|
20
|
+
gem.summary = %Q{Big-Data clustering algorithms in Ruby (on Rails).}
|
21
|
+
gem.description = %Q{Big-Data clustering algorithms in Ruby (on Rails). Currently only supports DBSCAN algorithm and ActiveRecord database.}
|
22
22
|
gem.email = "CihatGuenduez@posteo.de"
|
23
|
-
gem.authors = ["Cihat
|
23
|
+
gem.authors = ["Cihat Gündüz"]
|
24
24
|
# dependencies defined in Gemfile
|
25
25
|
end
|
26
26
|
Jeweler::RubygemsDotOrgTasks.new
|
@@ -44,7 +44,7 @@ Rake::RDocTask.new do |rdoc|
|
|
44
44
|
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
45
45
|
|
46
46
|
rdoc.rdoc_dir = 'rdoc'
|
47
|
-
rdoc.title = "
|
47
|
+
rdoc.title = "db_clustering #{version}"
|
48
48
|
rdoc.rdoc_files.include('README*')
|
49
49
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
50
50
|
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.1
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# For more information see https://en.wikipedia.org/wiki/DBSCAN
|
2
|
+
|
3
|
+
module DbClustering
|
4
|
+
module Algorithms
|
5
|
+
class Dbscan
|
6
|
+
|
7
|
+
attr_accessor :datasource, :clusters
|
8
|
+
|
9
|
+
def initialize(datasource:, distance_metric:)
|
10
|
+
@datasource = datasource
|
11
|
+
@distance_metric = distance_metric
|
12
|
+
@clusters = []
|
13
|
+
end
|
14
|
+
|
15
|
+
def cluster(max_distance:, min_neighbors:)
|
16
|
+
@clusters = []
|
17
|
+
cluster = nil
|
18
|
+
|
19
|
+
@datasource.iterate_all_points do |point|
|
20
|
+
neighbors = @datasource.neighbors(point: point, distance_metric: @distance_metric, max_distance: max_distance)
|
21
|
+
|
22
|
+
if neighbors.count < min_neighbors
|
23
|
+
point.is_noise = true
|
24
|
+
else
|
25
|
+
if point.cluster.nil?
|
26
|
+
cluster = DbClustering::Models::Cluster.new
|
27
|
+
@clusters << cluster
|
28
|
+
else
|
29
|
+
cluster = point.cluster
|
30
|
+
end
|
31
|
+
expand_cluster(cluster: cluster, neighbors: neighbors)
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def expand_cluster(cluster:, neighbors:)
|
38
|
+
# Important: If neighbors do not include point itself, then point must be added to cluster, too.
|
39
|
+
neighbors.each do |neighbor|
|
40
|
+
if !neighbor.visited?
|
41
|
+
cluster.add(neighbor)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module DbClustering
|
2
|
+
module DatasourceAdapters
|
3
|
+
class ActiveRecord
|
4
|
+
|
5
|
+
def initialize(relation:)
|
6
|
+
@relation = relation
|
7
|
+
end
|
8
|
+
|
9
|
+
def iterate_all_points
|
10
|
+
@relation.find_each do |datasource_point|
|
11
|
+
point = DbClustering::Models::Point.new(datasource_point)
|
12
|
+
yield(point)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def neighbors(point:, distance_metric:, max_distance:)
|
17
|
+
neighbors = []
|
18
|
+
|
19
|
+
@relation.find_each do |neighbor_candidate|
|
20
|
+
candidate_point = DbClustering::Models::Point.new(neighbor_candidate)
|
21
|
+
|
22
|
+
if distance_metric.distance(point.vector, candidate_point.vector) <= max_distance
|
23
|
+
neighbors << candidate_point
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
neighbors
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module DbClustering
|
2
|
+
module DatasourceAdapters
|
3
|
+
class InMemory
|
4
|
+
|
5
|
+
def initialize(array:)
|
6
|
+
@array = array.map{ |datasource_point| DbClustering::Models::Point.new(datasource_point) }
|
7
|
+
end
|
8
|
+
|
9
|
+
def iterate_all_points
|
10
|
+
@array.each do |point|
|
11
|
+
yield(point)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
def neighbors(point:, distance_metric:, max_distance:)
|
16
|
+
neighbors = []
|
17
|
+
|
18
|
+
@array.each do |neighbor_candidate|
|
19
|
+
if distance_metric.distance(point.vector, neighbor_candidate.vector) <= max_distance
|
20
|
+
neighbors << neighbor_candidate
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
neighbors
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|