db_clustering 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/.rspec +1 -0
  3. data/.travis.yml +5 -0
  4. data/Gemfile +15 -11
  5. data/Gemfile.lock +149 -0
  6. data/LICENSE.txt +1 -1
  7. data/README.md +90 -0
  8. data/Rakefile +6 -6
  9. data/VERSION +1 -0
  10. data/lib/algorithms/density_based/dbscan.rb +48 -0
  11. data/lib/datasource_adapters/active_record.rb +32 -0
  12. data/lib/datasource_adapters/in_memory.rb +29 -0
  13. data/lib/db_clustering.rb +34 -0
  14. data/lib/distance_metrics/average_difference.rb +28 -0
  15. data/lib/distance_metrics/cosine_similarity.rb +43 -0
  16. data/lib/distance_metrics/euclidean_distance.rb +32 -0
  17. data/lib/distance_metrics/pearson_correlation.rb +44 -0
  18. data/lib/generators/datasource/active_record.rb +0 -0
  19. data/lib/models/cluster.rb +18 -0
  20. data/lib/models/point.rb +41 -0
  21. data/lib/models/vector.rb +30 -0
  22. data/spec/algorithms/density_based/dbscan_spec.rb +57 -0
  23. data/spec/datasource_adapters/active_record_spec.rb +0 -0
  24. data/spec/datasource_adapters/in_memory_spec.rb +82 -0
  25. data/spec/distance_metrics/average_difference_spec.rb +44 -0
  26. data/spec/distance_metrics/cosine_similarity_spec.rb +172 -0
  27. data/spec/distance_metrics/euclidean_distance_spec.rb +137 -0
  28. data/spec/distance_metrics/pearson_correlation_spec.rb +174 -0
  29. data/spec/generators/datasource/active_record_spec.rb +0 -0
  30. data/spec/models/cluster_spec.rb +0 -0
  31. data/spec/models/point_spec.rb +0 -0
  32. data/spec/models/vector_spec.rb +0 -0
  33. data/spec/spec_helper.rb +7 -2
  34. data/spec/support/dataset_helper.rb +19 -0
  35. data/spec/support/test_model.rb +9 -0
  36. metadata +31 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7da4fe20fbc049e8449e67437319f746dc368099
4
- data.tar.gz: ac84dd76c9e9cbe21cabb52e2425a90796717112
3
+ metadata.gz: ddf1c0c19cc0a107f1727d260d1101a2b4bf49f6
4
+ data.tar.gz: f6ab2b977b9759aaa69d215240c12df3f1fc9426
5
5
  SHA512:
6
- metadata.gz: 2fb3221634bbc0f02558d45f376f383c34e94a7fccd936d87fc7388e4bed3e868f26da552b1a7f11f57feb7fc4bb72949ac62f5be26a3d1bc5d77cad28afb716
7
- data.tar.gz: 1b81cd46f2d49efaea7081df28beaf361c2a4d1106d5f184c83fb85cd208a4e8df61037f141aa7590d28d7690347ee27cdc4b0bc0fb53fb73b390227841fb3b6
6
+ metadata.gz: 706097cbe232cef96549c0e078f72b60b224d194ad5249ff5a95c6dcb44a6213fb0a5df8742f2e8e65d58aee2385ddd7412a2ef2188e1f1715dfb9c5167d2a92
7
+ data.tar.gz: 6eef6d390e20f4e9d762060dca6d4f87194ef0cc5f3c4b59180d06558af9f00994a7d622930a82273cef258b6be500412cd760156a6036127ba8f67a0a30b749
data/.rspec CHANGED
@@ -1 +1,2 @@
1
+ --format Fuubar
1
2
  --color
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.1.6
4
+ - 2.2.2
5
+ - ruby-head
data/Gemfile CHANGED
@@ -1,14 +1,18 @@
1
- source "http://rubygems.org"
2
- # Add dependencies required to use your gem here.
3
- # Example:
4
- # gem "activesupport", ">= 2.3.5"
1
+ source 'http://rubygems.org'
5
2
 
6
- # Add dependencies to develop your gem here.
7
- # Include everything needed to run rake, tests, features, etc.
3
+ # Dependencies required to use the gem
4
+ gem 'activerecord', '~> 4.2', '>= 4.2.1'
5
+
6
+ # Dependencies to develop the gem
8
7
  group :development do
9
- gem "rspec", "~> 2.8.0"
10
- gem "rdoc", "~> 3.12"
11
- gem "bundler", "~> 1.0"
12
- gem "jeweler", "~> 2.0.1"
13
- gem "simplecov", ">= 0"
8
+ gem 'rspec', '~> 3.0', '>= 3.2.0'
9
+ gem 'rdoc', '~> 4.0', '>= 4.2.0'
10
+ gem 'bundler', '~> 1.0', '>= 1.10.2'
11
+ gem 'jeweler', '~> 2.0', '>= 2.0.1'
12
+ gem 'simplecov', '>= 0'
13
+ gem 'guard-rspec', '~> 4.0', '>= 4.5.1'
14
+ gem 'byebug', '~> 5.0'
15
+ gem 'awesome_print', '~> 1.6', '>= 1.6.1', require: 'awesome_print'
16
+ gem 'fuubar', '~> 2.0', '>= 2.0.0.rc1'
17
+ gem 'simple-random', '~> 1.0', '>= 1.0.0'
14
18
  end
data/Gemfile.lock ADDED
@@ -0,0 +1,149 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ activemodel (4.2.3)
5
+ activesupport (= 4.2.3)
6
+ builder (~> 3.1)
7
+ activerecord (4.2.3)
8
+ activemodel (= 4.2.3)
9
+ activesupport (= 4.2.3)
10
+ arel (~> 6.0)
11
+ activesupport (4.2.3)
12
+ i18n (~> 0.7)
13
+ json (~> 1.7, >= 1.7.7)
14
+ minitest (~> 5.1)
15
+ thread_safe (~> 0.3, >= 0.3.4)
16
+ tzinfo (~> 1.1)
17
+ addressable (2.3.8)
18
+ arel (6.0.1)
19
+ awesome_print (1.6.1)
20
+ builder (3.2.2)
21
+ byebug (5.0.0)
22
+ columnize (= 0.9.0)
23
+ coderay (1.1.0)
24
+ columnize (0.9.0)
25
+ descendants_tracker (0.0.4)
26
+ thread_safe (~> 0.3, >= 0.3.1)
27
+ diff-lcs (1.2.5)
28
+ docile (1.1.5)
29
+ faraday (0.9.1)
30
+ multipart-post (>= 1.2, < 3)
31
+ ffi (1.9.10)
32
+ formatador (0.2.5)
33
+ fuubar (2.0.0)
34
+ rspec (~> 3.0)
35
+ ruby-progressbar (~> 1.4)
36
+ git (1.2.9.1)
37
+ github_api (0.12.3)
38
+ addressable (~> 2.3)
39
+ descendants_tracker (~> 0.0.4)
40
+ faraday (~> 0.8, < 0.10)
41
+ hashie (>= 3.3)
42
+ multi_json (>= 1.7.5, < 2.0)
43
+ nokogiri (~> 1.6.3)
44
+ oauth2
45
+ guard (2.12.8)
46
+ formatador (>= 0.2.4)
47
+ listen (>= 2.7, <= 4.0)
48
+ lumberjack (~> 1.0)
49
+ nenv (~> 0.1)
50
+ notiffany (~> 0.0)
51
+ pry (>= 0.9.12)
52
+ shellany (~> 0.0)
53
+ thor (>= 0.18.1)
54
+ guard-compat (1.2.1)
55
+ guard-rspec (4.6.1)
56
+ guard (~> 2.1)
57
+ guard-compat (~> 1.1)
58
+ rspec (>= 2.99.0, < 4.0)
59
+ hashie (3.4.2)
60
+ highline (1.7.2)
61
+ i18n (0.7.0)
62
+ jeweler (2.0.1)
63
+ builder
64
+ bundler (>= 1.0)
65
+ git (>= 1.2.5)
66
+ github_api
67
+ highline (>= 1.6.15)
68
+ nokogiri (>= 1.5.10)
69
+ rake
70
+ rdoc
71
+ json (1.8.3)
72
+ jwt (1.5.1)
73
+ listen (3.0.2)
74
+ rb-fsevent (>= 0.9.3)
75
+ rb-inotify (>= 0.9)
76
+ lumberjack (1.0.9)
77
+ method_source (0.8.2)
78
+ mini_portile (0.6.2)
79
+ minitest (5.7.0)
80
+ multi_json (1.11.2)
81
+ multi_xml (0.5.5)
82
+ multipart-post (2.0.0)
83
+ nenv (0.2.0)
84
+ nokogiri (1.6.6.2)
85
+ mini_portile (~> 0.6.0)
86
+ notiffany (0.0.6)
87
+ nenv (~> 0.1)
88
+ shellany (~> 0.0)
89
+ oauth2 (1.0.0)
90
+ faraday (>= 0.8, < 0.10)
91
+ jwt (~> 1.0)
92
+ multi_json (~> 1.3)
93
+ multi_xml (~> 0.5)
94
+ rack (~> 1.2)
95
+ pry (0.10.1)
96
+ coderay (~> 1.1.0)
97
+ method_source (~> 0.8.1)
98
+ slop (~> 3.4)
99
+ rack (1.6.4)
100
+ rake (10.4.2)
101
+ rb-fsevent (0.9.5)
102
+ rb-inotify (0.9.5)
103
+ ffi (>= 0.5.0)
104
+ rdoc (4.2.0)
105
+ rspec (3.3.0)
106
+ rspec-core (~> 3.3.0)
107
+ rspec-expectations (~> 3.3.0)
108
+ rspec-mocks (~> 3.3.0)
109
+ rspec-core (3.3.1)
110
+ rspec-support (~> 3.3.0)
111
+ rspec-expectations (3.3.0)
112
+ diff-lcs (>= 1.2.0, < 2.0)
113
+ rspec-support (~> 3.3.0)
114
+ rspec-mocks (3.3.1)
115
+ diff-lcs (>= 1.2.0, < 2.0)
116
+ rspec-support (~> 3.3.0)
117
+ rspec-support (3.3.0)
118
+ ruby-progressbar (1.7.5)
119
+ shellany (0.0.1)
120
+ simple-random (1.0.0)
121
+ simplecov (0.10.0)
122
+ docile (~> 1.1.0)
123
+ json (~> 1.8)
124
+ simplecov-html (~> 0.10.0)
125
+ simplecov-html (0.10.0)
126
+ slop (3.6.0)
127
+ thor (0.19.1)
128
+ thread_safe (0.3.5)
129
+ tzinfo (1.2.2)
130
+ thread_safe (~> 0.1)
131
+
132
+ PLATFORMS
133
+ ruby
134
+
135
+ DEPENDENCIES
136
+ activerecord (~> 4.2, >= 4.2.1)
137
+ awesome_print (~> 1.6, >= 1.6.1)
138
+ bundler (~> 1.0, >= 1.10.2)
139
+ byebug (~> 5.0)
140
+ fuubar (~> 2.0, >= 2.0.0.rc1)
141
+ guard-rspec (~> 4.0, >= 4.5.1)
142
+ jeweler (~> 2.0, >= 2.0.1)
143
+ rdoc (~> 4.0, >= 4.2.0)
144
+ rspec (~> 3.0, >= 3.2.0)
145
+ simple-random (~> 1.0, >= 1.0.0)
146
+ simplecov
147
+
148
+ BUNDLED WITH
149
+ 1.10.5
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2015 Cihat Gündüz
1
+ Copyright (c) 2015 Cihat Gündüz
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/README.md ADDED
@@ -0,0 +1,90 @@
1
+ # DBClustering [![Build Status](https://travis-ci.org/Dschee/db_clustering.svg?branch=develop)](https://travis-ci.org/Dschee/db_clustering)
2
+
3
+ Please note that this gem is still in its very early stages and should not be considered stable.
4
+ Also it currently only supports the in-memory datasource adapter. In future versions an ActiveRecord adapter is planned but this is not yet implemented. Stay tuned.
5
+
6
+ ## Requirements
7
+
8
+ Ruby 2.1+ is required, earlier Rubies may work but are not officially supported.
9
+
10
+ ## Getting Started
11
+
12
+ This gem was developed to work best in Ruby on Rails projects.
13
+
14
+ 1. Add this gem to your Gemfile
15
+
16
+ gem 'db_clustering'
17
+
18
+ 2. Run `bundle install` in your terminal
19
+
20
+ 3. Implement the `clustering_vector` method in your model class and return either:
21
+ - an **array** with numeric values for similarity comparison
22
+ - a **hash** with numeric values for similarity comparison between keys existing in both hashes
23
+
24
+ See `TestModel` class within the `spec/support` directory for a very simple example.
25
+
26
+ 4. Decide for a *distance metric* and initialize it, e.g.:
27
+
28
+ ``` ruby
29
+ average_difference = DbClustering::DistanceMetrics::AverageDifference.new
30
+
31
+ # Instead you can also use one of the following:
32
+ cosine_similarity = DbClustering::DistanceMetrics::CosineSimilarity.new
33
+ euclidean_distance = DbClustering::DistanceMetrics::EuclideanDistance.new
34
+ pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new
35
+ ```
36
+
37
+ 5. Decide for a datasource adapter (currently only in-memory datasource available), e.g.:
38
+
39
+ ``` ruby
40
+ in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array)
41
+ ```
42
+
43
+ Please note that `your_array` should be an array filled with objects of the class type that implements the `clustering_vector` method from step 3.
44
+
45
+ An **ActiveRecord datasource** type is planned but not yet implemented. Please stay tuned.
46
+
47
+ 6. Decide for an **algorithm** and initialize it:
48
+
49
+ ``` ruby
50
+ dbscan = DbClustering::Algorithms::Dbscan.new(datasource: in_memory_datasource, distance_metric: average_difference)
51
+ ```
52
+ Please note that currently **only one algorithm is available**. More algorithms aren't currently planned but may be added if needed. Contributions are welcome, of course.
53
+
54
+ 7. Decide for the **algorithm parameters** and start the process of clustering your data:
55
+
56
+ ``` ruby
57
+ dbscan.cluster(max_distance: 10, min_neighbors: 5)
58
+ ```
59
+ The `max_distance` is the epsilon parameter and the `min_neighbors` the minPts parameter from the usual DBSCAN algorithm documentation (e.g. Wikipedia). You might want to try different values here first before you decide for the right values for your purpose.
60
+
61
+ Please also take note that the `max_distance` value is **highly dependent on the type of metric** you decided to go for. For the `AverageDifference` and `EuclideanDistance` metrics it can be an **open-ended positive value**. For the `CosineSimilarity` and `PearsonCorrelation` types it needs to be a value between 0 and 2 where a value of `0` means "100% positive correlation/similarity", a value of `1` means "no correlation/similarity at all" and a value of `2` means "100% negative correlation/similarity". You can use any decimal value in between (e.g. 0.25) as a partly positive/negative correlation.
62
+
63
+ 8. Wait for the calculations to finish and use the results the way you want:
64
+
65
+ ``` ruby
66
+ clusters = dbscan.clusters # the resulting Clusters, each cluster contains Points
67
+ first_cluster = clusters.first
68
+ point = first_cluster.points.first
69
+ # a point knows its cluster, and its position in there
70
+ point.cluster # will return the same object as `first_cluster`
71
+ point.is_edge_point? # boolean specifying if it's an edge point of its cluster
72
+ point.is_core_point? # boolean specifying if it's a core point of its cluster
73
+ point.is_noise_point? # boolean specifying if it's a noise point without a cluster
74
+
75
+ # a point also contains the source object specifying the `clustering_vector` method
76
+ your_model = point.datasource_point
77
+ ```
78
+
79
+ For more please don't hesitate to have a look into the underlying models under the `lib/models` directory as well as the corresponding specs.
80
+
81
+ That's it, it **looks more complicated than it actually is**, just try it out! You can find complete usage examples within the `spec/algorithms/density_based/dbscan_spec.rb` file.
82
+
83
+ ## Contributing
84
+
85
+ Contributions are welcome. Please fork this project, make your changes and file a pull request. Please also make sure to write tests to ensure your changes persist over time.
86
+
87
+
88
+ ## License
89
+
90
+ This gem is released under the [MIT License](http://www.opensource.org/licenses/MIT).
data/Rakefile CHANGED
@@ -14,13 +14,13 @@ require 'rake'
14
14
  require 'jeweler'
15
15
  Jeweler::Tasks.new do |gem|
16
16
  # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
17
- gem.name = "clustering"
18
- gem.homepage = "http://github.com/Dschee/clustering"
17
+ gem.name = "db_clustering"
18
+ gem.homepage = "http://github.com/Dschee/db_clustering"
19
19
  gem.license = "MIT"
20
- gem.summary = %Q{TODO: one-line summary of your gem}
21
- gem.description = %Q{TODO: longer description of your gem}
20
+ gem.summary = %Q{Big-Data clustering algorithms in Ruby (on Rails).}
21
+ gem.description = %Q{Big-Data clustering algorithms in Ruby (on Rails). Currently only supports DBSCAN algorithm and ActiveRecord database.}
22
22
  gem.email = "CihatGuenduez@posteo.de"
23
- gem.authors = ["Cihat Gündüz"]
23
+ gem.authors = ["Cihat Gündüz"]
24
24
  # dependencies defined in Gemfile
25
25
  end
26
26
  Jeweler::RubygemsDotOrgTasks.new
@@ -44,7 +44,7 @@ Rake::RDocTask.new do |rdoc|
44
44
  version = File.exist?('VERSION') ? File.read('VERSION') : ""
45
45
 
46
46
  rdoc.rdoc_dir = 'rdoc'
47
- rdoc.title = "clustering #{version}"
47
+ rdoc.title = "db_clustering #{version}"
48
48
  rdoc.rdoc_files.include('README*')
49
49
  rdoc.rdoc_files.include('lib/**/*.rb')
50
50
  end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.1
@@ -0,0 +1,48 @@
1
+ # For more information see https://en.wikipedia.org/wiki/DBSCAN
2
+
3
module DbClustering
  module Algorithms
    # Density-based clustering driver (DBSCAN-style).
    #
    # Works against any datasource object that responds to
    # `iterate_all_points` (yielding points) and
    # `neighbors(point:, distance_metric:, max_distance:)` (returning a
    # collection of points), and against any distance metric the datasource
    # accepts. The points are expected to respond to `cluster`, `is_noise=`
    # and `visited?`.
    class Dbscan

      attr_accessor :datasource, :clusters

      # @param datasource [#iterate_all_points, #neighbors] source of points
      # @param distance_metric [Object] metric handed through to
      #   `datasource.neighbors`
      def initialize(datasource:, distance_metric:)
        @datasource = datasource
        @distance_metric = distance_metric
        @clusters = []
      end

      # Runs the clustering pass and stores the result in `clusters`.
      #
      # Every point whose neighborhood holds fewer than `min_neighbors`
      # points is flagged as noise; every other point has its neighborhood
      # added to its current cluster (or to a newly created one).
      #
      # @param max_distance [Numeric] neighborhood radius passed to the
      #   datasource
      # @param min_neighbors [Integer] minimum neighborhood size for a point
      #   to seed or extend a cluster
      # @return [Object] whatever `datasource.iterate_all_points` returns;
      #   read the result from `clusters`
      def cluster(max_distance:, min_neighbors:)
        @clusters = []

        @datasource.iterate_all_points do |point|
          neighbors = @datasource.neighbors(point: point, distance_metric: @distance_metric, max_distance: max_distance)

          if neighbors.count < min_neighbors
            # NOTE(review): the point is flagged as noise even when an earlier
            # iteration already put it into a cluster - confirm that the Point
            # model resolves this as intended.
            point.is_noise = true
            next
          end

          expand_cluster(cluster: point.cluster || start_cluster, neighbors: neighbors)
        end
      end

      # Adds every not-yet-visited neighbor to the given cluster.
      #
      # Important: If neighbors do not include point itself, then point must
      # be added to cluster, too.
      #
      # NOTE(review): nothing in this class marks a point as visited, so the
      # `visited?` guard relies on the Point/Cluster models - verify there.
      #
      # @param cluster [#add] cluster receiving the neighbors
      # @param neighbors [#each] points found around the current point
      # @return [Object] the `neighbors` collection's `each` result
      def expand_cluster(cluster:, neighbors:)
        neighbors.each do |neighbor|
          cluster.add(neighbor) unless neighbor.visited?
        end
      end

      private

      # Creates an empty cluster, registers it in `clusters` and returns it.
      def start_cluster
        DbClustering::Models::Cluster.new.tap { |new_cluster| @clusters << new_cluster }
      end

    end
  end
end
@@ -0,0 +1,32 @@
1
module DbClustering
  module DatasourceAdapters
    # Datasource adapter that reads its points from a relation-like object
    # responding to `find_each`.
    #
    # Each record yielded by `find_each` is wrapped in a freshly created
    # `DbClustering::Models::Point` on every call.
    # NOTE(review): because the wrappers are re-created per call, state set
    # on a Point by a caller is presumably not visible in later calls -
    # confirm before relying on this adapter.
    class ActiveRecord

      # @param relation [#find_each] record source to iterate in batches
      def initialize(relation:)
        @relation = relation
      end

      # Yields one Point per record of the relation.
      #
      # @yieldparam point [DbClustering::Models::Point]
      # @return [Enumerator] when called without a block
      def iterate_all_points
        # Without this guard a blockless call would fail with LocalJumpError.
        return enum_for(:iterate_all_points) unless block_given?

        @relation.find_each do |datasource_point|
          yield(DbClustering::Models::Point.new(datasource_point))
        end
      end

      # Collects all points whose distance to `point` is at most
      # `max_distance` (the comparison is inclusive).
      #
      # @param point [#vector] reference point
      # @param distance_metric [#distance] called as
      #   `distance(point.vector, candidate.vector)`
      # @param max_distance [Numeric] inclusive upper bound
      # @return [Array<DbClustering::Models::Point>] matching points in
      #   iteration order
      def neighbors(point:, distance_metric:, max_distance:)
        found = []

        @relation.find_each do |record|
          candidate = DbClustering::Models::Point.new(record)
          found << candidate if distance_metric.distance(point.vector, candidate.vector) <= max_distance
        end

        found
      end

    end
  end
end
@@ -0,0 +1,29 @@
1
module DbClustering
  module DatasourceAdapters
    # Datasource adapter backed by a plain in-memory collection.
    #
    # Every element of the given array is wrapped once, at construction
    # time, in a `DbClustering::Models::Point`; the same Point objects are
    # handed out by both `iterate_all_points` and `neighbors`.
    class InMemory

      # @param array [#map] source objects to wrap in Points
      def initialize(array:)
        @array = array.map { |datasource_point| DbClustering::Models::Point.new(datasource_point) }
      end

      # Yields every wrapped point in the original order.
      #
      # @yieldparam point [DbClustering::Models::Point]
      # @return [Enumerator] when called without a block
      def iterate_all_points(&block)
        # Without this guard a blockless call would fail with LocalJumpError.
        return enum_for(:iterate_all_points) unless block_given?

        @array.each(&block)
      end

      # Selects all points whose distance to `point` is at most
      # `max_distance` (the comparison is inclusive, so `point` itself is
      # included whenever its self-distance is within the bound).
      #
      # @param point [#vector] reference point
      # @param distance_metric [#distance] called as
      #   `distance(point.vector, candidate.vector)`
      # @param max_distance [Numeric] inclusive upper bound
      # @return [Array<DbClustering::Models::Point>] matching points in
      #   their original order
      def neighbors(point:, distance_metric:, max_distance:)
        @array.select do |candidate|
          distance_metric.distance(point.vector, candidate.vector) <= max_distance
        end
      end

    end
  end
end