db_clustering 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +21 -2
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/datasource_adapters/active_record.rb +5 -4
- data/lib/datasource_adapters/in_memory.rb +3 -2
- data/lib/models/point.rb +7 -2
- data/spec/algorithms/density_based/dbscan_spec.rb +2 -2
- data/spec/datasource_adapters/in_memory_spec.rb +2 -2
- data/spec/support/test_model.rb +8 -2
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6d90adfb63dc205025dbd331adf219dbf8153232
|
4
|
+
data.tar.gz: 48467c1bac585bc7e1c95b028fcbebe73b310b3e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8f6cf2d2eb668c229599138a886ef0cdbf76487575faa2e07d53e50c83a4b1b358d8741b56e5fb4a12269d3e355c42e63edf7d8b5af1029c5ceb650b98c36bad
|
7
|
+
data.tar.gz: 2a383a33400a8f1582be0bd022b4bde47d52166b1b3286c0e0a0e74c73ed1beb3fdf1ed29a0d996565895df08ed9b1db5c2ca0cd627a8bdf77f4de565debd958
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# DBClustering [![Build Status](https://travis-ci.org/
|
1
|
+
# DBClustering [![Build Status](https://travis-ci.org/Flinesoft/db_clustering.svg?branch=develop)](https://travis-ci.org/Flinesoft/db_clustering)
|
2
2
|
|
3
3
|
Please note that this gem is still in its very early stages and should not be considered stable.
|
4
4
|
Also, it currently only supports the in-memory datasource adapter. An ActiveRecord adapter is planned for future versions, but it is not yet implemented. Stay tuned.
|
@@ -21,6 +21,8 @@ This gem was developed to work best in Ruby on Rails projects.
|
|
21
21
|
- an **array** with numeric values for similarity comparison
|
22
22
|
- a **hash** with numeric values for similarity comparison between keys existing in both hashes
|
23
23
|
|
24
|
+
The `clustering_vector` method can either take no parameters at all or one parameter, which we call `vector_params` within this documentation. You can of course name it any way you want – it will just be passed through as you specify in a later step (step 5).
|
25
|
+
|
24
26
|
See `TestModel` class within the `spec/support` directory for a very simple example.
|
25
27
|
|
26
28
|
4. Decide on a *distance metric* and initialize it, e.g.:
|
@@ -34,10 +36,27 @@ This gem was developed to work best in Ruby on Rails projects.
|
|
34
36
|
pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new
|
35
37
|
```
|
36
38
|
|
39
|
+
You can also specify that a distance only makes sense if a minimum number of values is given when comparing two vectors. This is especially useful in combination with a hash clustering vector and many different keys. A distance will then only be calculated if the given `min_dimensions` is reached or exceeded – otherwise the distance will be set to (near) infinity, and therefore the vectors will not be placed in the same cluster:
|
40
|
+
|
41
|
+
``` ruby
|
42
|
+
average_difference = DbClustering::DistanceMetrics::AverageDifference.new(min_dimensions: 5)
|
43
|
+
|
44
|
+
# Instead you can also use one of the following:
|
45
|
+
cosine_similarity = DbClustering::DistanceMetrics::CosineSimilarity.new(min_dimensions: 5)
|
46
|
+
euclidean_distance = DbClustering::DistanceMetrics::EuclideanDistance.new(min_dimensions: 5)
|
47
|
+
pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new(min_dimensions: 5)
|
48
|
+
```
|
49
|
+
|
37
50
|
5. Decide on a datasource adapter (currently only the in-memory datasource is available), e.g.:
|
38
51
|
|
39
52
|
``` ruby
|
40
|
-
|
53
|
+
in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array)
|
54
|
+
```
|
55
|
+
|
56
|
+
If you decided to accept a `vector_params` parameter in step 3, please pass a non-nil `vector_params` argument to your datasource like this:
|
57
|
+
|
58
|
+
``` ruby
|
59
|
+
in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array, vector_params: { any_object: :to_passthrough })
|
41
60
|
```
|
42
61
|
|
43
62
|
Please note that `your_array` should be an array filled with objects of the class type that implements the `clustering_vector` method from step 3.
|
data/Rakefile
CHANGED
@@ -15,7 +15,7 @@ require 'jeweler'
|
|
15
15
|
Jeweler::Tasks.new do |gem|
|
16
16
|
# gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
|
17
17
|
gem.name = "db_clustering"
|
18
|
-
gem.homepage = "http://github.com/
|
18
|
+
gem.homepage = "http://github.com/Flinesoft/db_clustering"
|
19
19
|
gem.license = "MIT"
|
20
20
|
gem.summary = %Q{Big-Data clustering algorithms in Ruby (on Rails).}
|
21
21
|
gem.description = %Q{Big-Data clustering algorithms in Ruby (on Rails). Currently only supports DBSCAN algorithm and ActiveRecord database.}
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.
|
1
|
+
0.1.7
|
@@ -2,16 +2,17 @@ module DbClustering
|
|
2
2
|
module DatasourceAdapters
|
3
3
|
class ActiveRecord
|
4
4
|
|
5
|
-
def initialize(relation:)
|
5
|
+
def initialize(relation:, vector_params: nil)
|
6
6
|
@relation = relation
|
7
|
+
@vector_params = vector_params
|
7
8
|
end
|
8
9
|
|
9
10
|
def iterate_all_points
|
10
11
|
points_count = @relation.count
|
11
12
|
current_index = 0
|
12
|
-
|
13
|
+
|
13
14
|
@relation.find_each do |datasource_point|
|
14
|
-
point = DbClustering::Models::Point.new(datasource_point)
|
15
|
+
point = DbClustering::Models::Point.new(datasource_point: datasource_point, vector_params: @vector_params)
|
15
16
|
yield(point, current_index, points_count)
|
16
17
|
current_index += 1
|
17
18
|
end
|
@@ -21,7 +22,7 @@ module DbClustering
|
|
21
22
|
neighbors = []
|
22
23
|
|
23
24
|
@relation.find_each do |neighbor_candidate|
|
24
|
-
candidate_point = DbClustering::Models::Point.new(neighbor_candidate)
|
25
|
+
candidate_point = DbClustering::Models::Point.new(datasource_point: neighbor_candidate, vector_params: @vector_params)
|
25
26
|
|
26
27
|
if distance_metric.distance(point.vector, candidate_point.vector) <= max_distance
|
27
28
|
neighbors << candidate_point
|
@@ -2,8 +2,9 @@ module DbClustering
|
|
2
2
|
module DatasourceAdapters
|
3
3
|
class InMemory
|
4
4
|
|
5
|
-
def initialize(array:)
|
6
|
-
@
|
5
|
+
def initialize(array:, vector_params: nil)
|
6
|
+
@vector_params = vector_params
|
7
|
+
@array = array.map{ |datasource_point| DbClustering::Models::Point.new(datasource_point: datasource_point, vector_params: @vector_params) }
|
7
8
|
end
|
8
9
|
|
9
10
|
def iterate_all_points
|
data/lib/models/point.rb
CHANGED
@@ -4,14 +4,19 @@ module DbClustering
|
|
4
4
|
|
5
5
|
attr_accessor :cluster, :is_noise, :datasource_point
|
6
6
|
|
7
|
-
def initialize(datasource_point)
|
7
|
+
def initialize(datasource_point:, vector_params: nil)
|
8
8
|
@is_noise = false
|
9
9
|
@cluster = nil
|
10
10
|
@datasource_point = datasource_point
|
11
|
+
@vector_params = vector_params
|
11
12
|
end
|
12
13
|
|
13
14
|
def vector
|
14
|
-
|
15
|
+
if @vector_params
|
16
|
+
vector_object = @datasource_point.clustering_vector(@vector_params)
|
17
|
+
else
|
18
|
+
vector_object = @datasource_point.clustering_vector
|
19
|
+
end
|
15
20
|
|
16
21
|
if vector_object.is_a?(Hash) || vector_object.is_a?(Array)
|
17
22
|
DbClustering::Models::Vector.new(object: vector_object)
|
@@ -6,7 +6,7 @@ describe DbClustering::Algorithms::Dbscan do
|
|
6
6
|
before(:each) do
|
7
7
|
@dataset = DatasetHelper.normal_distribution
|
8
8
|
|
9
|
-
@in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
|
9
|
+
@in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
|
10
10
|
@average_difference_metric = DbClustering::DistanceMetrics::AverageDifference.new
|
11
11
|
|
12
12
|
@dbscan = DbClustering::Algorithms::Dbscan.new(datasource: @in_memory_datasource, distance_metric: @average_difference_metric)
|
@@ -22,7 +22,7 @@ describe DbClustering::Algorithms::Dbscan do
|
|
22
22
|
@clusters_count = 10
|
23
23
|
@dataset = DatasetHelper.normal_distribution(vector_size: 10, clusters: @clusters_count, datapoints: 100)
|
24
24
|
|
25
|
-
@in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
|
25
|
+
@in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
|
26
26
|
@average_difference_metric = DbClustering::DistanceMetrics::AverageDifference.new
|
27
27
|
|
28
28
|
@dbscan = DbClustering::Algorithms::Dbscan.new(datasource: @in_memory_datasource, distance_metric: @average_difference_metric)
|
@@ -25,8 +25,8 @@ describe DbClustering::DatasourceAdapters::InMemory, type: :model do
|
|
25
25
|
before(:each) do
|
26
26
|
@dataset = DatasetHelper.normal_distribution(vector_size: 16, clusters: 8, datapoints: 80)
|
27
27
|
|
28
|
-
@in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
|
29
|
-
@first_point = DbClustering::Models::Point.new(@dataset.first)
|
28
|
+
@in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
|
29
|
+
@first_point = DbClustering::Models::Point.new(datasource_point: @dataset.first, vector_params: { type: 'Array' })
|
30
30
|
end
|
31
31
|
|
32
32
|
context "average difference" do
|
data/spec/support/test_model.rb
CHANGED
@@ -3,7 +3,13 @@ class TestModel
|
|
3
3
|
@vector = vector
|
4
4
|
end
|
5
5
|
|
6
|
-
def clustering_vector
|
7
|
-
|
6
|
+
def clustering_vector(vector_params)
|
7
|
+
if vector_params[:type] == 'Hash'
|
8
|
+
vector_as_hash = {}
|
9
|
+
@vector.map.with_index{ |x,i| vector_as_hash[i] = x }
|
10
|
+
return vector_as_hash
|
11
|
+
end
|
12
|
+
|
13
|
+
return @vector
|
8
14
|
end
|
9
15
|
end
|