db_clustering 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +21 -2
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/datasource_adapters/active_record.rb +5 -4
- data/lib/datasource_adapters/in_memory.rb +3 -2
- data/lib/models/point.rb +7 -2
- data/spec/algorithms/density_based/dbscan_spec.rb +2 -2
- data/spec/datasource_adapters/in_memory_spec.rb +2 -2
- data/spec/support/test_model.rb +8 -2
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6d90adfb63dc205025dbd331adf219dbf8153232
|
4
|
+
data.tar.gz: 48467c1bac585bc7e1c95b028fcbebe73b310b3e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8f6cf2d2eb668c229599138a886ef0cdbf76487575faa2e07d53e50c83a4b1b358d8741b56e5fb4a12269d3e355c42e63edf7d8b5af1029c5ceb650b98c36bad
|
7
|
+
data.tar.gz: 2a383a33400a8f1582be0bd022b4bde47d52166b1b3286c0e0a0e74c73ed1beb3fdf1ed29a0d996565895df08ed9b1db5c2ca0cd627a8bdf77f4de565debd958
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# DBClustering [](https://travis-ci.org/Flinesoft/db_clustering)
|
2
2
|
|
3
3
|
Please note that this gem is still in its very early stages and should not be considered stable.
|
4
4
|
Also, it currently only supports the in-memory datasource adapter. An ActiveRecord adapter is planned for future versions but is not yet implemented. Stay tuned.
|
@@ -21,6 +21,8 @@ This gem was developed to work best in Ruby on Rails projects.
|
|
21
21
|
- an **array** with numeric values for similarity comparison
|
22
22
|
- a **hash** with numeric values for similarity comparison between keys existing in both hashes
|
23
23
|
|
24
|
+
The `clustering_vector` can either have no parameters at all or one parameter that we call `vector_params` within this documentation. You can of course name it the way you want – it will just be passed through as you specify in a later step (step 5).
|
25
|
+
|
24
26
|
See `TestModel` class within the `spec/support` directory for a very simple example.
|
25
27
|
|
26
28
|
4. Decide on a *distance metric* and initialize it, e.g.:
|
@@ -34,10 +36,27 @@ This gem was developed to work best in Ruby on Rails projects.
|
|
34
36
|
pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new
|
35
37
|
```
|
36
38
|
|
39
|
+
You can also specify that a distance only makes sense if a minimum number of values are given when comparing two vectors. This is especially useful in combination with a hash clustering vector and many different keys. A distance will then only be calculated if the given `min_dimensions` is reached or exceeded – otherwise the distance will be set to (near) infinity, so the two vectors will never be placed in the same cluster:
|
40
|
+
|
41
|
+
``` ruby
|
42
|
+
average_difference = DbClustering::DistanceMetrics::AverageDifference.new(min_dimensions: 5)
|
43
|
+
|
44
|
+
# Instead you can also use one of the following:
|
45
|
+
cosine_similarity = DbClustering::DistanceMetrics::CosineSimilarity.new(min_dimensions: 5)
|
46
|
+
euclidean_distance = DbClustering::DistanceMetrics::EuclideanDistance.new(min_dimensions: 5)
|
47
|
+
pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new(min_dimensions: 5)
|
48
|
+
```
|
49
|
+
|
37
50
|
5. Decide on a datasource adapter (currently only the in-memory datasource is available), e.g.:
|
38
51
|
|
39
52
|
``` ruby
|
40
|
-
|
53
|
+
in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array)
|
54
|
+
```
|
55
|
+
|
56
|
+
If you decided to accept a `vector_params` parameter in step 3, please pass a non-nil `vector_params` argument to your datasource like this:
|
57
|
+
|
58
|
+
``` ruby
|
59
|
+
in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array, vector_params: { any_object: :to_passthrough })
|
41
60
|
```
|
42
61
|
|
43
62
|
Please note that `your_array` should be an array filled with objects of the class type that implements the `clustering_vector` method from step 3.
|
data/Rakefile
CHANGED
@@ -15,7 +15,7 @@ require 'jeweler'
|
|
15
15
|
Jeweler::Tasks.new do |gem|
|
16
16
|
# gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
|
17
17
|
gem.name = "db_clustering"
|
18
|
-
gem.homepage = "http://github.com/
|
18
|
+
gem.homepage = "http://github.com/Flinesoft/db_clustering"
|
19
19
|
gem.license = "MIT"
|
20
20
|
gem.summary = %Q{Big-Data clustering algorithms in Ruby (on Rails).}
|
21
21
|
gem.description = %Q{Big-Data clustering algorithms in Ruby (on Rails). Currently only supports DBSCAN algorithm and ActiveRecord database.}
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.
|
1
|
+
0.1.7
|
@@ -2,16 +2,17 @@ module DbClustering
|
|
2
2
|
module DatasourceAdapters
|
3
3
|
class ActiveRecord
|
4
4
|
|
5
|
-
def initialize(relation:)
|
5
|
+
def initialize(relation:, vector_params: nil)
|
6
6
|
@relation = relation
|
7
|
+
@vector_params = vector_params
|
7
8
|
end
|
8
9
|
|
9
10
|
def iterate_all_points
|
10
11
|
points_count = @relation.count
|
11
12
|
current_index = 0
|
12
|
-
|
13
|
+
|
13
14
|
@relation.find_each do |datasource_point|
|
14
|
-
point = DbClustering::Models::Point.new(datasource_point)
|
15
|
+
point = DbClustering::Models::Point.new(datasource_point: datasource_point, vector_params: @vector_params)
|
15
16
|
yield(point, current_index, points_count)
|
16
17
|
current_index += 1
|
17
18
|
end
|
@@ -21,7 +22,7 @@ module DbClustering
|
|
21
22
|
neighbors = []
|
22
23
|
|
23
24
|
@relation.find_each do |neighbor_candidate|
|
24
|
-
candidate_point = DbClustering::Models::Point.new(neighbor_candidate)
|
25
|
+
candidate_point = DbClustering::Models::Point.new(datasource_point: neighbor_candidate, vector_params: @vector_params)
|
25
26
|
|
26
27
|
if distance_metric.distance(point.vector, candidate_point.vector) <= max_distance
|
27
28
|
neighbors << candidate_point
|
@@ -2,8 +2,9 @@ module DbClustering
|
|
2
2
|
module DatasourceAdapters
|
3
3
|
class InMemory
|
4
4
|
|
5
|
-
def initialize(array:)
|
6
|
-
@
|
5
|
+
def initialize(array:, vector_params: nil)
|
6
|
+
@vector_params = vector_params
|
7
|
+
@array = array.map{ |datasource_point| DbClustering::Models::Point.new(datasource_point: datasource_point, vector_params: @vector_params) }
|
7
8
|
end
|
8
9
|
|
9
10
|
def iterate_all_points
|
data/lib/models/point.rb
CHANGED
@@ -4,14 +4,19 @@ module DbClustering
|
|
4
4
|
|
5
5
|
attr_accessor :cluster, :is_noise, :datasource_point
|
6
6
|
|
7
|
-
def initialize(datasource_point)
|
7
|
+
def initialize(datasource_point:, vector_params: nil)
|
8
8
|
@is_noise = false
|
9
9
|
@cluster = nil
|
10
10
|
@datasource_point = datasource_point
|
11
|
+
@vector_params = vector_params
|
11
12
|
end
|
12
13
|
|
13
14
|
def vector
|
14
|
-
|
15
|
+
if @vector_params
|
16
|
+
vector_object = @datasource_point.clustering_vector(@vector_params)
|
17
|
+
else
|
18
|
+
vector_object = @datasource_point.clustering_vector
|
19
|
+
end
|
15
20
|
|
16
21
|
if vector_object.is_a?(Hash) || vector_object.is_a?(Array)
|
17
22
|
DbClustering::Models::Vector.new(object: vector_object)
|
@@ -6,7 +6,7 @@ describe DbClustering::Algorithms::Dbscan do
|
|
6
6
|
before(:each) do
|
7
7
|
@dataset = DatasetHelper.normal_distribution
|
8
8
|
|
9
|
-
@in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
|
9
|
+
@in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
|
10
10
|
@average_difference_metric = DbClustering::DistanceMetrics::AverageDifference.new
|
11
11
|
|
12
12
|
@dbscan = DbClustering::Algorithms::Dbscan.new(datasource: @in_memory_datasource, distance_metric: @average_difference_metric)
|
@@ -22,7 +22,7 @@ describe DbClustering::Algorithms::Dbscan do
|
|
22
22
|
@clusters_count = 10
|
23
23
|
@dataset = DatasetHelper.normal_distribution(vector_size: 10, clusters: @clusters_count, datapoints: 100)
|
24
24
|
|
25
|
-
@in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
|
25
|
+
@in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
|
26
26
|
@average_difference_metric = DbClustering::DistanceMetrics::AverageDifference.new
|
27
27
|
|
28
28
|
@dbscan = DbClustering::Algorithms::Dbscan.new(datasource: @in_memory_datasource, distance_metric: @average_difference_metric)
|
@@ -25,8 +25,8 @@ describe DbClustering::DatasourceAdapters::InMemory, type: :model do
|
|
25
25
|
before(:each) do
|
26
26
|
@dataset = DatasetHelper.normal_distribution(vector_size: 16, clusters: 8, datapoints: 80)
|
27
27
|
|
28
|
-
@in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
|
29
|
-
@first_point = DbClustering::Models::Point.new(@dataset.first)
|
28
|
+
@in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
|
29
|
+
@first_point = DbClustering::Models::Point.new(datasource_point: @dataset.first, vector_params: { type: 'Array' })
|
30
30
|
end
|
31
31
|
|
32
32
|
context "average difference" do
|
data/spec/support/test_model.rb
CHANGED
@@ -3,7 +3,13 @@ class TestModel
|
|
3
3
|
@vector = vector
|
4
4
|
end
|
5
5
|
|
6
|
-
def clustering_vector
|
7
|
-
|
6
|
+
def clustering_vector(vector_params)
|
7
|
+
if vector_params[:type] == 'Hash'
|
8
|
+
vector_as_hash = {}
|
9
|
+
@vector.map.with_index{ |x,i| vector_as_hash[i] = x }
|
10
|
+
return vector_as_hash
|
11
|
+
end
|
12
|
+
|
13
|
+
return @vector
|
8
14
|
end
|
9
15
|
end
|