db_clustering 0.1.6 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a3804d604feaab377c7cbddb8af4318ba3e3b15a
4
- data.tar.gz: 8f32d2fa6074a2feea7c52f9be3d3ec586eead85
3
+ metadata.gz: 6d90adfb63dc205025dbd331adf219dbf8153232
4
+ data.tar.gz: 48467c1bac585bc7e1c95b028fcbebe73b310b3e
5
5
  SHA512:
6
- metadata.gz: 6f3b4a9eef1f060c81ef70dd9eb8216e913a88cebddc9152cbe88fedbe974d8d68f6b1e99baa161e8e8eff45025391f4f75f865cc657c78998c75720ff3e32c2
7
- data.tar.gz: 7b0d8fa98551a2a5a73a23b1c6989a113eb4c97e48a085839c5642d2e52fc12e4fbf73efb5404dc8892178a8a71533842f0afcc7cfff2651d3875b79e42bfabb
6
+ metadata.gz: 8f6cf2d2eb668c229599138a886ef0cdbf76487575faa2e07d53e50c83a4b1b358d8741b56e5fb4a12269d3e355c42e63edf7d8b5af1029c5ceb650b98c36bad
7
+ data.tar.gz: 2a383a33400a8f1582be0bd022b4bde47d52166b1b3286c0e0a0e74c73ed1beb3fdf1ed29a0d996565895df08ed9b1db5c2ca0cd627a8bdf77f4de565debd958
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # DBClustering [![Build Status](https://travis-ci.org/Dschee/db_clustering.svg?branch=develop)](https://travis-ci.org/Dschee/db_clustering)
1
+ # DBClustering [![Build Status](https://travis-ci.org/Flinesoft/db_clustering.svg?branch=develop)](https://travis-ci.org/Flinesoft/db_clustering)
2
2
 
3
3
  Please note that this gem is still in its very early stages and should not be considered stable.
4
4
  Also, it currently supports only the in-memory datasource adapter. An ActiveRecord adapter is planned for a future version but is not yet implemented. Stay tuned.
@@ -21,6 +21,8 @@ This gem was developed to work best in Ruby on Rails projects.
21
21
  - an **array** with numeric values for similarity comparison
22
22
  - a **hash** with numeric values for similarity comparison between keys existing in both hashes
23
23
 
24
+ The `clustering_vector` method can either take no parameters at all or take one parameter, which we call `vector_params` in this documentation. You can of course name it however you like – it will simply be passed through as you specify in a later step (step 5).
25
+
24
26
  See `TestModel` class within the `spec/support` directory for a very simple example.
25
27
 
26
28
  4. Decide on a *distance metric* and initialize it, e.g.:
@@ -34,10 +36,27 @@ This gem was developed to work best in Ruby on Rails projects.
34
36
  pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new
35
37
  ```
36
38
 
39
+ You can also specify that a distance is only meaningful if a minimum number of values is given when comparing two vectors. This is especially useful in combination with a hash clustering vector that has many different keys. A distance will then only be calculated if the given `min_dimensions` is reached or exceeded – otherwise the distance will be set to (near) infinity, and the vectors will therefore not be treated as belonging to the same cluster:
40
+
41
+ ``` ruby
42
+ average_difference = DbClustering::DistanceMetrics::AverageDifference.new(min_dimensions: 5)
43
+
44
+ # Instead you can also use one of the following:
45
+ cosine_similarity = DbClustering::DistanceMetrics::CosineSimilarity.new(min_dimensions: 5)
46
+ euclidean_distance = DbClustering::DistanceMetrics::EuclideanDistance.new(min_dimensions: 5)
47
+ pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new(min_dimensions: 5)
48
+ ```
49
+
37
50
  5. Decide on a datasource adapter (currently only the in-memory datasource is available), e.g.:
38
51
 
39
52
  ``` ruby
40
- in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array)
53
+ in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array)
54
+ ```
55
+
56
+ If you decided to accept a `vector_params` parameter in step 3, please pass a non-nil `vector_params` value to your datasource like this:
57
+
58
+ ``` ruby
59
+ in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array, vector_params: { any_object: :to_passthrough })
41
60
  ```
42
61
 
43
62
  Please note that `your_array` should be an array filled with objects of the class type that implements the `clustering_vector` method from step 3.
data/Rakefile CHANGED
@@ -15,7 +15,7 @@ require 'jeweler'
15
15
  Jeweler::Tasks.new do |gem|
16
16
  # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
17
17
  gem.name = "db_clustering"
18
- gem.homepage = "http://github.com/Dschee/db_clustering"
18
+ gem.homepage = "http://github.com/Flinesoft/db_clustering"
19
19
  gem.license = "MIT"
20
20
  gem.summary = %Q{Big-Data clustering algorithms in Ruby (on Rails).}
21
21
  gem.description = %Q{Big-Data clustering algorithms in Ruby (on Rails). Currently only supports DBSCAN algorithm and ActiveRecord database.}
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.5
1
+ 0.1.7
@@ -2,16 +2,17 @@ module DbClustering
2
2
  module DatasourceAdapters
3
3
  class ActiveRecord
4
4
 
5
- def initialize(relation:)
5
+ def initialize(relation:, vector_params: nil)
6
6
  @relation = relation
7
+ @vector_params = vector_params
7
8
  end
8
9
 
9
10
  def iterate_all_points
10
11
  points_count = @relation.count
11
12
  current_index = 0
12
-
13
+
13
14
  @relation.find_each do |datasource_point|
14
- point = DbClustering::Models::Point.new(datasource_point)
15
+ point = DbClustering::Models::Point.new(datasource_point: datasource_point, vector_params: @vector_params)
15
16
  yield(point, current_index, points_count)
16
17
  current_index += 1
17
18
  end
@@ -21,7 +22,7 @@ module DbClustering
21
22
  neighbors = []
22
23
 
23
24
  @relation.find_each do |neighbor_candidate|
24
- candidate_point = DbClustering::Models::Point.new(neighbor_candidate)
25
+ candidate_point = DbClustering::Models::Point.new(datasource_point: neighbor_candidate, vector_params: @vector_params)
25
26
 
26
27
  if distance_metric.distance(point.vector, candidate_point.vector) <= max_distance
27
28
  neighbors << candidate_point
@@ -2,8 +2,9 @@ module DbClustering
2
2
  module DatasourceAdapters
3
3
  class InMemory
4
4
 
5
- def initialize(array:)
6
- @array = array.map{ |datasource_point| DbClustering::Models::Point.new(datasource_point) }
5
+ def initialize(array:, vector_params: nil)
6
+ @vector_params = vector_params
7
+ @array = array.map{ |datasource_point| DbClustering::Models::Point.new(datasource_point: datasource_point, vector_params: @vector_params) }
7
8
  end
8
9
 
9
10
  def iterate_all_points
data/lib/models/point.rb CHANGED
@@ -4,14 +4,19 @@ module DbClustering
4
4
 
5
5
  attr_accessor :cluster, :is_noise, :datasource_point
6
6
 
7
- def initialize(datasource_point)
7
+ def initialize(datasource_point:, vector_params: nil)
8
8
  @is_noise = false
9
9
  @cluster = nil
10
10
  @datasource_point = datasource_point
11
+ @vector_params = vector_params
11
12
  end
12
13
 
13
14
  def vector
14
- vector_object = @datasource_point.clustering_vector
15
+ if @vector_params
16
+ vector_object = @datasource_point.clustering_vector(@vector_params)
17
+ else
18
+ vector_object = @datasource_point.clustering_vector
19
+ end
15
20
 
16
21
  if vector_object.is_a?(Hash) || vector_object.is_a?(Array)
17
22
  DbClustering::Models::Vector.new(object: vector_object)
@@ -6,7 +6,7 @@ describe DbClustering::Algorithms::Dbscan do
6
6
  before(:each) do
7
7
  @dataset = DatasetHelper.normal_distribution
8
8
 
9
- @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
9
+ @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
10
10
  @average_difference_metric = DbClustering::DistanceMetrics::AverageDifference.new
11
11
 
12
12
  @dbscan = DbClustering::Algorithms::Dbscan.new(datasource: @in_memory_datasource, distance_metric: @average_difference_metric)
@@ -22,7 +22,7 @@ describe DbClustering::Algorithms::Dbscan do
22
22
  @clusters_count = 10
23
23
  @dataset = DatasetHelper.normal_distribution(vector_size: 10, clusters: @clusters_count, datapoints: 100)
24
24
 
25
- @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
25
+ @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
26
26
  @average_difference_metric = DbClustering::DistanceMetrics::AverageDifference.new
27
27
 
28
28
  @dbscan = DbClustering::Algorithms::Dbscan.new(datasource: @in_memory_datasource, distance_metric: @average_difference_metric)
@@ -25,8 +25,8 @@ describe DbClustering::DatasourceAdapters::InMemory, type: :model do
25
25
  before(:each) do
26
26
  @dataset = DatasetHelper.normal_distribution(vector_size: 16, clusters: 8, datapoints: 80)
27
27
 
28
- @in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
29
- @first_point = DbClustering::Models::Point.new(@dataset.first)
28
+ @in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
29
+ @first_point = DbClustering::Models::Point.new(datasource_point: @dataset.first, vector_params: { type: 'Array' })
30
30
  end
31
31
 
32
32
  context "average difference" do
@@ -3,7 +3,13 @@ class TestModel
3
3
  @vector = vector
4
4
  end
5
5
 
6
- def clustering_vector
7
- @vector
6
+ def clustering_vector(vector_params)
7
+ if vector_params[:type] == 'Hash'
8
+ vector_as_hash = {}
9
+ @vector.map.with_index{ |x,i| vector_as_hash[i] = x }
10
+ return vector_as_hash
11
+ end
12
+
13
+ return @vector
8
14
  end
9
15
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: db_clustering
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cihat Gündüz