db_clustering 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a3804d604feaab377c7cbddb8af4318ba3e3b15a
4
- data.tar.gz: 8f32d2fa6074a2feea7c52f9be3d3ec586eead85
3
+ metadata.gz: 6d90adfb63dc205025dbd331adf219dbf8153232
4
+ data.tar.gz: 48467c1bac585bc7e1c95b028fcbebe73b310b3e
5
5
  SHA512:
6
- metadata.gz: 6f3b4a9eef1f060c81ef70dd9eb8216e913a88cebddc9152cbe88fedbe974d8d68f6b1e99baa161e8e8eff45025391f4f75f865cc657c78998c75720ff3e32c2
7
- data.tar.gz: 7b0d8fa98551a2a5a73a23b1c6989a113eb4c97e48a085839c5642d2e52fc12e4fbf73efb5404dc8892178a8a71533842f0afcc7cfff2651d3875b79e42bfabb
6
+ metadata.gz: 8f6cf2d2eb668c229599138a886ef0cdbf76487575faa2e07d53e50c83a4b1b358d8741b56e5fb4a12269d3e355c42e63edf7d8b5af1029c5ceb650b98c36bad
7
+ data.tar.gz: 2a383a33400a8f1582be0bd022b4bde47d52166b1b3286c0e0a0e74c73ed1beb3fdf1ed29a0d996565895df08ed9b1db5c2ca0cd627a8bdf77f4de565debd958
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # DBClustering [![Build Status](https://travis-ci.org/Dschee/db_clustering.svg?branch=develop)](https://travis-ci.org/Dschee/db_clustering)
1
+ # DBClustering [![Build Status](https://travis-ci.org/Flinesoft/db_clustering.svg?branch=develop)](https://travis-ci.org/Flinesoft/db_clustering)
2
2
 
3
3
  Please note that this gem is still in its very early stages and should not be considered stable.
4
4
  Also it currently only supports the in-memory datasource adapter. In future versions an ActiveRecord adapter is planned but this is not yet implemented. Stay tuned.
@@ -21,6 +21,8 @@ This gem was developed to work best in Ruby on Rails projects.
21
21
  - an **array** with numeric values for similarity comparison
22
22
  - a **hash** with numeric values for similarity comparison between keys existing in both hashes
23
23
 
24
+ The `clustering_vector` method can either take no parameters at all or a single parameter, which we call `vector_params` in this documentation. You can of course name it however you like – the value you specify in a later step (step 5) is simply passed through to it.
25
+
24
26
  See `TestModel` class within the `spec/support` directory for a very simple example.
25
27
 
26
28
  4. Decide for a *distance metric* and initialize it, e.g.:
@@ -34,10 +36,27 @@ This gem was developed to work best in Ruby on Rails projects.
34
36
  pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new
35
37
  ```
36
38
 
39
+ You can also specify that a distance only makes sense if a minimum number of values is available when comparing two vectors. This is especially useful in combination with a hash clustering vector and many different keys. A distance will then only be calculated if the given `min_dimensions` is reached or exceeded – otherwise the distance is set to (near) infinity, so the two vectors will not end up in the same cluster:
40
+
41
+ ``` ruby
42
+ average_difference = DbClustering::DistanceMetrics::AverageDifference.new(min_dimensions: 5)
43
+
44
+ # Instead you can also use one of the following:
45
+ cosine_similarity = DbClustering::DistanceMetrics::CosineSimilarity.new(min_dimensions: 5)
46
+ euclidean_distance = DbClustering::DistanceMetrics::EuclideanDistance.new(min_dimensions: 5)
47
+ pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new(min_dimensions: 5)
48
+ ```
49
+
37
50
  5. Decide for a datasource adapter (currently only in-memory datasource available), e.g.:
38
51
 
39
52
  ``` ruby
40
- in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array)
53
+ in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array)
54
+ ```
55
+
56
+ If you decided to accept a `vector_params` parameter in step 3, please pass a non-nil `vector_params` argument to your datasource like this:
57
+
58
+ ``` ruby
59
+ in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array, vector_params: { any_object: :to_passthrough })
41
60
  ```
42
61
 
43
62
  Please note that `your_array` should be an array filled with objects of a class that implements the `clustering_vector` method from step 3.
data/Rakefile CHANGED
@@ -15,7 +15,7 @@ require 'jeweler'
15
15
  Jeweler::Tasks.new do |gem|
16
16
  # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
17
17
  gem.name = "db_clustering"
18
- gem.homepage = "http://github.com/Dschee/db_clustering"
18
+ gem.homepage = "http://github.com/Flinesoft/db_clustering"
19
19
  gem.license = "MIT"
20
20
  gem.summary = %Q{Big-Data clustering algorithms in Ruby (on Rails).}
21
21
  gem.description = %Q{Big-Data clustering algorithms in Ruby (on Rails). Currently only supports DBSCAN algorithm and ActiveRecord database.}
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.5
1
+ 0.1.7
@@ -2,16 +2,17 @@ module DbClustering
2
2
  module DatasourceAdapters
3
3
  class ActiveRecord
4
4
 
5
- def initialize(relation:)
5
+ def initialize(relation:, vector_params: nil)
6
6
  @relation = relation
7
+ @vector_params = vector_params
7
8
  end
8
9
 
9
10
  def iterate_all_points
10
11
  points_count = @relation.count
11
12
  current_index = 0
12
-
13
+
13
14
  @relation.find_each do |datasource_point|
14
- point = DbClustering::Models::Point.new(datasource_point)
15
+ point = DbClustering::Models::Point.new(datasource_point: datasource_point, vector_params: @vector_params)
15
16
  yield(point, current_index, points_count)
16
17
  current_index += 1
17
18
  end
@@ -21,7 +22,7 @@ module DbClustering
21
22
  neighbors = []
22
23
 
23
24
  @relation.find_each do |neighbor_candidate|
24
- candidate_point = DbClustering::Models::Point.new(neighbor_candidate)
25
+ candidate_point = DbClustering::Models::Point.new(datasource_point: neighbor_candidate, vector_params: @vector_params)
25
26
 
26
27
  if distance_metric.distance(point.vector, candidate_point.vector) <= max_distance
27
28
  neighbors << candidate_point
@@ -2,8 +2,9 @@ module DbClustering
2
2
  module DatasourceAdapters
3
3
  class InMemory
4
4
 
5
- def initialize(array:)
6
- @array = array.map{ |datasource_point| DbClustering::Models::Point.new(datasource_point) }
5
+ def initialize(array:, vector_params: nil)
6
+ @vector_params = vector_params
7
+ @array = array.map{ |datasource_point| DbClustering::Models::Point.new(datasource_point: datasource_point, vector_params: @vector_params) }
7
8
  end
8
9
 
9
10
  def iterate_all_points
data/lib/models/point.rb CHANGED
@@ -4,14 +4,19 @@ module DbClustering
4
4
 
5
5
  attr_accessor :cluster, :is_noise, :datasource_point
6
6
 
7
- def initialize(datasource_point)
7
+ def initialize(datasource_point:, vector_params: nil)
8
8
  @is_noise = false
9
9
  @cluster = nil
10
10
  @datasource_point = datasource_point
11
+ @vector_params = vector_params
11
12
  end
12
13
 
13
14
  def vector
14
- vector_object = @datasource_point.clustering_vector
15
+ if @vector_params
16
+ vector_object = @datasource_point.clustering_vector(@vector_params)
17
+ else
18
+ vector_object = @datasource_point.clustering_vector
19
+ end
15
20
 
16
21
  if vector_object.is_a?(Hash) || vector_object.is_a?(Array)
17
22
  DbClustering::Models::Vector.new(object: vector_object)
@@ -6,7 +6,7 @@ describe DbClustering::Algorithms::Dbscan do
6
6
  before(:each) do
7
7
  @dataset = DatasetHelper.normal_distribution
8
8
 
9
- @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
9
+ @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
10
10
  @average_difference_metric = DbClustering::DistanceMetrics::AverageDifference.new
11
11
 
12
12
  @dbscan = DbClustering::Algorithms::Dbscan.new(datasource: @in_memory_datasource, distance_metric: @average_difference_metric)
@@ -22,7 +22,7 @@ describe DbClustering::Algorithms::Dbscan do
22
22
  @clusters_count = 10
23
23
  @dataset = DatasetHelper.normal_distribution(vector_size: 10, clusters: @clusters_count, datapoints: 100)
24
24
 
25
- @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
25
+ @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
26
26
  @average_difference_metric = DbClustering::DistanceMetrics::AverageDifference.new
27
27
 
28
28
  @dbscan = DbClustering::Algorithms::Dbscan.new(datasource: @in_memory_datasource, distance_metric: @average_difference_metric)
@@ -25,8 +25,8 @@ describe DbClustering::DatasourceAdapters::InMemory, type: :model do
25
25
  before(:each) do
26
26
  @dataset = DatasetHelper.normal_distribution(vector_size: 16, clusters: 8, datapoints: 80)
27
27
 
28
- @in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
29
- @first_point = DbClustering::Models::Point.new(@dataset.first)
28
+ @in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
29
+ @first_point = DbClustering::Models::Point.new(datasource_point: @dataset.first, vector_params: { type: 'Array' })
30
30
  end
31
31
 
32
32
  context "average difference" do
@@ -3,7 +3,13 @@ class TestModel
3
3
  @vector = vector
4
4
  end
5
5
 
6
- def clustering_vector
7
- @vector
6
+ def clustering_vector(vector_params)
7
+ if vector_params[:type] == 'Hash'
8
+ vector_as_hash = {}
9
+ @vector.map.with_index{ |x,i| vector_as_hash[i] = x }
10
+ return vector_as_hash
11
+ end
12
+
13
+ return @vector
8
14
  end
9
15
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: db_clustering
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cihat Gündüz