RubyGems - db_clustering - Versions diffs - 0.1.6 → 0.1.7 - Mend

db_clustering 0.1.6 → 0.1.7

Files changed (11) hide show

checksums.yaml +4 -4
data/README.md +21 -2
data/Rakefile +1 -1
data/VERSION +1 -1
data/lib/datasource_adapters/active_record.rb +5 -4
data/lib/datasource_adapters/in_memory.rb +3 -2
data/lib/models/point.rb +7 -2
data/spec/algorithms/density_based/dbscan_spec.rb +2 -2
data/spec/datasource_adapters/in_memory_spec.rb +2 -2
data/spec/support/test_model.rb +8 -2
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a3804d604feaab377c7cbddb8af4318ba3e3b15a
-  data.tar.gz: 8f32d2fa6074a2feea7c52f9be3d3ec586eead85
+  metadata.gz: 6d90adfb63dc205025dbd331adf219dbf8153232
+  data.tar.gz: 48467c1bac585bc7e1c95b028fcbebe73b310b3e
 SHA512:
-  metadata.gz: 6f3b4a9eef1f060c81ef70dd9eb8216e913a88cebddc9152cbe88fedbe974d8d68f6b1e99baa161e8e8eff45025391f4f75f865cc657c78998c75720ff3e32c2
-  data.tar.gz: 7b0d8fa98551a2a5a73a23b1c6989a113eb4c97e48a085839c5642d2e52fc12e4fbf73efb5404dc8892178a8a71533842f0afcc7cfff2651d3875b79e42bfabb
+  metadata.gz: 8f6cf2d2eb668c229599138a886ef0cdbf76487575faa2e07d53e50c83a4b1b358d8741b56e5fb4a12269d3e355c42e63edf7d8b5af1029c5ceb650b98c36bad
+  data.tar.gz: 2a383a33400a8f1582be0bd022b4bde47d52166b1b3286c0e0a0e74c73ed1beb3fdf1ed29a0d996565895df08ed9b1db5c2ca0cd627a8bdf77f4de565debd958

data/README.md CHANGED Viewed

@@ -1,4 +1,4 @@
-# DBClustering [![Build Status](https://travis-ci.org/Dschee/db_clustering.svg?branch=develop)](https://travis-ci.org/Dschee/db_clustering)
+# DBClustering [![Build Status](https://travis-ci.org/Flinesoft/db_clustering.svg?branch=develop)](https://travis-ci.org/Flinesoft/db_clustering)
 Please note that this gem is still in its very early stages and should not be considered stable.
 Also it currently only supports the in-memory datasource adapter. In future versions an ActiveRecord adapter is planned but this is not yet implemented. Stay tuned.
@@ -21,6 +21,8 @@ This gem was developed to work best in Ruby on Rails projects.
    - an **array** with numeric values for similarity comparison
    - a **hash** with numeric values for similarity comparison between keys existing in both hashes
+   The `clustering_vector` method can either take no parameters at all or exactly one parameter, which this documentation calls `vector_params`. You can of course name it any way you want – the value you specify in a later step (step 5) is simply passed through to it.
    See `TestModel` class within the `spec/support` directory for a very simple example.
 4. Decide for a *distance metric* and initialize it, e.g.:
@@ -34,10 +36,27 @@ This gem was developed to work best in Ruby on Rails projects.
    pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new
    ```
+   You can also specify that a distance only makes sense if a minimum number of values is available when comparing two vectors. This is especially useful in combination with a hash clustering vector and many different keys. A distance will then only be calculated if the given `min_dimensions` is reached or exceeded – otherwise the distance will be set to (near) infinity, so the two vectors will not be placed in the same cluster:
+   ``` ruby
+   average_difference = DbClustering::DistanceMetrics::AverageDifference.new(min_dimensions: 5)
+   # Instead you can also use one of the following:
+   cosine_similarity = DbClustering::DistanceMetrics::CosineSimilarity.new(min_dimensions: 5)
+   euclidean_distance = DbClustering::DistanceMetrics::EuclideanDistance.new(min_dimensions: 5)
+   pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new(min_dimensions: 5)
+   ```
 5. Decide for a datasource adapter (currently only in-memory datasource available), e.g.:
    ``` ruby
-    in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array)
+   in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array)
+   ```
+   If you decided to accept a `vector_params` parameter in step 3, please pass a non-nil `vector_params` value to your datasource like this:
+   ``` ruby
+   in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array, vector_params: { any_object: :to_passthrough })
    ```
    Please note that `your_array` should be an array filled with objects of the class that implements the `clustering_vector` method from step 3.

data/Rakefile CHANGED Viewed

@@ -15,7 +15,7 @@ require 'jeweler'
 Jeweler::Tasks.new do |gem|
   # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
   gem.name = "db_clustering"
-  gem.homepage = "http://github.com/Dschee/db_clustering"
+  gem.homepage = "http://github.com/Flinesoft/db_clustering"
   gem.license = "MIT"
   gem.summary = %Q{Big-Data clustering algorithms in Ruby (on Rails).}
   gem.description = %Q{Big-Data clustering algorithms in Ruby (on Rails). Currently only supports DBSCAN algorithm and ActiveRecord database.}

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.1.5
1	+ 0.1.7

data/lib/datasource_adapters/active_record.rb CHANGED Viewed

@@ -2,16 +2,17 @@ module DbClustering
   module DatasourceAdapters
     class ActiveRecord
-      def initialize(relation:)
+      def initialize(relation:, vector_params: nil)
         @relation = relation
+        @vector_params = vector_params
       end
       def iterate_all_points
         points_count = @relation.count
         current_index = 0
         @relation.find_each do |datasource_point|
-          point = DbClustering::Models::Point.new(datasource_point)
+          point = DbClustering::Models::Point.new(datasource_point: datasource_point, vector_params: @vector_params)
           yield(point, current_index, points_count)
           current_index += 1
         end
@@ -21,7 +22,7 @@ module DbClustering
         neighbors = []
         @relation.find_each do |neighbor_candidate|
-          candidate_point = DbClustering::Models::Point.new(neighbor_candidate)
+          candidate_point = DbClustering::Models::Point.new(datasource_point: neighbor_candidate, vector_params: @vector_params)
           if distance_metric.distance(point.vector, candidate_point.vector) <= max_distance
             neighbors << candidate_point

data/lib/datasource_adapters/in_memory.rb CHANGED Viewed

@@ -2,8 +2,9 @@ module DbClustering
   module DatasourceAdapters
     class InMemory
-      def initialize(array:)
-        @array = array.map{ |datasource_point| DbClustering::Models::Point.new(datasource_point) }
+      def initialize(array:, vector_params: nil)
+        @vector_params = vector_params
+        @array = array.map{ |datasource_point| DbClustering::Models::Point.new(datasource_point: datasource_point, vector_params: @vector_params) }
       end
       def iterate_all_points

data/lib/models/point.rb CHANGED Viewed

@@ -4,14 +4,19 @@ module DbClustering
       attr_accessor :cluster, :is_noise, :datasource_point
-      def initialize(datasource_point)
+      def initialize(datasource_point:, vector_params: nil)
         @is_noise = false
         @cluster = nil
         @datasource_point = datasource_point
+        @vector_params = vector_params
       end
       def vector
-        vector_object = @datasource_point.clustering_vector
+        if @vector_params
+          vector_object = @datasource_point.clustering_vector(@vector_params)
+        else
+          vector_object = @datasource_point.clustering_vector
+        end
         if vector_object.is_a?(Hash) || vector_object.is_a?(Array)
           DbClustering::Models::Vector.new(object: vector_object)

data/spec/algorithms/density_based/dbscan_spec.rb CHANGED Viewed

@@ -6,7 +6,7 @@ describe DbClustering::Algorithms::Dbscan do
     before(:each) do
       @dataset = DatasetHelper.normal_distribution
-      @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
+      @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
       @average_difference_metric = DbClustering::DistanceMetrics::AverageDifference.new
       @dbscan = DbClustering::Algorithms::Dbscan.new(datasource: @in_memory_datasource, distance_metric: @average_difference_metric)
@@ -22,7 +22,7 @@ describe DbClustering::Algorithms::Dbscan do
       @clusters_count = 10
       @dataset = DatasetHelper.normal_distribution(vector_size: 10, clusters: @clusters_count, datapoints: 100)
-      @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
+      @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
       @average_difference_metric = DbClustering::DistanceMetrics::AverageDifference.new
       @dbscan = DbClustering::Algorithms::Dbscan.new(datasource: @in_memory_datasource, distance_metric: @average_difference_metric)

data/spec/datasource_adapters/in_memory_spec.rb CHANGED Viewed

@@ -25,8 +25,8 @@ describe DbClustering::DatasourceAdapters::InMemory, type: :model do
     before(:each) do
       @dataset = DatasetHelper.normal_distribution(vector_size: 16, clusters: 8, datapoints: 80)
-      @in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
-      @first_point = DbClustering::Models::Point.new(@dataset.first)
+      @in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
+      @first_point = DbClustering::Models::Point.new(datasource_point: @dataset.first, vector_params: { type: 'Array' })
     end
     context "average difference" do

data/spec/support/test_model.rb CHANGED Viewed

@@ -3,7 +3,13 @@ class TestModel
     @vector = vector
   end
-  def clustering_vector
-    @vector
+  def clustering_vector(vector_params)
+    if vector_params[:type] == 'Hash'
+      vector_as_hash = {}
+      @vector.map.with_index{ |x,i| vector_as_hash[i] = x }
+      return vector_as_hash
+    end
+    return @vector
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: db_clustering
 version: !ruby/object:Gem::Version
-  version: 0.1.6
+  version: 0.1.7
 platform: ruby
 authors:
 - Cihat Gündüz