RubyGems - db_clustering - Versions diffs - 0.1.6 → 0.1.7 - Mend

db_clustering 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

checksums.yaml +4 -4
data/README.md +21 -2
data/Rakefile +1 -1
data/VERSION +1 -1
data/lib/datasource_adapters/active_record.rb +5 -4
data/lib/datasource_adapters/in_memory.rb +3 -2
data/lib/models/point.rb +7 -2
data/spec/algorithms/density_based/dbscan_spec.rb +2 -2
data/spec/datasource_adapters/in_memory_spec.rb +2 -2
data/spec/support/test_model.rb +8 -2
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a3804d604feaab377c7cbddb8af4318ba3e3b15a
-  data.tar.gz: 8f32d2fa6074a2feea7c52f9be3d3ec586eead85
+  metadata.gz: 6d90adfb63dc205025dbd331adf219dbf8153232
+  data.tar.gz: 48467c1bac585bc7e1c95b028fcbebe73b310b3e
 SHA512:
-  metadata.gz: 6f3b4a9eef1f060c81ef70dd9eb8216e913a88cebddc9152cbe88fedbe974d8d68f6b1e99baa161e8e8eff45025391f4f75f865cc657c78998c75720ff3e32c2
-  data.tar.gz: 7b0d8fa98551a2a5a73a23b1c6989a113eb4c97e48a085839c5642d2e52fc12e4fbf73efb5404dc8892178a8a71533842f0afcc7cfff2651d3875b79e42bfabb
+  metadata.gz: 8f6cf2d2eb668c229599138a886ef0cdbf76487575faa2e07d53e50c83a4b1b358d8741b56e5fb4a12269d3e355c42e63edf7d8b5af1029c5ceb650b98c36bad
+  data.tar.gz: 2a383a33400a8f1582be0bd022b4bde47d52166b1b3286c0e0a0e74c73ed1beb3fdf1ed29a0d996565895df08ed9b1db5c2ca0cd627a8bdf77f4de565debd958

data/README.md CHANGED Viewed

@@ -1,4 +1,4 @@
-# DBClustering [![Build Status](https://travis-ci.org/Dschee/db_clustering.svg?branch=develop)](https://travis-ci.org/Dschee/db_clustering)
+# DBClustering [![Build Status](https://travis-ci.org/Flinesoft/db_clustering.svg?branch=develop)](https://travis-ci.org/Flinesoft/db_clustering)
 Please note that this gem is still in its very early stages and should not be considered stable.
 Also it currently only supports the in-memory datasource adapter. In future versions an ActiveRecord adapter is planned but this is not yet implemented. Stay tuned.
@@ -21,6 +21,8 @@ This gem was developed to work best in Ruby on Rails projects.
    - an **array** with numeric values for similarity comparison
    - a **hash** with numeric values for similarity comparison between keys existing in both hashes
+   The `clustering_vector` can either have no parameters at all or one parameter that we call `vector_params` within this documentation. You can of course name it the way you want – it will just be passed through as you specify in a later step (step 5).
    See `TestModel` class within the `spec/support` directory for a very simple example.
 4. Decide for a *distance metric* and initialize it, e.g.:
@@ -34,10 +36,27 @@ This gem was developed to work best in Ruby on Rails projects.
    pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new
    ```
+   You can also specify that a distance only makes sense if a minimum number of values are given when comparing two vectors. This is especially useful in combination with a hash clustering vector and many different keys. A distance will then only be calculated if the given `min_dimensions` is reached or exceeded – otherwise the distance will be set to (near) infinity and therefore the vectors will not be ranked as in the same cluster:
+   ``` ruby
+   average_difference = DbClustering::DistanceMetrics::AverageDifference.new(min_dimensions: 5)
+   # Instead you can also use one of the following:
+   cosine_similarity = DbClustering::DistanceMetrics::CosineSimilarity.new(min_dimensions: 5)
+   euclidean_distance = DbClustering::DistanceMetrics::EuclideanDistance.new(min_dimensions: 5)
+   pearson_correlation = DbClustering::DistanceMetrics::PearsonCorrelation.new(min_dimensions: 5)
+   ```
 5. Decide for a datasource adapter (currently only in-memory datasource available), e.g.:
    ``` ruby
-    in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array)
+   in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array)
+   ```
+   If you decided to accept a `vector_params` parameter in step 3 please add a non-nil parameter in your datasource like this:
+   ``` ruby
+   in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: your_array, vector_params: { any_object: :to_passthrough })
    ```
    Please note that `your_array` should be an array filled with objects of the class type that implements the `clustering_vector` method from step 3.

data/Rakefile CHANGED Viewed

@@ -15,7 +15,7 @@ require 'jeweler'
 Jeweler::Tasks.new do |gem|
   # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
   gem.name = "db_clustering"
-  gem.homepage = "http://github.com/Dschee/db_clustering"
+  gem.homepage = "http://github.com/Flinesoft/db_clustering"
   gem.license = "MIT"
   gem.summary = %Q{Big-Data clustering algorithms in Ruby (on Rails).}
   gem.description = %Q{Big-Data clustering algorithms in Ruby (on Rails). Currently only supports DBSCAN algorithm and ActiveRecord database.}

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.1.5
1	+ 0.1.7

data/lib/datasource_adapters/active_record.rb CHANGED Viewed

@@ -2,16 +2,17 @@ module DbClustering
   module DatasourceAdapters
     class ActiveRecord
-      def initialize(relation:)
+      def initialize(relation:, vector_params: nil)
         @relation = relation
+        @vector_params = vector_params
       end
       def iterate_all_points
         points_count = @relation.count
         current_index = 0
         @relation.find_each do |datasource_point|
-          point = DbClustering::Models::Point.new(datasource_point)
+          point = DbClustering::Models::Point.new(datasource_point: datasource_point, vector_params: @vector_params)
           yield(point, current_index, points_count)
           current_index += 1
         end
@@ -21,7 +22,7 @@ module DbClustering
         neighbors = []
         @relation.find_each do |neighbor_candidate|
-          candidate_point = DbClustering::Models::Point.new(neighbor_candidate)
+          candidate_point = DbClustering::Models::Point.new(datasource_point: neighbor_candidate, vector_params: @vector_params)
           if distance_metric.distance(point.vector, candidate_point.vector) <= max_distance
             neighbors << candidate_point

data/lib/datasource_adapters/in_memory.rb CHANGED Viewed

@@ -2,8 +2,9 @@ module DbClustering
   module DatasourceAdapters
     class InMemory
-      def initialize(array:)
-        @array = array.map{ |datasource_point| DbClustering::Models::Point.new(datasource_point) }
+      def initialize(array:, vector_params: nil)
+        @vector_params = vector_params
+        @array = array.map{ |datasource_point| DbClustering::Models::Point.new(datasource_point: datasource_point, vector_params: @vector_params) }
       end
       def iterate_all_points

data/lib/models/point.rb CHANGED Viewed

@@ -4,14 +4,19 @@ module DbClustering
       attr_accessor :cluster, :is_noise, :datasource_point
-      def initialize(datasource_point)
+      def initialize(datasource_point:, vector_params: nil)
         @is_noise = false
         @cluster = nil
         @datasource_point = datasource_point
+        @vector_params = vector_params
       end
       def vector
-        vector_object = @datasource_point.clustering_vector
+        if @vector_params
+          vector_object = @datasource_point.clustering_vector(@vector_params)
+        else
+          vector_object = @datasource_point.clustering_vector
+        end
         if vector_object.is_a?(Hash) || vector_object.is_a?(Array)
           DbClustering::Models::Vector.new(object: vector_object)

data/spec/algorithms/density_based/dbscan_spec.rb CHANGED Viewed

@@ -6,7 +6,7 @@ describe DbClustering::Algorithms::Dbscan do
     before(:each) do
       @dataset = DatasetHelper.normal_distribution
-      @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
+      @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
       @average_difference_metric = DbClustering::DistanceMetrics::AverageDifference.new
       @dbscan = DbClustering::Algorithms::Dbscan.new(datasource: @in_memory_datasource, distance_metric: @average_difference_metric)
@@ -22,7 +22,7 @@ describe DbClustering::Algorithms::Dbscan do
       @clusters_count = 10
       @dataset = DatasetHelper.normal_distribution(vector_size: 10, clusters: @clusters_count, datapoints: 100)
-      @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
+      @in_memory_datasource = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
       @average_difference_metric = DbClustering::DistanceMetrics::AverageDifference.new
       @dbscan = DbClustering::Algorithms::Dbscan.new(datasource: @in_memory_datasource, distance_metric: @average_difference_metric)

data/spec/datasource_adapters/in_memory_spec.rb CHANGED Viewed

@@ -25,8 +25,8 @@ describe DbClustering::DatasourceAdapters::InMemory, type: :model do
     before(:each) do
       @dataset = DatasetHelper.normal_distribution(vector_size: 16, clusters: 8, datapoints: 80)
-      @in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset)
-      @first_point = DbClustering::Models::Point.new(@dataset.first)
+      @in_memory = DbClustering::DatasourceAdapters::InMemory.new(array: @dataset, vector_params: { type: 'Array' })
+      @first_point = DbClustering::Models::Point.new(datasource_point: @dataset.first, vector_params: { type: 'Array' })
     end
     context "average difference" do

data/spec/support/test_model.rb CHANGED Viewed

@@ -3,7 +3,13 @@ class TestModel
     @vector = vector
   end
-  def clustering_vector
-    @vector
+  def clustering_vector(vector_params)
+    if vector_params[:type] == 'Hash'
+      vector_as_hash = {}
+      @vector.map.with_index{ |x,i| vector_as_hash[i] = x }
+      return vector_as_hash
+    end
+    return @vector
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: db_clustering
 version: !ruby/object:Gem::Version
-  version: 0.1.6
+  version: 0.1.7
 platform: ruby
 authors:
 - Cihat Gündüz