db_clustering 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +21 -0
- data/VERSION +1 -1
- data/db_clustering.gemspec +2 -2
- data/lib/algorithms/density_based/dbscan.rb +2 -2
- data/lib/datasource_adapters/active_record.rb +5 -1
- data/lib/datasource_adapters/in_memory.rb +3 -2
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9aa27e97cd4db7b79c7a6ce30db82436e9553f55
|
4
|
+
data.tar.gz: 61628405866773b9bacaa9f5ac954ba623d48f69
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b126213d40d23b9548e754587da63c27d580358d9b403f96a4628c14820d5513a1af7f3e2679e9871521dc280720ba39466bdd448431bfce4e3f5fb4252fd8db
|
7
|
+
data.tar.gz: c0808f2d68358297e6820b9ecebec10711662c285750ef3419d2a3a60969bc1f6632a8ff108bd0fcf2c4ea522d46a77891b7c3195e36b35f8ade781bd9210ee2
|
data/README.md
CHANGED
@@ -58,6 +58,27 @@ This gem was developed to work best in Ruby on Rails projects.
|
|
58
58
|
```
|
59
59
|
The `max_distance` is the epsilon parameter and the `min_neighbors` the minPts parameter from the usual DBSCAN algorithm documentation (e.g. Wikipedia). You might want to try different values here first before you decide on the right values for your purpose.
|
60
60
|
|
61
|
+
If you're interested in the progress of the algorithm, you can run some code after each iteration of it (for DBSCAN this means after clustering a single point with its neighbors). Please note, though, that the information available at that point may be incomplete, so don't rely on it as a finished portion of the final results; treat it as a provisional snapshot, or just use it to indicate progress or to do some debugging. For example, you could do this:
|
62
|
+
|
63
|
+
``` ruby
|
64
|
+
last_printed_progress = 0.0
|
65
|
+
|
66
|
+
dbscan.cluster(max_distance: 10, min_neighbors: 5) do |point, current_index, points_count|
|
67
|
+
progress = (current_index + 1) * 100 / points_count.to_f
|
68
|
+
|
69
|
+
if progress > last_printed_progress + 1
|
70
|
+
print "[#{progress.to_i}%]"
|
71
|
+
last_printed_progress = progress
|
72
|
+
end
|
73
|
+
|
74
|
+
if point.cluster
|
75
|
+
print "(#{point.cluster.id}|#{point.cluster.points.count})"
|
76
|
+
else
|
77
|
+
print "(nil|0)"
|
78
|
+
end
|
79
|
+
end
|
80
|
+
```
|
81
|
+
|
61
82
|
Please also take note that the `max_distance` value is **highly dependent on the type of metric** you decided to go for. For the `AverageDifference` and `EuclideanDistance` metrics it can be an **open-ended positive value**. For the `CosineSimilarity` and `PearsonCorrelation` types it needs to be a value between 0 and 2 where a value of `0` means "100% positive correlation/similarity", a value of `1` means "no correlation/similarity at all" and a value of `2` means "100% negative correlation/similarity". You can use any decimal value in between (e.g. 0.25) as a partly positive/negative correlation.
|
62
83
|
|
63
84
|
8. Wait for the calculations to finish and use the results the way you want:
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.4
|
1
|
+
0.1.5
|
data/db_clustering.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: db_clustering 0.1.4 ruby lib
|
5
|
+
# stub: db_clustering 0.1.5 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = "db_clustering"
|
9
|
-
s.version = "0.1.
|
9
|
+
s.version = "0.1.5"
|
10
10
|
|
11
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
12
|
s.require_paths = ["lib"]
|
@@ -16,7 +16,7 @@ module DbClustering
|
|
16
16
|
@clusters = []
|
17
17
|
cluster = nil
|
18
18
|
|
19
|
-
@datasource.iterate_all_points do |point|
|
19
|
+
@datasource.iterate_all_points do |point, current_index, points_count|
|
20
20
|
neighbors = @datasource.neighbors(point: point, distance_metric: @distance_metric, max_distance: max_distance)
|
21
21
|
|
22
22
|
if neighbors.count < min_neighbors
|
@@ -36,7 +36,7 @@ module DbClustering
|
|
36
36
|
end
|
37
37
|
end
|
38
38
|
|
39
|
-
yield(point)
|
39
|
+
yield(point, current_index, points_count) if block_given?
|
40
40
|
end
|
41
41
|
end
|
42
42
|
|
@@ -7,9 +7,13 @@ module DbClustering
|
|
7
7
|
end
|
8
8
|
|
9
9
|
def iterate_all_points
|
10
|
+
points_count = @relation.count
|
11
|
+
current_index = 0
|
12
|
+
|
10
13
|
@relation.find_each do |datasource_point|
|
11
14
|
point = DbClustering::Models::Point.new(datasource_point)
|
12
|
-
yield(point)
|
15
|
+
yield(point, current_index, points_count)
|
16
|
+
current_index += 1
|
13
17
|
end
|
14
18
|
end
|
15
19
|
|