svmkit 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/HISTORY.md +3 -0
- data/README.md +1 -1
- data/lib/svmkit.rb +1 -0
- data/lib/svmkit/clustering/dbscan.rb +127 -0
- data/lib/svmkit/clustering/k_means.rb +9 -7
- data/lib/svmkit/version.rb +1 -1
- data/svmkit.gemspec +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 917f85878296b940b497f13253e3d3b03047be8f154d554116c2629aaeea55dd
|
4
|
+
data.tar.gz: 16308e4638b15a55843f15b4e0d97886f27aae0cc236c59c590a8f9fe7f0e5c6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d390d3ef0d7b06676e6d3c34479939b4a99ee01472816eacbe49fd3f40224ef5984620dfe6d335fb5b15e7213d3b0d17ba9441766e7cdd08c8bad9bff669db8d
|
7
|
+
data.tar.gz: ab2239c0d1297e18e31940e763875ac24668d8c4c3f30355f06bc5ed305c247ff0328e1d584c5ab70ce77d4d2f946dcc5f72f1eb4c3a25d9b0dcd38e1d246182
|
data/HISTORY.md
CHANGED
data/README.md
CHANGED
@@ -10,7 +10,7 @@ SVMKit provides machine learning algorithms with interfaces similar to Scikit-Le
|
|
10
10
|
SVMKit currently supports Linear / Kernel Support Vector Machine,
|
11
11
|
Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
|
12
12
|
Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor classifier,
|
13
|
-
K-Means and cross-validation.
|
13
|
+
K-Means, DBSCAN and cross-validation.
|
14
14
|
|
15
15
|
## Installation
|
16
16
|
|
data/lib/svmkit.rb
CHANGED
@@ -38,6 +38,7 @@ require 'svmkit/tree/decision_tree_regressor'
|
|
38
38
|
require 'svmkit/ensemble/random_forest_classifier'
|
39
39
|
require 'svmkit/ensemble/random_forest_regressor'
|
40
40
|
require 'svmkit/clustering/k_means'
|
41
|
+
require 'svmkit/clustering/dbscan'
|
41
42
|
require 'svmkit/preprocessing/l2_normalizer'
|
42
43
|
require 'svmkit/preprocessing/min_max_scaler'
|
43
44
|
require 'svmkit/preprocessing/standard_scaler'
|
@@ -0,0 +1,127 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'svmkit/validation'
|
4
|
+
require 'svmkit/base/base_estimator'
|
5
|
+
require 'svmkit/base/cluster_analyzer'
|
6
|
+
require 'svmkit/pairwise_metric'
|
7
|
+
|
8
|
+
module SVMKit
|
9
|
+
module Clustering
|
10
|
+
# DBSCAN is a class that implements DBSCAN cluster analysis.
|
11
|
+
# The current implementation uses the Euclidean distance for analyzing the clusters.
|
12
|
+
#
|
13
|
+
# @example
|
14
|
+
# analyzer = SVMKit::Clustering::DBSCAN.new(eps: 0.5, min_samples: 5)
|
15
|
+
# cluster_labels = analyzer.fit_predict(samples)
|
16
|
+
#
|
17
|
+
# *Reference*
|
18
|
+
# - M. Ester, H-P. Kriegel, J. Sander, and X. Xu, "A density-based algorithm for discovering clusters in large spatial databases with noise," Proc. KDD' 96, pp. 226--231, 1996.
|
19
|
+
class DBSCAN
|
20
|
+
include Base::BaseEstimator
|
21
|
+
include Base::ClusterAnalyzer
|
22
|
+
include Validation
|
23
|
+
|
24
|
+
# Return the core sample indices.
|
25
|
+
# @return [Numo::Int32] (shape: [n_core_samples])
|
26
|
+
attr_reader :core_sample_ids
|
27
|
+
|
28
|
+
# Return the cluster labels. The negative cluster label indicates that the point is noise.
|
29
|
+
# @return [Numo::Int32] (shape: [n_samples])
|
30
|
+
attr_reader :labels
|
31
|
+
|
32
|
+
# Create a new cluster analyzer with DBSCAN method.
|
33
|
+
#
|
34
|
+
# @param eps [Float] The radius of neighborhood.
|
35
|
+
# @param min_samples [Integer] The number of neighbor samples to be used for the criterion whether a point is a core point.
|
36
|
+
def initialize(eps: 0.5, min_samples: 5)
|
37
|
+
check_params_float(eps: eps)
|
38
|
+
check_params_integer(min_samples: min_samples)
|
39
|
+
@params = {}
|
40
|
+
@params[:eps] = eps
|
41
|
+
@params[:min_samples] = min_samples
|
42
|
+
@core_sample_ids = nil
|
43
|
+
@labels = nil
|
44
|
+
end
|
45
|
+
|
46
|
+
# Analysis clusters with given training data.
|
47
|
+
#
|
48
|
+
# @overload fit(x) -> DBSCAN
|
49
|
+
#
|
50
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
|
51
|
+
# @return [DBSCAN] The learned cluster analyzer itself.
|
52
|
+
def fit(x, _y = nil)
|
53
|
+
check_sample_array(x)
|
54
|
+
partial_fit(x)
|
55
|
+
self
|
56
|
+
end
|
57
|
+
|
58
|
+
# Analysis clusters and assign samples to clusters.
|
59
|
+
#
|
60
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
|
61
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
|
62
|
+
def fit_predict(x)
|
63
|
+
check_sample_array(x)
|
64
|
+
partial_fit(x)
|
65
|
+
labels
|
66
|
+
end
|
67
|
+
|
68
|
+
# Dump marshal data.
|
69
|
+
# @return [Hash] The marshal data.
|
70
|
+
def marshal_dump
|
71
|
+
{ params: @params,
|
72
|
+
core_sample_ids: @core_sample_ids,
|
73
|
+
labels: @labels }
|
74
|
+
end
|
75
|
+
|
76
|
+
# Load marshal data.
|
77
|
+
# @return [nil]
|
78
|
+
def marshal_load(obj)
|
79
|
+
@params = obj[:params]
|
80
|
+
@core_sample_ids = obj[:core_sample_ids]
|
81
|
+
@labels = obj[:labels]
|
82
|
+
nil
|
83
|
+
end
|
84
|
+
|
85
|
+
private
|
86
|
+
|
87
|
+
def partial_fit(x)
|
88
|
+
cluster_id = 0
|
89
|
+
n_samples = x.shape[0]
|
90
|
+
@core_sample_ids = []
|
91
|
+
@labels = Numo::Int32.zeros(n_samples) - 2
|
92
|
+
n_samples.times do |q|
|
93
|
+
next if @labels[q] >= -1
|
94
|
+
cluster_id += 1 if expand_cluster(x, q, cluster_id)
|
95
|
+
end
|
96
|
+
@core_sample_ids = Numo::Int32[*@core_sample_ids.flatten]
|
97
|
+
nil
|
98
|
+
end
|
99
|
+
|
100
|
+
def expand_cluster(x, query_id, cluster_id)
|
101
|
+
target_ids = region_query(x[query_id, true], x)
|
102
|
+
if target_ids.size < @params[:min_samples]
|
103
|
+
@labels[query_id] = -1
|
104
|
+
false
|
105
|
+
else
|
106
|
+
@labels[target_ids] = cluster_id
|
107
|
+
@core_sample_ids.push(target_ids.dup)
|
108
|
+
target_ids.delete(query_id)
|
109
|
+
while (m = target_ids.shift)
|
110
|
+
neighbor_ids = region_query(x[m, true], x)
|
111
|
+
next if neighbor_ids.size < @params[:min_samples]
|
112
|
+
neighbor_ids.each do |n|
|
113
|
+
target_ids.push(n) if @labels[n] < -1
|
114
|
+
@labels[n] = cluster_id if @labels[n] <= -1
|
115
|
+
end
|
116
|
+
end
|
117
|
+
true
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
def region_query(query, targets)
|
122
|
+
distance_arr = PairwiseMetric.euclidean_distance(query.expand_dims(0), targets)[0, true]
|
123
|
+
distance_arr.lt(@params[:eps]).where.to_a
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
@@ -9,10 +9,11 @@ module SVMKit
|
|
9
9
|
# This module consists of classes that implement cluster analysis methods.
|
10
10
|
module Clustering
|
11
11
|
# KMeans is a class that implements K-Means cluster analysis.
|
12
|
+
# The current implementation uses the Euclidean distance for analyzing the clusters.
|
12
13
|
#
|
13
14
|
# @example
|
14
15
|
# analyzer = SVMKit::Clustering::KMeans.new(n_clusters: 10, max_iter: 50)
|
15
|
-
#
|
16
|
+
# cluster_labels = analyzer.fit_predict(samples)
|
16
17
|
#
|
17
18
|
# *Reference*
|
18
19
|
# - D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding," Proc. SODA'07, pp. 1027--1035, 2007.
|
@@ -38,6 +39,7 @@ module SVMKit
|
|
38
39
|
# @param random_seed [Integer] The seed value using to initialize the random generator.
|
39
40
|
def initialize(n_clusters: 8, init: 'k-means++', max_iter: 50, tol: 1.0e-4, random_seed: nil)
|
40
41
|
check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
|
42
|
+
check_params_float(tol: tol)
|
41
43
|
check_params_string(init: init)
|
42
44
|
check_params_type_or_nil(Integer, random_seed: random_seed)
|
43
45
|
check_params_positive(n_clusters: n_clusters, max_iter: max_iter)
|
@@ -62,10 +64,10 @@ module SVMKit
|
|
62
64
|
check_sample_array(x)
|
63
65
|
init_cluster_centers(x)
|
64
66
|
@params[:max_iter].times do |_t|
|
65
|
-
|
67
|
+
cluster_labels = assign_cluster(x)
|
66
68
|
old_centers = @cluster_centers.dup
|
67
69
|
@params[:n_clusters].times do |n|
|
68
|
-
assigned_bits =
|
70
|
+
assigned_bits = cluster_labels.eq(n)
|
69
71
|
@cluster_centers[n, true] = x[assigned_bits.where, true].mean(axis: 0) if assigned_bits.count > 0
|
70
72
|
end
|
71
73
|
error = Numo::NMath.sqrt(((old_centers - @cluster_centers)**2).sum(axis: 1)).mean
|
@@ -74,10 +76,10 @@ module SVMKit
|
|
74
76
|
self
|
75
77
|
end
|
76
78
|
|
77
|
-
# Predict cluster
|
79
|
+
# Predict cluster labels for samples.
|
78
80
|
#
|
79
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster
|
80
|
-
# @return [Numo::Int32] (shape: [n_samples]) Predicted cluster
|
81
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
|
82
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
|
81
83
|
def predict(x)
|
82
84
|
check_sample_array(x)
|
83
85
|
assign_cluster(x)
|
@@ -86,7 +88,7 @@ module SVMKit
|
|
86
88
|
# Analysis clusters and assign samples to clusters.
|
87
89
|
#
|
88
90
|
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
|
89
|
-
# @return [Numo::Int32] (shape: [n_samples]) Predicted cluster
|
91
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
|
90
92
|
def fit_predict(x)
|
91
93
|
check_sample_array(x)
|
92
94
|
fit(x)
|
data/lib/svmkit/version.rb
CHANGED
data/svmkit.gemspec
CHANGED
@@ -18,7 +18,7 @@ SVMKit provides machine learning algorithms with interfaces similar to Scikit-Le
|
|
18
18
|
SVMKit currently supports Linear / Kernel Support Vector Machine,
|
19
19
|
Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
|
20
20
|
Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm,
|
21
|
-
K-Means and cross-validation.
|
21
|
+
K-Means, DBSCAN and cross-validation.
|
22
22
|
MSG
|
23
23
|
spec.homepage = 'https://github.com/yoshoku/svmkit'
|
24
24
|
spec.license = 'BSD-2-Clause'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: svmkit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-06-
|
11
|
+
date: 2018-06-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -86,7 +86,7 @@ description: |
|
|
86
86
|
SVMKit currently supports Linear / Kernel Support Vector Machine,
|
87
87
|
Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
|
88
88
|
Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm,
|
89
|
-
K-Means and cross-validation.
|
89
|
+
K-Means, DBSCAN and cross-validation.
|
90
90
|
email:
|
91
91
|
- yoshoku@outlook.com
|
92
92
|
executables: []
|
@@ -115,6 +115,7 @@ files:
|
|
115
115
|
- lib/svmkit/base/regressor.rb
|
116
116
|
- lib/svmkit/base/splitter.rb
|
117
117
|
- lib/svmkit/base/transformer.rb
|
118
|
+
- lib/svmkit/clustering/dbscan.rb
|
118
119
|
- lib/svmkit/clustering/k_means.rb
|
119
120
|
- lib/svmkit/dataset.rb
|
120
121
|
- lib/svmkit/ensemble/random_forest_classifier.rb
|