data_mining 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/data_mining/dbscan.rb +105 -0
- data/lib/data_mining/point.rb +35 -0
- data/lib/data_mining.rb +4 -0
- metadata +46 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 5da253c99db083a7e8a8694ef479299269b95c87
|
4
|
+
data.tar.gz: 873c011a2e26d9c1c11165a7ce9aa401d0d50250
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 6f2f01babec563518f49ddd12ac23a55865245ab6b1c9862b88836d5544eb2be6ca49574f8261ed6f0d215e5880c14804dc5f54ae1757220b1bc41a71eeb780f
|
7
|
+
data.tar.gz: 027e465483ca27939f8b43167ae9aacfd3cd709545e9bfb38c3a5cc31c0d39d5f2e1b23ec74d4c3319e01c5b4552c2f9ec41f15140ae5cb34e4b6ea73185be32
|
@@ -0,0 +1,105 @@
|
|
1
|
+
module DataMining
  # Density-based clustering / outlier-detection algorithm (DBSCAN) working
  # on points that carry a single numeric value.
  class DBScan
    # Find clusters and outliers
    #
    # Example:
    #   >> input = [[:p1, 1], [:p2, 2], [:p3, 10]]
    #   >> radius = 3
    #   >> min_points = 2
    #   >> dbscan = DataMining::DBScan.cluster(input, radius, min_points)
    #   >> dbscan.build!
    #   >>
    #   >> dbscan.clusters # one hash per cluster, e.g. [{ 1 => [:p1, :p2] }]
    #   >>
    #   >> dbscan.outliers # Point objects outside every cluster (here :p3)
    #
    # Arguments:
    #   data: (array of arrays, like [[:id, value], [:id2, value2]])
    #   radius: (integer)
    #   min_points: (integer)

    def self.cluster(data, radius, min_points)
      DBScan.new(data, radius, min_points)
    end

    # Wraps every [id, value] pair in a DataMining::Point and prepares the
    # bookkeeping; no clustering happens until build! is called.
    def initialize(data, radius, min_points)
      @data               = data.map { |i, v| DataMining::Point.new(i, v) }
      @radius             = radius
      @min_points         = min_points
      @current_cluster_id = 0
      @clusters           = {}
      # Seeds are taken in random order, so the cluster a border point ends
      # up in (and the order of ids inside a cluster) can differ per run.
      @unvisited_points   = @data.shuffle
    end

    # Runs the clustering and returns the same value as #clusters.
    def build!
      dbscan
      clusters
    end

    # Points that were never assigned to any cluster.
    # Returns an array of DataMining::Point objects.
    def outliers
      @data.reject(&:assigned_to_cluster?)
    end

    # Returns an array with one single-entry hash per cluster:
    # { cluster_id => [ids of the member points] }.
    def clusters
      # map, not each: each would hand back the Point objects themselves
      # instead of their ids.
      @clusters.map { |cluster, points| { cluster => points.map(&:id) } }
    end

    private

    # Takes seed points one by one and starts a new cluster around every
    # seed that turns out to be a core object.
    def dbscan
      until @unvisited_points.empty?
        point = @unvisited_points.pop
        # Already handled while another cluster was being expanded.
        next if point.visited?

        point.visit!
        neighborhood = get_neighborhood(point)
        create_cluster(point, neighborhood) if core_object?(neighborhood)
      end
    end

    # Opens the next cluster id, stores the seed point in it and expands
    # the cluster through the seed's neighborhood.
    def create_cluster(point, neighborhood)
      @current_cluster_id += 1
      @clusters[@current_cluster_id] ||= []
      add_to_current_cluster(point)
      fill_current_cluster(neighborhood)
    end

    # Expands the current cluster with an explicit work-list instead of
    # recursion, so the call stack does not grow with the cluster size.
    def fill_current_cluster(neighborhood)
      pending = neighborhood.dup
      until pending.empty?
        neighbor = pending.shift
        elaborate(neighbor, pending) unless neighbor.visited?
        # Also covers neighbors that were visited earlier and classified
        # as noise: they are border points of this cluster and must be
        # stored in it, not merely flagged as assigned.
        add_to_current_cluster(neighbor)
      end
    end

    # Marks the point as visited and, if it is a core object itself,
    # queues its neighborhood for expansion.
    def elaborate(point, pending)
      point.visit!
      neighborhood = get_neighborhood(point)
      pending.concat(neighborhood) if core_object?(neighborhood)
    end

    # Stores the point in the current cluster unless it already belongs to
    # a cluster; flag and membership list are always updated together.
    def add_to_current_cluster(point)
      return if point.assigned_to_cluster?

      point.assign_to_cluster!
      @clusters[@current_cluster_id] << point
    end

    # All other points whose value lies within @radius of the given point.
    def get_neighborhood(point)
      @data.select { |candidate| neighbors?(candidate, point) }
    end

    # The neighborhood never contains the point itself, hence the -1:
    # min_points counts the point plus its neighbors.
    def core_object?(neighborhood)
      neighborhood.size >= (@min_points - 1)
    end

    # Two distinct points are neighbors when their values differ by at
    # most @radius.
    def neighbors?(p1, p2)
      p1 != p2 && (p1.value - p2.value).abs <= @radius
    end
  end
end
|
104
|
+
|
105
|
+
require 'data_mining/point'
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module DataMining
  # A data point as used by the DBScan algorithm: an identifier, a value and
  # two one-way flags (visited / assigned to a cluster).
  class Point
    # Arguments:
    #   id: (symbol)
    #   value: (integer)
    #
    # Both flags start out false.
    def initialize(id, value)
      @id, @value = id, value
      @visited = @in_a_cluster = false
    end

    attr_reader :id, :value

    # Flags the point as visited; there is no way to reset the flag.
    def visit!
      @visited = true
    end

    # True once visit! has been called.
    def visited?
      @visited
    end

    # Flags the point as member of a cluster; there is no way to reset it.
    def assign_to_cluster!
      @in_a_cluster = true
    end

    # True once assign_to_cluster! has been called.
    def assigned_to_cluster?
      @in_a_cluster
    end
  end
end
|
data/lib/data_mining.rb
ADDED
metadata
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: data_mining
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Manuel Stuefer
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-06-23 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A collection of data mining algorithms
|
14
|
+
email: mstuefer@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/data_mining.rb
|
20
|
+
- lib/data_mining/dbscan.rb
|
21
|
+
- lib/data_mining/point.rb
|
22
|
+
homepage: http://rubygems.org/gems/data_mining
|
23
|
+
licenses:
|
24
|
+
- MIT
|
25
|
+
metadata: {}
|
26
|
+
post_install_message:
|
27
|
+
rdoc_options: []
|
28
|
+
require_paths:
|
29
|
+
- lib
|
30
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - ">="
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '0'
|
35
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
requirements: []
|
41
|
+
rubyforge_project:
|
42
|
+
rubygems_version: 2.4.6
|
43
|
+
signing_key:
|
44
|
+
specification_version: 4
|
45
|
+
summary: Data-Mining-Algorithms
|
46
|
+
test_files: []
|