fastout 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/fastout/ranker.rb +177 -174
- data/lib/fastout/version.rb +1 -1
- metadata +3 -3
data/lib/fastout/ranker.rb
CHANGED
@@ -5,239 +5,242 @@
|
|
5
5
|
# Author:: Jason Dew (mailto:jason.dew@gmail.com)
|
6
6
|
# Copyright:: Copyright (c) 2010 Jason Dew
|
7
7
|
# License:: MIT
|
8
|
-
class Ranker
|
9
8
|
|
10
|
-
|
9
|
+
module Fastout
|
10
|
+
class Ranker
|
11
11
|
|
12
|
-
|
12
|
+
class Point
|
13
13
|
|
14
|
-
|
15
|
-
@@next_id = id
|
16
|
-
end
|
14
|
+
@@next_id = 0
|
17
15
|
|
18
|
-
|
19
|
-
|
16
|
+
def self.next_id= id
|
17
|
+
@@next_id = id
|
18
|
+
end
|
20
19
|
|
21
|
-
|
22
|
-
|
23
|
-
@cluster = nil
|
24
|
-
@score = 0
|
25
|
-
@bins = []
|
20
|
+
attr_reader :id, :attributes, :bins
|
21
|
+
attr_accessor :cluster, :score
|
26
22
|
|
27
|
-
|
28
|
-
|
29
|
-
|
23
|
+
def initialize *attributes
|
24
|
+
@attributes = attributes
|
25
|
+
@cluster = nil
|
26
|
+
@score = 0
|
27
|
+
@bins = []
|
30
28
|
|
31
|
-
|
32
|
-
|
33
|
-
|
29
|
+
@id = @@next_id
|
30
|
+
@@next_id += 1
|
31
|
+
end
|
34
32
|
|
35
|
-
|
36
|
-
|
37
|
-
|
33
|
+
def [] index
|
34
|
+
@attributes[index]
|
35
|
+
end
|
38
36
|
|
39
|
-
|
40
|
-
|
41
|
-
|
37
|
+
def clustered?
|
38
|
+
!! cluster
|
39
|
+
end
|
42
40
|
|
43
|
-
|
44
|
-
|
45
|
-
return false if (bins[attribute_index] - point.bins[attribute_index]).abs > 1
|
41
|
+
def uncluster!
|
42
|
+
@cluster = nil
|
46
43
|
end
|
47
44
|
|
48
|
-
|
49
|
-
|
45
|
+
def in_the_neighborhood_of? point, attribute_indexes, neighborhoods
|
46
|
+
attribute_indexes.each do |attribute_index|
|
47
|
+
return false if (bins[attribute_index] - point.bins[attribute_index]).abs > 1
|
48
|
+
end
|
49
|
+
|
50
|
+
attribute_indexes.each_with_index do |attribute_index, neighborhood_index|
|
51
|
+
return false if (attributes[attribute_index] - point.attributes[attribute_index]) > (neighborhoods[neighborhood_index] / 2.0)
|
52
|
+
end
|
53
|
+
|
54
|
+
true
|
50
55
|
end
|
51
56
|
|
52
|
-
|
53
|
-
|
57
|
+
def neighbor_of_any? points, attribute_indexes, neighborhoods
|
58
|
+
points.inject(false) {|found, point| found or in_the_neighborhood_of?(point, attribute_indexes, neighborhoods) }
|
59
|
+
end
|
54
60
|
|
55
|
-
def neighbor_of_any? points, attribute_indexes, neighborhoods
|
56
|
-
points.inject(false) {|found, point| found or in_the_neighborhood_of?(point, attribute_indexes, neighborhoods) }
|
57
61
|
end
|
58
62
|
|
59
|
-
|
63
|
+
attr_reader :data, :points, :minimums, :maximums
|
60
64
|
|
61
|
-
|
65
|
+
def self.pointify data
|
66
|
+
data.map {|attributes| Point.new *attributes }
|
67
|
+
end
|
62
68
|
|
63
|
-
|
64
|
-
|
65
|
-
|
69
|
+
# takes a 2-d array, +data+, where the rows are data points and the columns are the attributes,
|
70
|
+
# values should all be numerical
|
71
|
+
# * +data+ must contain more than one data point and more than one attribute, otherwise an error is raised
|
72
|
+
# * also generates minimum and maximum values for each attribute for later use
|
73
|
+
def initialize data
|
74
|
+
raise "data must have more than one attribute and more than one data point" unless data.size > 1 and data.first.size > 1
|
75
|
+
@data = data
|
76
|
+
@points = self.class.pointify data
|
77
|
+
@minimums, @maximums = compute_minimums_and_maximums
|
78
|
+
Point.next_id = 0
|
79
|
+
end
|
66
80
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
81
|
+
# searches the parameter space to find the optimized values of +k+ and +q+
|
82
|
+
# * +theta_target+ is the maximum acceptable value of theta, default is 1
|
83
|
+
# * +sample+ is the number of iterations to perform in estimating the parameters
|
84
|
+
# * +n+ is the number of points to rank
|
85
|
+
def optimized_ranking sample, n, theta_target=1
|
86
|
+
k = 3
|
87
|
+
q = 5
|
88
|
+
max_q = n / 4
|
89
|
+
step_q = 10
|
90
|
+
last_theta = n
|
91
|
+
theta, s = calculate_theta(sample, k, n, q)
|
78
92
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
if (theta >= last_theta)
|
95
|
-
# effectiveness declining so try next k
|
96
|
-
k += 1
|
97
|
-
q -= step_q
|
98
|
-
last_theta = n
|
99
|
-
else
|
100
|
-
# try next q
|
101
|
-
q += step_q
|
102
|
-
last_theta = theta
|
93
|
+
while (theta > theta_target or theta < last_theta or q < max_q) do
|
94
|
+
return s if (theta <= theta_target)
|
95
|
+
|
96
|
+
if (theta >= last_theta)
|
97
|
+
# effectiveness declining so try next k
|
98
|
+
k += 1
|
99
|
+
q -= step_q
|
100
|
+
last_theta = n
|
101
|
+
else
|
102
|
+
# try next q
|
103
|
+
q += step_q
|
104
|
+
last_theta = theta
|
105
|
+
end
|
106
|
+
|
107
|
+
theta, s = calculate_theta(sample, k, n, q)
|
103
108
|
end
|
104
109
|
|
105
|
-
|
110
|
+
s
|
106
111
|
end
|
107
112
|
|
108
|
-
|
109
|
-
|
113
|
+
# find and rank the points by their outlier score and determine
|
114
|
+
# theta (the number of points with an outlier score of +n+)
|
115
|
+
def calculate_theta sample, k, n, q
|
116
|
+
s = ranked_outliers sample, k, q
|
117
|
+
theta = points.inject(0) {|sum, point| point.score == n ? sum + 1 : sum }
|
110
118
|
|
111
|
-
|
112
|
-
|
113
|
-
def calculate_theta sample, k, n, q
|
114
|
-
s = ranked_outliers sample, k, q
|
115
|
-
theta = points.inject(0) {|sum, point| point.score == n ? sum + 1 : sum }
|
119
|
+
[theta, s]
|
120
|
+
end
|
116
121
|
|
117
|
-
|
118
|
-
|
122
|
+
# chooses +k+ random attributes with an average of +q+ data points
|
123
|
+
# in each bin, repeated +sample_size+ times, to determine outliers
|
124
|
+
def ranked_outliers sample_size, k, q
|
125
|
+
# determine number of bins and their widths
|
126
|
+
bin_count = compute_bin_count(q)
|
127
|
+
bin_widths = compute_bin_widths(q, bin_count)
|
119
128
|
|
120
|
-
|
121
|
-
|
122
|
-
def ranked_outliers sample_size, k, q
|
123
|
-
# determine number of bins and their widths
|
124
|
-
bin_count = compute_bin_count(q)
|
125
|
-
bin_widths = compute_bin_widths(q, bin_count)
|
129
|
+
# assign points to the attribute bins
|
130
|
+
assign_points_to_bins! bin_widths, bin_count
|
126
131
|
|
127
|
-
|
128
|
-
|
132
|
+
1.upto(sample_size) {
|
133
|
+
score_points_from_a_random_set_of_attributes! k, bin_widths }
|
129
134
|
|
130
|
-
|
131
|
-
|
135
|
+
points.sort_by(&:score).reverse
|
136
|
+
end
|
132
137
|
|
133
|
-
|
134
|
-
|
138
|
+
# pick a random set of attributes and compute the outlier score
|
139
|
+
# for each of the points
|
140
|
+
def score_points_from_a_random_set_of_attributes! number_of_attributes_to_choose, all_bin_widths
|
141
|
+
cluster = 0
|
142
|
+
attribute_indexes = random_attribute_indexes number_of_attributes_to_choose
|
143
|
+
bin_widths = attribute_indexes.map {|index| all_bin_widths[index] }
|
135
144
|
|
136
|
-
|
137
|
-
|
138
|
-
def score_points_from_a_random_set_of_attributes! number_of_attributes_to_choose, all_bin_widths
|
139
|
-
cluster = 0
|
140
|
-
attribute_indexes = random_attribute_indexes number_of_attributes_to_choose
|
141
|
-
bin_widths = attribute_indexes.map {|index| all_bin_widths[index] }
|
145
|
+
points.each do |point|
|
146
|
+
next if point.clustered?
|
142
147
|
|
143
|
-
|
144
|
-
|
148
|
+
point.cluster = (cluster += 1)
|
149
|
+
neighbors = cluster_neighbors point, cluster, attribute_indexes, bin_widths
|
145
150
|
|
146
|
-
|
147
|
-
|
151
|
+
point.uncluster! if neighbors.empty?
|
152
|
+
end
|
148
153
|
|
149
|
-
|
154
|
+
points.each do |point|
|
155
|
+
next unless point.clustered?
|
156
|
+
point.uncluster!
|
157
|
+
point.score += 1
|
158
|
+
end
|
150
159
|
end
|
151
160
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
point.score += 1
|
161
|
+
# randomly choose attribute indexes (NOTE: the inclusive slice +[0..number]+ returns up to +number+ + 1 of them)
|
162
|
+
def random_attribute_indexes number
|
163
|
+
(0...@data.first.size).sort_by { rand }[0..number]
|
156
164
|
end
|
157
|
-
end
|
158
165
|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
166
|
+
# find all unclustered points that are neighbors of +point+ on
|
167
|
+
# *all* selected attributes, or neighbors of an already-found neighbor
|
168
|
+
# of +point+; find recursively until no additions can be made
|
169
|
+
def cluster_neighbors point, cluster, attribute_indexes, bin_widths
|
170
|
+
recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, []
|
171
|
+
end
|
163
172
|
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
def cluster_neighbors point, cluster, attribute_indexes, bin_widths
|
168
|
-
recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, []
|
169
|
-
end
|
173
|
+
# recursive step of #cluster_neighbors
|
174
|
+
def recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
|
175
|
+
fruitful = false
|
170
176
|
|
171
|
-
|
172
|
-
|
173
|
-
|
177
|
+
unclustered_points.each do |unclustered_point|
|
178
|
+
next unless point.in_the_neighborhood_of?(unclustered_point, attribute_indexes, bin_widths) or
|
179
|
+
unclustered_point.neighbor_of_any?(neighbors, attribute_indexes, bin_widths)
|
174
180
|
|
175
|
-
|
176
|
-
|
177
|
-
|
181
|
+
fruitful = true
|
182
|
+
unclustered_point.cluster = cluster
|
183
|
+
neighbors << unclustered_point
|
184
|
+
end
|
178
185
|
|
179
|
-
fruitful
|
180
|
-
|
181
|
-
|
186
|
+
if fruitful
|
187
|
+
recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
|
188
|
+
else
|
189
|
+
neighbors
|
190
|
+
end
|
182
191
|
end
|
183
192
|
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
neighbors
|
193
|
+
# find all of the points that don't already belong to a cluster
|
194
|
+
def unclustered_points
|
195
|
+
points.select {|point| not point.clustered? }
|
188
196
|
end
|
189
|
-
end
|
190
197
|
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
def assign_points_to_bins! bin_widths, bin_count
|
199
|
-
bin_widths.each_with_index do |bin_width, attribute_index|
|
200
|
-
points.each do |point|
|
201
|
-
point.bins[attribute_index] = bin_index(point, attribute_index, bin_width)
|
198
|
+
# assign each of the data points to a bin based on the given +bin_widths+,
|
199
|
+
# storing each point's bin index for every attribute in that point's +bins+ array
|
200
|
+
def assign_points_to_bins! bin_widths, bin_count
|
201
|
+
bin_widths.each_with_index do |bin_width, attribute_index|
|
202
|
+
points.each do |point|
|
203
|
+
point.bins[attribute_index] = bin_index(point, attribute_index, bin_width)
|
204
|
+
end
|
202
205
|
end
|
203
206
|
end
|
204
|
-
end
|
205
207
|
|
206
|
-
|
207
|
-
|
208
|
-
|
208
|
+
def bin_index point, attribute_index, bin_width
|
209
|
+
minimum = @minimums[attribute_index]
|
210
|
+
maximum = @maximums[attribute_index]
|
209
211
|
|
210
|
-
|
211
|
-
|
212
|
+
value = point[attribute_index]
|
213
|
+
index = ((value - minimum) / bin_width).floor
|
212
214
|
|
213
|
-
|
214
|
-
|
215
|
+
value == maximum ? index - 1 : index
|
216
|
+
end
|
215
217
|
|
216
|
-
|
217
|
-
|
218
|
-
|
218
|
+
def compute_minimums_and_maximums
|
219
|
+
minimums = @data.first.dup
|
220
|
+
maximums = @data.first.dup
|
219
221
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
222
|
+
@data.each do |attributes|
|
223
|
+
attributes.each_with_index do |attribute, attribute_index|
|
224
|
+
minimums[attribute_index] = attribute if attribute < minimums[attribute_index]
|
225
|
+
maximums[attribute_index] = attribute if attribute > maximums[attribute_index]
|
226
|
+
end
|
224
227
|
end
|
228
|
+
|
229
|
+
[minimums, maximums]
|
225
230
|
end
|
226
231
|
|
227
|
-
|
228
|
-
|
232
|
+
# determine the widths of the bins based on +q+
|
233
|
+
def compute_bin_widths q, bin_count
|
234
|
+
(0...@data.first.size).map do |attribute_index|
|
235
|
+
(@maximums[attribute_index] - @minimums[attribute_index]) / bin_count.to_f
|
236
|
+
end
|
237
|
+
end
|
229
238
|
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
239
|
+
# compute the number of bins for a given +q+
|
240
|
+
def compute_bin_count q
|
241
|
+
count = (@data.size / q.to_f).ceil
|
242
|
+
count < 2 ? 2 : count
|
234
243
|
end
|
235
|
-
end
|
236
244
|
|
237
|
-
# compute the number of bins for a given +q+
|
238
|
-
def compute_bin_count q
|
239
|
-
count = (@data.size / q.to_f).ceil
|
240
|
-
count < 2 ? 2 : count
|
241
245
|
end
|
242
|
-
|
243
246
|
end
|
data/lib/fastout/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fastout
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 2
|
10
|
+
version: 0.0.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jason Dew
|