fastout 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/fastout/ranker.rb +177 -174
- data/lib/fastout/version.rb +1 -1
- metadata +3 -3
data/lib/fastout/ranker.rb
CHANGED
@@ -5,239 +5,242 @@
|
|
5
5
|
# Author:: Jason Dew (mailto:jason.dew@gmail.com)
|
6
6
|
# Copyright:: Copyright (c) 2010 Jason Dew
|
7
7
|
# License:: MIT
|
8
|
-
class Ranker
|
9
8
|
|
10
|
-
|
9
|
+
module Fastout
|
10
|
+
class Ranker
|
11
11
|
|
12
|
-
|
12
|
+
class Point
|
13
13
|
|
14
|
-
|
15
|
-
@@next_id = id
|
16
|
-
end
|
14
|
+
@@next_id = 0
|
17
15
|
|
18
|
-
|
19
|
-
|
16
|
+
# overrides the id that the next Point to be created will receive
def self.next_id= id
|
17
|
+
@@next_id = id
|
18
|
+
end
|
20
19
|
|
21
|
-
|
22
|
-
|
23
|
-
@cluster = nil
|
24
|
-
@score = 0
|
25
|
-
@bins = []
|
20
|
+
attr_reader :id, :attributes, :bins
|
21
|
+
attr_accessor :cluster, :score
|
26
22
|
|
27
|
-
|
28
|
-
|
29
|
-
|
23
|
+
# creates a point from its attribute values (passed as separate arguments);
# the point starts out unclustered, with a score of 0 and no bins assigned,
# and takes the next id from the class-wide counter, which is then advanced
def initialize *attributes
|
24
|
+
@attributes = attributes
|
25
|
+
@cluster = nil
|
26
|
+
@score = 0
|
27
|
+
@bins = []
|
30
28
|
|
31
|
-
|
32
|
-
|
33
|
-
|
29
|
+
@id = @@next_id
|
30
|
+
@@next_id += 1
|
31
|
+
end
|
34
32
|
|
35
|
-
|
36
|
-
|
37
|
-
|
33
|
+
# returns the attribute value stored at position +index+
def [] index
|
34
|
+
@attributes[index]
|
35
|
+
end
|
38
36
|
|
39
|
-
|
40
|
-
|
41
|
-
|
37
|
+
# true when a cluster has been assigned to this point, false otherwise
def clustered?
|
38
|
+
!! cluster
|
39
|
+
end
|
42
40
|
|
43
|
-
|
44
|
-
|
45
|
-
return false if (bins[attribute_index] - point.bins[attribute_index]).abs > 1
|
41
|
+
# removes this point from whatever cluster it was assigned to
def uncluster!
|
42
|
+
@cluster = nil
|
46
43
|
end
|
47
44
|
|
48
|
-
|
49
|
-
|
45
|
+
# decides whether +point+ is a neighbor of this point on every selected attribute
# * +point+ is the other point; it must respond to +bins+ and +attributes+
# * +attribute_indexes+ are the indexes of the attributes to compare
# * +neighborhoods+ holds one neighborhood width per entry of +attribute_indexes+
# returns true only when, for each selected attribute, the two points are in
# the same or adjacent bins and their values are no more than half the
# neighborhood width apart; returns false as soon as one attribute fails
def in_the_neighborhood_of? point, attribute_indexes, neighborhoods
  # cheap rejection first: points more than one bin apart cannot be neighbors
  attribute_indexes.each do |attribute_index|
    return false if (bins[attribute_index] - point.bins[attribute_index]).abs > 1
  end

  attribute_indexes.each_with_index do |attribute_index, neighborhood_index|
    # the distance has to be absolute: a signed difference is negative whenever
    # the other point has the larger value, which would always pass the test
    distance = (attributes[attribute_index] - point.attributes[attribute_index]).abs
    return false if distance > (neighborhoods[neighborhood_index] / 2.0)
  end

  true
end
|
51
56
|
|
52
|
-
|
53
|
-
|
57
|
+
# reports whether this point is in the neighborhood of at least one of +points+
# * +points+ is the collection of candidate points (may be empty)
# * +attribute_indexes+ and +neighborhoods+ are handed to #in_the_neighborhood_of? unchanged
# returns true on the first match without examining the remaining points,
# false when nothing matches or +points+ is empty
def neighbor_of_any? points, attribute_indexes, neighborhoods
  points.any? {|point| in_the_neighborhood_of?(point, attribute_indexes, neighborhoods) }
end
|
54
60
|
|
55
|
-
def neighbor_of_any? points, attribute_indexes, neighborhoods
|
56
|
-
points.inject(false) {|found, point| found or in_the_neighborhood_of?(point, attribute_indexes, neighborhoods) }
|
57
61
|
end
|
58
62
|
|
59
|
-
|
63
|
+
attr_reader :data, :points, :minimums, :maximums
|
60
64
|
|
61
|
-
|
65
|
+
# wraps every row of +data+ in a Point, passing the row's values as the
# point's attributes; returns the points in the same order as the rows
def self.pointify data
|
66
|
+
data.map {|attributes| Point.new *attributes }
|
67
|
+
end
|
62
68
|
|
63
|
-
|
64
|
-
|
65
|
-
|
69
|
+
# takes a 2-d array, +data+, where the rows are data points and the columns are the attributes,
# values should all be numerical
# * raises a RuntimeError unless +data+ has more than one row and its first row has more than one column
# * wraps every row in a Point (see .pointify)
# * also generates minimum and maximum values for each attribute for later use
def initialize data
  raise "data must have more than one attribute and more than one data point" unless data.size > 1 and data.first.size > 1

  # restart the id sequence before the points are built so that their ids
  # start at zero even if other points were created beforehand
  Point.next_id = 0
  @data = data
  @points = self.class.pointify data
  @minimums, @maximums = compute_minimums_and_maximums

  # leave the counter at zero afterwards as well, as earlier versions did
  Point.next_id = 0
end
|
66
80
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
81
|
+
# searches the parameter space to find the optimized values of +k+ and +q+
|
82
|
+
# * +theta_target+ is the maximum acceptable value of theta, default is 1
|
83
|
+
# * +sample+ is the number of iterations to perform in estimating the parameters
|
84
|
+
# * +n+ is the number of points to rank
|
85
|
+
def optimized_ranking sample, n, theta_target=1
|
86
|
+
k = 3
|
87
|
+
q = 5
|
88
|
+
max_q = n / 4
|
89
|
+
step_q = 10
|
90
|
+
last_theta = n
|
91
|
+
theta, s = calculate_theta(sample, k, n, q)
|
78
92
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
if (theta >= last_theta)
|
95
|
-
# effectiveness declining so try next k
|
96
|
-
k += 1
|
97
|
-
q -= step_q
|
98
|
-
last_theta = n
|
99
|
-
else
|
100
|
-
# try next q
|
101
|
-
q += step_q
|
102
|
-
last_theta = theta
|
93
|
+
while (theta > theta_target or theta < last_theta or q < max_q) do
|
94
|
+
return s if (theta <= theta_target)
|
95
|
+
|
96
|
+
if (theta >= last_theta)
|
97
|
+
# effectiveness declining so try next k
|
98
|
+
k += 1
|
99
|
+
q -= step_q
|
100
|
+
last_theta = n
|
101
|
+
else
|
102
|
+
# try next q
|
103
|
+
q += step_q
|
104
|
+
last_theta = theta
|
105
|
+
end
|
106
|
+
|
107
|
+
theta, s = calculate_theta(sample, k, n, q)
|
103
108
|
end
|
104
109
|
|
105
|
-
|
110
|
+
s
|
106
111
|
end
|
107
112
|
|
108
|
-
|
109
|
-
|
113
|
+
# find and rank the points by their outlier score and determine
|
114
|
+
# theta (the number of points with an outlier score of +n+)
|
115
|
+
def calculate_theta sample, k, n, q
|
116
|
+
s = ranked_outliers sample, k, q
|
117
|
+
theta = points.inject(0) {|sum, point| point.score == n ? sum + 1 : sum }
|
110
118
|
|
111
|
-
|
112
|
-
|
113
|
-
def calculate_theta sample, k, n, q
|
114
|
-
s = ranked_outliers sample, k, q
|
115
|
-
theta = points.inject(0) {|sum, point| point.score == n ? sum + 1 : sum }
|
119
|
+
[theta, s]
|
120
|
+
end
|
116
121
|
|
117
|
-
|
118
|
-
|
122
|
+
# chooses +k+ random attributes with an average of +q+ data points
|
123
|
+
# in each bin +sample_size+ times to determine outliers
|
124
|
+
def ranked_outliers sample_size, k, q
|
125
|
+
# determine number of bins and their widths
|
126
|
+
bin_count = compute_bin_count(q)
|
127
|
+
bin_widths = compute_bin_widths(q, bin_count)
|
119
128
|
|
120
|
-
|
121
|
-
|
122
|
-
def ranked_outliers sample_size, k, q
|
123
|
-
# determine number of bins and their widths
|
124
|
-
bin_count = compute_bin_count(q)
|
125
|
-
bin_widths = compute_bin_widths(q, bin_count)
|
129
|
+
# assign points to the attribute bins
|
130
|
+
assign_points_to_bins! bin_widths, bin_count
|
126
131
|
|
127
|
-
|
128
|
-
|
132
|
+
1.upto(sample_size) {
|
133
|
+
score_points_from_a_random_set_of_attributes! k, bin_widths }
|
129
134
|
|
130
|
-
|
131
|
-
|
135
|
+
points.sort_by(&:score).reverse
|
136
|
+
end
|
132
137
|
|
133
|
-
|
134
|
-
|
138
|
+
# pick a random set of attributes and compute the outlier score
|
139
|
+
# for each of the points
|
140
|
+
def score_points_from_a_random_set_of_attributes! number_of_attributes_to_choose, all_bin_widths
|
141
|
+
cluster = 0
|
142
|
+
attribute_indexes = random_attribute_indexes number_of_attributes_to_choose
|
143
|
+
bin_widths = attribute_indexes.map {|index| all_bin_widths[index] }
|
135
144
|
|
136
|
-
|
137
|
-
|
138
|
-
def score_points_from_a_random_set_of_attributes! number_of_attributes_to_choose, all_bin_widths
|
139
|
-
cluster = 0
|
140
|
-
attribute_indexes = random_attribute_indexes number_of_attributes_to_choose
|
141
|
-
bin_widths = attribute_indexes.map {|index| all_bin_widths[index] }
|
145
|
+
points.each do |point|
|
146
|
+
next if point.clustered?
|
142
147
|
|
143
|
-
|
144
|
-
|
148
|
+
point.cluster = (cluster += 1)
|
149
|
+
neighbors = cluster_neighbors point, cluster, attribute_indexes, bin_widths
|
145
150
|
|
146
|
-
|
147
|
-
|
151
|
+
point.uncluster! if neighbors.empty?
|
152
|
+
end
|
148
153
|
|
149
|
-
|
154
|
+
points.each do |point|
|
155
|
+
next unless point.clustered?
|
156
|
+
point.uncluster!
|
157
|
+
point.score += 1
|
158
|
+
end
|
150
159
|
end
|
151
160
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
point.score += 1
|
161
|
+
# randomly choose +number+ distinct attribute indexes
# * the indexes are drawn from the columns of the first data row
# * when +number+ exceeds the number of attributes, every index is returned
#   (in random order)
def random_attribute_indexes number
  # first(number) takes exactly +number+ items; the inclusive range
  # [0..number] used previously returned one index too many
  (0...@data.first.size).sort_by { rand }.first(number)
end
|
157
|
-
end
|
158
165
|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
166
|
+
# find all unclustered points that are neighbors of +point+ on
|
167
|
+
# *all* selected attributes or neighbors in the neighborhood
|
168
|
+
# of +point+; find recursively until no additions can be made;
# starts the recursion with an empty list of neighbors and returns
# whatever #recursively_cluster_neighbors returns
|
169
|
+
def cluster_neighbors point, cluster, attribute_indexes, bin_widths
|
170
|
+
recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, []
|
171
|
+
end
|
163
172
|
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
def cluster_neighbors point, cluster, attribute_indexes, bin_widths
|
168
|
-
recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, []
|
169
|
-
end
|
173
|
+
# recursive step of #cluster_neighbors
|
174
|
+
def recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
|
175
|
+
fruitful = false
|
170
176
|
|
171
|
-
|
172
|
-
|
173
|
-
|
177
|
+
unclustered_points.each do |unclustered_point|
|
178
|
+
next unless point.in_the_neighborhood_of?(unclustered_point, attribute_indexes, bin_widths) or
|
179
|
+
unclustered_point.neighbor_of_any?(neighbors, attribute_indexes, bin_widths)
|
174
180
|
|
175
|
-
|
176
|
-
|
177
|
-
|
181
|
+
fruitful = true
|
182
|
+
unclustered_point.cluster = cluster
|
183
|
+
neighbors << unclustered_point
|
184
|
+
end
|
178
185
|
|
179
|
-
fruitful
|
180
|
-
|
181
|
-
|
186
|
+
if fruitful
|
187
|
+
recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
|
188
|
+
else
|
189
|
+
neighbors
|
190
|
+
end
|
182
191
|
end
|
183
192
|
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
neighbors
|
193
|
+
# find all of the points that don't already belong to a cluster;
# builds a fresh array on every call
|
194
|
+
def unclustered_points
|
195
|
+
points.select {|point| not point.clustered? }
|
188
196
|
end
|
189
|
-
end
|
190
197
|
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
def assign_points_to_bins! bin_widths, bin_count
|
199
|
-
bin_widths.each_with_index do |bin_width, attribute_index|
|
200
|
-
points.each do |point|
|
201
|
-
point.bins[attribute_index] = bin_index(point, attribute_index, bin_width)
|
198
|
+
# assign each of the data points to a bin based on the given +bin_widths+,
|
199
|
+
# storing the bin index for every attribute in the point's own +bins+ array
# (no new 2-d array is built); +bin_count+ is accepted but not used here
|
200
|
+
def assign_points_to_bins! bin_widths, bin_count
|
201
|
+
bin_widths.each_with_index do |bin_width, attribute_index|
|
202
|
+
points.each do |point|
|
203
|
+
point.bins[attribute_index] = bin_index(point, attribute_index, bin_width)
|
204
|
+
end
|
202
205
|
end
|
203
206
|
end
|
204
|
-
end
|
205
207
|
|
206
|
-
|
207
|
-
|
208
|
-
|
208
|
+
# computes the zero-based bin that +point+ falls into for one attribute
# * +point+ must respond to +[]+ with the attribute value
# * +attribute_index+ selects the attribute and its recorded minimum/maximum
# * +bin_width+ is the width of each bin for that attribute
# returns 0 for a zero +bin_width+, otherwise the number of whole bin widths
# between the attribute's minimum and the point's value
def bin_index point, attribute_index, bin_width
  # an attribute whose values are all equal has a zero-width range; dividing
  # by it gives NaN (or ZeroDivisionError for integers), so use the first bin
  return 0 if bin_width.zero?

  minimum = @minimums[attribute_index]
  maximum = @maximums[attribute_index]

  value = point[attribute_index]
  index = ((value - minimum) / bin_width).floor

  # the maximum sits exactly on the upper edge of the last bin, so its
  # computed index is pulled back by one to stay inside that bin
  value == maximum ? index - 1 : index
end
|
215
217
|
|
216
|
-
|
217
|
-
|
218
|
-
|
218
|
+
# scans every row of @data and returns a pair [minimums, maximums], each an
# array holding one value per attribute (column)
def compute_minimums_and_maximums
|
219
|
+
# seed both results with copies of the first row so @data itself is not modified
minimums = @data.first.dup
|
220
|
+
maximums = @data.first.dup
|
219
221
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
222
|
+
@data.each do |attributes|
|
223
|
+
attributes.each_with_index do |attribute, attribute_index|
|
224
|
+
minimums[attribute_index] = attribute if attribute < minimums[attribute_index]
|
225
|
+
maximums[attribute_index] = attribute if attribute > maximums[attribute_index]
|
226
|
+
end
|
224
227
|
end
|
228
|
+
|
229
|
+
[minimums, maximums]
|
225
230
|
end
|
226
231
|
|
227
|
-
|
228
|
-
|
232
|
+
# determine the widths of the bins: for every attribute, the range between its
# recorded maximum and minimum divided by +bin_count+ (as a float);
# +q+ is accepted but not used in the calculation
|
233
|
+
def compute_bin_widths q, bin_count
|
234
|
+
(0...@data.first.size).map do |attribute_index|
|
235
|
+
(@maximums[attribute_index] - @minimums[attribute_index]) / bin_count.to_f
|
236
|
+
end
|
237
|
+
end
|
229
238
|
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
239
|
+
# compute the number of bins for a given +q+
# * +q+ is the desired average number of data points per bin
# * the result is the number of data rows divided by +q+, rounded up,
#   and never less than 2
def compute_bin_count q
  # a +q+ of zero would divide to Infinity, which cannot be rounded; negative
  # values already ended up at the minimum, so treat both the same way
  return 2 unless q > 0

  count = (@data.size / q.to_f).ceil
  count < 2 ? 2 : count
end
|
235
|
-
end
|
236
244
|
|
237
|
-
# compute the number of bins for a given +q+
|
238
|
-
def compute_bin_count q
|
239
|
-
count = (@data.size / q.to_f).ceil
|
240
|
-
count < 2 ? 2 : count
|
241
245
|
end
|
242
|
-
|
243
246
|
end
|
data/lib/fastout/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fastout
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 2
|
10
|
+
version: 0.0.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jason Dew
|