fastout 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. data/lib/fastout/ranker.rb +177 -174
  2. data/lib/fastout/version.rb +1 -1
  3. metadata +3 -3
data/lib/fastout/ranker.rb CHANGED
@@ -5,239 +5,242 @@
5
5
  # Author:: Jason Dew (mailto:jason.dew@gmail.com)
6
6
  # Copyright:: Copyright (c) 2010 Jason Dew
7
7
  # License:: MIT
8
- class Ranker
9
8
 
10
- class Point
9
+ module Fastout
10
+ class Ranker
11
11
 
12
- @@next_id = 0
12
+ class Point
13
13
 
14
- def self.next_id= id
15
- @@next_id = id
16
- end
14
+ @@next_id = 0
17
15
 
18
- attr_reader :id, :attributes, :bins
19
- attr_accessor :cluster, :score
16
+ def self.next_id= id
17
+ @@next_id = id
18
+ end
20
19
 
21
- def initialize *attributes
22
- @attributes = attributes
23
- @cluster = nil
24
- @score = 0
25
- @bins = []
20
+ attr_reader :id, :attributes, :bins
21
+ attr_accessor :cluster, :score
26
22
 
27
- @id = @@next_id
28
- @@next_id += 1
29
- end
23
+ def initialize *attributes
24
+ @attributes = attributes
25
+ @cluster = nil
26
+ @score = 0
27
+ @bins = []
30
28
 
31
- def [] index
32
- @attributes[index]
33
- end
29
+ @id = @@next_id
30
+ @@next_id += 1
31
+ end
34
32
 
35
- def clustered?
36
- !! cluster
37
- end
33
+ def [] index
34
+ @attributes[index]
35
+ end
38
36
 
39
- def uncluster!
40
- @cluster = nil
41
- end
37
+ def clustered?
38
+ !! cluster
39
+ end
42
40
 
43
- def in_the_neighborhood_of? point, attribute_indexes, neighborhoods
44
- attribute_indexes.each do |attribute_index|
45
- return false if (bins[attribute_index] - point.bins[attribute_index]).abs > 1
41
+ def uncluster!
42
+ @cluster = nil
46
43
  end
47
44
 
48
- attribute_indexes.each_with_index do |attribute_index, neighborhood_index|
49
- return false if (attributes[attribute_index] - point.attributes[attribute_index]) > (neighborhoods[neighborhood_index] / 2.0)
45
+ def in_the_neighborhood_of? point, attribute_indexes, neighborhoods
46
+ attribute_indexes.each do |attribute_index|
47
+ return false if (bins[attribute_index] - point.bins[attribute_index]).abs > 1
48
+ end
49
+
50
+ attribute_indexes.each_with_index do |attribute_index, neighborhood_index|
51
+ return false if (attributes[attribute_index] - point.attributes[attribute_index]) > (neighborhoods[neighborhood_index] / 2.0)
52
+ end
53
+
54
+ true
50
55
  end
51
56
 
52
- true
53
- end
57
+ def neighbor_of_any? points, attribute_indexes, neighborhoods
58
+ points.inject(false) {|found, point| found or in_the_neighborhood_of?(point, attribute_indexes, neighborhoods) }
59
+ end
54
60
 
55
- def neighbor_of_any? points, attribute_indexes, neighborhoods
56
- points.inject(false) {|found, point| found or in_the_neighborhood_of?(point, attribute_indexes, neighborhoods) }
57
61
  end
58
62
 
59
- end
63
+ attr_reader :data, :points, :minimums, :maximums
60
64
 
61
- attr_reader :data, :points, :minimums, :maximums
65
+ def self.pointify data
66
+ data.map {|attributes| Point.new *attributes }
67
+ end
62
68
 
63
- def self.pointify data
64
- data.map {|attributes| Point.new *attributes }
65
- end
69
+ # takes a 2-d array, +data+, where the rows are data points and the columns are the attributes,
70
+ # values should all be numerical
71
+ # * +data+ should not be empty or nil will be returned
72
+ # * also generates minimum and maximum values for each attribute for later use
73
+ def initialize data
74
+ raise "data must have more than one attribute and more than one data point" unless data.size > 1 and data.first.size > 1
75
+ @data = data
76
+ @points = self.class.pointify data
77
+ @minimums, @maximums = compute_minimums_and_maximums
78
+ Point.next_id = 0
79
+ end
66
80
 
67
- # takes a 2-d array, +data+, where the rows are data points and the columns are the attributes,
68
- # values should all be numerical
69
- # * +data+ should not be empty or nil will be returned
70
- # * also generates minimum and maximum values for each attribute for later use
71
- def initialize data
72
- raise "data must have more than one attribute and more than one data point" unless data.size > 1 and data.first.size > 1
73
- @data = data
74
- @points = self.class.pointify data
75
- @minimums, @maximums = compute_minimums_and_maximums
76
- Point.next_id = 0
77
- end
81
+ # searches the parameter space to find the optimized values of +k+ and +q+
82
+ # * +theta_target+ is the maximum acceptable value of theta, default is 1
83
+ # * +sample+ is the number of iterations to perform in estimating the parameters
84
+ # * +n+ is the number of points to rank
85
+ def optimized_ranking sample, n, theta_target=1
86
+ k = 3
87
+ q = 5
88
+ max_q = n / 4
89
+ step_q = 10
90
+ last_theta = n
91
+ theta, s = calculate_theta(sample, k, n, q)
78
92
 
79
- # searches the parameter space to find the optimized values of +k+ and +q+
80
- # * +theta_target+ is the maximum acceptable value of theta, default is 1
81
- # * +sample+ is the number of iterations to perform in estimating the parameters
82
- # * +n+ is the number of points to rank
83
- def optimized_ranking sample, n, theta_target=1
84
- k = 3
85
- q = 5
86
- max_q = n / 4
87
- step_q = 10
88
- last_theta = n
89
- theta, s = calculate_theta(sample, k, n, q)
90
-
91
- while (theta > theta_target or theta < last_theta or q < max_q) do
92
- return s if (theta <= theta_target)
93
-
94
- if (theta >= last_theta)
95
- # effectiveness declining so try next k
96
- k += 1
97
- q -= step_q
98
- last_theta = n
99
- else
100
- # try next q
101
- q += step_q
102
- last_theta = theta
93
+ while (theta > theta_target or theta < last_theta or q < max_q) do
94
+ return s if (theta <= theta_target)
95
+
96
+ if (theta >= last_theta)
97
+ # effectiveness declining so try next k
98
+ k += 1
99
+ q -= step_q
100
+ last_theta = n
101
+ else
102
+ # try next q
103
+ q += step_q
104
+ last_theta = theta
105
+ end
106
+
107
+ theta, s = calculate_theta(sample, k, n, q)
103
108
  end
104
109
 
105
- theta, s = calculate_theta(sample, k, n, q)
110
+ s
106
111
  end
107
112
 
108
- s
109
- end
113
+ # find and rank the points by their outlier score and determine
114
+ # theta (the number of points with an outlier score of +n+)
115
+ def calculate_theta sample, k, n, q
116
+ s = ranked_outliers sample, k, q
117
+ theta = points.inject(0) {|sum, point| point.score == n ? sum + 1 : sum }
110
118
 
111
- # find and rank the points by their outlier score and determine
112
- # theta (the number of points with an outlier score of +n+)
113
- def calculate_theta sample, k, n, q
114
- s = ranked_outliers sample, k, q
115
- theta = points.inject(0) {|sum, point| point.score == n ? sum + 1 : sum }
119
+ [theta, s]
120
+ end
116
121
 
117
- [theta, s]
118
- end
122
+ # chooses +k+ random attributes with an average of +q+ data points
123
+ # in each bin +sample+ times to determine outliers
124
+ def ranked_outliers sample_size, k, q
125
+ # determine number of bins and their widths
126
+ bin_count = compute_bin_count(q)
127
+ bin_widths = compute_bin_widths(q, bin_count)
119
128
 
120
- # chooses +k+ random attributes with an average of +q+ data points
121
- # in each bin +sample+ times to determine outliers
122
- def ranked_outliers sample_size, k, q
123
- # determine number of bins and their widths
124
- bin_count = compute_bin_count(q)
125
- bin_widths = compute_bin_widths(q, bin_count)
129
+ # assign points to the attribute bins
130
+ assign_points_to_bins! bin_widths, bin_count
126
131
 
127
- # assign points to the attribute bins
128
- assign_points_to_bins! bin_widths, bin_count
132
+ 1.upto(sample_size) {
133
+ score_points_from_a_random_set_of_attributes! k, bin_widths }
129
134
 
130
- 1.upto(sample_size) {
131
- score_points_from_a_random_set_of_attributes! k, bin_widths }
135
+ points.sort_by(&:score).reverse
136
+ end
132
137
 
133
- points.sort_by(&:score).reverse
134
- end
138
+ # pick a random set of attributes and compute the outlier score
139
+ # for each of the points
140
+ def score_points_from_a_random_set_of_attributes! number_of_attributes_to_choose, all_bin_widths
141
+ cluster = 0
142
+ attribute_indexes = random_attribute_indexes number_of_attributes_to_choose
143
+ bin_widths = attribute_indexes.map {|index| all_bin_widths[index] }
135
144
 
136
- # pick a random set of attributes and compute the outlier score
137
- # for each of the points
138
- def score_points_from_a_random_set_of_attributes! number_of_attributes_to_choose, all_bin_widths
139
- cluster = 0
140
- attribute_indexes = random_attribute_indexes number_of_attributes_to_choose
141
- bin_widths = attribute_indexes.map {|index| all_bin_widths[index] }
145
+ points.each do |point|
146
+ next if point.clustered?
142
147
 
143
- points.each do |point|
144
- next if point.clustered?
148
+ point.cluster = (cluster += 1)
149
+ neighbors = cluster_neighbors point, cluster, attribute_indexes, bin_widths
145
150
 
146
- point.cluster = (cluster += 1)
147
- neighbors = cluster_neighbors point, cluster, attribute_indexes, bin_widths
151
+ point.uncluster! if neighbors.empty?
152
+ end
148
153
 
149
- point.uncluster! if neighbors.empty?
154
+ points.each do |point|
155
+ next unless point.clustered?
156
+ point.uncluster!
157
+ point.score += 1
158
+ end
150
159
  end
151
160
 
152
- points.each do |point|
153
- next unless point.clustered?
154
- point.uncluster!
155
- point.score += 1
161
+ # randomly choose +number+ of attribute indexes
162
+ def random_attribute_indexes number
163
+ (0...@data.first.size).sort_by { rand }[0..number]
156
164
  end
157
- end
158
165
 
159
- # randomly choose +number+ of attribute indexes
160
- def random_attribute_indexes number
161
- (0...@data.first.size).sort_by { rand }[0..number]
162
- end
166
+ # find all unclustered points that are neighbors of +point+ on
167
+ # *all* selected attributes or neighbors in the neighborhood
168
+ # of +point+; find recursively until no additions can be made
169
+ def cluster_neighbors point, cluster, attribute_indexes, bin_widths
170
+ recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, []
171
+ end
163
172
 
164
- # find all unclustered points that are neighbors of +point+ on
165
- # *all* selected attributes or neighbors in the neighborhood
166
- # of +point+; find recursively until no additions can be made
167
- def cluster_neighbors point, cluster, attribute_indexes, bin_widths
168
- recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, []
169
- end
173
+ # recursive step of #cluster_neighbors
174
+ def recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
175
+ fruitful = false
170
176
 
171
- # recursive step of #cluster_neighbors
172
- def recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
173
- fruitful = false
177
+ unclustered_points.each do |unclustered_point|
178
+ next unless point.in_the_neighborhood_of?(unclustered_point, attribute_indexes, bin_widths) or
179
+ unclustered_point.neighbor_of_any?(neighbors, attribute_indexes, bin_widths)
174
180
 
175
- unclustered_points.each do |unclustered_point|
176
- next unless point.in_the_neighborhood_of?(unclustered_point, attribute_indexes, bin_widths) or
177
- unclustered_point.neighbor_of_any?(neighbors, attribute_indexes, bin_widths)
181
+ fruitful = true
182
+ unclustered_point.cluster = cluster
183
+ neighbors << unclustered_point
184
+ end
178
185
 
179
- fruitful = true
180
- unclustered_point.cluster = cluster
181
- neighbors << unclustered_point
186
+ if fruitful
187
+ recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
188
+ else
189
+ neighbors
190
+ end
182
191
  end
183
192
 
184
- if fruitful
185
- recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
186
- else
187
- neighbors
193
+ # find all of the points that don't already belong to a cluster
194
+ def unclustered_points
195
+ points.select {|point| not point.clustered? }
188
196
  end
189
- end
190
197
 
191
- # find all of the points that don't already belong to a cluster
192
- def unclustered_points
193
- points.select {|point| not point.clustered? }
194
- end
195
-
196
- # assign each of the data points to a bin based on the given +bin_widths+,
197
- # returns a 2-d array in attribute-major order
198
- def assign_points_to_bins! bin_widths, bin_count
199
- bin_widths.each_with_index do |bin_width, attribute_index|
200
- points.each do |point|
201
- point.bins[attribute_index] = bin_index(point, attribute_index, bin_width)
198
+ # assign each of the data points to a bin based on the given +bin_widths+,
199
+ # returns a 2-d array in attribute-major order
200
+ def assign_points_to_bins! bin_widths, bin_count
201
+ bin_widths.each_with_index do |bin_width, attribute_index|
202
+ points.each do |point|
203
+ point.bins[attribute_index] = bin_index(point, attribute_index, bin_width)
204
+ end
202
205
  end
203
206
  end
204
- end
205
207
 
206
- def bin_index point, attribute_index, bin_width
207
- minimum = @minimums[attribute_index]
208
- maximum = @maximums[attribute_index]
208
+ def bin_index point, attribute_index, bin_width
209
+ minimum = @minimums[attribute_index]
210
+ maximum = @maximums[attribute_index]
209
211
 
210
- value = point[attribute_index]
211
- index = ((value - minimum) / bin_width).floor
212
+ value = point[attribute_index]
213
+ index = ((value - minimum) / bin_width).floor
212
214
 
213
- value == maximum ? index - 1 : index
214
- end
215
+ value == maximum ? index - 1 : index
216
+ end
215
217
 
216
- def compute_minimums_and_maximums
217
- minimums = @data.first.dup
218
- maximums = @data.first.dup
218
+ def compute_minimums_and_maximums
219
+ minimums = @data.first.dup
220
+ maximums = @data.first.dup
219
221
 
220
- @data.each do |attributes|
221
- attributes.each_with_index do |attribute, attribute_index|
222
- minimums[attribute_index] = attribute if attribute < minimums[attribute_index]
223
- maximums[attribute_index] = attribute if attribute > maximums[attribute_index]
222
+ @data.each do |attributes|
223
+ attributes.each_with_index do |attribute, attribute_index|
224
+ minimums[attribute_index] = attribute if attribute < minimums[attribute_index]
225
+ maximums[attribute_index] = attribute if attribute > maximums[attribute_index]
226
+ end
224
227
  end
228
+
229
+ [minimums, maximums]
225
230
  end
226
231
 
227
- [minimums, maximums]
228
- end
232
+ # determine the widths of the bins based on +q+
233
+ def compute_bin_widths q, bin_count
234
+ (0...@data.first.size).map do |attribute_index|
235
+ (@maximums[attribute_index] - @minimums[attribute_index]) / bin_count.to_f
236
+ end
237
+ end
229
238
 
230
- # determine the widths of the bins based on +q+
231
- def compute_bin_widths q, bin_count
232
- (0...@data.first.size).map do |attribute_index|
233
- (@maximums[attribute_index] - @minimums[attribute_index]) / bin_count.to_f
239
+ # compute the number of bins for a given +q+
240
+ def compute_bin_count q
241
+ count = (@data.size / q.to_f).ceil
242
+ count < 2 ? 2 : count
234
243
  end
235
- end
236
244
 
237
- # compute the number of bins for a given +q+
238
- def compute_bin_count q
239
- count = (@data.size / q.to_f).ceil
240
- count < 2 ? 2 : count
241
245
  end
242
-
243
246
  end
data/lib/fastout/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Fastout
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fastout
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 27
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 1
10
- version: 0.0.1
9
+ - 2
10
+ version: 0.0.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jason Dew