fastout 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/lib/fastout/ranker.rb +177 -174
  2. data/lib/fastout/version.rb +1 -1
  3. metadata +3 -3
data/lib/fastout/ranker.rb CHANGED
@@ -5,239 +5,242 @@
5
5
  # Author:: Jason Dew (mailto:jason.dew@gmail.com)
6
6
  # Copyright:: Copyright (c) 2010 Jason Dew
7
7
  # License:: MIT
8
- class Ranker
9
8
 
10
- class Point
9
+ module Fastout
10
+ class Ranker
11
11
 
12
- @@next_id = 0
12
+ class Point
13
13
 
14
- def self.next_id= id
15
- @@next_id = id
16
- end
14
+ @@next_id = 0
17
15
 
18
- attr_reader :id, :attributes, :bins
19
- attr_accessor :cluster, :score
16
+ def self.next_id= id
17
+ @@next_id = id
18
+ end
20
19
 
21
- def initialize *attributes
22
- @attributes = attributes
23
- @cluster = nil
24
- @score = 0
25
- @bins = []
20
+ attr_reader :id, :attributes, :bins
21
+ attr_accessor :cluster, :score
26
22
 
27
- @id = @@next_id
28
- @@next_id += 1
29
- end
23
+ def initialize *attributes
24
+ @attributes = attributes
25
+ @cluster = nil
26
+ @score = 0
27
+ @bins = []
30
28
 
31
- def [] index
32
- @attributes[index]
33
- end
29
+ @id = @@next_id
30
+ @@next_id += 1
31
+ end
34
32
 
35
- def clustered?
36
- !! cluster
37
- end
33
+ def [] index
34
+ @attributes[index]
35
+ end
38
36
 
39
- def uncluster!
40
- @cluster = nil
41
- end
37
+ def clustered?
38
+ !! cluster
39
+ end
42
40
 
43
- def in_the_neighborhood_of? point, attribute_indexes, neighborhoods
44
- attribute_indexes.each do |attribute_index|
45
- return false if (bins[attribute_index] - point.bins[attribute_index]).abs > 1
41
+ def uncluster!
42
+ @cluster = nil
46
43
  end
47
44
 
48
- attribute_indexes.each_with_index do |attribute_index, neighborhood_index|
49
- return false if (attributes[attribute_index] - point.attributes[attribute_index]) > (neighborhoods[neighborhood_index] / 2.0)
45
+ def in_the_neighborhood_of? point, attribute_indexes, neighborhoods
46
+ attribute_indexes.each do |attribute_index|
47
+ return false if (bins[attribute_index] - point.bins[attribute_index]).abs > 1
48
+ end
49
+
50
+ attribute_indexes.each_with_index do |attribute_index, neighborhood_index|
51
+ return false if (attributes[attribute_index] - point.attributes[attribute_index]) > (neighborhoods[neighborhood_index] / 2.0)
52
+ end
53
+
54
+ true
50
55
  end
51
56
 
52
- true
53
- end
57
+ def neighbor_of_any? points, attribute_indexes, neighborhoods
58
+ points.inject(false) {|found, point| found or in_the_neighborhood_of?(point, attribute_indexes, neighborhoods) }
59
+ end
54
60
 
55
- def neighbor_of_any? points, attribute_indexes, neighborhoods
56
- points.inject(false) {|found, point| found or in_the_neighborhood_of?(point, attribute_indexes, neighborhoods) }
57
61
  end
58
62
 
59
- end
63
+ attr_reader :data, :points, :minimums, :maximums
60
64
 
61
- attr_reader :data, :points, :minimums, :maximums
65
+ def self.pointify data
66
+ data.map {|attributes| Point.new *attributes }
67
+ end
62
68
 
63
- def self.pointify data
64
- data.map {|attributes| Point.new *attributes }
65
- end
69
+ # takes a 2-d array, +data+, where the rows are data points and the columns are the attributes,
70
+ # values should all be numerical
71
+ # * +data+ should not be empty or nil will be returned
72
+ # * also generates minimum and maximum values for each attribute for later use
73
+ def initialize data
74
+ raise "data must have more than one attribute and more than one data point" unless data.size > 1 and data.first.size > 1
75
+ @data = data
76
+ @points = self.class.pointify data
77
+ @minimums, @maximums = compute_minimums_and_maximums
78
+ Point.next_id = 0
79
+ end
66
80
 
67
- # takes a 2-d array, +data+, where the rows are data points and the columns are the attributes,
68
- # values should all be numerical
69
- # * +data+ should not be empty or nil will be returned
70
- # * also generates minimum and maximum values for each attribute for later use
71
- def initialize data
72
- raise "data must have more than one attribute and more than one data point" unless data.size > 1 and data.first.size > 1
73
- @data = data
74
- @points = self.class.pointify data
75
- @minimums, @maximums = compute_minimums_and_maximums
76
- Point.next_id = 0
77
- end
81
+ # searches the parameter space to find the optimized values of +k+ and +q+
82
+ # * +theta_target+ is the maximum acceptable value of theta, default is 1
83
+ # * +sample+ is the number of iterations to perform in estimating the parameters
84
+ # * +n+ is the number of points to rank
85
+ def optimized_ranking sample, n, theta_target=1
86
+ k = 3
87
+ q = 5
88
+ max_q = n / 4
89
+ step_q = 10
90
+ last_theta = n
91
+ theta, s = calculate_theta(sample, k, n, q)
78
92
 
79
- # searches the parameter space to find the optimized values of +k+ and +q+
80
- # * +theta_target+ is the maximum acceptable value of theta, default is 1
81
- # * +sample+ is the number of iterations to perform in estimating the parameters
82
- # * +n+ is the number of points to rank
83
- def optimized_ranking sample, n, theta_target=1
84
- k = 3
85
- q = 5
86
- max_q = n / 4
87
- step_q = 10
88
- last_theta = n
89
- theta, s = calculate_theta(sample, k, n, q)
90
-
91
- while (theta > theta_target or theta < last_theta or q < max_q) do
92
- return s if (theta <= theta_target)
93
-
94
- if (theta >= last_theta)
95
- # effectiveness declining so try next k
96
- k += 1
97
- q -= step_q
98
- last_theta = n
99
- else
100
- # try next q
101
- q += step_q
102
- last_theta = theta
93
+ while (theta > theta_target or theta < last_theta or q < max_q) do
94
+ return s if (theta <= theta_target)
95
+
96
+ if (theta >= last_theta)
97
+ # effectiveness declining so try next k
98
+ k += 1
99
+ q -= step_q
100
+ last_theta = n
101
+ else
102
+ # try next q
103
+ q += step_q
104
+ last_theta = theta
105
+ end
106
+
107
+ theta, s = calculate_theta(sample, k, n, q)
103
108
  end
104
109
 
105
- theta, s = calculate_theta(sample, k, n, q)
110
+ s
106
111
  end
107
112
 
108
- s
109
- end
113
+ # find and rank the points by their outlier score and determine
114
+ # theta (the number of points with an outlier score of +n+)
115
+ def calculate_theta sample, k, n, q
116
+ s = ranked_outliers sample, k, q
117
+ theta = points.inject(0) {|sum, point| point.score == n ? sum + 1 : sum }
110
118
 
111
- # find and rank the points by their outlier score and determine
112
- # theta (the number of points with an outlier score of +n+)
113
- def calculate_theta sample, k, n, q
114
- s = ranked_outliers sample, k, q
115
- theta = points.inject(0) {|sum, point| point.score == n ? sum + 1 : sum }
119
+ [theta, s]
120
+ end
116
121
 
117
- [theta, s]
118
- end
122
+ # chooses +k+ random attributes with an average of +q+ data points
123
+ # in each bin +sample+ times to determine outliers
124
+ def ranked_outliers sample_size, k, q
125
+ # determine number of bins and their widths
126
+ bin_count = compute_bin_count(q)
127
+ bin_widths = compute_bin_widths(q, bin_count)
119
128
 
120
- # chooses +k+ random attributes with an average of +q+ data points
121
- # in each bin +sample+ times to determine outliers
122
- def ranked_outliers sample_size, k, q
123
- # determine number of bins and their widths
124
- bin_count = compute_bin_count(q)
125
- bin_widths = compute_bin_widths(q, bin_count)
129
+ # assign points to the attribute bins
130
+ assign_points_to_bins! bin_widths, bin_count
126
131
 
127
- # assign points to the attribute bins
128
- assign_points_to_bins! bin_widths, bin_count
132
+ 1.upto(sample_size) {
133
+ score_points_from_a_random_set_of_attributes! k, bin_widths }
129
134
 
130
- 1.upto(sample_size) {
131
- score_points_from_a_random_set_of_attributes! k, bin_widths }
135
+ points.sort_by(&:score).reverse
136
+ end
132
137
 
133
- points.sort_by(&:score).reverse
134
- end
138
+ # pick a random set of attributes and compute the outlier score
139
+ # for each of the points
140
+ def score_points_from_a_random_set_of_attributes! number_of_attributes_to_choose, all_bin_widths
141
+ cluster = 0
142
+ attribute_indexes = random_attribute_indexes number_of_attributes_to_choose
143
+ bin_widths = attribute_indexes.map {|index| all_bin_widths[index] }
135
144
 
136
- # pick a random set of attributes and compute the outlier score
137
- # for each of the points
138
- def score_points_from_a_random_set_of_attributes! number_of_attributes_to_choose, all_bin_widths
139
- cluster = 0
140
- attribute_indexes = random_attribute_indexes number_of_attributes_to_choose
141
- bin_widths = attribute_indexes.map {|index| all_bin_widths[index] }
145
+ points.each do |point|
146
+ next if point.clustered?
142
147
 
143
- points.each do |point|
144
- next if point.clustered?
148
+ point.cluster = (cluster += 1)
149
+ neighbors = cluster_neighbors point, cluster, attribute_indexes, bin_widths
145
150
 
146
- point.cluster = (cluster += 1)
147
- neighbors = cluster_neighbors point, cluster, attribute_indexes, bin_widths
151
+ point.uncluster! if neighbors.empty?
152
+ end
148
153
 
149
- point.uncluster! if neighbors.empty?
154
+ points.each do |point|
155
+ next unless point.clustered?
156
+ point.uncluster!
157
+ point.score += 1
158
+ end
150
159
  end
151
160
 
152
- points.each do |point|
153
- next unless point.clustered?
154
- point.uncluster!
155
- point.score += 1
161
+ # randomly choose +number+ of attribute indexes
162
+ def random_attribute_indexes number
163
+ (0...@data.first.size).sort_by { rand }[0..number]
156
164
  end
157
- end
158
165
 
159
- # randomly choose +number+ of attribute indexes
160
- def random_attribute_indexes number
161
- (0...@data.first.size).sort_by { rand }[0..number]
162
- end
166
+ # find all unclustered points that are neighbors of +point+ on
167
+ # *all* selected attributes or neighbors in the neighborhood
168
+ # of +point+; find recursively until no additions can be made
169
+ def cluster_neighbors point, cluster, attribute_indexes, bin_widths
170
+ recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, []
171
+ end
163
172
 
164
- # find all unclustered points that are neighbors of +point+ on
165
- # *all* selected attributes or neighbors in the neighborhood
166
- # of +point+; find recursively until no additions can be made
167
- def cluster_neighbors point, cluster, attribute_indexes, bin_widths
168
- recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, []
169
- end
173
+ # recursive step of #cluster_neighbors
174
+ def recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
175
+ fruitful = false
170
176
 
171
- # recursive step of #cluster_neighbors
172
- def recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
173
- fruitful = false
177
+ unclustered_points.each do |unclustered_point|
178
+ next unless point.in_the_neighborhood_of?(unclustered_point, attribute_indexes, bin_widths) or
179
+ unclustered_point.neighbor_of_any?(neighbors, attribute_indexes, bin_widths)
174
180
 
175
- unclustered_points.each do |unclustered_point|
176
- next unless point.in_the_neighborhood_of?(unclustered_point, attribute_indexes, bin_widths) or
177
- unclustered_point.neighbor_of_any?(neighbors, attribute_indexes, bin_widths)
181
+ fruitful = true
182
+ unclustered_point.cluster = cluster
183
+ neighbors << unclustered_point
184
+ end
178
185
 
179
- fruitful = true
180
- unclustered_point.cluster = cluster
181
- neighbors << unclustered_point
186
+ if fruitful
187
+ recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
188
+ else
189
+ neighbors
190
+ end
182
191
  end
183
192
 
184
- if fruitful
185
- recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
186
- else
187
- neighbors
193
+ # find all of the points that don't already belong to a cluster
194
+ def unclustered_points
195
+ points.select {|point| not point.clustered? }
188
196
  end
189
- end
190
197
 
191
- # find all of the points that don't already belong to a cluster
192
- def unclustered_points
193
- points.select {|point| not point.clustered? }
194
- end
195
-
196
- # assign each of the data points to a bin based on the given +bin_widths+,
197
- # returns a 2-d array in attribute-major order
198
- def assign_points_to_bins! bin_widths, bin_count
199
- bin_widths.each_with_index do |bin_width, attribute_index|
200
- points.each do |point|
201
- point.bins[attribute_index] = bin_index(point, attribute_index, bin_width)
198
+ # assign each of the data points to a bin based on the given +bin_widths+,
199
+ # returns a 2-d array in attribute-major order
200
+ def assign_points_to_bins! bin_widths, bin_count
201
+ bin_widths.each_with_index do |bin_width, attribute_index|
202
+ points.each do |point|
203
+ point.bins[attribute_index] = bin_index(point, attribute_index, bin_width)
204
+ end
202
205
  end
203
206
  end
204
- end
205
207
 
206
- def bin_index point, attribute_index, bin_width
207
- minimum = @minimums[attribute_index]
208
- maximum = @maximums[attribute_index]
208
+ def bin_index point, attribute_index, bin_width
209
+ minimum = @minimums[attribute_index]
210
+ maximum = @maximums[attribute_index]
209
211
 
210
- value = point[attribute_index]
211
- index = ((value - minimum) / bin_width).floor
212
+ value = point[attribute_index]
213
+ index = ((value - minimum) / bin_width).floor
212
214
 
213
- value == maximum ? index - 1 : index
214
- end
215
+ value == maximum ? index - 1 : index
216
+ end
215
217
 
216
- def compute_minimums_and_maximums
217
- minimums = @data.first.dup
218
- maximums = @data.first.dup
218
+ def compute_minimums_and_maximums
219
+ minimums = @data.first.dup
220
+ maximums = @data.first.dup
219
221
 
220
- @data.each do |attributes|
221
- attributes.each_with_index do |attribute, attribute_index|
222
- minimums[attribute_index] = attribute if attribute < minimums[attribute_index]
223
- maximums[attribute_index] = attribute if attribute > maximums[attribute_index]
222
+ @data.each do |attributes|
223
+ attributes.each_with_index do |attribute, attribute_index|
224
+ minimums[attribute_index] = attribute if attribute < minimums[attribute_index]
225
+ maximums[attribute_index] = attribute if attribute > maximums[attribute_index]
226
+ end
224
227
  end
228
+
229
+ [minimums, maximums]
225
230
  end
226
231
 
227
- [minimums, maximums]
228
- end
232
+ # determine the widths of the bins based on +q+
233
+ def compute_bin_widths q, bin_count
234
+ (0...@data.first.size).map do |attribute_index|
235
+ (@maximums[attribute_index] - @minimums[attribute_index]) / bin_count.to_f
236
+ end
237
+ end
229
238
 
230
- # determine the widths of the bins based on +q+
231
- def compute_bin_widths q, bin_count
232
- (0...@data.first.size).map do |attribute_index|
233
- (@maximums[attribute_index] - @minimums[attribute_index]) / bin_count.to_f
239
+ # compute the number of bins for a given +q+
240
+ def compute_bin_count q
241
+ count = (@data.size / q.to_f).ceil
242
+ count < 2 ? 2 : count
234
243
  end
235
- end
236
244
 
237
- # compute the number of bins for a given +q+
238
- def compute_bin_count q
239
- count = (@data.size / q.to_f).ceil
240
- count < 2 ? 2 : count
241
245
  end
242
-
243
246
  end
data/lib/fastout/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Fastout
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fastout
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 27
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 1
10
- version: 0.0.1
9
+ - 2
10
+ version: 0.0.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jason Dew