rust 0.4 → 0.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,157 @@
1
+ require_relative '../core'
2
+
3
+ ##
4
+ # Module containing utilities for descriptive statistics.
5
+
6
module Rust
  module Descriptive
    class << self

      ##
      # Computes the arithmetic mean of the given +data+.
      #
      # Raises TypeError unless +data+ is an Array of Numeric values. An empty
      # array gives Float::NAN, because the sum (0.0) is divided by a size of 0.

      def mean(data)
        ensure_numeric_array!(data)

        data.sum.to_f / data.size
      end

      ##
      # Computes the standard deviation of the given +data+, i.e. the square
      # root of #variance. Also available as +sd+ and +stddev+.
      #
      # Raises TypeError unless +data+ is an Array of Numeric values.

      def standard_deviation(data)
        ensure_numeric_array!(data)

        Math.sqrt(variance(data))
      end
      alias :sd :standard_deviation
      alias :stddev :standard_deviation

      ##
      # Computes the variance of the given +data+, dividing the sum of squared
      # deviations by (size - 1). Also available as +var+.
      #
      # Returns Float::NAN when +data+ has fewer than two elements.
      # Raises TypeError unless +data+ is an Array of Numeric values.

      def variance(data)
        ensure_numeric_array!(data)
        return Float::NAN if data.size < 2

        center = mean(data)
        data.map { |v| (v - center) ** 2 }.sum.to_f / (data.size - 1)
      end
      alias :var :variance

      ##
      # Computes the median of the given +data+.
      #
      # Returns Float::NAN for an empty array, the middle element for an odd
      # size, and the average of the two middle elements for an even size.
      # Raises TypeError unless +data+ is an Array of Numeric values.

      def median(data)
        ensure_numeric_array!(data)
        return Float::NAN if data.empty?

        sorted = data.sort
        middle = sorted.size / 2
        return sorted[middle] if sorted.size.odd?

        (sorted[middle - 1] + sorted[middle]) / 2.0
      end

      ##
      # Sums the given +data+.
      #
      # Raises TypeError unless +data+ is an Array of Numeric values.

      def sum(data)
        ensure_numeric_array!(data)

        data.sum
      end

      ##
      # Returns the quantiles of the given +data+, given the +percentiles+ (optional).
      #
      # The result is a Hash mapping every requested percentile to its quantile.
      # A quantile that falls between two sorted elements is linearly
      # interpolated between them. For empty +data+ every quantile is +nil+.
      #
      # Raises TypeError unless both arguments are Arrays of Numeric values and
      # RuntimeError when a percentile lies outside [0, 1].

      def quantile(data, percentiles = [0.0, 0.25, 0.5, 0.75, 1.0])
        ensure_numeric_array!(data)
        ensure_numeric_array!(percentiles)
        raise "Percentiles outside the range: #{percentiles}" if percentiles.any? { |e| !e.between?(0, 1) }

        n = data.size
        sorted = data.sort

        values = percentiles.map do |p|
          # Fractional 0-based position of the quantile in the sorted data.
          # The "1 + ... - 1" form is kept as-is so that floating-point
          # rounding matches the previous implementation exactly.
          position = 1 + [n - 1, 0].max * p - 1
          low = position.floor
          high = position.ceil

          low_value = sorted[low]
          high_value = sorted[high]

          if position > low && high_value != low_value
            weight = position - low
            (1 - weight) * low_value + weight * high_value
          else
            low_value
          end
        end

        percentiles.zip(values).to_h
      end

      ##
      # Returns the outliers in +data+ using Tukey's fences, with a given +k+.
      #
      # Accepts the same options as #outliers_according_to.

      def outliers(data, k=1.5, **opts)
        outliers_according_to(data, data, k, **opts)
      end

      ##
      # Returns the outliers in +data+ using Tukey's fences, with a given +k+, with respect to different data
      # distribution (+data_distribution+).
      #
      # The fences are q1 - k * iqr and q3 + k * iqr, where q1 and q3 are the
      # 0.25 and 0.75 quantiles of +data_distribution+.
      #
      # The option +:side+ restricts the result: +:positive+ (+:pos+, +:p+, +:++)
      # keeps only the values above the upper fence, +:negative+ (+:neg+, +:n+,
      # +:-+) only the values below the lower fence. Without it, or with any
      # other value, both sides are returned, negative outliers first.

      def outliers_according_to(data, data_distribution, k=1.5, **opts)
        quartiles = quantile(data_distribution, [0.25, 0.75])
        q1 = quartiles[0.25]
        q3 = quartiles[0.75]
        iqr = q3 - q1

        positive_outliers = data.select { |d| d > q3 + iqr * k }
        negative_outliers = data.select { |d| d < q1 - iqr * k }

        side = opts[:side]
        side = side.to_sym if side

        case side
        when :positive, :pos, :p, :+
          positive_outliers
        when :negative, :neg, :n, :-
          negative_outliers
        else
          negative_outliers + positive_outliers
        end
      end

      private

      ##
      # Raises TypeError unless +data+ is an Array containing only Numeric values.

      def ensure_numeric_array!(data)
        return if data.is_a?(Array) && data.all? { |e| e.is_a?(Numeric) }

        raise TypeError, "Expecting Array of numerics"
      end
    end
  end
end
136
+
137
module Rust::RBindings

  ##
  # Arithmetic mean of +series+; forwards to Rust::Descriptive.mean.

  def mean(series)
    return Rust::Descriptive.mean(series)
  end

  ##
  # Median of +series+; forwards to Rust::Descriptive.median.

  def median(series)
    return Rust::Descriptive.median(series)
  end

  ##
  # Variance of +series+; forwards to Rust::Descriptive.variance.

  def var(series)
    return Rust::Descriptive.variance(series)
  end

  ##
  # Standard deviation of +series+; forwards to Rust::Descriptive.standard_deviation.

  def sd(series)
    return Rust::Descriptive.standard_deviation(series)
  end

  ##
  # Quantiles of +series+ for the given +percentiles+ (optional); forwards to
  # Rust::Descriptive.quantile.

  def quantile(series, percentiles = [0.0, 0.25, 0.5, 0.75, 1.0])
    return Rust::Descriptive.quantile(series, percentiles)
  end
end
@@ -1,10 +1,15 @@
1
- require 'code-assertions'
1
+ require_relative '../core'
2
2
 
3
- Rust.exclusive do
4
- Rust._eval("library(effsize)")
5
- end
3
+ Rust.prerequisite('effsize')
4
+
5
+ ##
6
+ # Module containing utilities for computing effect size statistics.
6
7
 
7
8
  module Rust::EffectSize
9
+
10
+ ##
11
+ # Effect size results.
12
+
8
13
  class Result
9
14
  attr_accessor :name
10
15
  attr_accessor :estimate
@@ -16,14 +21,23 @@ module Rust::EffectSize
16
21
  return "#{name} = #{estimate} (#{magnitude}) [#{confidence_interval.min}, #{confidence_interval.max}]"
17
22
  end
18
23
  end
19
- end
20
24
 
21
- module Rust::EffectSize::CliffDelta
22
- class << self
23
- def compute(d1, d2)
25
+ ##
26
+ # Cliff delta effect size statistics.
27
+
28
+ class CliffDelta
29
+
30
+ ##
31
+ # Computes and returns the effect size for +d1+ and +d2+.
32
+
33
+ def self.compute(d1, d2)
24
34
  raise TypeError, "Expecting Array of numerics" if !d1.is_a?(Array) || !d1.all? { |e| e.is_a?(Numeric) }
25
35
  raise TypeError, "Expecting Array of numerics" if !d2.is_a?(Array) || !d2.all? { |e| e.is_a?(Numeric) }
26
36
 
37
+ if d1.size <= 1 || d2.size <= 1
38
+ return Rust::EffectSize::Result.new
39
+ end
40
+
27
41
  Rust.exclusive do
28
42
  Rust['effsize.a'] = d1
29
43
  Rust['effsize.b'] = d2
@@ -32,23 +46,32 @@ module Rust::EffectSize::CliffDelta
32
46
 
33
47
  result = Rust::EffectSize::Result.new
34
48
  result.name = "Cliff's delta"
35
- result.estimate = Rust._pull("effsize.result$estimate")
36
- result.confidence_interval = Range.new(*Rust._pull("effsize.result$conf.int"))
37
- result.confidence_level = Rust._pull("effsize.result$conf.level")
38
- result.magnitude = Rust._pull("as.character(effsize.result$magnitude)").to_sym
49
+ result.estimate = Rust._pull("effsize.result$estimate") rescue Float::NAN
50
+ result.confidence_interval = Range.new(*Rust._pull("effsize.result$conf.int")) rescue nil
51
+ result.confidence_level = Rust._pull("effsize.result$conf.level") rescue Float::NAN
52
+ result.magnitude = Rust._pull("as.character(effsize.result$magnitude)").to_sym rescue nil
39
53
 
40
54
  return result
41
55
  end
42
56
  end
43
57
  end
44
- end
45
-
46
- module Rust::EffectSize::CohenD
47
- class << self
48
- def compute(d1, d2)
58
+
59
+ ##
60
+ # Cohen D effect size statistics.
61
+
62
+ class CohenD
63
+
64
+ ##
65
+ # Computes and returns the effect size for +d1+ and +d2+.
66
+
67
+ def self.compute(d1, d2)
49
68
  raise TypeError, "Expecting Array of numerics" if !d1.is_a?(Array) || !d1.all? { |e| e.is_a?(Numeric) }
50
69
  raise TypeError, "Expecting Array of numerics" if !d2.is_a?(Array) || !d2.all? { |e| e.is_a?(Numeric) }
51
70
 
71
+ if d1.size <= 1 || d2.size <= 1
72
+ return Rust::EffectSize::Result.new
73
+ end
74
+
52
75
  Rust.exclusive do
53
76
  Rust['effsize.a'] = d1
54
77
  Rust['effsize.b'] = d2
@@ -57,10 +80,10 @@ module Rust::EffectSize::CohenD
57
80
 
58
81
  result = Rust::EffectSize::Result.new
59
82
  result.name = "Cohen's d"
60
- result.estimate = Rust._pull("effsize.result$estimate")
61
- result.confidence_interval = Range.new(*Rust._pull("effsize.result$conf.int"))
62
- result.confidence_level = Rust._pull("effsize.result$conf.level")
63
- result.magnitude = Rust._pull("as.character(effsize.result$magnitude)").to_sym
83
+ result.estimate = Rust._pull("effsize.result$estimate") rescue Float::NAN
84
+ result.confidence_interval = Range.new(*Rust._pull("effsize.result$conf.int")) rescue nil
85
+ result.confidence_level = Rust._pull("effsize.result$conf.level") rescue Float::NAN
86
+ result.magnitude = Rust._pull("as.character(effsize.result$magnitude)").to_sym rescue nil
64
87
 
65
88
  return result
66
89
  end
@@ -0,0 +1,356 @@
1
+ require_relative '../core'
2
+
3
class Numeric

  ##
  # Distance between this number and +other+, i.e. the absolute value of
  # their difference.
  #
  # Raises TypeError when +other+ is not a Numeric.

  def _rust_prob_distance(other)
    unless other.is_a?(Numeric)
      raise TypeError, "no implicit conversion of #{other.class} into Numeric"
    end

    difference = self - other
    difference.abs
  end
end
14
+
15
class Array

  ##
  # Distance between this array and +other+: the sum of the element-wise
  # distances of the two arrays.
  #
  # Every element is converted with +to_i+ before being compared, so
  # fractional parts are ignored, and positions missing from the shorter
  # array count as 0 (+nil.to_i+).
  #
  # Raises TypeError when +other+ is not an Array.

  def _rust_prob_distance(other)
    raise TypeError, "no implicit conversion of #{other.class} into Array" unless other.is_a? Array

    longest, shortest = size > other.size ? [self, other] : [other, self]

    longest.each_with_index.sum do |element, index|
      element.to_i._rust_prob_distance(shortest[index].to_i)
    end
  end
end
33
+
34
class String

  ##
  # Distance between this string and +other+, computed as the distance
  # between the byte arrays of the two strings.
  #
  # Raises TypeError when +other+ is not a String.

  def _rust_prob_distance(other)
    unless other.is_a?(String)
      raise TypeError, "no implicit conversion of #{other.class} into String"
    end

    own_bytes = bytes
    other_bytes = other.bytes
    own_bytes._rust_prob_distance(other_bytes)
  end
end
45
+
46
+ module Rust
47
+
48
+ ##
49
+ # Represents a slice of a random variable, for which no check is made in terms of cumulative probability.
50
+
51
class RandomVariableSlice

  ##
  # Creates a new slice of random variable. +values+ is a hash of values associated with their probabilities.
  #
  # Raises TypeError when +values+ is not a Hash.

  def initialize(values)
    raise TypeError, "Expected Hash" unless values.is_a?(Hash)

    @values = values
  end

  ##
  # Gets the probability of a value +v+. If +v+ is not specified, returns the cumulative probability of the whole
  # slice. A value that is not part of the slice gives +nil+.

  def probability(v=nil)
    return @values.values.sum unless v

    @values[v]
  end

  ##
  # Returns the value with the maximum probability, or +nil+ when the slice
  # is empty.

  def ml
    likeliest = @values.max_by { |_, probability| probability }
    likeliest && likeliest[0]
  end

  ##
  # Returns the expected value for this slice, i.e. the sum of every value
  # multiplied by its probability.

  def expected
    @values.map { |value, probability| value * probability }.sum
  end

  ##
  # Returns a slice with the values that are greater than +n+.

  def >(n)
    so_that { |k| k > n }
  end

  ##
  # Returns a slice with the values that are greater than or equal to +n+.

  def >=(n)
    so_that { |k| k >= n }
  end

  ##
  # Returns a slice with the values that are lower than +n+.

  def <(n)
    so_that { |k| k < n }
  end

  ##
  # Returns a slice with the values that are lower than or equal to +n+.

  def <=(n)
    so_that { |k| k <= n }
  end

  ##
  # Returns a slice with the value +n+.
  #
  # Note that this replaces the usual equality test: the result is a slice,
  # not a boolean.

  def ==(n)
    so_that { |k| k == n }
  end

  ##
  # Returns a slice with the values between +a+ and +b+ (both included).

  def between(a, b)
    # Comparable defines between? (with the question mark); there is no
    # plain "between" on the values.
    so_that { |k| k.between?(a, b) }
  end

  ##
  # Returns a slice with the values for which the given block returns true.

  def so_that
    RandomVariableSlice.new(@values.select { |k, _| yield(k) })
  end
end
137
+
138
+ ##
139
+ # Represents a random variable. The cumulative probability of the values must equal 1.
140
+
141
class RandomVariable < RandomVariableSlice
  EPSILON = 1e-7

  attr_reader :values

  ##
  # Creates a new random variable. +values+ is a hash of values associated with their probabilities.
  # +exact+ indicates whether this variable, when combined with others, should force to keep all the values, even
  # the most unlikely ones. If this is +false+ (default), the most improbable values (lower than EPSILON) are
  # removed for efficiency reasons.
  #
  # Raises RuntimeError when a probability lies outside [0, 1] or when the
  # probabilities do not add up to 1 within a tolerance of EPSILON.
  #
  # The given hash is stored as-is and may be modified by #approx!.

  def initialize(values = {0 => 1.0}, exact = false)
    @values = values
    @exact = exact

    raise "All the probabilities should be in the range [0, 1]" unless @values.values.all? { |v| v.between?(0, 1) }
    raise "The cumulative probability must be exactly 1 (#{@values.values.sum} instead)" unless @values.values.sum.between?(1 - EPSILON, 1 + EPSILON)

    approx!
  end

  ##
  # Returns the probability of value +v+ (0.0 when +v+ is not a value of this variable).

  def probability(v)
    @values[v].to_f
  end

  ##
  # Returns a new random variable which represents the sum of this and the +other+ random variable.

  def +(other)
    combine(other) { |mine, theirs| mine + theirs }
  end

  ##
  # Based on the type of +arg+, either mul (product with another random variable) or rep (repeated sum) is called.
  #
  # Raises RuntimeError when +arg+ is neither an Integer nor a RandomVariable.

  def *(arg)
    case arg
    when Integer
      rep(arg)
    when RandomVariable
      mul(arg)
    else
      raise "The argument must be an Integer or a RandomVariable"
    end
  end

  ##
  # Returns a new random variable which represents the product of this and the +other+ random variable.

  def mul(other)
    combine(other) { |mine, theirs| mine * theirs }
  end

  ##
  # Returns a new random variable which represents the sum of this random variable with itself +times+ times.
  #
  # NOTE(review): for +times+ lower than 1 no sum is performed and this very
  # variable is returned; confirm whether callers rely on that.

  def rep(times)
    repeated = self
    (times - 1).times { repeated += self }

    repeated
  end

  ##
  # Makes sure that the operations yield all the values, even the most unlikely ones.

  def exact!
    @exact = true
  end

  ##
  # If this variable is not exact, the values with probability lower than or
  # equal to EPSILON are removed. The probability of each removed value is
  # added to the remaining value nearest to it (according to
  # +_rust_prob_distance+).

  def approx!
    return if @exact

    # Collect first: the hash must not be modified while it is iterated.
    negligible = @values.select { |_, probability| probability <= EPSILON }.keys

    negligible.each do |v|
      probability = @values.delete(v)
      nearest = @values.keys.min_by { |k| k._rust_prob_distance(v) }
      @values[nearest] += probability
    end
  end

  ##
  # Returns a random value, according to the data distribution.

  def extract
    threshold = rand
    ordered = @values.sort_by { |value, _| value }

    cumulative = 0
    ordered.each do |value, probability|
      cumulative += probability

      return value if cumulative >= threshold
    end

    # The probabilities only add up to 1 within EPSILON, so the running total
    # can stay marginally below the threshold: fall back to the last value
    # instead of leaking the result of the iteration.
    ordered.last&.first
  end

  ##
  # Creates a random variable by partially specifying the values through +hash+. The remaining probability is
  # attributed to +key+ (0, by default). The given +hash+ is left untouched.

  def self.complete(hash, key=0)
    RandomVariable.new(hash.merge(key => 1 - hash.values.sum))
  end

  private

  ##
  # Builds the random variable obtained by applying the given block to every
  # pair of values of this and the +other+ variable. Each pair contributes the
  # product of the two probabilities to the resulting value.

  def combine(other)
    combined = {}

    @values.each do |my_key, my_value|
      other.values.each do |other_key, other_value|
        key = yield(my_key, other_key)

        combined[key] = combined.fetch(key, 0.0) + (my_value * other_value)
      end
    end

    RandomVariable.new(combined, @exact)
  end
end
276
+
277
+ ##
278
+ # Represents a uniform random variable.
279
+
280
class UniformRandomVariable < RandomVariable

  ##
  # Creates a random variable in which each of the given +values+ has the
  # same probability (1 / values.size). +exact+ is passed on to
  # RandomVariable unchanged.

  def initialize(values, exact = false)
    distribution = values.map do |value|
      [value, 1.0 / values.size]
    end

    super(distribution.to_h, exact)
  end
end
289
+
290
+ ##
291
+ # Module that contains utilities for handling random variables.
292
+
293
module Probabilities

  ##
  # Computes the cumulative probability of the slice +v+.
  #
  # Raises RuntimeError when +v+ is not a RandomVariableSlice, and also when
  # it is a full RandomVariable: only plain slices are accepted here.

  def P(v)
    raise "Cannot compute the probability of a #{v.class}" unless v.is_a? RandomVariableSlice
    raise "Cannot compute the probability of a random variable" if v.is_a? RandomVariable

    v.probability
  end

  ##
  # Computes the expected value of the random variable (or slice) +v+.
  #
  # Raises RuntimeError when +v+ is not a RandomVariableSlice.

  def E(v)
    raise "Cannot compute the expected value of a #{v.class}" unless v.is_a? RandomVariableSlice

    v.expected
  end
end
318
+
319
+ ##
320
+ # Module containing examples of commonly-used random variables.
321
+
322
module RandomVariableExamples

  # Probability of each lowercase letter "a".."z"; the values add up to 1.
  ENGLISH_ALPHABET = RandomVariable.new(
    {
      "a" => 0.08167, "b" => 0.01492, "c" => 0.02782, "d" => 0.04253,
      "e" => 0.12703, "f" => 0.02228, "g" => 0.02015, "h" => 0.06094,
      "i" => 0.06966, "j" => 0.00153, "k" => 0.00772, "l" => 0.04025,
      "m" => 0.02406, "n" => 0.06749, "o" => 0.07507, "p" => 0.01929,
      "q" => 0.00095, "r" => 0.05987, "s" => 0.06327, "t" => 0.09056,
      "u" => 0.02758, "v" => 0.00978, "w" => 0.02360, "x" => 0.00150,
      "y" => 0.01974, "z" => 0.00074
    }
  )

  # Uniform variable over the integers 1 to 6.
  DICE = UniformRandomVariable.new((1..6).to_a)

  # Uniform variable over "h" and "t".
  COIN = UniformRandomVariable.new(%w[h t])
end
356
+ end