rust 0.4 → 0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,157 @@
1
+ require_relative '../core'
2
+
3
+ ##
4
+ # Module containing utilities for descriptive statistics.
5
+
6
+ module Rust::Descriptive
7
+ class << self
8
+
9
##
# Computes the arithmetic mean of the given +data+.
#
# +data+ must be an Array containing only Numeric elements, otherwise a
# TypeError is raised. The sum is converted to Float before dividing, so an
# empty array yields NaN.

def mean(data)
  numeric_array = data.is_a?(Array) && data.all? { |item| item.is_a?(Numeric) }
  raise TypeError, "Expecting Array of numerics" unless numeric_array

  total = data.sum
  total.to_f / data.size
end
17
+
18
##
# Computes the standard deviation of the given +data+, i.e. the square root
# of +variance(data)+.
#
# Raises a TypeError unless +data+ is an Array of Numeric elements.
# Also available as +sd+ and +stddev+.

def standard_deviation(data)
  numeric_array = data.is_a?(Array) && data.all? { |item| item.is_a?(Numeric) }
  raise TypeError, "Expecting Array of numerics" unless numeric_array

  spread = variance(data)
  Math.sqrt(spread)
end
alias sd standard_deviation
alias stddev standard_deviation
28
+
29
##
# Computes the sample variance of the given +data+ (the sum of squared
# deviations from the mean divided by +data.size - 1+).
#
# Raises a TypeError unless +data+ is an Array of Numeric elements.
# Returns NaN when +data+ has fewer than two elements. Also available as +var+.

def variance(data)
  numeric_array = data.is_a?(Array) && data.all? { |item| item.is_a?(Numeric) }
  raise TypeError, "Expecting Array of numerics" unless numeric_array

  count = data.size
  return Float::NAN if count < 2

  center = mean(data)
  squared_deviations = data.sum { |value| (value - center) ** 2 }
  squared_deviations.to_f / (count - 1)
end
alias var variance
40
+
41
##
# Computes the median of the given +data+.
#
# Raises a TypeError unless +data+ is an Array of Numeric elements.
# Returns NaN for an empty array, the middle element (unchanged) for an odd
# number of elements, and the Float average of the two middle elements
# otherwise.

def median(data)
  numeric_array = data.is_a?(Array) && data.all? { |item| item.is_a?(Numeric) }
  raise TypeError, "Expecting Array of numerics" unless numeric_array

  ordered = data.sort
  count = ordered.size
  return Float::NAN if count == 0

  middle = count / 2
  return ordered[middle] if count.odd?

  (ordered[middle - 1] + ordered[middle]) / 2.0
end
57
+
58
##
# Sums the given +data+.
#
# Raises a TypeError unless +data+ is an Array of Numeric elements; otherwise
# returns +data.sum+ (0 for an empty array).

def sum(data)
  unless data.is_a?(Array) && data.all? { |item| item.is_a?(Numeric) }
    raise TypeError, "Expecting Array of numerics"
  end

  data.sum
end
66
+
67
##
# Returns the quantiles of the given +data+, given the +percentiles+ (optional).
#
# +data+ and +percentiles+ must both be Arrays of Numeric elements (TypeError
# otherwise) and every percentile must lie in [0, 1] (RuntimeError otherwise).
#
# The result is a Hash that maps each requested percentile to its quantile.
# A quantile is located at the fractional index <code>(n - 1) * p</code> of
# the sorted data and linearly interpolated between the two neighbouring
# elements. When the index is integral, or both neighbours are equal, the
# element itself is returned unchanged.
#
# For empty +data+ every quantile is NaN, like the median of an empty array.

def quantile(data, percentiles = [0.0, 0.25, 0.5, 0.75, 1.0])
  raise TypeError, "Expecting Array of numerics" if !data.is_a?(Array) || !data.all? { |e| e.is_a?(Numeric) }
  raise TypeError, "Expecting Array of numerics" if !percentiles.is_a?(Array) || !percentiles.all? { |e| e.is_a?(Numeric) }
  raise "Percentiles outside the range: #{percentiles}" if percentiles.any? { |e| !e.between?(0, 1) }

  sorted = data.sort
  last_index = [sorted.size - 1, 0].max

  values = percentiles.map do |percentile|
    # No element to pick from: report "not a number" instead of nil.
    next Float::NAN if sorted.empty?

    position = last_index * percentile
    low_index = position.floor
    high_index = position.ceil
    weight = position - low_index

    low = sorted[low_index]
    high = sorted[high_index]

    # Interpolate only when it can change the result, so that exact hits keep
    # the type of the original element.
    if weight > 0 && high != low
      (1 - weight) * low + weight * high
    else
      low
    end
  end

  percentiles.zip(values).to_h
end
101
+
102
##
# Returns the outliers in +data+ using Tukey's fences, with a given +k+.
#
# This delegates to +outliers_according_to+, using +data+ itself as the
# reference distribution; +opts+ is forwarded unchanged.

def outliers(data, k=1.5, **opts)
  reference_distribution = data
  return outliers_according_to(data, reference_distribution, k, **opts)
end
108
+
109
##
# Returns the outliers in +data+ using Tukey's fences, with a given +k+, with respect to different data
# distribution (+data_distribution+).
#
# The fences are <code>q1 - k * iqr</code> and <code>q3 + k * iqr</code>,
# where q1 and q3 are the 0.25 and 0.75 quantiles of +data_distribution+.
#
# The optional <code>:side</code> option restricts the result:
# * <code>:positive</code>, <code>:pos</code>, <code>:p</code>, <code>:+</code>: only values above the upper fence
# * <code>:negative</code>, <code>:neg</code>, <code>:n</code>, <code>:-</code>: only values below the lower fence
# Without the option, or with any other value, the values below the lower
# fence are returned, followed by the ones above the upper fence.

def outliers_according_to(data, data_distribution, k=1.5, **opts)
  quartiles = Rust::Descriptive.quantile(data_distribution, [0.25, 0.75])
  q1 = quartiles[0.25]
  q3 = quartiles[0.75]
  iqr = q3 - q1

  lower_fence = q1 - iqr * k
  upper_fence = q3 + iqr * k

  negative_outliers = data.select { |d| d < lower_fence }
  positive_outliers = data.select { |d| d > upper_fence }

  return negative_outliers + positive_outliers unless opts[:side]

  case opts[:side].to_sym
  when :positive, :pos, :p, :+
    positive_outliers
  when :negative, :neg, :n, :-
    negative_outliers
  else
    negative_outliers + positive_outliers
  end
end
134
+ end
135
+ end
136
+
137
##
# Shortcuts for the descriptive statistics, each forwarding its arguments to
# the method of Rust::Descriptive it names.

module Rust::RBindings

  ##
  # Forwards to Rust::Descriptive.mean.

  def mean(series)
    return Rust::Descriptive.mean(series)
  end

  ##
  # Forwards to Rust::Descriptive.median.

  def median(series)
    return Rust::Descriptive.median(series)
  end

  ##
  # Forwards to Rust::Descriptive.variance.

  def var(series)
    return Rust::Descriptive.variance(series)
  end

  ##
  # Forwards to Rust::Descriptive.standard_deviation.

  def sd(series)
    return Rust::Descriptive.standard_deviation(series)
  end

  ##
  # Forwards to Rust::Descriptive.quantile, with the same default +percentiles+.

  def quantile(series, percentiles = [0.0, 0.25, 0.5, 0.75, 1.0])
    return Rust::Descriptive.quantile(series, percentiles)
  end
end
@@ -1,10 +1,15 @@
1
- require 'code-assertions'
1
+ require_relative '../core'
2
2
 
3
- Rust.exclusive do
4
- Rust._eval("library(effsize)")
5
- end
3
+ Rust.prerequisite('effsize')
4
+
5
+ ##
6
+ # Module containing utilities for computing effect size statistics.
6
7
 
7
8
  module Rust::EffectSize
9
+
10
+ ##
11
+ # Effect size results.
12
+
8
13
  class Result
9
14
  attr_accessor :name
10
15
  attr_accessor :estimate
@@ -16,14 +21,23 @@ module Rust::EffectSize
16
21
  return "#{name} = #{estimate} (#{magnitude}) [#{confidence_interval.min}, #{confidence_interval.max}]"
17
22
  end
18
23
  end
19
- end
20
24
 
21
- module Rust::EffectSize::CliffDelta
22
- class << self
23
- def compute(d1, d2)
25
+ ##
26
+ # Cliff delta effect size statistics.
27
+
28
+ class CliffDelta
29
+
30
+ ##
31
+ # Computes and returns the effect size for +d1+ and +d2+.
32
+
33
+ def self.compute(d1, d2)
24
34
  raise TypeError, "Expecting Array of numerics" if !d1.is_a?(Array) || !d1.all? { |e| e.is_a?(Numeric) }
25
35
  raise TypeError, "Expecting Array of numerics" if !d2.is_a?(Array) || !d2.all? { |e| e.is_a?(Numeric) }
26
36
 
37
+ if d1.size <= 1 || d2.size <= 1
38
+ return Rust::EffectSize::Result.new
39
+ end
40
+
27
41
  Rust.exclusive do
28
42
  Rust['effsize.a'] = d1
29
43
  Rust['effsize.b'] = d2
@@ -32,23 +46,32 @@ module Rust::EffectSize::CliffDelta
32
46
 
33
47
  result = Rust::EffectSize::Result.new
34
48
  result.name = "Cliff's delta"
35
- result.estimate = Rust._pull("effsize.result$estimate")
36
- result.confidence_interval = Range.new(*Rust._pull("effsize.result$conf.int"))
37
- result.confidence_level = Rust._pull("effsize.result$conf.level")
38
- result.magnitude = Rust._pull("as.character(effsize.result$magnitude)").to_sym
49
+ result.estimate = Rust._pull("effsize.result$estimate") rescue Float::NAN
50
+ result.confidence_interval = Range.new(*Rust._pull("effsize.result$conf.int")) rescue nil
51
+ result.confidence_level = Rust._pull("effsize.result$conf.level") rescue Float::NAN
52
+ result.magnitude = Rust._pull("as.character(effsize.result$magnitude)").to_sym rescue nil
39
53
 
40
54
  return result
41
55
  end
42
56
  end
43
57
  end
44
- end
45
-
46
- module Rust::EffectSize::CohenD
47
- class << self
48
- def compute(d1, d2)
58
+
59
+ ##
60
+ # Cohen D effect size statistics.
61
+
62
+ class CohenD
63
+
64
+ ##
65
+ # Computes and returns the effect size for +d1+ and +d2+.
66
+
67
+ def self.compute(d1, d2)
49
68
  raise TypeError, "Expecting Array of numerics" if !d1.is_a?(Array) || !d1.all? { |e| e.is_a?(Numeric) }
50
69
  raise TypeError, "Expecting Array of numerics" if !d2.is_a?(Array) || !d2.all? { |e| e.is_a?(Numeric) }
51
70
 
71
+ if d1.size <= 1 || d2.size <= 1
72
+ return Rust::EffectSize::Result.new
73
+ end
74
+
52
75
  Rust.exclusive do
53
76
  Rust['effsize.a'] = d1
54
77
  Rust['effsize.b'] = d2
@@ -57,10 +80,10 @@ module Rust::EffectSize::CohenD
57
80
 
58
81
  result = Rust::EffectSize::Result.new
59
82
  result.name = "Cohen's d"
60
- result.estimate = Rust._pull("effsize.result$estimate")
61
- result.confidence_interval = Range.new(*Rust._pull("effsize.result$conf.int"))
62
- result.confidence_level = Rust._pull("effsize.result$conf.level")
63
- result.magnitude = Rust._pull("as.character(effsize.result$magnitude)").to_sym
83
+ result.estimate = Rust._pull("effsize.result$estimate") rescue Float::NAN
84
+ result.confidence_interval = Range.new(*Rust._pull("effsize.result$conf.int")) rescue nil
85
+ result.confidence_level = Rust._pull("effsize.result$conf.level") rescue Float::NAN
86
+ result.magnitude = Rust._pull("as.character(effsize.result$magnitude)").to_sym rescue nil
64
87
 
65
88
  return result
66
89
  end
@@ -0,0 +1,356 @@
1
+ require_relative '../core'
2
+
3
class Numeric

  ##
  # Computes the distance between this and another number, i.e. the absolute
  # value of their difference.
  #
  # Raises a TypeError when +other+ is not a Numeric.

  def _rust_prob_distance(other)
    if !other.is_a?(Numeric)
      raise TypeError, "no implicit conversion of #{other.class} into Numeric"
    end

    difference = self - other
    difference.abs
  end
end
14
+
15
class Array

  ##
  # Computes the distance between this and another array: the sum of the
  # element-wise distances, after converting every element with +to_i+.
  #
  # When the arrays differ in length, the missing elements of the shorter one
  # count as 0 (+nil.to_i+). Raises a TypeError when +other+ is not an Array.

  def _rust_prob_distance(other)
    unless other.is_a?(Array)
      raise TypeError, "no implicit conversion of #{other.class} into Array"
    end

    if size > other.size
      longer, shorter = self, other
    else
      longer, shorter = other, self
    end

    longer.each_index.sum do |position|
      longer[position].to_i._rust_prob_distance(shorter[position].to_i)
    end
  end
end
33
+
34
class String

  ##
  # Computes the distance between this and another string, defined as the
  # distance between their byte arrays.
  #
  # Raises a TypeError when +other+ is not a String.

  def _rust_prob_distance(other)
    unless other.is_a?(String)
      raise TypeError, "no implicit conversion of #{other.class} into String"
    end

    own_bytes = bytes
    other_bytes = other.bytes
    own_bytes._rust_prob_distance(other_bytes)
  end
end
45
+
46
+ module Rust
47
+
48
##
# Represents a slice of a random variable, for which no check is made in terms of cumulative probability.

class RandomVariableSlice

  ##
  # Creates a new slice of random variable. +values+ is a hash of values associated with their probabilities.
  # Raises a TypeError if +values+ is not a Hash.

  def initialize(values)
    raise TypeError, "Expected Hash" unless values.is_a?(Hash)

    @values = values
  end

  ##
  # Gets the probability of a value +v+. If +v+ is not specified, returns the cumulative probability of the whole
  # slice. Returns +nil+ when +v+ is not part of the slice.

  def probability(v=nil)
    return @values.values.sum unless v

    @values[v]
  end

  ##
  # Returns the value with the maximum probability.

  def ml
    @values.max_by { |k, v| v }[0]
  end

  ##
  # Returns the expected value for this slice (the sum of each value multiplied by its probability).

  def expected
    @values.map { |k, v| k*v }.sum
  end

  ##
  # Returns a slice with the values that are greater than +n+.

  def >(n)
    self.so_that { |k| k > n }
  end

  ##
  # Returns a slice with the values that are greater than or equal to +n+.

  def >=(n)
    self.so_that { |k| k >= n }
  end

  ##
  # Returns a slice with the values that are lower than +n+.

  def <(n)
    self.so_that { |k| k < n }
  end

  ##
  # Returns a slice with the values that are lower than or equal to +n+.

  def <=(n)
    self.so_that { |k| k <= n }
  end

  ##
  # Returns a slice with the value +n+. Note that this replaces the usual equality test: the result is a slice,
  # not a boolean.

  def ==(n)
    self.so_that { |k| k == n }
  end

  ##
  # Returns a slice with the values between +a+ and +b+ (both included).

  def between(a, b)
    # Comparable provides between? (with the question mark); there is no "between" method on the values.
    self.so_that { |k| k.between?(a, b) }
  end

  ##
  # Returns a slice with the values for which the given block returns true.

  def so_that
    RandomVariableSlice.new(@values.select { |k, v| yield(k) })
  end
end
137
+
138
##
# Represents a random variable. The cumulative probability of the values must equal 1.

class RandomVariable < RandomVariableSlice
  # Tolerance on the cumulative probability, and threshold at or below which a value is dropped by approx!.
  EPSILON = 1e-7

  attr_reader :values

  ##
  # Creates a new random variable. +values+ is a hash of values associated with their probabilities.
  # +exact+ indicates whether this variable, when combined with others, should force to keep all the values, even
  # the most unlikely ones. If this is +false+ (default), the most improbable values (lower than EPSILON) are
  # removed for efficiency reasons.
  #
  # Raises a RuntimeError if a probability is outside [0, 1] or if the probabilities do not sum to 1 (within
  # EPSILON). The given hash is stored as is (not copied), so approx! may remove entries from it.

  def initialize(values = {0 => 1.0}, exact = false)
    @values = values
    @exact = exact

    raise "All the probabilities should be in the range [0, 1]" unless @values.values.all? { |v| v.between? 0, 1 }
    raise "The cumulative probability must be exactly 1 (#{@values.values.sum} instead)" unless @values.values.sum.between? 1-EPSILON, 1+EPSILON

    approx!
  end

  ##
  # Returns the probability of value +v+ (0.0 if +v+ is not a possible value).

  def probability(v)
    return @values[v].to_f
  end

  ##
  # Returns a new random variable which represents the sum of this and the +other+ random variable.

  def +(other)
    return combine_with(other) { |mine, theirs| mine + theirs }
  end

  ##
  # Based on the type of +arg+, either mul (product with another random variable) or rep (repeated sum) is called.
  # Raises a RuntimeError for any other type of argument.

  def *(arg)
    if arg.is_a? Integer
      return rep(arg)
    elsif arg.is_a? RandomVariable
      return mul(arg)
    else
      raise "The argument must be an Integer or a RandomVariable"
    end
  end

  ##
  # Returns a new random variable which represents the product of this and the +other+ random variable.

  def mul(other)
    return combine_with(other) { |mine, theirs| mine * theirs }
  end

  ##
  # Returns a new random variable which represents the sum of this random variable with itself +times+ times.
  # When +times+ is 1 or lower, no sum is performed and this variable itself is returned.

  def rep(times)
    rv = self
    (times-1).times do
      rv += self
    end

    return rv
  end

  ##
  # Makes sure that the operations yield all the values, even the most unlikely ones.

  def exact!
    @exact = true
  end

  ##
  # If this variable is not exact, the values with probability lower than or equal to EPSILON are removed. The
  # probability of each removed value is added to the nearest remaining value (according to _rust_prob_distance).

  def approx!
    return if @exact

    to_delete = []
    @values.each do |v, probability|
      to_delete.push v if probability <= EPSILON
    end

    to_delete.each do |v|
      probability = @values.delete v
      nearest = @values.keys.min_by { |k| k._rust_prob_distance v }
      @values[nearest] += probability
    end
  end

  ##
  # Returns a random value, according to the data distribution.

  def extract
    threshold = rand

    cumulative = 0
    ordered = @values.sort_by { |key, _| key }
    ordered.each do |key, prob|
      cumulative += prob

      return key if cumulative >= threshold
    end

    # The probabilities may sum to slightly less than 1 (see EPSILON), so the loop can end without a match:
    # the last value is the correct pick in that case.
    last_pair = ordered.last
    return last_pair && last_pair[0]
  end

  ##
  # Creates a random variable by partially specifying the values through +hash+. The remaining probability is
  # attributed to +key+ (0, by default). If +key+ is already in +hash+, the remaining probability is added to the
  # one specified for it. The given +hash+ is left untouched.

  def self.complete(hash, key=0)
    remaining = 1 - hash.values.sum

    completed = hash.dup
    completed[key] = hash.key?(key) ? hash[key] + remaining : remaining
    return RandomVariable.new(completed)
  end

  private

  ##
  # Builds the random variable whose values result from applying the given block to every pair of values of this
  # and the +other+ variable; the probabilities of the pairs yielding the same result are summed.

  def combine_with(other)
    new_hash = {}

    @values.each do |my_key, my_value|
      other.values.each do |other_key, other_value|
        combined_key = yield(my_key, other_key)

        new_hash[combined_key] = new_hash[combined_key].to_f + (my_value * other_value)
      end
    end

    return RandomVariable.new(new_hash, @exact)
  end
end
276
+
277
##
# Represents a uniform random variable.

class UniformRandomVariable < RandomVariable

  ##
  # Creates random variables for which all the +values+ have the same probability (1 / values.size).
  # +exact+ is passed on to RandomVariable.

  def initialize(values, exact = false)
    pairs = values.map do |value|
      [value, 1.0 / values.size]
    end

    super(pairs.to_h, exact)
  end
end
289
+
290
##
# Module that contains utilities for handling random variables.

module Probabilities

  ##
  # Computes the probability of the random variable slice +v+, i.e. its cumulative probability.
  #
  # Raises a RuntimeError when +v+ is a whole RandomVariable or is not a RandomVariableSlice at all.

  def P(v)
    raise "Cannot compute the probability of a #{v.class}" unless v.is_a? RandomVariableSlice
    raise "Cannot compute the probability of a random variable" if v.is_a? RandomVariable

    return v.probability
  end

  ##
  # Computes the expected value of the random variable +v+.
  #
  # Raises a RuntimeError when +v+ is not a RandomVariableSlice.

  def E(v)
    raise "Cannot compute the expected value of a #{v.class}" unless v.is_a? RandomVariableSlice

    return v.expected
  end
end
318
+
319
##
# Module containing examples of commonly-used random variables.

module RandomVariableExamples
  # Lowercase letters "a" to "z", each associated with the probability listed here.
  ENGLISH_ALPHABET = RandomVariable.new({
    "a" => 0.08167, "b" => 0.01492, "c" => 0.02782, "d" => 0.04253,
    "e" => 0.12703, "f" => 0.02228, "g" => 0.02015, "h" => 0.06094,
    "i" => 0.06966, "j" => 0.00153, "k" => 0.00772, "l" => 0.04025,
    "m" => 0.02406, "n" => 0.06749, "o" => 0.07507, "p" => 0.01929,
    "q" => 0.00095, "r" => 0.05987, "s" => 0.06327, "t" => 0.09056,
    "u" => 0.02758, "v" => 0.00978, "w" => 0.02360, "x" => 0.00150,
    "y" => 0.01974, "z" => 0.00074
  })

  # Uniform variable over the values 1 to 6.
  DICE = UniformRandomVariable.new((1..6).to_a)

  # Uniform variable over the values "h" and "t".
  COIN = UniformRandomVariable.new(%w[h t])
end
356
+ end