viral_seq 0.3.2 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,401 +1,149 @@
1
- # lib/math.rb
2
-
3
- # math and statistic functions
4
- # inlcuding the following methods
5
- # ViralSeq::count
6
- # ViralSeq::count_percentage
7
- # ViralSeq::poisson_distribution
8
- # ViralSeq::r_binom_CI
9
- # Enumerable#median
10
- # Enumerable#sum
11
- # Enumerable#mean
12
- # Enumerable#sample_variance
13
- # Enumerable#stdev
14
- # Enumerable#upper_quartile
15
- # Enumerable#lower_quartile
16
- # Integer#!
17
- # Rubystats::FishersExactTest
18
- # RandomGaussian::new
19
- # RandomGaussian#rand
20
1
 
21
2
  module ViralSeq
22
3
 
23
- # count elements in a array, return a hash of {:element1 => number1, :element2 => number2, ...}
24
- # =Usage
25
- # array = %w{cat dog monkey cat cat cat monkey}
26
- # ViralSeq.count(array)
27
- # => {"cat"=>4, "dog"=>1, "monkey"=>2}
28
-
29
- def self.count(array)
30
- hash = Hash.new(0)
31
- array.each do |element|
32
- hash[element] +=1
33
- end
34
- return hash
35
- end
4
+ # math functions required for ViralSeq
36
5
 
37
- # count elements in a array, return a hash of {:element1 => frequency1, :element2 => frequency2, ...}
38
- # default decimal as 2
39
- # =Usage
40
- # array = %w{cat dog monkey cat cat cat monkey}
41
- # ViralSeq.count_percentage(array)
42
- # => {"cat"=>0.57, "dog"=>0.14, "monkey"=>0.29}
43
-
44
- def self.count_percentage(array,decimal = 2)
45
- hash1 = Hash.new(0)
46
- array.each do |element|
47
- hash1[element] += 1
48
- end
49
- total_elements = array.size
50
- hash2 = Hash.new(0)
51
- hash1.each do |key,value|
52
- hash2[key] = (value/total_elements.to_f).round(decimal)
53
- end
54
- return hash2
55
- end
56
-
57
- # poisson distribution. input lambda and maximum k, return a hash with keys as k
58
- # default k value is 5, meaning calculate up to 5 events.
59
- #
60
- # Poisson Distribution (https://en.wikipedia.org/wiki/Poisson_distribution)
61
- # An event can occur 0, 1, 2, … times in an interval.
62
- # The average number of events in an interval is designated λ (lambda).
63
- # λ is the event rate, also called the rate parameter.
64
- # The probability of observing k events in an interval is given by the equation
65
- #
66
- # P(k events in interval) = e^(-λ) * λ^k / k!
67
- #
68
- # λ is the average number of events per interval
69
- # e is the number 2.71828... (Euler's number) the base of the natural logarithms
70
- # k takes values 0, 1, 2, …
71
- # k! = k × (k − 1) × (k − 2) × … × 2 × 1 is the factorial of k.
72
- #
73
- # =USAGE
74
- # # We assume the mutaiton rate is 0.005 (event rate λ),
75
- # # we would like to calculate the probablity of 3 mutations on one sequence
76
- # prob_hash = ViralSeq::poisson_distribution(0.005)
77
- # => {0=>0.9950124791926823, 1=>0.004975062395963412, 2=>1.243765598990853e-05, 3=>2.072942664984755e-08, 4=>2.5911783312309436e-11, 5=>2.591178331230944e-14}
78
- # prob_hash[3]
79
- # => 2.072942664984755e-08
80
-
81
- def self.poisson_distribution(rate,k = 5)
82
- out_hash = {}
83
- (0..k).each do |n|
84
- p = (rate**n * Math::E**(-rate))/!n
85
- out_hash[n] = p
86
- end
87
- return out_hash
88
- end
6
+ module Math
89
7
 
8
+ # Generate values from a normal (Gaussian) distribution with given mean and standard deviation
9
+ # @see http://en.wikipedia.org/wiki/Box-Muller_transform Wikipedia explanation
90
10
 
91
- # require R pre-installed
92
- # calculate binomial 95% confidence intervals by R. refer to R function binom.test
93
- # input number x and n, return an array as [lower_interval, upper_interval]
94
- #
95
- # =USAGE
96
- # # mutation M184V found in 3 out of 923 sequences, the 95% confidence interval is
97
- # ViralSeq.r_binom_CI(3, 923)
98
- # => [0.02223, 0.19234]
99
- #
100
- def self.r_binom_CI(x= 0, n= 0)
101
- r_output = `Rscript -e 'binom.test(#{x},#{n})$conf.int[1];binom.test(#{x},#{n})$conf.int[2]'`
102
- lines = r_output.split "\n"
103
- low = lines[0].chomp[4..-1].to_f
104
- high = lines[1].chomp[4..-1].to_f
105
- return [low.round(5), high.round(5)]
106
- end
11
+ class RandomGaussian
107
12
 
108
- end
109
-
110
- # statistic methods
111
- # :median :sum :mean :sample_variance :stdev :upper_quartile :lower_quartile
112
- # =USAGE
113
- # array = [1,2,3,4,5,6,7,8,9,10]
114
- # array.median
115
- # => 5.5
116
- # array.sum
117
- # => 55
118
- # array.mean
119
- # => 5.5
120
- # array.sample_variance
121
- # => 9.166666666666666
122
- # array.stdev
123
- # => 3.0276503540974917
124
- # array.upper_quartile
125
- # => 7.5
126
- # array.lower_quartile
127
- # => 3.5
128
-
129
- module Enumerable
130
- def median
131
- len = self.length
132
- sorted = self.sort
133
- len % 2 == 1 ? sorted[len/2] : (sorted[len/2 - 1] + sorted[len/2]).to_f / 2
134
- end
135
-
136
- def sum
137
- self.inject(0){|accum, i| accum + i }
138
- end
139
-
140
- def mean
141
- self.sum/self.length.to_f
142
- end
143
-
144
- def sample_variance
145
- m = self.mean
146
- sum = self.inject(0){|accum, i| accum + (i-m)**2 }
147
- sum/(self.length - 1).to_f
148
- end
149
-
150
- def stdev
151
- return Math.sqrt(self.sample_variance)
152
- end
153
-
154
- def upper_quartile
155
- return nil if self.empty?
156
- sorted_array = self.sort
157
- u = (0.25*(3*sorted_array.length))
158
- if (u-u.truncate).is_a?(Integer)
159
- return sorted_array[(u-u.truncate)-1]
160
- else
161
- sample = sorted_array[u.truncate.abs-1]
162
- sample1 = sorted_array[(u.truncate.abs)]
163
- return sample+((sample1-sample)*(u-u.truncate))
164
- end
165
- end
13
+ # generate RandomGaussian instance with given mean and standard deviation
14
+ # @param mean [Float] mean value.
15
+ # @param sd [Float] standard deviation value.
166
16
 
167
- def lower_quartile
168
- return nil if self.empty?
169
- sorted_array = self.sort
170
- u = 0.25*sorted_array.length + 1
171
- if (u-u.truncate).is_a?(Integer)
172
- return sorted_array[(u-u.truncate)-1]
173
- else
174
- sample = sorted_array[u.truncate.abs-1]
175
- sample1 = sorted_array[(u.truncate.abs)]
176
- return sample+((sample1-sample)*(u-u.truncate))
177
- end
178
- end
179
- end
180
-
181
- # factorial method for an Integer
182
- # Integer.!
183
- class Integer
184
- def !
185
- if self == 0
186
- return 1
187
- else
188
- (1..self).inject(:*)
189
- end
190
- end
191
- end
192
-
193
-
194
- # Fisher's Exact Test Function Library
195
- #
196
- # Based on JavaScript version created by: Oyvind Langsrud
197
- # Ported to Ruby by Bryan Donovan
198
-
199
- module Rubystats
200
- class FishersExactTest
201
-
202
- def initialize
203
- @sn11 = 0.0
204
- @sn1_ = 0.0
205
- @sn_1 = 0.0
206
- @sn = 0.0
207
- @sprob = 0.0
208
-
209
- @sleft = 0.0
210
- @sright = 0.0
211
- @sless = 0.0
212
- @slarg = 0.0
213
-
214
- @left = 0.0
215
- @right = 0.0
216
- @twotail = 0.0
217
- end
218
-
219
- # Reference: "Lanczos, C. 'A precision approximation
220
- # of the gamma function', J. SIAM Numer. Anal., B, 1, 86-96, 1964."
221
- # Translation of Alan Miller's FORTRAN-implementation
222
- # See http://lib.stat.cmu.edu/apstat/245
223
- def lngamm(z)
224
- x = 0
225
- x += 0.0000001659470187408462 / (z+7)
226
- x += 0.000009934937113930748 / (z+6)
227
- x -= 0.1385710331296526 / (z+5)
228
- x += 12.50734324009056 / (z+4)
229
- x -= 176.6150291498386 / (z+3)
230
- x += 771.3234287757674 / (z+2)
231
- x -= 1259.139216722289 / (z+1)
232
- x += 676.5203681218835 / (z)
233
- x += 0.9999999999995183
234
-
235
- return(Math.log(x)-5.58106146679532777-z+(z-0.5) * Math.log(z+6.5))
236
- end
237
-
238
- def lnfact(n)
239
- if n <= 1
240
- return 0
241
- else
242
- return lngamm(n+1)
17
+ def initialize(mean = 0.0, sd = 1.0, rng = lambda { Kernel.rand })
18
+ @mean, @sd, @rng = mean, sd, rng
19
+ @compute_next_pair = false
243
20
  end
244
- end
245
21
 
246
- def lnbico(n,k)
247
- return lnfact(n) - lnfact(k) - lnfact(n-k)
248
- end
249
-
250
- def hyper_323(n11, n1_, n_1, n)
251
- return Math.exp(lnbico(n1_, n11) + lnbico(n-n1_, n_1-n11) - lnbico(n, n_1))
252
- end
253
-
254
- def hyper(n11)
255
- return hyper0(n11, 0, 0, 0)
256
- end
257
-
258
- def hyper0(n11i,n1_i,n_1i,ni)
259
- if n1_i == 0 and n_1i ==0 and ni == 0
260
- unless n11i % 10 == 0
261
- if n11i == @sn11+1
262
- @sprob *= ((@sn1_ - @sn11)/(n11i.to_f))*((@sn_1 - @sn11)/(n11i.to_f + @sn - @sn1_ - @sn_1))
263
- @sn11 = n11i
264
- return @sprob
265
- end
266
- if n11i == @sn11-1
267
- @sprob *= ((@sn11)/(@sn1_-n11i.to_f))*((@sn11+@sn-@sn1_-@sn_1)/(@sn_1-n11i.to_f))
268
- @sn11 = n11i
269
- return @sprob
270
- end
22
+ # generate a random number that falls in the pre-defined gaussian distribution
23
+ # @return [Float]
24
+ # @example generate 10 random numbers that fall in a Gaussian distribution with mean at 0 and standard deviation at 1.0
25
+ # a = RandomGaussian.new
26
+ # numbers = []
27
+ # 10.times {numbers << a.rand.round(5)}
28
+ # numbers
29
+ # => [-1.83457, 1.24439, -0.30109, 0.13977, 0.61556, 1.3548, 1.72878, 2.46171, 0.97031, -0.29496]
30
+
31
+ def rand
32
+ if (@compute_next_pair = !@compute_next_pair)
33
+ theta = 2 * ::Math::PI * @rng.call
34
+ scale = @sd * ::Math.sqrt(-2 * Math.log(1 - @rng.call))
35
+ @g1 = @mean + scale * ::Math.sin(theta)
36
+ @g0 = @mean + scale * ::Math.cos(theta)
37
+ else
38
+ @g1
271
39
  end
272
- @sn11 = n11i
273
- else
274
- @sn11 = n11i
275
- @sn1_ = n1_i
276
- @sn_1 = n_1i
277
- @sn = ni
278
- end
279
- @sprob = hyper_323(@sn11,@sn1_,@sn_1,@sn)
280
- return @sprob
281
- end
282
-
283
- def exact(n11,n1_,n_1,n)
284
-
285
- p = i = j = prob = 0.0
286
-
287
- max = n1_
288
- max = n_1 if n_1 < max
289
- min = n1_ + n_1 - n
290
- min = 0 if min < 0
291
-
292
- if min == max
293
- @sless = 1
294
- @sright = 1
295
- @sleft = 1
296
- @slarg = 1
297
- return 1
298
- end
299
-
300
- prob = hyper0(n11,n1_,n_1,n)
301
- @sleft = 0
302
-
303
- p = hyper(min)
304
- i = min + 1
305
- while p < (0.99999999 * prob)
306
- @sleft += p
307
- p = hyper(i)
308
- i += 1
309
40
  end
310
41
 
311
- i -= 1
42
+ end
312
43
 
313
- if p < (1.00000001*prob)
314
- @sleft += p
315
- else
316
- i -= 1
44
+ # class for poisson distribution.
45
+ # An event can occur 0, 1, 2, … times in an interval.
46
+ # The average number of events in an interval is designated λ (lambda).
47
+ # λ is the event rate, also called the rate parameter.
48
+ # The probability of observing k events in an interval is given by the equation
49
+ #
50
+ # P(k events in interval) = e^(-λ) * λ^k / k!
51
+ #
52
+ # λ is the average number of events per interval
53
+ # e is the number 2.71828... (Euler's number) the base of the natural logarithms
54
+ # k takes values 0, 1, 2, …
55
+ # k! = k × (k − 1) × (k − 2) × … × 2 × 1 is the factorial of k.
56
+ # @see https://en.wikipedia.org/wiki/Poisson_distribution Poisson Distribution (Wikipedia).
57
+ # @example given the mutation rate at 0.01 and sequence length of 1000 bp,
58
+ # calculate the probability of 3 mutations on one sequence
59
+ # new_poisson_dist = ViralSeq::Math::PoissonDist.new(0.01)
60
+ # prob_hash = new_poisson_dist.poisson_hash
61
+ # (1000 * prob_hash[3]).round(5)
62
+ # => 0.00017
63
+ class PoissonDist
64
+ # initialize with given event rate λ, default events upper limit set to 5
65
+ def initialize(rate,k = 5)
66
+ @rate = rate
67
+ @k = k
68
+ @poisson_hash = {}
69
+ (0..k).each do |n|
70
+ p = (rate**n * ::Math::E**(-rate))/!n
71
+ @poisson_hash[n] = p
72
+ end
317
73
  end
318
74
 
319
- @sright = 0
320
-
321
- p = hyper(max)
322
- j = max - 1
323
- while p < (0.99999999 * prob)
324
- @sright += p
325
- p = hyper(j)
326
- j -= 1
75
+ # @return [Float] event rate λ
76
+ attr_accessor :rate
77
+ # @return [Integer] maximum number of events shown in @poisson_hash
78
+ attr_accessor :k
79
+ # @return [Hash] probability hash of :event_number => :probability
80
+ attr_reader :poisson_hash
81
+ end # end of PoissonDist
82
+
83
+ # Use R to calculate binomial 95% confidence intervals. Requires the R function binom.test.
84
+ # @example mutation M184V found in 3 out of 923 sequences, calculate 95% confidence interval
85
+ # freq = ViralSeq::Math::BinomCI.new(3,923)
86
+ # freq.mean.round(5)
87
+ # => 0.00325
88
+ # freq.lower.round(5)
89
+ # => 0.00067
90
+ # freq.upper.round(5)
91
+ # => 0.00947
92
+
93
+ class BinomCI
94
+ # initialize with numerator @n1 and denominator @n2 as Integer
95
+ def initialize(n1, n2)
96
+ @n1 = n1
97
+ @n2 = n2
98
+ @mean = n1/n2.to_f
99
+ r_output = `Rscript -e 'binom.test(#{n1},#{n2})$conf.int[1];binom.test(#{n1},#{n2})$conf.int[2]'`
100
+ lines = r_output.split "\n"
101
+ @lower = lines[0].chomp[4..-1].to_f
102
+ @upper = lines[1].chomp[4..-1].to_f
327
103
  end
328
- j += 1
329
104
 
330
- if p < (1.00000001*prob)
331
- @sright += p
332
- else
333
- j += 1
105
+ # @return [Integer] number of observations
106
+ attr_accessor :n1
107
+ # @return [Integer] total numbers
108
+ attr_accessor :n2
109
+ # @return [Float] mean
110
+ attr_reader :mean
111
+ # @return [Float] lower limit of 95% CI
112
+ attr_reader :lower
113
+ # @return [Float] upper limit of 95% CI
114
+ attr_reader :upper
115
+
116
+ end # end of BinomCI
117
+
118
+
119
+ # A function to calculate cut-off for offspring primer IDs.
120
+ # @see https://www.ncbi.nlm.nih.gov/pubmed/26041299 reference at Zhou et al. JVI 2016.
121
+ # @param m [Integer] PID abundance
122
+ # @param error_rate [Float] estimated platform error rate, the model supports error rate from 0.003 to 0.03.
123
+ # @return [Integer] an abundance cut-off (Integer) for offspring Primer IDs.
124
+
125
+ def self.calculate_pid_cut_off(m, error_rate = 0.02)
126
+ if m <= 10
127
+ return 2
334
128
  end
335
-
336
- if (i - n11).abs < (j - n11).abs
337
- @sless = @sleft
338
- @slarg = 1 - @sleft + prob
129
+ n = 0
130
+ case error_rate
131
+ when 0...0.0075
132
+ n = -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
133
+ when 0.0075...0.015
134
+ n = 1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
135
+ when 0.015..0.03
136
+ if m <= 8500
137
+ n = -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
138
+ else
139
+ n = 0.0079 * m + 9.4869
140
+ end
339
141
  else
340
- @sless = 1 - @sright + prob
341
- @slarg = @sright
142
+ raise ArgumentError.new('Error_rate has be between 0 to 0.03')
342
143
  end
343
- return prob
344
- end
345
-
346
- def calculate(n11_,n12_,n21_,n22_)
347
- n11_ *= -1 if n11_ < 0
348
- n12_ *= -1 if n12_ < 0
349
- n21_ *= -1 if n21_ < 0
350
- n22_ *= -1 if n22_ < 0
351
- n1_ = n11_ + n12_
352
- n_1 = n11_ + n21_
353
- n = n11_ + n12_ + n21_ + n22_
354
- exact(n11_,n1_,n_1,n)
355
- left = @sless
356
- right = @slarg
357
- twotail = @sleft + @sright
358
- twotail = 1 if twotail > 1
359
- values_hash = { :left =>left, :right =>right, :twotail =>twotail }
360
- return values_hash
361
- end
362
- end
363
- end
364
-
365
-
366
- # generate values from the standard normal distribution with given mean and standard deviation
367
- # See http://en.wikipedia.org/wiki/Box-Muller_transform
368
- #
369
- # RandomGaussian.new(mean, sd, rng)
370
- # # generate RandomGaussian instance with given mean and standard deviation
371
- # # default value: mean = 0.0, sd = 1.0
372
- #
373
- # RandomGaussian.rand
374
- # # generate a random number that falls in the pre-defined gaussian distribution
375
- # =USAGE
376
- # # example
377
- # a = RandomGaussian.new
378
- # a.rand
379
- # numbers = []
380
- # 10.times {numbers << a.rand.round(5)}
381
- # numbers
382
- # [-1.83457, 1.24439, -0.30109, 0.13977, 0.61556, 1.3548, 1.72878, 2.46171, 0.97031, -0.29496]
383
-
384
-
385
- class RandomGaussian
386
- def initialize(mean = 0.0, sd = 1.0, rng = lambda { Kernel.rand })
387
- @mean, @sd, @rng = mean, sd, rng
388
- @compute_next_pair = false
389
- end
390
-
391
- def rand
392
- if (@compute_next_pair = !@compute_next_pair)
393
- theta = 2 * Math::PI * @rng.call
394
- scale = @sd * Math.sqrt(-2 * Math.log(1 - @rng.call))
395
- @g1 = @mean + scale * Math.sin(theta)
396
- @g0 = @mean + scale * Math.cos(theta)
397
- else
398
- @g1
399
- end
400
- end
401
- end
144
+ n = n.round
145
+ n = 2 if n < 3
146
+ return n
147
+ end # end of .calculate_pid_cut_off
148
+ end # end of Math
149
+ end # end of ViralSeq