viral_seq 0.3.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,401 +1,149 @@
1
- # lib/math.rb
2
-
3
- # math and statistic functions
4
- # inlcuding the following methods
5
- # ViralSeq::count
6
- # ViralSeq::count_percentage
7
- # ViralSeq::poisson_distribution
8
- # ViralSeq::r_binom_CI
9
- # Enumerable#median
10
- # Enumerable#sum
11
- # Enumerable#mean
12
- # Enumerable#sample_variance
13
- # Enumerable#stdev
14
- # Enumerable#upper_quartile
15
- # Enumerable#lower_quartile
16
- # Integer#!
17
- # Rubystats::FishersExactTest
18
- # RandomGaussian::new
19
- # RandomGaussian#rand
20
1
 
21
2
  module ViralSeq
22
3
 
23
- # count elements in a array, return a hash of {:element1 => number1, :element2 => number2, ...}
24
- # =Usage
25
- # array = %w{cat dog monkey cat cat cat monkey}
26
- # ViralSeq.count(array)
27
- # => {"cat"=>4, "dog"=>1, "monkey"=>2}
28
-
29
- def self.count(array)
30
- hash = Hash.new(0)
31
- array.each do |element|
32
- hash[element] +=1
33
- end
34
- return hash
35
- end
4
+ # math functions required for ViralSeq
36
5
 
37
- # count elements in a array, return a hash of {:element1 => frequency1, :element2 => frequency2, ...}
38
- # default decimal as 2
39
- # =Usage
40
- # array = %w{cat dog monkey cat cat cat monkey}
41
- # ViralSeq.count_percentage(array)
42
- # => {"cat"=>0.57, "dog"=>0.14, "monkey"=>0.29}
43
-
44
- def self.count_percentage(array,decimal = 2)
45
- hash1 = Hash.new(0)
46
- array.each do |element|
47
- hash1[element] += 1
48
- end
49
- total_elements = array.size
50
- hash2 = Hash.new(0)
51
- hash1.each do |key,value|
52
- hash2[key] = (value/total_elements.to_f).round(decimal)
53
- end
54
- return hash2
55
- end
56
-
57
- # poisson distribution. input lambda and maximum k, return a hash with keys as k
58
- # default k value is 5, meaning calculate up to 5 events.
59
- #
60
- # Poisson Distribution (https://en.wikipedia.org/wiki/Poisson_distribution)
61
- # An event can occur 0, 1, 2, … times in an interval.
62
- # The average number of events in an interval is designated λ (lambda).
63
- # λ is the event rate, also called the rate parameter.
64
- # The probability of observing k events in an interval is given by the equation
65
- #
66
- # P(k events in interval) = e^(-λ) * λ^k / k!
67
- #
68
- # λ is the average number of events per interval
69
- # e is the number 2.71828... (Euler's number) the base of the natural logarithms
70
- # k takes values 0, 1, 2, …
71
- # k! = k × (k − 1) × (k − 2) × … × 2 × 1 is the factorial of k.
72
- #
73
- # =USAGE
74
- # # We assume the mutaiton rate is 0.005 (event rate λ),
75
- # # we would like to calculate the probablity of 3 mutations on one sequence
76
- # prob_hash = ViralSeq::poisson_distribution(0.005)
77
- # => {0=>0.9950124791926823, 1=>0.004975062395963412, 2=>1.243765598990853e-05, 3=>2.072942664984755e-08, 4=>2.5911783312309436e-11, 5=>2.591178331230944e-14}
78
- # prob_hash[3]
79
- # => 2.072942664984755e-08
80
-
81
- def self.poisson_distribution(rate,k = 5)
82
- out_hash = {}
83
- (0..k).each do |n|
84
- p = (rate**n * Math::E**(-rate))/!n
85
- out_hash[n] = p
86
- end
87
- return out_hash
88
- end
6
+ module Math
89
7
 
8
+ # Generate values from a normal (Gaussian) distribution with given mean and standard deviation
9
+ # @see http://en.wikipedia.org/wiki/Box-Muller_transform Wikipedia explanation
90
10
 
91
- # require R pre-installed
92
- # calculate binomial 95% confidence intervals by R. refer to R function binom.test
93
- # input number x and n, return an array as [lower_interval, upper_interval]
94
- #
95
- # =USAGE
96
- # # mutation M184V found in 3 out of 923 sequences, the 95% confidence interval is
97
- # ViralSeq.r_binom_CI(3, 923)
98
- # => [0.02223, 0.19234]
99
- #
100
- def self.r_binom_CI(x= 0, n= 0)
101
- r_output = `Rscript -e 'binom.test(#{x},#{n})$conf.int[1];binom.test(#{x},#{n})$conf.int[2]'`
102
- lines = r_output.split "\n"
103
- low = lines[0].chomp[4..-1].to_f
104
- high = lines[1].chomp[4..-1].to_f
105
- return [low.round(5), high.round(5)]
106
- end
11
+ class RandomGaussian
107
12
 
108
- end
109
-
110
- # statistic methods
111
- # :median :sum :mean :sample_variance :stdev :upper_quartile :lower_quartile
112
- # =USAGE
113
- # array = [1,2,3,4,5,6,7,8,9,10]
114
- # array.median
115
- # => 5.5
116
- # array.sum
117
- # => 55
118
- # array.mean
119
- # => 5.5
120
- # array.sample_variance
121
- # => 9.166666666666666
122
- # array.stdev
123
- # => 3.0276503540974917
124
- # array.upper_quartile
125
- # => 7.5
126
- # array.lower_quartile
127
- # => 3.5
128
-
129
- module Enumerable
130
- def median
131
- len = self.length
132
- sorted = self.sort
133
- len % 2 == 1 ? sorted[len/2] : (sorted[len/2 - 1] + sorted[len/2]).to_f / 2
134
- end
135
-
136
- def sum
137
- self.inject(0){|accum, i| accum + i }
138
- end
139
-
140
- def mean
141
- self.sum/self.length.to_f
142
- end
143
-
144
- def sample_variance
145
- m = self.mean
146
- sum = self.inject(0){|accum, i| accum + (i-m)**2 }
147
- sum/(self.length - 1).to_f
148
- end
149
-
150
- def stdev
151
- return Math.sqrt(self.sample_variance)
152
- end
153
-
154
- def upper_quartile
155
- return nil if self.empty?
156
- sorted_array = self.sort
157
- u = (0.25*(3*sorted_array.length))
158
- if (u-u.truncate).is_a?(Integer)
159
- return sorted_array[(u-u.truncate)-1]
160
- else
161
- sample = sorted_array[u.truncate.abs-1]
162
- sample1 = sorted_array[(u.truncate.abs)]
163
- return sample+((sample1-sample)*(u-u.truncate))
164
- end
165
- end
13
+ # generate RandomGaussian instance with given mean and standard deviation
14
+ # @param mean [Float] mean value.
15
+ # @param sd [Float] standard deviation value.
166
16
 
167
- def lower_quartile
168
- return nil if self.empty?
169
- sorted_array = self.sort
170
- u = 0.25*sorted_array.length + 1
171
- if (u-u.truncate).is_a?(Integer)
172
- return sorted_array[(u-u.truncate)-1]
173
- else
174
- sample = sorted_array[u.truncate.abs-1]
175
- sample1 = sorted_array[(u.truncate.abs)]
176
- return sample+((sample1-sample)*(u-u.truncate))
177
- end
178
- end
179
- end
180
-
181
- # factorial method for an Integer
182
- # Integer.!
183
- class Integer
184
- def !
185
- if self == 0
186
- return 1
187
- else
188
- (1..self).inject(:*)
189
- end
190
- end
191
- end
192
-
193
-
194
- # Fisher's Exact Test Function Library
195
- #
196
- # Based on JavaScript version created by: Oyvind Langsrud
197
- # Ported to Ruby by Bryan Donovan
198
-
199
- module Rubystats
200
- class FishersExactTest
201
-
202
- def initialize
203
- @sn11 = 0.0
204
- @sn1_ = 0.0
205
- @sn_1 = 0.0
206
- @sn = 0.0
207
- @sprob = 0.0
208
-
209
- @sleft = 0.0
210
- @sright = 0.0
211
- @sless = 0.0
212
- @slarg = 0.0
213
-
214
- @left = 0.0
215
- @right = 0.0
216
- @twotail = 0.0
217
- end
218
-
219
- # Reference: "Lanczos, C. 'A precision approximation
220
- # of the gamma function', J. SIAM Numer. Anal., B, 1, 86-96, 1964."
221
- # Translation of Alan Miller's FORTRAN-implementation
222
- # See http://lib.stat.cmu.edu/apstat/245
223
- def lngamm(z)
224
- x = 0
225
- x += 0.0000001659470187408462 / (z+7)
226
- x += 0.000009934937113930748 / (z+6)
227
- x -= 0.1385710331296526 / (z+5)
228
- x += 12.50734324009056 / (z+4)
229
- x -= 176.6150291498386 / (z+3)
230
- x += 771.3234287757674 / (z+2)
231
- x -= 1259.139216722289 / (z+1)
232
- x += 676.5203681218835 / (z)
233
- x += 0.9999999999995183
234
-
235
- return(Math.log(x)-5.58106146679532777-z+(z-0.5) * Math.log(z+6.5))
236
- end
237
-
238
- def lnfact(n)
239
- if n <= 1
240
- return 0
241
- else
242
- return lngamm(n+1)
17
+ def initialize(mean = 0.0, sd = 1.0, rng = lambda { Kernel.rand })
18
+ @mean, @sd, @rng = mean, sd, rng
19
+ @compute_next_pair = false
243
20
  end
244
- end
245
21
 
246
- def lnbico(n,k)
247
- return lnfact(n) - lnfact(k) - lnfact(n-k)
248
- end
249
-
250
- def hyper_323(n11, n1_, n_1, n)
251
- return Math.exp(lnbico(n1_, n11) + lnbico(n-n1_, n_1-n11) - lnbico(n, n_1))
252
- end
253
-
254
- def hyper(n11)
255
- return hyper0(n11, 0, 0, 0)
256
- end
257
-
258
- def hyper0(n11i,n1_i,n_1i,ni)
259
- if n1_i == 0 and n_1i ==0 and ni == 0
260
- unless n11i % 10 == 0
261
- if n11i == @sn11+1
262
- @sprob *= ((@sn1_ - @sn11)/(n11i.to_f))*((@sn_1 - @sn11)/(n11i.to_f + @sn - @sn1_ - @sn_1))
263
- @sn11 = n11i
264
- return @sprob
265
- end
266
- if n11i == @sn11-1
267
- @sprob *= ((@sn11)/(@sn1_-n11i.to_f))*((@sn11+@sn-@sn1_-@sn_1)/(@sn_1-n11i.to_f))
268
- @sn11 = n11i
269
- return @sprob
270
- end
22
+ # generate a random number that falls in the pre-defined gaussian distribution
23
+ # @return [Float]
24
+ # @example generate 10 random numbers that fall in a Gaussian distribution with mean at 0 and standard deviation at 1.0
25
+ # a = RandomGaussian.new
26
+ # numbers = []
27
+ # 10.times {numbers << a.rand.round(5)}
28
+ # numbers
29
+ # => [-1.83457, 1.24439, -0.30109, 0.13977, 0.61556, 1.3548, 1.72878, 2.46171, 0.97031, -0.29496]
30
+
31
+ def rand
32
+ if (@compute_next_pair = !@compute_next_pair)
33
+ theta = 2 * ::Math::PI * @rng.call
34
+ scale = @sd * ::Math.sqrt(-2 * Math.log(1 - @rng.call))
35
+ @g1 = @mean + scale * ::Math.sin(theta)
36
+ @g0 = @mean + scale * ::Math.cos(theta)
37
+ else
38
+ @g1
271
39
  end
272
- @sn11 = n11i
273
- else
274
- @sn11 = n11i
275
- @sn1_ = n1_i
276
- @sn_1 = n_1i
277
- @sn = ni
278
- end
279
- @sprob = hyper_323(@sn11,@sn1_,@sn_1,@sn)
280
- return @sprob
281
- end
282
-
283
- def exact(n11,n1_,n_1,n)
284
-
285
- p = i = j = prob = 0.0
286
-
287
- max = n1_
288
- max = n_1 if n_1 < max
289
- min = n1_ + n_1 - n
290
- min = 0 if min < 0
291
-
292
- if min == max
293
- @sless = 1
294
- @sright = 1
295
- @sleft = 1
296
- @slarg = 1
297
- return 1
298
- end
299
-
300
- prob = hyper0(n11,n1_,n_1,n)
301
- @sleft = 0
302
-
303
- p = hyper(min)
304
- i = min + 1
305
- while p < (0.99999999 * prob)
306
- @sleft += p
307
- p = hyper(i)
308
- i += 1
309
40
  end
310
41
 
311
- i -= 1
42
+ end
312
43
 
313
- if p < (1.00000001*prob)
314
- @sleft += p
315
- else
316
- i -= 1
44
+ # class for poisson distribution.
45
+ # An event can occur 0, 1, 2, … times in an interval.
46
+ # The average number of events in an interval is designated λ (lambda).
47
+ # λ is the event rate, also called the rate parameter.
48
+ # The probability of observing k events in an interval is given by the equation
49
+ #
50
+ # P(k events in interval) = e^(-λ) * λ^k / k!
51
+ #
52
+ # λ is the average number of events per interval
53
+ # e is the number 2.71828... (Euler's number) the base of the natural logarithms
54
+ # k takes values 0, 1, 2, …
55
+ # k! = k × (k − 1) × (k − 2) × … × 2 × 1 is the factorial of k.
56
+ # @see https://en.wikipedia.org/wiki/Poisson_distribution Poisson Distribution (Wikipedia).
57
+ # @example given the mutation rate at 0.01 and sequence length of 1000 bp,
58
+ # calculate the probability of 3 mutations on one sequence
59
+ # new_poisson_dist = ViralSeq::Math::PoissonDist.new(0.01)
60
+ # prob_hash = new_poisson_dist.poisson_hash
61
+ # (1000 * prob_hash[3]).round(5)
62
+ # => 0.00017
63
+ class PoissonDist
64
+ # initialize with given event rate λ, default events upper limit set to 5
65
+ def initialize(rate,k = 5)
66
+ @rate = rate
67
+ @k = k
68
+ @poisson_hash = {}
69
+ (0..k).each do |n|
70
+ p = (rate**n * ::Math::E**(-rate))/!n
71
+ @poisson_hash[n] = p
72
+ end
317
73
  end
318
74
 
319
- @sright = 0
320
-
321
- p = hyper(max)
322
- j = max - 1
323
- while p < (0.99999999 * prob)
324
- @sright += p
325
- p = hyper(j)
326
- j -= 1
75
+ # @return [Float] event rate λ
76
+ attr_accessor :rate
77
+ # @return [Integer] maximum number of events shown in @poisson_hash
78
+ attr_accessor :k
79
+ # @return [Hash] probability hash of :event_number => :probability
80
+ attr_reader :poisson_hash
81
+ end # end of PoissonDist
82
+
83
+ # Use R to calculate binomial 95% confidence intervals. Requires R (Rscript) to be installed; uses the R function binom.test.
84
+ # @example mutation M184V found in 3 out of 923 sequences, calculate 95% confidence interval
85
+ # freq = ViralSeq::Math::BinomCI.new(3,923)
86
+ # freq.mean.round(5)
87
+ # => 0.00325
88
+ # freq.lower.round(5)
89
+ # => 0.00067
90
+ # freq.upper.round(5)
91
+ # => 0.00947
92
+
93
+ class BinomCI
94
+ # initialize with numerator @n1 and denominator @n2 as Integer
95
+ def initialize(n1, n2)
96
+ @n1 = n1
97
+ @n2 = n2
98
+ @mean = n1/n2.to_f
99
+ r_output = `Rscript -e 'binom.test(#{n1},#{n2})$conf.int[1];binom.test(#{n1},#{n2})$conf.int[2]'`
100
+ lines = r_output.split "\n"
101
+ @lower = lines[0].chomp[4..-1].to_f
102
+ @upper = lines[1].chomp[4..-1].to_f
327
103
  end
328
- j += 1
329
104
 
330
- if p < (1.00000001*prob)
331
- @sright += p
332
- else
333
- j += 1
105
+ # @return [Integer] number of observations
106
+ attr_accessor :n1
107
+ # @return [Integer] total numbers
108
+ attr_accessor :n2
109
+ # @return [Float] mean
110
+ attr_reader :mean
111
+ # @return [Float] lower limit of 95% CI
112
+ attr_reader :lower
113
+ # @return [Float] upper limit of 95% CI
114
+ attr_reader :upper
115
+
116
+ end # end of BinomCI
117
+
118
+
119
+ # A function to calculate the cut-off for offspring primer IDs.
120
+ # @see https://www.ncbi.nlm.nih.gov/pubmed/26041299 reference at Zhou et al. JVI 2016.
121
+ # @param m [Integer] PID abundance
122
+ # @param error_rate [Float] estimated platform error rate, the model supports error rate from 0.003 to 0.03.
123
+ # @return [Integer] an abundance cut-off (Integer) for offspring Primer IDs.
124
+
125
+ def self.calculate_pid_cut_off(m, error_rate = 0.02)
126
+ if m <= 10
127
+ return 2
334
128
  end
335
-
336
- if (i - n11).abs < (j - n11).abs
337
- @sless = @sleft
338
- @slarg = 1 - @sleft + prob
129
+ n = 0
130
+ case error_rate
131
+ when 0...0.0075
132
+ n = -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
133
+ when 0.0075...0.015
134
+ n = 1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
135
+ when 0.015..0.03
136
+ if m <= 8500
137
+ n = -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
138
+ else
139
+ n = 0.0079 * m + 9.4869
140
+ end
339
141
  else
340
- @sless = 1 - @sright + prob
341
- @slarg = @sright
142
+ raise ArgumentError.new('Error_rate has be between 0 to 0.03')
342
143
  end
343
- return prob
344
- end
345
-
346
- def calculate(n11_,n12_,n21_,n22_)
347
- n11_ *= -1 if n11_ < 0
348
- n12_ *= -1 if n12_ < 0
349
- n21_ *= -1 if n21_ < 0
350
- n22_ *= -1 if n22_ < 0
351
- n1_ = n11_ + n12_
352
- n_1 = n11_ + n21_
353
- n = n11_ + n12_ + n21_ + n22_
354
- exact(n11_,n1_,n_1,n)
355
- left = @sless
356
- right = @slarg
357
- twotail = @sleft + @sright
358
- twotail = 1 if twotail > 1
359
- values_hash = { :left =>left, :right =>right, :twotail =>twotail }
360
- return values_hash
361
- end
362
- end
363
- end
364
-
365
-
366
- # generate values from the standard normal distribution with given mean and standard deviation
367
- # See http://en.wikipedia.org/wiki/Box-Muller_transform
368
- #
369
- # RandomGaussian.new(mean, sd, rng)
370
- # # generate RandomGaussian instance with given mean and standard deviation
371
- # # default value: mean = 0.0, sd = 1.0
372
- #
373
- # RandomGaussian.rand
374
- # # generate a random number that falls in the pre-defined gaussian distribution
375
- # =USAGE
376
- # # example
377
- # a = RandomGaussian.new
378
- # a.rand
379
- # numbers = []
380
- # 10.times {numbers << a.rand.round(5)}
381
- # numbers
382
- # [-1.83457, 1.24439, -0.30109, 0.13977, 0.61556, 1.3548, 1.72878, 2.46171, 0.97031, -0.29496]
383
-
384
-
385
- class RandomGaussian
386
- def initialize(mean = 0.0, sd = 1.0, rng = lambda { Kernel.rand })
387
- @mean, @sd, @rng = mean, sd, rng
388
- @compute_next_pair = false
389
- end
390
-
391
- def rand
392
- if (@compute_next_pair = !@compute_next_pair)
393
- theta = 2 * Math::PI * @rng.call
394
- scale = @sd * Math.sqrt(-2 * Math.log(1 - @rng.call))
395
- @g1 = @mean + scale * Math.sin(theta)
396
- @g0 = @mean + scale * Math.cos(theta)
397
- else
398
- @g1
399
- end
400
- end
401
- end
144
+ n = n.round
145
+ n = 2 if n < 3
146
+ return n
147
+ end # end of .calculate_pid_cut_off
148
+ end # end of Math
149
+ end # end of ViralSeq