viral_seq 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,54 @@
1
+ # viral_seq/hcv_dr
2
+ # HCV resistant mutation interpretation
3
+ # ViralSeq::hcv_ns5a
4
+
5
+ # ViralSeq.hcv_ns5a(amino_acid_sequence_array, start_aa_position)
6
+ # # amino_acid_sequence_array is Array object of the amino acid sequence.
7
+ # # can use ViralSeq::Sequence#aa_array to obtain the aa array sequence
8
+ # # start_aa_position is the starting aa number of the input sequence as Integer
9
+
10
module ViralSeq
  # Report HCV NS5A resistance-associated mutations found in an amino acid array.
  #
  # aa_array - Array of amino acid calls, one String per position. A mixture is
  #            written with "/" between the residues (e.g. "M/T").
  # start_aa - Integer NS5A position of the first element of aa_array (default 1).
  #
  # Returns a Hash of { position => [wild_type_aa, observed_aa] } for every listed
  # position where the observed call is, or (for a mixture) contains, one of the
  # listed resistant residues. Positions that are not in the table are ignored.
  def self.hcv_ns5a(aa_array, start_aa = 1)
    # position => [wild-type residue, [resistance-associated residues]]
    sdrm = {
      28  => ['M', ['T']],
      30  => ['L', ['H', 'K', 'R', 'Q', 'A', 'S', 'D']],
      31  => ['L', ['M', 'V', 'F']],
      32  => ['P', ['L']],
      44  => ['K', ['R']],
      58  => ['H', ['D', 'P', 'S']],
      64  => ['T', ['A', 'S']],
      77  => ['P', ['A', 'S']],
      78  => ['R', ['K']],
      79  => ['T', ['A']],
      83  => ['T', ['M']],
      85  => ['S', ['N', 'H', 'Y']],
      92  => ['A', ['P', 'T', 'K', 'E']],
      93  => ['Y', ['C', 'F', 'H', 'N']],
      107 => ['K', ['T', 'S']],
      121 => ['I', ['V']],
      135 => ['T', ['A']]
    }

    out_hash = {}
    aa_array.each_with_index do |test_aa, index|
      position = start_aa + index
      next unless sdrm.key?(position)

      wt_aa, resistant = sdrm[position]
      if test_aa.size == 1
        # No wild-type residue appears in its own resistant list, so a plain
        # membership test is enough for single calls.
        out_hash[position] = [wt_aa, test_aa] if resistant.include?(test_aa)
      elsif (test_aa.split('/') & resistant).any?
        # An Array is always truthy in Ruby, so the intersection has to be
        # tested for content rather than used directly as the condition.
        out_hash[position] = [wt_aa, test_aa]
      end
    end
    out_hash
  end
end
@@ -0,0 +1,299 @@
1
+ # viral_seq/locator.rb
2
+
3
+ # Including following methods:
4
+ # ViralSeq::sequence_locator
5
+ # ViralSeq::sequence_clip
6
+ # ViralSeq::qc_hiv_seq_check
7
+
8
+ # HIV sequence locator function
9
+ # resembling HIV Sequence Locator from LANL
10
+ # https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html
11
+ # require MUSCLE (http://www.drive5.com/muscle) installed
12
+ # current version only supports nucleotide sequence, not for amino acid sequence.
13
+
14
+ # =USAGE1
15
+ # # Find the location of a sequence
16
+ # ViralSeq.sequence_locator(input_sequence, reference_options, path_to_muscle)
17
+ # # input_sequence: String of nucleotide sequence
18
+ # # reference_options: choose a reference genome from :HXB2 (default), :NL43, or :MAC239
19
+ # # path_to_muscle: path to the muscle executable.
20
+ # # Defaults to false, in which case MuscleBio is called to run Muscle
21
+ # # specify path_to_muscle if other source of muscle needed
22
+ # # function returns an array of
23
+ # # start_location (Integer)
24
+ # # end_location (Integer)
25
+ # # percentage_of_similarity_to_reference_sequence (Float)
26
+ # # containing_indel? (Boolean)
27
+ # # aligned_input_sequence (String)
28
+ # # aligned_reference_sequence (String)
29
+ # # example code
30
+ # sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
31
+ # p ViralSeq.sequence_locator(sequence, :NL43, 'muscle')
32
+ # => [2333, 2433, 98.0, false, "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC", "AGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATC"]
33
+
34
+ # =USAGE2
35
+ # ViralSeq.sequence_clip(input_sequence, start_position, end_position, reference_options, path_to_muscle)
36
+ # # Given a pair of specific start and end positions, and an input sequence, return a sub-sequence of that range
37
+ # # return nil if the input sequence is not in the range
38
+ # # input_sequence: String of nucleotide sequence
39
+ # # start_position and end_position: Integer of the start and end reference number of the sub-sequence
40
+ # # reference_options and path_to_muscle are same as in ViralSeq.sequence_locator
41
+ # # path_to_muscle: path to the muscle executable.
42
+ # # Defaults to false, in which case MuscleBio is called to run Muscle
43
+ # # specify path_to_muscle if other source of muscle needed
44
+ # # example code
45
+ # seq = "CCTCAGATCACTCTTTGGCAACGACCCCTAGTTACAATAAGGGTAGGGGGGCAACTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAGATACCCATAGAAATTTGTGGACATGAAGCTATAGGTACAGTATTAGTGGGACCTACACCTGTCAACATAATTGGGAGAAATCTGTTGACTCAGATTGGTTGCACTCTAAATTTT"
46
+ # p ViralSeq.sequence_clip(seq, 2333, 2433, :HXB2, 'muscle')
47
+ # => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
48
+
49
+ # =USAGE3
50
+ # ViralSeq.qc_hiv_seq_check(seq_hash, start_nt, end_nt, allow_indel?, reference_options, path_to_muscle)
51
+ # # Given a sequence hash, start and end nt positions to a chosen reference genome (default :HXB2),
52
+ # # and a boolean value for allowing indels,
53
+ # # path_to_muscle: path to the muscle executable.
54
+ # # Defaults to false, in which case MuscleBio is called to run Muscle
55
+ # # specify path_to_muscle if other source of muscle needed
56
+ # # return a sequence sub-hash that meets the criteria
57
+ # # example code
58
+ # sequence_hash = ViralSeq.fasta_to_hash('sample/sample_seq.fasta') # load the .fasta file as a sequence hash
59
+ # filtered_sequence_hash = ViralSeq.qc_hiv_seq_check(sequence_hash, 4384, 4751, false, :HXB2, 'muscle')
60
+ # puts sequence_hash.size
61
+ # => 6
62
+ # puts filtered_sequence_hash.size
63
+ # => 4
64
+
65
+ module ViralSeq
66
+
67
# Locate +seq+ on a reference genome by aligning it repeatedly against a
# progressively trimmed window of the reference.
#
# seq            - String of the sequence to locate.
# ref_option     - passed to ViralSeq.check_ref to obtain the reference sequence.
# path_to_muscle - forwarded unchanged to ViralSeq.muscle_align.
#
# Returns [loc_p1, loc_p2, similarity, indel, aln_test, ref] on success,
# [0,0,0,0,0,0,0] (seven zeros) when the refined window is empty, and nil
# (after printing the error with puts) when any StandardError is raised.
+ def self.sequence_locator(seq='', ref_option = :HXB2, path_to_muscle = false)
68
+
69
+ # ViralSeq.check_muscle(path_to_muscle)
70
+ ori_ref = ViralSeq.check_ref(ref_option)
71
+
72
+ begin
73
+ ori_ref_l = ori_ref.size
# l1 / l2 accumulate how many reference characters have been trimmed from the
# start / end of the original reference; they are used below to slice ori_ref.
74
+ l1 = 0
75
+ l2 = 0
76
+
# First pass: align against the whole reference. aln_seq[0] is treated as the
# aligned reference and aln_seq[1] as the aligned input sequence.
77
+ aln_seq = ViralSeq.muscle_align(ori_ref, seq, path_to_muscle)
78
+ aln_test = aln_seq[1]
# Split the aligned input into leading gaps ($1), the core ($2) and trailing gaps ($3).
79
+ aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
80
+ gap_begin = $1.size
81
+ gap_end = $3.size
82
+ aln_test2 = $2
83
+ ref = aln_seq[0]
84
+ ref = ref[gap_begin..(-gap_end-1)]
85
+ ref_size = ref.size
# If the reference window is still more than 1.3x the input length, narrow it
# around the longest uninterrupted ACGT run of the aligned input and re-align.
86
+ if ref_size > 1.3*(seq.size)
87
+ l1 = l1 + gap_begin
88
+ l2 = l2 + gap_end
89
+ max_seq = aln_test2.scan(/[ACGT]+/).max_by(&:length)
90
+ aln_test2 =~ /#{max_seq}/
91
+ before_aln_seq = $`
92
+ before_aln = $`.size
93
+ post_aln_seq = $'
94
+ post_aln = $'.size
# b1 / b2: 1.3x the number of input bases lying before / after the longest run.
95
+ before_aln_seq_size = before_aln_seq.scan(/[ACGT]+/).join("").size
96
+ b1 = (1.3 * before_aln_seq_size).to_i
97
+ post_aln_seq_size = post_aln_seq.scan(/[ACGT]+/).join("").size
98
+ b2 = (1.3 * post_aln_seq_size).to_i
99
+ if (before_aln > seq.size) and (post_aln <= seq.size)
100
+ ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
101
+ l1 = l1 + (before_aln - b1)
102
+ elsif (post_aln > seq.size) and (before_aln <= seq.size)
103
+ ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
104
+ l2 = l2 + post_aln - b2
105
+ elsif (post_aln > seq.size) and (before_aln > seq.size)
106
+ ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
107
+ l1 = l1 + (before_aln - b1)
108
+ l2 = l2 + (post_aln - b2)
109
+ end
110
+
111
+ aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
112
+ aln_test = aln_seq[1]
113
+ aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
114
+ gap_begin = $1.size
115
+ gap_end = $3.size
116
+ ref = aln_seq[0]
117
+ ref = ref[gap_begin..(-gap_end-1)]
118
+ end
119
+
# Measure the current alignment: s1 / s2 are the lengths of the first / last
# ungapped runs of the aligned input, g1 / g2 the gap runs next to them.
120
+ aln_test = aln_seq[1]
121
+ aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
122
+ gap_begin = $1.size
123
+ gap_end = $3.size
124
+ aln_test = $2
125
+ aln_test =~ /^(\w+)(\-*)\w/
126
+ s1 = $1.size
127
+ g1 = $2.size
128
+ aln_test =~ /\w(\-*)(\w+)$/
129
+ s2 = $2.size
130
+ g2 = $1.size
131
+
132
+ l1 = l1 + gap_begin
133
+ l2 = l2 + gap_end
# repeat is set to 1 whenever the reference window is trimmed, which triggers
# another alignment round in the while loop below.
134
+ repeat = 0
135
+
# When one single gap run separates the two ungapped runs (g1 == g2 and the
# pieces add up to the window), trim only on the side of the shorter run.
136
+ if g1 == g2 and (s1 + g1 + s2) == ref.size
137
+ if s1 > s2 and g2 > 2*s2
138
+ ref = ref[0..(-g2-1)]
139
+ repeat = 1
140
+ l2 = l2 + g2
141
+ elsif s1 < s2 and g1 > 2*s1
142
+ ref = ref[g1..-1]
143
+ repeat = 1
144
+ l1 = l1 + g1
145
+ end
146
+ else
# Otherwise trim either end whose gap run is more than twice its ungapped run.
147
+ if g1 > 2*s1
148
+ ref = ref[g1..-1]
149
+ repeat = 1
150
+ l1 = l1 + g1
151
+ end
152
+ if g2 > 2*s2
153
+ ref = ref[0..(-g2 - 1)]
154
+ repeat = 1
155
+ l2 = l2 + g2
156
+ end
157
+ end
158
+
# Re-align and re-trim until a round finishes without trimming.
159
+ while repeat == 1
160
+ aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
161
+ aln_test = aln_seq[1]
162
+ aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
163
+ gap_begin = $1.size
164
+ gap_end = $3.size
165
+ aln_test = $2
166
+ aln_test =~ /^(\w+)(\-*)\w/
167
+ s1 = $1.size
168
+ g1 = $2.size
169
+ aln_test =~ /\w(\-*)(\w+)$/
170
+ s2 = $2.size
171
+ g2 = $1.size
172
+ ref = aln_seq[0]
173
+ ref = ref[gap_begin..(-gap_end-1)]
174
+ l1 = l1 + gap_begin
175
+ l2 = l2 + gap_end
176
+ repeat = 0
177
+ if g1 > 2*s1
178
+ ref = ref[g1..-1]
179
+ repeat = 1
180
+ l1 = l1 + g1
181
+ end
182
+ if g2 > 2*s2
183
+ ref = ref[0..(-g2 - 1)]
184
+ repeat = 1
185
+ l2 = l2 + g2
186
+ end
187
+ end
# Rebuild the window from the untouched original reference using the
# accumulated offsets, then align once more.
188
+ ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
189
+
190
+
191
+ aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
192
+ aln_test = aln_seq[1]
193
+ ref = aln_seq[0]
194
+
195
+ #refine alignment
196
+
# Adjust the offsets by the gaps at the start of the aligned reference; the
# trailing-gap adjustment is only applied when there are no leading gaps (elsif).
197
+ if ref =~ /^(\-+)/
198
+ l1 = l1 - $1.size
199
+ elsif ref =~ /(\-+)$/
200
+ l2 = l2 + $1.size
201
+ end
202
+
# Proceed only if the refined window is non-empty.
203
+ if (ori_ref_l - l2 - 1) >= l1
204
+ ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
205
+ aln_seq = ViralSeq.muscle_align(ref, seq, path_to_muscle)
206
+ aln_test = aln_seq[1]
207
+ ref = aln_seq[0]
208
+
# similarity: percentage of aligned columns where reference and input hold the
# same character, rounded to one decimal place.
209
+ ref_size = ref.size
210
+ sim_count = 0
211
+ (0..(ref_size-1)).each do |n|
212
+ ref_base = ref[n]
213
+ test_base = aln_test[n]
214
+ sim_count += 1 if ref_base == test_base
215
+ end
216
+ similarity = (sim_count/ref_size.to_f*100).round(1)
217
+
# Convert the 0-based trim offsets into 1-based start / end reference positions.
218
+ loc_p1 = l1 + 1
219
+ loc_p2 = ori_ref_l - l2
# indel is true when the input length differs from the located span or the
# aligned input contains a gap character.
220
+ if seq.size != (loc_p2 - loc_p1 + 1)
221
+ indel = true
222
+ elsif aln_test.include?("-")
223
+ indel = true
224
+ else
225
+ indel = false
226
+ end
227
+ return [loc_p1,loc_p2,similarity,indel,aln_test,ref]
228
+ else
# NOTE(review): this sentinel has seven elements while the success value has six.
229
+ return [0,0,0,0,0,0,0]
230
+ end
# Any StandardError raised above is reported on standard output and turned into nil.
231
+ rescue => e
232
+ puts "Unexpected error occured."
233
+ puts "Exception Class: #{ e.class.name }"
234
+ puts "Exception Message: #{ e.message }"
235
+ puts "Exception Backtrace: #{ e.backtrace[0] }"
236
+ puts "ViralSeq.sequence_locator returns nil"
237
+ return nil
238
+ end
239
+ end
240
+
241
+ # sequence clip function
242
# Clip the part of +seq+ that covers reference positions p1..p2.
#
# seq            - String of the sequence to clip.
# p1, p2         - Integer start and end positions on the reference.
# ref_option     - reference selector, forwarded to ViralSeq.sequence_locator.
# path_to_muscle - forwarded to ViralSeq.sequence_locator.
#
# Returns the clipped sub-sequence as a String with alignment gaps removed, or
# nil when the sequence could not be located or does not span p1..p2.
def self.sequence_clip(seq = '', p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
  loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
  # sequence_locator returns nil after an error and an all-zero array when it
  # cannot place the sequence; neither carries aligned strings to clip from.
  return nil unless loc && loc[4].is_a?(String) && loc[5].is_a?(String)

  l1 = loc[0]
  l2 = loc[1]
  return nil unless p1 >= l1 && p2 <= l2

  aligned_seq = loc[4]
  aligned_ref = loc[5]

  # Walk the aligned reference from the left until reference position p1 is
  # reached; g1 is the number of alignment columns to drop. Gap columns in the
  # reference consume a column without advancing the reference position.
  g1 = 0
  aligned_ref.each_char do |char|
    break if l1 == p1
    g1 += 1
    l1 += 1 unless char == '-'
  end

  # Same walk from the right; g2 starts at 1 so that -g2 is the last kept index.
  g2 = 1
  aligned_ref.reverse.each_char do |char|
    break if l2 == p2
    g2 += 1
    l2 -= 1 unless char == '-'
  end

  aligned_seq[g1..(-g2)].tr('-', '')
end
266
+
267
+ # batch quality check of HIV sequences based on ViralSeq.sequence_locator
268
+ # input a sequence hash, start nt position(s) and end nt position(s) can be an Integer, Array or Range
269
+ # and allow the sequence to contain indels
270
+ # return a hash of filtered sequences
271
+
272
# Filter a sequence hash by location on the reference genome.
#
# seq_hash       - Hash of { sequence_name => sequence String }. It is not modified.
# start_nt       - accepted start position(s): Integer, Array or Range.
# end_nt         - accepted end position(s): Integer, Array or Range.
# indel          - when true (default) sequences flagged with indels are kept;
#                  when false only sequences whose indel flag is false pass.
# ref_option     - reference selector, forwarded to ViralSeq.sequence_locator.
# path_to_muscle - forwarded to ViralSeq.sequence_locator.
#
# Returns a new Hash with the entries of seq_hash whose sequence passed, in the
# order they appear in seq_hash. Sequences that ViralSeq.sequence_locator could
# not process (nil result) are treated as failing.
def self.qc_hiv_seq_check(seq_hash, start_nt, end_nt, indel = true, ref_option = :HXB2, path_to_muscle = false)
  start_nt = start_nt..start_nt if start_nt.is_a?(Integer)
  end_nt = end_nt..end_nt if end_nt.is_a?(Integer)

  # Locate each distinct sequence once; identical sequences share the verdict.
  passed = {}
  seq_hash.values.uniq.each do |seq|
    loc = ViralSeq.sequence_locator(seq, ref_option, path_to_muscle)
    next unless loc
    next unless start_nt.include?(loc[0]) && end_nt.include?(loc[1])

    passed[seq] = true if indel || loc[3] == false
  end

  # Build a new hash instead of deleting from seq_hash, so the caller's input
  # keeps all of its entries.
  seq_hash.select { |_seq_name, seq| passed.key?(seq) }
end
298
+
299
+ end
@@ -0,0 +1,401 @@
1
+ # lib/math.rb
2
+
3
+ # math and statistic functions
4
+ # including the following methods
5
+ # ViralSeq::count
6
+ # ViralSeq::count_percentage
7
+ # ViralSeq::poisson_distribution
8
+ # ViralSeq::r_binom_CI
9
+ # Enumerable#median
10
+ # Enumerable#sum
11
+ # Enumerable#mean
12
+ # Enumerable#sample_variance
13
+ # Enumerable#stdev
14
+ # Enumerable#upper_quartile
15
+ # Enumerable#lower_quartile
16
+ # Integer#!
17
+ # Rubystats::FishersExactTest
18
+ # RandomGaussian::new
19
+ # RandomGaussian#rand
20
+
21
module ViralSeq

  # Count the elements of an array.
  #
  # array - any object responding to #each.
  #
  # Returns a Hash of { element => number_of_occurrences }. The hash has a
  # default value of 0, so looking up an absent element gives 0.
  #
  # =Usage
  #   array = %w{cat dog monkey cat cat cat monkey}
  #   ViralSeq.count(array)
  #   => {"cat"=>4, "dog"=>1, "monkey"=>2}
  def self.count(array)
    array.each_with_object(Hash.new(0)) { |element, counts| counts[element] += 1 }
  end

  # Relative frequency of each element of an array.
  #
  # array   - Array of elements.
  # decimal - number of decimal places to round each frequency to (default 2).
  #
  # Returns a Hash of { element => frequency } with a default value of 0.
  #
  # =Usage
  #   array = %w{cat dog monkey cat cat cat monkey}
  #   ViralSeq.count_percentage(array)
  #   => {"cat"=>0.57, "dog"=>0.14, "monkey"=>0.29}
  def self.count_percentage(array, decimal = 2)
    total_elements = array.size.to_f
    count(array).each_with_object(Hash.new(0)) do |(element, occurrences), frequencies|
      frequencies[element] = (occurrences / total_elements).round(decimal)
    end
  end

  # Poisson distribution (https://en.wikipedia.org/wiki/Poisson_distribution).
  #
  # The probability of observing k events in an interval with event rate λ is
  #
  #   P(k events in interval) = e^(-λ) * λ^k / k!
  #
  # rate - the event rate λ (average number of events per interval).
  # k    - the largest number of events to evaluate (default 5).
  #
  # Returns a Hash of { n => P(n events) } for n = 0..k.
  #
  # =USAGE
  #   # We assume the mutation rate is 0.005 (event rate λ),
  #   # we would like to calculate the probability of 3 mutations on one sequence
  #   prob_hash = ViralSeq::poisson_distribution(0.005)
  #   => {0=>0.9950124791926823, 1=>0.004975062395963412, 2=>1.243765598990853e-05, 3=>2.072942664984755e-08, 4=>2.5911783312309436e-11, 5=>2.591178331230944e-14}
  #   prob_hash[3]
  #   => 2.072942664984755e-08
  def self.poisson_distribution(rate, k = 5)
    # n! is built up incrementally here rather than through a redefined
    # Integer#!, so this method gives the same result whether or not that
    # patch is loaded.
    factorial = 1
    (0..k).each_with_object({}) do |n, out_hash|
      factorial *= n if n > 0
      out_hash[n] = (rate**n * Math::E**(-rate)) / factorial
    end
  end

  # Binomial confidence interval computed by R's binom.test.
  # Requires the Rscript executable to be installed and on the PATH.
  #
  # x - number of successes (numeric, or a String that parses as a number).
  # n - number of trials (numeric, or a String that parses as a number).
  #
  # Returns [lower_interval, upper_interval], each rounded to 5 decimal places,
  # parsed from the two lines that Rscript prints.
  # Raises ArgumentError (or TypeError for nil) when x or n is not numeric.
  #
  # =USAGE
  #   # mutation M184V found in 3 out of 923 sequences
  #   ViralSeq.r_binom_CI(3, 923)
  #   => [lower_interval, upper_interval]
  def self.r_binom_CI(x = 0, n = 0)
    # The arguments end up inside R source code, so only plain numbers are let through.
    successes = Float(x)
    trials = Float(n)
    script = "binom.test(#{successes},#{trials})$conf.int[1];binom.test(#{successes},#{trials})$conf.int[2]"
    # The array form of IO.popen runs Rscript directly, without a shell.
    r_output = IO.popen(['Rscript', '-e', script], &:read)
    lines = r_output.split("\n")
    # Each printed line starts with the 4-character "[1] " prefix, which is skipped.
    low = lines[0].chomp[4..-1].to_f
    high = lines[1].chomp[4..-1].to_f
    [low.round(5), high.round(5)]
  end

end
109
+
110
+ # statistic methods
111
+ # :median :sum :mean :sample_variance :stdev :upper_quartile :lower_quartile
112
+ # =USAGE
113
+ # array = [1,2,3,4,5,6,7,8,9,10]
114
+ # array.median
115
+ # => 5.5
116
+ # array.sum
117
+ # => 55
118
+ # array.mean
119
+ # => 5.5
120
+ # array.sample_variance
121
+ # => 9.166666666666666
122
+ # array.stdev
123
+ # => 3.0276503540974917
124
+ # array.upper_quartile
125
+ # => 7.5
126
+ # array.lower_quartile
127
+ # => 3.5
128
+
129
module Enumerable
  # Median of the collection.
  # Returns the middle element for an odd number of elements, the mean of the
  # two middle elements (as a Float) for an even number, and nil when empty.
  def median
    sorted = sort
    len = sorted.length
    return nil if len.zero?

    mid = len / 2
    len.odd? ? sorted[mid] : (sorted[mid - 1] + sorted[mid]).to_f / 2
  end

  # Sum of the elements.
  # Accepts the same optional initial value and block as Ruby's built-in
  # Enumerable#sum, so code relying on those forms keeps working when this
  # definition replaces it.
  def sum(init = 0)
    if block_given?
      inject(init) { |accum, i| accum + yield(i) }
    else
      inject(init) { |accum, i| accum + i }
    end
  end

  # Arithmetic mean as a Float (NaN for an empty collection).
  def mean
    sum / count.to_f
  end

  # Sample variance, i.e. the squared deviations from the mean divided by (n - 1).
  def sample_variance
    m = mean
    squared_deviations = inject(0) { |accum, i| accum + (i - m)**2 }
    squared_deviations / (count - 1).to_f
  end

  # Sample standard deviation: the square root of #sample_variance.
  def stdev
    Math.sqrt(sample_variance)
  end

  # Upper quartile, linearly interpolated between the two sorted elements
  # around rank 0.75 * n. Returns nil for an empty collection.
  def upper_quartile
    sorted_array = sort
    return nil if sorted_array.empty?

    u = 0.25 * (3 * sorted_array.length)
    rank = u.truncate
    sample = sorted_array[rank - 1]
    sample1 = sorted_array[rank] || sample
    sample + ((sample1 - sample) * (u - rank))
  end

  # Lower quartile, linearly interpolated between the two sorted elements
  # around rank 0.25 * n + 1. Returns nil for an empty collection.
  def lower_quartile
    sorted_array = sort
    return nil if sorted_array.empty?

    u = 0.25 * sorted_array.length + 1
    rank = u.truncate
    sample = sorted_array[rank - 1]
    # With a single element there is no upper neighbour; fall back to the
    # element itself so the interpolation yields that element.
    sample1 = sorted_array[rank] || sample
    sample + ((sample1 - sample) * (u - rank))
  end
end
180
+
181
+ # factorial method for an Integer
182
+ # Integer.!
183
class Integer
  # Factorial of the receiver.
  # Note that this redefines logical negation for Integers: `!n` and `n.!`
  # return n! instead of false.
  # Returns 1 for 0, the product 1 * 2 * ... * self for positive integers,
  # and nil for negative integers (the range 1..self is then empty).
  def !
    return 1 if zero?

    (1..self).reduce(:*)
  end
end
192
+
193
+
194
+ # Fisher's Exact Test Function Library
195
+ #
196
+ # Based on JavaScript version created by: Oyvind Langsrud
197
+ # Ported to Ruby by Bryan Donovan
198
+
199
module Rubystats
  # Fisher's exact test for a 2x2 contingency table.
  #
  # The instance caches the table margins and the last evaluated probability
  # (@sn11, @sn1_, @sn_1, @sn, @sprob) so that neighbouring hypergeometric
  # probabilities can be derived from each other by a ratio instead of being
  # recomputed from log-factorials every time.
  class FishersExactTest

    def initialize
      # cached table state used by #hyper0
      @sn11 = 0.0
      @sn1_ = 0.0
      @sn_1 = 0.0
      @sn = 0.0
      @sprob = 0.0

      # tail sums produced by #exact
      @sleft = 0.0
      @sright = 0.0
      @sless = 0.0
      @slarg = 0.0

      @left = 0.0
      @right = 0.0
      @twotail = 0.0
    end

    # Natural log of the gamma function (Lanczos approximation).
    # Reference: "Lanczos, C. 'A precision approximation
    # of the gamma function', J. SIAM Numer. Anal., B, 1, 86-96, 1964."
    # Translation of Alan Miller's FORTRAN-implementation
    # See http://lib.stat.cmu.edu/apstat/245
    def lngamm(z)
      # [signed coefficient, offset added to z], accumulated in this order
      terms = [
        [0.0000001659470187408462, 7],
        [0.000009934937113930748, 6],
        [-0.1385710331296526, 5],
        [12.50734324009056, 4],
        [-176.6150291498386, 3],
        [771.3234287757674, 2],
        [-1259.139216722289, 1],
        [676.5203681218835, 0]
      ]
      x = terms.inject(0) { |acc, (coefficient, offset)| acc + coefficient / (z + offset) }
      x += 0.9999999999995183

      Math.log(x) - 5.58106146679532777 - z + (z - 0.5) * Math.log(z + 6.5)
    end

    # Natural log of n! (0 for n <= 1).
    def lnfact(n)
      n <= 1 ? 0 : lngamm(n + 1)
    end

    # Natural log of the binomial coefficient "n choose k".
    def lnbico(n, k)
      lnfact(n) - lnfact(k) - lnfact(n - k)
    end

    # Hypergeometric probability of a table with cell n11, row total n1_,
    # column total n_1 and grand total n, computed from log-factorials.
    def hyper_323(n11, n1_, n_1, n)
      log_probability = lnbico(n1_, n11) + lnbico(n - n1_, n_1 - n11) - lnbico(n, n_1)
      Math.exp(log_probability)
    end

    # Probability for cell value n11 using the margins cached by the last
    # full #hyper0 call.
    def hyper(n11)
      hyper0(n11, 0, 0, 0)
    end

    # With all three margins given as 0 the cached margins are reused; the
    # probability is then derived from the cached one by a ratio when n11i is
    # exactly one above or below the cached cell value and is not a multiple
    # of 10. In every other case the probability is computed directly via
    # #hyper_323 (after storing the margins, when they were passed in).
    def hyper0(n11i, n1_i, n_1i, ni)
      if n1_i == 0 && n_1i == 0 && ni == 0
        unless n11i % 10 == 0
          ratio =
            if n11i == @sn11 + 1
              ((@sn1_ - @sn11) / (n11i.to_f)) * ((@sn_1 - @sn11) / (n11i.to_f + @sn - @sn1_ - @sn_1))
            elsif n11i == @sn11 - 1
              ((@sn11) / (@sn1_ - n11i.to_f)) * ((@sn11 + @sn - @sn1_ - @sn_1) / (@sn_1 - n11i.to_f))
            end
          if ratio
            @sprob *= ratio
            @sn11 = n11i
            return @sprob
          end
        end
        @sn11 = n11i
      else
        @sn11 = n11i
        @sn1_ = n1_i
        @sn_1 = n_1i
        @sn = ni
      end
      @sprob = hyper_323(@sn11, @sn1_, @sn_1, @sn)
    end

    # Fill @sleft, @sright, @sless and @slarg for the table described by cell
    # n11, row total n1_, column total n_1 and grand total n.
    # Returns the probability of the observed table (1 when only one table is
    # possible with these margins).
    def exact(n11, n1_, n_1, n)
      max = n_1 < n1_ ? n_1 : n1_
      min = n1_ + n_1 - n
      min = 0 if min < 0

      if min == max
        @sless = 1
        @sright = 1
        @sleft = 1
        @slarg = 1
        return 1
      end

      prob = hyper0(n11, n1_, n_1, n)

      @sleft, i = tail_sum(min, 1, prob)
      @sright, j = tail_sum(max, -1, prob)

      if (i - n11).abs < (j - n11).abs
        @sless = @sleft
        @slarg = 1 - @sleft + prob
      else
        @sless = 1 - @sright + prob
        @slarg = @sright
      end
      prob
    end

    # Run the test on the four cells of a 2x2 table (negative counts are
    # replaced by their magnitude).
    # Returns { :left => p, :right => p, :twotail => p }.
    def calculate(n11_, n12_, n21_, n22_)
      n11_, n12_, n21_, n22_ = [n11_, n12_, n21_, n22_].map { |cell| cell < 0 ? cell * -1 : cell }
      row_total = n11_ + n12_
      column_total = n11_ + n21_
      grand_total = n11_ + n12_ + n21_ + n22_
      exact(n11_, row_total, column_total, grand_total)

      twotail = @sleft + @sright
      twotail = 1 if twotail > 1
      { :left => @sless, :right => @slarg, :twotail => twotail }
    end

    private

    # Sum the probabilities from one end of the support (start, moving by
    # step) while they stay below the observed probability prob.
    # Returns [sum, last_index_included].
    def tail_sum(start, step, prob)
      total = 0
      p = hyper(start)
      index = start + step
      while p < (0.99999999 * prob)
        total += p
        p = hyper(index)
        index += step
      end
      index -= step

      if p < (1.00000001 * prob)
        total += p
      else
        index -= step
      end
      [total, index]
    end
  end
end
364
+
365
+
366
+ # generate values from a normal (Gaussian) distribution with the given mean and standard deviation
367
+ # See http://en.wikipedia.org/wiki/Box-Muller_transform
368
+ #
369
+ # RandomGaussian.new(mean, sd, rng)
370
+ # # generate RandomGaussian instance with given mean and standard deviation
371
+ # # default value: mean = 0.0, sd = 1.0
372
+ #
373
+ # RandomGaussian#rand
374
+ # # generate a random number that falls in the pre-defined gaussian distribution
375
+ # =USAGE
376
+ # # example
377
+ # a = RandomGaussian.new
378
+ # a.rand
379
+ # numbers = []
380
+ # 10.times {numbers << a.rand.round(5)}
381
+ # numbers
382
+ # [-1.83457, 1.24439, -0.30109, 0.13977, 0.61556, 1.3548, 1.72878, 2.46171, 0.97031, -0.29496]
383
+
384
+
385
class RandomGaussian
  # mean - centre of the distribution (default 0.0).
  # sd   - standard deviation (default 1.0).
  # rng  - callable returning a uniform random number; defaults to Kernel.rand.
  def initialize(mean = 0.0, sd = 1.0, rng = lambda { Kernel.rand })
    @mean = mean
    @sd = sd
    @rng = rng
    @compute_next_pair = false
  end

  # Return the next random number.
  # Values are produced in pairs: every other call draws two numbers from the
  # rng, computes a new pair and returns its first member; the call in between
  # returns the stored second member without touching the rng.
  def rand
    @compute_next_pair = !@compute_next_pair
    return @g1 unless @compute_next_pair

    angle = 2 * Math::PI * @rng.call
    # 1 - rng is used as the argument of the logarithm
    radius = @sd * Math.sqrt(-2 * Math.log(1 - @rng.call))
    @g1 = @mean + radius * Math.sin(angle)
    @g0 = @mean + radius * Math.cos(angle)
  end
end